Beispiel #1
0
    def test_new_file_and_clobber(self):
        # Save the same URL through two separate sessions of one writer and
        # check after each save that 'my_file.txt' exists on disk.
        # NOTE(review): the second check re-tests the same filename; if the
        # writer renames on clobber, asserting that name may be intended.
        writer = AntiClobberFileWriter(self.get_path_namer())

        for _attempt in range(2):
            session = writer.session()

            request = HTTPRequest('http://example.com/my_file.txt')
            response = HTTPResponse(
                status_code=200, reason='OK', request=request)

            session.process_request(request)
            session.process_response(response)
            session.save_document(response)

            self.assertTrue(os.path.exists('my_file.txt'))
Beispiel #2
0
    def test_trust_server_names(self):
        # Push a 302 response for the bare host and then a 200 response for
        # '/my_file.html' through one session of a writer created with
        # trust_server_names=True; 'my_file.html' must exist afterwards.
        writer = AntiClobberFileWriter(
            self.get_path_namer(), trust_server_names=True)
        session = writer.session()

        redirect_request = HTTPRequest('http://example.com')
        redirect_response = HTTPResponse(
            status_code=302, reason='Moved', request=redirect_request)
        session.process_request(redirect_request)
        session.process_response(redirect_response)

        final_request = HTTPRequest('http://example.com/my_file.html')
        final_response = HTTPResponse(
            status_code=200, reason='OK', request=final_request)
        session.process_request(final_request)
        session.process_response(final_response)

        # Only the final response is saved.
        session.save_document(final_response)

        # Directory listing is printed to help diagnose a failing assertion.
        print(list(os.walk('.')))
        self.assertTrue(os.path.exists('my_file.html'))
Beispiel #3
0
    def test_null_writer(self):
        # Call each session method of NullWriter with default-constructed
        # messages; the only checked result is that extra_resource_path()
        # returns None.
        session = NullWriter().session()

        empty_request = HTTPRequest()
        session.process_request(empty_request)

        # Each call receives its own freshly constructed response object.
        processed_response = HTTPResponse()
        session.process_response(processed_response)

        discarded_response = HTTPResponse()
        session.discard_document(discarded_response)

        saved_response = HTTPResponse()
        session.save_document(saved_response)

        resource_path = session.extra_resource_path('blah')
        self.assertIsNone(resource_path)
Beispiel #4
0
    def test_file_continue(self):
        # With file_continuing=True and an existing 'my_file.txt' containing
        # b'TEST', the processed request must carry a 'Range' field, and after
        # saving a 206 response whose body received b'END', the file must
        # contain b'TESTEND'.
        target_name = 'my_file.txt'
        writer = OverwriteFileWriter(
            self.get_path_namer(), file_continuing=True)
        session = writer.session()

        # Seed the partial download that the session is expected to continue.
        with open(target_name, 'wb') as partial_file:
            partial_file.write(b'TEST')

        request = HTTPRequest('http://example.com/my_file.txt')
        session.process_request(request)
        self.assertIn('Range', request.fields)

        response = HTTPResponse(
            status_code=206, reason='Partial content', request=request)
        session.process_response(response)

        body = response.body
        body.write(b'END')
        body.flush()

        session.save_document(response)

        with open(target_name, 'rb') as saved_file:
            saved_bytes = saved_file.read()

        self.assertEqual(b'TESTEND', saved_bytes)
Beispiel #5
0
    def test_content_disposition(self):
        # For each (raw header filename, expected filename on disk) pair, save
        # a 200 response carrying a Content-Disposition header through a
        # writer created with content_disposition=True and check that the
        # expected file exists.
        writer = AntiClobberFileWriter(
            self.get_path_namer(), content_disposition=True)

        cases = (
            ('hello1.txt', 'hello1.txt'),
            ('hello2.txt;', 'hello2.txt'),
            ('"hello3.txt"', 'hello3.txt'),
            ('\'hello4.txt\'', 'hello4.txt'),
        )

        for header_filename, expected_path in cases:
            session = writer.session()

            request = HTTPRequest('http://example.com')
            response = HTTPResponse(
                status_code=200, reason='OK', request=request)

            disposition = 'attachment; filename={}'.format(header_filename)
            response.fields['Content-Disposition'] = disposition

            session.process_request(request)
            session.process_response(response)
            session.save_document(response)

            # Directory listing is printed to help diagnose failures.
            print(list(os.walk('.')))
            self.assertTrue(os.path.exists(expected_path))
Beispiel #6
0
    def test_adjust_extension(self):
        # For each (Content-Type, URL path, expected file path) triple, save a
        # 200 response through a writer created with adjust_extension=True and
        # check that the expected path exists.
        writer = AntiClobberFileWriter(
            self.get_path_namer(), adjust_extension=True)

        cases = (
            ('text/html', '/mordor', 'mordor.html'),
            ('text/html', '/mordor?ring.asp', 'mordor?ring.asp.html'),
            ('text/html', '/mordor?ring.htm', 'mordor?ring.htm'),
            ('text/plain', '/static/my_file.txt', 'static/my_file.txt'),
            ('text/css', '/static/style.css', 'static/style.css'),
            ('text/css', '/static/style.css?hamster.exe',
             'static/style.css?hamster.exe.css'),
            ('text/html', '/static/mojibake.html', 'static/mojibake.html'),
            ('text/html', '/static/mojibake.html?dolphin.png',
             'static/mojibake.html?dolphin.png.html'),
        )

        for content_type, url_path, expected_path in cases:
            session = writer.session()

            url = 'http://example.com' + url_path
            request = HTTPRequest(url)
            response = HTTPResponse(
                status_code=200, reason='OK', request=request)
            response.fields['Content-Type'] = content_type

            session.process_request(request)
            session.process_response(response)
            session.save_document(response)

            # Expected name and directory listing are printed for debugging.
            print(expected_path, list(os.walk('.')))
            self.assertTrue(os.path.exists(expected_path))
Beispiel #7
0
    def test_single_document_writer(self):
        # Save two documents through separate sessions of one
        # SingleDocumentWriter (headers_included=True) backed by an in-memory
        # stream, then check that the stream contains b'HTTP' and both body
        # payloads.
        stream = io.BytesIO()

        writer = SingleDocumentWriter(stream, headers_included=True)
        session = writer.session()

        request1 = HTTPRequest('http://example.com/my_file1.txt')
        response1 = HTTPResponse(status_code=200,
                                 reason='OK',
                                 request=request1)

        session.process_request(request1)
        session.process_response(response1)

        response1.body.write(b'The content')

        session.save_document(response1)

        session = writer.session()

        request2 = HTTPRequest('http://example.com/my_file2.txt')
        response2 = HTTPResponse(status_code=200,
                                 reason='OK',
                                 request=request2)

        session.process_request(request2)
        session.process_response(response2)

        # Write through the second response's own body; the original wrote to
        # response1.body here, so the second exchange's body was never used.
        response2.body.write(b'Another thing')

        session.save_document(response2)

        data = stream.getvalue()

        self.assertIn(b'HTTP', data)
        self.assertIn(b'The content', data)
        self.assertIn(b'Another thing', data)
Beispiel #8
0
    def test_progress_http(self):
        # Smoke test: drive a ProgressPrinter writing to stdout through a full
        # begin-request / begin-response / data / end-response cycle. There
        # are no assertions; the test passes if no call raises.
        printer = ProgressPrinter(stream=sys.stdout)

        request = HTTPRequest('http://example.com')
        response = HTTPResponse(206, 'OK')

        header_values = (
            ('Content-Size', '1024'),
            ('Content-Range', 'bytes 10-/2048'),
        )
        for field_name, field_value in header_values:
            response.fields[field_name] = field_value

        printer.update_from_begin_request(request)
        printer.update_from_begin_response(response)

        # Report 100 three-byte chunks.
        for _ in range(100):
            printer.update_with_data(b'abc')

        printer.update_from_end_response(response)
Beispiel #9
0
    def test_dir_or_file_dir_got_first(self):
        # A directory named 'dir_or_file' already exists when a document for
        # '/dir_or_file' is saved; afterwards the directory must still be a
        # directory and a regular file 'dir_or_file.f' must exist.
        colliding_name = 'dir_or_file'

        session = OverwriteFileWriter(self.get_path_namer()).session()

        os.mkdir(colliding_name)

        request = HTTPRequest('http://example.com/dir_or_file')
        response = HTTPResponse(
            status_code=200,
            reason='OK',
            request=request,
        )

        session.process_request(request)
        session.process_response(response)
        session.save_document(response)

        # Directory listing is printed to help diagnose failures.
        print(list(os.walk('.')))

        still_directory = os.path.isdir(colliding_name)
        self.assertTrue(still_directory)

        suffixed_file_exists = os.path.isfile('dir_or_file.f')
        self.assertTrue(suffixed_file_exists)
Beispiel #10
0
    def test_warc_max_size_and_append(self):
        # Two empty numbered WARC files exist before recording starts with
        # max_size=1 and appending=True. After one request/response exchange,
        # files 00000-00003 and the meta file must exist; the two pre-existing
        # files must still be empty and the others must be non-empty.
        file_prefix = 'asdf'

        preexisting_names = ('asdf-00000.warc', 'asdf-00001.warc')
        for existing_name in preexisting_names:
            with open(existing_name, 'w'):
                pass

        params = WARCRecorderParams(
            compress=False, max_size=1, appending=True)
        warc_recorder = WARCRecorder(file_prefix, params=params)

        request = HTTPRequest('http://example.com/1')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'BLAH')

        recorder_session = warc_recorder.new_http_recorder_session()
        recorder_session.begin_request(request)
        recorder_session.request_data(request.to_bytes())
        recorder_session.end_request(request)
        recorder_session.begin_response(response)
        recorder_session.response_data(response.to_bytes())
        recorder_session.response_data(response.body.content())
        recorder_session.end_response(response)
        recorder_session.close()

        warc_recorder.close()

        written_names = (
            'asdf-00002.warc',
            'asdf-00003.warc',
            'asdf-meta.warc',
        )

        for name in preexisting_names + written_names:
            self.assertTrue(os.path.exists(name))

        for name in preexisting_names:
            self.assertEqual(0, os.path.getsize(name))

        for name in written_names:
            self.assertNotEqual(0, os.path.getsize(name))
Beispiel #11
0
    def test_warc_recorder_rollback(self):
        # Replace the session's request record with one whose iteration raises
        # OSError after yielding data. end_request() must raise, 'asdf.warc'
        # must keep its earlier size, and no 'asdf.warc-wpullinc' file may
        # remain.
        warc_filename = 'asdf.warc'
        warc_prefix = 'asdf'
        journal_filename = warc_filename + '-wpullinc'

        # Pre-populate the WARC file with ten bytes.
        with open(warc_filename, 'wb') as warc_file:
            warc_file.write(b'a' * 10)

        params = WARCRecorderParams(compress=False)
        warc_recorder = WARCRecorder(warc_prefix, params=params)

        request = HTTPRequest('http://example.com/')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'KITTEH DOGE')

        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())

        class BadRecord(WARCRecord):
            # Record that borrows the real record's block file and fields but
            # fails part-way through being iterated.
            def __init__(self, original_record):
                super().__init__()
                self.block_file = original_record.block_file
                self.fields = original_record.fields

            def __iter__(self):
                for _ in range(1000):
                    yield b"where's my elephant?"
                raise OSError('Oops')

        session._request_record = BadRecord(session._request_record)
        original_offset = os.path.getsize(warc_filename)

        with self.assertRaises((OSError, IOError)):
            session.end_request(request)

        size_after_failure = os.path.getsize(warc_filename)
        self.assertEqual(size_after_failure, original_offset)
        self.assertFalse(os.path.exists(journal_filename))

        _logger.debug('original offset {0}'.format(original_offset))
Beispiel #12
0
    def test_timestamping(self):
        # With an existing 'my_file.txt' whose access/modification times are
        # set to a fixed timestamp, the processed request must carry an
        # 'If-Modified-Since' field. A 304 response is then processed; nothing
        # is asserted about it beyond the call not raising.
        target_name = 'my_file.txt'
        session = TimestampingFileWriter(self.get_path_namer()).session()

        local_timestamp = 634521600

        # Create an empty local copy and pin its times.
        with open(target_name, 'wb') as local_file:
            local_file.write(b'')

        file_times = (local_timestamp, local_timestamp)
        os.utime(target_name, file_times)

        request = HTTPRequest('http://example.com/my_file.txt')
        session.process_request(request)

        self.assertIn('If-Modified-Since', request.fields)

        not_modified = HTTPResponse(
            status_code=304, reason='Not modified', request=request)
        session.process_response(not_modified)
Beispiel #13
0
    def test_warc_recorder_journal(self):
        # Replace the session's request record with one that asserts, when it
        # is iterated, that 'asdf.warc-wpullinc' exists. After end_request()
        # returns, that file must no longer exist.
        warc_filename = 'asdf.warc'
        warc_prefix = 'asdf'
        journal_filename = warc_filename + '-wpullinc'

        params = WARCRecorderParams(compress=False)
        warc_recorder = WARCRecorder(warc_prefix, params=params)

        request = HTTPRequest('http://example.com/')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'KITTEH DOGE')

        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())

        # The nested class needs a handle on this test case for its assertion.
        test_case = self

        class MockRecord(WARCRecord):
            # Record that borrows the real record's block file and fields and
            # checks for the journal file when iteration starts.
            def __init__(self, original_record):
                super().__init__()
                self.block_file = original_record.block_file
                self.fields = original_record.fields

            def __iter__(self):
                print(list(os.walk('.')))
                journal_present = os.path.exists(journal_filename)
                test_case.assertTrue(journal_present)

                for _ in range(1000):
                    yield b"where's my elephant?"

        session._request_record = MockRecord(session._request_record)

        session.end_request(request)

        self.assertFalse(os.path.exists(journal_filename))
Beispiel #14
0
    def test_warc_move_max_size(self):
        # Record one exchange with cdx=True, max_size=1 and move_to='./blah/',
        # then check that the numbered WARC files, the meta WARC and the CDX
        # file all exist inside './blah/'.
        file_prefix = 'asdf'
        cdx_filename = 'asdf.cdx'
        destination = './blah/'

        os.mkdir(destination)

        params = WARCRecorderParams(
            compress=False,
            cdx=True,
            move_to=destination,
            max_size=1,
        )
        warc_recorder = WARCRecorder(file_prefix, params=params)

        request = HTTPRequest('http://example.com/1')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'BLAH')

        recorder_session = warc_recorder.new_http_recorder_session()
        recorder_session.begin_request(request)
        recorder_session.request_data(request.to_bytes())
        recorder_session.end_request(request)
        recorder_session.begin_response(response)
        recorder_session.response_data(response.to_bytes())
        recorder_session.response_data(response.body.content())
        recorder_session.end_response(response)
        recorder_session.close()

        warc_recorder.close()

        expected_paths = (
            './blah/asdf-00000.warc',
            './blah/asdf-00001.warc',
            './blah/asdf-meta.warc',
            './blah/' + cdx_filename,
        )
        for moved_path in expected_paths:
            self.assertTrue(os.path.exists(moved_path))
Beispiel #15
0
    def test_warc_recorder_max_size(self):
        # Record two exchanges with cdx=True and max_size=1, then check that
        # each numbered WARC file holds a warcinfo record plus its own payload,
        # that the CDX file lists both URLs, that the meta WARC contains the
        # 'FINISHED' log line, and that all three WARC files validate.
        file_prefix = 'asdf'
        cdx_filename = 'asdf.cdx'

        params = WARCRecorderParams(
            compress=False,
            extra_fields=[('Extra-field', 'my_extra_field')],
            cdx=True,
            max_size=1,
        )
        warc_recorder = WARCRecorder(file_prefix, params=params)

        def record_exchange(url, payload):
            # Build one request/response pair and feed it through a fresh
            # recorder session.
            request = HTTPRequest(url)
            request.address = ('0.0.0.0', 80)
            response = HTTPResponse(200, 'OK')
            response.body = Body()

            with wpull.util.reset_file_offset(response.body):
                response.body.write(payload)

            session = warc_recorder.new_http_recorder_session()
            session.begin_request(request)
            session.request_data(request.to_bytes())
            session.end_request(request)
            session.begin_response(response)
            session.response_data(response.to_bytes())
            session.response_data(response.body.content())
            session.end_response(response)
            session.close()

        record_exchange('http://example.com/1', b'KITTEH DOGE')
        record_exchange('http://example.com/2', b'DOGE KITTEH')

        _logger.info('FINISHED')

        warc_recorder.close()

        expected_payloads = (
            ('asdf-00000.warc', b'KITTEH DOGE'),
            ('asdf-00001.warc', b'DOGE KITTEH'),
        )
        for warc_name, payload in expected_payloads:
            with open(warc_name, 'rb') as in_file:
                warc_file_content = in_file.read()

            self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
            self.assertIn(b'WARC-Type: warcinfo', warc_file_content)
            self.assertIn(payload, warc_file_content)

        with open(cdx_filename, 'rb') as in_file:
            cdx_file_content = in_file.read()

        cdx_lines = cdx_file_content.split(b'\n')
        header_line = cdx_lines[0]
        cdx_labels = header_line.strip().split(b' ')

        print(cdx_lines)

        self.assertEqual(4, len(cdx_lines))
        self.assertEqual(10, len(cdx_labels))

        for url in (b'http://example.com/1', b'http://example.com/2'):
            self.assertIn(url, cdx_file_content)

        with open('asdf-meta.warc', 'rb') as in_file:
            meta_file_content = in_file.read()

        self.assertIn(b'FINISHED', meta_file_content)

        for warc_name in ('asdf-00000.warc', 'asdf-00001.warc',
                          'asdf-meta.warc'):
            self.validate_warc(warc_name)
Beispiel #16
0
    def test_cdx_dedup(self):
        # Register a prior visit for '/fennec' in the URL table, then record
        # '/fennec' and '/horse' with the same body b'kitbit'. The WARC file
        # must contain a revisit record referring to the registered record ID,
        # with a Content-Length equal to the first response's header size, and
        # b'kitbit' must appear exactly once. The CDX file must list '/horse'.
        url_table = URLTable()
        params = WARCRecorderParams(
            compress=False, cdx=True, url_table=url_table)
        warc_recorder = WARCRecorder('asdf', params=params)

        prior_visit = (
            'http://example.com/fennec',
            '<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>',
            'B62D734VFEKIDLFAB7TTSCSZF64BKAYJ',
        )
        url_table.add_visits([prior_visit])

        def record_exchange(request, response):
            # Feed one prepared request/response pair through a fresh
            # recorder session.
            session = warc_recorder.new_http_recorder_session()
            session.begin_request(request)
            session.request_data(request.to_bytes())
            session.end_request(request)
            session.begin_response(response)
            session.response_data(response.to_bytes())
            session.response_data(response.body.content())
            session.end_response(response)
            session.close()

        fennec_request = HTTPRequest('http://example.com/fennec')
        fennec_request.address = ('0.0.0.0', 80)
        fennec_response = HTTPResponse(200, 'OK')
        fennec_response.body = Body()
        # Header size is measured before any body data is written.
        revisit_response_header_size = len(fennec_response.to_bytes())

        with wpull.util.reset_file_offset(fennec_response.body):
            fennec_response.body.write(b'kitbit')

        record_exchange(fennec_request, fennec_response)

        horse_request = HTTPRequest('http://example.com/horse')
        horse_request.address = ('0.0.0.0', 80)
        horse_response = HTTPResponse(200, 'OKaaaaaaaaaaaaaaaaaaaaaaaaaa')
        horse_response.body = Body()

        with wpull.util.reset_file_offset(horse_response.body):
            horse_response.body.write(b'kitbit')

        record_exchange(horse_request, horse_response)

        _logger.info('FINISHED')

        warc_recorder.close()

        with open('asdf.warc', 'rb') as in_file:
            warc_file_content = in_file.read()

        with open('asdf.cdx', 'rb') as in_file:
            cdx_file_content = in_file.read()

        self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))

        expected_content_length = (
            b'Content-Length: ' +
            str(revisit_response_header_size).encode('ascii') + b'\r\n'
        )
        expected_fragments = (
            b'WARC-Type: revisit\r\n',
            b'WARC-Refers-To: '
            b'<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>\r\n',
            b'WARC-Truncated: length\r\n',
            b'WARC-Profile: http://netpreserve.org/warc/1.0/revisit/'
            b'identical-payload-digest\r\n',
            expected_content_length,
            b'WARC-Target-URI: http://example.com/fennec\r\n',
            b'WARC-Target-URI: http://example.com/horse\r\n',
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, warc_file_content)

        self.assertEqual(1, warc_file_content.count(b'kitbit'))

        self.assertIn(b'http://example.com/horse ', cdx_file_content)
Beispiel #17
0
    def test_warc_recorder(self):
        # Record a single request/response exchange with cdx=True and an
        # extra field, then check the WARC file contents, the CDX index line,
        # and that the CDX length/offset columns locate the response record
        # inside the WARC file.
        file_prefix = 'asdf'
        warc_filename = 'asdf.warc'
        cdx_filename = 'asdf.cdx'

        warc_recorder = WARCRecorder(
            file_prefix,
            params=WARCRecorderParams(
                compress=False,
                extra_fields=[('Extra-field', 'my_extra_field')],
                cdx=True,
            ),
        )

        request = HTTPRequest('http://example.com/')
        # NOTE(review): prepare_for_send() is called both before and after
        # the address is assigned; kept as-is -- confirm whether the first
        # call is needed.
        request.prepare_for_send()
        request.address = ('0.0.0.0', 80)
        request.prepare_for_send()
        response = HTTPResponse(200, 'OK')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'KITTEH DOGE')

        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())
        session.end_request(request)
        session.begin_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.end_response(response)
        session.close()

        # Logged before close(); asserted below to appear in the WARC file.
        _logger.info('FINISHED')

        warc_recorder.close()

        with open(warc_filename, 'rb') as in_file:
            warc_file_content = in_file.read()

        with open(cdx_filename, 'rb') as in_file:
            cdx_file_content = in_file.read()

        self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
        self.assertIn(b'WARC-Type: warcinfo\r\n', warc_file_content)
        self.assertIn(b'Content-Type: application/warc-fields',
                      warc_file_content)
        self.assertIn(b'WARC-Date: ', warc_file_content)
        self.assertIn(b'WARC-Record-ID: <urn:uuid:', warc_file_content)
        self.assertIn(b'WARC-Block-Digest: sha1:', warc_file_content)
        self.assertIn(b'WARC-Payload-Digest: sha1:', warc_file_content)
        self.assertIn(b'WARC-Type: request\r\n', warc_file_content)
        self.assertIn(b'WARC-Target-URI: http://', warc_file_content)
        self.assertIn(b'Content-Type: application/http;msgtype=request',
                      warc_file_content)
        self.assertIn(b'WARC-Type: response', warc_file_content)
        self.assertIn(b'WARC-Concurrent-To: <urn:uuid:', warc_file_content)
        self.assertIn(b'Content-Type: application/http;msgtype=response',
                      warc_file_content)
        self.assertIn(
            'Wpull/{0}'.format(wpull.version.__version__).encode('utf-8'),
            warc_file_content)
        self.assertIn(
            'Python/{0}'.format(wpull.util.python_version()).encode('utf-8'),
            warc_file_content)
        self.assertIn(b'Extra-Field: my_extra_field', warc_file_content)
        self.assertIn(b'GET / HTTP', warc_file_content)
        self.assertIn(b'KITTEH DOGE', warc_file_content)
        self.assertIn(b'FINISHED', warc_file_content)
        self.assertIn(b'WARC-Target-URI: urn:X-wpull:log', warc_file_content)
        self.assertIn(b'Content-Length:', warc_file_content)
        self.assertNotIn(b'Content-Length: 0', warc_file_content)

        # First CDX line is the label header; second is the single entry.
        cdx_lines = cdx_file_content.split(b'\n')
        cdx_labels = cdx_lines[0].strip().split(b' ')
        cdx_fields = cdx_lines[1].split(b' ')

        print(cdx_lines)

        self.assertEqual(3, len(cdx_lines))
        self.assertEqual(10, len(cdx_labels))
        self.assertEqual(9, len(cdx_fields))
        self.assertTrue(cdx_lines[0].startswith(b' CDX'))

        self.assertEqual(b'http://example.com/', cdx_fields[0])
        self.assertEqual(b'-', cdx_fields[2])
        self.assertEqual(b'200', cdx_fields[3])
        self.assertNotEqual(b'-', cdx_fields[4])
        self.assertNotEqual(b'0', cdx_fields[5])
        self.assertNotEqual(b'0', cdx_fields[6])
        self.assertEqual(os.path.basename(warc_filename),
                         cdx_fields[7].decode('ascii'))

        # Columns 5 and 6 are used as the record's length and file offset.
        length = int(cdx_fields[5])
        offset = int(cdx_fields[6])

        with open(warc_filename, 'rb') as in_file:
            in_file.seek(offset)
            data = in_file.read(length)

            # unittest assertion instead of a bare assert, which would be
            # stripped when Python runs with -O.
            self.assertEqual(length, len(data))

        self.assertEqual(b'WARC/1.0', data[:8])

        self.assertIn(b'KITTEH DOGE', data)

        self.validate_warc(warc_filename)