コード例 #1
0
ファイル: warc_test.py プロジェクト: flatron18116/wpull
    def test_cdx_dedup(self):
        url_table = URLTable()
        warc_recorder = WARCRecorder('asdf',
                                     params=WARCRecorderParams(
                                         compress=False,
                                         cdx=True,
                                         url_table=url_table))

        url_table.add_visits([
            ('http://example.com/fennec',
             '<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>',
             'B62D734VFEKIDLFAB7TTSCSZF64BKAYJ')
        ])

        request = HTTPRequest('http://example.com/fennec')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()
        revisit_response_header_size = len(response.to_bytes())

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'kitbit')

        with warc_recorder.session() as session:
            session.pre_request(request)
            session.request_data(request.to_bytes())
            session.request(request)
            session.pre_response(response)
            session.response_data(response.to_bytes())
            session.response_data(response.body.content())
            session.response(response)

        request = HTTPRequest('http://example.com/horse')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OKaaaaaaaaaaaaaaaaaaaaaaaaaa')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'kitbit')

        with warc_recorder.session() as session:
            session.pre_request(request)
            session.request_data(request.to_bytes())
            session.request(request)
            session.pre_response(response)
            session.response_data(response.to_bytes())
            session.response_data(response.body.content())
            session.response(response)

        _logger.info('FINISHED')

        warc_recorder.close()

        with open('asdf.warc', 'rb') as in_file:
            warc_file_content = in_file.read()

        with open('asdf.cdx', 'rb') as in_file:
            cdx_file_content = in_file.read()

        self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
        self.assertIn(b'WARC-Type: revisit\r\n', warc_file_content)
        self.assertIn(
            b'WARC-Refers-To: '
            b'<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>\r\n',
            warc_file_content)
        self.assertIn(b'WARC-Truncated: length\r\n', warc_file_content)
        self.assertIn(
            b'WARC-Profile: http://netpreserve.org/warc/1.0/revisit/'
            b'identical-payload-digest\r\n', warc_file_content)
        self.assertIn(
            b'Content-Length: ' +
            str(revisit_response_header_size).encode('ascii') + b'\r\n',
            warc_file_content)
        self.assertIn(b'WARC-Target-URI: http://example.com/fennec\r\n',
                      warc_file_content)
        self.assertIn(b'WARC-Target-URI: http://example.com/horse\r\n',
                      warc_file_content)
        self.assertEqual(1, warc_file_content.count(b'kitbit'))

        self.assertIn(b'http://example.com/horse ', cdx_file_content)
コード例 #2
0
ファイル: recorder_test.py プロジェクト: Super-Rad/wpull
    def test_cdx_dedup(self):
        url_table = URLTable()
        warc_recorder = WARCRecorder(
            'asdf',
            params=WARCRecorderParams(
                compress=False, cdx=True, url_table=url_table
            )
        )

        url_table.add_visits([
            (
                'http://example.com/fennec',
                '<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>',
                'B62D734VFEKIDLFAB7TTSCSZF64BKAYJ'
            )
        ])

        request = HTTPRequest('http://example.com/fennec')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OK')
        response.body = Body()
        revisit_response_header_size = len(response.to_bytes())

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'kitbit')

        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())
        session.end_request(request)
        session.begin_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.end_response(response)
        session.close()

        request = HTTPRequest('http://example.com/horse')
        request.address = ('0.0.0.0', 80)
        response = HTTPResponse(200, 'OKaaaaaaaaaaaaaaaaaaaaaaaaaa')
        response.body = Body()

        with wpull.util.reset_file_offset(response.body):
            response.body.write(b'kitbit')

        session = warc_recorder.new_http_recorder_session()
        session.begin_request(request)
        session.request_data(request.to_bytes())
        session.end_request(request)
        session.begin_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.end_response(response)
        session.close()

        _logger.info('FINISHED')

        warc_recorder.close()

        with open('asdf.warc', 'rb') as in_file:
            warc_file_content = in_file.read()

        with open('asdf.cdx', 'rb') as in_file:
            cdx_file_content = in_file.read()

        self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
        self.assertIn(b'WARC-Type: revisit\r\n', warc_file_content)
        self.assertIn(
            b'WARC-Refers-To: '
            b'<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>\r\n',
            warc_file_content
        )
        self.assertIn(b'WARC-Truncated: length\r\n', warc_file_content)
        self.assertIn(
            b'WARC-Profile: http://netpreserve.org/warc/1.0/revisit/'
            b'identical-payload-digest\r\n',
            warc_file_content
        )
        self.assertIn(
            b'Content-Length: ' +
            str(revisit_response_header_size).encode('ascii') + b'\r\n',
            warc_file_content
        )
        self.assertIn(
            b'WARC-Target-URI: http://example.com/fennec\r\n',
            warc_file_content
        )
        self.assertIn(
            b'WARC-Target-URI: http://example.com/horse\r\n', warc_file_content
        )
        self.assertEqual(
            1,
            warc_file_content.count(b'kitbit')
        )

        self.assertIn(b'http://example.com/horse ', cdx_file_content)