def test_cdx_dedup(self):
    """Record two exchanges with the same body and check the WARC/CDX output.

    The URL table is seeded with one earlier visit for
    ``http://example.com/fennec`` (a record id plus a digest-like string;
    presumably the payload digest of ``b'kitbit'`` -- confirm against the
    recorder). Two responses carrying ``b'kitbit'`` are then recorded and
    the written ``asdf.warc`` / ``asdf.cdx`` files are inspected for a
    revisit record that refers back to the seeded record id, with the
    payload bytes appearing exactly once in the WARC file.
    """
    visits = URLTable()
    recorder = WARCRecorder(
        'asdf',
        params=WARCRecorderParams(
            compress=False, cdx=True, url_table=visits),
    )

    known_visit = (
        'http://example.com/fennec',
        '<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>',
        'B62D734VFEKIDLFAB7TTSCSZF64BKAYJ',
    )
    visits.add_visits([known_visit])

    def record_exchange(req, resp):
        # Feed one request/response pair through a recorder session, in
        # the same call order for every exchange.
        with recorder.session() as session:
            session.pre_request(req)
            session.request_data(req.to_bytes())
            session.request(req)
            session.pre_response(resp)
            session.response_data(resp.to_bytes())
            session.response_data(resp.body.content())
            session.response(resp)

    # First exchange: the URL that was seeded into the table.
    req = HTTPRequest('http://example.com/fennec')
    req.address = ('0.0.0.0', 80)
    resp = HTTPResponse(200, 'OK')
    resp.body = Body()
    # Measured before the body is written; compared against the
    # Content-Length header found in the WARC file below.
    header_only_size = len(resp.to_bytes())

    with wpull.util.reset_file_offset(resp.body):
        resp.body.write(b'kitbit')

    record_exchange(req, resp)

    # Second exchange: a different URL and a longer reason phrase, but the
    # same body bytes.
    req = HTTPRequest('http://example.com/horse')
    req.address = ('0.0.0.0', 80)
    resp = HTTPResponse(200, 'OKaaaaaaaaaaaaaaaaaaaaaaaaaa')
    resp.body = Body()

    with wpull.util.reset_file_offset(resp.body):
        resp.body.write(b'kitbit')

    record_exchange(req, resp)

    _logger.info('FINISHED')

    recorder.close()

    with open('asdf.warc', 'rb') as warc_file:
        warc_bytes = warc_file.read()

    with open('asdf.cdx', 'rb') as cdx_file:
        cdx_bytes = cdx_file.read()

    self.assertTrue(warc_bytes.startswith(b'WARC/1.0'))

    refers_to_header = (
        b'WARC-Refers-To: '
        b'<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>\r\n'
    )
    profile_header = (
        b'WARC-Profile: http://netpreserve.org/warc/1.0/revisit/'
        b'identical-payload-digest\r\n'
    )
    content_length_header = (
        b'Content-Length: '
        + str(header_only_size).encode('ascii')
        + b'\r\n'
    )

    # Checked in this fixed order so the first failure reported is stable.
    expected_in_warc = [
        b'WARC-Type: revisit\r\n',
        refers_to_header,
        b'WARC-Truncated: length\r\n',
        profile_header,
        content_length_header,
        b'WARC-Target-URI: http://example.com/fennec\r\n',
        b'WARC-Target-URI: http://example.com/horse\r\n',
    ]
    for expected in expected_in_warc:
        self.assertIn(expected, warc_bytes)

    # The body was written twice but must be stored only once.
    self.assertEqual(1, warc_bytes.count(b'kitbit'))

    self.assertIn(b'http://example.com/horse ', cdx_bytes)
def test_cdx_dedup(self):
    """Record two exchanges with the same body and check the WARC/CDX output.

    A prior visit for ``http://example.com/fennec`` is added to the URL
    table (record id plus a digest-like string; presumably the payload
    digest of ``b'kitbit'`` -- confirm against the recorder). Two
    responses with the body ``b'kitbit'`` are recorded through separate
    HTTP recorder sessions, and the resulting ``asdf.warc`` and
    ``asdf.cdx`` files are checked for a revisit record pointing at the
    seeded record id and for a single stored copy of the payload.
    """
    seen_urls = URLTable()
    recorder_params = WARCRecorderParams(
        compress=False, cdx=True, url_table=seen_urls)
    recorder = WARCRecorder('asdf', params=recorder_params)

    seen_urls.add_visits([(
        'http://example.com/fennec',
        '<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>',
        'B62D734VFEKIDLFAB7TTSCSZF64BKAYJ',
    )])

    def write_exchange(http_request, http_response):
        # One recorder session per exchange; closed explicitly at the end
        # of the sequence, with no cleanup on error.
        session = recorder.new_http_recorder_session()
        session.begin_request(http_request)
        session.request_data(http_request.to_bytes())
        session.end_request(http_request)
        session.begin_response(http_response)
        session.response_data(http_response.to_bytes())
        session.response_data(http_response.body.content())
        session.end_response(http_response)
        session.close()

    # Exchange for the URL already present in the table.
    http_request = HTTPRequest('http://example.com/fennec')
    http_request.address = ('0.0.0.0', 80)
    http_response = HTTPResponse(200, 'OK')
    http_response.body = Body()
    # Taken while the body is still empty; matched against the
    # Content-Length header asserted further down.
    bare_response_size = len(http_response.to_bytes())

    with wpull.util.reset_file_offset(http_response.body):
        http_response.body.write(b'kitbit')

    write_exchange(http_request, http_response)

    # Exchange for a new URL with a longer reason phrase and the same body.
    http_request = HTTPRequest('http://example.com/horse')
    http_request.address = ('0.0.0.0', 80)
    http_response = HTTPResponse(200, 'OKaaaaaaaaaaaaaaaaaaaaaaaaaa')
    http_response.body = Body()

    with wpull.util.reset_file_offset(http_response.body):
        http_response.body.write(b'kitbit')

    write_exchange(http_request, http_response)

    _logger.info('FINISHED')

    recorder.close()

    with open('asdf.warc', 'rb') as warc_handle:
        warc_data = warc_handle.read()

    with open('asdf.cdx', 'rb') as cdx_handle:
        cdx_data = cdx_handle.read()

    self.assertTrue(warc_data.startswith(b'WARC/1.0'))

    size_text = str(bare_response_size).encode('ascii')
    required_warc_lines = (
        b'WARC-Type: revisit\r\n',
        (b'WARC-Refers-To: '
         b'<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>\r\n'),
        b'WARC-Truncated: length\r\n',
        (b'WARC-Profile: http://netpreserve.org/warc/1.0/revisit/'
         b'identical-payload-digest\r\n'),
        b'Content-Length: ' + size_text + b'\r\n',
        b'WARC-Target-URI: http://example.com/fennec\r\n',
        b'WARC-Target-URI: http://example.com/horse\r\n',
    )
    # Iterated in declaration order so failures surface in a fixed sequence.
    for required_line in required_warc_lines:
        self.assertIn(required_line, warc_data)

    # Two exchanges carried the payload; only one copy may be on disk.
    self.assertEqual(1, warc_data.count(b'kitbit'))

    self.assertIn(b'http://example.com/horse ', cdx_data)