def test_request(self):
    """Check the serialized bytes of a prepared GET request for /robots.txt.

    The request is built from ``http://example.com/robots.txt`` and must
    serialize to a request line, a ``Host: example.com`` header and a blank
    line, with nothing else.
    """
    request = Request('http://example.com/robots.txt')
    request.prepare_for_send()

    expected_bytes = (
        b'GET /robots.txt HTTP/1.1\r\n'
        b'Host: example.com\r\n'
        b'\r\n'
    )

    self.assertEqual(expected_bytes, request.to_bytes())
def test_warc_recorder(self):
    """Record one request/response pair and verify the WARC and CDX files.

    A ``WARCRecorder`` is configured for uncompressed output with a CDX
    index and one extra warcinfo field. A single request/response exchange
    is pushed through a recorder session, the recorder is closed, and the
    resulting ``asdf.warc`` / ``asdf.cdx`` files are read back and checked
    for the expected WARC headers, payload bytes, log record and CDX
    fields. Finally the CDX offset/length pair is used to seek into the
    WARC file and confirm it points at the record holding the body.
    """
    file_prefix = 'asdf'
    warc_filename = 'asdf.warc'
    cdx_filename = 'asdf.cdx'

    warc_recorder = WARCRecorder(
        file_prefix,
        params=WARCRecorderParams(
            compress=False,
            extra_fields=[('Extra-field', 'my_extra_field')],
            cdx=True,
        ),
    )

    request = HTTPRequest('http://example.com/')
    request.prepare_for_send()
    request.address = ('0.0.0.0', 80)
    # NOTE(review): prepare_for_send() runs both before and after the
    # address is assigned; kept as in the original -- confirm whether the
    # first call is actually needed.
    request.prepare_for_send()
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'KITTEH DOGE')

    with warc_recorder.session() as session:
        session.pre_request(request)
        session.request_data(request.to_bytes())
        session.request(request)
        session.pre_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.response(response)

    # This message is expected to show up in the WARC file (asserted below).
    _logger.info('FINISHED')

    warc_recorder.close()

    with open(warc_filename, 'rb') as in_file:
        warc_file_content = in_file.read()

    with open(cdx_filename, 'rb') as in_file:
        cdx_file_content = in_file.read()

    # --- WARC file content -------------------------------------------------
    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
    self.assertIn(b'WARC-Type: warcinfo\r\n', warc_file_content)
    self.assertIn(b'Content-Type: application/warc-fields',
                  warc_file_content)
    self.assertIn(b'WARC-Date: ', warc_file_content)
    self.assertIn(b'WARC-Record-ID: <urn:uuid:', warc_file_content)
    self.assertIn(b'WARC-Block-Digest: sha1:', warc_file_content)
    self.assertIn(b'WARC-Payload-Digest: sha1:', warc_file_content)
    self.assertIn(b'WARC-Type: request\r\n', warc_file_content)
    self.assertIn(b'WARC-Target-URI: http://', warc_file_content)
    self.assertIn(b'Content-Type: application/http;msgtype=request',
                  warc_file_content)
    self.assertIn(b'WARC-Type: response', warc_file_content)
    self.assertIn(b'WARC-Concurrent-To: <urn:uuid:', warc_file_content)
    self.assertIn(b'Content-Type: application/http;msgtype=response',
                  warc_file_content)
    self.assertIn(
        'Wpull/{0}'.format(wpull.version.__version__).encode('utf-8'),
        warc_file_content
    )
    self.assertIn(
        'Python/{0}'.format(wpull.util.python_version()).encode('utf-8'),
        warc_file_content
    )
    self.assertIn(b'Extra-Field: my_extra_field', warc_file_content)
    self.assertIn(b'GET / HTTP', warc_file_content)
    self.assertIn(b'KITTEH DOGE', warc_file_content)
    self.assertIn(b'FINISHED', warc_file_content)
    self.assertIn(b'WARC-Target-URI: urn:X-wpull:log', warc_file_content)
    self.assertIn(b'Content-Length:', warc_file_content)
    self.assertNotIn(b'Content-Length: 0', warc_file_content)

    # --- CDX file content --------------------------------------------------
    cdx_lines = cdx_file_content.split(b'\n')

    # Check the line count before indexing so that a short CDX file fails
    # with a readable assertion (which shows the lines) instead of an
    # IndexError. The message replaces the former debugging print().
    self.assertEqual(3, len(cdx_lines), repr(cdx_lines))

    cdx_labels = cdx_lines[0].strip().split(b' ')
    cdx_fields = cdx_lines[1].split(b' ')

    self.assertEqual(10, len(cdx_labels))
    self.assertEqual(9, len(cdx_fields))
    self.assertTrue(cdx_lines[0].startswith(b' CDX'))

    self.assertEqual(b'http://example.com/', cdx_fields[0])
    self.assertEqual(b'-', cdx_fields[2])
    self.assertEqual(b'200', cdx_fields[3])
    self.assertNotEqual(b'-', cdx_fields[4])
    self.assertNotEqual(b'0', cdx_fields[5])
    self.assertNotEqual(b'0', cdx_fields[6])
    self.assertEqual(
        os.path.basename(warc_filename),
        cdx_fields[7].decode('ascii'))

    # Fields 5 and 6 are used as the record's length and offset within the
    # WARC file.
    length = int(cdx_fields[5])
    offset = int(cdx_fields[6])

    with open(warc_filename, 'rb') as in_file:
        in_file.seek(offset)
        data = in_file.read(length)

    # A unittest assertion (rather than a bare ``assert``) so the check is
    # still performed when Python runs with -O.
    self.assertEqual(length, len(data))

    self.assertEqual(b'WARC/1.0', data[:8])
    self.assertIn(b'KITTEH DOGE', data)

    self.validate_warc(warc_filename)
def test_warc_recorder(self):
    """Record one request/response pair and verify the WARC and CDX files.

    A ``WARCRecorder`` is configured for uncompressed output with a CDX
    index and one extra warcinfo field. A single request/response exchange
    is pushed through a recorder session, the recorder is closed, and the
    resulting ``asdf.warc`` / ``asdf.cdx`` files are read back and checked
    for the expected WARC headers, payload bytes, log record and CDX
    fields. Finally the CDX offset/length pair is used to seek into the
    WARC file and confirm it points at the record holding the body.
    """
    file_prefix = 'asdf'
    warc_filename = 'asdf.warc'
    cdx_filename = 'asdf.cdx'

    warc_recorder = WARCRecorder(
        file_prefix,
        params=WARCRecorderParams(
            compress=False,
            extra_fields=[('Extra-field', 'my_extra_field')],
            cdx=True,
        ),
    )

    request = HTTPRequest('http://example.com/')
    request.prepare_for_send()
    request.address = ('0.0.0.0', 80)
    # NOTE(review): prepare_for_send() runs both before and after the
    # address is assigned; kept as in the original -- confirm whether the
    # first call is actually needed.
    request.prepare_for_send()
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'KITTEH DOGE')

    with warc_recorder.session() as session:
        session.pre_request(request)
        session.request_data(request.to_bytes())
        session.request(request)
        session.pre_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.response(response)

    # This message is expected to show up in the WARC file (asserted below).
    _logger.info('FINISHED')

    warc_recorder.close()

    with open(warc_filename, 'rb') as in_file:
        warc_file_content = in_file.read()

    with open(cdx_filename, 'rb') as in_file:
        cdx_file_content = in_file.read()

    # --- WARC file content -------------------------------------------------
    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
    self.assertIn(b'WARC-Type: warcinfo\r\n', warc_file_content)
    self.assertIn(b'Content-Type: application/warc-fields',
                  warc_file_content)
    self.assertIn(b'WARC-Date: ', warc_file_content)
    self.assertIn(b'WARC-Record-ID: <urn:uuid:', warc_file_content)
    self.assertIn(b'WARC-Block-Digest: sha1:', warc_file_content)
    self.assertIn(b'WARC-Payload-Digest: sha1:', warc_file_content)
    self.assertIn(b'WARC-Type: request\r\n', warc_file_content)
    self.assertIn(b'WARC-Target-URI: http://', warc_file_content)
    self.assertIn(b'Content-Type: application/http;msgtype=request',
                  warc_file_content)
    self.assertIn(b'WARC-Type: response', warc_file_content)
    self.assertIn(b'WARC-Concurrent-To: <urn:uuid:', warc_file_content)
    self.assertIn(b'Content-Type: application/http;msgtype=response',
                  warc_file_content)
    self.assertIn(
        'Wpull/{0}'.format(wpull.version.__version__).encode('utf-8'),
        warc_file_content
    )
    self.assertIn(
        'Python/{0}'.format(
            wpull.util.python_version()).encode('utf-8'),
        warc_file_content
    )
    self.assertIn(b'Extra-Field: my_extra_field', warc_file_content)
    self.assertIn(b'GET / HTTP', warc_file_content)
    self.assertIn(b'KITTEH DOGE', warc_file_content)
    self.assertIn(b'FINISHED', warc_file_content)
    self.assertIn(b'WARC-Target-URI: urn:X-wpull:log', warc_file_content)
    self.assertIn(b'Content-Length:', warc_file_content)
    self.assertNotIn(b'Content-Length: 0', warc_file_content)

    # --- CDX file content --------------------------------------------------
    cdx_lines = cdx_file_content.split(b'\n')

    # Check the line count before indexing so that a short CDX file fails
    # with a readable assertion (which shows the lines) instead of an
    # IndexError. The message replaces the former debugging print().
    self.assertEqual(3, len(cdx_lines), repr(cdx_lines))

    cdx_labels = cdx_lines[0].strip().split(b' ')
    cdx_fields = cdx_lines[1].split(b' ')

    self.assertEqual(10, len(cdx_labels))
    self.assertEqual(9, len(cdx_fields))
    self.assertTrue(cdx_lines[0].startswith(b' CDX'))

    self.assertEqual(b'http://example.com/', cdx_fields[0])
    self.assertEqual(b'-', cdx_fields[2])
    self.assertEqual(b'200', cdx_fields[3])
    self.assertNotEqual(b'-', cdx_fields[4])
    self.assertNotEqual(b'0', cdx_fields[5])
    self.assertNotEqual(b'0', cdx_fields[6])
    self.assertEqual(
        os.path.basename(warc_filename),
        cdx_fields[7].decode('ascii'))

    # Fields 5 and 6 are used as the record's length and offset within the
    # WARC file.
    length = int(cdx_fields[5])
    offset = int(cdx_fields[6])

    with open(warc_filename, 'rb') as in_file:
        in_file.seek(offset)
        data = in_file.read(length)

    # A unittest assertion (rather than a bare ``assert``) so the check is
    # still performed when Python runs with -O.
    self.assertEqual(length, len(data))

    self.assertEqual(b'WARC/1.0', data[:8])
    self.assertIn(b'KITTEH DOGE', data)

    self.validate_warc(warc_filename)
def test_request_port(self):
    """Check that an explicit URL port is carried into the Host header.

    The request is built from ``https://example.com:4567/robots.txt`` and
    must serialize with ``Host: example.com:4567``.
    """
    request = Request('https://example.com:4567/robots.txt')
    request.prepare_for_send()

    expected_bytes = (
        b'GET /robots.txt HTTP/1.1\r\n'
        b'Host: example.com:4567\r\n'
        b'\r\n'
    )
    serialized = request.to_bytes()

    self.assertEqual(expected_bytes, serialized)