def test_css_converter(self):
    """Run ``CSSConverter`` over ``CSS_TEXT`` and check the rewritten links.

    ``styles.css`` and ``image.png`` are marked as done in the URL table
    with filenames relative to the temporary directory, while ``cat.jpg``
    is only added.  The converted stylesheet is expected to contain
    ``url('image.png')`` and ``url('http://example.com/cat.jpg')``.
    """
    css_url = 'http://example.com/styles.css'
    image_url = 'http://example.com/image.png'
    cat_url = 'http://example.com/cat.jpg'

    with cd_tempdir() as temp_dir:
        url_table = URLTable()

        css_filename = os.path.join(temp_dir, 'styles.css')
        image_filename = os.path.join(temp_dir, 'image.png')
        new_css_filename = os.path.join(temp_dir, 'styles.css-new')

        # cat.jpg is intentionally listed twice, as in the fixture data.
        url_table.add([css_url, image_url, cat_url, cat_url])

        css_relative = os.path.relpath(css_filename, temp_dir)
        url_table.update(
            css_url,
            status=Status.done,
            link_type='css',
            filename=css_relative,
        )

        image_relative = os.path.relpath(image_filename, temp_dir)
        url_table.update(
            image_url,
            status=Status.done,
            filename=image_relative,
        )

        with open(css_filename, 'w') as css_file:
            css_file.write(CSS_TEXT)

        # Create an empty placeholder for the image on disk.
        with open(image_filename, 'wb'):
            pass

        CSSConverter(url_table).convert(
            css_filename,
            new_css_filename,
            base_url=css_url,
        )

        with open(new_css_filename, 'r') as converted_file:
            converted_text = converted_file.read()

        expected_fragments = (
            "url('image.png')",
            "url('http://example.com/cat.jpg')",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, converted_text)
def test_css_converter(self):
    """Convert ``CSS_TEXT`` with a ``PathNamer``-backed ``CSSConverter``.

    ``styles.css`` and ``image.png`` are marked as done in the URL table;
    ``cat.jpg`` is only added.  The converted file must contain
    ``url('image.png')`` and ``url('http://example.com/cat.jpg')``.
    """
    stylesheet_url = 'http://example.com/styles.css'
    picture_url = 'http://example.com/image.png'
    cat_url = 'http://example.com/cat.jpg'

    with TemporaryDirectory() as temp_dir:
        path_namer = PathNamer(temp_dir)
        url_table = URLTable()

        # cat.jpg appears twice on purpose, mirroring the fixture data.
        url_table.add([stylesheet_url, picture_url, cat_url, cat_url])
        url_table.update(
            stylesheet_url,
            status=Status.done,
            link_type='css',
        )
        url_table.update(picture_url, status=Status.done)

        source_path = os.path.join(temp_dir, 'styles.css')
        target_path = os.path.join(temp_dir, 'styles.css-new')

        with open(source_path, 'w') as source_file:
            source_file.write(CSS_TEXT)

        converter = CSSConverter(path_namer, url_table)
        converter.convert(source_path, target_path, base_url=stylesheet_url)

        with open(target_path, 'r') as target_file:
            converted_text = target_file.read()

        for fragment in (
            "url('image.png')",
            "url('http://example.com/cat.jpg')",
        ):
            self.assertIn(fragment, converted_text)
def test_css_converter(self):
    """Check the links produced by ``CSSConverter`` for ``CSS_TEXT``.

    The URL table holds ``styles.css`` (done, link type ``css``),
    ``image.png`` (done) and ``cat.jpg`` (added only).  After conversion
    the output is expected to contain ``url('image.png')`` and
    ``url('http://example.com/cat.jpg')``.
    """
    base_url = 'http://example.com/styles.css'
    known_urls = [
        base_url,
        'http://example.com/image.png',
        'http://example.com/cat.jpg',
        'http://example.com/cat.jpg',
    ]

    with TemporaryDirectory() as temp_dir:
        path_namer = PathNamer(temp_dir)

        url_table = URLTable()
        url_table.add(known_urls)
        url_table.update(base_url, status=Status.done, link_type='css')
        url_table.update('http://example.com/image.png', status=Status.done)

        css_filename = os.path.join(temp_dir, 'styles.css')
        new_css_filename = os.path.join(temp_dir, 'styles.css-new')

        with open(css_filename, 'w') as writer:
            writer.write(CSS_TEXT)

        CSSConverter(path_namer, url_table).convert(
            css_filename,
            new_css_filename,
            base_url=base_url,
        )

        with open(new_css_filename, 'r') as reader:
            converted_text = reader.read()

        self.assertIn("url('image.png')", converted_text)
        self.assertIn("url('http://example.com/cat.jpg')", converted_text)
def test_cdx_dedup(self):
    """Record two responses with the same body and check the WARC/CDX output.

    A visit for ``http://example.com/fennec`` is registered in the URL
    table before recording.  Both recorded responses carry the body
    ``b'kitbit'``.  The test then asserts that ``asdf.warc`` contains a
    ``revisit`` record referring to the registered record ID, that the
    body bytes occur exactly once in the WARC file, and that
    ``asdf.cdx`` lists the ``horse`` URL.
    """
    def record_exchange(request, response):
        # Feed one request/response pair through a recorder session.
        with warc_recorder.session() as session:
            session.pre_request(request)
            session.request_data(request.header())
            session.request(request)
            session.pre_response(response)
            session.response_data(response.header())
            session.response_data(response.body.content)
            session.response(response)

    url_table = URLTable()
    recorder_params = WARCRecorderParams(
        compress=False, cdx=True, url_table=url_table
    )
    warc_recorder = WARCRecorder('asdf', params=recorder_params)

    # NOTE(review): the third field is presumably the payload digest of
    # b'kitbit' -- confirm against URLTable.add_visits.
    previous_visit = (
        'http://example.com/fennec',
        '<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>',
        'B62D734VFEKIDLFAB7TTSCSZF64BKAYJ',
    )
    url_table.add_visits([previous_visit])

    fennec_request = Request.new('http://example.com/fennec')
    fennec_request.address = ('0.0.0.0', 80)
    fennec_response = Response('HTTP/1.1', '200', 'OK')
    # Measured before the body is written, as in the original ordering.
    revisit_response_header_size = len(fennec_response.header())

    with wpull.util.reset_file_offset(fennec_response.body.content_file):
        fennec_response.body.content_file.write(b'kitbit')

    record_exchange(fennec_request, fennec_response)

    horse_request = Request.new('http://example.com/horse')
    horse_request.address = ('0.0.0.0', 80)
    horse_response = Response(
        'HTTP/1.1', '200', 'OKaaaaaaaaaaaaaaaaaaaaaaaaaa'
    )

    with wpull.util.reset_file_offset(horse_response.body.content_file):
        horse_response.body.content_file.write(b'kitbit')

    record_exchange(horse_request, horse_response)

    _logger.info('FINISHED')

    warc_recorder.close()

    with open('asdf.warc', 'rb') as warc_file:
        warc_file_content = warc_file.read()

    with open('asdf.cdx', 'rb') as cdx_file:
        cdx_file_content = cdx_file.read()

    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))

    content_length_line = (
        b'Content-Length: '
        + str(revisit_response_header_size).encode('ascii')
        + b'\r\n'
    )
    expected_warc_fragments = (
        b'WARC-Type: revisit\r\n',
        b'WARC-Refers-To: '
        b'<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>\r\n',
        b'WARC-Truncated: length\r\n',
        b'WARC-Profile: http://netpreserve.org/warc/1.0/revisit/'
        b'identical-payload-digest\r\n',
        content_length_line,
        b'WARC-Target-URI: http://example.com/fennec\r\n',
        b'WARC-Target-URI: http://example.com/horse\r\n',
    )
    for fragment in expected_warc_fragments:
        self.assertIn(fragment, warc_file_content)

    # The duplicate body must be stored only once.
    self.assertEqual(1, warc_file_content.count(b'kitbit'))

    self.assertIn(b'http://example.com/horse ', cdx_file_content)
def test_html_converter(self):
    """Convert ``HTML_TEXT`` with ``HTMLConverter`` and check the links.

    ``styles.css``, ``image.png``, ``tubes.html`` and ``ferret.jpg`` are
    marked as done in the URL table; ``cat.jpg`` and ``fox.jpg`` are only
    added.  The assertions expect the done resources to appear by bare
    filename and the remaining ones (plus ``lol.html``) as absolute URLs.
    """
    styles_url = 'http://example.com/styles.css'
    image_url = 'http://example.com/image.png'
    ferret_url = 'http://example.com/ferret.jpg'
    tubes_url = 'http://example.com/tubes.html'

    with TemporaryDirectory() as temp_dir:
        path_namer = PathNamer(temp_dir)
        url_table = URLTable()

        url_table.add([
            styles_url,
            image_url,
            'http://example.com/cat.jpg',
            'http://example.com/fox.jpg',
            ferret_url,
            tubes_url,
        ])

        url_table.update(styles_url, status=Status.done, link_type='css')
        for done_url in (image_url, tubes_url, ferret_url):
            url_table.update(done_url, status=Status.done)

        html_filename = os.path.join(temp_dir, 'index.html')
        new_html_filename = os.path.join(temp_dir, 'index.html-new')

        with open(html_filename, 'w') as html_file:
            html_file.write(HTML_TEXT)

        HTMLConverter(path_namer, url_table).convert(
            html_filename,
            new_html_filename,
            base_url='http://example.com/index.html',
        )

        with open(new_html_filename, 'r') as converted_file:
            converted_text = converted_file.read()

        expected_fragments = (
            "url('image.png')",
            "url('http://example.com/cat.jpg')",
            '"tubes.html"',
            '"http://example.com/lol.html"',
            "url('http://example.com/fox.jpg')",
            "url('ferret.jpg')",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, converted_text)
def test_xhtml_converter(self):
    """Convert ``XHTML_TEXT`` with ``HTMLConverter`` and check the output.

    ``image.png``, ``tubes.html`` and ``ferret.jpg`` are marked as done
    with filenames relative to the temporary directory and exist on disk
    as empty files; ``styles.css`` is marked as done without a filename.
    Besides the link assertions, the output must still contain
    ``hello world!!`` and ``<hr/>``.
    """
    with cd_tempdir() as temp_dir:
        url_table = URLTable()

        image_filename = os.path.join(temp_dir, 'image.png')
        tubes_filename = os.path.join(temp_dir, 'tubes.html')
        ferret_filename = os.path.join(temp_dir, 'ferret.jpg')

        url_table.add([
            'http://example.com/styles.css',
            'http://example.com/image.png',
            'http://example.com/cat.jpg',
            'http://example.com/fox.jpg',
            'http://example.com/ferret.jpg',
            'http://example.com/tubes.html',
        ])

        url_table.update(
            'http://example.com/styles.css',
            status=Status.done,
            link_type='css',
        )

        saved_resources = (
            ('http://example.com/image.png', image_filename),
            ('http://example.com/tubes.html', tubes_filename),
            ('http://example.com/ferret.jpg', ferret_filename),
        )
        for resource_url, local_path in saved_resources:
            url_table.update(
                resource_url,
                status=Status.done,
                filename=os.path.relpath(local_path, temp_dir),
            )

        html_filename = os.path.join(temp_dir, 'index.html')
        new_html_filename = os.path.join(temp_dir, 'index.html-new')

        with open(html_filename, 'w') as html_file:
            html_file.write(XHTML_TEXT)

        # Create empty placeholder files for the saved resources.
        for placeholder in (image_filename, tubes_filename, ferret_filename):
            with open(placeholder, 'wb'):
                pass

        HTMLConverter(url_table).convert(
            html_filename,
            new_html_filename,
            base_url='http://example.com/index.html',
        )

        with open(new_html_filename, 'r') as converted_file:
            converted_text = converted_file.read()

        expected_fragments = (
            "url('image.png')",
            "url('http://example.com/cat.jpg')",
            '"tubes.html"',
            '"http://example.com/lol.html"',
            "url('http://example.com/fox.jpg')",
            "url('ferret.jpg')",
            "hello world!!",
            "<hr/>",
        )
        for fragment in expected_fragments:
            self.assertIn(fragment, converted_text)
def test_html_converter(self):
    """Check the links ``HTMLConverter`` writes for ``HTML_TEXT``.

    Four of the six URLs added to the table (``styles.css``,
    ``image.png``, ``tubes.html``, ``ferret.jpg``) are marked as done.
    The converted document is read back and checked for six expected
    link fragments.
    """
    base_url = 'http://example.com/index.html'
    all_urls = [
        'http://example.com/styles.css',
        'http://example.com/image.png',
        'http://example.com/cat.jpg',
        'http://example.com/fox.jpg',
        'http://example.com/ferret.jpg',
        'http://example.com/tubes.html',
    ]

    with TemporaryDirectory() as temp_dir:
        path_namer = PathNamer(temp_dir)

        url_table = URLTable()
        url_table.add(all_urls)
        url_table.update(
            'http://example.com/styles.css',
            status=Status.done,
            link_type='css',
        )
        url_table.update('http://example.com/image.png', status=Status.done)
        url_table.update('http://example.com/tubes.html', status=Status.done)
        url_table.update('http://example.com/ferret.jpg', status=Status.done)

        source_path = os.path.join(temp_dir, 'index.html')
        target_path = os.path.join(temp_dir, 'index.html-new')

        with open(source_path, 'w') as writer:
            writer.write(HTML_TEXT)

        converter = HTMLConverter(path_namer, url_table)
        converter.convert(source_path, target_path, base_url=base_url)

        with open(target_path, 'r') as reader:
            converted_text = reader.read()

        self.assertIn("url('image.png')", converted_text)
        self.assertIn("url('http://example.com/cat.jpg')", converted_text)
        self.assertIn('"tubes.html"', converted_text)
        self.assertIn('"http://example.com/lol.html"', converted_text)
        self.assertIn("url('http://example.com/fox.jpg')", converted_text)
        self.assertIn("url('ferret.jpg')", converted_text)
def test_cdx_dedup(self):
    """Check that a repeated body is written as a WARC ``revisit`` record.

    The URL table is primed with a visit for ``http://example.com/fennec``.
    Two responses with the body ``b'kitbit'`` are then recorded (for
    ``fennec`` and ``horse``).  The resulting ``asdf.warc`` must contain
    the revisit headers and the body bytes exactly once, and ``asdf.cdx``
    must mention the ``horse`` URL.
    """
    payload = b'kitbit'
    peer_address = ('0.0.0.0', 80)

    def replay(request, response):
        # Drive one full request/response cycle through the recorder.
        with warc_recorder.session() as session:
            session.pre_request(request)
            session.request_data(request.header())
            session.request(request)
            session.pre_response(response)
            session.response_data(response.header())
            session.response_data(response.body.content)
            session.response(response)

    url_table = URLTable()
    warc_recorder = WARCRecorder(
        'asdf',
        params=WARCRecorderParams(
            compress=False,
            cdx=True,
            url_table=url_table,
        ),
    )

    url_table.add_visits([
        (
            'http://example.com/fennec',
            '<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>',
            'B62D734VFEKIDLFAB7TTSCSZF64BKAYJ',
        ),
    ])

    request = Request.new('http://example.com/fennec')
    request.address = peer_address
    response = Response('HTTP/1.1', '200', 'OK')
    # Header length is captured before any body data is written.
    revisit_response_header_size = len(response.header())

    body_file = response.body.content_file
    with wpull.util.reset_file_offset(body_file):
        response.body.content_file.write(payload)

    replay(request, response)

    request = Request.new('http://example.com/horse')
    request.address = peer_address
    response = Response('HTTP/1.1', '200', 'OKaaaaaaaaaaaaaaaaaaaaaaaaaa')

    body_file = response.body.content_file
    with wpull.util.reset_file_offset(body_file):
        response.body.content_file.write(payload)

    replay(request, response)

    _logger.info('FINISHED')

    warc_recorder.close()

    with open('asdf.warc', 'rb') as warc_in:
        warc_file_content = warc_in.read()

    with open('asdf.cdx', 'rb') as cdx_in:
        cdx_file_content = cdx_in.read()

    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
    self.assertIn(b'WARC-Type: revisit\r\n', warc_file_content)

    refers_to_line = (
        b'WARC-Refers-To: '
        b'<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>\r\n'
    )
    self.assertIn(refers_to_line, warc_file_content)
    self.assertIn(b'WARC-Truncated: length\r\n', warc_file_content)

    profile_line = (
        b'WARC-Profile: http://netpreserve.org/warc/1.0/revisit/'
        b'identical-payload-digest\r\n'
    )
    self.assertIn(profile_line, warc_file_content)

    header_size_text = str(revisit_response_header_size).encode('ascii')
    self.assertIn(
        b'Content-Length: ' + header_size_text + b'\r\n',
        warc_file_content,
    )

    for target_line in (
        b'WARC-Target-URI: http://example.com/fennec\r\n',
        b'WARC-Target-URI: http://example.com/horse\r\n',
    ):
        self.assertIn(target_line, warc_file_content)

    # The shared payload must appear only once in the archive.
    self.assertEqual(1, warc_file_content.count(b'kitbit'))
    self.assertIn(b'http://example.com/horse ', cdx_file_content)