def test_css_converter(self):
    """Run ``CSSConverter`` on a stylesheet and check its ``url()`` links.

    ``styles.css`` and ``image.png`` are marked as done in the URL table
    with filenames relative to the temporary directory, while ``cat.jpg``
    is only added.  The converted file must contain ``url('image.png')``
    and ``url('http://example.com/cat.jpg')``.
    """
    url_table = URLTable()
    work_dir = self.temp_dir.name

    css_filename = os.path.join(work_dir, 'styles.css')
    image_filename = os.path.join(work_dir, 'image.png')
    new_css_filename = os.path.join(work_dir, 'styles.css-new')

    # Note: cat.jpg appears twice in the batch handed to add_many().
    added_urls = (
        'http://example.com/styles.css',
        'http://example.com/image.png',
        'http://example.com/cat.jpg',
        'http://example.com/cat.jpg',
    )
    url_table.add_many([{'url': url} for url in added_urls])

    url_table.update_one(
        'http://example.com/styles.css',
        status=Status.done,
        link_type='css',
        filename=os.path.relpath(css_filename, work_dir),
    )
    url_table.update_one(
        'http://example.com/image.png',
        status=Status.done,
        filename=os.path.relpath(image_filename, work_dir),
    )

    with open(css_filename, 'w') as css_file:
        css_file.write(CSS_TEXT)

    # The image only has to exist on disk; it is created empty.
    open(image_filename, 'wb').close()

    converter = CSSConverter(url_table)
    converter.convert(
        css_filename,
        new_css_filename,
        base_url='http://example.com/styles.css',
    )

    with open(new_css_filename, 'r') as converted_file:
        converted_text = converted_file.read()

    expected_snippets = (
        "url('image.png')",
        "url('http://example.com/cat.jpg')",
    )
    for snippet in expected_snippets:
        self.assertIn(snippet, converted_text)
def test_css_converter(self):
    """Run ``CSSConverter`` on a stylesheet and check its ``url()`` links.

    URLs are added as ``AddURLInfo`` entries.  ``styles.css`` and
    ``image.png`` are then marked as done (``Status.done.value``) with
    filenames relative to the temporary directory, while ``cat.jpg`` is
    only added.  The converted file must contain ``url('image.png')`` and
    ``url('http://example.com/cat.jpg')``.
    """
    url_table = URLTable()
    work_dir = self.temp_dir.name

    css_filename = os.path.join(work_dir, "styles.css")
    image_filename = os.path.join(work_dir, "image.png")
    new_css_filename = os.path.join(work_dir, "styles.css-new")

    # Note: cat.jpg appears twice in the batch handed to add_many().
    added_urls = (
        "http://example.com/styles.css",
        "http://example.com/image.png",
        "http://example.com/cat.jpg",
        "http://example.com/cat.jpg",
    )
    url_table.add_many([AddURLInfo(url, None, None) for url in added_urls])

    done = Status.done.value
    url_table.update_one(
        "http://example.com/styles.css",
        status=done,
        link_type="css",
        filename=os.path.relpath(css_filename, work_dir),
    )
    url_table.update_one(
        "http://example.com/image.png",
        status=done,
        filename=os.path.relpath(image_filename, work_dir),
    )

    with open(css_filename, "w") as css_file:
        css_file.write(CSS_TEXT)

    # The image only has to exist on disk; it is created empty.
    open(image_filename, "wb").close()

    converter = CSSConverter(url_table)
    converter.convert(
        css_filename,
        new_css_filename,
        base_url="http://example.com/styles.css",
    )

    with open(new_css_filename, "r") as converted_file:
        converted_text = converted_file.read()

    expected_snippets = (
        "url('image.png')",
        "url('http://example.com/cat.jpg')",
    )
    for snippet in expected_snippets:
        self.assertIn(snippet, converted_text)
def test_xhtml_converter(self):
    """Run ``HTMLConverter`` on the XHTML fixture and check rewritten links.

    ``image.png``, ``tubes.html`` and ``ferret.jpg`` are marked as done with
    filenames relative to the temporary directory and exist there as empty
    files; ``styles.css`` is marked done without a filename, and
    ``cat.jpg`` and ``fox.jpg`` are only added.  The converted document is
    checked for the relative references, the unchanged absolute URLs, the
    text ``hello world!!`` and a self-closed ``<hr/>``.
    """
    url_table = URLTable()
    work_dir = self.temp_dir.name

    image_filename = os.path.join(work_dir, 'image.png')
    tubes_filename = os.path.join(work_dir, 'tubes.html')
    ferret_filename = os.path.join(work_dir, 'ferret.jpg')

    added_urls = (
        'http://example.com/styles.css',
        'http://example.com/image.png',
        'http://example.com/cat.jpg',
        'http://example.com/fox.jpg',
        'http://example.com/ferret.jpg',
        'http://example.com/tubes.html',
    )
    url_table.add_many([{'url': url} for url in added_urls])

    url_table.update_one(
        'http://example.com/styles.css',
        status=Status.done,
        link_type='css',
    )

    # (URL, local path) pairs for the resources that have a saved file.
    saved_resources = (
        ('http://example.com/image.png', image_filename),
        ('http://example.com/tubes.html', tubes_filename),
        ('http://example.com/ferret.jpg', ferret_filename),
    )
    for url, local_path in saved_resources:
        url_table.update_one(
            url,
            status=Status.done,
            filename=os.path.relpath(local_path, work_dir),
        )

    html_filename = os.path.join(work_dir, 'index.html')
    new_html_filename = os.path.join(work_dir, 'index.html-new')

    with open(html_filename, 'w') as html_file:
        html_file.write(XHTML_TEXT)

    # The linked files only have to exist on disk; they are created empty.
    for filename in (image_filename, tubes_filename, ferret_filename):
        open(filename, 'wb').close()

    element_walker = ElementWalker(css_scraper=CSSScraper())
    html_parser = self.get_html_parser()
    converter = HTMLConverter(html_parser, element_walker, url_table)
    converter.convert(
        html_filename,
        new_html_filename,
        base_url='http://example.com/index.html',
    )

    with open(new_html_filename, 'r') as converted_file:
        converted_text = converted_file.read()

    expected_snippets = (
        "url('image.png')",
        "url('http://example.com/cat.jpg')",
        '"tubes.html"',
        '"http://example.com/lol.html"',
        "url('http://example.com/fox.jpg')",
        "url('ferret.jpg')",
        "hello world!!",
        "<hr/>",
    )
    for snippet in expected_snippets:
        self.assertIn(snippet, converted_text)
def test_xhtml_converter(self):
    """Run ``HTMLConverter`` on the XHTML fixture and check rewritten links.

    URLs are added as ``AddURLInfo`` entries.  ``image.png``,
    ``tubes.html`` and ``ferret.jpg`` are marked as done
    (``Status.done.value``) with filenames relative to the temporary
    directory and exist there as empty files; ``styles.css`` is marked done
    without a filename, and ``cat.jpg`` and ``fox.jpg`` are only added.
    The converted document is checked for the relative references, the
    unchanged absolute URLs, the text ``hello world!!`` and a self-closed
    ``<hr/>``.
    """
    url_table = URLTable()
    work_dir = self.temp_dir.name

    image_filename = os.path.join(work_dir, "image.png")
    tubes_filename = os.path.join(work_dir, "tubes.html")
    ferret_filename = os.path.join(work_dir, "ferret.jpg")

    added_urls = (
        "http://example.com/styles.css",
        "http://example.com/image.png",
        "http://example.com/cat.jpg",
        "http://example.com/fox.jpg",
        "http://example.com/ferret.jpg",
        "http://example.com/tubes.html",
    )
    url_table.add_many([AddURLInfo(url, None, None) for url in added_urls])

    done = Status.done.value
    url_table.update_one(
        "http://example.com/styles.css", status=done, link_type="css"
    )

    # (URL, local path) pairs for the resources that have a saved file.
    saved_resources = (
        ("http://example.com/image.png", image_filename),
        ("http://example.com/tubes.html", tubes_filename),
        ("http://example.com/ferret.jpg", ferret_filename),
    )
    for url, local_path in saved_resources:
        url_table.update_one(
            url,
            status=done,
            filename=os.path.relpath(local_path, work_dir),
        )

    html_filename = os.path.join(work_dir, "index.html")
    new_html_filename = os.path.join(work_dir, "index.html-new")

    with open(html_filename, "w") as html_file:
        html_file.write(XHTML_TEXT)

    # The linked files only have to exist on disk; they are created empty.
    for filename in (image_filename, tubes_filename, ferret_filename):
        open(filename, "wb").close()

    element_walker = ElementWalker(css_scraper=CSSScraper())
    html_parser = self.get_html_parser()
    converter = HTMLConverter(html_parser, element_walker, url_table)
    converter.convert(
        html_filename,
        new_html_filename,
        base_url="http://example.com/index.html",
    )

    with open(new_html_filename, "r") as converted_file:
        converted_text = converted_file.read()

    expected_snippets = (
        "url('image.png')",
        "url('http://example.com/cat.jpg')",
        '"tubes.html"',
        '"http://example.com/lol.html"',
        "url('http://example.com/fox.jpg')",
        "url('ferret.jpg')",
        "hello world!!",
        "<hr/>",
    )
    for snippet in expected_snippets:
        self.assertIn(snippet, converted_text)
def test_cdx_dedup(self):
    """Record two responses with the same payload and inspect the output.

    A visit for ``http://example.com/fennec`` is registered in the URL
    table before recording.  Responses for ``/fennec`` and ``/horse``, both
    with the body ``kitbit``, are then recorded.  The test asserts that the
    WARC file contains a revisit record referring to the registered record
    ID, that ``kitbit`` is stored exactly once, and that the CDX file
    lists the ``/horse`` URL.
    """
    url_table = URLTable()
    recorder_params = WARCRecorderParams(
        compress=False, cdx=True, url_table=url_table
    )
    warc_recorder = WARCRecorder('asdf', params=recorder_params)

    # Presumably the third field is the payload digest of b'kitbit' --
    # TODO confirm against URLTable.add_visits().
    url_table.add_visits([
        (
            'http://example.com/fennec',
            '<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>',
            'B62D734VFEKIDLFAB7TTSCSZF64BKAYJ',
        ),
    ])

    def record_exchange(http_request, http_response):
        """Feed one request/response pair through a recorder session."""
        with warc_recorder.session() as session:
            session.pre_request(http_request)
            session.request_data(http_request.to_bytes())
            session.request(http_request)
            session.pre_response(http_response)
            session.response_data(http_response.to_bytes())
            session.response_data(http_response.body.content())
            session.response(http_response)

    fennec_request = HTTPRequest('http://example.com/fennec')
    fennec_request.address = ('0.0.0.0', 80)
    fennec_response = HTTPResponse(200, 'OK')
    fennec_response.body = Body()
    # Taken before any payload is written to the body.
    revisit_response_header_size = len(fennec_response.to_bytes())

    with wpull.util.reset_file_offset(fennec_response.body):
        fennec_response.body.write(b'kitbit')

    record_exchange(fennec_request, fennec_response)

    horse_request = HTTPRequest('http://example.com/horse')
    horse_request.address = ('0.0.0.0', 80)
    horse_response = HTTPResponse(200, 'OKaaaaaaaaaaaaaaaaaaaaaaaaaa')
    horse_response.body = Body()

    with wpull.util.reset_file_offset(horse_response.body):
        horse_response.body.write(b'kitbit')

    record_exchange(horse_request, horse_response)

    _logger.info('FINISHED')

    warc_recorder.close()

    with open('asdf.warc', 'rb') as warc_file:
        warc_file_content = warc_file.read()

    with open('asdf.cdx', 'rb') as cdx_file:
        cdx_file_content = cdx_file.read()

    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))

    content_length_line = 'Content-Length: {}\r\n'.format(
        revisit_response_header_size
    ).encode('ascii')
    expected_warc_lines = (
        b'WARC-Type: revisit\r\n',
        b'WARC-Refers-To: '
        b'<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>\r\n',
        b'WARC-Truncated: length\r\n',
        b'WARC-Profile: http://netpreserve.org/warc/1.0/revisit/'
        b'identical-payload-digest\r\n',
        content_length_line,
        b'WARC-Target-URI: http://example.com/fennec\r\n',
        b'WARC-Target-URI: http://example.com/horse\r\n',
    )
    for expected_line in expected_warc_lines:
        self.assertIn(expected_line, warc_file_content)

    self.assertEqual(1, warc_file_content.count(b'kitbit'))

    self.assertIn(b'http://example.com/horse ', cdx_file_content)
def test_cdx_dedup(self):
    """Record two responses with the same payload and inspect the output.

    A visit for ``http://example.com/fennec`` is registered in the URL
    table before recording.  Responses for ``/fennec`` and ``/horse``, both
    with the body ``kitbit``, are then recorded through sessions obtained
    from ``new_http_recorder_session()``.  The test asserts that the WARC
    file contains a revisit record referring to the registered record ID,
    that ``kitbit`` is stored exactly once, and that the CDX file lists the
    ``/horse`` URL.
    """
    url_table = URLTable()
    recorder_params = WARCRecorderParams(
        compress=False, cdx=True, url_table=url_table
    )
    warc_recorder = WARCRecorder('asdf', params=recorder_params)

    # Presumably the third field is the payload digest of b'kitbit' --
    # TODO confirm against URLTable.add_visits().
    url_table.add_visits([
        (
            'http://example.com/fennec',
            '<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>',
            'B62D734VFEKIDLFAB7TTSCSZF64BKAYJ',
        ),
    ])

    def record_exchange(http_request, http_response):
        """Feed one request/response pair through a recorder session."""
        session = warc_recorder.new_http_recorder_session()
        session.begin_request(http_request)
        session.request_data(http_request.to_bytes())
        session.end_request(http_request)
        session.begin_response(http_response)
        session.response_data(http_response.to_bytes())
        session.response_data(http_response.body.content())
        session.end_response(http_response)
        session.close()

    fennec_request = HTTPRequest('http://example.com/fennec')
    fennec_request.address = ('0.0.0.0', 80)
    fennec_response = HTTPResponse(200, 'OK')
    fennec_response.body = Body()
    # Taken before any payload is written to the body.
    revisit_response_header_size = len(fennec_response.to_bytes())

    with wpull.util.reset_file_offset(fennec_response.body):
        fennec_response.body.write(b'kitbit')

    record_exchange(fennec_request, fennec_response)

    horse_request = HTTPRequest('http://example.com/horse')
    horse_request.address = ('0.0.0.0', 80)
    horse_response = HTTPResponse(200, 'OKaaaaaaaaaaaaaaaaaaaaaaaaaa')
    horse_response.body = Body()

    with wpull.util.reset_file_offset(horse_response.body):
        horse_response.body.write(b'kitbit')

    record_exchange(horse_request, horse_response)

    _logger.info('FINISHED')

    warc_recorder.close()

    with open('asdf.warc', 'rb') as warc_file:
        warc_file_content = warc_file.read()

    with open('asdf.cdx', 'rb') as cdx_file:
        cdx_file_content = cdx_file.read()

    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))

    content_length_line = 'Content-Length: {}\r\n'.format(
        revisit_response_header_size
    ).encode('ascii')
    expected_warc_lines = (
        b'WARC-Type: revisit\r\n',
        b'WARC-Refers-To: '
        b'<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>\r\n',
        b'WARC-Truncated: length\r\n',
        b'WARC-Profile: http://netpreserve.org/warc/1.0/revisit/'
        b'identical-payload-digest\r\n',
        content_length_line,
        b'WARC-Target-URI: http://example.com/fennec\r\n',
        b'WARC-Target-URI: http://example.com/horse\r\n',
    )
    for expected_line in expected_warc_lines:
        self.assertIn(expected_line, warc_file_content)

    self.assertEqual(1, warc_file_content.count(b'kitbit'))

    self.assertIn(b'http://example.com/horse ', cdx_file_content)