def test_to_dict_body(self):
    """The 'body' entry of to_dict() is truthy only when a real Body is attached."""
    request = Request()
    request.body = Body()
    request_dict = request.to_dict()
    self.assertTrue(request_dict['body'])
    request.body.close()

    # With no usable body, the serialized value must be falsy.
    request = Request()
    request.body = NotImplemented
    request_dict = request.to_dict()
    self.assertFalse(request_dict['body'])

    response = Response()
    response.body = Body()
    response_dict = response.to_dict()
    self.assertTrue(response_dict['body'])
    response.body.close()

    response = Response()
    response.body = NotImplemented
    response_dict = response.to_dict()
    self.assertFalse(response_dict['body'])
def test_to_dict_body(self):
    """The 'body' entry of to_dict() is truthy only when a real Body is attached."""
    request = Request()
    request.body = Body()
    request_dict = request.to_dict()
    self.assertTrue(request_dict['body'])
    request.body.close()

    # With no usable body, the serialized value must be falsy.
    request = Request()
    request.body = NotImplemented
    request_dict = request.to_dict()
    self.assertFalse(request_dict['body'])

    response = Response()
    response.body = Body()
    response_dict = response.to_dict()
    self.assertTrue(response_dict['body'])
    response.body.close()

    response = Response()
    response.body = NotImplemented
    response_dict = response.to_dict()
    self.assertFalse(response_dict['body'])
def test_javascript_heavy_inline_monstrosity(self):
    """The JS scraper digs URLs out of a large, messy inline-script sample."""
    scraper = JavaScriptScraper()
    request = Request('http://example.com/test.js')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                      'twitchplayspokemonfirered.html')
        with open(html_file_path, 'rb') as in_file:
            # Skip the leading portion of the sample file before copying.
            in_file.seek(0x147)
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertIn(
        'http://cdn.bulbagarden.net/upload/archive/a/a4/'
        '20090718115357%21195Quagsire.png',
        inline_urls
    )
    self.assertIn(
        'http://www.google.com/url?q=http%3A%2F%2Fwww.reddit.com%2F'
        'user%2FGoldenSandslash15&sa=D&sntz=1&'
        'usg=AFQjCNElFBxZYdNm5mWoRSncf5tbdIJQ-A',
        linked_urls
    )

    print('\n'.join(inline_urls))
    print('\n'.join(linked_urls))
def test_sitemap_scraper_xml_index(self):
    """A sitemap index document yields its child sitemap URLs as links."""
    scraper = SitemapScraper(self.get_html_parser())
    request = Request('http://example.com/sitemap.xml')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(
            b'''<?xml version="1.0" encoding="UTF-8"?>
            <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                <sitemap>
                    <loc>http://www.example.com/sitemap1.xml.gz</loc>
                    <lastmod>2004-10-01T18:23:17+00:00</lastmod>
                </sitemap>
            </sitemapindex>
            '''
        )

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual({
        'http://www.example.com/sitemap1.xml.gz',
    }, linked_urls)
    self.assertFalse(inline_urls)
def test_rss_as_html(self):
    """An RSS feed handed to the HTML scraper still yields its item links."""
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['content-type'] = 'application/rss+xml'

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                      'rss.xml')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    self.assertTrue(scrape_result)

    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links
    self.assertFalse(inline_urls)
    self.assertEqual(
        {
            'http://www.someexamplerssdomain.com/main.html',
            'http://www.wikipedia.org/'
        },
        linked_urls
    )
def test_xhtml_invalid(self):
    """Malformed XHTML is still scraped for inline and linked URLs."""
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                      'xhtml_invalid.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual(
        {
            'http://example.com/image.png',
            'http://example.com/script.js',
        },
        inline_urls
    )
    self.assertEqual(
        {
            'http://example.com/link'
        },
        linked_urls
    )
def test_sitemap_scraper_xml(self):
    """A plain urlset sitemap yields its <loc> entries as linked URLs."""
    scraper = SitemapScraper(self.get_html_parser())
    request = Request('http://example.com/sitemap.xml')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(
            b'''<?xml version="1.0" encoding="UTF-8"?>
            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                <url>
                    <loc>http://www.example.com/</loc>
                    <lastmod>2005-01-01</lastmod>
                    <changefreq>monthly</changefreq>
                    <priority>0.8</priority>
                </url>
            </urlset>
            '''
        )

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual({
        'http://www.example.com/',
    }, linked_urls)
    self.assertFalse(inline_urls)
def test_javascript_heavy_inline_monstrosity(self):
    """The JS scraper digs URLs out of a large, messy inline-script sample."""
    scraper = JavaScriptScraper()
    request = Request('http://example.com/test.js')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                      'twitchplayspokemonfirered.html')
        with open(html_file_path, 'rb') as in_file:
            # Skip the leading portion of the sample file before copying.
            in_file.seek(0x147)
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertIn(
        'http://cdn.bulbagarden.net/upload/archive/a/a4/'
        '20090718115357%21195Quagsire.png',
        inline_urls
    )
    self.assertIn(
        'http://www.google.com/url?q=http%3A%2F%2Fwww.reddit.com%2F'
        'user%2FGoldenSandslash15&sa=D&sntz=1&'
        'usg=AFQjCNElFBxZYdNm5mWoRSncf5tbdIJQ-A',
        linked_urls
    )

    print('\n'.join(inline_urls))
    print('\n'.join(linked_urls))
def test_sitemap_scraper_xml_index(self):
    """A sitemap index document yields its child sitemap URLs as links."""
    scraper = SitemapScraper(self.get_html_parser())
    request = Request('http://example.com/sitemap.xml')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(
            b'''<?xml version="1.0" encoding="UTF-8"?>
            <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                <sitemap>
                    <loc>http://www.example.com/sitemap1.xml.gz</loc>
                    <lastmod>2004-10-01T18:23:17+00:00</lastmod>
                </sitemap>
            </sitemapindex>
            '''
        )

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual({
        'http://www.example.com/sitemap1.xml.gz',
    }, linked_urls)
    self.assertFalse(inline_urls)
def test_sitemap_scraper_xml(self):
    """A plain urlset sitemap yields its <loc> entries as linked URLs."""
    scraper = SitemapScraper(self.get_html_parser())
    request = Request('http://example.com/sitemap.xml')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(
            b'''<?xml version="1.0" encoding="UTF-8"?>
            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                <url>
                    <loc>http://www.example.com/</loc>
                    <lastmod>2005-01-01</lastmod>
                    <changefreq>monthly</changefreq>
                    <priority>0.8</priority>
                </url>
            </urlset>
            '''
        )

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual({
        'http://www.example.com/',
    }, linked_urls)
    self.assertFalse(inline_urls)
def test_html_soup(self):
    """Tag-soup HTML is scraped; a non-URL Refresh header is ignored."""
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['Refresh'] = 'yes'

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                      'soup.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual(
        {'http://example.com/ABOUTM~1.JPG'},
        inline_urls
    )
    self.assertEqual(
        {
            'http://example.com/BLOG',
            'http://example.com/web ring/Join.htm',
        },
        linked_urls
    )
def test_html_krokozyabry(self):
    """A KOI8-R page is decoded with the declared charset before scraping."""
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['content-type'] = 'text/html; charset=KOI8-R'

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                      'krokozyabry.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual('koi8-r', scrape_result.encoding)
    self.assertEqual(
        set(),
        inline_urls
    )
    self.assertEqual(
        {'http://example.com/Кракозябры'},
        linked_urls
    )
def test_html_not_quite_charset(self):
    """A page with an odd/implicit charset still produces expected URLs."""
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                      'videogame_top.htm')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertIn(
        'http://example.com/copyright_2001_2006_rtype.gif',
        inline_urls
    )
    self.assertIn(
        'http://www.geocities.jp/gamehouse_grindcrusher/',
        linked_urls
    )
def test_warc_recorder_rollback(self):
    """A failure while writing a record rolls the WARC file back to its previous size."""
    warc_filename = 'asdf.warc'
    warc_prefix = 'asdf'

    # Seed the WARC file with some existing content to append after.
    with open(warc_filename, 'wb') as warc_file:
        warc_file.write(b'a' * 10)

    warc_recorder = WARCRecorder(
        warc_prefix,
        params=WARCRecorderParams(
            compress=False,
        )
    )

    request = HTTPRequest('http://example.com/')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'KITTEH DOGE')

    with warc_recorder.session() as session:
        session.pre_request(request)
        session.request_data(request.to_bytes())

        class BadRecord(WARCRecord):
            # Emits some data, then blows up mid-record to force a rollback.
            def __init__(self, original_record):
                super().__init__()
                self.block_file = original_record.block_file
                self.fields = original_record.fields

            def __iter__(self):
                for dummy in range(1000):
                    yield b"where's my elephant?"
                raise OSError('Oops')

        session._child_session._request_record = \
            BadRecord(session._child_session._request_record)
        original_offset = os.path.getsize(warc_filename)

        with self.assertRaises((OSError, IOError)):
            session.request(request)

        new_offset = os.path.getsize(warc_filename)
        self.assertEqual(new_offset, original_offset)
        # The journal file must be cleaned up too.
        self.assertFalse(os.path.exists(warc_filename + '-wpullinc'))

        _logger.debug('original offset {0}'.format(original_offset))
def test_css_scraper_reject_type(self):
    """The CSS scraper declines documents forced to a non-CSS link type."""
    scraper = CSSScraper()
    request = Request('http://example.com/styles.css')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                      'styles.css')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response,
                                   link_type=LinkType.html)
    self.assertFalse(scrape_result)
def test_warc_max_size_and_append(self):
    """With appending on, existing sequence files are kept and new ones continue the numbering."""
    file_prefix = 'asdf'

    # Pre-create two empty WARCs that appending mode must preserve.
    with open('asdf-00000.warc', 'w'):
        pass
    with open('asdf-00001.warc', 'w'):
        pass

    warc_recorder = WARCRecorder(
        file_prefix,
        params=WARCRecorderParams(
            compress=False,
            max_size=1,
            appending=True
        ),
    )

    request = HTTPRequest('http://example.com/1')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'BLAH')

    with warc_recorder.session() as session:
        session.pre_request(request)
        session.request_data(request.to_bytes())
        session.request(request)
        session.pre_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.response(response)

    warc_recorder.close()

    self.assertTrue(os.path.exists('asdf-00000.warc'))
    self.assertTrue(os.path.exists('asdf-00001.warc'))
    self.assertTrue(os.path.exists('asdf-00002.warc'))
    self.assertTrue(os.path.exists('asdf-00003.warc'))
    self.assertTrue(os.path.exists('asdf-meta.warc'))

    # The pre-existing files stay empty; the new ones carry the records.
    self.assertEqual(0, os.path.getsize('asdf-00000.warc'))
    self.assertEqual(0, os.path.getsize('asdf-00001.warc'))
    self.assertNotEqual(0, os.path.getsize('asdf-00002.warc'))
    self.assertNotEqual(0, os.path.getsize('asdf-00003.warc'))
    self.assertNotEqual(0, os.path.getsize('asdf-meta.warc'))
def test_sitemap_scraper_invalid_robots(self):
    """Garbage robots.txt content yields no URLs at all."""
    scraper = SitemapScraper(self.get_html_parser())
    request = Request('http://example.com/robots.txt')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(
            b'dsfju3wrji kjasSItemapsdmjfkl wekie;er :Ads fkj3m /Dk')

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links
    self.assertFalse(linked_urls)
    self.assertFalse(inline_urls)
def test_javascript_reject_type(self):
    """The JS scraper declines documents forced to a non-JS link type."""
    scraper = JavaScriptScraper()
    request = Request('http://example.com/script.js')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                      'script.js')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response,
                                   link_type=LinkType.css)
    self.assertFalse(scrape_result)
def test_warc_recorder_rollback(self):
    """A failure while writing a record rolls the WARC file back to its previous size."""
    warc_filename = 'asdf.warc'
    warc_prefix = 'asdf'

    # Seed the WARC file with some existing content to append after.
    with open(warc_filename, 'wb') as warc_file:
        warc_file.write(b'a' * 10)

    warc_recorder = WARCRecorder(
        warc_prefix,
        params=WARCRecorderParams(
            compress=False,
        )
    )

    request = HTTPRequest('http://example.com/')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'KITTEH DOGE')

    with warc_recorder.session() as session:
        session.pre_request(request)
        session.request_data(request.to_bytes())

        class BadRecord(WARCRecord):
            # Emits some data, then blows up mid-record to force a rollback.
            def __init__(self, original_record):
                super().__init__()
                self.block_file = original_record.block_file
                self.fields = original_record.fields

            def __iter__(self):
                for dummy in range(1000):
                    yield b"where's my elephant?"
                raise OSError('Oops')

        session._child_session._request_record = \
            BadRecord(session._child_session._request_record)
        original_offset = os.path.getsize(warc_filename)

        with self.assertRaises((OSError, IOError)):
            session.request(request)

        new_offset = os.path.getsize(warc_filename)
        self.assertEqual(new_offset, original_offset)
        # The journal file must be cleaned up too.
        self.assertFalse(os.path.exists(warc_filename + '-wpullinc'))

        _logger.debug('original offset {0}'.format(original_offset))
def test_sitemap_scraper_robots(self):
    """A Sitemap directive in robots.txt becomes a linked URL."""
    scraper = SitemapScraper(self.get_html_parser())
    request = Request('http://example.com/robots.txt')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'Sitemap: http://example.com/sitemap00.xml')

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links
    self.assertEqual({
        'http://example.com/sitemap00.xml',
    }, linked_urls)
    self.assertFalse(inline_urls)
def test_sitemap_scraper_invalid_robots(self):
    """Garbage robots.txt content yields no URLs at all."""
    scraper = SitemapScraper(self.get_html_parser())
    request = Request('http://example.com/robots.txt')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(
            b'dsfju3wrji kjasSItemapsdmjfkl wekie;er :Ads fkj3m /Dk')

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links
    self.assertFalse(linked_urls)
    self.assertFalse(inline_urls)
def test_warc_max_size_and_append(self):
    """With appending on, existing sequence files are kept and new ones continue the numbering."""
    file_prefix = 'asdf'

    # Pre-create two empty WARCs that appending mode must preserve.
    with open('asdf-00000.warc', 'w'):
        pass
    with open('asdf-00001.warc', 'w'):
        pass

    warc_recorder = WARCRecorder(
        file_prefix,
        params=WARCRecorderParams(
            compress=False,
            max_size=1,
            appending=True
        ),
    )

    request = HTTPRequest('http://example.com/1')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'BLAH')

    with warc_recorder.session() as session:
        session.pre_request(request)
        session.request_data(request.to_bytes())
        session.request(request)
        session.pre_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.response(response)

    warc_recorder.close()

    self.assertTrue(os.path.exists('asdf-00000.warc'))
    self.assertTrue(os.path.exists('asdf-00001.warc'))
    self.assertTrue(os.path.exists('asdf-00002.warc'))
    self.assertTrue(os.path.exists('asdf-00003.warc'))
    self.assertTrue(os.path.exists('asdf-meta.warc'))

    # The pre-existing files stay empty; the new ones carry the records.
    self.assertEqual(0, os.path.getsize('asdf-00000.warc'))
    self.assertEqual(0, os.path.getsize('asdf-00001.warc'))
    self.assertNotEqual(0, os.path.getsize('asdf-00002.warc'))
    self.assertNotEqual(0, os.path.getsize('asdf-00003.warc'))
    self.assertNotEqual(0, os.path.getsize('asdf-meta.warc'))
def test_warc_recorder_journal(self):
    """The '-wpullinc' journal exists while a record is being written and is removed afterwards."""
    warc_filename = 'asdf.warc'
    warc_prefix = 'asdf'

    warc_recorder = WARCRecorder(
        warc_prefix,
        params=WARCRecorderParams(
            compress=False,
        )
    )

    request = HTTPRequest('http://example.com/')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'KITTEH DOGE')

    with warc_recorder.session() as session:
        session.pre_request(request)
        session.request_data(request.to_bytes())
        test_instance = self

        class MockRecord(WARCRecord):
            # Checks for the journal file in the middle of record iteration.
            def __init__(self, original_record):
                super().__init__()
                self.block_file = original_record.block_file
                self.fields = original_record.fields

            def __iter__(self):
                print(list(os.walk('.')))
                test_instance.assertTrue(
                    os.path.exists(warc_filename + '-wpullinc')
                )

                for dummy in range(1000):
                    yield b"where's my elephant?"

        session._child_session._request_record = \
            MockRecord(session._child_session._request_record)

        session.request(request)

    self.assertFalse(os.path.exists(warc_filename + '-wpullinc'))
def test_html_scraper_reject_type(self):
    """The HTML scraper declines documents forced to a non-HTML link type."""
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                      'many_urls.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response,
                                   link_type=LinkType.css)
    self.assertFalse(scrape_result)
def test_html_garbage(self):
    """Binary garbage labeled as text/html must not crash the scraper."""
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['content-type'] = 'text/html'

    with wpull.util.reset_file_offset(response.body):
        response.body.write(
            b'\x01\x00\x01\x00l~Z\xff\x0f`y\x80\x00p<\x7f'
            b'\xffndo\xff\xff-\x83{d\xec</\xfe\x80\x00\xb4Bo'
            b'\x7f\xff\xff\xffV\xc1\xff\x7f\xff7'
        )

    scrape_info = scraper.scrape(request, response)
    self.assertTrue(scrape_info)
def test_html_encoding_lxml_name_mismatch(self):
    '''It should accept encoding names with underscore.'''
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    # 'EUC_KR' (underscore) is a valid Python codec alias but not an
    # IANA-style name; the scraper must cope with the mismatch.
    response.fields['content-type'] = 'text/html; charset=EUC_KR'

    with wpull.util.reset_file_offset(response.body):
        response.body.write(
            '힖'.encode('euc_kr')
        )

    scrape_info = scraper.scrape(request, response)
    self.assertTrue(scrape_info)
    self.assertEqual('euc_kr', scrape_info['encoding'])
def test_html_wrong_charset(self):
    """The scraper detects the real encoding when the declared charset is wrong."""
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                      'kcna.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual('utf-16-le', scrape_result.encoding)
    self.assertEqual(
        {
            'http://example.com/utm/__utm.js',
            'http://example.com/Knewskage.gif',
            'http://example.com/Lline.gif',
            'http://example.com/Sline.gif',
            'http://example.com/korean01.gif',
            'http://example.com/korean02.gif',
            'http://example.com/english01.gif',
            'http://example.com/english02.gif',
            'http://example.com/Tongsinkage.gif',
            'http://example.com/Knewskage.gif',
        },
        inline_urls
    )
    self.assertEqual(
        {
            'http://example.com/index-k.htm',
            'http://example.com/index-e.htm',
        },
        linked_urls
    )
def test_html_serious_bad_encoding(self):
    """A badly encoded document still scrapes when the encoding is overridden."""
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker,
                          encoding_override='utf8')
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['content-type'] = 'text/html; charset=utf8'

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                      'xkcd_1_evil.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_info = scraper.scrape(request, response)
    self.assertTrue(scrape_info)
def test_css_scraper_mojibake(self):
    """Non-ASCII URLs inside CSS are decoded and percent-handled correctly."""
    scraper = CSSScraper()
    request = Request('http://example.com/styles.css')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                      'mojibake.css')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual({
        'http://example.com/文字化け.png',
    }, inline_urls)
    self.assertFalse(linked_urls)
def test_warc_recorder_journal(self):
    """The '-wpullinc' journal exists while a record is being written and is removed afterwards."""
    warc_filename = 'asdf.warc'
    warc_prefix = 'asdf'

    warc_recorder = WARCRecorder(
        warc_prefix,
        params=WARCRecorderParams(
            compress=False,
        )
    )

    request = HTTPRequest('http://example.com/')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'KITTEH DOGE')

    with warc_recorder.session() as session:
        session.pre_request(request)
        session.request_data(request.to_bytes())
        test_instance = self

        class MockRecord(WARCRecord):
            # Checks for the journal file in the middle of record iteration.
            def __init__(self, original_record):
                super().__init__()
                self.block_file = original_record.block_file
                self.fields = original_record.fields

            def __iter__(self):
                print(list(os.walk('.')))
                test_instance.assertTrue(
                    os.path.exists(warc_filename + '-wpullinc')
                )

                for dummy in range(1000):
                    yield b"where's my elephant?"

        session._child_session._request_record = \
            MockRecord(session._child_session._request_record)

        session.request(request)

    self.assertFalse(os.path.exists(warc_filename + '-wpullinc'))
def test_sitemap_scraper_robots(self):
    """A Sitemap directive in robots.txt becomes a linked URL."""
    scraper = SitemapScraper(self.get_html_parser())
    request = Request('http://example.com/robots.txt')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'Sitemap: http://example.com/sitemap00.xml')

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links
    self.assertEqual({
        'http://example.com/sitemap00.xml',
    }, linked_urls)
    self.assertFalse(inline_urls)
def test_warc_move_max_size(self):
    """Rotated WARC files and the CDX are moved into the move_to directory."""
    file_prefix = 'asdf'
    cdx_filename = 'asdf.cdx'

    os.mkdir('./blah/')

    warc_recorder = WARCRecorder(
        file_prefix,
        params=WARCRecorderParams(
            compress=False,
            cdx=True,
            move_to='./blah/',
            max_size=1,
        ),
    )

    request = HTTPRequest('http://example.com/1')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'BLAH')

    with warc_recorder.session() as session:
        session.pre_request(request)
        session.request_data(request.to_bytes())
        session.request(request)
        session.pre_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.response(response)

    warc_recorder.close()

    self.assertTrue(os.path.exists('./blah/asdf-00000.warc'))
    self.assertTrue(os.path.exists('./blah/asdf-00001.warc'))
    self.assertTrue(os.path.exists('./blah/asdf-meta.warc'))
    self.assertTrue(os.path.exists('./blah/' + cdx_filename))
def test_css_scraper_mojibake(self):
    """Non-ASCII URLs inside CSS are decoded and percent-handled correctly."""
    scraper = CSSScraper()
    request = Request('http://example.com/styles.css')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                      'mojibake.css')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual({
        'http://example.com/文字化け.png',
    }, inline_urls)
    self.assertFalse(linked_urls)
def test_javascript_scraper(self):
    """The JS scraper extracts both inline resources and linked documents from script.js."""
    scraper = JavaScriptScraper()
    request = Request('http://example.com/script.js')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                      'script.js')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual({
        'http://example.com/script_variable.png',
        'http://example.com/dragonquery.js',
    }, inline_urls)
    self.assertEqual({
        'http://example.com/document_write.html',
        'http://example.com/http_document_write.html',
        'http://example.com/http_document_write2.html',
        'http://example.com/http document write.html',
        'http://example.com/script_variable.html',
        'http://example.com/http_script_variable.html',
        'https://example.com/https_script_variable.html',
        'ftp://example.com/ftp_script_variable.html',
        'http://example.com/end_dir_script_variable/',
        'http://example.com/start_dir_script_variable',
        'http://example.com/../relative_dir_script_variable',
        'http://example.com/script_json.html',
        'http://example.com/http_script_json.html?a=b',
    }, linked_urls)
def test_warc_move_max_size(self):
    """Rotated WARC files and the CDX are moved into the move_to directory."""
    file_prefix = 'asdf'
    cdx_filename = 'asdf.cdx'

    os.mkdir('./blah/')

    warc_recorder = WARCRecorder(
        file_prefix,
        params=WARCRecorderParams(
            compress=False,
            cdx=True,
            move_to='./blah/',
            max_size=1,
        ),
    )

    request = HTTPRequest('http://example.com/1')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'BLAH')

    with warc_recorder.session() as session:
        session.pre_request(request)
        session.request_data(request.to_bytes())
        session.request(request)
        session.pre_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.response(response)

    warc_recorder.close()

    self.assertTrue(os.path.exists('./blah/asdf-00000.warc'))
    self.assertTrue(os.path.exists('./blah/asdf-00001.warc'))
    self.assertTrue(os.path.exists('./blah/asdf-meta.warc'))
    self.assertTrue(os.path.exists('./blah/' + cdx_filename))
def test_javascript_scraper(self):
    """The JS scraper extracts both inline resources and linked documents from script.js."""
    scraper = JavaScriptScraper()
    request = Request('http://example.com/script.js')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                      'script.js')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual({
        'http://example.com/script_variable.png',
        'http://example.com/dragonquery.js',
    }, inline_urls)
    self.assertEqual({
        'http://example.com/document_write.html',
        'http://example.com/http_document_write.html',
        'http://example.com/http_document_write2.html',
        'http://example.com/http document write.html',
        'http://example.com/script_variable.html',
        'http://example.com/http_script_variable.html',
        'https://example.com/https_script_variable.html',
        'ftp://example.com/ftp_script_variable.html',
        'http://example.com/end_dir_script_variable/',
        'http://example.com/start_dir_script_variable',
        'http://example.com/../relative_dir_script_variable',
        'http://example.com/script_json.html',
        'http://example.com/http_script_json.html?a=b',
    }, linked_urls)
def test_html_scraper_links_base_href(self):
    """URLs are resolved against the document's <base href> element."""
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH, 'testing', 'samples',
                                      'basehref.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual('utf-8', scrape_result.encoding)
    self.assertEqual({
        'http://cdn.example.com/stylesheet1.css',
        'http://www.example.com/stylesheet2.css',
        'http://example.com/a/stylesheet3.css',
        'http://example.com/a/dir/image1.png',
        'http://example.com/dir/image2.png',
        'http://example.net/image3.png',
        'http://example.com/dir/image4.png',
    }, inline_urls)
    self.assertEqual({
        'http://example.com/a/'
    }, linked_urls)
def test_html_scraper_links(self):
    # Exercise the HTML scraper against a sample page (many_urls.html) that
    # contains a wide spread of elements/attributes, plus an HTTP Refresh
    # header, and verify the complete sets of inline and linked URLs.
    # The expected sets also include URLs found by the CSS and JavaScript
    # sub-scrapers wired into the ElementWalker.
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(self.get_html_parser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, 'OK')
    response.body = Body()
    # The Refresh header contributes 'header_refresh.html' to linked URLs.
    response.fields['Refresh'] = '3; url=header_refresh.html'
    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', 'many_urls.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)
    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links
    self.assertEqual('utf-8', scrape_result.encoding)
    # Inline resources: stylesheets, scripts, images, media, applets, etc.
    self.assertEqual({
        'http://example.com/style_import_url.css',
        'http://example.com/style_import_quote_url.css',
        'http://example.com/style_single_quote_import.css',
        'http://example.com/style_double_quote_import.css',
        'http://example.com/link_href.css',
        'http://example.com/script.js',
        'http://example.com/body_background.png',
        'http://example.com/images/table_background.png',
        'http://example.com/images/td_background.png',
        'http://example.com/images/th_background.png',
        'http://example.com/style_url1.png',
        'http://example.com/style_url2.png',
        'http://example.com/applet/',  # returned by lxml
        'http://example.com/applet/applet_code.class',
        'http://example.com/applet/applet_src.class',
        'http://example.com/bgsound.mid',
        'http://example.com/audio_src.wav',
        'http://example.net/source_src.wav',
        'http://example.com/embed_src.mov',
        'http://example.com/fig_src.png',
        'http://example.com/frame_src.html',
        'http://example.com/iframe_src.html',
        'http://example.com/img_href.png',
        'http://example.com/img_lowsrc.png',
        'http://example.com/img_src.png',
        'http://example.com/img_data.png',
        'http://example.com/img_srcset_1.jpeg',
        'http://example.com/img_srcset_2.jpeg',
        'http://example.com/img_srcset_3.jpeg',
        'http://example.com/input_src.png',
        'http://example.com/layer_src.png',
        'http://example.com/object/',  # returned by lxml
        'http://example.com/object/object_data.swf',
        'http://example.com/object/object_archive.dat',
        'mailto:internet',
        'object_not_url_codebase',
        'http://example.com/param_ref_value.php',
        'http://example.com/overlay_src.html',
        'http://example.com/script_variable.png',
    },
        inline_urls
    )
    # Linked documents: anchors, frames, forms, Open Graph / Twitter meta,
    # the Refresh header target, and URLs found in inline JavaScript.
    self.assertEqual({
        'http://example.com/og_image.png',
        'http://example.com/og_url.html',
        'http://example.com/og_audio.mp3',
        'http://example.com/og_video.webm',
        'http://example.com/twitter_image.png',
        'http://example.com/twitter_image0.png',
        'http://example.com/twitter_image1.png',
        'http://example.com/twitter_image2.png',
        'http://example.com/twitter_image3.png',
        'http://example.com/twitter_player.html',
        'http://example.com/twitter_stream.mp4',
        'http://example.net/soup.html',
        'http://example.com/a_href.html',
        'http://example.com/area_href.html',
        'http://example.com/frame_src.html',
        'http://example.com/embed_href.html',
        'http://example.com/embed_src.mov',
        'http://example.com/form_action.html',
        'http://example.com/iframe_src.html',
        'http://example.com/layer_src.png',
        'http://example.com/overlay_src.html',
        'ftp://ftp.protocol.invalid/',
        'mailto:[email protected]',
        'http://a-double-slash.example',
        'http://example.com/header_refresh.html',
        'https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6',
        'http://example.com/document_write.html',
        'http://example.com/http_document_write.html',
        'http://example.com/http_document_write2.html',
        'http://example.com/http document write.html',
        'http://example.com/script_variable.html',
        'http://example.com/http_script_variable.html',
        'https://example.com/https_script_variable.html',
        'ftp://example.com/ftp_script_variable.html',
        'http://example.com/end_dir_script_variable/',
        'http://example.com/start_dir_script_variable',
        'http://example.com/../relative_dir_script_variable',
        'http://example.com/script_json.html',
        'http://example.com/http_script_json.html?a=b',
        'http://example.com/a_javascript_link.html',
        'http://example.com/a_onclick_link.html',
    },
        linked_urls
    )
    # Every scraped URL must be a text string, never bytes.
    for url in inline_urls | linked_urls:
        self.assertIsInstance(url, str)
def test_cdx_dedup(self):
    # A URL with a prior visit in the URL table is written to the WARC as a
    # "revisit" record (body deduplicated, headers only), while a new URL
    # with the same body is stored in full and indexed in the CDX.
    #
    # NOTE(review): an identically named test_cdx_dedup is defined again
    # later in this file; that definition shadows this one, so this version
    # never runs under unittest discovery.
    url_table = URLTable()
    warc_recorder = WARCRecorder('asdf', params=WARCRecorderParams(
        compress=False, cdx=True, url_table=url_table))
    # Seed a prior visit for the first URL; the digest presumably matches
    # the b'kitbit' payload written below — TODO confirm — so the recorder
    # emits a revisit record referring to this record ID.
    url_table.add_visits([
        ('http://example.com/fennec',
         '<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>',
         'B62D734VFEKIDLFAB7TTSCSZF64BKAYJ')
    ])
    request = HTTPRequest('http://example.com/fennec')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()
    # Size of the headers alone: the revisit record should be truncated to
    # exactly this Content-Length (body omitted).
    revisit_response_header_size = len(response.to_bytes())
    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'kitbit')
    with warc_recorder.session() as session:
        session.pre_request(request)
        session.request_data(request.to_bytes())
        session.request(request)
        session.pre_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.response(response)
    # Second capture: a different URL, same body bytes, no prior visit —
    # recorded in full.
    request = HTTPRequest('http://example.com/horse')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OKaaaaaaaaaaaaaaaaaaaaaaaaaa')
    response.body = Body()
    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'kitbit')
    with warc_recorder.session() as session:
        session.pre_request(request)
        session.request_data(request.to_bytes())
        session.request(request)
        session.pre_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.response(response)
    _logger.info('FINISHED')
    warc_recorder.close()
    with open('asdf.warc', 'rb') as in_file:
        warc_file_content = in_file.read()
    with open('asdf.cdx', 'rb') as in_file:
        cdx_file_content = in_file.read()
    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
    self.assertIn(b'WARC-Type: revisit\r\n', warc_file_content)
    self.assertIn(
        b'WARC-Refers-To: '
        b'<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>\r\n',
        warc_file_content)
    self.assertIn(b'WARC-Truncated: length\r\n', warc_file_content)
    self.assertIn(
        b'WARC-Profile: http://netpreserve.org/warc/1.0/revisit/'
        b'identical-payload-digest\r\n',
        warc_file_content)
    self.assertIn(
        b'Content-Length: ' +
        str(revisit_response_header_size).encode('ascii') +
        b'\r\n',
        warc_file_content)
    self.assertIn(b'WARC-Target-URI: http://example.com/fennec\r\n',
                  warc_file_content)
    self.assertIn(b'WARC-Target-URI: http://example.com/horse\r\n',
                  warc_file_content)
    # Deduplication: the payload bytes must appear exactly once in the WARC.
    self.assertEqual(1, warc_file_content.count(b'kitbit'))
    self.assertIn(b'http://example.com/horse ', cdx_file_content)
def test_warc_recorder(self):
    # End-to-end check of a single uncompressed capture: WARC record types
    # and headers, the log record, and the CDX index (including that the
    # CDX length/offset fields locate a record in the WARC file).
    #
    # NOTE(review): an identically named test_warc_recorder is defined again
    # later in this file; that definition shadows this one, so this version
    # never runs under unittest discovery.
    file_prefix = 'asdf'
    warc_filename = 'asdf.warc'
    cdx_filename = 'asdf.cdx'
    warc_recorder = WARCRecorder(
        file_prefix,
        params=WARCRecorderParams(
            compress=False,
            extra_fields=[('Extra-field', 'my_extra_field')],
            cdx=True,
        ),
    )
    request = HTTPRequest('http://example.com/')
    # NOTE(review): prepare_for_send() is called twice, before and after the
    # address is assigned — looks redundant; presumably one call after the
    # request is fully populated suffices. TODO confirm and remove one.
    request.prepare_for_send()
    request.address = ('0.0.0.0', 80)
    request.prepare_for_send()
    response = HTTPResponse(200, 'OK')
    response.body = Body()
    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'KITTEH DOGE')
    with warc_recorder.session() as session:
        session.pre_request(request)
        session.request_data(request.to_bytes())
        session.request(request)
        session.pre_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.response(response)
    _logger.info('FINISHED')
    warc_recorder.close()
    with open(warc_filename, 'rb') as in_file:
        warc_file_content = in_file.read()
    with open(cdx_filename, 'rb') as in_file:
        cdx_file_content = in_file.read()
    # warcinfo / request / response record headers.
    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
    self.assertIn(b'WARC-Type: warcinfo\r\n', warc_file_content)
    self.assertIn(b'Content-Type: application/warc-fields',
                  warc_file_content)
    self.assertIn(b'WARC-Date: ', warc_file_content)
    self.assertIn(b'WARC-Record-ID: <urn:uuid:', warc_file_content)
    self.assertIn(b'WARC-Block-Digest: sha1:', warc_file_content)
    self.assertIn(b'WARC-Payload-Digest: sha1:', warc_file_content)
    self.assertIn(b'WARC-Type: request\r\n', warc_file_content)
    self.assertIn(b'WARC-Target-URI: http://', warc_file_content)
    self.assertIn(b'Content-Type: application/http;msgtype=request',
                  warc_file_content)
    self.assertIn(b'WARC-Type: response', warc_file_content)
    self.assertIn(b'WARC-Concurrent-To: <urn:uuid:', warc_file_content)
    self.assertIn(b'Content-Type: application/http;msgtype=response',
                  warc_file_content)
    self.assertIn(
        'Wpull/{0}'.format(wpull.version.__version__).encode('utf-8'),
        warc_file_content)
    self.assertIn(
        'Python/{0}'.format(wpull.util.python_version()).encode('utf-8'),
        warc_file_content)
    self.assertIn(b'Extra-Field: my_extra_field', warc_file_content)
    self.assertIn(b'GET / HTTP', warc_file_content)
    self.assertIn(b'KITTEH DOGE', warc_file_content)
    self.assertIn(b'FINISHED', warc_file_content)
    self.assertIn(b'WARC-Target-URI: urn:X-wpull:log', warc_file_content)
    self.assertIn(b'Content-Length:', warc_file_content)
    self.assertNotIn(b'Content-Length: 0', warc_file_content)
    # CDX index: one header line, one record line, trailing newline.
    cdx_lines = cdx_file_content.split(b'\n')
    cdx_labels = cdx_lines[0].strip().split(b' ')
    cdx_fields = cdx_lines[1].split(b' ')
    print(cdx_lines)
    self.assertEqual(3, len(cdx_lines))
    self.assertEqual(10, len(cdx_labels))
    self.assertEqual(9, len(cdx_fields))
    self.assertTrue(cdx_lines[0].startswith(b' CDX'))
    self.assertEqual(b'http://example.com/', cdx_fields[0])
    self.assertEqual(b'-', cdx_fields[2])
    self.assertEqual(b'200', cdx_fields[3])
    self.assertNotEqual(b'-', cdx_fields[4])
    self.assertNotEqual(b'0', cdx_fields[5])
    self.assertNotEqual(b'0', cdx_fields[6])
    self.assertEqual(os.path.basename(warc_filename),
                     cdx_fields[7].decode('ascii'))
    # The CDX length/offset fields must point at a WARC record boundary.
    length = int(cdx_fields[5])
    offset = int(cdx_fields[6])
    with open(warc_filename, 'rb') as in_file:
        in_file.seek(offset)
        data = in_file.read(length)
        assert len(data) == length
        self.assertEqual(b'WARC/1.0', data[:8])
    self.assertIn(b'KITTEH DOGE', data)
    self.validate_warc(warc_filename)
def test_warc_recorder_max_size(self):
    # With max_size=1 (byte), each capture rolls over into a new
    # sequentially numbered WARC file; the CDX index spans all of them and
    # the log record goes to a separate -meta WARC file.
    #
    # NOTE(review): an identically named test_warc_recorder_max_size is
    # defined again later in this file; that definition shadows this one,
    # so this version never runs under unittest discovery.
    file_prefix = 'asdf'
    cdx_filename = 'asdf.cdx'
    warc_recorder = WARCRecorder(
        file_prefix,
        params=WARCRecorderParams(
            compress=False,
            extra_fields=[('Extra-field', 'my_extra_field')],
            cdx=True,
            max_size=1,
        )
    )
    request = HTTPRequest('http://example.com/1')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()
    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'KITTEH DOGE')
    with warc_recorder.session() as session:
        session.pre_request(request)
        session.request_data(request.to_bytes())
        session.request(request)
        session.pre_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.response(response)
    request = HTTPRequest('http://example.com/2')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()
    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'DOGE KITTEH')
    with warc_recorder.session() as session:
        session.pre_request(request)
        session.request_data(request.to_bytes())
        session.request(request)
        session.pre_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.response(response)
    _logger.info('FINISHED')
    warc_recorder.close()
    # First rollover file holds the first capture...
    with open('asdf-00000.warc', 'rb') as in_file:
        warc_file_content = in_file.read()
    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
    self.assertIn(b'WARC-Type: warcinfo', warc_file_content)
    self.assertIn(b'KITTEH DOGE', warc_file_content)
    # ...and the second file holds the second capture.
    with open('asdf-00001.warc', 'rb') as in_file:
        warc_file_content = in_file.read()
    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
    self.assertIn(b'WARC-Type: warcinfo', warc_file_content)
    self.assertIn(b'DOGE KITTEH', warc_file_content)
    with open(cdx_filename, 'rb') as in_file:
        cdx_file_content = in_file.read()
    cdx_lines = cdx_file_content.split(b'\n')
    cdx_labels = cdx_lines[0].strip().split(b' ')
    print(cdx_lines)
    # Header line + one record per capture + trailing newline.
    self.assertEqual(4, len(cdx_lines))
    self.assertEqual(10, len(cdx_labels))
    self.assertIn(b'http://example.com/1', cdx_file_content)
    self.assertIn(b'http://example.com/2', cdx_file_content)
    # Log output lands in the meta WARC.
    with open('asdf-meta.warc', 'rb') as in_file:
        meta_file_content = in_file.read()
    self.assertIn(b'FINISHED', meta_file_content)
    self.validate_warc('asdf-00000.warc')
    self.validate_warc('asdf-00001.warc')
    self.validate_warc('asdf-meta.warc')
def test_warc_recorder_max_size(self):
    # With max_size=1 (byte), each capture rolls over into a new
    # sequentially numbered WARC file; the CDX index spans all of them and
    # the log record goes to a separate -meta WARC file.
    #
    # NOTE(review): this redefines test_warc_recorder_max_size, which
    # already appears earlier in this file; the earlier definition is
    # shadowed and never runs. The two copies differ only in formatting —
    # one should be removed or renamed.
    file_prefix = 'asdf'
    cdx_filename = 'asdf.cdx'
    warc_recorder = WARCRecorder(file_prefix, params=WARCRecorderParams(
        compress=False,
        extra_fields=[('Extra-field', 'my_extra_field')],
        cdx=True,
        max_size=1,
    ))
    request = HTTPRequest('http://example.com/1')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()
    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'KITTEH DOGE')
    with warc_recorder.session() as session:
        session.pre_request(request)
        session.request_data(request.to_bytes())
        session.request(request)
        session.pre_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.response(response)
    request = HTTPRequest('http://example.com/2')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()
    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'DOGE KITTEH')
    with warc_recorder.session() as session:
        session.pre_request(request)
        session.request_data(request.to_bytes())
        session.request(request)
        session.pre_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.response(response)
    _logger.info('FINISHED')
    warc_recorder.close()
    # First rollover file holds the first capture...
    with open('asdf-00000.warc', 'rb') as in_file:
        warc_file_content = in_file.read()
    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
    self.assertIn(b'WARC-Type: warcinfo', warc_file_content)
    self.assertIn(b'KITTEH DOGE', warc_file_content)
    # ...and the second file holds the second capture.
    with open('asdf-00001.warc', 'rb') as in_file:
        warc_file_content = in_file.read()
    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
    self.assertIn(b'WARC-Type: warcinfo', warc_file_content)
    self.assertIn(b'DOGE KITTEH', warc_file_content)
    with open(cdx_filename, 'rb') as in_file:
        cdx_file_content = in_file.read()
    cdx_lines = cdx_file_content.split(b'\n')
    cdx_labels = cdx_lines[0].strip().split(b' ')
    print(cdx_lines)
    # Header line + one record per capture + trailing newline.
    self.assertEqual(4, len(cdx_lines))
    self.assertEqual(10, len(cdx_labels))
    self.assertIn(b'http://example.com/1', cdx_file_content)
    self.assertIn(b'http://example.com/2', cdx_file_content)
    # Log output lands in the meta WARC.
    with open('asdf-meta.warc', 'rb') as in_file:
        meta_file_content = in_file.read()
    self.assertIn(b'FINISHED', meta_file_content)
    self.validate_warc('asdf-00000.warc')
    self.validate_warc('asdf-00001.warc')
    self.validate_warc('asdf-meta.warc')
def test_cdx_dedup(self):
    # A URL with a prior visit in the URL table is written to the WARC as a
    # "revisit" record (body deduplicated, headers only), while a new URL
    # with the same body is stored in full and indexed in the CDX.
    #
    # NOTE(review): this redefines test_cdx_dedup, which already appears
    # earlier in this file; the earlier definition is shadowed and never
    # runs. The two copies differ only in formatting — one should be
    # removed or renamed.
    url_table = URLTable()
    warc_recorder = WARCRecorder(
        'asdf',
        params=WARCRecorderParams(
            compress=False, cdx=True, url_table=url_table
        )
    )
    # Seed a prior visit for the first URL; the digest presumably matches
    # the b'kitbit' payload written below — TODO confirm — so the recorder
    # emits a revisit record referring to this record ID.
    url_table.add_visits([
        (
            'http://example.com/fennec',
            '<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>',
            'B62D734VFEKIDLFAB7TTSCSZF64BKAYJ'
        )
    ])
    request = HTTPRequest('http://example.com/fennec')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OK')
    response.body = Body()
    # Size of the headers alone: the revisit record should be truncated to
    # exactly this Content-Length (body omitted).
    revisit_response_header_size = len(response.to_bytes())
    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'kitbit')
    with warc_recorder.session() as session:
        session.pre_request(request)
        session.request_data(request.to_bytes())
        session.request(request)
        session.pre_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.response(response)
    # Second capture: a different URL, same body bytes, no prior visit —
    # recorded in full.
    request = HTTPRequest('http://example.com/horse')
    request.address = ('0.0.0.0', 80)
    response = HTTPResponse(200, 'OKaaaaaaaaaaaaaaaaaaaaaaaaaa')
    response.body = Body()
    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'kitbit')
    with warc_recorder.session() as session:
        session.pre_request(request)
        session.request_data(request.to_bytes())
        session.request(request)
        session.pre_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.response(response)
    _logger.info('FINISHED')
    warc_recorder.close()
    with open('asdf.warc', 'rb') as in_file:
        warc_file_content = in_file.read()
    with open('asdf.cdx', 'rb') as in_file:
        cdx_file_content = in_file.read()
    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
    self.assertIn(b'WARC-Type: revisit\r\n', warc_file_content)
    self.assertIn(
        b'WARC-Refers-To: '
        b'<urn:uuid:8a534d31-bd06-4056-8a0f-bdc5fd611036>\r\n',
        warc_file_content
    )
    self.assertIn(b'WARC-Truncated: length\r\n', warc_file_content)
    self.assertIn(
        b'WARC-Profile: http://netpreserve.org/warc/1.0/revisit/'
        b'identical-payload-digest\r\n',
        warc_file_content
    )
    self.assertIn(
        b'Content-Length: ' +
        str(revisit_response_header_size).encode('ascii') +
        b'\r\n',
        warc_file_content
    )
    self.assertIn(
        b'WARC-Target-URI: http://example.com/fennec\r\n',
        warc_file_content
    )
    self.assertIn(
        b'WARC-Target-URI: http://example.com/horse\r\n',
        warc_file_content
    )
    # Deduplication: the payload bytes must appear exactly once in the WARC.
    self.assertEqual(
        1,
        warc_file_content.count(b'kitbit')
    )
    self.assertIn(b'http://example.com/horse ', cdx_file_content)
def test_warc_recorder(self):
    """End-to-end check of a single uncompressed capture.

    Verifies WARC record types and headers, the log record, and the CDX
    index — including that the CDX length/offset fields locate a record
    boundary inside the WARC file.

    NOTE(review): this redefines test_warc_recorder, which already appears
    earlier in this file; the earlier definition is shadowed and never
    runs. One of the copies should be removed or renamed.
    """
    file_prefix = 'asdf'
    warc_filename = 'asdf.warc'
    cdx_filename = 'asdf.cdx'
    warc_recorder = WARCRecorder(
        file_prefix,
        params=WARCRecorderParams(
            compress=False,
            extra_fields=[('Extra-field', 'my_extra_field')],
            cdx=True,
        ),
    )
    request = HTTPRequest('http://example.com/')
    request.address = ('0.0.0.0', 80)
    # Fix: prepare_for_send() was previously called twice (once before the
    # address was even assigned). A single call after the request is fully
    # populated is sufficient; the redundant first call is removed.
    request.prepare_for_send()
    response = HTTPResponse(200, 'OK')
    response.body = Body()
    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'KITTEH DOGE')
    with warc_recorder.session() as session:
        session.pre_request(request)
        session.request_data(request.to_bytes())
        session.request(request)
        session.pre_response(response)
        session.response_data(response.to_bytes())
        session.response_data(response.body.content())
        session.response(response)
    _logger.info('FINISHED')
    warc_recorder.close()
    with open(warc_filename, 'rb') as in_file:
        warc_file_content = in_file.read()
    with open(cdx_filename, 'rb') as in_file:
        cdx_file_content = in_file.read()
    # warcinfo / request / response record headers.
    self.assertTrue(warc_file_content.startswith(b'WARC/1.0'))
    self.assertIn(b'WARC-Type: warcinfo\r\n', warc_file_content)
    self.assertIn(b'Content-Type: application/warc-fields',
                  warc_file_content)
    self.assertIn(b'WARC-Date: ', warc_file_content)
    self.assertIn(b'WARC-Record-ID: <urn:uuid:', warc_file_content)
    self.assertIn(b'WARC-Block-Digest: sha1:', warc_file_content)
    self.assertIn(b'WARC-Payload-Digest: sha1:', warc_file_content)
    self.assertIn(b'WARC-Type: request\r\n', warc_file_content)
    self.assertIn(b'WARC-Target-URI: http://', warc_file_content)
    self.assertIn(b'Content-Type: application/http;msgtype=request',
                  warc_file_content)
    self.assertIn(b'WARC-Type: response', warc_file_content)
    self.assertIn(b'WARC-Concurrent-To: <urn:uuid:', warc_file_content)
    self.assertIn(b'Content-Type: application/http;msgtype=response',
                  warc_file_content)
    self.assertIn(
        'Wpull/{0}'.format(wpull.version.__version__).encode('utf-8'),
        warc_file_content
    )
    self.assertIn(
        'Python/{0}'.format(
            wpull.util.python_version()).encode('utf-8'),
        warc_file_content
    )
    self.assertIn(b'Extra-Field: my_extra_field', warc_file_content)
    self.assertIn(b'GET / HTTP', warc_file_content)
    self.assertIn(b'KITTEH DOGE', warc_file_content)
    self.assertIn(b'FINISHED', warc_file_content)
    self.assertIn(b'WARC-Target-URI: urn:X-wpull:log', warc_file_content)
    self.assertIn(b'Content-Length:', warc_file_content)
    self.assertNotIn(b'Content-Length: 0', warc_file_content)
    # CDX index: one header line, one record line, trailing newline.
    cdx_lines = cdx_file_content.split(b'\n')
    cdx_labels = cdx_lines[0].strip().split(b' ')
    cdx_fields = cdx_lines[1].split(b' ')
    print(cdx_lines)
    self.assertEqual(3, len(cdx_lines))
    self.assertEqual(10, len(cdx_labels))
    self.assertEqual(9, len(cdx_fields))
    self.assertTrue(cdx_lines[0].startswith(b' CDX'))
    self.assertEqual(b'http://example.com/', cdx_fields[0])
    self.assertEqual(b'-', cdx_fields[2])
    self.assertEqual(b'200', cdx_fields[3])
    self.assertNotEqual(b'-', cdx_fields[4])
    self.assertNotEqual(b'0', cdx_fields[5])
    self.assertNotEqual(b'0', cdx_fields[6])
    self.assertEqual(
        os.path.basename(warc_filename),
        cdx_fields[7].decode('ascii'))
    # The CDX length/offset fields must point at a WARC record boundary.
    length = int(cdx_fields[5])
    offset = int(cdx_fields[6])
    with open(warc_filename, 'rb') as in_file:
        in_file.seek(offset)
        data = in_file.read(length)
        assert len(data) == length
        self.assertEqual(b'WARC/1.0', data[:8])
    self.assertIn(b'KITTEH DOGE', data)
    self.validate_warc(warc_filename)