def test_html_soup(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['Refresh'] = 'yes'

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples', 'soup.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual({'http://example.com/ABOUTM~1.JPG'}, inline_urls)
    self.assertEqual(
        {
            'http://example.com/BLOG',
            'http://example.com/web ring/Join.htm',
        },
        linked_urls)

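# NOTE: Nearly every test below repeats the same scraper construction and
# sample-file loading. The helpers here are a minimal sketch (hypothetical;
# not part of the original suite) of how that boilerplate could be factored
# out, assuming only the names these tests already import.
def _make_html_scraper(self):
    # Wire the CSS and JavaScript sub-scrapers into the element walker
    # that HTMLScraper delegates embedded content to.
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    return HTMLScraper(HTMLParser(), element_walker)

def _load_sample(self, response, filename):
    # Copy a sample document into the response body; reset_file_offset
    # rewinds the body afterwards so the scraper reads it from the start.
    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', filename)
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

# With these helpers, a test body would shrink to, e.g.:
#     scraper = self._make_html_scraper()
#     self._load_sample(response, 'soup.html')
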
def test_rss_as_html(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['content-type'] = 'application/rss+xml'

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples', 'rss.xml')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    self.assertTrue(scrape_result)

    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertFalse(inline_urls)
    self.assertEqual(
        {
            'http://www.someexamplerssdomain.com/main.html',
            'http://www.wikipedia.org/'
        },
        linked_urls)

def test_sitemap_encoding(self):
    parser = HTMLParser()
    reader = SitemapReader(parser)

    bom_map = {
        'utf_16_le': codecs.BOM_UTF16_LE,
        'utf_16_be': codecs.BOM_UTF16_BE,
        'utf_32_le': codecs.BOM_UTF32_LE,
        'utf_32_be': codecs.BOM_UTF32_BE,
    }

    for name in CODEC_NAMES:
        if name in EBCDIC or name == 'utf_8_sig':
            # XXX: we're assuming that all codecs are ASCII backward
            # compatible
            continue

        if name.startswith('utf_16') or name.startswith('utf_32'):
            # FIXME: libxml/lxml doesn't like it when we pass in a codec
            # name without specifying the endianness, even though a BOM
            # is included
            continue

        data = io.BytesIO(
            bom_map.get(name, b'') +
            '<?xml version="1.0" encoding="UTF-8"?>'
            '<urlset><url><loc>blah</loc></url></urlset>'.encode(name)
        )
        print('->', name)
        links = tuple(reader.iter_links(data, encoding=name))
        link = links[0]
        self.assertEqual('blah', link)

def test_sitemap_scraper_xml(self):
    scraper = SitemapScraper(HTMLParser())
    request = Request('http://example.com/sitemap.xml')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(
            b'''<?xml version="1.0" encoding="UTF-8"?>
            <urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                <url>
                    <loc>http://www.example.com/</loc>
                    <lastmod>2005-01-01</lastmod>
                    <changefreq>monthly</changefreq>
                    <priority>0.8</priority>
                </url>
            </urlset>
            ''')

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual({
        'http://www.example.com/',
    }, linked_urls)
    self.assertFalse(inline_urls)

def test_sitemap_scraper_xml_index(self):
    scraper = SitemapScraper(HTMLParser())
    request = Request('http://example.com/sitemap.xml')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(
            b'''<?xml version="1.0" encoding="UTF-8"?>
            <sitemapindex xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">
                <sitemap>
                    <loc>http://www.example.com/sitemap1.xml.gz</loc>
                    <lastmod>2004-10-01T18:23:17+00:00</lastmod>
                </sitemap>
            </sitemapindex>
            ''')

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual({
        'http://www.example.com/sitemap1.xml.gz',
    }, linked_urls)
    self.assertFalse(inline_urls)

def test_html_scraper_links_base_href(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples', 'basehref.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual('utf-8', scrape_result.encoding)

    self.assertEqual(
        {
            'http://cdn.example.com/stylesheet1.css',
            'http://www.example.com/stylesheet2.css',
            'http://example.com/a/stylesheet3.css',
            'http://example.com/a/dir/image1.png',
            'http://example.com/dir/image2.png',
            'http://example.net/image3.png',
            'http://example.com/dir/image4.png',
        },
        inline_urls)
    self.assertEqual({'http://example.com/a/'}, linked_urls)

def test_html_encoding(self):
    html_parser = HTMLParser()
    reader = HTMLReader(html_parser)

    bom_map = {
        'utf_16_le': codecs.BOM_UTF16_LE,
        'utf_16_be': codecs.BOM_UTF16_BE,
        'utf_32_le': codecs.BOM_UTF32_LE,
        'utf_32_be': codecs.BOM_UTF32_BE,
    }

    for name in CODEC_NAMES:
        if name in EBCDIC or name == 'utf_8_sig':
            # XXX: we're assuming that all codecs are ASCII backward
            # compatible
            continue

        if name.startswith('utf_16') or name.startswith('utf_32'):
            # FIXME: libxml/lxml doesn't like it when we pass in a codec
            # name without specifying the endianness, even though a BOM
            # is included
            continue

        print('->', name)

        data = io.BytesIO(bom_map.get(name, b'') + '<img>'.encode(name))
        elements = tuple(reader.iter_elements(data, encoding=name))
        html_element = elements[0]
        self.assertEqual('html', html_element.tag)

def test_html_script_comment(self):
    test_string = b'''<script><!-- blah --></script>'''
    reader = HTMLReader(HTMLParser())
    elements = reader.iter_elements(
        io.BytesIO(test_string), encoding='ascii')
    elements = tuple(elements)

    self.assertTrue(
        all(isinstance(element, Element) for element in elements))

def test_sitemap_scraper_invalid_robots(self):
    scraper = SitemapScraper(HTMLParser())
    request = Request('http://example.com/robots.txt')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(
            b'dsfju3wrji kjasSItemapsdmjfkl wekie;er :Ads fkj3m /Dk')

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertFalse(linked_urls)
    self.assertFalse(inline_urls)

def test_bad_xml(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(
            ROOT_PATH, 'testing', 'samples', 'foxstripcomics_bad_xml.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    # No crash
    scraper.scrape(request, response, link_type=LinkType.html)

def test_sitemap_scraper_robots(self):
    scraper = SitemapScraper(HTMLParser())
    request = Request('http://example.com/robots.txt')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        response.body.write(b'Sitemap: http://example.com/sitemap00.xml')

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual({
        'http://example.com/sitemap00.xml',
    }, linked_urls)
    self.assertFalse(inline_urls)

def test_html_encoding_lxml_name_mismatch(self):
    '''It should accept encoding names with underscores.'''
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['content-type'] = 'text/html; charset=EUC_KR'

    with wpull.util.reset_file_offset(response.body):
        response.body.write('힖'.encode('euc_kr'))

    scrape_info = scraper.scrape(request, response)

    self.assertTrue(scrape_info)
    self.assertEqual('euc_kr', scrape_info['encoding'])

def test_html_scraper_reject_type(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, 'OK')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples', 'many_urls.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response,
                                   link_type=LinkType.css)
    self.assertFalse(scrape_result)

def test_html_garbage(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['content-type'] = 'text/html'

    with wpull.util.reset_file_offset(response.body):
        response.body.write(
            b'\x01\x00\x01\x00l~Z\xff\x0f`y\x80\x00p<\x7f'
            b'\xffndo\xff\xff-\x83{d\xec</\xfe\x80\x00\xb4Bo'
            b'\x7f\xff\xff\xffV\xc1\xff\x7f\xff7')

    scrape_info = scraper.scrape(request, response)

    self.assertTrue(scrape_info)

def test_html_parse_doctype(self):
    html_parser = HTMLParser()

    self.assertIn(
        'html',
        html_parser.parse_doctype(
            io.BytesIO(b'<!DOCTYPE HTML><html></html>')))
    self.assertIn(
        'XHTML',
        html_parser.parse_doctype(
            io.BytesIO(b'''
            <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
            "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
            <html></html>
            ''')))
    self.assertFalse(html_parser.parse_doctype(
        io.BytesIO(b'hello world!')))
    self.assertFalse(html_parser.parse_doctype(io.BytesIO(b'')))
    self.assertFalse(html_parser.parse_doctype(io.BytesIO(b'\x00')))
    self.assertFalse(html_parser.parse_doctype(io.BytesIO(b'A\xfe')))

def test_html_serious_bad_encoding(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker,
                          encoding_override='utf8')
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['content-type'] = 'text/html; charset=utf8'

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples',
                                      'xkcd_1_evil.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_info = scraper.scrape(request, response)

    self.assertTrue(scrape_info)

def test_html_not_quite_charset(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples',
                                      'videogame_top.htm')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertIn(
        'http://example.com/copyright_2001_2006_rtype.gif',
        inline_urls)
    self.assertIn(
        'http://www.geocities.jp/gamehouse_grindcrusher/',
        linked_urls)

def test_html_wrong_charset(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples', 'kcna.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual('utf-16-le', scrape_result.encoding)

    self.assertEqual(
        {
            'http://example.com/utm/__utm.js',
            'http://example.com/Knewskage.gif',
            'http://example.com/Lline.gif',
            'http://example.com/Sline.gif',
            'http://example.com/korean01.gif',
            'http://example.com/korean02.gif',
            'http://example.com/english01.gif',
            'http://example.com/english02.gif',
            'http://example.com/Tongsinkage.gif',
            'http://example.com/Knewskage.gif',
        },
        inline_urls)
    self.assertEqual(
        {
            'http://example.com/index-k.htm',
            'http://example.com/index-e.htm',
        },
        linked_urls)

def test_html_layout(self):
    html_parser = HTMLParser()
    reader = HTMLReader(html_parser)
    elements = tuple(
        reader.iter_elements(io.BytesIO(b'''
        <html>
            <head>
                <title>hi</title>
            </head>
            <body>
                <img>
            </body>
        </html>'''), encoding='ascii')
    )

    print(elements)

    self.assertEqual('html', elements[0].tag)
    self.assertEqual('head', elements[1].tag)
    self.assertEqual('title', elements[2].tag)
    self.assertEqual('body', elements[3].tag)
    self.assertEqual('img', elements[4].tag)

def test_html_krokozyabry(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()
    response.fields['content-type'] = 'text/html; charset=KOI8-R'

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples',
                                      'krokozyabry.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual('koi8-r', scrape_result.encoding)

    self.assertEqual(set(), inline_urls)
    self.assertEqual(
        {'http://example.com/Кракозябры'},
        linked_urls)

def test_xhtml_invalid(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, '')
    response.body = Body()

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples',
                                      'xhtml_invalid.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual(
        {
            'http://example.com/image.png',
            'http://example.com/script.js',
        },
        inline_urls)
    self.assertEqual(
        {'http://example.com/link'},
        linked_urls)

def test_html_early_html(self):
    reader = HTMLReader(HTMLParser())

    for test_string in [
            b'''<!DOCTYPE HTML><html></html><img>''',
            b'''<html></html><img>''',
            b'''<!DOCTYPE HTML><img><html></html>''',
            b'''<img><html></html>''',
            b'''<!DOCTYPE HTML>
            <html><body></body></html><p><img>''',
            b'''
            <html><body></body></html><p><img>''',
            b'''
            <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
            "http://www.w3.org/TR/html4/loose.dtd">
            <html>
                <head>
                    <title>Download</title>
                </head>
                <body>
                    <br />
                </body>
            </html>
            <pre><img></pre>
            ''',
            b'''
            <html>
                <head>
                    <title>Download</title>
                </head>
                <body>
                    <br />
                </body>
            </html>
            <pre><img></pre>
            ''',
            b'''
            <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
            "http://www.w3.org/TR/html4/loose.dtd">
            <html>
                <body>
                    <br />
                </body>
                <head>
                    <title>Download</title>
                </head>
            </html>
            <pre><img></pre>
            ''',
            b'''
            <html>
                <body>
                    <br />
                </body>
                <head>
                    <title>Download</title>
                </head>
            </html>
            <pre><img></pre>
            ''',
            ]:
        elements = []
        print()
        print('a' * 10)
        print(test_string)

        for element in reader.iter_elements(
                io.BytesIO(test_string), encoding='ascii'):
            if isinstance(element, Element):
                print(element)
                elements.append(element)

        element_tags = tuple(element.tag for element in elements)
        self.assertIn('img', element_tags)

def test_html_scraper_links(self):
    element_walker = ElementWalker(
        css_scraper=CSSScraper(), javascript_scraper=JavaScriptScraper())
    scraper = HTMLScraper(HTMLParser(), element_walker)
    request = Request('http://example.com/')
    response = Response(200, 'OK')
    response.body = Body()
    response.fields['Refresh'] = '3; url=header_refresh.html'

    with wpull.util.reset_file_offset(response.body):
        html_file_path = os.path.join(ROOT_PATH,
                                      'testing', 'samples', 'many_urls.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body)

    scrape_result = scraper.scrape(request, response)
    inline_urls = scrape_result.inline_links
    linked_urls = scrape_result.linked_links

    self.assertEqual('utf-8', scrape_result.encoding)

    self.assertEqual(
        frozenset({
            'http://example.com/style_import_url.css',
            'http://example.com/style_import_quote_url.css',
            'http://example.com/style_single_quote_import.css',
            'http://example.com/style_double_quote_import.css',
            'http://example.com/bg.png',
            'http://example.com/link_href.css',
            'http://example.com/script.js',
            'http://example.com/body_background.png',
            'http://example.com/images/table_background.png',
            'http://example.com/images/td_background.png',
            'http://example.com/images/th_background.png',
            'http://example.com/style_url1.png',
            'http://example.com/style_url2.png',
            'http://example.com/applet/',  # returned by lxml
            'http://example.com/applet/applet_code.class',
            'http://example.com/applet/applet_src.class',
            'http://example.com/bgsound.mid',
            'http://example.com/audio_src.wav',
            'http://example.com/audio_poster.jpeg',
            'http://example.net/source_src.wav',
            'http://example.com/video_src.webm',
            'http://example.com/video_poster.jpeg',
            'http://example.net/track_src.vtt',
            'http://example.net/source_src.webm',
            'http://example.com/embed_src.mov',
            'http://example.com/fig_src.png',
            'http://example.com/frame_src.html',
            'http://example.com/iframe_src.html',
            'http://example.com/img_href.png',
            'http://example.com/img_lowsrc.png',
            'http://example.com/img_src.png',
            'http://example.com/img_data.png',
            'http://example.com/img_srcset_1.jpeg',
            'http://example.com/img_srcset_2.jpeg',
            'http://example.com/img_srcset_3.jpeg',
            'http://example.com/input_src.png',
            'http://example.com/layer_src.png',
            'http://example.com/object/',  # returned by lxml
            'http://example.com/object/object_data.swf',
            'http://example.com/object/object_archive.dat',
            'mailto:internet',
            'object_not_url_codebase',
            'http://example.com/param_ref_value.php',
            'http://example.com/overlay_src.html',
            'http://example.com/script_variable.png',
        }),
        inline_urls)
    self.assertEqual(
        frozenset({
            'http://example.com/og_image.png',
            'http://example.com/og_url.html',
            'http://example.com/og_audio.mp3',
            'http://example.com/og_video.webm',
            'http://example.com/twitter_image.png',
            'http://example.com/twitter_image0.png',
            'http://example.com/twitter_image1.png',
            'http://example.com/twitter_image2.png',
            'http://example.com/twitter_image3.png',
            'http://example.com/twitter_player.html',
            'http://example.com/twitter_stream.mp4',
            'http://example.net/soup.html',
            'http://example.com/a_href.html',
            'http://example.com/area_href.html',
            'http://example.com/frame_src.html',
            'http://example.com/embed_href.html',
            'http://example.com/embed_src.mov',
            'http://example.com/form_action.html',
            'http://example.com/iframe_src.html',
            'http://example.com/layer_src.png',
            'http://example.com/overlay_src.html',
            'ftp://ftp.protocol.invalid/',
            'mailto:[email protected]',
            'http://a-double-slash.example',
            'http://example.com/header_refresh.html',
            'https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6',
            'http://example.com/document_write.html',
            'http://example.com/http_document_write.html',
            'http://example.com/http_document_write2.html',
            'http://example.com/http document write.html',
            'http://example.com/script_variable.html',
            'http://example.com/http_script_variable.html',
            'https://example.com/https_script_variable.html',
            'ftp://example.com/ftp_script_variable.html',
            'http://example.com/end_dir_script_variable/',
            'http://example.com/start_dir_script_variable',
            'http://example.com/../relative_dir_script_variable'
            if sys.version_info < (3, 5) else
            'http://example.com/relative_dir_script_variable',
            'http://example.com/script_json.html',
            'http://example.com/http_script_json.html?a=b',
            'http://example.com/a_javascript_link.html',
            'http://example.com/a_onclick_link.html',
        }),
        linked_urls)

    for url in inline_urls | linked_urls:
        self.assertIsInstance(url, str)