def test_html_scraper_links_base_href(self):
    '''Scrape the ``basehref.html`` sample and check the extracted URLs.

    The detected encoding and both the inline and the linked URL sets
    are compared against fixed expectations.
    '''
    sample_path = os.path.join(
        os.path.dirname(__file__), 'testing', 'samples', 'basehref.html'
    )
    expected_inline = {
        'http://cdn.example.com/stylesheet1.css',
        'http://www.example.com/stylesheet2.css',
        'http://example.com/a/stylesheet3.css',
        'http://example.com/a/dir/image1.png',
        'http://example.com/dir/image2.png',
        'http://example.net/image3.png',
        'http://example.com/dir/image4.png',
    }
    expected_linked = {'http://example.com/a/'}

    scraper = HTMLScraper()
    request = Request.new('http://example.com/')
    response = Response('HTTP/1.0', 200, 'OK')

    # Copy the sample into the response body. NOTE(review): the context
    # manager presumably restores the file offset on exit so the scraper
    # reads from the start - confirm against wpull.util.
    with wpull.util.reset_file_offset(response.body.content_file):
        with open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, response.body.content_file)

    scrape_info = scraper.scrape(request, response)
    inline_urls = scrape_info['inline_urls']
    linked_urls = scrape_info['linked_urls']

    self.assertEqual('utf-8', scrape_info['encoding'])
    self.assertEqual(expected_inline, inline_urls)
    self.assertEqual(expected_linked, linked_urls)
def test_rss_as_html(self):
    '''Scrape the ``rss.xml`` sample sent as ``application/rss+xml``.

    The scrape must yield a truthy result with no inline URLs and the
    two expected linked URLs.
    '''
    feed_path = os.path.join(
        os.path.dirname(__file__), 'testing', 'samples', 'rss.xml'
    )
    expected_linked = {
        'http://www.someexamplerssdomain.com/main.html',
        'http://www.wikipedia.org/'
    }

    scraper = HTMLScraper()
    request = Request.new('http://example.com/')
    response = Response('HTTP/1.0', 200, '')
    response.fields['content-type'] = 'application/rss+xml'

    # Load the feed into the response body before scraping.
    with wpull.util.reset_file_offset(response.body.content_file):
        with open(feed_path, 'rb') as feed_file:
            shutil.copyfileobj(feed_file, response.body.content_file)

    scrape_info = scraper.scrape(request, response)

    self.assertTrue(scrape_info)

    inline_urls = scrape_info['inline_urls']
    linked_urls = scrape_info['linked_urls']

    self.assertFalse(inline_urls)
    self.assertEqual(expected_linked, linked_urls)
def test_html_soup(self):
    '''Scrape the ``soup.html`` sample with a ``Refresh: yes`` header.

    The ``Refresh`` field value carries no URL; only the URLs found in
    the document itself are expected.
    '''
    sample_path = os.path.join(
        os.path.dirname(__file__), 'testing', 'samples', 'soup.html'
    )
    expected_inline = {'http://example.com/ABOUTM~1.JPG'}
    expected_linked = {
        'http://example.com/BLOG',
        'http://example.com/web ring/Join.htm',
    }

    scraper = HTMLScraper()
    request = Request.new('http://example.com/')
    response = Response('HTTP/1.0', 200, '')
    response.fields['Refresh'] = 'yes'

    # Load the sample document into the response body.
    with wpull.util.reset_file_offset(response.body.content_file):
        with open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, response.body.content_file)

    scrape_info = scraper.scrape(request, response)
    inline_urls = scrape_info['inline_urls']
    linked_urls = scrape_info['linked_urls']

    self.assertEqual(expected_inline, inline_urls)
    self.assertEqual(expected_linked, linked_urls)
def test_xhtml_invalid(self):
    '''Scrape the ``xhtml_invalid.html`` sample and check its URLs.'''
    sample_path = os.path.join(
        os.path.dirname(__file__), 'testing', 'samples',
        'xhtml_invalid.html'
    )
    expected_inline = {
        'http://example.com/image.png',
        'http://example.com/script.js',
    }
    expected_linked = {'http://example.com/link'}

    scraper = HTMLScraper()
    request = Request.new('http://example.com/')
    response = Response('HTTP/1.0', 200, '')

    # Load the sample document into the response body.
    with wpull.util.reset_file_offset(response.body.content_file):
        with open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, response.body.content_file)

    scrape_info = scraper.scrape(request, response)
    inline_urls = scrape_info['inline_urls']
    linked_urls = scrape_info['linked_urls']

    self.assertEqual(expected_inline, inline_urls)
    self.assertEqual(expected_linked, linked_urls)
def test_html_krokozyabry(self):
    '''Scrape ``krokozyabry.html`` declared as KOI8-R in the header.

    The reported encoding must be ``koi8-r``, no inline URLs may be
    found and the single linked URL must contain the Cyrillic path.
    '''
    sample_path = os.path.join(
        os.path.dirname(__file__), 'testing', 'samples',
        'krokozyabry.html'
    )
    expected_linked = {'http://example.com/Кракозябры'}

    scraper = HTMLScraper()
    request = Request.new('http://example.com/')
    response = Response('HTTP/1.0', 200, '')
    response.fields['content-type'] = 'text/html; charset=KOI8-R'

    # Load the sample document into the response body.
    with wpull.util.reset_file_offset(response.body.content_file):
        with open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, response.body.content_file)

    scrape_info = scraper.scrape(request, response)
    inline_urls = scrape_info['inline_urls']
    linked_urls = scrape_info['linked_urls']

    self.assertEqual('koi8-r', scrape_info['encoding'])
    self.assertEqual(set(), inline_urls)
    self.assertEqual(expected_linked, linked_urls)
def test_javascript_heavy_inline_monstrosity(self):
    '''Scrape ``twitchplayspokemonfirered.html`` and spot-check two URLs.

    Only membership is checked: one URL in the inline set and one in
    the linked set.
    '''
    sample_path = os.path.join(
        os.path.dirname(__file__), 'testing', 'samples',
        'twitchplayspokemonfirered.html'
    )
    wanted_inline = (
        'http://cdn.bulbagarden.net/upload/archive/a/a4/'
        '20090718115357%21195Quagsire.png'
    )
    wanted_linked = (
        'http://www.google.com/url?q=http%3A%2F%2Fwww.reddit.com%2F'
        'user%2FGoldenSandslash15&sa=D&sntz=1&'
        'usg=AFQjCNElFBxZYdNm5mWoRSncf5tbdIJQ-A'
    )

    scraper = HTMLScraper()
    request = Request.new('http://example.com/')
    response = Response('HTTP/1.0', 200, 'OK')

    # Load the sample document into the response body.
    with wpull.util.reset_file_offset(response.body.content_file):
        with open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, response.body.content_file)

    scrape_info = scraper.scrape(request, response)
    inline_urls = scrape_info['inline_urls']
    linked_urls = scrape_info['linked_urls']

    self.assertIn(wanted_inline, inline_urls)
    self.assertIn(wanted_linked, linked_urls)
def test_html_serious_bad_encoding(self):
    '''Scrape ``xkcd_1_evil.html`` with the encoding forced to ``utf8``.

    Both the scraper override and the content-type header say ``utf8``;
    the test only requires that scraping yields a truthy result.
    '''
    sample_path = os.path.join(
        os.path.dirname(__file__), 'testing', 'samples',
        'xkcd_1_evil.html'
    )

    scraper = HTMLScraper(encoding_override='utf8')
    request = Request.new('http://example.com/')
    response = Response('HTTP/1.0', 200, '')
    response.fields['content-type'] = 'text/html; charset=utf8'

    # Load the sample document into the response body.
    with wpull.util.reset_file_offset(response.body.content_file):
        with open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, response.body.content_file)

    scrape_info = scraper.scrape(request, response)

    self.assertTrue(scrape_info)
def test_html_garbage(self):
    '''Scrape a body of arbitrary bytes labelled as ``text/html``.

    The test only requires that scraping yields a truthy result.
    '''
    garbage = (
        b'\x01\x00\x01\x00l~Z\xff\x0f`y\x80\x00p<\x7f'
        b'\xffndo\xff\xff-\x83{d\xec</\xfe\x80\x00\xb4Bo'
        b'\x7f\xff\xff\xffV\xc1\xff\x7f\xff7'
    )

    scraper = HTMLScraper()
    request = Request.new('http://example.com/')
    response = Response('HTTP/1.0', 200, '')
    response.fields['content-type'] = 'text/html'

    # Write the raw bytes straight into the response body.
    with wpull.util.reset_file_offset(response.body.content_file):
        response.body.content_file.write(garbage)

    scrape_info = scraper.scrape(request, response)

    self.assertTrue(scrape_info)
def test_html_encoding_lxml_name_mismatch(self):
    '''It should handle a charset name written with an underscore.

    The header declares ``EUC_KR``; the scrape must succeed and report
    the encoding as ``euc_kr``.
    '''
    payload = '힖'.encode('euc_kr')

    scraper = HTMLScraper()
    request = Request.new('http://example.com/')
    response = Response('HTTP/1.0', 200, '')
    response.fields['content-type'] = 'text/html; charset=EUC_KR'

    # Write the encoded character straight into the response body.
    with wpull.util.reset_file_offset(response.body.content_file):
        response.body.content_file.write(payload)

    scrape_info = scraper.scrape(request, response)

    self.assertTrue(scrape_info)
    self.assertEqual('euc_kr', scrape_info['encoding'])
def _read_input_file_as_html(self):
    '''Scrape the input file as HTML and return an iterator of its links.

    The inline URLs are yielded first, followed by the linked URLs.
    The encoding falls back to ``utf-8`` when no local encoding is set.
    '''
    input_file = self._args.input_file
    encoding = self._args.local_encoding or 'utf-8'
    scrape_info = HTMLScraper.scrape_file(input_file, encoding=encoding)

    return itertools.chain(
        scrape_info['inline_urls'], scrape_info['linked_urls']
    )
def _read_input_file_as_html(self):
    '''Return the links scraped from the input file, treated as HTML.

    The result is an iterator over the inline URLs followed by the
    linked URLs. ``utf-8`` is used when no local encoding is configured.
    '''
    args = self._args
    scrape_info = HTMLScraper.scrape_file(
        args.input_file, encoding=args.local_encoding or 'utf-8'
    )
    inline_urls = scrape_info['inline_urls']
    linked_urls = scrape_info['linked_urls']

    return itertools.chain(inline_urls, linked_urls)
def test_html_wrong_charset(self):
    '''Scrape the ``kcna.html`` sample and check encoding and URLs.

    No content-type header is set on the response; the scraper is
    expected to report the encoding as ``utf-16-le``.
    '''
    scraper = HTMLScraper()
    request = Request.new('http://example.com/')
    response = Response('HTTP/1.0', 200, '')

    # Load the sample document into the response body.
    with wpull.util.reset_file_offset(response.body.content_file):
        html_file_path = os.path.join(os.path.dirname(__file__),
                                      'testing', 'samples', 'kcna.html')
        with open(html_file_path, 'rb') as in_file:
            shutil.copyfileobj(in_file, response.body.content_file)

    scrape_info = scraper.scrape(request, response)
    inline_urls = scrape_info['inline_urls']
    linked_urls = scrape_info['linked_urls']

    self.assertEqual('utf-16-le', scrape_info['encoding'])

    # Each URL is listed once; the previous literal repeated
    # Knewskage.gif, which a set silently collapses anyway.
    self.assertEqual(
        {
            'http://example.com/utm/__utm.js',
            'http://example.com/Knewskage.gif',
            'http://example.com/Lline.gif',
            'http://example.com/Sline.gif',
            'http://example.com/korean01.gif',
            'http://example.com/korean02.gif',
            'http://example.com/english01.gif',
            'http://example.com/english02.gif',
            'http://example.com/Tongsinkage.gif',
        },
        inline_urls
    )
    self.assertEqual(
        {
            'http://example.com/index-k.htm',
            'http://example.com/index-e.htm',
        },
        linked_urls
    )
def test_html_not_quite_charset(self):
    '''Scrape ``videogame_top.htm`` and spot-check two URLs.

    Only membership is checked: one URL in the inline set and one in
    the linked set.
    '''
    sample_path = os.path.join(
        os.path.dirname(__file__), 'testing', 'samples',
        'videogame_top.htm'
    )
    wanted_inline = 'http://example.com/copyright_2001_2006_rtype.gif'
    wanted_linked = 'http://www.geocities.jp/gamehouse_grindcrusher/'

    scraper = HTMLScraper()
    request = Request.new('http://example.com/')
    response = Response('HTTP/1.0', 200, '')

    # Load the sample document into the response body.
    with wpull.util.reset_file_offset(response.body.content_file):
        with open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, response.body.content_file)

    scrape_info = scraper.scrape(request, response)
    inline_urls = scrape_info['inline_urls']
    linked_urls = scrape_info['linked_urls']

    self.assertIn(wanted_inline, inline_urls)
    self.assertIn(wanted_linked, linked_urls)
def convert_by_record(self, url_record):
    '''Convert the links in the file that belongs to the given URL Record.

    The record supplies ``filename``, ``link_type``, ``url_info`` and
    ``url``. Nothing happens when the file does not exist or when the
    record carries a link type other than ``css`` or ``html``. When the
    record has no link type, the HTML and CSS scrapers are asked in
    turn whether they support the file.

    The converted output is written to ``<filename>-new`` and then
    moved over the original file. If backups are enabled, a copy is
    first saved as ``<filename>.orig``.

    Raises:
        Exception: If the record has no link type and neither scraper
            supports the file. This is raised after the log message and
            the optional backup have been made.
    '''
    filename = url_record.filename

    if not os.path.exists(filename):
        return

    if url_record.link_type:
        if url_record.link_type not in ('css', 'html'):
            return

        link_type = url_record.link_type
    else:
        # No recorded type: probe the file contents instead.
        with open(filename, 'rb') as in_file:
            if HTMLScraper.is_supported(
                    file=in_file, url_info=url_record.url_info):
                link_type = 'html'
            elif CSSScraper.is_supported(
                    file=in_file, url_info=url_record.url_info):
                link_type = 'css'
            else:
                link_type = None

    _logger.info(
        _('Converting links in file ‘{filename}’ (type={type}).')
        .format(filename=filename, type=link_type)
    )

    if self._backup_enabled:
        shutil.copy2(filename, filename + '.orig')

    temp_filename = filename + '-new'

    if link_type == 'css':
        self._css_converter.convert(
            filename, temp_filename, base_url=url_record.url)
    elif link_type == 'html':
        self._html_converter.convert(
            filename, temp_filename, base_url=url_record.url)
    else:
        raise Exception('Unknown link type.')

    # os.replace overwrites the destination in a single step, so the
    # original is never deleted before the converted copy is in place
    # (a separate remove + rename loses the file if the rename fails).
    os.replace(temp_filename, filename)
def convert_by_record(self, url_record):
    '''Convert the links in the file that belongs to the given URL Record.

    The record supplies ``filename``, ``link_type``, ``url_info`` and
    ``url``. Nothing happens when the file does not exist or when the
    record carries a link type other than ``css`` or ``html``. When the
    record has no link type, the HTML and CSS scrapers are asked in
    turn whether they support the file.

    The converted output is written to ``<filename>-new`` and then
    moved over the original file. If backups are enabled, a copy is
    first saved as ``<filename>.orig``.

    Raises:
        Exception: If the record has no link type and neither scraper
            supports the file. This is raised after the log message and
            the optional backup have been made.
    '''
    filename = url_record.filename

    if not os.path.exists(filename):
        return

    if url_record.link_type:
        if url_record.link_type not in ('css', 'html'):
            return

        link_type = url_record.link_type
    else:
        # No recorded type: probe the file contents instead.
        with open(filename, 'rb') as in_file:
            if HTMLScraper.is_supported(
                    file=in_file, url_info=url_record.url_info):
                link_type = 'html'
            elif CSSScraper.is_supported(
                    file=in_file, url_info=url_record.url_info):
                link_type = 'css'
            else:
                link_type = None

    _logger.info(__(
        _('Converting links in file ‘{filename}’ (type={type}).'),
        filename=filename, type=link_type
    ))

    if self._backup_enabled:
        shutil.copy2(filename, filename + '.orig')

    temp_filename = filename + '-new'

    if link_type == 'css':
        self._css_converter.convert(
            filename, temp_filename, base_url=url_record.url)
    elif link_type == 'html':
        self._html_converter.convert(
            filename, temp_filename, base_url=url_record.url)
    else:
        raise Exception('Unknown link type.')

    # os.replace overwrites the destination in a single step, so the
    # original is never deleted before the converted copy is in place
    # (a separate remove + rename loses the file if the rename fails).
    os.replace(temp_filename, filename)
def convert_by_record(self, url_record):
    '''Rewrite, in place, the links of the file for the given URL Record.

    The filename is derived from the record's URL through the path
    namer. Nothing happens when that file does not exist or when the
    record carries a link type other than ``css`` or ``html``. Without
    a recorded link type the HTML and CSS scrapers are asked in turn
    whether they support the file; if neither does, the message is
    still logged and the optional ``.orig`` backup still made, but no
    conversion runs.
    '''
    parsed_url = URLInfo.parse(url_record.url)
    path = self._path_namer.get_filename(parsed_url)

    if not os.path.exists(path):
        return

    if url_record.link_type:
        if url_record.link_type not in ('css', 'html'):
            return

        kind = url_record.link_type
    else:
        # No recorded type: probe the file contents instead.
        with open(path, 'rb') as probe_file:
            if HTMLScraper.is_supported(
                    probe_file, url_info=url_record.url_info):
                kind = 'html'
            elif CSSScraper.is_supported(
                    probe_file, url_info=url_record.url_info):
                kind = 'css'
            else:
                kind = None

    message = _('Converting links in file ‘{filename}’ (type={type}).')
    _logger.info(message.format(filename=path, type=kind))

    if self._backup_enabled:
        backup_path = path + '.orig'
        shutil.copy2(path, backup_path)

    # Source and destination are the same file: conversion is in place.
    if kind == 'css':
        self._css_converter.convert(path, path, base_url=url_record.url)
    elif kind == 'html':
        self._html_converter.convert(path, path, base_url=url_record.url)
def convert_by_record(self, url_record):
    '''Rewrite, in place, the links of the file for the given URL Record.

    The filename is derived from the record's URL through the path
    namer. Nothing happens when that file does not exist or when the
    record carries a link type other than ``css`` or ``html``. Without
    a recorded link type the HTML and CSS scrapers are asked in turn
    whether they support the file; if neither does, the message is
    still logged and the optional ``.orig`` backup still made, but no
    conversion runs.
    '''
    target = self._path_namer.get_filename(URLInfo.parse(url_record.url))

    if not os.path.exists(target):
        return

    if url_record.link_type:
        if url_record.link_type not in ('css', 'html'):
            return

        detected_type = url_record.link_type
    else:
        # No recorded type: probe the file contents instead.
        with open(target, 'rb') as candidate:
            if HTMLScraper.is_supported(
                    candidate, url_info=url_record.url_info):
                detected_type = 'html'
            elif CSSScraper.is_supported(
                    candidate, url_info=url_record.url_info):
                detected_type = 'css'
            else:
                detected_type = None

    log_text = _(
        'Converting links in file ‘{filename}’ (type={type}).'
    ).format(filename=target, type=detected_type)
    _logger.info(log_text)

    if self._backup_enabled:
        shutil.copy2(target, target + '.orig')

    # Source and destination are the same file: conversion is in place.
    if detected_type == 'css':
        self._css_converter.convert(
            target, target, base_url=url_record.url
        )
    elif detected_type == 'html':
        self._html_converter.convert(
            target, target, base_url=url_record.url
        )
def test_html_scraper_links(self):
    '''Scrape the ``many_urls.html`` sample and check every extracted URL.

    The response also carries a ``Refresh`` header pointing at
    ``header_refresh.html``. The detected encoding, the complete inline
    and linked URL sets, and the type of every URL are verified.
    '''
    sample_path = os.path.join(
        os.path.dirname(__file__), 'testing', 'samples', 'many_urls.html'
    )
    expected_inline = {
        'http://example.com/style_import_url.css',
        'http://example.com/style_import_quote_url.css',
        'http://example.com/style_single_quote_import.css',
        'http://example.com/style_double_quote_import.css',
        'http://example.com/link_href.css',
        'http://example.com/script.js',
        'http://example.com/body_background.png',
        'http://example.com/images/table_background.png',
        'http://example.com/images/td_background.png',
        'http://example.com/images/th_background.png',
        'http://example.com/style_url1.png',
        'http://example.com/style_url2.png',
        'http://example.com/applet/',  # returned by lxml
        'http://example.com/applet/applet_code.class',
        'http://example.com/applet/applet_src.class',
        'http://example.com/bgsound.mid',
        'http://example.com/audio_src.wav',
        'http://example.net/source_src.wav',
        'http://example.com/embed_src.mov',
        'http://example.com/fig_src.png',
        'http://example.com/frame_src.html',
        'http://example.com/iframe_src.html',
        'http://example.com/img_href.png',
        'http://example.com/img_lowsrc.png',
        'http://example.com/img_src.png',
        'http://example.com/input_src.png',
        'http://example.com/layer_src.png',
        'http://example.com/object/',  # returned by lxml
        'http://example.com/object/object_data.swf',
        'http://example.com/object/object_archive.dat',
        'http://example.com/param_ref_value.php',
        'http://example.com/overlay_src.html',
        'http://example.com/script_variable.png',
    }
    expected_linked = {
        'http://example.net/soup.html',
        'http://example.com/a_href.html',
        'http://example.com/area_href.html',
        'http://example.com/frame_src.html',
        'http://example.com/embed_href.html',
        'http://example.com/embed_src.mov',
        'http://example.com/form_action.html',
        'http://example.com/iframe_src.html',
        'http://example.com/layer_src.png',
        'http://example.com/overlay_src.html',
        'ftp://ftp.protocol.invalid/',
        # NOTE(review): this looks like an e-mail obfuscation
        # placeholder rather than a real address - confirm against
        # the many_urls.html sample.
        'mailto:[email protected]',
        'http://a-double-slash.example',
        'http://example.com/header_refresh.html',
        'https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6',
        'http://example.com/document_write.html',
        'http://example.com/http_document_write.html',
        'http://example.com/http_document_write2.html',
        'http://example.com/http document write.html',
        'http://example.com/script_variable.html',
        'http://example.com/http_script_variable.html',
        'https://example.com/https_script_variable.html',
        'ftp://example.com/ftp_script_variable.html',
        'http://example.com/end_dir_script_variable/',
        'http://example.com/start_dir_script_variable',
        'http://example.com/../relative_dir_script_variable',
        'http://example.com/script_json.html',
        'http://example.com/http_script_json.html?a=b',
    }

    scraper = HTMLScraper()
    request = Request.new('http://example.com/')
    response = Response('HTTP/1.0', 200, 'OK')
    response.fields['Refresh'] = '3; url=header_refresh.html'

    # Load the sample document into the response body.
    with wpull.util.reset_file_offset(response.body.content_file):
        with open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, response.body.content_file)

    scrape_info = scraper.scrape(request, response)
    inline_urls = scrape_info['inline_urls']
    linked_urls = scrape_info['linked_urls']

    self.assertEqual('ascii', scrape_info['encoding'])
    self.assertEqual(expected_inline, inline_urls)
    self.assertEqual(expected_linked, linked_urls)

    # Every URL, inline or linked, must be a text string.
    for found_url in inline_urls | linked_urls:
        self.assertIsInstance(found_url, str)
def test_html_scraper_links(self):
    '''Scrape the ``many_urls.html`` sample and check every extracted URL.

    The response also carries a ``Refresh`` header pointing at
    ``header_refresh.html``. The detected encoding, the complete inline
    and linked URL sets, and the type of every URL are verified.
    '''
    sample_path = os.path.join(
        os.path.dirname(__file__), 'testing', 'samples', 'many_urls.html'
    )
    expected_inline = {
        'http://example.com/style_import_url.css',
        'http://example.com/style_import_quote_url.css',
        'http://example.com/style_single_quote_import.css',
        'http://example.com/style_double_quote_import.css',
        'http://example.com/link_href.css',
        'http://example.com/script.js',
        'http://example.com/body_background.png',
        'http://example.com/images/table_background.png',
        'http://example.com/images/td_background.png',
        'http://example.com/images/th_background.png',
        'http://example.com/style_url1.png',
        'http://example.com/style_url2.png',
        'http://example.com/applet/',  # returned by lxml
        'http://example.com/applet/applet_code.class',
        'http://example.com/applet/applet_src.class',
        'http://example.com/bgsound.mid',
        'http://example.com/audio_src.wav',
        'http://example.net/source_src.wav',
        'http://example.com/embed_src.mov',
        'http://example.com/fig_src.png',
        'http://example.com/frame_src.html',
        'http://example.com/iframe_src.html',
        'http://example.com/img_href.png',
        'http://example.com/img_lowsrc.png',
        'http://example.com/img_src.png',
        'http://example.com/img_data.png',
        'http://example.com/input_src.png',
        'http://example.com/layer_src.png',
        'http://example.com/object/',  # returned by lxml
        'http://example.com/object/object_data.swf',
        'http://example.com/object/object_archive.dat',
        'http://example.com/param_ref_value.php',
        'http://example.com/overlay_src.html',
        'http://example.com/script_variable.png',
    }
    expected_linked = {
        'http://example.net/soup.html',
        'http://example.com/a_href.html',
        'http://example.com/area_href.html',
        'http://example.com/frame_src.html',
        'http://example.com/embed_href.html',
        'http://example.com/embed_src.mov',
        'http://example.com/form_action.html',
        'http://example.com/iframe_src.html',
        'http://example.com/layer_src.png',
        'http://example.com/overlay_src.html',
        'ftp://ftp.protocol.invalid/',
        # NOTE(review): this looks like an e-mail obfuscation
        # placeholder rather than a real address - confirm against
        # the many_urls.html sample.
        'mailto:[email protected]',
        'http://a-double-slash.example',
        'http://example.com/header_refresh.html',
        'https://[2001:db8:85a3:8d3:1319:8a2e:370:7348]:8080/ipv6',
        'http://example.com/document_write.html',
        'http://example.com/http_document_write.html',
        'http://example.com/http_document_write2.html',
        'http://example.com/http document write.html',
        'http://example.com/script_variable.html',
        'http://example.com/http_script_variable.html',
        'https://example.com/https_script_variable.html',
        'ftp://example.com/ftp_script_variable.html',
        'http://example.com/end_dir_script_variable/',
        'http://example.com/start_dir_script_variable',
        'http://example.com/../relative_dir_script_variable',
        'http://example.com/script_json.html',
        'http://example.com/http_script_json.html?a=b',
        'http://example.com/a_javascript_link.html',
        'http://example.com/a_onclick_link.html',
    }

    scraper = HTMLScraper()
    request = Request.new('http://example.com/')
    response = Response('HTTP/1.0', 200, 'OK')
    response.fields['Refresh'] = '3; url=header_refresh.html'

    # Load the sample document into the response body.
    with wpull.util.reset_file_offset(response.body.content_file):
        with open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, response.body.content_file)

    scrape_info = scraper.scrape(request, response)
    inline_urls = scrape_info['inline_urls']
    linked_urls = scrape_info['linked_urls']

    self.assertEqual('ascii', scrape_info['encoding'])
    self.assertEqual(expected_inline, inline_urls)
    self.assertEqual(expected_linked, linked_urls)

    # Every URL, inline or linked, must be a text string.
    for found_url in inline_urls | linked_urls:
        self.assertIsInstance(found_url, str)