def test_css_scraper_krokozyabry(self):
    '''Scraping a CSS file with Cyrillic ("krokozyabry") URLs yields
    the correctly decoded inline URL and no linked URLs.'''
    scraper = CSSScraper()
    request = Request.new('http://example.com/styles.css')
    response = Response('HTTP/1.0', 200, 'OK')

    sample_path = os.path.join(
        os.path.dirname(__file__), 'testing', 'samples', 'krokozyabry.css')

    # Load the sample CSS into the response body, restoring the file
    # offset afterwards so the scraper reads from the beginning.
    with wpull.util.reset_file_offset(response.body.content_file):
        with open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, response.body.content_file)

    scrape_result = scraper.scrape(request, response)

    self.assertEqual(
        {'http://example.com/Кракозябры.png'},
        scrape_result['inline_urls']
    )
    self.assertFalse(scrape_result['linked_urls'])
def test_css_scraper_mojibake(self):
    '''Scraping a CSS file with Japanese (mojibake-prone) URLs yields
    the correctly decoded inline URL and no linked URLs.'''
    scraper = CSSScraper()
    request = Request.new('http://example.com/styles.css')
    response = Response('HTTP/1.0', 200, 'OK')

    sample_path = os.path.join(
        os.path.dirname(__file__), 'testing', 'samples', 'mojibake.css')

    # Load the sample CSS into the response body, restoring the file
    # offset afterwards so the scraper reads from the beginning.
    with wpull.util.reset_file_offset(response.body.content_file):
        with open(sample_path, 'rb') as sample_file:
            shutil.copyfileobj(sample_file, response.body.content_file)

    scrape_result = scraper.scrape(request, response)

    self.assertEqual(
        {'http://example.com/文字化け.png'},
        scrape_result['inline_urls']
    )
    self.assertFalse(scrape_result['linked_urls'])
def test_scrape_css_urls(self):
    '''`scrape_urls` extracts only url() tokens, while `scrape_imports`
    also picks up bare and quoted @import targets.'''
    css_text = '''
    @import url("fineprint.css") print;
    @import url("bluish.css") projection, tv;
    @import 'custom.css';
    @import url("chrome://communicator/skin/");
    @import "common.css" screen, projection;
    @import url('landscape.css') screen and (orientation:landscape);
    @import url(cool.css);
    @import warm.css;
    '''

    # url() tokens only: quoted-string and bare @import targets are excluded.
    self.assertEqual(
        {
            'fineprint.css',
            'bluish.css',
            'chrome://communicator/skin/',
            'landscape.css',
            'cool.css',
        },
        set(CSSScraper.scrape_urls(css_text))
    )

    # @import scraping covers every form, including 'custom.css',
    # "common.css", and the unquoted warm.css.
    self.assertEqual(
        {
            'fineprint.css',
            'bluish.css',
            'custom.css',
            'chrome://communicator/skin/',
            'common.css',
            'landscape.css',
            'cool.css',
            'warm.css',
        },
        set(CSSScraper.scrape_imports(css_text))
    )
def convert_by_record(self, url_record):
    '''Convert links in the file referenced by the given URL Record.

    Determines the document type (from the record or by sniffing the
    file), optionally backs up the original, converts links into a
    temporary file, then replaces the original with the converted copy.

    Raises:
        Exception: if the link type cannot be determined.
    '''
    filename = url_record.filename

    if not os.path.exists(filename):
        return

    if url_record.link_type:
        if url_record.link_type not in ('css', 'html'):
            return
        else:
            link_type = url_record.link_type
    else:
        # No recorded link type: sniff the file contents instead.
        with open(filename, 'rb') as in_file:
            if HTMLScraper.is_supported(file=in_file,
                                        url_info=url_record.url_info):
                link_type = 'html'
            elif CSSScraper.is_supported(file=in_file,
                                         url_info=url_record.url_info):
                link_type = 'css'
            else:
                link_type = None

    # Fix: the format string was missing the {filename} placeholder,
    # so the `filename` keyword argument was silently unused.
    _logger.info(
        _('Converting links in file ‘{filename}’ (type={type}).')
        .format(filename=filename, type=link_type)
    )

    if self._backup_enabled:
        # copy2 preserves metadata (timestamps) on the backup.
        shutil.copy2(filename, filename + '.orig')

    temp_filename = filename + '-new'

    if link_type == 'css':
        self._css_converter.convert(
            filename, temp_filename, base_url=url_record.url)
    elif link_type == 'html':
        self._html_converter.convert(
            filename, temp_filename, base_url=url_record.url)
    else:
        raise Exception('Unknown link type.')

    # Swap the converted file into place of the original.
    os.remove(filename)
    os.rename(temp_filename, filename)
def convert_by_record(self, url_record):
    '''Convert links in the file referenced by the given URL Record.

    Determines the document type (from the record or by sniffing the
    file), optionally backs up the original, converts links into a
    temporary file, then replaces the original with the converted copy.

    Raises:
        Exception: if the link type cannot be determined.
    '''
    filename = url_record.filename

    if not os.path.exists(filename):
        return

    if url_record.link_type:
        if url_record.link_type not in ('css', 'html'):
            return
        else:
            link_type = url_record.link_type
    else:
        # No recorded link type: sniff the file contents instead.
        with open(filename, 'rb') as in_file:
            if HTMLScraper.is_supported(
                    file=in_file, url_info=url_record.url_info):
                link_type = 'html'
            elif CSSScraper.is_supported(
                    file=in_file, url_info=url_record.url_info):
                link_type = 'css'
            else:
                link_type = None

    # Fix: the format string was missing the {filename} placeholder,
    # so the `filename` keyword passed to the lazy formatter was unused.
    _logger.info(__(
        _('Converting links in file ‘{filename}’ (type={type}).'),
        filename=filename, type=link_type
    ))

    if self._backup_enabled:
        # copy2 preserves metadata (timestamps) on the backup.
        shutil.copy2(filename, filename + '.orig')

    temp_filename = filename + '-new'

    if link_type == 'css':
        self._css_converter.convert(
            filename, temp_filename, base_url=url_record.url)
    elif link_type == 'html':
        self._html_converter.convert(
            filename, temp_filename, base_url=url_record.url)
    else:
        raise Exception('Unknown link type.')

    # Swap the converted file into place of the original.
    os.remove(filename)
    os.rename(temp_filename, filename)
def convert_by_record(self, url_record):
    '''Convert links in the file derived from the given URL Record.

    Resolves the on-disk filename through the path namer, determines
    the document type (from the record or by sniffing the file),
    optionally backs up the original, and converts links in place.
    Files of unknown type are logged but left unconverted.
    '''
    filename = self._path_namer.get_filename(URLInfo.parse(url_record.url))

    if not os.path.exists(filename):
        return

    if url_record.link_type:
        if url_record.link_type not in ('css', 'html'):
            return
        else:
            link_type = url_record.link_type
    else:
        # No recorded link type: sniff the file contents instead.
        with open(filename, 'rb') as in_file:
            if HTMLScraper.is_supported(in_file,
                                        url_info=url_record.url_info):
                link_type = 'html'
            elif CSSScraper.is_supported(in_file,
                                         url_info=url_record.url_info):
                link_type = 'css'
            else:
                link_type = None

    # Fix: the format string was missing the {filename} placeholder,
    # so the `filename` keyword argument was silently unused.
    _logger.info(
        _('Converting links in file ‘{filename}’ (type={type}).')
        .format(filename=filename, type=link_type)
    )

    if self._backup_enabled:
        # copy2 preserves metadata (timestamps) on the backup.
        shutil.copy2(filename, filename + '.orig')

    if link_type == 'css':
        self._css_converter.convert(
            filename, filename, base_url=url_record.url)
    elif link_type == 'html':
        self._html_converter.convert(
            filename, filename, base_url=url_record.url)
def convert_by_record(self, url_record):
    '''Convert links in the file derived from the given URL Record.

    Resolves the on-disk filename through the path namer, determines
    the document type (from the record or by sniffing the file),
    optionally backs up the original, and converts links in place.
    Files of unknown type are logged but left unconverted.
    '''
    filename = self._path_namer.get_filename(
        URLInfo.parse(url_record.url)
    )

    if not os.path.exists(filename):
        return

    if url_record.link_type:
        if url_record.link_type not in ('css', 'html'):
            return
        else:
            link_type = url_record.link_type
    else:
        # No recorded link type: sniff the file contents instead.
        with open(filename, 'rb') as in_file:
            if HTMLScraper.is_supported(
                    in_file, url_info=url_record.url_info):
                link_type = 'html'
            elif CSSScraper.is_supported(
                    in_file, url_info=url_record.url_info):
                link_type = 'css'
            else:
                link_type = None

    # Fix: the format string was missing the {filename} placeholder,
    # so the `filename` keyword argument was silently unused.
    _logger.info(
        _('Converting links in file ‘{filename}’ (type={type}).')
        .format(filename=filename, type=link_type)
    )

    if self._backup_enabled:
        # copy2 preserves metadata (timestamps) on the backup.
        shutil.copy2(filename, filename + '.orig')

    if link_type == 'css':
        self._css_converter.convert(
            filename, filename, base_url=url_record.url)
    elif link_type == 'html':
        self._html_converter.convert(
            filename, filename, base_url=url_record.url)