def test_list_image_empty(self):
    url = 'http://lassie.it/amp/list_image_empty.html'
    l = Lassie()
    data = l.fetch(url)
    self.assertEqual(1, len(data['images']))

def test_str_image(self):
    url = 'http://lassie.it/amp/str_image.html'
    l = Lassie()
    data = l.fetch(url)
    self.assertEqual(1, len(data['images']))

def test_list_thumbnail_image(self):
    url = 'http://lassie.it/amp/list_thumbnail_image.html'
    l = Lassie()
    data = l.fetch(url)
    self.assertEqual(2, len(data['images']))

def test_list_json(self):
    url = 'http://lassie.it/amp/list_json.html'
    l = Lassie()
    data = l.fetch(url)
    self.assertTrue('Pixar' in data['description'])

def test_no_html_tag(self):
    url = 'http://lassie.it/core/no_html_tag.html'
    l = Lassie()
    data = l.fetch(url)
    self.assertTrue('no_html_tag' in data['title'])

def test_bad_url(self):
    url = 'http://lassie.it/youtube/bad_url_123456.json'
    l = Lassie()
    data = l.fetch(url)
    self.assertIsNone(data.get('oembed'))

def test_bad_json(self):
    url = 'http://lassie.it/amp/bad_json.html'
    l = Lassie()
    data = l.fetch(url)
    self.assertTrue('amp' in data['url'])

def test_video_objects(self):
    url = 'http://lassie.it/amp/video_objects.html'
    l = Lassie()
    data = l.fetch(url)
    self.assertEqual(1, len(data['videos']))
def test_merge_request_kwargs(self):
    l = Lassie()
    l.request_opts = {
        'timeout': 3,
    }
    request_kwargs = l.merge_request_kwargs()
    self.assertTrue('timeout' in request_kwargs)

def test_request_opts_default_user_agent(self):
    l = Lassie()
    l.request_opts = {'timeout': 3}
    # headers should be set to {}, then the default User-Agent should be added
    self.assertTrue(l.client.headers['User-Agent'] == FAKE_USER_AGENT)

def test_youtube_good(self):
    url = 'http://lassie.it/youtube/good.json'
    l = Lassie()
    data = l.fetch(url)
    self.assertEqual(len(data['videos']), 1)
    self.assertEqual(len(data['images']), 1)

def test_request_opts_no_headers(self):
    l = Lassie()
    l.request_opts = {'headers': {}, 'timeout': 3}
    # headers should be set to {}, then the default User-Agent should be added
    self.assertTrue(l.client.headers != {})
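# A minimal usage sketch of what the request_opts tests above exercise:
# options set on the instance are applied to every outgoing request.
# The URL below is hypothetical.
from lassie import Lassie

l = Lassie()
l.request_opts = {
    'headers': {'User-Agent': 'lassie python'},
    'timeout': 3,
}
data = l.fetch('http://example.com/page.html')
print(data.get('title'))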
def test_core_class_setting_is_none(self):
    url = 'http://lassie.it/core/class_setting_is_none.html'
    # This is a really odd use-case where they'd set the class attr to None,
    # but it might happen so oh wellz.
    l = Lassie()
    l.open_graph = None
    data = l.fetch(url, open_graph=False)
    self.assertEqual(len(data['images']), 0)
def test_all_properties(self):
    url = 'http://lassie.it/amp/all_properties.html'
    l = Lassie()
    data = l.fetch(url, all_images=True)
    self.assertEqual(len(data['images']), 3)

    title = 'Google Glass Is Dead, Long Live Snapchat Spectacles'
    self.assertEqual(data['title'], title)
def test_bad_image_dimensions(self):
    url = 'http://lassie.it/core/bad_image_dimensions.html'
    l = Lassie()
    data = l.fetch(url, all_images=True)

    # lassie.utils.convert_to_int catches the TypeError or ValueError and
    # passes, so no width/height is set on the image
    image = data['images'][0]
    self.assertNotIn('width', image)
    self.assertNotIn('height', image)
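# A sketch of the helper the comment above refers to: a convert_to_int() that
# swallows TypeError/ValueError and returns None for unparseable dimensions
# (behavior inferred from the comment, not copied from lassie.utils).
def convert_to_int(value):
    if not value:
        return None
    try:
        return int(value)
    except (TypeError, ValueError):
        return None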
def test_bad_request_opts(self):
    l = Lassie()
    l.request_opts = {
        'bad_key': True,
        'headers': {
            'User-Agent': 'lassie python',
        },
    }
    self.assertTrue('bad_key' not in l.request_opts)
    self.assertTrue('headers' in l.request_opts)
def test_core_retrieve_all_images(self):
    url = 'http://lassie.it/core/retrieve_all_images.html'
    l = Lassie()
    l.all_images = True
    data = l.fetch(url)
    self.assertEqual(len(data['images']), 3)

    last_image = data['images'][2]
    self.assertEqual(last_image['width'], 550)
    self.assertEqual(last_image['height'], 365)
def test_core_class_vs_method_settings(self):
    url = 'http://lassie.it/core/class_vs_method_settings.html'
    l = Lassie()
    data = l.fetch(url)
    self.assertEqual(len(data['images']), 1)

    l.open_graph = False
    data = l.fetch(url)
    # open_graph is set to False, so there shouldn't be any images in the
    # list this time around
    self.assertEqual(len(data['images']), 0)
def test_request_opts(self):
    l = Lassie()
    l.request_opts = {
        'headers': {
            'User-Agent': 'lassie python',
        },
        'timeout': 3,
    }
    self.assertTrue(set(('headers', 'timeout')).issubset(l.request_opts))

    # If they modify one of the keys' values, make sure it actually happened
    l.request_opts['headers'].update({'Content-Type': 'application/json'})
    self.assertEqual(len(l.request_opts['headers']), 2)
    self.assertTrue(set(('User-Agent', 'Content-Type')).issubset(l.request_opts['headers']))
def test_image_dimensions(self):
    url = 'http://lassie.it/core/image_dimensions.html'
    l = Lassie()
    data = l.fetch(url, all_images=True)
    self.assertEqual(len(data['images']), 4)

    # all four images on the page declare 100x100 dimensions
    for image in data['images']:
        self.assertEqual(image['width'], 100)
        self.assertEqual(image['height'], 100)
def get_page_info(url: str, timeout: int = 4) -> Optional[PageInfo]:
    """Returns information about the page at the given address, or None.

    :param url:
    :param timeout: Connection timeout.

    """
    if not url:
        return None

    lassie = Lassie()
    lassie.request_opts = {'timeout': timeout}

    try:
        result = lassie.fetch(
            url,
            touch_icon=False,
            favicon=False,
        )
    except LassieError:
        # LassieError wraps exceptions from requests,
        # including connection errors, timeouts, etc.
        return None

    if result['status_code'] != 200:
        return None

    info = PageInfo(
        title=result.get('title', ''),
        description=result.get('description', ''),
        site_name=result.get('site_name', ''),
        images=result['images'],
    )
    return info
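# Example use of get_page_info() above, assuming PageInfo exposes its fields
# as attributes. It returns None on connection errors and non-200 responses,
# so callers only need a None check. The URL is hypothetical.
info = get_page_info('http://example.com/article.html', timeout=2)
if info is not None:
    print(info.title, len(info.images))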
def __init__(
    self,
    loglevel: int = DEFAULT_LOGLEVEL,
    sleep_time: int = DEFAULT_SLEEP_TIME,
    cache_dir: Optional[Union[str, Path]] = None,
    additional_extractors: Optional[List[Any]] = None,
    subtitle_language: str = DEFAULT_SUBTITLE_LANGUAGE,
    skip_subtitles: bool = False,
) -> None:
    """
    Main interface to the library

    subtitle_language: for youtube subtitle requests
    skip_subtitles: don't attempt to download youtube subtitles
    sleep_time: time to wait between HTTP requests
    cache_dir: location to store cached data
               (uses the default user cache directory if not provided)
    """
    # handle cache dir
    cdir: Optional[Path] = None
    if cache_dir is not None:
        cdir = normalize_path(cache_dir)
    elif "URL_METADATA_DIR" in os.environ:
        cdir = Path(os.environ["URL_METADATA_DIR"])
    else:
        cdir = Path(user_data_dir("url_metadata"))

    if cdir.exists() and not cdir.is_dir():
        raise RuntimeError(
            "'cache_dir' '{}' already exists but is not a directory".format(str(cdir))
        )
    if not cdir.exists():
        cdir.mkdir()
    self._base_cache_dir: Path = cdir

    self.cache_dir: Path = self._base_cache_dir / "data"
    if not self.cache_dir.exists():
        self.cache_dir.mkdir()

    self.metadata_cache = MetadataCache(self.cache_dir)

    # setup logging
    self.logger = setup_logger(
        name="url_metadata",
        level=loglevel,
        logfile=self.logpath,
        maxBytes=1e7,
        formatter=formatter(
            "{start}[%(levelname)-7s %(asctime)s %(name)s %(filename)s:%(lineno)d]{end} %(message)s"
        ),
    )

    self.skip_subtitles: bool = skip_subtitles
    self.subtitle_language: str = subtitle_language
    self.sleep_time: int = sleep_time

    ll: Lassie = Lassie()
    # hackery with a requests.Session to save the most recent request object
    ll.client = SaveSession(cb_func=self._save_http_response)
    self.lassie: Lassie = ll

    # default 'last response received' to None
    self._response: Optional[Response] = None

    # initialize site-specific parsers
    self.extractor_classes = EXTRACTORS
    if additional_extractors is not None:
        for ext in additional_extractors:
            if not issubclass(ext, AbstractSite):
                self.logger.warning(f"{ext} is not a subclass of AbstractSite")
            self.extractor_classes.append(ext)

    self.extractors: List[AbstractSite] = [e(umc=self) for e in self.extractor_classes]
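# SaveSession itself is not shown in these snippets; a minimal sketch of a
# requests.Session subclass that reports each response to a callback, which
# is all the "hackery" comment above relies on (a sketch under that
# assumption, not the library's actual code):
import requests

class SaveSession(requests.Session):
    def __init__(self, cb_func):
        super().__init__()
        self.cb_func = cb_func

    def send(self, request, **kwargs):
        # send the request normally, then hand the response to the callback
        response = super().send(request, **kwargs)
        self.cb_func(response)
        return response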
import argparse
import json

import requests
import requests_cache
from lassie import Lassie

parser = argparse.ArgumentParser(
    description='Separates URLs with a 200 status code from those without.')
parser.add_argument('bmfile', help='Bookmarks file in JSON list format.')
args = parser.parse_args()

not_ok = []
bookmarks = []

requests_cache.configure('../cache/requests')

user_agent = ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) '
              'Ubuntu Chromium/43.0.2357.130 Chrome/43.0.2357.130 Safari/537.36')
headers = {'User-Agent': user_agent}

l = Lassie()
l.request_opts = {'headers': headers}

webclient = requests.Session()
webclient.headers.update(headers)

with open(args.bmfile, 'r') as f:
    data = json.load(f)

for i, b in enumerate(data['bookmarks']):
    url = b['url']
    if not url or not url.startswith(('http', 'https')):
        continue
    print('#{}: {}'.format(i, url))
    try:
def test_prepare_request(self):
    url = 'http://lassie.it/core/bad_keywords.html'
    l = Lassie()
    l._prepare_request('HEAD', url=url, headers=l.client.headers)
import lassie
from pprint import pprint

sample = lassie.fetch('https://www.youtube.com/watch?v=R6IT_f0XPT8')
print(sample)
pprint(sample)
print("*" * 100)

sample = lassie.fetch('https://www.youtube.com/watch?v=R6IT_f0XPT8', all_images=True)
print(sample)
pprint(sample)
print("*" * 100)

from lassie import Lassie

l = Lassie()
sample = l.fetch('https://www.youtube.com/watch?v=R6IT_f0XPT8')
print(sample)
pprint(sample)
print("*" * 100)

l.request_opts = {
    'headers': {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) '
                      'Version/12.1.1 Safari/605.1.15 '
    }
}
def test_core_bad_keywords(self):
    url = 'http://lassie.it/core/bad_keywords.html'
    l = Lassie()
    data = l.fetch(url)
    self.assertEqual(data.get('keywords'), [])
def test_core_no_content_raises_error(self):
    url = 'http://lassie.it/core/empty.html'
    l = Lassie()
    self.assertRaises(LassieError, l.fetch, url)
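# The empty-content case above raises LassieError; outside of tests, callers
# typically guard fetches like this (the URL below is hypothetical):
from lassie import Lassie, LassieError

l = Lassie()
try:
    data = l.fetch('http://example.com/empty.html')
except LassieError:
    data = None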
def test_youtube_bad_html(self):
    url = 'http://lassie.it/youtube/bad_html.json'
    l = Lassie()
    data = l.fetch(url)
def __init__(
    self,
    *,
    cache_dir: Optional[Union[str, Path]] = None,
    loglevel: int = DEFAULT_LOGLEVEL,
    sleep_time: int = DEFAULT_SLEEP_TIME,
    additional_extractors: Optional[List[Any]] = None,
    file_parsers: Optional[List[FileParser]] = None,
    options: Optional[Options] = None,
) -> None:
    """
    Main interface to the library

    sleep_time: time to wait between HTTP requests
    cache_dir: location to store cached data
               (uses the default user cache directory if not provided)
    """
    # handle cache dir
    cdir: Optional[Path] = None
    if cache_dir is not None:
        cdir = normalize_path(cache_dir)
    elif "URL_CACHE_DIR" in os.environ:
        cdir = Path(os.environ["URL_CACHE_DIR"])
    else:
        cdir = Path(user_data_dir("url_cache"))

    if cdir.exists() and not cdir.is_dir():
        raise RuntimeError(
            "'cache_dir' '{}' already exists but is not a directory".format(str(cdir))
        )
    if not cdir.exists():
        cdir.mkdir()
    self._base_cache_dir: Path = cdir

    self.cache_dir: Path = self._base_cache_dir / "data"
    if not self.cache_dir.exists():
        self.cache_dir.mkdir()

    # setup logging
    self.logger: logging.Logger = setup_logger(
        name="url_cache",
        level=loglevel,
        logfile=self.logpath,
        maxBytes=1e7,
        formatter=formatter(
            "{start}[%(levelname)-7s %(asctime)s %(name)s %(filename)s:%(lineno)d]{end} %(message)s"
        ),
    )

    self.sleep_time = sleep_time
    self.options: Options = {} if options is None else options
    self._set_option_defaults()

    ll: Lassie = Lassie()
    # hackery with a requests.Session to save the most recent request object
    ll.client = SaveSession(cb_func=self._save_http_response)
    self.lassie: Lassie = ll

    # default 'last response received' to None
    self._response: Optional[Response] = None

    # initialize site-specific parsers
    self.extractor_classes = EXTRACTORS
    if additional_extractors is not None:
        for ext in additional_extractors:
            if not issubclass(ext, AbstractSite):
                self.logger.warning(f"{ext} is not a subclass of AbstractSite")
            self.extractor_classes.append(ext)

    self.extractors: List[AbstractSite] = [e(uc=self) for e in self.extractor_classes]

    # loop through each extractor's file_parsers function
    # to append custom file parsers to the summary cache
    all_file_parsers = [] if file_parsers is None else file_parsers
    for ext in self.extractors:
        all_file_parsers.extend(ext.file_parsers())
    self.summary_cache = SummaryDirCache(self.cache_dir, file_parsers=all_file_parsers)
def test_youtube_no_type(self):
    url = 'http://lassie.it/youtube/no_type.json'
    l = Lassie()
    data = l.fetch(url)
def lassie():
    l = Lassie()
    l.request_opts = {'timeout': 3}
    return l
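# If the helper above is registered as a pytest fixture (the decorator is not
# shown in the snippet), a test can consume it by parameter name; the
# assertion below only relies on fetch() returning a dict with a 'url' key,
# as the tests earlier in this section do:
def test_fetch_returns_url(lassie):
    data = lassie.fetch('http://lassie.it/core/bad_keywords.html')
    assert 'url' in data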