Example #1
0
    def test_list_image_empty(self):
        """An AMP fixture with an empty image list still yields one parsed image."""
        lassie = Lassie()
        data = lassie.fetch('http://lassie.it/amp/list_image_empty.html')

        self.assertEqual(1, len(data['images']))
Example #2
0
    def test_str_image(self):
        """An AMP image given as a plain string parses to a single image."""
        lassie = Lassie()
        data = lassie.fetch('http://lassie.it/amp/str_image.html')

        self.assertEqual(1, len(data['images']))
Example #3
0
    def test_list_thumbnail_image(self):
        """The AMP thumbnail-list fixture should produce two parsed images."""
        lassie = Lassie()
        data = lassie.fetch('http://lassie.it/amp/list_thumbnail_image.html')

        self.assertEqual(2, len(data['images']))
Example #4
0
    def test_list_json(self):
        """A description is extracted from AMP JSON supplied as a list."""
        lassie = Lassie()
        data = lassie.fetch('http://lassie.it/amp/list_json.html')

        self.assertTrue('Pixar' in data['description'])
Example #5
0
    def test_no_html_tag(self):
        """A document lacking an <html> tag still gets its title extracted."""
        lassie = Lassie()
        data = lassie.fetch('http://lassie.it/core/no_html_tag.html')

        self.assertTrue('no_html_tag' in data['title'])
Example #6
0
    def test_bad_url(self):
        """A bad oembed URL leaves no 'oembed' key in the parsed result."""
        lassie = Lassie()
        data = lassie.fetch('http://lassie.it/youtube/bad_url_123456.json')

        self.assertIsNone(data.get('oembed'))
Example #7
0
    def test_no_html_tag(self):
        """Title extraction works even when the page has no <html> tag."""
        lassie = Lassie()
        data = lassie.fetch('http://lassie.it/core/no_html_tag.html')

        self.assertTrue('no_html_tag' in data['title'])
Example #8
0
    def test_bad_json(self):
        """Malformed AMP JSON must not break fetch(); the URL is still set."""
        lassie = Lassie()
        data = lassie.fetch('http://lassie.it/amp/bad_json.html')

        self.assertTrue('amp' in data['url'])
Example #9
0
    def test_video_objects(self):
        """The AMP video-objects fixture should yield exactly one video."""
        lassie = Lassie()
        data = lassie.fetch('http://lassie.it/amp/video_objects.html')

        self.assertEqual(1, len(data['videos']))
Example #10
0
    def test_merge_request_kwargs(self):
        """Values set via request_opts flow through merge_request_kwargs()."""
        lassie = Lassie()
        lassie.request_opts = {'timeout': 3}

        merged = lassie.merge_request_kwargs()
        self.assertTrue('timeout' in merged)
Example #11
0
    def test_request_opts_default_user_agent(self):
        """Setting request_opts without headers leaves the default User-Agent."""
        lassie = Lassie()
        lassie.request_opts = {'timeout': 3}

        # headers should default to {} and then a User-Agent gets injected
        self.assertTrue(lassie.client.headers['User-Agent'] == FAKE_USER_AGENT)
Example #12
0
    def test_merge_request_kwargs(self):
        """merge_request_kwargs() must carry over keys from request_opts."""
        lassie = Lassie()
        lassie.request_opts = {'timeout': 3}

        merged = lassie.merge_request_kwargs()
        self.assertTrue('timeout' in merged)
Example #13
0
    def test_youtube_good(self):
        """A good YouTube oembed payload yields one video and one image."""
        lassie = Lassie()
        data = lassie.fetch('http://lassie.it/youtube/good.json')

        self.assertEqual(len(data['videos']), 1)
        self.assertEqual(len(data['images']), 1)
Example #14
0
    def test_request_opts_no_headers(self):
        """An explicit empty headers dict still ends up with headers populated."""
        lassie = Lassie()
        lassie.request_opts = {'headers': {}, 'timeout': 3}

        # headers should be reset to {} and then a User-Agent injected
        self.assertTrue(lassie.client.headers != {})
Example #15
0
    def test_core_class_setting_is_none(self):
        """Setting the open_graph attr to None must not break fetch()."""
        # Odd use-case: the attribute is None on the instance AND disabled
        # via the fetch() keyword — but someone might do it, so cover it.
        lassie = Lassie()
        lassie.open_graph = None
        data = lassie.fetch('http://lassie.it/core/class_setting_is_none.html',
                            open_graph=False)

        self.assertEqual(len(data['images']), 0)
Example #16
0
    def test_core_class_setting_is_none(self):
        """fetch() tolerates an instance attr of None plus a keyword override."""
        # Unusual configuration, but it can happen — keep it covered.
        lassie = Lassie()
        lassie.open_graph = None
        data = lassie.fetch("http://lassie.it/core/class_setting_is_none.html",
                            open_graph=False)

        self.assertEqual(len(data["images"]), 0)
Example #17
0
    def test_all_properites(self):
        """All-properties AMP fixture: three images plus the expected title."""
        # NOTE(review): the method name has a typo ("properites"); kept so the
        # test id stays stable — rename during a suite-wide cleanup.
        lassie = Lassie()
        data = lassie.fetch('http://lassie.it/amp/all_properties.html',
                            all_images=True)

        self.assertEqual(len(data['images']), 3)
        self.assertEqual(
            data['title'],
            'Google Glass Is Dead, Long Live Snapchat Spectacles')
Example #18
0
    def test_bad_image_dimensions(self):
        """Unparseable width/height values are omitted from the image dict."""
        lassie = Lassie()
        data = lassie.fetch('http://lassie.it/core/bad_image_dimensions.html',
                            all_images=True)

        # lassie.utils.convert_to_int catches TypeError/ValueError and passes,
        # so neither dimension key gets set on the image.
        image = data['images'][0]
        self.assertTrue('width' not in image)
        self.assertTrue('height' not in image)
Example #19
0
    def test_bad_image_dimensions(self):
        """Images with bogus dimension markup carry no width/height keys."""
        lassie = Lassie()
        data = lassie.fetch("http://lassie.it/core/bad_image_dimensions.html",
                            all_images=True)

        # convert_to_int catches TypeError/ValueError and skips setting the
        # dimensions, so the keys must be absent.
        image = data["images"][0]
        self.assertTrue("width" not in image)
        self.assertTrue("height" not in image)
Example #20
0
    def test_bad_request_opts(self):
        """Unknown request_opts keys are dropped while valid ones survive."""
        lassie = Lassie()
        lassie.request_opts = {
            'bad_key': True,
            'headers': {'User-Agent': 'lassie python'},
        }

        self.assertTrue('bad_key' not in lassie.request_opts)
        self.assertTrue('headers' in lassie.request_opts)
Example #21
0
    def test_bad_request_opts(self):
        """Assigning request_opts filters out unrecognized keys."""
        lassie = Lassie()
        lassie.request_opts = {
            'bad_key': True,
            'headers': {'User-Agent': 'lassie python'},
        }

        self.assertTrue('bad_key' not in lassie.request_opts)
        self.assertTrue('headers' in lassie.request_opts)
Example #22
0
    def test_core_retrieve_all_images(self):
        """With all_images enabled, three images come back; check the last."""
        lassie = Lassie()
        lassie.all_images = True

        data = lassie.fetch("http://lassie.it/core/retrieve_all_images.html")
        self.assertEqual(len(data["images"]), 3)

        last_image = data["images"][-1]
        self.assertEqual(last_image["width"], 550)
        self.assertEqual(last_image["height"], 365)
Example #23
0
    def test_core_retrieve_all_images(self):
        """all_images=True on the instance surfaces every image on the page."""
        lassie = Lassie()
        lassie.all_images = True

        data = lassie.fetch('http://lassie.it/core/retrieve_all_images.html')
        self.assertEqual(len(data['images']), 3)

        last_image = data['images'][-1]
        self.assertEqual(last_image['width'], 550)
        self.assertEqual(last_image['height'], 365)
Example #24
0
    def test_core_retrieve_all_images(self):
        """Instance-level all_images flag returns three images; verify #3."""
        lassie = Lassie()
        lassie.all_images = True

        data = lassie.fetch('http://lassie.it/core/retrieve_all_images.html')
        self.assertEqual(len(data['images']), 3)

        third = data['images'][2]
        self.assertEqual(third['width'], 550)
        self.assertEqual(third['height'], 365)
Example #25
0
    def test_core_class_vs_method_settings(self):
        """Disabling open_graph on the instance removes OG-sourced images."""
        url = "http://lassie.it/core/class_vs_method_settings.html"
        lassie = Lassie()

        first = lassie.fetch(url)
        self.assertEqual(len(first["images"]), 1)

        # With open_graph turned off there should be no images this time.
        lassie.open_graph = False
        second = lassie.fetch(url)
        self.assertEqual(len(second["images"]), 0)
Example #26
0
    def test_core_class_vs_method_settings(self):
        """Instance attr open_graph=False suppresses image extraction."""
        url = 'http://lassie.it/core/class_vs_method_settings.html'
        lassie = Lassie()

        with_og = lassie.fetch(url)
        self.assertEqual(len(with_og['images']), 1)

        # Flip the flag off; the second fetch should report no images.
        lassie.open_graph = False
        without_og = lassie.fetch(url)
        self.assertEqual(len(without_og['images']), 0)
Example #27
0
    def test_request_opts(self):
        """request_opts accepts headers/timeout and stays mutable afterwards."""
        lassie = Lassie()
        lassie.request_opts = {
            'headers': {'User-Agent': 'lassie python'},
            'timeout': 3,
        }

        self.assertTrue({'headers', 'timeout'}.issubset(lassie.request_opts))

        # Mutating a nested value must be reflected on the instance.
        lassie.request_opts['headers'].update({'Content-Type': 'application/json'})
        self.assertEqual(len(lassie.request_opts['headers']), 2)
        self.assertTrue({'User-Agent', 'Content-Type'}.issubset(
            lassie.request_opts['headers']))
Example #28
0
    def test_request_opts(self):
        """Assigned request_opts keep their keys and allow later mutation."""
        lassie = Lassie()
        lassie.request_opts = {
            'headers': {'User-Agent': 'lassie python'},
            'timeout': 3,
        }

        self.assertTrue({'headers', 'timeout'}.issubset(lassie.request_opts))

        # Updating the nested headers dict should actually take effect.
        lassie.request_opts['headers'].update({'Content-Type': 'application/json'})
        self.assertEqual(len(lassie.request_opts['headers']), 2)
        self.assertTrue({'User-Agent', 'Content-Type'}.issubset(
            lassie.request_opts['headers']))
Example #29
0
    def test_image_dimensions(self):
        """All four parsed images report 100x100 dimensions."""
        lassie = Lassie()
        data = lassie.fetch("http://lassie.it/core/image_dimensions.html",
                            all_images=True)

        self.assertEqual(len(data["images"]), 4)

        for image in data["images"]:
            self.assertEqual(image["width"], 100)
            self.assertEqual(image["height"], 100)
Example #30
0
    def test_image_dimensions(self):
        """Each of the four images carries width == height == 100."""
        lassie = Lassie()
        data = lassie.fetch('http://lassie.it/core/image_dimensions.html',
                            all_images=True)

        self.assertEqual(len(data['images']), 4)

        for image in data['images']:
            self.assertEqual(image['width'], 100)
            self.assertEqual(image['height'], 100)
Example #31
0
    def test_image_dimensions(self):
        """The dimensions fixture yields four images, each 100x100."""
        lassie = Lassie()
        data = lassie.fetch('http://lassie.it/core/image_dimensions.html',
                            all_images=True)

        images = data['images']
        self.assertEqual(len(images), 4)

        for image in images:
            self.assertEqual(image['width'], 100)
            self.assertEqual(image['height'], 100)
Example #32
0
def get_page_info(url: str, timeout: int = 4) -> Optional[PageInfo]:
    """Return a PageInfo for the page at *url*, or None on any failure.

    :param url: Address of the page to inspect.
    :param timeout: Connection timeout, in seconds.

    """
    if not url:
        return None

    lassie = Lassie()
    lassie.request_opts = {'timeout': timeout}

    try:
        result = lassie.fetch(
            url,
            touch_icon=False,
            favicon=False,
        )

    except LassieError:
        # LassieError wraps requests exceptions, including connection
        # errors, timeouts, etc.
        return None

    if result['status_code'] != 200:
        return None

    return PageInfo(
        title=result.get('title', ''),
        description=result.get('description', ''),
        site_name=result.get('site_name', ''),
        images=result['images'],
    )
Example #33
0
    def __init__(
        self,
        loglevel: int = DEFAULT_LOGLEVEL,
        sleep_time: int = DEFAULT_SLEEP_TIME,
        cache_dir: Optional[Union[str, Path]] = None,
        additional_extractors: Optional[List[Any]] = None,
        subtitle_language: str = DEFAULT_SUBTITLE_LANGUAGE,
        skip_subtitles: bool = False,
    ) -> None:
        """
        Main interface to the library

        subtitle_language: for youtube subtitle requests
        skip_subtitles: don't attempt to download youtube subtitles
        sleep_time: time to wait between HTTP requests
        cache_dir: location to store cached data;
                   uses default user cache directory if not provided
        additional_extractors: extra site-specific extractor classes,
                   appended after the built-in EXTRACTORS
        """

        # Resolve the cache directory with precedence:
        # explicit argument > URL_METADATA_DIR env var > platform user data dir.
        cdir: Optional[Path] = None
        if cache_dir is not None:
            cdir = normalize_path(cache_dir)
        else:
            if "URL_METADATA_DIR" in os.environ:
                cdir = Path(os.environ["URL_METADATA_DIR"])
            else:
                cdir = Path(user_data_dir("url_metadata"))

        # Fail loudly when the path exists but is a regular file; create it
        # otherwise (mkdir is non-recursive — parent must already exist).
        if cdir.exists() and not cdir.is_dir():
            raise RuntimeError(
                "'cache_dir' '{}' already exists but is not a directory".
                format(str(cdir)))
        if not cdir.exists():
            cdir.mkdir()
        self._base_cache_dir: Path = cdir

        # Actual cached payloads live under a "data" subdirectory.
        self.cache_dir: Path = self._base_cache_dir / "data"
        if not self.cache_dir.exists():
            self.cache_dir.mkdir()
        self.metadata_cache = MetadataCache(self.cache_dir)

        # setup logging (rotating file handler capped at ~10 MB)
        self.logger = setup_logger(
            name="url_metadata",
            level=loglevel,
            logfile=self.logpath,
            maxBytes=1e7,
            formatter=formatter(
                "{start}[%(levelname)-7s %(asctime)s %(name)s %(filename)s:%(lineno)d]{end} %(message)s"
            ),
        )

        self.skip_subtitles: bool = skip_subtitles
        self.subtitle_language: str = subtitle_language
        self.sleep_time: int = sleep_time

        ll: Lassie = Lassie()
        # hackery with a requests.Session to save the most recent request object
        ll.client = SaveSession(cb_func=self._save_http_response)
        self.lassie: Lassie = ll

        # default 'last response received' to None
        self._response: Optional[Response] = None

        # initialize site-specific parsers
        # NOTE(review): this aliases the module-level EXTRACTORS list, so the
        # append below mutates it globally — confirm that is intentional.
        self.extractor_classes = EXTRACTORS
        if additional_extractors is not None:
            for ext in additional_extractors:
                if not issubclass(ext, AbstractSite):
                    self.logger.warning(
                        f"{ext} is not a subclass of AbstractSite")
                # NOTE(review): the extractor is appended even when the
                # subclass check above fails (warn-only) — verify.
                self.extractor_classes.append(ext)

        # Instantiate one extractor per class, each bound back to this object.
        self.extractors: List[AbstractSite] = [
            e(umc=self) for e in self.extractor_classes
        ]
Example #34
0
import requests_cache

from lassie import Lassie


# Script setup: parse the bookmarks-file argument, enable an on-disk HTTP
# cache, and prepare both a Lassie instance and a raw requests session that
# share the same browser-like User-Agent.
# NOTE(review): argparse, json and requests are used here but not imported in
# this chunk — presumably imported earlier in the file; verify.
parser = argparse.ArgumentParser(description='Separates URLs with 200 status code from those without.')
parser.add_argument('bmfile', help='Bookmarks file in JSON list format.')
args = parser.parse_args()

# Accumulators: URLs that failed, and bookmarks that passed.
not_ok = []
bookmarks = []

# Cache all HTTP responses on disk so re-runs don't re-fetch everything.
requests_cache.configure('../cache/requests')
user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/43.0.2357.130 Chrome/43.0.2357.130 Safari/537.36'
headers = {'User-Agent': user_agent}
l = Lassie()
l.request_opts = {'headers': headers}
webclient = requests.Session()
webclient.headers.update(headers)


# Load the bookmark list from the JSON file given on the command line.
with open(args.bmfile, 'r') as f:
    data = json.load(f)

for i, b in enumerate(data['bookmarks']):
    url = b['url']
    if not url or not url.startswith(('http', 'https')):
        continue

    print('#{}: {}'.format(i, url))
    try:
Example #35
0
    def test_prepare_request(self):
        """_prepare_request accepts a HEAD request with the session headers."""
        lassie = Lassie()
        lassie._prepare_request(
            'HEAD',
            url='http://lassie.it/core/bad_keywords.html',
            headers=lassie.client.headers,
        )
Example #36
0
# Demo: fetch metadata for a YouTube video via the module-level API.
# NOTE(review): `lassie` (module) and `pprint` are used but not imported in
# this chunk — presumably imported earlier in the file; verify.
sample = lassie.fetch('https://www.youtube.com/watch?v=R6IT_f0XPT8')
print(sample)
pprint(sample)

print("*" * 100)

# Same fetch, but ask for every image on the page rather than the primary one.
sample = lassie.fetch('https://www.youtube.com/watch?v=R6IT_f0XPT8',
                      all_images=True)
print(sample)
pprint(sample)

print("*" * 100)

from lassie import Lassie

# Same fetch again through an explicit Lassie instance.
l = Lassie()
sample = l.fetch('https://www.youtube.com/watch?v=R6IT_f0XPT8')

print(sample)
pprint(sample)

print("*" * 100)

# Configure a browser-like User-Agent for subsequent requests made by `l`.
l.request_opts = {
    'headers': {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_5) AppleWebKit/605.1.15 (KHTML, like Gecko) '
        'Version/12.1.1 Safari/605.1.15 '
    }
}
Example #37
0
    def test_core_bad_keywords(self):
        """A bogus keywords meta tag parses to an empty list."""
        lassie = Lassie()
        data = lassie.fetch('http://lassie.it/core/bad_keywords.html')
        self.assertEqual(data.get('keywords'), [])
Example #38
0
    def test_core_no_content_raises_error(self):
        """Fetching an empty document raises LassieError."""
        lassie = Lassie()
        self.assertRaises(LassieError, lassie.fetch,
                          'http://lassie.it/core/empty.html')
Example #39
0
    def test_youtube_bad_html(self):
        """Bad HTML in a YouTube payload must not raise during fetch()."""
        # NOTE(review): no assertions — this test only verifies that fetch()
        # completes without an exception.
        lassie = Lassie()
        data = lassie.fetch('http://lassie.it/youtube/bad_html.json')
Example #40
0
    def __init__(
        self,
        *,
        cache_dir: Optional[Union[str, Path]] = None,
        loglevel: int = DEFAULT_LOGLEVEL,
        sleep_time: int = DEFAULT_SLEEP_TIME,
        additional_extractors: Optional[List[Any]] = None,
        file_parsers: Optional[List[FileParser]] = None,
        options: Optional[Options] = None,
    ) -> None:
        """
        Main interface to the library

        sleep_time: time to wait between HTTP requests
        cache_dir: location to store cached data;
                   uses default user cache directory if not provided
        additional_extractors: extra site-specific extractor classes,
                   appended after the built-in EXTRACTORS
        file_parsers: extra parsers merged with each extractor's own
                   file_parsers() and handed to the summary cache
        options: option overrides; missing keys get defaults via
                   _set_option_defaults()
        """

        # Resolve the cache directory with precedence:
        # explicit argument > URL_CACHE_DIR env var > platform user data dir.
        cdir: Optional[Path] = None
        if cache_dir is not None:
            cdir = normalize_path(cache_dir)
        else:
            if "URL_CACHE_DIR" in os.environ:
                cdir = Path(os.environ["URL_CACHE_DIR"])
            else:
                cdir = Path(user_data_dir("url_cache"))

        # Fail loudly when the path exists but is a regular file; create it
        # otherwise (mkdir is non-recursive — parent must already exist).
        if cdir.exists() and not cdir.is_dir():
            raise RuntimeError(
                "'cache_dir' '{}' already exists but is not a directory".
                format(str(cdir)))
        if not cdir.exists():
            cdir.mkdir()
        self._base_cache_dir: Path = cdir

        # Actual cached payloads live under a "data" subdirectory.
        self.cache_dir: Path = self._base_cache_dir / "data"
        if not self.cache_dir.exists():
            self.cache_dir.mkdir()

        # setup logging (rotating file handler capped at ~10 MB)
        self.logger: logging.Logger = setup_logger(
            name="url_cache",
            level=loglevel,
            logfile=self.logpath,
            maxBytes=1e7,
            formatter=formatter(
                "{start}[%(levelname)-7s %(asctime)s %(name)s %(filename)s:%(lineno)d]{end} %(message)s"
            ),
        )

        self.sleep_time = sleep_time

        # Options dict; defaults are filled in for any missing keys.
        self.options: Options = {} if options is None else options
        self._set_option_defaults()

        ll: Lassie = Lassie()
        # hackery with a requests.Session to save the most recent request object
        ll.client = SaveSession(cb_func=self._save_http_response)
        self.lassie: Lassie = ll

        # default 'last response received' to None
        self._response: Optional[Response] = None

        # initialize site-specific parsers
        # NOTE(review): this aliases the module-level EXTRACTORS list, so the
        # append below mutates it globally — confirm that is intentional.
        self.extractor_classes = EXTRACTORS
        if additional_extractors is not None:
            for ext in additional_extractors:
                if not issubclass(ext, AbstractSite):
                    self.logger.warning(
                        f"{ext} is not a subclass of AbstractSite")
                # NOTE(review): the extractor is appended even when the
                # subclass check above fails (warn-only) — verify.
                self.extractor_classes.append(ext)

        # Instantiate one extractor per class, each bound back to this object.
        self.extractors: List[AbstractSite] = [
            e(uc=self) for e in self.extractor_classes
        ]

        # loop through each extractors file_parsers function
        # to append custom file parsers to the summary cache
        all_file_parsers = [] if file_parsers is None else file_parsers
        for ext in self.extractors:
            all_file_parsers.extend(ext.file_parsers())

        self.summary_cache = SummaryDirCache(self.cache_dir,
                                             file_parsers=all_file_parsers)
Example #41
0
    def test_youtube_no_type(self):
        """A YouTube payload missing a type must not raise during fetch()."""
        # NOTE(review): no assertions — this test only verifies that fetch()
        # completes without an exception.
        lassie = Lassie()
        data = lassie.fetch('http://lassie.it/youtube/no_type.json')
Example #42
0
    def test_prepare_request(self):
        """A HEAD request can be prepared with the session's own headers."""
        lassie = Lassie()
        lassie._prepare_request(
            'HEAD',
            url='http://lassie.it/core/bad_keywords.html',
            headers=lassie.client.headers,
        )
Example #43
0
def lassie():
    """Return a Lassie instance configured with a short request timeout."""
    client = Lassie()
    client.request_opts = {'timeout': 3}
    return client
Example #44
0
    def test_core_bad_keywords(self):
        """Malformed keywords markup yields an empty keywords list."""
        lassie = Lassie()
        data = lassie.fetch('http://lassie.it/core/bad_keywords.html')
        self.assertEqual(data.get('keywords'), [])
Example #45
0
    def test_request_opts_no_headers(self):
        """Passing an empty headers dict still leaves headers non-empty."""
        lassie = Lassie()
        lassie.request_opts = {'headers': {}, 'timeout': 3}

        # headers are reset to {} and then a User-Agent is injected
        self.assertTrue(lassie.client.headers != {})
Example #46
0
    def test_request_opts_default_user_agent(self):
        """Omitting headers from request_opts keeps the default User-Agent."""
        lassie = Lassie()
        lassie.request_opts = {'timeout': 3}

        # headers default to {} and then the fake User-Agent is injected
        self.assertTrue(lassie.client.headers['User-Agent'] == FAKE_USER_AGENT)