コード例 #1
0
class InstagramQueryHashFinder(object):
    """Scrapes GraphQL ``query_hash`` values out of instagram's bundled JS.

    Instagram embeds the hashes as ``queryId:"..."`` inside its javascript
    bundles; this helper locates the bundle for a given container type on
    the instagram root page and collects every hash it can find.
    """

    # maps a container type to the regex that locates its JS bundle path
    __CONTAINER_PATH_RE = {
        'tag': r'/static/bundles/metro/TagPageContainer\.js/.+?\.js',
        'profile': r'/static/bundles/metro/Consumer\.js/.+?\.js',
    }

    # pattern of a single query hash inside the JS source
    __QUERY_HASH_RE = r'queryId:"(.+?)"'

    def __init__(self, container_type: _InstagramQueryHashFinder_ContainerType) -> None:  # pragma: no cover
        finder_cls = self.__class__
        self._container_re = re_compile(finder_cls.__CONTAINER_PATH_RE[container_type])
        self._query_hash_re = re_compile(finder_cls.__QUERY_HASH_RE)
        self._remote_fetcher = RemoteFetcher()

    def find_hashes(self) -> Set[str]:
        """Return every query hash found for the configured container type."""
        return self._get_from_container(self._container_re)

    def _get_from_container(self, re_container_path: Pattern[str]) -> Set[str]:
        """Locate the JS bundle on the instagram root page and scrape it."""
        page_doc, page_uri = self._remote_fetcher.get_string(INSTAGRAM_URL_ROOT)
        bundle_match = re_container_path.search(page_doc)
        if bundle_match is None:
            raise InstagramError('container not found')
        return self._get_from_remote_js(urljoin(page_uri, bundle_match.group(0)))

    def _get_from_remote_js(self, js_uri: str) -> Set[str]:
        """Fetch a remote JS file; an unreachable file simply yields no hashes."""
        try:
            js_src, _ = self._remote_fetcher.get_string(js_uri)
        except Exception:
            return set()
        return self._get_from_js(js_src)

    def _get_from_js(self, js: str) -> Set[str]:
        """Extract all (deduplicated) query hashes from a JS source string."""
        return {found for found in self._query_hash_re.findall(js)}
コード例 #2
0
ファイル: instagram.py プロジェクト: arboss/nichtparasoup
 def __init__(self, container_type: _InstagramQueryHashFinder_ContainerType) -> None:  # pragma: no cover
     """Compile the lookup patterns for *container_type* and prepare a fetcher."""
     owner = self.__class__
     self._container_re = re_compile(owner.__CONTAINER_PATH_RE[container_type])
     self._query_hash_re = re_compile(owner.__QUERY_HASH_RE)
     self._remote_fetcher = RemoteFetcher()
コード例 #3
0
ファイル: reddit.py プロジェクト: weisk/nichtparasoup
 def __init__(self, **config: Any) -> None:  # pragma: no cover
     """Set up the listing URI, the paging cursor and the fetch helpers."""
     super().__init__(**config)
     subreddit = url_quote(self._config['subreddit'])
     self._uri_base = 'https://www.reddit.com/r/{}.json?after='.format(subreddit)
     self._after = None  # type: Optional[str]
     self._image_recognizer = ImageRecognizer()
     self._remote_fetcher = RemoteFetcher()
コード例 #4
0
ファイル: reddit.py プロジェクト: weisk/nichtparasoup
class Reddit(BaseImageCrawler):
    """Image crawler for an arbitrary SubReddit of https://www.reddit.com.

    Pages through the subreddit's JSON listing via the ``after`` cursor and
    collects every child entry whose URL looks like an image.
    """

    def __init__(self, **config: Any) -> None:  # pragma: no cover
        super().__init__(**config)
        self._uri_base = 'https://www.reddit.com/r/{}.json?after='.format(
            url_quote(self._config['subreddit']))
        # reddit paging cursor; None means "start at the front"
        self._after = None  # type: Optional[str]
        self._remote_fetcher = RemoteFetcher()
        self._image_recognizer = ImageRecognizer()

    @classmethod
    def info(cls) -> ImageCrawlerInfo:
        """Describe this crawler for the UI/config layer."""
        return ImageCrawlerInfo(
            description='A Crawler for an arbitrary SubReddit of https://www.reddit.com',
            config=dict(subreddit='the SubReddit to crawl'),
            icon_url='https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-120x120.png',
        )

    @classmethod
    def check_config(cls, config: Dict[Any, Any]) -> ImageCrawlerConfig:
        """Validate *config*.

        :raises KeyError: when 'subreddit' is missing.
        :raises TypeError: when 'subreddit' is not a str.
        :raises ValueError: when 'subreddit' is empty.
        """
        subreddit = config["subreddit"]
        # isinstance instead of exact type() comparison - accepts str subclasses
        if not isinstance(subreddit, str):
            raise TypeError("subreddit {!r} is not str".format(subreddit))
        if not subreddit:
            raise ValueError("subreddit {!r} is empty".format(subreddit))
        return ImageCrawlerConfig(subreddit=subreddit)

    def _reset(self) -> None:
        # restart paging from the front of the listing
        self._after = None

    def _crawl(self) -> ImageCollection:
        """Fetch one listing page and advance the ``after`` cursor."""
        images = ImageCollection()
        listing_string, uri = self._remote_fetcher.get_string(
            self._get_uri(self._after))
        listing = json_loads(listing_string)
        del listing_string  # free up some ram
        for child in listing['data']['children']:
            image = self._get_image(child['data'])
            if image:
                images.add(
                    Image(
                        uri=image,
                        source=urljoin(uri, child['data']['permalink']),
                    ))
        # don't care if `after` is `None` after the crawl ... why not restarting at front when the end is reached?!
        self._after = listing['data']['after']
        return images

    def _get_uri(self, after: Optional[str]) -> str:
        """Build the listing URI, URL-quoting the optional paging cursor."""
        return self._uri_base + (url_quote(after) if after else '')

    def _get_image(self, data: Dict[str, Any]) -> Optional[str]:
        """Return the child's URL if it points at an image, else None."""
        uri = data.get('url')  # type: Optional[str]
        if uri and self._image_recognizer.path_is_image(uri):
            return uri
        return None
コード例 #5
0
 def __init__(self, **config: Any) -> None:  # pragma: no cover
     """Prepare paging state and the remote fetcher."""
     super().__init__(**config)
     self._cursor = None  # type: Optional[str]
     self._amount = 10
     self._remote_fetcher = RemoteFetcher()
コード例 #6
0
class BaseInstagramCrawler(BaseImageCrawler, ABC):

    def __init__(self, **config: Any) -> None:  # pragma: no cover
        super().__init__(**config)
        self._amount = 10
        self._cursor = None  # type: Optional[str]
        self._remote_fetcher = RemoteFetcher()

    def _reset(self) -> None:
        self._cursor = None

    def _crawl(self) -> ImageCollection:
        images = ImageCollection()
        query_uri = self._get_query_uri(
            self._get_query_hash(), self._amount, self._cursor, **self._get_query_variables())
        response = self._query(query_uri)
        for edge in response['edges']:
            images.update(self.__class__._get_images_from_media_edge_node(edge['node']))
            del edge
        page_info = response['page_info']  # type: Dict[str, Any]
        # don't care if this was the last page ... why not restarting at front when the end is reached?!
        self._cursor = page_info['end_cursor'] if page_info['has_next_page'] else None
        return images

    @classmethod
    def _get_images_from_media_edge_node(cls, node: Dict[str, Any]) -> ImageCollection:
        images = ImageCollection()
        if not node['is_video']:
            source = cls._get_post_url(node['shortcode'])
            images.add(Image(
                uri=node['display_url'],
                source=source,
                dimensions=node.get('dimensions'),
            ))
            for side_edge in node.get('edge_sidecar_to_children', dict(edges=[]))['edges']:
                if not side_edge['node']['is_video']:
                    images.add(Image(
                        uri=side_edge['node']['display_url'],
                        source=source,
                        dimensions=side_edge['node'].get('dimensions'),
                    ))
                del side_edge
        return images

    __URL_POST_TEMPLATE = INSTAGRAM_URL_ROOT + 'p/{}/'

    @classmethod
    def _get_post_url(cls, shortcode: str) -> str:
        return cls.__URL_POST_TEMPLATE.format(url_quote(shortcode))

    def _query(self, uri: str) -> Dict[str, Any]:
        response_string, uri = self._remote_fetcher.get_string(uri)
        # responses may be small in size but are memory hungry when parsing!
        response = json_loads(response_string)
        del response_string
        if response['status'] != 'ok':
            raise InstagramError('response not ok')
        try:
            return self._get_media_from_query_response(response)
        except KeyError as e:
            raise InstagramError('no media') from e

    @classmethod
    @abstractmethod
    def _get_media_from_query_response(cls, response: Dict[str, Any]) -> Dict[str, Any]:  # pragma: no cover
        """Get the path for media edges in query response

        example implementation:
            return response['data']['<aTYPE>']['edge_<bTYPE>_media']
        """
        raise NotImplementedError()

    __URL_QUERY = INSTAGRAM_URL_ROOT + 'graphql/query/'

    def _get_query_uri(self, query_hash: str, first: int, after: Optional[str], **variables: Any) -> str:
        return self.__class__.__URL_QUERY + '?' + urlencode(dict(
            query_hash=query_hash,
            variables=json_encode(dict(
                first=first,
                after=(after or ""),
                **variables
            ))
        ))

    @abstractmethod
    def _get_queryhashfinder(self) -> InstagramQueryHashFinder:  # pragma: no cover
        """
        get the query hash finder for this type of crawler

        example implementation:
            return InstagramQueryHashFinder('<type>')
        """
        raise NotImplementedError()

    def _check_query_hash(self, query_hash: str) -> bool:
        uri = self._get_query_uri(query_hash, 1, None, **self._get_query_variables())
        try:
            self._query(uri)
            return True
        except Exception:
            return False

    def _find_query_hash(self) -> Optional[str]:
        query_hashes = self._get_queryhashfinder().find_hashes()
        for query_hash in query_hashes:
            if self._check_query_hash(query_hash):
                return query_hash
        return None

    _QUERY_HASH_LOCK = Lock()  # global lock. may be overwritten in subclass
    _query_hash = None  # type: Optional[str]

    def _get_query_hash(self) -> str:
        cls = self.__class__
        # same class = same query_hash ... so lock and search ... others may use the same hash later
        with cls._QUERY_HASH_LOCK:
            if not cls._query_hash:
                query_hash = self._find_query_hash()
                if not query_hash:
                    raise InstagramError('did not find query hash')
                cls._query_hash = query_hash
        return cls._query_hash

    @abstractmethod
    def _get_query_variables(self) -> Dict[str, Any]:  # pragma: no cover
        """
        return the variables that are required for a query url

        example implementation:
            return dict(foo='bar')
        """
        raise NotImplementedError()
コード例 #7
0
 def __init__(self, **config: Any) -> None:  # pragma: no cover
     """Initialize the paging marker and the remote fetcher."""
     super().__init__(**config)
     self._remote_fetcher = RemoteFetcher()
     self._older = None  # type: Optional[int]
コード例 #8
0
class Pr0gramm(BaseImageCrawler):
    """A crawler for https://pr0gramm.com.

    Searches either the promoted("beliebt") or the general("neu") feed,
    optionally filtered by a tag expression, and pages backwards through the
    results via the API's ``older`` marker.
    """

    def __init__(self, **config: Any) -> None:  # pragma: no cover
        super().__init__(**config)
        # id (or promoted-id) of the oldest item seen; None = start at front
        self._older = None  # type: Optional[int]
        self._remote_fetcher = RemoteFetcher()

    @classmethod
    def info(cls) -> ImageCrawlerInfo:
        """Describe this crawler for the UI/config layer."""
        return ImageCrawlerInfo(
            description='A Crawler for https://pr0gramm.com',
            config=dict(
                promoted='Boolean. Search only top("beliebt") voted content? Otherwise search anything("neu").',
                tags='Filter. None, or a string that starts with "!" - see https://pr0gramm.com/new/2782197',
            ),
            icon_url='https://pr0gramm.com/media/pr0gramm-favicon.png',
        )

    @staticmethod
    def __check_config_tags(tags: Optional[str]) -> Optional[str]:
        """Validate the optional 'tags' filter.

        :raises TypeError: when *tags* is neither None nor a str.
        :raises ValueError: when *tags* does not start with "!" or is empty.
        """
        if tags is None:
            return None
        # isinstance instead of exact type() comparison - accepts str subclasses
        if not isinstance(tags, str):
            raise TypeError('tags {!r} is not str or None'.format(tags))
        tags = tags.strip()
        if not tags.startswith('!'):
            raise ValueError('tags {!r} must start with "!"'.format(tags))
        if len(tags) < 2:  # just the "!" prefix, no actual filter expression
            raise ValueError('tags {!r} is empty'.format(tags))
        return tags

    @classmethod
    def check_config(cls, config: Dict[Any, Any]) -> ImageCrawlerConfig:
        """Validate and normalize the crawler *config*."""
        promoted = config.get('promoted', True)  # type: bool
        if not isinstance(promoted, bool):
            raise TypeError('promoted {!r} is not bool'.format(promoted))
        tags = cls.__check_config_tags(config.get('tags'))
        return ImageCrawlerConfig(
            promoted=promoted,
            tags=tags,
        )

    __API_GET_URL = 'https://pr0gramm.com/api/items/get'

    @classmethod
    def _get_api_uri(cls,
                     *,
                     flags: int,
                     promoted: bool,
                     tags: Optional[str] = None,
                     older: Optional[int] = None) -> str:
        """Build the API search URI.

        :param flags: BitSet. sfw=1, nsfw=2, nsfl=4
        :param promoted: Search top("beliebt") only? - Otherwise search all("neu").
        :param tags: None, or a string that starts with "!" - see https://pr0gramm.com/new/2782197
        :param older: page through the search
        """
        # wrap the user's tag expression and always exclude videos
        params = dict(flags=str(flags),
                      promoted=('1' if promoted else '0'),
                      tags='!{} -"video"'.format(
                          '({})'.format(tags.lstrip('!')) if tags else ''))
        if older:
            params['older'] = str(older)
        return cls.__API_GET_URL + '?' + urlencode(params)

    def _reset(self) -> None:
        # restart paging from the newest items
        self._older = None

    __IMG_BASE_URL = 'https://img.pr0gramm.com/'
    __POST_BASE_URL = 'https://pr0gramm.com/new/'

    def _crawl(self) -> ImageCollection:
        """Fetch one API page of images and advance the paging marker."""
        images = ImageCollection()
        promoted = self._config['promoted']
        api_uri = self._get_api_uri(flags=1,
                                    promoted=promoted,
                                    tags=self._config.get('tags', None),
                                    older=self._older)
        response_raw, api_uri = self._remote_fetcher.get_string(api_uri)
        response = json_loads(response_raw)
        for item in response['items']:
            images.add(
                Image(
                    uri=urljoin(self.__IMG_BASE_URL, str(item['image'])),
                    source=urljoin(self.__POST_BASE_URL, str(item['id'])),
                    width=item.get('width'),
                    height=item.get('height'),
                ))
        if response['atEnd']:
            # end of the feed reached: restart at the front on the next crawl
            self.reset()
        else:
            # page on from the oldest item of this batch
            self._older = response['items'][-1]['promoted' if promoted else 'id'] or None
        return images