class InstagramQueryHashFinder(object):

    __CONTAINER_PATH_RE = {
        'tag': r'/static/bundles/metro/TagPageContainer\.js/.+?\.js',
        'profile': r'/static/bundles/metro/Consumer\.js/.+?\.js',
    }

    __QUERY_HASH_RE = r'queryId:"(.+?)"'

    def __init__(self, container_type: _InstagramQueryHashFinder_ContainerType) -> None:  # pragma: no cover
        self._container_re = re_compile(self.__class__.__CONTAINER_PATH_RE[container_type])
        self._query_hash_re = re_compile(self.__class__.__QUERY_HASH_RE)
        self._remote_fetcher = RemoteFetcher()

    def find_hashes(self) -> Set[str]:
        return self._get_from_container(self._container_re)

    def _get_from_container(self, re_container_path: Pattern[str]) -> Set[str]:
        page_doc, page_uri = self._remote_fetcher.get_string(INSTAGRAM_URL_ROOT)
        container_paths = re_container_path.search(page_doc)
        if container_paths:
            container_path = urljoin(page_uri, container_paths.group(0))
            return self._get_from_remote_js(container_path)
        raise InstagramError('container not found')

    def _get_from_remote_js(self, js_uri: str) -> Set[str]:
        try:
            js_src, _ = self._remote_fetcher.get_string(js_uri)
        except Exception:
            return set()
        return self._get_from_js(js_src)

    def _get_from_js(self, js: str) -> Set[str]:
        return set(self._query_hash_re.findall(js))
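
# Usage sketch (hypothetical helper, not part of the crawler API): the finder
# downloads the Instagram landing page, resolves the bundled container JS file
# via the regexes above, and collects every `queryId:"..."` literal in it.
def _demo_find_query_hashes() -> None:  # pragma: no cover
    finder = InstagramQueryHashFinder('tag')  # or 'profile'
    for query_hash in finder.find_hashes():
        print(query_hash)  # candidate hashes; validate against the API before use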
class Reddit(BaseImageCrawler):

    def __init__(self, **config: Any) -> None:  # pragma: no cover
        super().__init__(**config)
        self._uri_base = 'https://www.reddit.com/r/{}.json?after='.format(
            url_quote(self._config['subreddit']))
        self._after = None  # type: Optional[str]
        self._remote_fetcher = RemoteFetcher()
        self._image_recognizer = ImageRecognizer()

    @classmethod
    def info(cls) -> ImageCrawlerInfo:
        return ImageCrawlerInfo(
            description='A Crawler for an arbitrary SubReddit of https://www.reddit.com',
            config=dict(
                subreddit='the SubReddit to crawl',
            ),
            icon_url='https://www.redditstatic.com/desktop2x/img/favicon/apple-icon-120x120.png',
        )

    @classmethod
    def check_config(cls, config: Dict[Any, Any]) -> ImageCrawlerConfig:
        subreddit = config['subreddit']
        if type(subreddit) is not str:
            raise TypeError('subreddit {!r} is not str'.format(subreddit))
        if 0 == len(subreddit):
            raise ValueError('subreddit {!r} is empty'.format(subreddit))
        return ImageCrawlerConfig(
            subreddit=subreddit,
        )

    def _reset(self) -> None:
        self._after = None

    def _crawl(self) -> ImageCollection:
        images = ImageCollection()
        listing_string, uri = self._remote_fetcher.get_string(self._get_uri(self._after))
        listing = json_loads(listing_string)
        del listing_string  # free up some ram
        for child in listing['data']['children']:
            image = self._get_image(child['data'])
            if image:
                images.add(Image(
                    uri=image,
                    source=urljoin(uri, child['data']['permalink']),
                ))
        # don't care if `after` is `None` after the crawl ... why not restart at the front when the end is reached?!
        self._after = listing['data']['after']
        return images

    def _get_uri(self, after: Optional[str]) -> str:
        return self._uri_base + (url_quote(after) if after else '')

    def _get_image(self, data: Dict[str, Any]) -> Optional[str]:
        uri = data.get('url')  # type: Optional[str]
        return uri if uri and self._image_recognizer.path_is_image(uri) else None
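
# Pagination sketch (hypothetical helper; 'EarthPorn' is an arbitrary example
# subreddit, and `_crawl` is called directly only for illustration): reddit
# listings return a cursor in `data.after`, which `_crawl` stores and appends
# to the next request, so consecutive crawls page forward through the listing.
def _demo_reddit_paging() -> None:  # pragma: no cover
    crawler = Reddit(subreddit='EarthPorn')
    first_batch = crawler._crawl()   # GET .../r/EarthPorn.json?after=
    second_batch = crawler._crawl()  # GET .../r/EarthPorn.json?after=<cursor from first response>
    print(len(first_batch), len(second_batch))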
class BaseInstagramCrawler(BaseImageCrawler, ABC):

    def __init__(self, **config: Any) -> None:  # pragma: no cover
        super().__init__(**config)
        self._amount = 10
        self._cursor = None  # type: Optional[str]
        self._remote_fetcher = RemoteFetcher()

    def _reset(self) -> None:
        self._cursor = None

    def _crawl(self) -> ImageCollection:
        images = ImageCollection()
        query_uri = self._get_query_uri(
            self._get_query_hash(), self._amount, self._cursor,
            **self._get_query_variables())
        response = self._query(query_uri)
        for edge in response['edges']:
            images.update(self.__class__._get_images_from_media_edge_node(edge['node']))
            del edge
        page_info = response['page_info']  # type: Dict[str, Any]
        # don't care if this was the last page ... why not restart at the front when the end is reached?!
        self._cursor = page_info['end_cursor'] if page_info['has_next_page'] else None
        return images

    @classmethod
    def _get_images_from_media_edge_node(cls, node: Dict[str, Any]) -> ImageCollection:
        images = ImageCollection()
        if not node['is_video']:
            source = cls._get_post_url(node['shortcode'])
            images.add(Image(
                uri=node['display_url'],
                source=source,
                dimensions=node.get('dimensions'),
            ))
            for side_edge in node.get('edge_sidecar_to_children', dict(edges=[]))['edges']:
                if not side_edge['node']['is_video']:
                    images.add(Image(
                        uri=side_edge['node']['display_url'],
                        source=source,
                        dimensions=side_edge['node'].get('dimensions'),
                    ))
                del side_edge
        return images

    __URL_POST_TEMPLATE = INSTAGRAM_URL_ROOT + 'p/{}/'

    @classmethod
    def _get_post_url(cls, shortcode: str) -> str:
        return cls.__URL_POST_TEMPLATE.format(url_quote(shortcode))

    def _query(self, uri: str) -> Dict[str, Any]:
        response_string, uri = self._remote_fetcher.get_string(uri)
        # responses may be small in size but are memory hungry when parsing!
        response = json_loads(response_string)
        del response_string
        if response['status'] != 'ok':
            raise InstagramError('response not ok')
        try:
            return self._get_media_from_query_response(response)
        except KeyError as e:
            raise InstagramError('no media') from e

    @classmethod
    @abstractmethod
    def _get_media_from_query_response(cls, response: Dict[str, Any]) -> Dict[str, Any]:  # pragma: no cover
        """Get the path for media edges in the query response.

        example implementation:
            return response['data']['<aTYPE>']['edge_<bTYPE>_media']
        """
        raise NotImplementedError()

    __URL_QUERY = INSTAGRAM_URL_ROOT + 'graphql/query/'

    def _get_query_uri(self, query_hash: str, first: int, after: Optional[str], **variables: Any) -> str:
        return self.__class__.__URL_QUERY + '?' + urlencode(dict(
            query_hash=query_hash,
            variables=json_encode(dict(
                first=first,
                after=(after or ""),
                **variables
            ))
        ))

    @abstractmethod
    def _get_queryhashfinder(self) -> InstagramQueryHashFinder:  # pragma: no cover
        """Get the query hash finder for this type of crawler.

        example implementation:
            return InstagramQueryHashFinder('<type>')
        """
        raise NotImplementedError()

    def _check_query_hash(self, query_hash: str) -> bool:
        uri = self._get_query_uri(query_hash, 1, None, **self._get_query_variables())
        try:
            self._query(uri)
            return True
        except Exception:
            return False

    def _find_query_hash(self) -> Optional[str]:
        query_hashes = self._get_queryhashfinder().find_hashes()
        for query_hash in query_hashes:
            if self._check_query_hash(query_hash):
                return query_hash
        return None

    _QUERY_HASH_LOCK = Lock()  # global lock. may be overwritten in subclass
    _query_hash = None  # type: Optional[str]

    def _get_query_hash(self) -> str:
        cls = self.__class__
        # same class = same query_hash ... so lock and search ... others may use the same hash later
        with cls._QUERY_HASH_LOCK:
            if not cls._query_hash:
                query_hash = self._find_query_hash()
                if not query_hash:
                    raise InstagramError('did not find query hash')
                cls._query_hash = query_hash
        return cls._query_hash

    @abstractmethod
    def _get_query_variables(self) -> Dict[str, Any]:  # pragma: no cover
        """Return the variables that are required for a query URL.

        example implementation:
            return dict(foo='bar')
        """
        raise NotImplementedError()
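
# Concrete-subclass sketch (hypothetical; the class name, the 'tag_name'
# config key, and the response path follow the docstring examples above and
# are not an actual crawler of this project; `info`/`check_config` from
# BaseImageCrawler are omitted for brevity):
class _DemoInstagramTag(BaseInstagramCrawler):  # pragma: no cover

    def _get_queryhashfinder(self) -> InstagramQueryHashFinder:
        return InstagramQueryHashFinder('tag')

    def _get_query_variables(self) -> Dict[str, Any]:
        # merged into the GraphQL query URL next to `first` and `after`
        return dict(tag_name=self._config['tag_name'])

    @classmethod
    def _get_media_from_query_response(cls, response: Dict[str, Any]) -> Dict[str, Any]:
        return response['data']['hashtag']['edge_hashtag_to_media']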
class Pr0gramm(BaseImageCrawler):

    def __init__(self, **config: Any) -> None:  # pragma: no cover
        super().__init__(**config)
        self._older = None  # type: Optional[int]
        self._remote_fetcher = RemoteFetcher()

    @classmethod
    def info(cls) -> ImageCrawlerInfo:
        return ImageCrawlerInfo(
            description='A Crawler for https://pr0gramm.com',
            config=dict(
                promoted='Boolean. Search only top("beliebt") voted content? Otherwise search anything("neu").',
                tags='Filter. None, or a string that starts with "!" - see https://pr0gramm.com/new/2782197',
            ),
            icon_url='https://pr0gramm.com/media/pr0gramm-favicon.png',
        )

    @staticmethod
    def __check_config_tags(tags: Optional[str]) -> Optional[str]:
        if tags is None:
            return None
        if type(tags) is str:
            tags = tags.strip()
            if not tags.startswith('!'):
                raise ValueError('tags {!r} must start with "!"'.format(tags))
            if not len(tags) > 1:
                raise ValueError('tags {!r} is empty'.format(tags))
            return tags
        raise TypeError('tags {!r} is not str or None'.format(tags))

    @classmethod
    def check_config(cls, config: Dict[Any, Any]) -> ImageCrawlerConfig:
        promoted = config['promoted'] if 'promoted' in config else True  # type: bool
        if type(promoted) is not bool:
            raise TypeError('promoted {!r} is not bool'.format(promoted))
        tags = config['tags'] if 'tags' in config else None
        tags = cls.__check_config_tags(tags)
        return ImageCrawlerConfig(
            promoted=promoted,
            tags=tags,
        )

    __API_GET_URL = 'https://pr0gramm.com/api/items/get'

    @classmethod
    def _get_api_uri(cls, *,
                     flags: int, promoted: bool,
                     tags: Optional[str] = None, older: Optional[int] = None) -> str:
        """
        :param flags: BitSet. sfw=1, nsfw=2, nsfl=4
        :param promoted: Search top("beliebt") only? - Otherwise search all("neu").
        :param tags: None, or a string that starts with "!" - see https://pr0gramm.com/new/2782197
        :param older: page through the search results
        """
        params = dict(
            flags=str(flags),
            promoted=('1' if promoted else '0'),
            tags='!{} -"video"'.format('({})'.format(tags.lstrip('!')) if tags else ''))
        if older:
            params['older'] = str(older)
        return cls.__API_GET_URL + '?' + urlencode(params)

    def _reset(self) -> None:
        self._older = None

    __IMG_BASE_URL = 'https://img.pr0gramm.com/'
    __POST_BASE_URL = 'https://pr0gramm.com/new/'

    def _crawl(self) -> ImageCollection:
        images = ImageCollection()
        promoted = self._config['promoted']
        api_uri = self._get_api_uri(
            flags=1, promoted=promoted,
            tags=self._config.get('tags', None),
            older=self._older)
        response_raw, api_uri = self._remote_fetcher.get_string(api_uri)
        response = json_loads(response_raw)
        for item in response['items']:
            images.add(Image(
                uri=urljoin(self.__IMG_BASE_URL, str(item['image'])),
                source=urljoin(self.__POST_BASE_URL, str(item['id'])),
                width=item.get('width'),
                height=item.get('height'),
            ))
        if response['atEnd']:
            self.reset()  # start over at the front when the end is reached
        else:
            self._older = response['items'][-1]['promoted' if promoted else 'id'] or None
        return images
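
# URI sketch (illustrative; 'kadse' is an arbitrary example tag): with flags=1
# the API is queried sfw-only, and videos are excluded via the tag filter:
#
#   Pr0gramm._get_api_uri(flags=1, promoted=True, tags='!kadse')
#   # -> 'https://pr0gramm.com/api/items/get?flags=1&promoted=1&tags=%21%28kadse%29+-%22video%22'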