def requests_get(url, **kwargs):
  """Wraps :func:`requests.get` with extra semantics and our user agent.

  If a server tells us a response will be too big (based on Content-Length), we
  hijack the response and return 599 and an error response body instead. We
  pass stream=True to :func:`requests.get` so that it doesn't fetch the
  response body until we access :attr:`requests.Response.content` (or
  :attr:`requests.Response.text`).

  http://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
  """
  if url in URL_BLACKLIST:
    resp = requests.Response()
    resp.status_code = HTTP_REQUEST_REFUSED_STATUS_CODE
    resp._text = resp._content = 'Sorry, Bridgy has blacklisted this URL.'
    return resp

  kwargs.setdefault('headers', {}).update(request_headers(url=url))

  resp = util.requests_get(url, stream=True, **kwargs)
  length = resp.headers.get('Content-Length', 0)
  if util.is_int(length) and int(length) > MAX_HTTP_RESPONSE_SIZE:
    resp.status_code = HTTP_REQUEST_REFUSED_STATUS_CODE
    resp._text = resp._content = ('Content-Length %s is larger than our limit %s.' %
                                  (length, MAX_HTTP_RESPONSE_SIZE))
  return resp
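# A minimal caller-side sketch, not from the original source: because the
# wrapper above reports blacklisted or oversized responses by synthesizing
# HTTP_REQUEST_REFUSED_STATUS_CODE (599) rather than raising, callers can
# branch on the status code. fetch_page_text is a hypothetical helper name.
def fetch_page_text(url):
  resp = requests_get(url)
  if resp.status_code == HTTP_REQUEST_REFUSED_STATUS_CODE:
    logging.warning('Fetch of %s refused with status %s', url, resp.status_code)
    return None
  resp.raise_for_status()
  return resp.text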
def upload_media(self, media):
  """Uploads one or more images or videos from web URLs.

  https://docs.joinmastodon.org/api/rest/media/

  Args:
    media: sequence of AS image or stream objects, eg:
      [{'url': 'http://picture', 'displayName': 'a thing'}, ...]

  Returns:
    list of string media ids for uploaded files
  """
  uploaded = set()  # URLs uploaded so far; for de-duping
  ids = []

  for obj in media:
    url = util.get_url(obj, key='stream') or util.get_url(obj)
    if not url or url in uploaded:
      continue

    data = {}
    alt = obj.get('displayName')
    if alt:
      data['description'] = util.ellipsize(alt, chars=MAX_ALT_LENGTH)

    # TODO: mime type check?
    with util.requests_get(url, stream=True) as fetch:
      fetch.raise_for_status()
      upload = self._post(API_MEDIA, files={'file': fetch.raw})

    logging.info('Got: %s', upload)
    media_id = upload['id']
    ids.append(media_id)
    uploaded.add(url)

  return ids
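# A hedged usage sketch, not from the original source: `mastodon` stands in
# for an instance of the class that defines upload_media() and _post() above.
# API_STATUSES (the statuses endpoint constant) and the json= keyword on
# _post() are assumptions; the attachment dicts follow the docstring's example.
def post_note_with_images(mastodon, text, images):
  # images, eg [{'url': 'http://picture', 'displayName': 'a thing'}, ...]
  media_ids = mastodon.upload_media(images)
  return mastodon._post(API_STATUSES, json={'status': text, 'media_ids': media_ids})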
def requests_get(url, **kwargs):
  """Wraps requests.get with extra semantics and our user agent.

  If a server tells us a response will be too big (based on Content-Length), we
  hijack the response and return 599 and an error response body instead.

  We pass stream=True to requests.get so that it doesn't fetch the response
  body until we access response.content (or .text).

  http://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
  """
  if url in URL_BLACKLIST:
    resp = requests.Response()
    resp.status_code = HTTP_REQUEST_REFUSED_STATUS_CODE
    resp._text = resp._content = 'Sorry, Bridgy has blacklisted this URL.'
    return resp

  kwargs.setdefault('headers', {}).update(USER_AGENT_HEADER)

  resp = util.requests_get(url, stream=True, **kwargs)
  length = resp.headers.get('Content-Length', 0)
  if util.is_int(length) and int(length) > MAX_HTTP_RESPONSE_SIZE:
    resp.status_code = HTTP_REQUEST_REFUSED_STATUS_CODE
    resp._text = resp._content = (
      'Content-Length %s is larger than our limit %s.' %
      (length, MAX_HTTP_RESPONSE_SIZE))
  return resp
def _fetch(self, url):
  """Fetches url and returns (string final url, unicode body)."""
  try:
    resp = util.requests_get(url, stream=True)
  except (ValueError, requests.URLRequired, requests.TooManyRedirects) as e:
    self.abort(400, str(e))
    # other exceptions are handled by webutil.handlers.handle_exception(),
    # which uses interpret_http_exception(), etc.

  if url != resp.url:
    url = resp.url
    logging.info('Redirected to %s', url)
  body = resp.text

  length = resp.headers.get('Content-Length')
  if util.is_int(length):
    length = int(length)
  if not length:
    length = len(body)
  if length > MAX_HTTP_RESPONSE_SIZE:
    self.abort(HTTP_RESPONSE_TOO_BIG_STATUS_CODE,
               'Content-Length %s for %s is larger than our limit of %s.' %
               (length, url, MAX_HTTP_RESPONSE_SIZE))

  return url, body
def rest(self, url, data=None, parse_json=True, **kwargs):
  """Makes a v3 REST API call.

  Uses HTTP POST if data is provided, otherwise GET.

  Args:
    data: dict, JSON payload for POST requests
    parse_json: boolean, whether to parse the response body as JSON and return
      it as a dict. If False, returns a :class:`requests.Response` instead.

  Returns:
    dict decoded from JSON response if parse_json=True, otherwise
    :class:`requests.Response`
  """
  kwargs['headers'] = kwargs.get('headers') or {}
  kwargs['headers'].update({
    'Authorization': 'token %s' % self.access_token,
    # enable the beta Reactions API
    # https://developer.github.com/v3/reactions/
    'Accept': 'application/vnd.github.squirrel-girl-preview+json',
  })

  if data is None:
    resp = util.requests_get(url, **kwargs)
  else:
    resp = util.requests_post(url, json=data, **kwargs)

  resp.raise_for_status()
  return json_loads(resp.text) if parse_json else resp
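# A hedged usage sketch: `gh` stands in for an instance of the (unnamed here)
# class that defines rest(); the GitHub(access_token=...) constructor call is
# an assumption. The issue URL follows GitHub's documented v3 REST API, and
# parse_json=False returns the raw requests.Response as described above.
gh = GitHub(access_token='...')
issue = gh.rest('https://api.github.com/repos/octocat/Hello-World/issues/1')
print(issue['title'])

resp = gh.rest('https://api.github.com/repos/octocat/Hello-World/issues/1',
               parse_json=False)
print(resp.headers.get('X-RateLimit-Remaining'))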
def _fetch(self, url):
  """Fetches url and returns (string final url, unicode body)."""
  try:
    resp = util.requests_get(url)
  except (ValueError, requests.URLRequired) as e:
    self.abort(400, str(e))
    # other exceptions are handled by webutil.handlers.handle_exception(),
    # which uses interpret_http_exception(), etc.

  if url != resp.url:
    url = resp.url
    logging.info('Redirected to %s', url)
  body = resp.text
  return url, body
def requests_get(url, **kwargs):
  """Wraps :func:`requests.get` with extra semantics and our user agent.

  If a server tells us a response will be too big (based on Content-Length), we
  hijack the response and return 599 and an error response body instead. We
  pass stream=True to :func:`requests.get` so that it doesn't fetch the
  response body until we access :attr:`requests.Response.content` (or
  :attr:`requests.Response.text`).

  http://docs.python-requests.org/en/latest/user/advanced/#body-content-workflow
  """
  if url in URL_BLACKLIST:
    resp = requests.Response()
    resp.status_code = HTTP_REQUEST_REFUSED_STATUS_CODE
    resp._text = resp._content = 'Sorry, Bridgy has blacklisted this URL.'
    return resp

  kwargs.setdefault('headers', {}).update(request_headers(url=url))
  return util.requests_get(url, **kwargs)
def _scrape_json(url, cookie=None):
  """Fetches and returns JSON from www.instagram.com."""
  headers = {}
  if cookie:
    if not cookie.startswith('sessionid='):
      cookie = 'sessionid=' + cookie
    headers = {'Cookie': cookie}

  resp = util.requests_get(url, allow_redirects=False, headers=headers)
  resp.raise_for_status()

  try:
    return resp.json()
  except ValueError as e:
    msg = "Couldn't decode response as JSON:\n%s" % resp.text
    logging.exception(msg)
    resp.status_code = 504
    raise requests.HTTPError('504 Bad response from Instagram\n' + msg, response=resp)
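# A hedged example, not part of the original module: Instagram's web pages
# historically exposed the same embedded JSON via a ?__a=1 query parameter (as
# noted in html_to_activities() below), so a profile can be scraped with the
# helper above. HTML_BASE_URL and the exact URL shape are assumptions here.
def scrape_profile_json(username, cookie=None):
  return _scrape_json(HTML_BASE_URL + '%s/?__a=1' % username, cookie=cookie)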
def rest(self, url, data=None, **kwargs):
  """Makes a v3 REST API call.

  Uses HTTP POST if data is provided, otherwise GET.

  Args:
    data: dict, JSON payload for POST requests

  Returns:
    `requests.Response`
  """
  kwargs['headers'] = kwargs.get('headers') or {}
  kwargs['headers'].update({
    'Authorization': 'token %s' % self.access_token,
    # enable the beta Reactions API
    # https://developer.github.com/v3/reactions/
    'Accept': 'application/vnd.github.squirrel-girl-preview+json',
  })

  if data is None:
    resp = util.requests_get(url, **kwargs)
  else:
    resp = util.requests_post(url, json=data, **kwargs)

  resp.raise_for_status()
  return resp
def fetch(url):
  """Fetches url over HTTP and parses its HTML for microformats2."""
  return mf2py.parse(util.requests_get(url).text, url=url)
def fetch(url):
  """Fetches url over HTTP and parses its HTML for microformats2, keeping img alt text."""
  return mf2py.parse(util.requests_get(url).text, url=url, img_with_alt=True)
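# A short consumption sketch: mf2py.parse() returns a dict with 'items',
# 'rels', and 'rel-urls' keys, so callers typically walk the h-* items and
# look up rel values. The URL below is illustrative only.
parsed = fetch('https://example.com/post')
for item in parsed.get('items', []):
  if 'h-entry' in item.get('type', []):
    print(item['properties'].get('name'))
print(parsed.get('rels', {}).get('webmention'))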
def get(self):
  input = util.get_required_param(self, 'input')
  if input not in INPUTS:
    raise exc.HTTPBadRequest('Invalid input: %s, expected one of %r' %
                             (input, INPUTS))

  orig_url = util.get_required_param(self, 'url')
  fragment = urllib.parse.urlparse(orig_url).fragment
  if fragment and input != 'html':
    raise exc.HTTPBadRequest('URL fragments only supported with input=html.')

  resp = util.requests_get(orig_url, gateway=True)
  final_url = resp.url

  # decode data
  if input in ('activitystreams', 'as1', 'as2', 'mf2-json', 'json-mf2', 'jsonfeed'):
    try:
      body_json = json_loads(resp.text)
      body_items = (body_json if isinstance(body_json, list)
                    else body_json.get('items') or [body_json])
    except (TypeError, ValueError):
      raise exc.HTTPBadRequest('Could not decode %s as JSON' % final_url)

  mf2 = None
  if input == 'html':
    mf2 = util.parse_mf2(resp, id=fragment)
    if fragment and not mf2:
      raise exc.HTTPBadRequest(
        'Got fragment %s but no element found with that id.' % fragment)
  elif input in ('mf2-json', 'json-mf2'):
    mf2 = body_json
    if not hasattr(mf2, 'get'):
      raise exc.HTTPBadRequest(
        'Expected microformats2 JSON input to be dict, got %s' %
        mf2.__class__.__name__)
    mf2.setdefault('rels', {})  # mf2util expects rels

  actor = None
  title = None
  hfeed = None
  if mf2:
    def fetch_mf2_func(url):
      if util.domain_or_parent_in(urllib.parse.urlparse(url).netloc, SILO_DOMAINS):
        return {'items': [{'type': ['h-card'],
                           'properties': {'url': [url]}}]}
      return util.fetch_mf2(url, gateway=True)

    try:
      actor = microformats2.find_author(mf2, fetch_mf2_func=fetch_mf2_func)
      title = microformats2.get_title(mf2)
      hfeed = mf2util.find_first_entry(mf2, ['h-feed'])
    except (KeyError, ValueError) as e:
      raise exc.HTTPBadRequest('Could not parse %s as %s: %s' %
                               (final_url, input, e))

  try:
    if input in ('as1', 'activitystreams'):
      activities = body_items
    elif input == 'as2':
      activities = [as2.to_as1(obj) for obj in body_items]
    elif input == 'atom':
      try:
        activities = atom.atom_to_activities(resp.text)
      except ElementTree.ParseError as e:
        raise exc.HTTPBadRequest('Could not parse %s as XML: %s' % (final_url, e))
      except ValueError as e:
        raise exc.HTTPBadRequest('Could not parse %s as Atom: %s' % (final_url, e))
    elif input == 'html':
      activities = microformats2.html_to_activities(
        resp, url=final_url, id=fragment, actor=actor)
    elif input in ('mf2-json', 'json-mf2'):
      activities = [microformats2.json_to_object(item, actor=actor)
                    for item in mf2.get('items', [])]
    elif input == 'jsonfeed':
      activities, actor = jsonfeed.jsonfeed_to_activities(body_json)
  except ValueError as e:
    logging.warning('parsing input failed', stack_info=True)
    self.abort(400, 'Could not parse %s as %s: %s' % (final_url, input, str(e)))

  self.write_response(source.Source.make_activities_base_response(activities),
                      url=final_url, actor=actor, title=title, hfeed=hfeed)
def _scrape(self, user_id=None, activity_id=None, cookie=None,
            fetch_extras=False, cache=None):
  """Scrapes a user's profile or feed and converts the media to activities.

  Args:
    user_id: string
    activity_id: string
    fetch_extras: boolean
    cookie: string

  Returns:
    dict activities API response
  """
  assert user_id or activity_id or cookie

  url = (HTML_MEDIA % self.id_to_shortcode(activity_id) if activity_id
         else self.user_url(user_id) if user_id
         else self.BASE_URL)
  kwargs = {}
  if cookie:
    kwargs = {'headers': {'Cookie': cookie}}
  resp = util.requests_get(url, allow_redirects=False, **kwargs)
  if ((cookie and 'not-logged-in' in resp.text) or
      (resp.status_code in (301, 302) and
       '/accounts/login' in resp.headers.get('Location', ''))):
    resp.status_code = 401
    raise requests.HTTPError('401 Unauthorized', response=resp)

  activities, actor = self.html_to_activities(resp.text)

  if fetch_extras and not activity_id:
    # batch get cached counts of comments and likes for all activities
    cached = {}
    # don't update the cache until the end, in case we hit an error before
    cache_updates = {}
    if cache is not None:
      keys = []
      for activity in activities:
        _, id = util.parse_tag_uri(activity['id'])
        keys.extend(['AIL ' + id, 'AIC ' + id])
      cached = cache.get_multi(keys)

    for i, activity in enumerate(activities):
      obj = activity['object']
      _, id = util.parse_tag_uri(activity['id'])
      likes = obj.get('ig_like_count') or 0
      comments = obj.get('replies', {}).get('totalItems') or 0
      likes_key = 'AIL %s' % id
      comments_key = 'AIC %s' % id
      if (likes and likes != cached.get(likes_key) or
          comments and comments != cached.get(comments_key)):
        full_activity, _ = self.html_to_activities(
          util.requests_get(activity['url']).text)
        if full_activity:
          activities[i] = full_activity[0]
          cache_updates.update({likes_key: likes, comments_key: comments})

    if cache_updates and cache is not None:
      cache.set_multi(cache_updates)

  resp = self.make_activities_base_response(activities)
  resp['actor'] = actor
  return resp
def _scrape(self, user_id=None, activity_id=None, cookie=None,
            fetch_extras=False, cache=None, shortcode=None):
  """Scrapes a user's profile or feed and converts the media to activities.

  Args:
    user_id: string
    activity_id: string, e.g. '1020355224898358984_654594'
    fetch_extras: boolean
    cookie: string
    shortcode: string, e.g. '4pB6vEx87I'

  Returns:
    dict activities API response
  """
  assert user_id or activity_id or shortcode or cookie
  assert not (activity_id and shortcode)
  if not shortcode:
    shortcode = self.id_to_shortcode(activity_id)

  url = (HTML_MEDIA % shortcode if shortcode
         else HTML_PROFILE % user_id if user_id
         else HTML_BASE_URL)
  kwargs = {}
  if cookie:
    kwargs = {'headers': {'Cookie': cookie}}
  resp = util.requests_get(url, allow_redirects=False, **kwargs)

  if ((cookie and 'not-logged-in' in resp.text) or
      (resp.status_code in (301, 302) and
       '/accounts/login' in resp.headers.get('Location', ''))):
    resp.status_code = 401
    raise requests.HTTPError('401 Unauthorized', response=resp)
  elif resp.status_code == 404:
    if activity_id:
      return self._scrape(shortcode=activity_id, cookie=cookie)
    # otherwise not found, fall through and return empty response
  else:
    resp.raise_for_status()

  activities, actor = self.html_to_activities(resp.text)

  if fetch_extras and not activity_id:
    # batch get cached counts of comments and likes for all activities
    cached = {}
    # don't update the cache until the end, in case we hit an error before
    cache_updates = {}
    if cache is not None:
      keys = []
      for activity in activities:
        _, id = util.parse_tag_uri(activity['id'])
        keys.extend(['AIL ' + id, 'AIC ' + id])
      cached = cache.get_multi(keys)

    for i, activity in enumerate(activities):
      obj = activity['object']
      _, id = util.parse_tag_uri(activity['id'])
      likes = obj.get('ig_like_count') or 0
      comments = obj.get('replies', {}).get('totalItems') or 0
      likes_key = 'AIL %s' % id
      comments_key = 'AIC %s' % id
      if (likes and likes != cached.get(likes_key) or
          comments and comments != cached.get(comments_key)):
        url = activity['url'].replace(self.BASE_URL, HTML_BASE_URL)
        resp = util.requests_get(url)
        resp.raise_for_status()
        full_activity, _ = self.html_to_activities(resp.text)
        if full_activity:
          activities[i] = full_activity[0]
          cache_updates.update({likes_key: likes, comments_key: comments})

    if cache_updates and cache is not None:
      cache.set_multi(cache_updates)

  resp = self.make_activities_base_response(activities)
  resp['actor'] = actor
  return resp
def _scrape(self, user_id=None, group_id=None, activity_id=None, cookie=None,
            count=None, fetch_extras=False, cache=None, shortcode=None):
  """Scrapes a user's profile or feed and converts the media to activities.

  Args:
    user_id: string
    activity_id: string, e.g. '1020355224898358984_654594'
    count: integer, number of activities to fetch and return, None for all
    fetch_extras: boolean
    cookie: string
    shortcode: string, e.g. '4pB6vEx87I'

  Returns:
    dict activities API response
  """
  assert user_id or activity_id or shortcode or cookie
  assert not (activity_id and shortcode)
  if not shortcode:
    shortcode = self.id_to_shortcode(activity_id)

  url = (HTML_MEDIA % shortcode if shortcode
         else HTML_PROFILE % user_id if user_id and group_id == source.SELF
         else HTML_BASE_URL)
  kwargs = {}
  if cookie:
    if not cookie.startswith('sessionid='):
      cookie = 'sessionid=' + cookie
    kwargs = {'headers': {'Cookie': cookie}}
  resp = util.requests_get(url, allow_redirects=False, **kwargs)

  if ((cookie and 'not-logged-in' in resp.text) or
      (resp.status_code in (301, 302) and
       '/accounts/login' in resp.headers.get('Location', ''))):
    resp.status_code = 401
    raise requests.HTTPError('401 Unauthorized', response=resp)
  elif resp.status_code == 404:
    if activity_id:
      return self._scrape(shortcode=activity_id, cookie=cookie, count=count)
    # otherwise not found, fall through and return empty response
  else:
    resp.raise_for_status()

  activities, actor = self.html_to_activities(resp.text, cookie=cookie, count=count)

  if fetch_extras:
    # batch get cached counts of comments and likes for all activities
    cached = {}
    # don't update the cache until the end, in case we hit an error before
    cache_updates = {}
    if cache is not None:
      keys = []
      for activity in activities:
        _, id = util.parse_tag_uri(activity['id'])
        keys.extend(['AIL ' + id, 'AIC ' + id])
      cached = cache.get_multi(keys)

    for i, activity in enumerate(activities):
      obj = activity['object']
      _, id = util.parse_tag_uri(activity['id'])
      likes = obj.get('ig_like_count') or 0
      comments = obj.get('replies', {}).get('totalItems') or 0
      likes_key = 'AIL %s' % id
      comments_key = 'AIC %s' % id
      if (likes and likes != cached.get(likes_key) or
          comments and comments != cached.get(comments_key)):
        if not activity_id and not shortcode:
          url = activity['url'].replace(self.BASE_URL, HTML_BASE_URL)
          resp = util.requests_get(url)
          resp.raise_for_status()
        # otherwise resp is a fetch of just this activity; reuse it
        full_activity, _ = self.html_to_activities(
          resp.text, cookie=cookie, count=count, fetch_extras=fetch_extras)
        if full_activity:
          activities[i] = full_activity[0]
          cache_updates.update({likes_key: likes, comments_key: comments})

    if cache_updates and cache is not None:
      cache.set_multi(cache_updates)

  resp = self.make_activities_base_response(activities)
  resp['actor'] = actor
  return resp
def html_to_activities(self, html, cookie=None):
  """Converts Instagram HTML to ActivityStreams activities.

  The input HTML may be from:

  * a user's feed, eg https://www.instagram.com/ while logged in
  * a user's profile, eg https://www.instagram.com/snarfed/
  * a photo or video, eg https://www.instagram.com/p/BBWCSrfFZAk/

  Args:
    html: unicode string
    cookie: string, optional sessionid cookie to be used for subsequent HTTP
      fetches, if necessary.

  Returns:
    tuple, ([ActivityStreams activities], ActivityStreams viewer actor)
  """
  # extract JSON data blob
  # (can also get just this JSON by adding ?__a=1 to any IG URL.)
  script_start = '<script type="text/javascript">window._sharedData = '
  start = html.find(script_start)
  if start == -1:
    # Instagram sometimes returns 200 with incomplete HTML. often it stops at
    # the end of one of the <style> tags inside <head>. not sure why.
    logging.warning('JSON script tag not found!')
    return [], None

  # App Engine's Python 2.7.5 json module doesn't support unpaired surrogate
  # Unicode chars, so it chokes on some JSON docs. Monkey patch in simplejson
  # to fix that.
  # https://code.google.com/p/googleappengine/issues/detail?id=12823
  # http://stackoverflow.com/questions/15236742
  try:
    import simplejson
    json_module = simplejson
  except ImportError:
    json_module = json

  start += len(script_start)
  end = html.find(';</script>', start)
  if end == -1:
    # as mentioned above, Instagram sometimes returns 200 with incomplete HTML
    logging.warning('JSON script close tag not found!')
    return [], None

  data = util.trim_nulls(json_module.loads(html[start:end]))
  entry_data = data.get('entry_data', {})

  activities = []

  # find media
  medias = []
  profile_user = None

  # home page ie news feed
  for page in entry_data.get('FeedPage', []):
    edges = page.get('graphql', {}).get('user', {})\
                .get('edge_web_feed_timeline', {}).get('edges', [])
    medias.extend(e.get('node') for e in edges
                  if e.get('node', {}).get('__typename') not in (
                    'GraphSuggestedUserFeedUnit',))

  # profiles
  for page in entry_data.get('ProfilePage', []):
    profile_user = page.get('graphql', {}).get('user', {})
    medias.extend(
      edge['node'] for edge
      in profile_user.get('edge_owner_to_timeline_media', {}).get('edges', [])
      if edge.get('node'))

  # individual photo/video permalinks
  for page in entry_data.get('PostPage', []):
    media = page.get('graphql', {}).get('shortcode_media')
    if media:
      medias.append(media)

  if not medias:
    # As of 2018-02-15, embedded JSON in logged in https://www.instagram.com/
    # no longer has any useful data. Need to do a second header link fetch.
    soup = BeautifulSoup(html)
    link = soup.find('link', href=HTML_PRELOAD_RE)
    if link:
      url = urllib.parse.urljoin(HTML_BASE_URL, link['href'])
      headers = {'Cookie': cookie} if cookie else None
      resp = util.requests_get(url, allow_redirects=False, headers=headers)
      try:
        data = resp.json()
      except ValueError as e:
        msg = "Couldn't decode response as JSON:\n%s" % resp.text
        logging.exception(msg)
        resp.status_code = 504
        raise requests.HTTPError('504 Bad response from Instagram\n' + msg,
                                 response=resp)
      edges = data.get('data', {}).get('user', {})\
                  .get('edge_web_feed_timeline', {}).get('edges', [])
      medias = [e.get('node') for e in edges]

  for media in util.trim_nulls(medias):
    activities.append(self._json_media_node_to_activity(media, user=profile_user))

  actor = None
  user = self._json_user_to_user(data.get('config', {}).get('viewer') or profile_user)
  if user:
    actor = self.user_to_actor(user)

  return activities, actor