def urlopen(self, url, **kwargs):
  """Wraps urllib2.urlopen() and passes through the access token."""
  log_url = url
  if self.access_token:
    log_url = util.add_query_params(url, [('access_token',
                                           self.access_token[:4] + '...')])
    url = util.add_query_params(url, [('access_token', self.access_token)])
  logging.info('Fetching %s, kwargs %s', log_url, kwargs)
  return urllib2.urlopen(urllib2.Request(url, **kwargs),
                         timeout=appengine_config.HTTP_TIMEOUT)
def urlopen(self, url, **kwargs):
  """Wraps urllib2.urlopen() and passes through the access token."""
  log_url = url
  if self.access_token:
    log_url = util.add_query_params(url, [('access_token',
                                           self.access_token[:4] + '...')])
    # TODO add access_token to the data parameter for POST requests
    url = util.add_query_params(url, [('access_token', self.access_token)])
  logging.info('Fetching %s, kwargs %s', log_url, kwargs)
  resp = urllib2.urlopen(urllib2.Request(url, **kwargs),
                         timeout=appengine_config.HTTP_TIMEOUT)
  return resp if kwargs.get('data') else json.loads(resp.read()).get('data')
def urlopen(self, relative_url, parse_response=True, **kwargs):
  """Wraps urllib2.urlopen() and passes through the access token.

  Returns:
    decoded JSON dict if parse_response is True, otherwise urlopen response
    object
  """
  url = API_BASE + relative_url
  log_url = url
  if self.access_token:
    log_url = util.add_query_params(url, [('access_token',
                                           self.access_token[:4] + '...')])
    url = util.add_query_params(url, [('access_token', self.access_token)])
  logging.info('Fetching %s, kwargs %s', log_url, kwargs)
  resp = urllib2.urlopen(urllib2.Request(url, **kwargs),
                         timeout=appengine_config.HTTP_TIMEOUT)
  return json.loads(resp.read()) if parse_response else resp
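A minimal usage sketch for the parse_response flag above. The instance name
`src`, the relative URL 'me/feed', and the ETag value are illustrative
assumptions, not taken from the original code; the conditional-GET pattern
mirrors the callers further down in this file.

import json

etag = '"abc123"'  # hypothetical ETag from a previous fetch
# assuming `src` is an instance of the class defining urlopen() above
feed = src.urlopen('me/feed')                          # decoded JSON dict
raw = src.urlopen('me/feed', parse_response=False,     # raw response object,
                  headers={'If-None-Match': etag})     # e.g. for ETag handling
new_etag = raw.info().get('ETag')
posts = json.loads(raw.read())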
def fetch_replies(self, activities, min_id=None):
  """Fetches and injects Twitter replies into a list of activities, in place.

  Includes indirect replies ie reply chains, not just direct replies. Searches
  for @-mentions, matches them to the original tweets with
  in_reply_to_status_id_str, and recurses until it's walked the entire tree.

  Args:
    activities: list of activity dicts

  Returns:
    same activities list
  """
  # cache searches for @-mentions for individual users. maps username to dict
  # mapping tweet id to ActivityStreams reply object dict.
  mentions = {}

  # find replies
  for activity in activities:
    # list of ActivityStreams reply object dict and set of seen activity ids
    # (tag URIs). seed with the original tweet; we'll filter it out later.
    replies = [activity]
    _, id = util.parse_tag_uri(activity['id'])
    seen_ids = set([id])

    for reply in replies:
      # get mentions of this tweet's author so we can search them for replies to
      # this tweet. can't use statuses/mentions_timeline because i'd need to
      # auth as the user being mentioned.
      # https://dev.twitter.com/docs/api/1.1/get/statuses/mentions_timeline
      #
      # note that these HTTP requests are synchronous. you can make async
      # requests by using urlfetch.fetch() directly, but not with urllib2.
      # https://developers.google.com/appengine/docs/python/urlfetch/asynchronousrequests
      author = reply['actor']['username']
      if author not in mentions:
        url = API_SEARCH % {
          'q': urllib.quote_plus('@' + author.encode('utf-8')),
          'count': 100,
        }
        if min_id is not None:
          url = util.add_query_params(url, {'since_id': min_id})
        mentions[author] = self.urlopen(url)['statuses']

      # look for replies. add any we find to the end of replies. this makes us
      # recursively follow reply chains to their end. (python supports
      # appending to a sequence while you're iterating over it.)
      for mention in mentions[author]:
        id = mention['id_str']
        if (mention.get('in_reply_to_status_id_str') in seen_ids and
            id not in seen_ids):
          replies.append(self.tweet_to_activity(mention))
          seen_ids.add(id)

    items = [r['object'] for r in replies[1:]]  # filter out seed activity
    activity['object']['replies'] = {
      'items': items,
      'totalItems': len(items),
    }
def fetch_replies(self, activities, min_id=None):
  """Fetches and injects Twitter replies into a list of activities, in place.

  Includes indirect replies ie reply chains, not just direct replies. Searches
  for @-mentions, matches them to the original tweets with
  in_reply_to_status_id_str, and recurses until it's walked the entire tree.

  Args:
    activities: list of activity dicts

  Returns:
    same activities list
  """
  # cache searches for @-mentions for individual users. maps username to dict
  # mapping tweet id to ActivityStreams reply object dict.
  mentions = {}

  # find replies
  for activity in activities:
    # list of ActivityStreams reply object dict and set of seen activity ids
    # (tag URIs). seed with the original tweet; we'll filter it out later.
    replies = [activity]
    _, id = util.parse_tag_uri(activity['id'])
    seen_ids = set([id])

    for reply in replies:
      # get mentions of this tweet's author so we can search them for replies to
      # this tweet. can't use statuses/mentions_timeline because i'd need to
      # auth as the user being mentioned.
      # https://dev.twitter.com/docs/api/1.1/get/statuses/mentions_timeline
      #
      # note that these HTTP requests are synchronous. you can make async
      # requests by using urlfetch.fetch() directly, but not with urllib2.
      # https://developers.google.com/appengine/docs/python/urlfetch/asynchronousrequests
      author = reply['actor']['username']
      if author not in mentions:
        url = API_SEARCH_URL % {
          'q': urllib.quote_plus('@' + author),
          'count': 100,
        }
        if min_id is not None:
          url = util.add_query_params(url, {'since_id': min_id})
        mentions[author] = self.urlopen(url)['statuses']

      # look for replies. add any we find to the end of replies. this makes us
      # recursively follow reply chains to their end. (python supports
      # appending to a sequence while you're iterating over it.)
      for mention in mentions[author]:
        id = mention['id_str']
        if (mention.get('in_reply_to_status_id_str') in seen_ids and
            id not in seen_ids):
          replies.append(self.tweet_to_activity(mention))
          seen_ids.add(id)

    items = [r['object'] for r in replies[1:]]  # filter out seed activity
    activity['object']['replies'] = {
      'items': items,
      'totalItems': len(items),
    }
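The reply-chain walk above can be hard to follow inline, so here is a
stripped-down, self-contained sketch of the same idea with plain dicts instead
of Twitter API calls: seed a worklist with the original tweet id, then keep
appending any mention whose in_reply_to id is already in the seen set, which
walks indirect reply chains to their end. The data and function name are
made up for illustration only.

def walk_reply_chain(original_id, mentions):
  """Returns ids of all direct and indirect replies to original_id."""
  replies = []
  seen_ids = {original_id}
  worklist = [original_id]
  for _ in worklist:  # appending to worklist extends this iteration
    for m in mentions:
      if (m['in_reply_to_status_id_str'] in seen_ids and
          m['id_str'] not in seen_ids):
        replies.append(m['id_str'])
        seen_ids.add(m['id_str'])
        worklist.append(m['id_str'])
  return replies

mentions = [
  {'id_str': '2', 'in_reply_to_status_id_str': '1'},  # direct reply to 1
  {'id_str': '3', 'in_reply_to_status_id_str': '2'},  # reply to the reply
  {'id_str': '4', 'in_reply_to_status_id_str': '9'},  # unrelated
]
print(walk_reply_chain('1', mentions))  # ['2', '3']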
def urlopen(self, url, **kwargs):
  """Wraps urllib2.urlopen() and passes through the access token."""
  log_url = url
  if self.access_token:
    # TODO add access_token to the data parameter for POST requests
    url = util.add_query_params(url, [('access_token', self.access_token)])
  resp = util.urlopen(urllib2.Request(url, **kwargs))
  return resp if kwargs.get('data') else json.loads(resp.read()).get('data')
def urlopen(self, url, **kwargs):
  """Wraps :func:`urllib2.urlopen()` and passes through the access token."""
  log_url = url
  if self.access_token:
    # TODO add access_token to the data parameter for POST requests
    url = util.add_query_params(url, [('access_token', self.access_token)])
  resp = util.urlopen(urllib2.Request(url, **kwargs))
  return (resp if kwargs.get('data')
          else source.load_json(resp.read(), url).get('data'))
def urlopen(self, url, **kwargs):
  """Wraps :func:`urllib2.urlopen()` and passes through the access token."""
  if self.access_token:
    # TODO add access_token to the data parameter for POST requests
    url = util.add_query_params(url, [('access_token', self.access_token)])
  resp = util.urlopen(urllib.request.Request(url, **kwargs))
  return (resp if kwargs.get('data')
          else source.load_json(resp.read(), url).get('data'))
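The `data` check in the return statement above means POST requests (which pass
the urllib `data` kwarg) get the raw response object back, while GETs are
decoded and unwrapped to the JSON 'data' field. A hedged usage sketch with a
made-up endpoint and instance name:

import urllib.parse

# assuming `src` is an instance of the class defining urlopen() above
items = src.urlopen('https://graph.example.com/me/feed')   # GET: list from 'data' field
resp = src.urlopen(                                         # POST: raw response object
  'https://graph.example.com/me/feed',
  data=urllib.parse.urlencode({'message': 'hi'}).encode())
print(resp.getcode())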
def fetch_mentions(self, username, min_id=None):
  """Fetches a user's @-mentions and returns them as ActivityStreams.

  Tries to only include explicit mentions, not mentions automatically created
  by @-replying. See the get_activities() docstring for details.

  Args:
    username: string
    min_id: only return activities with ids greater than this

  Returns:
    list of activity dicts
  """
  # get mentions
  url = API_SEARCH_URL % {
    'q': urllib.quote_plus('@' + username),
    'count': 100,
  }
  if min_id is not None:
    url = util.add_query_params(url, {'since_id': min_id})
  candidates = self.urlopen(url)['statuses']

  # fetch in-reply-to tweets (if any)
  in_reply_to_ids = util.trim_nulls(
    [c.get('in_reply_to_status_id_str') for c in candidates])
  origs = {o.get('id_str'): o for o in
           self.urlopen(API_LOOKUP_URL % ','.join(in_reply_to_ids))}

  # filter out tweets that we don't consider mentions
  mentions = []
  for c in candidates:
    if (c.get('user', {}).get('screen_name') == username or
        c.get('retweeted_status')):
      continue
    reply_to = origs.get(c.get('in_reply_to_status_id_str'))
    if not reply_to:
      mentions.append(c)
    else:
      reply_to_user = reply_to.get('user', {}).get('screen_name')
      mentioned = [u.get('screen_name') for u in
                   reply_to.get('entities', {}).get('user_mentions', [])]
      if username != reply_to_user and username not in mentioned:
        mentions.append(c)

  return mentions
class Twitter(source.Source):
  """Implements the ActivityStreams API for Twitter."""

  DOMAIN = 'twitter.com'
  NAME = 'Twitter'
  FRONT_PAGE_TEMPLATE = 'templates/twitter_index.html'

  # HTML snippet for embedding a tweet.
  # https://dev.twitter.com/docs/embedded-tweets
  EMBED_POST = """
  <script async defer src="//platform.twitter.com/widgets.js" charset="utf-8"></script>
  <br />
  <blockquote class="twitter-tweet" lang="en" data-dnt="true">
  <p><a href="%(url)s">%(content)s</a></p>
  </blockquote>
  """

  def __init__(self, access_token_key, access_token_secret):
    """Constructor.

    Twitter now requires authentication in v1.1 of their API. You can get an
    OAuth access token by creating an app here: https://dev.twitter.com/apps/new

    Args:
      access_token_key: string, OAuth access token key
      access_token_secret: string, OAuth access token secret
    """
    self.access_token_key = access_token_key
    self.access_token_secret = access_token_secret

  def get_actor(self, screen_name=None):
    """Returns a user as a JSON ActivityStreams actor dict.

    Args:
      screen_name: string username. Defaults to the current user.
    """
    if screen_name is None:
      url = API_CURRENT_USER_URL
    else:
      url = API_USER_URL % screen_name
    return self.user_to_actor(self.urlopen(url))

  def get_activities_response(self, user_id=None, group_id=None, app_id=None,
                              activity_id=None, start_index=0, count=0,
                              etag=None, min_id=None, cache=None,
                              fetch_replies=False, fetch_likes=False,
                              fetch_shares=False, fetch_events=False,
                              search_query=None):
    """Fetches posts and converts them to ActivityStreams activities.

    XXX HACK: this is currently hacked for bridgy to NOT pass min_id to the
    request for fetching activity tweets themselves, but to pass it to all of
    the requests for filling in replies, retweets, etc. That's because we want
    to find new replies and retweets of older initial tweets.
    TODO: find a better way.

    See method docstring in source.py for details. app_id is ignored.
    min_id is translated to Twitter's since_id.

    The code for handling ETags (and 304 Not Changed responses and setting
    If-None-Match) is here, but unused right now since Twitter evidently
    doesn't support ETags. From https://dev.twitter.com/discussions/5800 :
    "I've confirmed with our team that we're not explicitly supporting this
    family of features."

    Likes (ie favorites) are scraped from twitter.com HTML, since Twitter's
    REST API doesn't offer a way to fetch them. You can also get them from the
    Streaming API, though, and convert them with streaming_event_to_object().
    https://dev.twitter.com/docs/streaming-apis/messages#Events_event

    Shares (ie retweets) are fetched with a separate API call per tweet:
    https://dev.twitter.com/docs/api/1.1/get/statuses/retweets/%3Aid

    However, retweets are only fetched for the first 15 tweets that have them,
    since that's Twitter's rate limit per 15 minute window. :(
    https://dev.twitter.com/docs/rate-limiting/1.1/limits

    Use the group_id @self to retrieve a user_id's timeline. If user_id is
    None or @me, it will return tweets for the current API user.

    group_id can be used to specify the slug of a list for which to return
    tweets. By default the current API user's lists will be used, but lists
    owned by other users can be fetched by explicitly passing a username to
    user_id, e.g. to fetch tweets from the list @exampleuser/example-list you
    would call get_activities(user_id='exampleuser', group_id='example-list').
    """
    activities = []
    if activity_id:
      tweets = [self.urlopen(API_STATUS_URL % activity_id)]
      total_count = len(tweets)
    else:
      if group_id == source.SELF:
        if user_id in (None, source.ME):
          url = API_SELF_TIMELINE_URL % (count + start_index)
        else:
          url = API_USER_TIMELINE_URL % {
            'count': count + start_index,
            'screen_name': user_id,
          }

        if fetch_likes:
          liked = self.urlopen(API_FAVORITES_URL % (user_id or ''))
          if liked:
            user = self.urlopen(
              API_USER_URL % user_id if user_id else API_CURRENT_USER_URL)
            activities += [self._make_like(tweet, user) for tweet in liked]
      elif group_id == source.SEARCH:
        url = API_SEARCH_URL % {
          'q': urllib.quote_plus(search_query),
          'count': count + start_index,
        }
      elif group_id in (None, source.FRIENDS, source.ALL):
        url = API_TIMELINE_URL % (count + start_index)
      else:
        url = API_LIST_TIMELINE_URL % {
          'count': count + start_index,
          'slug': group_id,
          'owner_screen_name': user_id or self.get_actor().get('username'),
        }

      headers = {'If-None-Match': etag} if etag else {}
      total_count = None
      try:
        resp = self.urlopen(url, headers=headers, parse_response=False)
        etag = resp.info().get('ETag')
        tweet_obj = json.loads(resp.read())
        if group_id == source.SEARCH:
          tweet_obj = tweet_obj.get('statuses', [])
        tweets = tweet_obj[start_index:]
      except urllib2.HTTPError, e:
        if e.code == 304:  # Not Modified, from a matching ETag
          tweets = []
        else:
          raise

    # batch get memcached counts of favorites and retweets for all tweets
    cached = {}
    if cache is not None:
      keys = itertools.product(('ATR', 'ATF'), [t['id_str'] for t in tweets])
      cached = cache.get_multi('%s %s' % (prefix, id) for prefix, id in keys)
    # only update the cache at the end, in case we hit an error before then
    cache_updates = {}

    if fetch_shares:
      retweet_calls = 0
      for tweet in tweets:
        if tweet.get('retweeted'):  # this tweet is itself a retweet
          continue
        elif retweet_calls >= RETWEET_LIMIT:
          logging.warning("Hit Twitter's retweet rate limit (%d) with more to "
                          "fetch! Results will be incomplete!" % RETWEET_LIMIT)
          break

        # store retweets in the 'retweets' field, which is handled by
        # tweet_to_activity().
        # TODO: make these HTTP requests asynchronous. not easy since we don't
        # (yet) require threading support or use a non-blocking HTTP library.
        #
        # twitter limits this API endpoint to one call per minute per user,
        # which is easy to hit, so we stop before we hit that.
        # https://dev.twitter.com/docs/rate-limiting/1.1/limits
        #
        # can't use the statuses/retweets_of_me endpoint because it only
        # returns the original tweets, not the retweets or their authors.
        id = tweet['id_str']
        count = tweet.get('retweet_count')
        if count and count != cached.get('ATR ' + id):
          url = API_RETWEETS_URL % id
          if min_id is not None:
            url = util.add_query_params(url, {'since_id': min_id})
          tweet['retweets'] = self.urlopen(url)
          retweet_calls += 1
          cache_updates['ATR ' + id] = count

    tweet_activities = [self.tweet_to_activity(t) for t in tweets]

    if fetch_replies:
      self.fetch_replies(tweet_activities, min_id=min_id)

    if fetch_likes:
      for tweet, activity in zip(tweets, tweet_activities):
        id = tweet['id_str']
        count = tweet.get('favorite_count')
        if count and count != cached.get('ATF ' + id):
          url = HTML_FAVORITES_URL % id
          logging.debug('Fetching %s', url)
          try:
            html = json.loads(
              urllib2.urlopen(url, timeout=HTTP_TIMEOUT).read()).get(
                'htmlUsers', '')
          except urllib2.URLError, e:
            util.interpret_http_exception(e)  # just log it
            continue
          likes = self.favorites_html_to_likes(tweet, html)
          activity['object'].setdefault('tags', []).extend(likes)
          cache_updates['ATF ' + id] = count
def maybe_add_or_delete_source(self, source_cls, auth_entity, state, **kwargs):
  """Adds or deletes a source if auth_entity is not None.

  Used in each source's oauth-dropins :meth:`CallbackHandler.finish()` and
  :meth:`CallbackHandler.get()` methods, respectively.

  Args:
    source_cls: source class, e.g. :class:`instagram.Instagram`
    auth_entity: oauth-dropins auth entity
    state: string, OAuth callback state parameter. a JSON serialized dict with
      operation, feature, and an optional callback URL. For deletes, it will
      also include the source key
    kwargs: passed through to the source_cls constructor

  Returns:
    source entity if it was created or updated, otherwise None
  """
  state_obj = util.decode_oauth_state(state)
  operation = state_obj.get('operation', 'add')
  feature = state_obj.get('feature')
  callback = state_obj.get('callback')
  user_url = state_obj.get('user_url')

  logging.debug(
    'maybe_add_or_delete_source with operation=%s, feature=%s, callback=%s',
    operation, feature, callback)

  if operation == 'add':  # this is an add/update
    if not auth_entity:
      if not self.messages:
        self.messages.add("OK, you're not signed up. Hope you reconsider!")
      if callback:
        callback = util.add_query_params(callback, {'result': 'declined'})
        logging.debug(
          'user declined adding source, redirect to external callback %s',
          callback)
        # call super.redirect so the callback url is unmodified
        super(Handler, self).redirect(callback.encode('utf-8'))
      else:
        self.redirect('/')
      return

    CachedPage.invalidate('/users')
    logging.info('%s.create_new with %s', source_cls.__class__.__name__,
                 (auth_entity.key, state, kwargs))
    source = source_cls.create_new(self, auth_entity=auth_entity,
                                   features=feature.split(',') if feature else [],
                                   user_url=user_url, **kwargs)

    if source:
      # add to login cookie
      logins = self.get_logins()
      logins.append(Login(path=source.bridgy_path(), site=source.SHORT_NAME,
                          name=source.label_name()))
      self.set_logins(logins)

    if callback:
      callback = util.add_query_params(callback, {
        'result': 'success',
        'user': source.bridgy_url(self),
        'key': source.key.urlsafe(),
      } if source else {'result': 'failure'})
      logging.debug(
        'finished adding source, redirect to external callback %s', callback)
      # call super.redirect so the callback url is unmodified
      super(Handler, self).redirect(callback.encode('utf-8'))
    elif source and not source.domains:
      self.redirect('/edit-websites?' + urllib.urlencode({
        'source_key': source.key.urlsafe(),
      }))
    else:
      self.redirect(source.bridgy_url(self) if source else '/')

    return source

  else:  # this is a delete
    if auth_entity:
      self.redirect('/delete/finish?auth_entity=%s&state=%s' %
                    (auth_entity.key.urlsafe(), state))
    else:
      self.messages.add('If you want to disable, please approve the %s prompt.' %
                        source_cls.GR_CLASS.NAME)
      source_key = state_obj.get('source')
      if source_key:
        source = ndb.Key(urlsafe=source_key).get()
        if source:
          return self.redirect(source.bridgy_url(self))
      self.redirect('/')
          resp = self.urlopen(id)
          if resp.get('error'):
            logging.warning("Couldn't fetch object %s: %s", id, resp)
          else:
            posts = [resp]
            break
        except urllib2.URLError, e:
          logging.warning("Couldn't fetch object %s: %s", id, e)
      else:
        posts = []
    else:
      url = API_SELF_POSTS if group_id == source.SELF else API_HOME
      url = url % (user_id if user_id else 'me', start_index)
      if count:
        url = util.add_query_params(url, {'limit': count})

      headers = {'If-None-Match': etag} if etag else {}
      try:
        resp = self.urlopen(url, headers=headers, parse_response=False)
        etag = resp.info().get('ETag')
        posts = json.loads(resp.read()).get('data', [])
      except urllib2.HTTPError, e:
        if e.code == 304:  # Not Modified, from a matching ETag
          posts = []
        else:
          raise

    activities = [self.post_to_activity(p) for p in posts]

    if fetch_shares:
      id_to_activity = {}
def maybe_add_or_delete_source(self, source_cls, auth_entity, state, **kwargs):
  """Adds or deletes a source if auth_entity is not None.

  Used in each source's oauth-dropins :meth:`CallbackHandler.finish()` and
  :meth:`CallbackHandler.get()` methods, respectively.

  Args:
    source_cls: source class, e.g. :class:`instagram.Instagram`
    auth_entity: oauth-dropins auth entity
    state: string, OAuth callback state parameter. a JSON serialized dict with
      operation, feature, and an optional callback URL. For deletes, it will
      also include the source key
    kwargs: passed through to the source_cls constructor

  Returns:
    source entity if it was created or updated, otherwise None
  """
  state_obj = util.decode_oauth_state(state)
  operation = state_obj.get('operation', 'add')
  feature = state_obj.get('feature')
  callback = state_obj.get('callback')
  user_url = state_obj.get('user_url')

  logging.debug(
    'maybe_add_or_delete_source with operation=%s, feature=%s, callback=%s',
    operation, feature, callback)

  if operation == 'add':  # this is an add/update
    if not auth_entity:
      if not self.messages:
        self.messages.add("OK, you're not signed up. Hope you reconsider!")
      if callback:
        callback = util.add_query_params(callback, {'result': 'declined'})
        logging.debug(
          'user declined adding source, redirect to external callback %s',
          callback)
        # call super.redirect so the callback url is unmodified
        super(Handler, self).redirect(callback)
      else:
        self.redirect('/')
      return

    CachedPage.invalidate('/users')
    logging.info('%s.create_new with %s', source_cls.__class__.__name__,
                 (auth_entity.key, state, kwargs))
    source = source_cls.create_new(self, auth_entity=auth_entity,
                                   features=feature.split(',') if feature else [],
                                   user_url=user_url, **kwargs)

    if source:
      # add to login cookie
      logins = self.get_logins()
      logins.append(Login(path=source.bridgy_path(), site=source.SHORT_NAME,
                          name=source.label_name()))
      self.set_logins(logins)

    if callback:
      callback = util.add_query_params(callback, {
        'result': 'success',
        'user': source.bridgy_url(self),
        'key': source.key.urlsafe().decode(),
      } if source else {'result': 'failure'})
      logging.debug(
        'finished adding source, redirect to external callback %s', callback)
      # call super.redirect so the callback url is unmodified
      super(Handler, self).redirect(callback)
    elif source and not source.domains:
      self.redirect('/edit-websites?' + urllib.parse.urlencode({
        'source_key': source.key.urlsafe().decode(),
      }))
    else:
      self.redirect(source.bridgy_url(self) if source else '/')

    return source

  else:  # this is a delete
    if auth_entity:
      self.redirect('/delete/finish?auth_entity=%s&state=%s' %
                    (auth_entity.key.urlsafe().decode(), state))
    else:
      self.messages.add('If you want to disable, please approve the %s prompt.' %
                        source_cls.GR_CLASS.NAME)
      source_key = state_obj.get('source')
      if source_key:
        source = ndb.Key(urlsafe=source_key).get()
        if source:
          return self.redirect(source.bridgy_url(self))
      self.redirect('/')
def get_activities_response(self, user_id=None, group_id=None, app_id=None,
                            activity_id=None, start_index=0, count=0,
                            etag=None, min_id=None, cache=None,
                            fetch_replies=False, fetch_likes=False,
                            fetch_shares=False, fetch_events=False,
                            fetch_mentions=False, search_query=None,
                            scrape=False, cookie=None, **kwargs):
  """Fetches posts and converts them to ActivityStreams activities.

  See method docstring in source.py for details. app_id is ignored.
  Supports min_id, but not ETag, since Instagram doesn't support it.

  http://instagram.com/developer/endpoints/users/#get_users_feed
  http://instagram.com/developer/endpoints/users/#get_users_media_recent

  Likes are always included, regardless of the fetch_likes kwarg. They come
  bundled in the 'likes' field of the API Media object:
  http://instagram.com/developer/endpoints/media/#

  Mentions are never fetched or included because the API doesn't support
  searching for them.
  https://github.com/snarfed/bridgy/issues/523#issuecomment-155523875

  Shares are never fetched or included since there is no share feature.

  Instagram only supports search over hashtags, so if search_query is set, it
  must begin with #.

  May populate a custom 'ig_like_count' property in media objects. (Currently
  only when scraping.)

  Args:
    scrape: if True, scrapes HTML from instagram.com instead of using the API.
      Populates the user's actor object in the 'actor' response field.
      Useful for apps that haven't yet been approved in the new permissions
      approval process. Currently only supports group_id=SELF. Also supports
      passing a shortcode as activity_id as well as the internal API id.
      http://developers.instagram.com/post/133424514006/instagram-platform-update
    cookie: string, only used if scrape=True
    **: see :meth:`Source.get_activities_response`

  Raises:
    InstagramAPIError
  """
  if scrape or self.scrape:
    if not (activity_id or
            (group_id == source.SELF and user_id) or
            (group_id == source.FRIENDS and cookie)):
      raise NotImplementedError(
        'Scraping only supports activity_id, user_id and group_id=@self, '
        'or cookie and group_id=@friends.')
    return self._scrape(user_id=user_id, activity_id=activity_id, cookie=cookie,
                        fetch_extras=fetch_replies or fetch_likes, cache=cache)

  if user_id is None:
    user_id = 'self'
  if group_id is None:
    group_id = source.FRIENDS

  if search_query:
    if search_query.startswith('#'):
      search_query = search_query[1:]
    else:
      raise NotImplementedError(
        'Instagram only supports search over hashtags, so search_query must '
        'begin with the # character.')

  # TODO: paging
  media = []
  kwargs = {}
  if min_id is not None:
    kwargs['min_id'] = min_id

  activities = []
  try:
    media_url = (API_MEDIA_URL % activity_id if activity_id
                 else API_USER_MEDIA_URL % user_id if group_id == source.SELF
                 else API_MEDIA_POPULAR_URL if group_id == source.ALL
                 else API_MEDIA_SEARCH_URL % search_query if group_id == source.SEARCH
                 else API_USER_FEED_URL if group_id == source.FRIENDS
                 else None)
    assert media_url
    media = self.urlopen(util.add_query_params(media_url, kwargs))
    if media:
      if activity_id:
        media = [media]
      activities += [self.media_to_activity(m) for m in util.trim_nulls(media)]

    if group_id == source.SELF and fetch_likes:
      # add the user's own likes
      liked = self.urlopen(
        util.add_query_params(API_USER_LIKES_URL % user_id, kwargs))
      if liked:
        user = self.urlopen(API_USER_URL % user_id)
        activities += [self.like_to_object(user, l['id'], l['link'])
                       for l in liked]

  except urllib2.HTTPError, e:
    code, body = util.interpret_http_exception(e)
    # instagram api should give us back a json block describing the error. but
    # if it's an error for some other reason, it probably won't be properly
    # formatted json.
    try:
      body_obj = json.loads(body) if body else {}
    except ValueError:
      body_obj = {}

    if body_obj.get('meta', {}).get('error_type') == 'APINotFoundError':
      logging.exception(body_obj.get('meta', {}).get('error_message'))
    else:
      raise e
      try:
        posts = [json.loads(self.urlopen(API_OBJECT_URL % id).read())]
        break
      except urllib2.URLError, e:
        logging.warning("Couldn't fetch object %s: %s", id, e)
    else:
      posts = []

    if posts == [False]:  # FB returns false for "not found"
      posts = []
  else:
    url = API_SELF_POSTS_URL if group_id == source.SELF else API_HOME_URL
    url = url % (user_id if user_id else 'me', start_index)
    if count:
      url = util.add_query_params(url, {'limit': count})
    headers = {'If-None-Match': etag} if etag else {}
    try:
      resp = self.urlopen(url, headers=headers)
      etag = resp.info().get('ETag')
      posts = json.loads(resp.read()).get('data', [])
    except urllib2.HTTPError, e:
      if e.code == 304:  # Not Modified, from a matching ETag
        posts = []
      else:
        raise

  activities = [self.post_to_activity(p) for p in posts]
  response = self._make_activities_base_response(activities)
  response['etag'] = etag
  return response
def fetch_mentions(self, username, tweets, min_id=None):
  """Fetches a user's @-mentions and returns them as ActivityStreams.

  Tries to only include explicit mentions, not mentions automatically created
  by @-replying. See the get_activities() docstring for details.

  Args:
    username: string
    tweets: list of Twitter API objects. used to find quote tweets quoting them.
    min_id: only return activities with ids greater than this

  Returns:
    list of activity dicts
  """
  # get @-name mentions
  url = API_SEARCH % {
    'q': urllib.quote_plus('@' + username.encode('utf-8')),
    'count': 100,
  }
  if min_id is not None:
    url = util.add_query_params(url, {'since_id': min_id})
  candidates = self.urlopen(url)['statuses']

  # fetch in-reply-to tweets (if any)
  in_reply_to_ids = util.trim_nulls(
    [c.get('in_reply_to_status_id_str') for c in candidates])
  origs = {
    o.get('id_str'): o for o in
    self.urlopen(API_LOOKUP % ','.join(in_reply_to_ids))
  } if in_reply_to_ids else {}

  # filter out tweets that we don't consider mentions
  mentions = []
  for c in candidates:
    if (c.get('user', {}).get('screen_name') == username or
        c.get('retweeted_status')):
      continue
    reply_to = origs.get(c.get('in_reply_to_status_id_str'))
    if not reply_to:
      mentions.append(c)
    else:
      reply_to_user = reply_to.get('user', {}).get('screen_name')
      mentioned = [u.get('screen_name') for u in
                   reply_to.get('entities', {}).get('user_mentions', [])]
      if username != reply_to_user and username not in mentioned:
        mentions.append(c)

  # search for quote tweets
  # Guideline ("Limit your searches to 10 keywords and operators.") implies
  # fewer, but 20 IDs seems to work in practice.
  # https://dev.twitter.com/rest/public/search
  for batch in [
      tweets[i:i + QUOTE_SEARCH_BATCH_SIZE]
      for i in xrange(0, len(tweets), QUOTE_SEARCH_BATCH_SIZE)
  ]:
    batch_ids = [t['id_str'] for t in batch]
    url = API_SEARCH % {
      'q': urllib.quote_plus(' OR '.join(batch_ids)),
      'count': 100,
    }
    if min_id is not None:
      url = util.add_query_params(url, {'since_id': min_id})
    candidates = self.urlopen(url)['statuses']
    for c in candidates:
      quoted_status_id = c.get('quoted_status_id_str')
      if (quoted_status_id and quoted_status_id in batch_ids and
          not c.get('retweeted_status')):
        mentions.append(c)

  return mentions
def get_activities_response(self, user_id=None, group_id=None, app_id=None,
                            activity_id=None, start_index=0, count=0,
                            etag=None, min_id=None, cache=None,
                            fetch_replies=False, fetch_likes=False,
                            fetch_shares=False, fetch_events=False,
                            fetch_mentions=False, search_query=None):
  """Fetches posts and converts them to ActivityStreams activities.

  See method docstring in source.py for details. app_id is ignored.
  Supports min_id, but not ETag, since Instagram doesn't support it.

  http://instagram.com/developer/endpoints/users/#get_users_feed
  http://instagram.com/developer/endpoints/users/#get_users_media_recent

  Likes are always included, regardless of the fetch_likes kwarg. They come
  bundled in the 'likes' field of the API Media object:
  http://instagram.com/developer/endpoints/media/#

  Mentions are never fetched or included because the API doesn't support
  searching for them.
  https://github.com/snarfed/bridgy/issues/523#issuecomment-155523875

  Shares are never fetched or included since there is no share feature.

  Instagram only supports search over hashtags, so if search_query is set, it
  must begin with #.

  Raises:
    InstagramAPIError
  """
  if user_id is None:
    user_id = 'self'
  if group_id is None:
    group_id = source.FRIENDS

  if search_query:
    if search_query.startswith('#'):
      search_query = search_query[1:]
    else:
      raise NotImplementedError(
        'Instagram only supports search over hashtags, so search_query must '
        'begin with the # character.')

  # TODO: paging
  media = []
  kwargs = {}
  if min_id is not None:
    kwargs['min_id'] = min_id

  activities = []
  try:
    media_url = (API_MEDIA_URL % activity_id if activity_id
                 else API_USER_MEDIA_URL % user_id if group_id == source.SELF
                 else API_MEDIA_POPULAR_URL if group_id == source.ALL
                 else API_MEDIA_SEARCH_URL % search_query if group_id == source.SEARCH
                 else API_USER_FEED_URL if group_id == source.FRIENDS
                 else None)
    assert media_url
    media = self.urlopen(util.add_query_params(media_url, kwargs))
    if media:
      if activity_id:
        media = [media]
      activities += [self.media_to_activity(m) for m in util.trim_nulls(media)]

    if group_id == source.SELF and fetch_likes:
      # add the user's own likes
      liked = self.urlopen(
        util.add_query_params(API_USER_LIKES_URL % user_id, kwargs))
      if liked:
        user = self.urlopen(API_USER_URL % user_id)
        activities += [self.like_to_object(user, l['id'], l['link'])
                       for l in liked]

  except urllib2.HTTPError, e:
    code, body = util.interpret_http_exception(e)
    # instagram api should give us back a json block describing the error. but
    # if it's an error for some other reason, it probably won't be properly
    # formatted json.
    try:
      body_obj = json.loads(body) if body else {}
    except ValueError:
      body_obj = {}

    if body_obj.get('meta', {}).get('error_type') == 'APINotFoundError':
      logging.exception(body_obj.get('meta', {}).get('error_message'))
    else:
      raise e
def fetch_mentions(self, username, tweets, min_id=None):
  """Fetches a user's @-mentions and returns them as ActivityStreams.

  Tries to only include explicit mentions, not mentions automatically created
  by @-replying. See the get_activities() docstring for details.

  Args:
    username: string
    tweets: list of Twitter API objects. used to find quote tweets quoting them.
    min_id: only return activities with ids greater than this

  Returns:
    list of activity dicts
  """
  # get @-name mentions
  url = API_SEARCH % {
    'q': urllib.quote_plus('@' + username),
    'count': 100,
  }
  if min_id is not None:
    url = util.add_query_params(url, {'since_id': min_id})
  candidates = self.urlopen(url)['statuses']

  # fetch in-reply-to tweets (if any)
  in_reply_to_ids = util.trim_nulls(
    [c.get('in_reply_to_status_id_str') for c in candidates])
  origs = {
    o.get('id_str'): o for o in
    self.urlopen(API_LOOKUP % ','.join(in_reply_to_ids))
  } if in_reply_to_ids else {}

  # filter out tweets that we don't consider mentions
  mentions = []
  for c in candidates:
    if (c.get('user', {}).get('screen_name') == username or
        c.get('retweeted_status')):
      continue
    reply_to = origs.get(c.get('in_reply_to_status_id_str'))
    if not reply_to:
      mentions.append(c)
    else:
      reply_to_user = reply_to.get('user', {}).get('screen_name')
      mentioned = [u.get('screen_name') for u in
                   reply_to.get('entities', {}).get('user_mentions', [])]
      if username != reply_to_user and username not in mentioned:
        mentions.append(c)

  # search for quote tweets
  # Guideline ("Limit your searches to 10 keywords and operators.") implies
  # fewer, but 20 IDs seems to work in practice.
  # https://dev.twitter.com/rest/public/search
  for batch in [
      tweets[i:i + QUOTE_SEARCH_BATCH_SIZE]
      for i in xrange(0, len(tweets), QUOTE_SEARCH_BATCH_SIZE)
  ]:
    batch_ids = [t['id_str'] for t in batch]
    url = API_SEARCH % {
      'q': urllib.quote_plus(' OR '.join(batch_ids)),
      'count': 100,
    }
    if min_id is not None:
      url = util.add_query_params(url, {'since_id': min_id})
    candidates = self.urlopen(url)['statuses']
    for c in candidates:
      quoted_status_id = c.get('quoted_status_id_str')
      if quoted_status_id and quoted_status_id in batch_ids:
        mentions.append(c)

  return mentions
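The quote-tweet search above batches candidate tweet IDs into OR'd search
queries. A self-contained sketch of just that batching step, with a made-up
batch size standing in for QUOTE_SEARCH_BATCH_SIZE and made-up IDs:

QUOTE_SEARCH_BATCH_SIZE = 5  # assumed value for this sketch only
ids = [str(n) for n in range(1, 13)]

batches = [ids[i:i + QUOTE_SEARCH_BATCH_SIZE]
           for i in range(0, len(ids), QUOTE_SEARCH_BATCH_SIZE)]
queries = [' OR '.join(batch) for batch in batches]
print(queries)  # ['1 OR 2 OR 3 OR 4 OR 5', '6 OR 7 OR 8 OR 9 OR 10', '11 OR 12']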
def get_activities_response(self, user_id=None, group_id=None, app_id=None,
                            activity_id=None, start_index=0, count=0,
                            etag=None, min_id=None, cache=None,
                            fetch_replies=False, fetch_likes=False,
                            fetch_shares=False, fetch_events=False,
                            fetch_mentions=False, search_query=None,
                            scrape=False, cookie=None,
                            ignore_rate_limit=False, **kwargs):
  """Fetches posts and converts them to ActivityStreams activities.

  See method docstring in source.py for details. app_id is ignored.
  Supports min_id, but not ETag, since Instagram doesn't support it.

  http://instagram.com/developer/endpoints/users/#get_users_feed
  http://instagram.com/developer/endpoints/users/#get_users_media_recent

  Likes are always included, regardless of the fetch_likes kwarg. They come
  bundled in the 'likes' field of the API Media object:
  http://instagram.com/developer/endpoints/media/#

  Mentions are never fetched or included because the API doesn't support
  searching for them.
  https://github.com/snarfed/bridgy/issues/523#issuecomment-155523875

  Shares are never fetched or included since there is no share feature.

  Instagram only supports search over hashtags, so if search_query is set, it
  must begin with #.

  May populate a custom 'ig_like_count' property in media objects. (Currently
  only when scraping.)

  Args:
    scrape: if True, scrapes HTML from instagram.com instead of using the API.
      Populates the user's actor object in the 'actor' response field.
      Useful for apps that haven't yet been approved in the new permissions
      approval process. Currently only supports group_id=SELF. Also supports
      passing a shortcode as activity_id as well as the internal API id.
      http://developers.instagram.com/post/133424514006/instagram-platform-update
    cookie: string, only used if scrape=True
    ignore_rate_limit: boolean, for scraping, always make an HTTP request,
      even if we've been rate limited recently
    **: see :meth:`Source.get_activities_response`

  Raises:
    InstagramAPIError
  """
  if group_id is None:
    group_id = source.FRIENDS

  if scrape or self.scrape:
    if not (activity_id or
            (group_id == source.SELF and user_id) or
            (group_id == source.FRIENDS and cookie)):
      raise NotImplementedError(
        'Scraping only supports activity_id, user_id and group_id=@self, '
        'or cookie and group_id=@friends.')
    elif fetch_likes and not cookie and not self.cookie:
      raise NotImplementedError('Scraping likes requires a cookie.')

    # cache rate limited responses and short circuit
    global _last_rate_limited, _last_rate_limited_exc
    now = datetime.datetime.now()
    if not ignore_rate_limit and _last_rate_limited:
      retry = _last_rate_limited + RATE_LIMIT_BACKOFF
      if now < retry:
        logging.info(
          'Remembered rate limit at %s, waiting until %s to try again.',
          _last_rate_limited, retry)
        assert _last_rate_limited_exc
        raise _last_rate_limited_exc

    try:
      return self._scrape(user_id=user_id, group_id=group_id,
                          activity_id=activity_id, count=count, cookie=cookie,
                          fetch_extras=fetch_replies or fetch_likes,
                          cache=cache)
    except Exception as e:
      code, body = util.interpret_http_exception(e)
      if not ignore_rate_limit and code in ('429', '503'):
        logging.info('Got rate limited! Remembering for %s',
                     str(RATE_LIMIT_BACKOFF))
        _last_rate_limited = now
        _last_rate_limited_exc = e
      raise

  if user_id is None:
    user_id = 'self'

  if search_query:
    if search_query.startswith('#'):
      search_query = search_query[1:]
    else:
      raise ValueError(
        'Instagram only supports search over hashtags, so search_query must '
        'begin with the # character.')

  # TODO: paging
  media = []
  kwargs = {}
  if min_id is not None:
    kwargs['min_id'] = min_id

  activities = []
  try:
    media_url = (API_MEDIA_URL % activity_id if activity_id
                 else API_USER_MEDIA_URL % user_id if group_id == source.SELF
                 else API_MEDIA_POPULAR_URL if group_id == source.ALL
                 else API_MEDIA_SEARCH_URL % search_query if group_id == source.SEARCH
                 else API_USER_FEED_URL if group_id == source.FRIENDS
                 else None)
    assert media_url
    media = self.urlopen(util.add_query_params(media_url, kwargs))
    if media:
      if activity_id:
        media = [media]
      activities += [self.media_to_activity(m) for m in util.trim_nulls(media)]

    if group_id == source.SELF and fetch_likes:
      # add the user's own likes
      liked = self.urlopen(
        util.add_query_params(API_USER_LIKES_URL % user_id, kwargs))
      if liked:
        user = self.urlopen(API_USER_URL % user_id)
        activities += [self.like_to_object(user, l['id'], l['link'])
                       for l in liked]

  except urllib_error.HTTPError as e:
    code, body = util.interpret_http_exception(e)
    # instagram api should give us back a json block describing the error. but
    # if it's an error for some other reason, it probably won't be properly
    # formatted json.
    try:
      body_obj = json.loads(body) if body else {}
    except ValueError:
      body_obj = {}

    if body_obj.get('meta', {}).get('error_type') == 'APINotFoundError':
      logging.warning(body_obj.get('meta', {}).get('error_message'),
                      exc_info=True)
    else:
      raise e

  return self.make_activities_base_response(activities)
def get_activities_response(self, user_id=None, group_id=None, app_id=None,
                            activity_id=None, start_index=0, count=0,
                            etag=None, min_id=None, cache=None,
                            fetch_replies=False, fetch_likes=False,
                            fetch_shares=False, fetch_events=False,
                            search_query=None):
  """Fetches posts and converts them to ActivityStreams activities.

  See method docstring in source.py for details. app_id is ignored.
  Supports min_id, but not ETag, since Instagram doesn't support it.

  http://instagram.com/developer/endpoints/users/#get_users_feed
  http://instagram.com/developer/endpoints/users/#get_users_media_recent

  Likes are always included, regardless of the fetch_likes kwarg. They come
  bundled in the 'likes' field of the API Media object:
  http://instagram.com/developer/endpoints/media/#

  Instagram doesn't have a reshare feature, so shares are never included
  since they don't exist. :P

  Raises:
    InstagramAPIError
  """
  if user_id is None:
    user_id = 'self'
  if group_id is None:
    group_id = source.FRIENDS

  # TODO: paging
  media = []
  kwargs = {}
  if min_id is not None:
    kwargs['min_id'] = min_id

  activities = []
  try:
    media_url = (API_MEDIA_URL % activity_id if activity_id
                 else API_USER_MEDIA_URL % user_id if group_id == source.SELF
                 else API_MEDIA_POPULAR_URL if group_id == source.ALL
                 else API_MEDIA_SEARCH_URL % search_query if group_id == source.SEARCH
                 else API_USER_FEED_URL if group_id == source.FRIENDS
                 else None)
    assert media_url
    media = self.urlopen(util.add_query_params(media_url, kwargs))
    if media:
      if activity_id:
        media = [media]
      activities += [self.media_to_activity(m) for m in util.trim_nulls(media)]

    if group_id == source.SELF and fetch_likes:
      # add the user's own likes
      liked = self.urlopen(
        util.add_query_params(API_USER_LIKES_URL % user_id, kwargs))
      if liked:
        user = self.urlopen(API_USER_URL % user_id)
        activities += [self.like_to_object(user, l['id'], l['link'])
                       for l in liked]

  except urllib2.HTTPError, e:
    code, body = oauth_handlers.interpret_http_exception(e)
    # instagram api should give us back a json block describing the error. but
    # if it's an error for some other reason, it probably won't be properly
    # formatted json.
    try:
      body_obj = json.loads(body) if body else {}
    except ValueError:
      body_obj = {}

    if body_obj.get('meta', {}).get('error_type') == 'APINotFoundError':
      logging.exception(body_obj.get('meta', {}).get('error_message'))
    else:
      raise e
def maybe_add_or_delete_source(source_cls, auth_entity, state, **kwargs):
  """Adds or deletes a source if auth_entity is not None.

  Used in each source's oauth-dropins :meth:`Callback.finish()` and
  :meth:`Callback.get()` methods, respectively.

  Args:
    source_cls: source class, e.g. :class:`instagram.Instagram`
    auth_entity: oauth-dropins auth entity
    state: string, OAuth callback state parameter. a JSON serialized dict with
      operation, feature, and an optional callback URL. For deletes, it will
      also include the source key
    kwargs: passed through to the source_cls constructor

  Returns:
    source entity if it was created or updated, otherwise None
  """
  state_obj = util.decode_oauth_state(state)
  operation = state_obj.get('operation', 'add')
  feature = state_obj.get('feature')
  callback = state_obj.get('callback')
  user_url = state_obj.get('user_url')

  logger.debug(
    'maybe_add_or_delete_source with operation=%s, feature=%s, callback=%s',
    operation, feature, callback)
  logins = None

  if operation == 'add':  # this is an add/update
    if not auth_entity:
      # TODO: only show if we haven't already flashed another message?
      # get_flashed_messages() caches so it's dangerous to call to check;
      # use eg session.get('_flashes', []) instead.
      # https://stackoverflow.com/a/17243946/186123
      flash("OK, you're not signed up. Hope you reconsider!")
      if callback:
        callback = util.add_query_params(callback, {'result': 'declined'})
        logger.debug(
          f'user declined adding source, redirect to external callback {callback}')
        redirect(callback)
      else:
        redirect('/')

    logger.info(
      f'{source_cls.__class__.__name__}.create_new with {auth_entity.key}, {state}, {kwargs}')
    source = source_cls.create_new(
      auth_entity=auth_entity,
      features=feature.split(',') if feature else [],
      user_url=user_url, **kwargs)

    if source:
      # if we're normalizing username case to lower case to make the key id,
      # check if there's an old Source with a capitalized key id, and if so,
      # disable it. https://github.com/snarfed/bridgy/issues/884
      if source.USERNAME_KEY_ID and source.username != source.key_id():
        @ndb.transactional()
        def maybe_disable_original():
          orig = source_cls.get_by_id(source.username)
          if orig:
            logging.info(
              f'Disabling {orig.bridgy_url()} for lower case {source.bridgy_url()}')
            orig.features = []
            orig.put()

        maybe_disable_original()

      # add to login cookie
      logins = get_logins()
      logins.append(Login(path=source.bridgy_path(), site=source.SHORT_NAME,
                          name=source.label_name()))

      if callback:
        callback = util.add_query_params(callback, {
          'result': 'success',
          'user': source.bridgy_url(),
          'key': source.key.urlsafe().decode(),
        } if source else {'result': 'failure'})
        logger.debug(
          'finished adding source, redirect to external callback %s', callback)
        redirect(callback, logins=logins)
      elif not source.domains:
        redirect('/edit-websites?' + urllib.parse.urlencode({
          'source_key': source.key.urlsafe().decode(),
        }), logins=logins)
      else:
        redirect(source.bridgy_url(), logins=logins)

    # no source
    redirect('/')

  else:  # this is a delete
    if auth_entity:
      # TODO: remove from logins cookie
      redirect(
        f'/delete/finish?auth_entity={auth_entity.key.urlsafe().decode()}&state={state}')
    else:
      flash(
        f'If you want to disable, please approve the {source_cls.GR_CLASS.NAME} prompt.')
      source_key = state_obj.get('source')
      if source_key:
        source = ndb.Key(urlsafe=source_key).get()
        if source:
          redirect(source.bridgy_url())
      redirect('/')
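The state parameter handled above is a JSON-serialized dict (operation,
feature, optional callback, and the source key for deletes). Here is a minimal
sketch of that shape using plain json; the values are illustrative, and the
real decode helper is util.decode_oauth_state (used above), which may differ
in details such as URL-encoding.

import json

# example state payload for an "add" flow (values are made up)
state = json.dumps({
  'operation': 'add',
  'feature': 'listen,publish',
  'callback': 'https://example.com/done',
})
state_obj = json.loads(state)
assert state_obj.get('operation', 'add') == 'add'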
class Twitter(source.Source):
  """Implements the ActivityStreams API for Twitter."""

  DOMAIN = 'twitter.com'
  BASE_URL = 'https://twitter.com/'
  NAME = 'Twitter'
  FRONT_PAGE_TEMPLATE = 'templates/twitter_index.html'

  # HTML snippet for embedding a tweet.
  # https://dev.twitter.com/docs/embedded-tweets
  EMBED_POST = """
  <script async defer src="//platform.twitter.com/widgets.js" charset="utf-8"></script>
  <br />
  <blockquote class="twitter-tweet" lang="en" data-dnt="true">
  <p>%(content)s <a href="%(url)s">#</a></p>
  </blockquote>
  """

  def __init__(self, access_token_key, access_token_secret, username=None):
    """Constructor.

    Twitter now requires authentication in v1.1 of their API. You can get an
    OAuth access token by creating an app here: https://dev.twitter.com/apps/new

    Args:
      access_token_key: string, OAuth access token key
      access_token_secret: string, OAuth access token secret
      username: string, optional, the current user. Used in e.g. preview/create.
    """
    self.access_token_key = access_token_key
    self.access_token_secret = access_token_secret
    self.username = username

  def get_actor(self, screen_name=None):
    """Returns a user as a JSON ActivityStreams actor dict.

    Args:
      screen_name: string username. Defaults to the current user.
    """
    if screen_name is None:
      url = API_CURRENT_USER
    else:
      url = API_USER % screen_name
    return self.user_to_actor(self.urlopen(url))

  def get_activities_response(self, user_id=None, group_id=None, app_id=None,
                              activity_id=None, start_index=0, count=0,
                              etag=None, min_id=None, cache=None,
                              fetch_replies=False, fetch_likes=False,
                              fetch_shares=False, fetch_events=False,
                              fetch_mentions=False, search_query=None,
                              **kwargs):
    """Fetches posts and converts them to ActivityStreams activities.

    XXX HACK: this is currently hacked for bridgy to NOT pass min_id to the
    request for fetching activity tweets themselves, but to pass it to all of
    the requests for filling in replies, retweets, etc. That's because we want
    to find new replies and retweets of older initial tweets.
    TODO: find a better way.

    See :meth:`source.Source.get_activities_response()` for details. app_id is
    ignored. min_id is translated to Twitter's since_id.

    The code for handling ETags (and 304 Not Changed responses and setting
    If-None-Match) is here, but unused right now since Twitter evidently
    doesn't support ETags. From https://dev.twitter.com/discussions/5800 :
    "I've confirmed with our team that we're not explicitly supporting this
    family of features."

    Likes (ie favorites) are scraped from twitter.com HTML, since Twitter's
    REST API doesn't offer a way to fetch them. You can also get them from the
    Streaming API, though, and convert them with streaming_event_to_object().
    https://dev.twitter.com/docs/streaming-apis/messages#Events_event

    Shares (ie retweets) are fetched with a separate API call per tweet:
    https://dev.twitter.com/docs/api/1.1/get/statuses/retweets/%3Aid

    However, retweets are only fetched for the first 15 tweets that have them,
    since that's Twitter's rate limit per 15 minute window. :(
    https://dev.twitter.com/docs/rate-limiting/1.1/limits

    Quote tweets are fetched by searching for the possibly quoted tweet's ID,
    using the OR operator to search up to 5 IDs at a time, and then checking
    the quoted_status_id_str field
    https://dev.twitter.com/overview/api/tweets#quoted_status_id_str

    Use the group_id @self to retrieve a user_id's timeline. If user_id is
    None or @me, it will return tweets for the current API user.

    group_id can be used to specify the slug of a list for which to return
    tweets. By default the current API user's lists will be used, but lists
    owned by other users can be fetched by explicitly passing a username to
    user_id, e.g. to fetch tweets from the list @exampleuser/example-list you
    would call get_activities(user_id='exampleuser', group_id='example-list').

    Twitter replies default to including a mention of the user they're
    replying to, which overloads mentions a bit. When fetch_shares is True, we
    determine that a tweet mentions the current user if it @-mentions their
    username and:

    * it's not a reply, OR
    * it's a reply, but not to the current user, AND
      * the tweet it's replying to doesn't @-mention the current user
    """
    if group_id is None:
      group_id = source.FRIENDS

    # nested function for lazily fetching the user object if we need it
    user = []
    def _user():
      if not user:
        user.append(self.urlopen(API_USER % user_id if user_id
                                 else API_CURRENT_USER))
      return user[0]

    if count:
      count += start_index

    activities = []
    if activity_id:
      tweets = [self.urlopen(API_STATUS % activity_id)]
      total_count = len(tweets)
    else:
      if group_id == source.SELF:
        if user_id in (None, source.ME):
          user_id = ''
        url = API_USER_TIMELINE % {
          'count': count,
          'screen_name': user_id,
        }

        if fetch_likes:
          liked = self.urlopen(API_FAVORITES % user_id)
          if liked:
            activities += [self._make_like(tweet, _user()) for tweet in liked]
      elif group_id == source.SEARCH:
        url = API_SEARCH % {
          'q': urllib.quote_plus(search_query.encode('utf-8')),
          'count': count,
        }
      elif group_id in (source.FRIENDS, source.ALL):
        url = API_TIMELINE % (count)
      else:
        if not user_id:
          user_id = _user().get('screen_name')
        url = API_LIST_TIMELINE % {
          'count': count,
          'slug': group_id,
          'owner_screen_name': user_id,
        }

      headers = {'If-None-Match': etag} if etag else {}
      total_count = None
      try:
        resp = self.urlopen(url, headers=headers, parse_response=False)
        etag = resp.info().get('ETag')
        tweet_obj = source.load_json(resp.read(), url)
        if group_id == source.SEARCH:
          tweet_obj = tweet_obj.get('statuses', [])
        tweets = tweet_obj[start_index:]
      except urllib2.HTTPError, e:
        if e.code == 304:  # Not Modified, from a matching ETag
          tweets = []
        else:
          raise

    # batch get memcached counts of favorites and retweets for all tweets
    cached = {}
    if cache is not None:
      keys = itertools.product(('ATR', 'ATF'), [t['id_str'] for t in tweets])
      cached = cache.get_multi('%s %s' % (prefix, id) for prefix, id in keys)
    # only update the cache at the end, in case we hit an error before then
    cache_updates = {}

    if fetch_shares:
      retweet_calls = 0
      for tweet in tweets:
        # don't fetch retweets if the tweet is itself a retweet or if the
        # author's account is protected. /statuses/retweets 403s with error
        # code 200 (?!) for protected accounts.
        # https://github.com/snarfed/bridgy/issues/688
        if tweet.get('retweeted') or tweet.get('user', {}).get('protected'):
          continue
        elif retweet_calls >= RETWEET_LIMIT:
          logging.warning("Hit Twitter's retweet rate limit (%d) with more to "
                          "fetch! Results will be incomplete!" % RETWEET_LIMIT)
          break

        # store retweets in the 'retweets' field, which is handled by
        # tweet_to_activity().
        # TODO: make these HTTP requests asynchronous. not easy since we don't
        # (yet) require threading support or use a non-blocking HTTP library.
        #
        # twitter limits this API endpoint to one call per minute per user,
        # which is easy to hit, so we stop before we hit that.
        # https://dev.twitter.com/docs/rate-limiting/1.1/limits
        #
        # can't use the statuses/retweets_of_me endpoint because it only
        # returns the original tweets, not the retweets or their authors.
        id = tweet['id_str']
        count = tweet.get('retweet_count')
        if count and count != cached.get('ATR ' + id):
          url = API_RETWEETS % id
          if min_id is not None:
            url = util.add_query_params(url, {'since_id': min_id})

          try:
            tweet['retweets'] = self.urlopen(url)
          except urllib2.URLError, e:
            code, _ = util.interpret_http_exception(e)
            if code != '404':  # 404 means the original tweet was deleted
              raise

          retweet_calls += 1
          cache_updates['ATR ' + id] = count