def new(handler, auth_entity=None, **kwargs):
  """Creates and returns a WordPress for the logged in user.

  Args:
    handler: the current RequestHandler
    auth_entity: oauth_dropins.wordpress.WordPressAuth
  """
  auth_domain = auth_entity.key.id()
  site_info = WordPress.get_site_info(handler, auth_entity)
  if site_info is None:
    return

  urls = util.dedupe_urls(util.trim_nulls(
    [site_info.get('URL'), auth_entity.blog_url]))
  domains = [util.domain_from_link(u) for u in urls]

  avatar = (json.loads(auth_entity.user_json).get('avatar_URL')
            if auth_entity.user_json else None)
  return WordPress(id=domains[0],
                   auth_entity=auth_entity.key,
                   name=auth_entity.user_display_name(),
                   picture=avatar,
                   superfeedr_secret=util.generate_secret(),
                   url=urls[0],
                   domain_urls=urls,
                   domains=domains,
                   site_info=site_info,
                   **kwargs)
def _url_and_domain(self, auth_entity):
  """Returns this source's URL and domain.

  Uses the auth entity user_json 'url' field by default. May be overridden
  by subclasses.

  Args:
    auth_entity: oauth_dropins.models.BaseAuth

  Returns:
    (string url, string domain, boolean ok) tuple
  """
  user_json = json.loads(auth_entity.user_json)
  actor = self.as_source.user_to_actor(user_json)
  urls = util.trim_nulls([actor.get('url')] +
                         # also look at G+'s urls field
                         [u.get('value') for u in user_json.get('urls', [])])

  first_url = first_domain = None
  for url in urls:
    # TODO: fully support multiple urls
    for url in url.split():
      url, domain, ok = util.get_webmention_target(url)
      if ok:
        domain = domain.lower()
        return url, domain, True
      elif not first_url:
        first_url = url
        first_domain = domain

  return first_url, first_domain, False
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: oauth_dropins.models.BaseAuth
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.warning('Too many profile links! Only resolving the first %s: %s',
                    MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    url, domain, send = util.get_webmention_target(url, resolve=i < MAX_AUTHOR_URLS)
    if send:
      urls.append(url)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
def get_or_save(self, source, restart=False):
  resp = super(Response, self).get_or_save()

  if (self.type != resp.type or
      source.gr_source.activity_changed(json_loads(resp.response_json),
                                        json_loads(self.response_json),
                                        log=True)):
    logging.info('Response changed! Re-propagating. Original: %s' % resp)

    resp.old_response_jsons = resp.old_response_jsons[:10] + [resp.response_json]

    response_json_to_append = json_loads(self.response_json)
    source.gr_source.append_in_reply_to(json_loads(resp.response_json),
                                        response_json_to_append)
    self.response_json = json_dumps(util.trim_nulls(response_json_to_append))
    resp.response_json = self.response_json
    resp.restart(source)
  elif restart and resp is not self:  # ie it already existed
    resp.restart(source)

  return resp
def tweet_to_activity(self, tweet):
  """Converts a tweet to an activity.

  Args:
    tweet: dict, a decoded JSON tweet

  Returns:
    an ActivityStreams activity dict, ready to be JSON-encoded
  """
  object = self.tweet_to_object(tweet)
  activity = {
    'verb': 'post',
    'published': object.get('published'),
    'id': object.get('id'),
    'url': object.get('url'),
    'actor': object.get('author'),
    'object': object,
    }

  # yes, the source field has an embedded HTML link. bleh.
  # https://dev.twitter.com/docs/api/1/get/statuses/show/
  parsed = re.search('<a href="([^"]+)".*>(.+)</a>', tweet.get('source', ''))
  if parsed:
    url, name = parsed.groups()
    activity['generator'] = {'displayName': name, 'url': url}

  return util.trim_nulls(activity)
def process_webmention_links(self, e):
  """Generates pretty HTML for the links in a :class:`Webmentions` entity.

  Args:
    e: :class:`Webmentions` subclass (:class:`Response` or :class:`BlogPost`)
  """
  link = lambda url, g: util.pretty_link(
    url, glyphicon=g, attrs={'class': 'original-post u-bridgy-target'},
    new_tab=True)
  return util.trim_nulls({
    'Failed': set(link(url, 'exclamation-sign') for url in e.error + e.failed),
    'Sending': set(link(url, 'transfer') for url in e.unsent
                   if url not in e.error),
    'Sent': set(link(url, None) for url in e.sent
                if url not in (e.error + e.unsent)),
    'No <a href="http://indiewebify.me/#send-webmentions">webmention</a> '
    'support': set(link(url, None) for url in e.skipped),
  })
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: oauth_dropins.models.BaseAuth
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  urls = []
  for url in util.trim_nulls(util.uniquify(
      [user_url] + [actor.get('url')] +
      [u.get('value') for u in actor.get('urls', [])])):
    domain = util.domain_from_link(url)
    if domain and not util.in_webmention_blacklist(domain.lower()):
      urls.append(url)

  urls = util.dedupe_urls(urls)
  domains = [util.domain_from_link(url).lower() for url in urls]
  return urls, domains
def post_to_activity(self, post):
  """Converts a post to an activity.

  Args:
    post: dict, a decoded JSON post

  Returns:
    an ActivityStreams activity dict, ready to be JSON-encoded
  """
  object = self.post_to_object(post)
  activity = {
    'verb': 'post',
    'published': object.get('published'),
    'updated': object.get('updated'),
    'id': object.get('id'),
    'url': object.get('url'),
    'actor': object.get('author'),
    'object': object,
    }

  application = post.get('application')
  if application:
    activity['generator'] = {
      'displayName': application.get('name'),
      'id': self.tag_uri(application.get('id')),
      }
  return util.trim_nulls(activity)
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.warning('Too many profile links! Only resolving the first %s: %s',
                    MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    url, domain, send = util.get_webmention_target(url, resolve=i < MAX_AUTHOR_URLS)
    if send:
      urls.append(url)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
def finish(self, auth_entity, state=None):
  if 'target_url' in self.decode_state_parameter(state):
    # this is an interactive publish
    return self.redirect(util.add_query_params(
      '/publish/instagram/finish',
      util.trim_nulls({'auth_entity': auth_entity.key.urlsafe(),
                       'state': state})))

  self.maybe_add_or_delete_source(Instagram, auth_entity, state)
def finish(self, auth_entity, state=None):
  if auth_entity:
    user_json = json.loads(auth_entity.user_json)

    # find instagram profile URL
    urls = user_json.get('rel-me', [])
    logging.info('rel-mes: %s', urls)
    for url in util.trim_nulls(urls):
      if util.domain_from_link(url) == gr_instagram.Instagram.DOMAIN:
        username = urllib.parse.urlparse(url).path.strip('/')
        break
    else:
      self.messages.add(
        'No Instagram profile found. Please <a href="https://indieauth.com/setup">add an Instagram rel-me link</a>, then try again.')
      return self.redirect('/')

    # check that instagram profile links to web site
    try:
      actor = gr_instagram.Instagram(scrape=True).get_actor(
        username, ignore_rate_limit=True)
    except Exception as e:
      code, _ = util.interpret_http_exception(e)
      if code in Instagram.RATE_LIMIT_HTTP_CODES:
        self.messages.add(
          '<a href="https://github.com/snarfed/bridgy/issues/665#issuecomment-524977427">Apologies, Instagram is temporarily blocking us.</a> Please try again later!')
        return self.redirect('/')
      else:
        raise

    if not actor:
      self.messages.add(
        "Couldn't find Instagram user '%s'. Please check your site's rel-me link and your Instagram account." % username)
      return self.redirect('/')

    canonicalize = util.UrlCanonicalizer(redirects=False)
    website = canonicalize(auth_entity.key.id())
    urls = [canonicalize(u) for u in microformats2.object_urls(actor)]
    logging.info('Looking for %s in %s', website, urls)
    if website not in urls:
      self.messages.add(
        "Please add %s to your Instagram profile's website or bio field and try again." % website)
      return self.redirect('/')

    # check that the instagram account is public
    if not gr_source.Source.is_public(actor):
      self.messages.add(
        'Your Instagram account is private. Bridgy only supports public accounts.')
      return self.redirect('/')

    self.maybe_add_or_delete_source(Instagram, auth_entity, state, actor=actor)
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  actor = self.gr_source.user_to_actor(json.loads(auth_entity.user_json))
  logging.debug('Converted to actor: %s', json.dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.info('Too many profile links! Only resolving the first %s: %s',
                 MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    final, domain, ok = util.get_webmention_target(url, resolve=i < MAX_AUTHOR_URLS)
    if ok:
      final = final.lower()
      if util.schemeless(final).startswith(util.schemeless(url.lower())):
        # redirected to a deeper path. use the original higher level URL. #652
        final = url
      # If final has a path segment check if root has a matching rel=me.
      match = re.match(r'^(https?://[^/]+)/.+', final)
      if match and i < MAX_AUTHOR_URLS:
        root = match.group(1)
        resp = util.requests_get(root)
        resp.raise_for_status()
        data = util.mf2py_parse(resp.text, root)
        me_urls = data.get('rels', {}).get('me', [])
        if final in me_urls:
          final = root
      urls.append(final)

  urls = util.dedupe_urls(urls)  # normalizes domains to lower case
  domains = [util.domain_from_link(url) for url in urls]
  return urls, domains
def urls_and_domains(self, auth_entity, user_url, actor=None,
                     resolve_source_domain=True):
  """Returns this user's valid (not webmention-blocklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing
    actor: dict, optional AS actor for the user. If provided, overrides
      auth_entity
    resolve_source_domain: boolean, whether to follow redirects on URLs on
      this source's domain

  Returns:
    ([string url, ...], [string domain, ...])
  """
  if not actor:
    actor = self.gr_source.user_to_actor(json_loads(auth_entity.user_json))
  logger.debug(f'Extracting URLs and domains from actor: {json_dumps(actor, indent=2)}')

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logger.info(f'Too many profile links! Only resolving the first {MAX_AUTHOR_URLS}: {candidates}')

  urls = []
  for i, url in enumerate(candidates):
    on_source_domain = util.domain_from_link(url) == self.gr_source.DOMAIN
    resolve = ((resolve_source_domain or not on_source_domain)
               and i < MAX_AUTHOR_URLS)
    resolved = self.resolve_profile_url(url, resolve=resolve)
    if resolved:
      urls.append(resolved)

  final_urls = []
  domains = []
  for url in util.dedupe_urls(urls):  # normalizes domains to lower case
    # skip links on this source's domain itself. only currently needed for
    # Mastodon; the other silo domains are in the webmention blocklist.
    domain = util.domain_from_link(url)
    if domain != self.gr_source.DOMAIN:
      final_urls.append(url)
      domains.append(domain)

  return final_urls, domains
def _urls_and_domains(self, auth_entity, user_url):
  """Returns this user's valid (not webmention-blacklisted) URLs and domains.

  Converts the auth entity's user_json to an ActivityStreams actor and uses
  its 'urls' and 'url' fields. May be overridden by subclasses.

  Args:
    auth_entity: :class:`oauth_dropins.models.BaseAuth`
    user_url: string, optional URL passed in when authorizing

  Returns:
    ([string url, ...], [string domain, ...])
  """
  user = json_loads(auth_entity.user_json)
  actor = (user.get('actor')  # for Instagram; its user_json is IndieAuth
           or self.gr_source.user_to_actor(user))
  logging.debug('Extracting URLs and domains from actor: %s',
                json_dumps(actor, indent=2))

  candidates = util.trim_nulls(util.uniquify(
    [user_url] + microformats2.object_urls(actor)))

  if len(candidates) > MAX_AUTHOR_URLS:
    logging.info('Too many profile links! Only resolving the first %s: %s',
                 MAX_AUTHOR_URLS, candidates)

  urls = []
  for i, url in enumerate(candidates):
    resolved = self.resolve_profile_url(url, resolve=i < MAX_AUTHOR_URLS)
    if resolved:
      urls.append(resolved)

  final_urls = []
  domains = []
  for url in util.dedupe_urls(urls):  # normalizes domains to lower case
    # skip links on this source's domain itself. only currently needed for
    # Mastodon; the other silo domains are in the webmention blacklist.
    domain = util.domain_from_link(url)
    if domain != self.gr_source.DOMAIN:
      final_urls.append(url)
      domains.append(domain)

  return final_urls, domains
def process_webmention_links(self, e):
  """Generates pretty HTML for the links in a BlogWebmention entity.

  Args:
    e: BlogWebmention subclass (Response or BlogPost)
  """
  link = lambda url, g: util.pretty_link(
    url, glyphicon=g, attrs={'class': 'original-post u-bridgy-target'},
    new_tab=True)
  return util.trim_nulls({
    'Failed': set(link(url, 'exclamation-sign') for url in e.error + e.failed),
    'Sending': set(link(url, 'transfer') for url in e.unsent
                   if url not in e.error),
    'Sent': set(link(url, None) for url in e.sent
                if url not in (e.error + e.unsent)),
    'No <a href="http://indiewebify.me/#send-webmentions">webmention</a> '
    'support': set(link(url, None) for url in e.skipped),
  })
def finish(self, auth_entity, state=None):
  if auth_entity:
    user_json = json.loads(auth_entity.user_json)

    # find instagram profile URL
    urls = user_json.get('rel-me', [])
    logging.info('rel-mes: %s', urls)
    for url in util.trim_nulls(urls):
      if util.domain_from_link(url) == gr_instagram.Instagram.DOMAIN:
        username = urlparse.urlparse(url).path.strip('/')
        break
    else:
      self.messages.add(
        'No Instagram profile found. Please <a href="https://indieauth.com/setup">'
        'add an Instagram rel-me link</a>, then try again.')
      return self.redirect('/')

    # check that instagram profile links to web site
    actor = gr_instagram.Instagram(scrape=True).get_actor(
      username, ignore_rate_limit=True)
    if not actor:
      self.messages.add(
        "Couldn't find Instagram user '%s'. Please check your site's rel-me "
        "link and your Instagram account." % username)
      return self.redirect('/')

    canonicalize = util.UrlCanonicalizer(redirects=False)
    website = canonicalize(auth_entity.key.id())
    urls = [canonicalize(u) for u in microformats2.object_urls(actor)]
    logging.info('Looking for %s in %s', website, urls)
    if website not in urls:
      self.messages.add(
        "Please add %s to your Instagram profile's website or "
        'bio field and try again.' % website)
      return self.redirect('/')

    # check that the instagram account is public
    if not gr_source.Source.is_public(actor):
      self.messages.add('Your Instagram account is private. '
                        'Bridgy only supports public accounts.')
      return self.redirect('/')

    self.maybe_add_or_delete_source(Instagram, auth_entity, state, actor=actor)
def canonicalize_url(self, url, activity=None, **kwargs):
  """Facebook-specific standardization of syndicated urls.

  Canonical form is https://www.facebook.com/USERID/posts/POSTID

  Args:
    url: a string, the url of the syndicated content
    activity: the activity this URL came from. If it has an fb_object_id,
      we'll use that instead of fetching the post from Facebook
    kwargs: unused

  Return:
    a string, the canonical form of the syndication url
  """
  if util.domain_from_link(url) != self.gr_source.DOMAIN:
    return None

  def post_url(id):
    return 'https://www.facebook.com/%s/posts/%s' % (self.key.id(), id)

  parsed = urllib.parse.urlparse(url)
  params = urllib.parse.parse_qs(parsed.query)
  path = parsed.path.strip('/').split('/')
  url_id = self.gr_source.post_id(url)
  ids = params.get('story_fbid') or params.get('fbid')

  if ids:
    url = post_url(ids[0])
  elif url_id:
    if path and path[0] == 'notes':
      url = post_url(url_id)
    else:
      object_id = self.cached_resolve_object_id(url_id, activity=activity)
      if object_id:
        url = post_url(object_id)
      elif path and len(path) > 1 and path[1] == 'posts':
        url = post_url(url_id)

  for alternate_id in util.trim_nulls(itertools.chain(
      (self.username or self.inferred_username,), self.inferred_user_ids)):
    url = url.replace('facebook.com/%s/' % alternate_id,
                      'facebook.com/%s/' % self.key.id())

  return super(FacebookPage, self).canonicalize_url(url)
def tweet_to_object(self, tweet):
  """Converts a tweet to an object.

  Args:
    tweet: dict, a decoded JSON tweet

  Returns:
    an ActivityStreams object dict, ready to be JSON-encoded
  """
  object = {}

  id = tweet.get('id')
  if not id:
    return {}

  object = {
    'objectType': 'note',
    'published': self.rfc2822_to_iso8601(tweet.get('created_at')),
    'content': tweet.get('text'),
    }

  user = tweet.get('user')
  if user:
    object['author'] = self.user_to_actor(user)
    username = object['author'].get('username')
    if username:
      object['id'] = self.tag_uri(id)
      object['url'] = 'http://twitter.com/%s/status/%d' % (username, id)

  # currently the media list will only have photos. if that changes, though,
  # we'll need to make this conditional on media.type.
  # https://dev.twitter.com/docs/tweet-entities
  media_url = tweet.get('entities', {}).get('media', [{}])[0].get('media_url')
  if media_url:
    object['image'] = {'url': media_url}

  place = tweet.get('place')
  if place:
    object['location'] = {
      'displayName': place.get('full_name'),
      'id': place.get('id'),
      'url': place.get('url'),
      }

  return util.trim_nulls(object)
def finish(self, auth_entity, state=None):
  if auth_entity:
    user_json = json.loads(auth_entity.user_json)

    # find instagram profile URL
    urls = user_json.get('rel-me', [])
    logging.info('rel-mes: %s', urls)
    for url in util.trim_nulls(urls):
      if util.domain_from_link(url) == gr_instagram.Instagram.DOMAIN:
        username = urlparse.urlparse(url).path.strip('/')
        break
    else:
      self.messages.add(
        'No Instagram profile found. Please <a href="https://indieauth.com/setup">'
        'add an Instagram rel-me link</a>, then try again.')
      return self.redirect_home_or_user_page(state)

    # check that instagram profile links to web site
    actor = gr_instagram.Instagram(scrape=True).get_actor(username)
    if not actor:
      self.messages.add(
        "Couldn't find Instagram user '%s'. Please check your site's rel-me "
        "link and your Instagram account." % username)
      return self.redirect_home_or_user_page(state)

    canonicalize = util.UrlCanonicalizer(redirects=False)
    website = canonicalize(auth_entity.key.id())
    urls = [canonicalize(u) for u in microformats2.object_urls(actor)]
    logging.info('Looking for %s in %s', website, urls)
    if website not in urls:
      self.messages.add("Please add %s to your Instagram profile's website or "
                        'bio field and try again.' % website)
      return self.redirect_home_or_user_page(state)

    # check that the instagram account is public
    if not gr_source.Source.is_public(actor):
      self.messages.add('Your Instagram account is private. '
                        'Bridgy only supports public accounts.')
      return self.redirect_home_or_user_page(state)

    source = self.maybe_add_or_delete_source(Instagram, auth_entity, state,
                                             actor=actor)
def post_to_object(self, post):
  """Converts a post to an object.

  Args:
    post: dict, a decoded JSON post

  Returns:
    an ActivityStreams object dict, ready to be JSON-encoded
  """
  object = {}

  id = post.get('id')
  if not id:
    return {}

  object = {
    'id': self.tag_uri(str(id)),
    'objectType': 'note',
    'published': post.get('created_time'),
    'updated': post.get('updated_time'),
    'content': post.get('message'),
    'author': self.user_to_actor(post.get('from')),
    # FB post ids are of the form USERID_POSTID
    'url': 'http://facebook.com/' + id.replace('_', '/posts/'),
    'image': {'url': post.get('picture')},
    }

  place = post.get('place')
  if place:
    object['location'] = {
      'displayName': place.get('name'),
      'id': place.get('id'),
      }
    location = place.get('location', {})
    lat = location.get('latitude')
    lon = location.get('longitude')
    if lat and lon:
      # ISO 6709 location string. details: http://en.wikipedia.org/wiki/ISO_6709
      object['location']['position'] = '%+f%+f/' % (lat, lon)

  return util.trim_nulls(object)
def canonicalize_url(self, url, activity=None, **kwargs):
  """Facebook-specific standardization of syndicated urls.

  Canonical form is https://www.facebook.com/USERID/posts/POSTID

  Args:
    url: a string, the url of the syndicated content
    activity: the activity this URL came from. If it has an fb_object_id,
      we'll use that instead of fetching the post from Facebook
    kwargs: unused

  Return:
    a string, the canonical form of the syndication url
  """
  if util.domain_from_link(url) != self.gr_source.DOMAIN:
    return None

  def post_url(id):
    return 'https://www.facebook.com/%s/posts/%s' % (self.key.id(), id)

  parsed = urlparse.urlparse(url)
  params = urlparse.parse_qs(parsed.query)
  url_id = self.gr_source.post_id(url)
  ids = params.get('story_fbid') or params.get('fbid')

  if ids:
    url = post_url(ids[0])
  elif url_id:
    if parsed.path.startswith('/notes/'):
      url = post_url(url_id)
    else:
      object_id = self.cached_resolve_object_id(url_id, activity=activity)
      if object_id:
        url = post_url(object_id)

  for alternate_id in util.trim_nulls(itertools.chain(
      (self.username or self.inferred_username,), self.inferred_user_ids)):
    url = url.replace('facebook.com/%s/' % alternate_id,
                      'facebook.com/%s/' % self.key.id())

  return super(FacebookPage, self).canonicalize_url(url)
def user_to_actor(self, user):
  """Converts a Twitter user to an ActivityStreams actor.

  Args:
    user: dict, a decoded JSON Twitter user

  Returns:
    an ActivityStreams actor dict, ready to be JSON-encoded
  """
  username = user.get('screen_name')
  if not username:
    return {}

  return util.trim_nulls({
    'displayName': user.get('name'),
    'image': {'url': user.get('profile_image_url')},
    'id': self.tag_uri(username) if username else None,
    'published': self.rfc2822_to_iso8601(user.get('created_at')),
    'url': 'http://twitter.com/%s' % username,
    'location': {'displayName': user.get('location')},
    'username': username,
    'description': user.get('description'),
    })
def user_to_actor(self, user):
  """Converts a user to an actor.

  Args:
    user: dict, a decoded JSON Facebook user

  Returns:
    an ActivityStreams actor dict, ready to be JSON-encoded
  """
  if not user:
    return {}

  id = user.get('id')
  username = user.get('username')
  handle = username or id
  if not handle:
    return {}

  # facebook implements this as a 302 redirect
  image_url = 'http://graph.facebook.com/%s/picture?type=large' % handle
  actor = {
    'displayName': user.get('name'),
    'image': {'url': image_url},
    'id': self.tag_uri(handle),
    'updated': user.get('updated_time'),
    'url': user.get('link'),
    'username': username,
    'description': user.get('bio'),
    }

  location = user.get('location')
  if location:
    actor['location'] = {'id': location.get('id'),
                         'displayName': location.get('name')}

  return util.trim_nulls(actor)
def test_trim_nulls(self):
  # basic
  self.assertEqual(None, util.trim_nulls(None))
  self.assertEqual('foo', util.trim_nulls('foo'))
  self.assertEqual([], util.trim_nulls([]))
  self.assertEqual({}, util.trim_nulls({}))
  self.assertEqual(set(), util.trim_nulls(set()))
  self.assertEqual((), util.trim_nulls(()))
  self.assertEqual({1: 0}, util.trim_nulls({1: 0}))  # numeric zero

  # lists
  self.assertEqual([{'xyz': 3}], util.trim_nulls([{'abc': None, 'xyz': 3}]))
  self.assertEqual({'a': ['b'], 'd': ['e']}, util.trim_nulls(
    {'a': ['b'], 'c': [None], 'd': [None, 'e', None], 'f': [[{}], {'a': []}]}))
  self.assertEqual({}, util.trim_nulls({1: None, 2: [], 3: {}, 4: set(),
                                        5: frozenset()}))

  # sets
  self.assertEqual(set((1, 2)), util.trim_nulls(set((1, None, 2))))
  self.assertEqual({'a': set(['b']), 'd': set(['e'])}, util.trim_nulls(
    {'a': set(['b']), 'c': set([None]), 'd': set([None, 'e', None])}))
  self.assertEqual(set(), util.trim_nulls(set((None,))))

  # dicts
  self.assertEqual({1: 2, 3: 4}, util.trim_nulls({1: 2, 3: 4}))
  self.assertEqual({3: 4, 2: 9}, util.trim_nulls({1: None, 3: 4, 5: [], 2: 9}))
  self.assertEqual({1: {3: 4}}, util.trim_nulls({1: {2: [], 3: 4}, 5: {6: None}}))

  # iterator and generator
  self.assertEqual(['a', 'b'], list(util.trim_nulls(iter(['a', None, 'b']))))
  self.assertEqual(['a', 'b'], list(util.trim_nulls(x for x in ['a', None, 'b'])))
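For reference, the assertions above pin down trim_nulls()'s contract: recursively drop None and empty values from nested dicts, lists, tuples, sets, and iterators, while keeping falsy-but-meaningful values like numeric zero. Below is a minimal sketch written only from those assertions; the real helper lives in the webutil util module used by these projects, and its implementation may differ in details, so treat this as illustrative rather than the actual code.

def trim_nulls(value):
  """Sketch: recursively removes entries whose trimmed value is None or an
  empty container/string. Numeric zero and False are kept; scalars and None
  pass through unchanged. Assumed semantics, inferred from the tests above."""
  def is_null(v):
    # assumption: "null" means None or an empty string/container
    return v is None or v in ('', [], (), {}, set(), frozenset())

  if isinstance(value, dict):
    trimmed = {k: trim_nulls(v) for k, v in value.items()}
    return {k: v for k, v in trimmed.items() if not is_null(v)}
  elif isinstance(value, (list, tuple, set, frozenset)):
    trimmed = (trim_nulls(v) for v in value)
    return type(value)(v for v in trimmed if not is_null(v))
  elif hasattr(value, '__iter__') and not isinstance(value, (str, bytes)):
    # plain iterators and generators: trim lazily, preserving iterator-ness,
    # which is why the tests wrap these results in list()
    return (v for v in (trim_nulls(x) for x in value) if not is_null(v))
  return value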
def backfeed(self, source, responses=None, activities=None):
  """Processes responses and activities and generates propagate tasks.

  Stores property names and values to update in source.updates.

  Args:
    source: Source
    responses: dict mapping AS response id to AS object
    activities: dict mapping AS activity id to AS object
  """
  if responses is None:
    responses = {}
  if activities is None:
    activities = {}

  # Cache to make sure we only fetch the author's h-feed(s) the
  # first time we see it
  fetched_hfeeds = set()

  # narrow down to just public activities
  public = {}
  private = {}
  for id, activity in activities.items():
    (public if source.is_activity_public(activity) else private)[id] = activity
  logging.info('Found %d public activities: %s', len(public), public.keys())
  logging.info('Found %d private activities: %s', len(private), private.keys())

  last_public_post = (source.last_public_post or util.EPOCH).isoformat()
  public_published = util.trim_nulls([a.get('published') for a in public.values()])
  if public_published:
    max_published = max(public_published)
    if max_published > last_public_post:
      last_public_post = max_published
      source.updates['last_public_post'] = \
        util.as_utc(util.parse_iso8601(max_published))

  source.updates['recent_private_posts'] = \
    len([a for a in private.values()
         if a.get('published', util.EPOCH_ISO) > last_public_post])

  #
  # Step 2: extract responses, store their activities in response['activities']
  #
  # WARNING: this creates circular references in link posts found by search
  # queries in step 1, since they are their own activity. We use
  # prune_activity() and prune_response() in step 4 to remove these before
  # serializing to JSON.
  #
  for id, activity in public.items():
    obj = activity.get('object') or activity

    # handle user mentions
    user_id = source.user_tag_id()
    if obj.get('author', {}).get('id') != user_id:
      for tag in obj.get('tags', []):
        urls = tag.get('urls')
        if tag.get('objectType') == 'person' and tag.get('id') == user_id and urls:
          activity['originals'], activity['mentions'] = \
            original_post_discovery.discover(
              source, activity, fetch_hfeed=True,
              include_redirect_sources=False,
              already_fetched_hfeeds=fetched_hfeeds)
          activity['mentions'].update(u.get('value') for u in urls)
          responses[id] = activity
          break

    # handle quote mentions
    for att in obj.get('attachments', []):
      if (att.get('objectType') in ('note', 'article') and
          att.get('author', {}).get('id') == source.user_tag_id()):
        # now that we've confirmed that one exists, OPD will dig
        # into the actual attachments
        if 'originals' not in activity or 'mentions' not in activity:
          activity['originals'], activity['mentions'] = \
            original_post_discovery.discover(
              source, activity, fetch_hfeed=True,
              include_redirect_sources=False,
              already_fetched_hfeeds=fetched_hfeeds)
        responses[id] = activity
        break

    # extract replies, likes, reactions, reposts, and rsvps
    replies = obj.get('replies', {}).get('items', [])
    tags = obj.get('tags', [])
    likes = [t for t in tags if Response.get_type(t) == 'like']
    reactions = [t for t in tags if Response.get_type(t) == 'react']
    reposts = [t for t in tags if Response.get_type(t) == 'repost']
    rsvps = Source.get_rsvps_from_event(obj)

    # coalesce responses. drop any without ids
    for resp in replies + likes + reactions + reposts + rsvps:
      id = resp.get('id')
      if not id:
        logging.error('Skipping response without id: %s',
                      json.dumps(resp, indent=2))
        continue

      if source.is_blocked(resp):
        logging.info('Skipping response by blocked user: %s',
                     json.dumps(resp.get('author') or resp.get('actor'), indent=2))
        continue

      resp.setdefault('activities', []).append(activity)

      # when we find two responses with the same id, the earlier one may have
      # come from a link post or user mention, and this one is probably better
      # since it probably came from the user's activity, so prefer this one.
      # background: https://github.com/snarfed/bridgy/issues/533
      existing = responses.get(id)
      if existing:
        if source.gr_source.activity_changed(resp, existing, log=True):
          logging.warning('Got two different versions of same response!\n%s\n%s',
                          existing, resp)
        resp['activities'].extend(existing.get('activities', []))

      responses[id] = resp

  #
  # Step 3: filter out responses we've already seen
  #
  # seen responses (JSON objects) for each source are stored in its entity.
  unchanged_responses = []
  if source.seen_responses_cache_json:
    for seen in json.loads(source.seen_responses_cache_json):
      id = seen['id']
      resp = responses.get(id)
      if resp and not source.gr_source.activity_changed(seen, resp, log=True):
        unchanged_responses.append(seen)
        del responses[id]

  #
  # Step 4: store new responses and enqueue propagate tasks
  #
  pruned_responses = []
  for id, resp in responses.items():
    resp_type = Response.get_type(resp)
    activities = resp.pop('activities', [])
    if not activities and resp_type == 'post':
      activities = [resp]
    too_long = set()
    urls_to_activity = {}
    for i, activity in enumerate(activities):
      # we'll usually have multiple responses for the same activity, and the
      # objects in resp['activities'] are shared, so cache each activity's
      # discovered webmention targets inside its object.
      if 'originals' not in activity or 'mentions' not in activity:
        activity['originals'], activity['mentions'] = \
          original_post_discovery.discover(
            source, activity, fetch_hfeed=True,
            include_redirect_sources=False,
            already_fetched_hfeeds=fetched_hfeeds)

      targets = original_post_discovery.targets_for_response(
        resp, originals=activity['originals'], mentions=activity['mentions'])
      if targets:
        logging.info('%s has %d webmention target(s): %s', activity.get('url'),
                     len(targets), ' '.join(targets))
      for t in targets:
        if len(t) <= _MAX_STRING_LENGTH:
          urls_to_activity[t] = i
        else:
          logging.info('Giving up on target URL over %s chars! %s',
                       _MAX_STRING_LENGTH, t)
          too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

    # store/update response entity. the prune_*() calls are important to
    # remove circular references in link responses, which are their own
    # activities. details in the step 2 comment above.
    pruned_response = util.prune_response(resp)
    pruned_responses.append(pruned_response)
    resp_entity = Response(
      id=id,
      source=source.key,
      activities_json=[json.dumps(util.prune_activity(a, source))
                       for a in activities],
      response_json=json.dumps(pruned_response),
      type=resp_type,
      unsent=list(urls_to_activity.keys()),
      failed=list(too_long),
      original_posts=resp.get('originals', []))
    if urls_to_activity and len(activities) > 1:
      resp_entity.urls_to_activity = json.dumps(urls_to_activity)
    resp_entity.get_or_save(source, restart=self.RESTART_EXISTING_TASKS)

  # update cache
  if pruned_responses:
    source.updates['seen_responses_cache_json'] = json.dumps(
      pruned_responses + unchanged_responses)
def get_activities_response(self, **kwargs):
  # TODO: use batch API to get photos, events, etc in one request
  # https://developers.facebook.com/docs/graph-api/making-multiple-requests
  try:
    resp = self.gr_source.get_activities_response(group_id=SELF, **kwargs)

    # if it's requesting one specific activity, then we're done
    if 'activity_id' in kwargs:
      return resp

    # also get uploaded photos manually since facebook sometimes collapses
    # multiple photos into albums, and the album post object won't have the
    # post content, comments, etc. from the individual photo posts.
    # http://stackoverflow.com/questions/12785120
    #
    # TODO: save and use ETag for all of these extra calls
    photos = self.get_data(API_PHOTOS)

    # also get events and RSVPs
    # https://developers.facebook.com/docs/graph-api/reference/user/events/
    # https://developers.facebook.com/docs/graph-api/reference/event#edges
    # TODO: also fetch and use API_USER_RSVPS_DECLINED
    user_rsvps = self.get_data(API_USER_RSVPS)

    # have to re-fetch the events because the user rsvps response doesn't
    # include the event description, which we need for original post links.
    events = [self.gr_source.urlopen(API_EVENT % r['id'])
              for r in user_rsvps if r.get('id')]

    # also, only process events that the user is the owner of. avoids (but
    # doesn't prevent) processing big non-indieweb events with tons of
    # attendees that put us over app engine's instance memory limit. details:
    # https://github.com/snarfed/bridgy/issues/77
    events_and_rsvps = [(e, self.get_data(API_EVENT_RSVPS % e['id']))
                        for e in events
                        if e.get('owner', {}).get('id') == self.key.id()]

  except urllib2.HTTPError as e:
    # Facebook API error details:
    # https://developers.facebook.com/docs/graph-api/using-graph-api/#receiving-errorcodes
    # https://developers.facebook.com/docs/reference/api/errors/
    exc_type, _, exc_traceback = sys.exc_info()
    body = e.read()
    exc_copy = exc_type(e.filename, e.code, e.msg, e.hdrs, cStringIO.StringIO(body))

    try:
      body_json = json.loads(body)
    except:
      logging.exception('Non-JSON response body: %s', body)
      # response isn't JSON. ignore and re-raise the original exception
      raise exc_type, exc_copy, exc_traceback

    error = body_json.get('error', {})
    if error.get('code') in (102, 190):
      subcode = error.get('error_subcode')
      if subcode == 458:  # revoked
        raise models.DisableSource()
      elif subcode in (463, 460):  # expired, changed password
        # ask the user to reauthenticate
        self.gr_source.create_notification(
          self.key.id(),
          "Brid.gy's access to your account has expired. Click here to renew it now!",
          'https://www.brid.gy/facebook/start')
        raise models.DisableSource()

    # other error. re-raise original exception
    raise exc_type, exc_copy, exc_traceback

  # add photos. they show up as both a post and a photo, each with a separate
  # id. the post's object_id field points to the photo's id. de-dupe by
  # switching the post to use the fb_object_id when it's provided.
  activities = resp.setdefault('items', [])
  activities_by_fb_id = {}
  for activity in activities:
    obj = activity.get('object', {})
    fb_id = obj.get('fb_object_id')
    if not fb_id:
      continue

    activities_by_fb_id[fb_id] = activity
    for x in activity, obj:
      parsed = util.parse_tag_uri(x.get('id', ''))
      if parsed:
        _, orig_id = parsed
        x['id'] = self.gr_source.tag_uri(fb_id)
        x['url'] = x.get('url', '').replace(orig_id, fb_id)

  # merge comments and likes from existing photo objects, and add new ones.
  for photo in photos:
    photo_activity = self.gr_source.post_to_activity(photo)
    existing = activities_by_fb_id.get(photo.get('id'))
    if existing:
      existing['object'].setdefault('replies', {}).setdefault('items', []).extend(
        photo_activity['object'].get('replies', {}).get('items', []))
      existing['object'].setdefault('tags', []).extend(
        [t for t in photo_activity['object'].get('tags', [])
         if t.get('verb') == 'like'])
    else:
      activities.append(photo_activity)

  # add events
  activities += [self.gr_source.event_to_activity(e, rsvps=r)
                 for e, r in events_and_rsvps]

  # TODO: remove once we're confident in our id parsing. (i'm going to canary
  # with just a few users before i do it for everyone.)
  #
  # discard objects with ids with colons in them. Background:
  # https://github.com/snarfed/bridgy/issues/305
  def remove_bad_ids(objs, label):
    ret = []
    for o in objs:
      id = util.parse_tag_uri(o.get('id') or o.get('object', {}).get('id') or '')
      if id and ':' in id[1]:
        logging.warning('Cowardly ignoring %s with bad id: %s', label, id[1])
      else:
        ret.append(o)
    return ret

  resp['items'] = remove_bad_ids(activities, 'activity')
  for activity in resp['items']:
    obj = activity.get('object', {})
    obj['tags'] = remove_bad_ids(obj.setdefault('tags', []), 'tag/like')
    replies = obj.get('replies', {})
    items = replies.get('items')
    if items:
      replies['items'] = remove_bad_ids(items, 'comment')
      replies['totalItems'] = len(replies['items'])

  return util.trim_nulls(resp)
class Poll(webapp2.RequestHandler):
  """Task handler that fetches and processes new responses from a single source.

  Request parameters:
    source_key: string key of source entity
    last_polled: timestamp, YYYY-MM-DD-HH-MM-SS

  Inserts a propagate task for each response that hasn't been seen before.
  """

  def post(self, *path_args):
    logging.debug('Params: %s', self.request.params)

    key = self.request.params['source_key']
    source = ndb.Key(urlsafe=key).get()
    if not source or source.status == 'disabled' or 'listen' not in source.features:
      logging.error('Source not found or disabled. Dropping task.')
      return
    logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
                 source.bridgy_url(self))

    last_polled = self.request.params['last_polled']
    if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT):
      logging.warning('duplicate poll task! deferring to the other task.')
      return

    logging.info('Last poll: %s/log?start_time=%s&key=%s',
                 self.request.host_url,
                 calendar.timegm(source.last_poll_attempt.utctimetuple()),
                 source.key.urlsafe())

    # mark this source as polling
    source.updates = {
      'poll_status': 'polling',
      'last_poll_attempt': util.now_fn(),
    }
    source = models.Source.put_updates(source)

    source.updates = {}
    try:
      self.poll(source)
    except models.DisableSource:
      # the user deauthorized the bridgy app, so disable this source.
      # let the task complete successfully so that it's not retried.
      source.updates['status'] = 'disabled'
      logging.warning('Disabling source!')
    except:
      source.updates['poll_status'] = 'error'
      raise
    finally:
      source = models.Source.put_updates(source)

    # add new poll task. randomize task ETA to within +/- 20% to try to spread
    # out tasks and prevent thundering herds.
    task_countdown = source.poll_period().total_seconds() * random.uniform(.8, 1.2)
    util.add_poll_task(source, countdown=task_countdown)

    # feeble attempt to avoid hitting the instance memory limit
    source = None
    gc.collect()

  def poll(self, source):
    """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
    if source.last_activities_etag or source.last_activity_id:
      logging.debug('Using ETag %s, last activity id %s',
                    source.last_activities_etag, source.last_activity_id)

    #
    # Step 1: fetch activities:
    # * posts by the user
    # * search all posts for the user's domain URLs to find links
    #
    cache = util.CacheDict()
    if source.last_activities_cache_json:
      cache.update(json.loads(source.last_activities_cache_json))

    try:
      # search for links first so that the user's activities and responses
      # override them if they overlap
      links = source.search_for_links()

      # this user's own activities (and user mentions)
      resp = source.get_activities_response(
        fetch_replies=True, fetch_likes=True, fetch_shares=True,
        fetch_mentions=True, count=50, etag=source.last_activities_etag,
        min_id=source.last_activity_id, cache=cache)
      etag = resp.get('etag')  # used later
      user_activities = resp.get('items', [])

      # these map ids to AS objects
      responses = {a['id']: a for a in links}
      activities = {a['id']: a for a in links + user_activities}

    except Exception, e:
      code, body = util.interpret_http_exception(e)
      if code == '401':
        msg = 'Unauthorized error: %s' % e
        logging.warning(msg, exc_info=True)
        source.updates['poll_status'] = 'ok'
        raise models.DisableSource(msg)
      elif code in util.HTTP_RATE_LIMIT_CODES:
        logging.warning('Rate limited. Marking as error and finishing. %s', e)
        source.updates.update({'poll_status': 'error', 'rate_limited': True})
        return
      elif (code and int(code) / 100 == 5) or util.is_connection_failure(e):
        logging.error('API call failed. Marking as error and finishing. %s: %s\n%s',
                      code, body, e)
        self.abort(ERROR_HTTP_RETURN_CODE)
      else:
        raise

    # extract silo activity ids, update last_activity_id
    silo_activity_ids = set()
    last_activity_id = source.last_activity_id
    for id, activity in activities.items():
      # maybe replace stored last activity id
      parsed = util.parse_tag_uri(id)
      if parsed:
        id = parsed[1]
      silo_activity_ids.add(id)
      try:
        # try numeric comparison first
        greater = int(id) > int(last_activity_id)
      except (TypeError, ValueError):
        greater = id > last_activity_id
      if greater:
        last_activity_id = id

    if last_activity_id and last_activity_id != source.last_activity_id:
      source.updates['last_activity_id'] = last_activity_id

    # trim cache to just the returned activity ids, so that it doesn't grow
    # without bound. (WARNING: depends on get_activities_response()'s cache key
    # format, e.g. 'PREFIX ACTIVITY_ID'!)
    source.updates['last_activities_cache_json'] = json.dumps(
      {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids})

    # Cache to make sure we only fetch the author's h-feed(s) the
    # first time we see it
    fetched_hfeeds = set()

    # narrow down to just public activities
    public = {}
    private = {}
    for id, activity in activities.items():
      (public if source.is_activity_public(activity) else private)[id] = activity
    logging.info('Found %d public activities: %s', len(public), public.keys())
    logging.info('Found %d private activities: %s', len(private), private.keys())

    last_public_post = (source.last_public_post or util.EPOCH).isoformat()
    public_published = util.trim_nulls([a.get('published') for a in public.values()])
    if public_published:
      max_published = max(public_published)
      if max_published > last_public_post:
        last_public_post = max_published
        source.updates['last_public_post'] = \
          util.as_utc(util.parse_iso8601(max_published))

    source.updates['recent_private_posts'] = \
      len([a for a in private.values()
           if a.get('published', util.EPOCH_ISO) > last_public_post])

    #
    # Step 2: extract responses, store their activities in response['activities']
    #
    # WARNING: this creates circular references in link posts found by search
    # queries in step 1, since they are their own activity. We use
    # prune_activity() and prune_response() in step 4 to remove these before
    # serializing to JSON.
    #
    for id, activity in public.items():
      obj = activity.get('object') or activity

      # handle user mentions
      user_id = source.user_tag_id()
      if obj.get('author', {}).get('id') != user_id:
        for tag in obj.get('tags', []):
          urls = tag.get('urls')
          if tag.get('objectType') == 'person' and tag.get('id') == user_id and urls:
            activity['originals'], activity['mentions'] = \
              original_post_discovery.discover(
                source, activity, fetch_hfeed=True,
                include_redirect_sources=False,
                already_fetched_hfeeds=fetched_hfeeds)
            activity['mentions'].update(u.get('value') for u in urls)
            responses[id] = activity
            break

      # handle quote mentions
      for att in obj.get('attachments', []):
        if (att.get('objectType') in ('note', 'article') and
            att.get('author', {}).get('id') == source.user_tag_id()):
          # now that we've confirmed that one exists, OPD will dig
          # into the actual attachments
          if 'originals' not in activity or 'mentions' not in activity:
            activity['originals'], activity['mentions'] = \
              original_post_discovery.discover(
                source, activity, fetch_hfeed=True,
                include_redirect_sources=False,
                already_fetched_hfeeds=fetched_hfeeds)
          responses[id] = activity
          break

      # extract replies, likes, reactions, reposts, and rsvps
      replies = obj.get('replies', {}).get('items', [])
      tags = obj.get('tags', [])
      likes = [t for t in tags if Response.get_type(t) == 'like']
      reactions = [t for t in tags if Response.get_type(t) == 'react']
      reposts = [t for t in tags if Response.get_type(t) == 'repost']
      rsvps = Source.get_rsvps_from_event(obj)

      # coalesce responses. drop any without ids
      for resp in replies + likes + reactions + reposts + rsvps:
        id = resp.get('id')
        if not id:
          logging.error('Skipping response without id: %s',
                        json.dumps(resp, indent=2))
          continue

        resp.setdefault('activities', []).append(activity)

        # when we find two responses with the same id, the earlier one may have
        # come from a link post or user mention, and this one is probably better
        # since it probably came from the user's activity, so prefer this one.
        # background: https://github.com/snarfed/bridgy/issues/533
        existing = responses.get(id)
        if existing:
          if source.gr_source.activity_changed(resp, existing, log=True):
            logging.warning('Got two different versions of same response!\n%s\n%s',
                            existing, resp)
          resp['activities'].extend(existing.get('activities', []))

        responses[id] = resp

    #
    # Step 3: filter out responses we've already seen
    #
    # seen responses (JSON objects) for each source are stored in its entity.
    unchanged_responses = []
    if source.seen_responses_cache_json:
      for seen in json.loads(source.seen_responses_cache_json):
        id = seen['id']
        resp = responses.get(id)
        if resp and not source.gr_source.activity_changed(seen, resp, log=True):
          unchanged_responses.append(seen)
          del responses[id]

    #
    # Step 4: store new responses and enqueue propagate tasks
    #
    pruned_responses = []
    for id, resp in responses.items():
      resp_type = Response.get_type(resp)
      activities = resp.pop('activities', [])
      if not activities and resp_type == 'post':
        activities = [resp]
      too_long = set()
      urls_to_activity = {}
      for i, activity in enumerate(activities):
        # we'll usually have multiple responses for the same activity, and the
        # objects in resp['activities'] are shared, so cache each activity's
        # discovered webmention targets inside its object.
        if 'originals' not in activity or 'mentions' not in activity:
          activity['originals'], activity['mentions'] = \
            original_post_discovery.discover(
              source, activity, fetch_hfeed=True,
              include_redirect_sources=False,
              already_fetched_hfeeds=fetched_hfeeds)

        targets = original_post_discovery.targets_for_response(
          resp, originals=activity['originals'], mentions=activity['mentions'])
        if targets:
          logging.info('%s has %d webmention target(s): %s', activity.get('url'),
                       len(targets), ' '.join(targets))
        for t in targets:
          if len(t) <= _MAX_STRING_LENGTH:
            urls_to_activity[t] = i
          else:
            logging.warning('Giving up on target URL over %s chars! %s',
                            _MAX_STRING_LENGTH, t)
            too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

      # store/update response entity. the prune_*() calls are important to
      # remove circular references in link responses, which are their own
      # activities. details in the step 2 comment above.
      pruned_response = util.prune_response(resp)
      pruned_responses.append(pruned_response)
      resp_entity = Response(
        id=id,
        source=source.key,
        activities_json=[json.dumps(util.prune_activity(a, source))
                         for a in activities],
        response_json=json.dumps(pruned_response),
        type=resp_type,
        unsent=list(urls_to_activity.keys()),
        failed=list(too_long),
        original_posts=resp.get('originals', []))
      if urls_to_activity and len(activities) > 1:
        resp_entity.urls_to_activity = json.dumps(urls_to_activity)
      resp_entity.get_or_save(source)

    # update cache
    if pruned_responses:
      source.updates['seen_responses_cache_json'] = json.dumps(
        pruned_responses + unchanged_responses)

    source.updates.update({'last_polled': source.last_poll_attempt,
                           'poll_status': 'ok'})
    if etag and etag != source.last_activities_etag:
      source.updates['last_activities_etag'] = etag

    #
    # Step 5. possibly refetch updated syndication urls
    #
    # if the author has added syndication urls since the first time
    # original_post_discovery ran, we'll miss them. this cleanup task will
    # periodically check for updated urls. only kicks in if the author has
    # *ever* published a rel=syndication url
    if source.should_refetch():
      logging.info('refetching h-feed for source %s', source.label())
      relationships = original_post_discovery.refetch(source)

      now = util.now_fn()
      source.updates['last_hfeed_refetch'] = now

      if relationships:
        logging.info('refetch h-feed found new rel=syndication relationships: %s',
                     relationships)
        try:
          self.repropagate_old_responses(source, relationships)
        except BaseException, e:
          if (isinstance(e, (datastore_errors.BadRequestError,
                             datastore_errors.Timeout)) or
              util.is_connection_failure(e)):
            logging.info('Timeout while repropagating responses.', exc_info=True)
          else:
            raise
def backfeed(self, source, responses=None, activities=None): """Processes responses and activities and generates propagate tasks. Stores property names and values to update in source.updates. Args: source: Source responses: dict mapping AS response id to AS object activities: dict mapping AS activity id to AS object """ if responses is None: responses = {} if activities is None: activities = {} # Cache to make sure we only fetch the author's h-feed(s) the # first time we see it fetched_hfeeds = set() # narrow down to just public activities public = {} private = {} for id, activity in activities.items(): (public if source.is_activity_public(activity) else private)[id] = activity logging.info('Found %d public activities: %s', len(public), public.keys()) logging.info('Found %d private activities: %s', len(private), private.keys()) last_public_post = (source.last_public_post or util.EPOCH).isoformat() public_published = util.trim_nulls( [a.get('published') for a in public.values()]) if public_published: max_published = max(public_published) if max_published > last_public_post: last_public_post = max_published source.updates['last_public_post'] = \ util.as_utc(util.parse_iso8601(max_published)) source.updates['recent_private_posts'] = \ len([a for a in private.values() if a.get('published', util.EPOCH_ISO) > last_public_post]) # # Step 2: extract responses, store their activities in response['activities'] # # WARNING: this creates circular references in link posts found by search # queries in step 1, since they are their own activity. We use # prune_activity() and prune_response() in step 4 to remove these before # serializing to JSON. # for id, activity in public.items(): obj = activity.get('object') or activity # handle user mentions user_id = source.user_tag_id() if obj.get( 'author', {}).get('id') != user_id and activity.get('verb') != 'share': for tag in obj.get('tags', []): urls = tag.get('urls') if tag.get('objectType') == 'person' and tag.get( 'id') == user_id and urls: activity['originals'], activity['mentions'] = \ original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds) activity['mentions'].update( u.get('value') for u in urls) responses[id] = activity break # handle quote mentions for att in obj.get('attachments', []): if (att.get('objectType') in ('note', 'article') and att.get( 'author', {}).get('id') == source.user_tag_id()): # now that we've confirmed that one exists, OPD will dig # into the actual attachments if 'originals' not in activity or 'mentions' not in activity: activity['originals'], activity['mentions'] = \ original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds) responses[id] = activity break # extract replies, likes, reactions, reposts, and rsvps replies = obj.get('replies', {}).get('items', []) tags = obj.get('tags', []) likes = [t for t in tags if Response.get_type(t) == 'like'] reactions = [t for t in tags if Response.get_type(t) == 'react'] reposts = [t for t in tags if Response.get_type(t) == 'repost'] rsvps = Source.get_rsvps_from_event(obj) # coalesce responses. 
drop any without ids for resp in replies + likes + reactions + reposts + rsvps: id = resp.get('id') if not id: logging.error('Skipping response without id: %s', json_dumps(resp, indent=2)) continue if source.is_blocked(resp): logging.info( 'Skipping response by blocked user: %s', json_dumps(resp.get('author') or resp.get('actor'), indent=2)) continue resp.setdefault('activities', []).append(activity) # when we find two responses with the same id, the earlier one may have # come from a link post or user mention, and this one is probably better # since it probably came from the user's activity, so prefer this one. # background: https://github.com/snarfed/bridgy/issues/533 existing = responses.get(id) if existing: if source.gr_source.activity_changed(resp, existing, log=True): logging.warning( 'Got two different versions of same response!\n%s\n%s', existing, resp) resp['activities'].extend(existing.get('activities', [])) responses[id] = resp # # Step 3: filter out responses we've already seen # # seen responses (JSON objects) for each source are stored in its entity. unchanged_responses = [] if source.seen_responses_cache_json: for seen in json_loads(source.seen_responses_cache_json): id = seen['id'] resp = responses.get(id) if resp and not source.gr_source.activity_changed( seen, resp, log=True): unchanged_responses.append(seen) del responses[id] # # Step 4: store new responses and enqueue propagate tasks # pruned_responses = [] source.blocked_ids = None for id, resp in responses.items(): resp_type = Response.get_type(resp) activities = resp.pop('activities', []) if not activities and resp_type == 'post': activities = [resp] too_long = set() urls_to_activity = {} for i, activity in enumerate(activities): # we'll usually have multiple responses for the same activity, and the # objects in resp['activities'] are shared, so cache each activity's # discovered webmention targets inside its object. if 'originals' not in activity or 'mentions' not in activity: activity['originals'], activity['mentions'] = \ original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds) targets = original_post_discovery.targets_for_response( resp, originals=activity['originals'], mentions=activity['mentions']) if targets: logging.info('%s has %d webmention target(s): %s', activity.get('url'), len(targets), ' '.join(targets)) # new response to propagate! load block list if we haven't already if source.blocked_ids is None: source.load_blocklist() for t in targets: if len(t) <= _MAX_STRING_LENGTH: urls_to_activity[t] = i else: logging.info( 'Giving up on target URL over %s chars! %s', _MAX_STRING_LENGTH, t) too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...') # store/update response entity. the prune_*() calls are important to # remove circular references in link responses, which are their own # activities. details in the step 2 comment above. 
pruned_response = util.prune_response(resp) pruned_responses.append(pruned_response) resp_entity = Response(id=id, source=source.key, activities_json=[ json_dumps( util.prune_activity(a, source)) for a in activities ], response_json=json_dumps(pruned_response), type=resp_type, unsent=list(urls_to_activity.keys()), failed=list(too_long), original_posts=resp.get('originals', [])) if urls_to_activity and len(activities) > 1: resp_entity.urls_to_activity = json_dumps(urls_to_activity) resp_entity.get_or_save(source, restart=self.RESTART_EXISTING_TASKS) # update cache if pruned_responses: source.updates['seen_responses_cache_json'] = json_dumps( pruned_responses + unchanged_responses)
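# Minimal standalone sketch of the coalescing step in backfeed() above: dedupe
# responses by id, prefer the later copy (usually the one found via the user's
# own activity), and merge their 'activities' lists so no parent activity is
# lost. coalesce_responses() is a hypothetical helper written for illustration,
# not part of Bridgy; the blocked-user and activity_changed() checks are
# omitted for brevity.
def coalesce_responses(candidates):
    """candidates: iterable of (AS response dict, parent AS activity dict) pairs."""
    responses = {}
    for resp, activity in candidates:
        id = resp.get('id')
        if not id:
            continue  # backfeed() logs and skips responses without ids
        resp.setdefault('activities', []).append(activity)
        existing = responses.get(id)
        if existing:
            # keep the newer copy, but carry over the activities seen earlier
            resp['activities'].extend(existing.get('activities', []))
        responses[id] = resp
    return responses

# the same like discovered under a link post and under the user's own post
# collapses to one response with both parent activities attached
like = {'id': 'tag:fa.ke,2013:like_1', 'objectType': 'activity', 'verb': 'like'}
merged = coalesce_responses([
    (dict(like), {'id': 'tag:fa.ke,2013:link_post'}),
    (dict(like), {'id': 'tag:fa.ke,2013:own_post'}),
])
assert list(merged.keys()) == ['tag:fa.ke,2013:like_1']
assert len(merged['tag:fa.ke,2013:like_1']['activities']) == 2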
def test_none(self): self.assertEqual(None, util.trim_nulls(None))
def poll(self, source): """Actually runs the poll. Stores property names and values to update in source.updates. """ if source.last_activities_etag or source.last_activity_id: logging.debug("Using ETag %s, last activity id %s", source.last_activities_etag, source.last_activity_id) # # Step 1: fetch activities: # * posts by the user # * search all posts for the user's domain URLs to find links # cache = util.CacheDict() if source.last_activities_cache_json: cache.update(json.loads(source.last_activities_cache_json)) # search for links first so that the user's activities and responses # override them if they overlap links = source.search_for_links() # this user's own activities (and user mentions) resp = source.get_activities_response( fetch_replies=True, fetch_likes=True, fetch_shares=True, fetch_mentions=True, count=50, etag=source.last_activities_etag, min_id=source.last_activity_id, cache=cache, ) etag = resp.get("etag") # used later user_activities = resp.get("items", []) # these map ids to AS objects responses = {a["id"]: a for a in links} activities = {a["id"]: a for a in links + user_activities} # extract silo activity ids, update last_activity_id silo_activity_ids = set() last_activity_id = source.last_activity_id for id, activity in activities.items(): # maybe replace stored last activity id parsed = util.parse_tag_uri(id) if parsed: id = parsed[1] silo_activity_ids.add(id) try: # try numeric comparison first greater = int(id) > int(last_activity_id) except (TypeError, ValueError): greater = id > last_activity_id if greater: last_activity_id = id if last_activity_id and last_activity_id != source.last_activity_id: source.updates["last_activity_id"] = last_activity_id # trim cache to just the returned activity ids, so that it doesn't grow # without bound. (WARNING: depends on get_activities_response()'s cache key # format, e.g. 'PREFIX ACTIVITY_ID'!) source.updates["last_activities_cache_json"] = json.dumps( {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids} ) # Cache to make sure we only fetch the author's h-feed(s) the # first time we see it fetched_hfeeds = set() # narrow down to just public activities public = {} private = {} for id, activity in activities.items(): (public if source.is_activity_public(activity) else private)[id] = activity logging.info("Found %d public activities: %s", len(public), public.keys()) logging.info("Found %d private activities: %s", len(private), private.keys()) last_public_post = (source.last_public_post or util.EPOCH).isoformat() public_published = util.trim_nulls([a.get("published") for a in public.values()]) if public_published: max_published = max(public_published) if max_published > last_public_post: last_public_post = max_published source.updates["last_public_post"] = util.as_utc(util.parse_iso8601(max_published)) source.updates["recent_private_posts"] = len( [a for a in private.values() if a.get("published", util.EPOCH_ISO) > last_public_post] ) # # Step 2: extract responses, store their activities in response['activities'] # # WARNING: this creates circular references in link posts found by search # queries in step 1, since they are their own activity. We use # prune_activity() and prune_response() in step 4 to remove these before # serializing to JSON. 
# for id, activity in public.items(): obj = activity.get("object") or activity # handle user mentions user_id = source.user_tag_id() if obj.get("author", {}).get("id") != user_id: for tag in obj.get("tags", []): urls = tag.get("urls") if tag.get("objectType") == "person" and tag.get("id") == user_id and urls: activity["originals"], activity["mentions"] = original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds, ) activity["mentions"].update(u.get("value") for u in urls) responses[id] = activity break # handle quote mentions for att in obj.get("attachments", []): if ( att.get("objectType") in ("note", "article") and att.get("author", {}).get("id") == source.user_tag_id() ): # now that we've confirmed that one exists, OPD will dig # into the actual attachments if "originals" not in activity or "mentions" not in activity: activity["originals"], activity["mentions"] = original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds, ) responses[id] = activity break # extract replies, likes, reactions, reposts, and rsvps replies = obj.get("replies", {}).get("items", []) tags = obj.get("tags", []) likes = [t for t in tags if Response.get_type(t) == "like"] reactions = [t for t in tags if Response.get_type(t) == "react"] reposts = [t for t in tags if Response.get_type(t) == "repost"] rsvps = Source.get_rsvps_from_event(obj) # coalesce responses. drop any without ids for resp in replies + likes + reactions + reposts + rsvps: id = resp.get("id") if not id: logging.error("Skipping response without id: %s", json.dumps(resp, indent=2)) continue resp.setdefault("activities", []).append(activity) # when we find two responses with the same id, the earlier one may have # come from a link post or user mention, and this one is probably better # since it probably came from the user's activity, so prefer this one. # background: https://github.com/snarfed/bridgy/issues/533 existing = responses.get(id) if existing: if source.gr_source.activity_changed(resp, existing, log=True): logging.warning("Got two different versions of same response!\n%s\n%s", existing, resp) resp["activities"].extend(existing.get("activities", [])) responses[id] = resp # # Step 3: filter out responses we've already seen # # seen responses (JSON objects) for each source are stored in its entity. unchanged_responses = [] if source.seen_responses_cache_json: for seen in json.loads(source.seen_responses_cache_json): id = seen["id"] resp = responses.get(id) if resp and not source.gr_source.activity_changed(seen, resp, log=True): unchanged_responses.append(seen) del responses[id] # # Step 4: store new responses and enqueue propagate tasks # pruned_responses = [] for id, resp in responses.items(): resp_type = Response.get_type(resp) activities = resp.pop("activities", []) if not activities and resp_type == "post": activities = [resp] too_long = set() urls_to_activity = {} for i, activity in enumerate(activities): # we'll usually have multiple responses for the same activity, and the # objects in resp['activities'] are shared, so cache each activity's # discovered webmention targets inside its object. 
if "originals" not in activity or "mentions" not in activity: activity["originals"], activity["mentions"] = original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds, ) targets = original_post_discovery.targets_for_response( resp, originals=activity["originals"], mentions=activity["mentions"] ) if targets: logging.info( "%s has %d webmention target(s): %s", activity.get("url"), len(targets), " ".join(targets) ) for t in targets: if len(t) <= _MAX_STRING_LENGTH: urls_to_activity[t] = i else: logging.warning("Giving up on target URL over %s chars! %s", _MAX_STRING_LENGTH, t) too_long.add(t[: _MAX_STRING_LENGTH - 4] + "...") # store/update response entity. the prune_*() calls are important to # remove circular references in link responses, which are their own # activities. details in the step 2 comment above. pruned_response = util.prune_response(resp) pruned_responses.append(pruned_response) resp_entity = Response( id=id, source=source.key, activities_json=[json.dumps(util.prune_activity(a, source)) for a in activities], response_json=json.dumps(pruned_response), type=resp_type, unsent=list(urls_to_activity.keys()), failed=list(too_long), original_posts=resp.get("originals", []), ) if urls_to_activity and len(activities) > 1: resp_entity.urls_to_activity = json.dumps(urls_to_activity) resp_entity.get_or_save(source) # update cache if pruned_responses: source.updates["seen_responses_cache_json"] = json.dumps(pruned_responses + unchanged_responses) source.updates.update({"last_polled": source.last_poll_attempt, "poll_status": "ok"}) if etag and etag != source.last_activities_etag: source.updates["last_activities_etag"] = etag # # Step 5. possibly refetch updated syndication urls # # if the author has added syndication urls since the first time # original_post_discovery ran, we'll miss them. this cleanup task will # periodically check for updated urls. only kicks in if the author has # *ever* published a rel=syndication url if source.should_refetch(): logging.info("refetching h-feed for source %s", source.label()) relationships = original_post_discovery.refetch(source) now = util.now_fn() source.updates["last_hfeed_refetch"] = now if relationships: logging.info("refetch h-feed found new rel=syndication relationships: %s", relationships) try: self.repropagate_old_responses(source, relationships) except BaseException, e: if isinstance( e, (datastore_errors.BadRequestError, datastore_errors.Timeout) ) or util.is_connection_failure(e): logging.info("Timeout while repropagating responses.", exc_info=True) else: raise
def test_nested_dict_with_nones(self): self.assertEqual({1: {3: 4}}, util.trim_nulls({1: {2: [], 3: 4}, 5: {6: None}}))
def test_string(self): self.assertEqual('foo', util.trim_nulls('foo'))
def test_simple_dict_with_nones(self): self.assertEqual({3: 4, 2: 9}, util.trim_nulls({1: None, 3: 4, 5: [], 2: 9}))
def test_simple_dict(self): self.assertEqual({1: 2, 3: 4}, util.trim_nulls({1: 2, 3: 4}))
def test_simple_dict_with_nulls(self): self.assertEqual({}, util.trim_nulls({1: None, 2: [], 3: {}}))
def test_empty_dict(self): self.assertEqual({}, util.trim_nulls({}))
def test_empty_list(self): self.assertEqual([], util.trim_nulls([]))
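# Minimal re-implementation of the behavior these trim_nulls tests pin down,
# and that poll()/backfeed() above rely on when they drop None entries from
# lists of URLs and 'published' timestamps. This is a sketch of the observed
# behavior only, not the real util.trim_nulls; how it treats values like empty
# strings is a guess.
def trim_nulls(value):
    NULLS = (None, {}, [], ())
    if isinstance(value, dict):
        trimmed = {k: trim_nulls(v) for k, v in value.items()}
        return {k: v for k, v in trimmed.items() if v not in NULLS}
    elif isinstance(value, (list, tuple)):
        trimmed = [trim_nulls(v) for v in value]
        return type(value)(v for v in trimmed if v not in NULLS)
    return value

assert trim_nulls({1: {2: [], 3: 4}, 5: {6: None}}) == {1: {3: 4}}
assert trim_nulls(['http://example.com/', None]) == ['http://example.com/']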