def source_url(self, target_url):
  # parse the response id. (we know Response key ids are always tag URIs)
  _, response_id = util.parse_tag_uri(self.entity.key.string_id())
  if self.entity.type in ('like', 'repost', 'rsvp'):
    response_id = response_id.split('_')[-1]

  # determine which activity to use
  activity = self.activities[0]
  if self.entity.urls_to_activity:
    urls_to_activity = json.loads(self.entity.urls_to_activity)
    if urls_to_activity:
      activity = self.activities[urls_to_activity[target_url]]

  # generate source URL
  id = activity['id']
  parsed = util.parse_tag_uri(id)
  post_id = parsed[1] if parsed else id
  # prefer brid-gy.appspot.com to brid.gy because non-browsers (ie OpenSSL)
  # currently have problems with brid.gy's SSL cert. details:
  # https://github.com/snarfed/bridgy/issues/20
  if (self.request.host_url.endswith('brid.gy') or
      self.request.host_url.endswith('brid-gy.appspot.com')):
    host_url = 'https://brid-gy.appspot.com'
  else:
    host_url = self.request.host_url

  return '%s/%s/%s/%s/%s/%s' % (
    host_url, self.entity.type, self.entity.source.get().SHORT_NAME,
    self.entity.source.string_id(), post_id, response_id)
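For illustration only: the split('_')[-1] step above assumes that like/repost/rsvp Response key ids embed the responder's user id as the last underscore-separated piece of the tag URI's name. The exact id format varies by silo and is not shown in this snippet; the values below are hypothetical.

# hypothetical Response key id for a like
_, response_id = util.parse_tag_uri('tag:twitter.com,2013:12345_favorited_by_6789')
assert response_id == '12345_favorited_by_6789'
assert response_id.split('_')[-1] == '6789'  # the responding user's id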
def source_url(self, target_url):
    # determine which activity to use
    try:
        activity = self.activities[0]
        if self.entity.urls_to_activity:
            urls_to_activity = json.loads(self.entity.urls_to_activity)
            if urls_to_activity:
                activity = self.activities[urls_to_activity[target_url]]
    except (KeyError, IndexError):
        logging.warning(
            """\
Hit https://github.com/snarfed/bridgy/issues/237 KeyError!
target url %s not in urls_to_activity: %s
activities: %s""",
            target_url,
            self.entity.urls_to_activity,
            self.activities,
        )
        self.abort(ERROR_HTTP_RETURN_CODE)

    # generate source URL
    id = activity["id"]
    parsed = util.parse_tag_uri(id)
    post_id = parsed[1] if parsed else id
    # prefer brid-gy.appspot.com to brid.gy because non-browsers (ie OpenSSL)
    # currently have problems with brid.gy's SSL cert. details:
    # https://github.com/snarfed/bridgy/issues/20
    if self.request.host_url.endswith("brid.gy") or self.request.host_url.endswith("brid-gy.appspot.com"):
        host_url = "https://brid-gy.appspot.com"
    else:
        host_url = self.request.host_url

    path = [
        host_url,
        self.entity.type,
        self.entity.source.get().SHORT_NAME,
        self.entity.source.string_id(),
        post_id,
    ]

    if self.entity.type != "post":
        # parse and add response id. (we know Response key ids are always tag URIs)
        _, response_id = util.parse_tag_uri(self.entity.key.string_id())
        reaction_id = response_id
        if self.entity.type in ("like", "react", "repost", "rsvp"):
            response_id = response_id.split("_")[-1]  # extract responder user id
        path.append(response_id)
        if self.entity.type == "react":
            path.append(reaction_id)

    return "/".join(path)
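For reference, a sketch of the URL shape this newer variant builds: host, entity type, source SHORT_NAME, source key id, post id, then (for non-posts) the response id, and for reactions the full reaction id as well. The concrete values below are purely illustrative, not taken from the source.

# hypothetical values
path = ['https://brid-gy.appspot.com', 'comment', 'twitter', 'example_user',
        '12345', '67890']
assert '/'.join(path) == \
    'https://brid-gy.appspot.com/comment/twitter/example_user/12345/67890'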
def dispatch_request(self, *args):
  source = self.auth()
  gr_src = self.gr_source()

  id = request.values['id']

  # validate request
  parsed_id = util.parse_tag_uri(id)
  if not parsed_id:
    self.error(f'Scrape error: expected id to be tag URI; got {id}')

  activity = Activity.get_by_id(id)
  if not activity:
    self.error(f'No {gr_src.NAME} post found for id {id}', 404)
  elif activity.source != source.key:
    self.error(
      f'Activity {id} is owned by {activity.source}, not {source.key}', 403)

  activity_data = json_loads(activity.activity_json)

  # convert new extras to AS, merge into existing activity
  try:
    new_extras = getattr(gr_src, self.MERGE_METHOD)(
      request.get_data(as_text=True), activity_data)
  except ValueError as e:
    self.error(f"Scrape error: couldn't parse extras: {e}")

  activity.activity_json = json_dumps(activity_data)
  activity.put()

  extra_ids = ' '.join(c['id'] for c in new_extras)
  logger.info(f"Stored extras for activity {id}: {extra_ids}")
  return jsonify(new_extras)
def source_url(self, target_url):
    # determine which activity to use
    activity = self.activities[0]
    if self.entity.urls_to_activity:
        urls_to_activity = json.loads(self.entity.urls_to_activity)
        if urls_to_activity:
            try:
                activity = self.activities[urls_to_activity[target_url]]
            except KeyError:
                logging.warning("""\
Hit https://github.com/snarfed/bridgy/issues/237 KeyError!
target url %s not in urls_to_activity: %s
activities: %s""", target_url, urls_to_activity, self.activities)
                self.abort(ERROR_HTTP_RETURN_CODE)

    # generate source URL
    id = activity['id']
    parsed = util.parse_tag_uri(id)
    post_id = parsed[1] if parsed else id
    # prefer brid-gy.appspot.com to brid.gy because non-browsers (ie OpenSSL)
    # currently have problems with brid.gy's SSL cert. details:
    # https://github.com/snarfed/bridgy/issues/20
    if (self.request.host_url.endswith('brid.gy') or
            self.request.host_url.endswith('brid-gy.appspot.com')):
        host_url = 'https://brid-gy.appspot.com'
    else:
        host_url = self.request.host_url

    path = [
        host_url,
        self.entity.type,
        self.entity.source.get().SHORT_NAME,
        self.entity.source.string_id(),
        post_id,
    ]

    if self.entity.type != 'post':
        # parse and add response id. (we know Response key ids are always tag URIs)
        _, response_id = util.parse_tag_uri(self.entity.key.string_id())
        reaction_id = response_id
        if self.entity.type in ('like', 'react', 'repost', 'rsvp'):
            response_id = response_id.split('_')[-1]  # extract responder user id
        path.append(response_id)
        if self.entity.type == 'react':
            path.append(reaction_id)

    return '/'.join(path)
def get_comment(self, comment_id, activity=None, **kwargs):
  """Uses the activity passed in the activity kwarg."""
  if activity:
    for reply in activity.get('object', {}).get('replies', {}).get('items', []):
      parsed = util.parse_tag_uri(reply.get('id', ''))
      if parsed and parsed[1] == comment_id:
        return reply
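A minimal, hypothetical usage sketch of the kwarg-based lookup above; the activity dict and ids are made up.

activity = {
  'object': {
    'replies': {'items': [
      {'id': 'tag:example.com,2013:111', 'content': 'first reply'},
      {'id': 'tag:example.com,2013:222', 'content': 'second reply'},
    ]},
  },
}
# With the activity passed in, no extra silo API fetch is needed:
# source.get_comment('222', activity=activity) would return the second reply.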
def remove_bad_ids(objs, label):
  ret = []
  for o in objs:
    id = util.parse_tag_uri(o.get('id') or o.get('object', {}).get('id') or '')
    if id and ':' in id[1]:
      logging.warning('Cowardly ignoring %s with bad id: %s', label, id[1])
    else:
      ret.append(o)
  return ret
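A quick illustration of what gets filtered: an object is dropped only when its id parses as a tag URI and the name part still contains a colon. The objects below are hypothetical.

objs = [
  {'id': 'tag:facebook.com,2013:123_456'},          # kept
  {'id': 'tag:facebook.com,2013:123:456'},          # dropped: ':' in parsed name
  {'object': {'id': 'tag:facebook.com,2013:789'}},  # kept; id read from the object
]
# remove_bad_ids(objs, 'activity') would return the first and third entries.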
def post(self):
  logging.debug('Params: %s', self.request.params)

  type = self.request.get('type')
  if type:
    assert type in ('event',)

  source = util.load_source(self)
  if not source or source.status == 'disabled' or 'listen' not in source.features:
    logging.error('Source not found or disabled. Dropping task.')
    return
  logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
               source.bridgy_url(self))

  post_id = util.get_required_param(self, 'post_id')
  source.updates = {}

  try:
    if type == 'event':
      activities = [source.gr_source.get_event(post_id)]
    else:
      activities = source.get_activities(fetch_replies=True, fetch_likes=True,
                                         fetch_shares=True, activity_id=post_id,
                                         user_id=source.key.id())

    if not activities or not activities[0]:
      logging.info('Post %s not found.', post_id)
      return
    assert len(activities) == 1, activities
    self.backfeed(source, activities={activities[0]['id']: activities[0]})

    obj = activities[0].get('object') or activities[0]
    in_reply_to = util.get_first(obj, 'inReplyTo')
    if in_reply_to:
      parsed = util.parse_tag_uri(in_reply_to.get('id', ''))  # TODO: fall back to url
      if parsed:
        util.add_discover_task(source, parsed[1])

  except Exception, e:
    code, body = util.interpret_http_exception(e)
    if (code and (code in source.RATE_LIMIT_HTTP_CODES or
                  code in ('400', '404') or
                  int(code) / 100 == 5)
        or util.is_connection_failure(e)):
      logging.error('API call failed; giving up. %s: %s\n%s', code, body, e)
      self.abort(util.ERROR_HTTP_RETURN_CODE)
    else:
      raise
def dispatch_request(self):
  logger.debug(f'Params: {list(request.values.items())}')
  g.TRANSIENT_ERROR_HTTP_CODES = ('400', '404')

  type = request.values.get('type')
  if type:
    assert type in ('event',)

  source = g.source = util.load_source()
  if not source or source.status == 'disabled' or 'listen' not in source.features:
    logger.error('Source not found or disabled. Dropping task.')
    return ''
  logger.info(f'Source: {source.label()} {source.key_id()}, {source.bridgy_url()}')

  post_id = request.values['post_id']
  source.updates = {}

  if type == 'event':
    activities = [source.gr_source.get_event(post_id)]
  else:
    activities = source.get_activities(fetch_replies=True, fetch_likes=True,
                                       fetch_shares=True, activity_id=post_id,
                                       user_id=source.key_id())

  if not activities or not activities[0]:
    logger.info(f'Post {post_id} not found.')
    return ''
  assert len(activities) == 1, activities
  activity = activities[0]
  activities = {activity['id']: activity}

  # Note: propagate tasks created by backfeed() here can start before their
  # Response entities are created/updated, so they fail with
  # https://github.com/snarfed/bridgy/issues/237 , but that's a red herring;
  # the real problem is that activities_json and urls_to_activity are empty.
  # Is poll transactional somehow, and this isn't? There are no more
  # transactional tasks: https://github.com/googleapis/python-tasks/issues/26
  # They're still supported in the new "bundled services" thing, but that seems
  # like a dead end.
  # https://groups.google.com/g/google-appengine/c/22BKInlWty0/m/05ObNEdsAgAJ
  self.backfeed(source, responses=activities, activities=activities)

  obj = activity.get('object') or activity
  in_reply_to = util.get_first(obj, 'inReplyTo')
  if in_reply_to:
    parsed = util.parse_tag_uri(in_reply_to.get('id', ''))  # TODO: fall back to url
    if parsed:
      util.add_discover_task(source, parsed[1])

  return 'OK'
def source_url(self, target_url):
  # parse the response id. (we know Response key ids are always tag URIs)
  _, response_id = util.parse_tag_uri(self.entity.key.string_id())
  if self.entity.type in ('like', 'repost', 'rsvp'):
    response_id = response_id.split('_')[-1]

  # determine which activity to use
  activity = self.activities[0]
  if self.entity.urls_to_activity:
    urls_to_activity = json.loads(self.entity.urls_to_activity)
    if urls_to_activity:
      try:
        activity = self.activities[urls_to_activity[target_url]]
      except KeyError:
        logging.warning("""\
Hit https://github.com/snarfed/bridgy/issues/237 KeyError!
target url %s not in urls_to_activity: %s
activities: %s""", target_url, urls_to_activity, self.activities)
        self.abort(ERROR_HTTP_RETURN_CODE)

  # generate source URL
  id = activity['id']
  parsed = util.parse_tag_uri(id)
  post_id = parsed[1] if parsed else id
  # prefer brid-gy.appspot.com to brid.gy because non-browsers (ie OpenSSL)
  # currently have problems with brid.gy's SSL cert. details:
  # https://github.com/snarfed/bridgy/issues/20
  if (self.request.host_url.endswith('brid.gy') or
      self.request.host_url.endswith('brid-gy.appspot.com')):
    host_url = 'https://brid-gy.appspot.com'
  else:
    host_url = self.request.host_url

  path = [host_url, self.entity.type, self.entity.source.get().SHORT_NAME,
          self.entity.source.string_id(), post_id]
  if self.entity.type != 'post':
    path.append(response_id)
  return '/'.join(path)
def get_like(self, activity_user_id, activity_id, like_user_id, activity=None,
             **kwargs):
  """Uses the activity passed in the activity kwarg."""
  if activity:
    for tag in activity.get('object', {}).get('tags', []):
      if tag.get('verb') == 'like':
        parsed = util.parse_tag_uri(tag.get('author', {}).get('id', ''))
        if parsed and parsed[1] == like_user_id:
          return tag
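A hypothetical usage sketch: the like tag's author id is a tag URI whose name part is matched against like_user_id, so no silo API call is needed when the activity is supplied.

activity = {
  'object': {'tags': [{
    'verb': 'like',
    'author': {'id': 'tag:example.com,2013:alice'},
  }]},
}
# source.get_like('poster', 'post123', 'alice', activity=activity)
# would return that like tag directly.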
def source_url(self, target_url):
  # determine which activity to use
  try:
    activity = self.activities[0]
    if self.entity.urls_to_activity:
      urls_to_activity = json_loads(self.entity.urls_to_activity)
      if urls_to_activity:
        activity = self.activities[urls_to_activity[target_url]]
  except (KeyError, IndexError):
    logging.warning("""\
Hit https://github.com/snarfed/bridgy/issues/237 KeyError!
target url %s not in urls_to_activity: %s
activities: %s""", target_url, self.entity.urls_to_activity, self.activities)
    self.abort(util.ERROR_HTTP_RETURN_CODE)

  # generate source URL
  id = activity['id']
  parsed = util.parse_tag_uri(id)
  post_id = parsed[1] if parsed else id

  parts = [util.host_url(self), self.entity.type, self.source.SHORT_NAME,
           self.source.key.string_id(), post_id]

  if self.entity.type != 'post':
    # parse and add response id. (we know Response key ids are always tag URIs)
    _, response_id = util.parse_tag_uri(self.entity.key.string_id())
    reaction_id = response_id
    if self.entity.type in ('like', 'react', 'repost', 'rsvp'):
      response_id = response_id.split('_')[-1]  # extract responder user id
    parts.append(response_id)
    if self.entity.type == 'react':
      parts.append(reaction_id)

  return '/'.join(parts)
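For context, a sketch of the urls_to_activity JSON that these source_url() variants read. Per the poll code later in this collection, it maps each webmention target URL to an index into the response's stored activities; the URLs and ids below are made up.

activities = [{'id': 'tag:example.com,2013:111'},
              {'id': 'tag:example.com,2013:222'}]
urls_to_activity = {'https://example.com/post-a': 0,
                    'https://example.com/post-b': 1}
activity = activities[urls_to_activity['https://example.com/post-b']]
assert activity['id'] == 'tag:example.com,2013:222'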
def post(self):
  logging.debug('Params: %s', self.request.params)

  type = self.request.get('type')
  if type:
    assert type in ('event',)

  key = util.get_required_param(self, 'source_key')
  source = ndb.Key(urlsafe=key).get()
  if not source or source.status == 'disabled' or 'listen' not in source.features:
    logging.error('Source not found or disabled. Dropping task.')
    return
  logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
               source.bridgy_url(self))

  post_id = util.get_required_param(self, 'post_id')
  source.updates = {}

  try:
    if type == 'event':
      activities = [source.gr_source.get_event(post_id)]
    else:
      activities = source.get_activities(
        fetch_replies=True, fetch_likes=True, fetch_shares=True,
        activity_id=post_id, user_id=source.key.id())

    if not activities or not activities[0]:
      logging.info('Post %s not found.', post_id)
      return
    assert len(activities) == 1, activities
    self.backfeed(source, activities={activities[0]['id']: activities[0]})

    in_reply_to = util.get_first(activities[0]['object'], 'inReplyTo')
    if in_reply_to:
      parsed = util.parse_tag_uri(in_reply_to.get('id', ''))  # TODO: fall back to url
      if parsed:
        util.add_discover_task(source, parsed[1])

  except Exception, e:
    code, body = util.interpret_http_exception(e)
    if (code and (code in util.HTTP_RATE_LIMIT_CODES or
                  code in ('400', '404') or
                  int(code) / 100 == 5)
        or util.is_connection_failure(e)):
      logging.error('API call failed; giving up. %s: %s\n%s', code, body, e)
      self.abort(util.ERROR_HTTP_RETURN_CODE)
    else:
      raise
def post(self):
  logging.debug('Params: %s', list(self.request.params.items()))

  type = self.request.get('type')
  if type:
    assert type in ('event',)

  source = self.source = util.load_source(self)
  if not source or source.status == 'disabled' or 'listen' not in source.features:
    logging.error('Source not found or disabled. Dropping task.')
    return
  logging.info('Source: %s %s, %s', source.label(), source.key_id(),
               source.bridgy_url(self))

  post_id = util.get_required_param(self, 'post_id')
  source.updates = {}

  if type == 'event':
    activities = [source.gr_source.get_event(post_id)]
  else:
    activities = source.get_activities(fetch_replies=True, fetch_likes=True,
                                       fetch_shares=True, activity_id=post_id,
                                       user_id=source.key_id())

  if not activities or not activities[0]:
    logging.info('Post %s not found.', post_id)
    return
  assert len(activities) == 1, activities
  activity = activities[0]
  activities = {activity['id']: activity}
  self.backfeed(source, responses=activities, activities=activities)

  obj = activity.get('object') or activity
  in_reply_to = util.get_first(obj, 'inReplyTo')
  if in_reply_to:
    parsed = util.parse_tag_uri(in_reply_to.get('id', ''))  # TODO: fall back to url
    if parsed:
      util.add_discover_task(source, parsed[1])
def post(self, *args):
  source = self.auth()
  gr_src = self.gr_source()

  id = util.get_required_param(self, 'id')

  # validate request
  parsed_id = util.parse_tag_uri(id)
  if not parsed_id:
    self.abort(400, f'Expected id to be tag URI; got {id}')

  activity = Activity.get_by_id(id)
  if not activity:
    self.abort(404, f'No {gr_src.NAME} post found for id {id}')
  elif activity.source != source.key:
    self.abort(403, f'Activity {id} is owned by {activity.source}, not {source.key}')

  activity_data = json_loads(activity.activity_json)

  # convert new reactions to AS, merge into existing activity
  try:
    new_reactions = gr_src.merge_scraped_reactions(self.request.text, activity_data)
  except ValueError as e:
    msg = "Couldn't parse scraped reactions: %s" % e
    logging.error(msg, stack_info=True)
    self.abort(400, msg)

  activity.activity_json = json_dumps(activity_data)
  activity.put()

  reaction_ids = ' '.join(r['id'] for r in new_reactions)
  logging.info(f"Stored reactions for activity {id}: {reaction_ids}")
  self.output(new_reactions)
def test_parse_tag_uri(self):
  self.assertEquals(('x.com', 'foo'), util.parse_tag_uri('tag:x.com,2013:foo'))
  self.assertEquals(('x.com', 'foo'), util.parse_tag_uri('tag:x.com:foo'))
  self.assertEquals(None, util.parse_tag_uri('asdf'))
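The test above pins down the expected behavior: a tag URI parses to a (domain, name) tuple, with or without the year, and anything else returns None. A minimal sketch consistent with that behavior (not granary's actual implementation) could look like:

import re

def parse_tag_uri_sketch(uri):
  """Returns (domain, name) for a tag URI, or None if it doesn't parse."""
  match = re.match(r'^tag:([^,:]+)(?:,\d+)?:(.+)$', uri)
  return match.groups() if match else None

assert parse_tag_uri_sketch('tag:x.com,2013:foo') == ('x.com', 'foo')
assert parse_tag_uri_sketch('tag:x.com:foo') == ('x.com', 'foo')
assert parse_tag_uri_sketch('asdf') is None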
def poll(self, source): """Actually runs the poll. Stores property names and values to update in source.updates. """ if source.last_activities_etag or source.last_activity_id: logging.debug('Using ETag %s, last activity id %s', source.last_activities_etag, source.last_activity_id) # # Step 1: fetch activities: # * posts by the user # * search all posts for the user's domain URLs to find links # cache = util.CacheDict() if source.last_activities_cache_json: cache.update(json_loads(source.last_activities_cache_json)) # search for links first so that the user's activities and responses # override them if they overlap links = source.search_for_links() # this user's own activities (and user mentions) resp = source.get_activities_response(fetch_replies=True, fetch_likes=True, fetch_shares=True, fetch_mentions=True, count=50, etag=source.last_activities_etag, min_id=source.last_activity_id, cache=cache) etag = resp.get('etag') # used later user_activities = resp.get('items', []) # these map ids to AS objects responses = {a['id']: a for a in links} activities = {a['id']: a for a in links + user_activities} # extract silo activity ids, update last_activity_id silo_activity_ids = set() last_activity_id = source.last_activity_id for id, activity in activities.items(): # maybe replace stored last activity id parsed = util.parse_tag_uri(id) if parsed: id = parsed[1] silo_activity_ids.add(id) try: # try numeric comparison first greater = int(id) > int(last_activity_id) except (TypeError, ValueError): greater = str(id) > str(last_activity_id) if greater: last_activity_id = id if last_activity_id and last_activity_id != source.last_activity_id: source.updates['last_activity_id'] = last_activity_id # trim cache to just the returned activity ids, so that it doesn't grow # without bound. (WARNING: depends on get_activities_response()'s cache key # format, e.g. 'PREFIX ACTIVITY_ID'!) source.updates['last_activities_cache_json'] = json_dumps({ k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids }) self.backfeed(source, responses, activities=activities) source.updates.update({ 'last_polled': source.last_poll_attempt, 'poll_status': 'ok' }) if etag and etag != source.last_activities_etag: source.updates['last_activities_etag'] = etag # # Possibly refetch updated syndication urls. # # if the author has added syndication urls since the first time # original_post_discovery ran, we'll miss them. this cleanup task will # periodically check for updated urls. only kicks in if the author has # *ever* published a rel=syndication url if source.should_refetch(): logging.info('refetching h-feed for source %s', source.label()) relationships = original_post_discovery.refetch(source) now = util.now_fn() source.updates['last_hfeed_refetch'] = now if relationships: logging.info( 'refetch h-feed found new rel=syndication relationships: %s', relationships) try: self.repropagate_old_responses(source, relationships) except BaseException as e: if ('BadRequestError' in str(e.__class__) or 'Timeout' in str(e.__class__) or util.is_connection_failure(e)): logging.info('Timeout while repropagating responses.', stack_info=True) else: raise else: logging.info( 'skipping refetch h-feed. last-syndication-url %s, last-refetch %s', source.last_syndication_url, source.last_hfeed_refetch)
def poll(self, source): """Actually runs the poll. Stores property names and values to update in source.updates. """ if source.last_activities_etag or source.last_activity_id: logging.debug("Using ETag %s, last activity id %s", source.last_activities_etag, source.last_activity_id) # # Step 1: fetch activities: # * posts by the user # * search all posts for the user's domain URLs to find links # cache = util.CacheDict() if source.last_activities_cache_json: cache.update(json.loads(source.last_activities_cache_json)) # search for links first so that the user's activities and responses # override them if they overlap links = source.search_for_links() # this user's own activities (and user mentions) resp = source.get_activities_response( fetch_replies=True, fetch_likes=True, fetch_shares=True, fetch_mentions=True, count=50, etag=source.last_activities_etag, min_id=source.last_activity_id, cache=cache, ) etag = resp.get("etag") # used later user_activities = resp.get("items", []) # these map ids to AS objects responses = {a["id"]: a for a in links} activities = {a["id"]: a for a in links + user_activities} # extract silo activity ids, update last_activity_id silo_activity_ids = set() last_activity_id = source.last_activity_id for id, activity in activities.items(): # maybe replace stored last activity id parsed = util.parse_tag_uri(id) if parsed: id = parsed[1] silo_activity_ids.add(id) try: # try numeric comparison first greater = int(id) > int(last_activity_id) except (TypeError, ValueError): greater = id > last_activity_id if greater: last_activity_id = id if last_activity_id and last_activity_id != source.last_activity_id: source.updates["last_activity_id"] = last_activity_id # trim cache to just the returned activity ids, so that it doesn't grow # without bound. (WARNING: depends on get_activities_response()'s cache key # format, e.g. 'PREFIX ACTIVITY_ID'!) source.updates["last_activities_cache_json"] = json.dumps( {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids} ) # Cache to make sure we only fetch the author's h-feed(s) the # first time we see it fetched_hfeeds = set() # narrow down to just public activities public = {} private = {} for id, activity in activities.items(): (public if source.is_activity_public(activity) else private)[id] = activity logging.info("Found %d public activities: %s", len(public), public.keys()) logging.info("Found %d private activities: %s", len(private), private.keys()) last_public_post = (source.last_public_post or util.EPOCH).isoformat() public_published = util.trim_nulls([a.get("published") for a in public.values()]) if public_published: max_published = max(public_published) if max_published > last_public_post: last_public_post = max_published source.updates["last_public_post"] = util.as_utc(util.parse_iso8601(max_published)) source.updates["recent_private_posts"] = len( [a for a in private.values() if a.get("published", util.EPOCH_ISO) > last_public_post] ) # # Step 2: extract responses, store their activities in response['activities'] # # WARNING: this creates circular references in link posts found by search # queries in step 1, since they are their own activity. We use # prune_activity() and prune_response() in step 4 to remove these before # serializing to JSON. 
# for id, activity in public.items(): obj = activity.get("object") or activity # handle user mentions user_id = source.user_tag_id() if obj.get("author", {}).get("id") != user_id: for tag in obj.get("tags", []): urls = tag.get("urls") if tag.get("objectType") == "person" and tag.get("id") == user_id and urls: activity["originals"], activity["mentions"] = original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds, ) activity["mentions"].update(u.get("value") for u in urls) responses[id] = activity break # handle quote mentions for att in obj.get("attachments", []): if ( att.get("objectType") in ("note", "article") and att.get("author", {}).get("id") == source.user_tag_id() ): # now that we've confirmed that one exists, OPD will dig # into the actual attachments if "originals" not in activity or "mentions" not in activity: activity["originals"], activity["mentions"] = original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds, ) responses[id] = activity break # extract replies, likes, reactions, reposts, and rsvps replies = obj.get("replies", {}).get("items", []) tags = obj.get("tags", []) likes = [t for t in tags if Response.get_type(t) == "like"] reactions = [t for t in tags if Response.get_type(t) == "react"] reposts = [t for t in tags if Response.get_type(t) == "repost"] rsvps = Source.get_rsvps_from_event(obj) # coalesce responses. drop any without ids for resp in replies + likes + reactions + reposts + rsvps: id = resp.get("id") if not id: logging.error("Skipping response without id: %s", json.dumps(resp, indent=2)) continue resp.setdefault("activities", []).append(activity) # when we find two responses with the same id, the earlier one may have # come from a link post or user mention, and this one is probably better # since it probably came from the user's activity, so prefer this one. # background: https://github.com/snarfed/bridgy/issues/533 existing = responses.get(id) if existing: if source.gr_source.activity_changed(resp, existing, log=True): logging.warning("Got two different versions of same response!\n%s\n%s", existing, resp) resp["activities"].extend(existing.get("activities", [])) responses[id] = resp # # Step 3: filter out responses we've already seen # # seen responses (JSON objects) for each source are stored in its entity. unchanged_responses = [] if source.seen_responses_cache_json: for seen in json.loads(source.seen_responses_cache_json): id = seen["id"] resp = responses.get(id) if resp and not source.gr_source.activity_changed(seen, resp, log=True): unchanged_responses.append(seen) del responses[id] # # Step 4: store new responses and enqueue propagate tasks # pruned_responses = [] for id, resp in responses.items(): resp_type = Response.get_type(resp) activities = resp.pop("activities", []) if not activities and resp_type == "post": activities = [resp] too_long = set() urls_to_activity = {} for i, activity in enumerate(activities): # we'll usually have multiple responses for the same activity, and the # objects in resp['activities'] are shared, so cache each activity's # discovered webmention targets inside its object. 
if "originals" not in activity or "mentions" not in activity: activity["originals"], activity["mentions"] = original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds, ) targets = original_post_discovery.targets_for_response( resp, originals=activity["originals"], mentions=activity["mentions"] ) if targets: logging.info( "%s has %d webmention target(s): %s", activity.get("url"), len(targets), " ".join(targets) ) for t in targets: if len(t) <= _MAX_STRING_LENGTH: urls_to_activity[t] = i else: logging.warning("Giving up on target URL over %s chars! %s", _MAX_STRING_LENGTH, t) too_long.add(t[: _MAX_STRING_LENGTH - 4] + "...") # store/update response entity. the prune_*() calls are important to # remove circular references in link responses, which are their own # activities. details in the step 2 comment above. pruned_response = util.prune_response(resp) pruned_responses.append(pruned_response) resp_entity = Response( id=id, source=source.key, activities_json=[json.dumps(util.prune_activity(a, source)) for a in activities], response_json=json.dumps(pruned_response), type=resp_type, unsent=list(urls_to_activity.keys()), failed=list(too_long), original_posts=resp.get("originals", []), ) if urls_to_activity and len(activities) > 1: resp_entity.urls_to_activity = json.dumps(urls_to_activity) resp_entity.get_or_save(source) # update cache if pruned_responses: source.updates["seen_responses_cache_json"] = json.dumps(pruned_responses + unchanged_responses) source.updates.update({"last_polled": source.last_poll_attempt, "poll_status": "ok"}) if etag and etag != source.last_activities_etag: source.updates["last_activities_etag"] = etag # # Step 5. possibly refetch updated syndication urls # # if the author has added syndication urls since the first time # original_post_discovery ran, we'll miss them. this cleanup task will # periodically check for updated urls. only kicks in if the author has # *ever* published a rel=syndication url if source.should_refetch(): logging.info("refetching h-feed for source %s", source.label()) relationships = original_post_discovery.refetch(source) now = util.now_fn() source.updates["last_hfeed_refetch"] = now if relationships: logging.info("refetch h-feed found new rel=syndication relationships: %s", relationships) try: self.repropagate_old_responses(source, relationships) except BaseException, e: if isinstance( e, (datastore_errors.BadRequestError, datastore_errors.Timeout) ) or util.is_connection_failure(e): logging.info("Timeout while repropagating responses.", exc_info=True) else: raise
def get(self, type, source_short_name, string_id, *ids): source_cls = models.sources.get(source_short_name) if not source_cls: self.abort( 400, "Source type '%s' not found. Known sources: %s" % (source_short_name, filter(None, models.sources.keys()))) self.source = source_cls.get_by_id(string_id) if not self.source: self.abort( 400, 'Source %s %s not found' % (source_short_name, string_id)) elif (self.source.status == 'disabled' or ('listen' not in self.source.features and 'email' not in self.source.features)): self.abort( 400, 'Source %s is disabled for backfeed' % self.source.bridgy_path()) format = self.request.get('format', 'html') if format not in ('html', 'json'): self.abort(400, 'Invalid format %s, expected html or json' % format) for id in ids: if not self.VALID_ID.match(id): self.abort(404, 'Invalid id %s' % id) label = '%s:%s %s %s' % (source_short_name, string_id, type, ids) cache_key = 'H ' + label obj = memcache.get(cache_key) if obj and not appengine_config.DEBUG: logging.info('Using cached object for %s', label) else: logging.info('Fetching %s', label) try: obj = self.get_item(*ids) except models.DisableSource as e: self.abort( 401, "Bridgy's access to your account has expired. Please visit https://brid.gy/ to refresh it!" ) except ValueError as e: self.abort(400, '%s error:\n%s' % (self.source.GR_CLASS.NAME, e)) except Exception as e: # pass through all API HTTP errors if we can identify them code, body = util.interpret_http_exception(e) # temporary, trying to debug a flaky test failure # eg https://circleci.com/gh/snarfed/bridgy/769 if code: self.response.status_int = int(code) self.response.headers['Content-Type'] = 'text/plain' self.response.write('%s error:\n%s' % (self.source.GR_CLASS.NAME, body)) return else: raise memcache.set(cache_key, obj, time=CACHE_TIME) if not obj: self.abort(404, label) if self.source.is_blocked(obj): self.abort(410, 'That user is currently blocked') # use https for profile pictures so we don't cause SSL mixed mode errors # when serving over https. author = obj.get('author', {}) image = author.get('image', {}) url = image.get('url') if url: image['url'] = util.update_scheme(url, self) mf2_json = microformats2.object_to_json(obj, synthesize_content=False) # try to include the author's silo profile url author = first_props(mf2_json.get('properties', {})).get('author', {}) author_uid = first_props(author.get('properties', {})).get('uid', '') if author_uid: parsed = util.parse_tag_uri(author_uid) if parsed: silo_url = self.source.gr_source.user_url(parsed[1]) urls = author.get('properties', {}).setdefault('url', []) if silo_url not in microformats2.get_string_urls(urls): urls.append(silo_url) # write the response! self.response.headers['Access-Control-Allow-Origin'] = '*' if format == 'html': self.response.headers['Content-Type'] = 'text/html; charset=utf-8' url = obj.get('url', '') self.response.out.write( TEMPLATE.substitute({ 'refresh': (('<meta http-equiv="refresh" content="0;url=%s">' % url) if url else ''), 'url': url, 'body': microformats2.json_to_html(mf2_json), 'title': self.get_title(obj), })) elif format == 'json': self.response.headers[ 'Content-Type'] = 'application/json; charset=utf-8' self.response.out.write(json.dumps(mf2_json, indent=2))
def get(self, type, source_short_name, string_id, *ids): source_cls = models.sources.get(source_short_name) if not source_cls: self.abort(400, "Source type '%s' not found. Known sources: %s" % (source_short_name, filter(None, models.sources.keys()))) self.source = source_cls.get_by_id(string_id) if not self.source: self.abort(400, 'Source %s %s not found' % (source_short_name, string_id)) elif self.source.status == 'disabled' or 'listen' not in self.source.features: self.abort(400, 'Source %s is disabled for backfeed' % self.source.bridgy_path()) format = self.request.get('format', 'html') if format not in ('html', 'json'): self.abort(400, 'Invalid format %s, expected html or json' % format) for id in ids: if not self.VALID_ID.match(id): self.abort(404, 'Invalid id %s' % id) label = '%s:%s %s %s' % (source_short_name, string_id, type, ids) cache_key = 'H ' + label obj = memcache.get(cache_key) if obj: logging.info('Using cached object for %s', label) else: logging.info('Fetching %s', label) try: obj = self.get_item(*ids) except models.DisableSource as e: self.abort(401, "Bridgy's access to your account has expired. Please visit https://brid.gy/ to refresh it!") except Exception as e: # pass through all API HTTP errors if we can identify them code, body = util.interpret_http_exception(e) if not code and util.is_connection_failure(e): code = 503 body = str(e) if code: self.response.status_int = int(code) self.response.headers['Content-Type'] = 'text/plain' self.response.write('%s error:\n%s' % (self.source.GR_CLASS.NAME, body)) return else: raise memcache.set(cache_key, obj, time=CACHE_TIME) if not obj: self.abort(404, label) # use https for profile pictures so we don't cause SSL mixed mode errors # when serving over https. author = obj.get('author', {}) image = author.get('image', {}) url = image.get('url') if url: image['url'] = util.update_scheme(url, self) mf2_json = microformats2.object_to_json(obj, synthesize_content=False) # try to include the author's silo profile url author = first_props(mf2_json.get('properties', {})).get('author', {}) author_uid = first_props(author.get('properties', {})).get('uid', '') if author_uid: parsed = util.parse_tag_uri(author_uid) if parsed: silo_url = self.source.gr_source.user_url(parsed[1]) urls = author.get('properties', {}).setdefault('url', []) if silo_url not in microformats2.get_string_urls(urls): urls.append(silo_url) # write the response! self.response.headers['Access-Control-Allow-Origin'] = '*' if format == 'html': self.response.headers['Content-Type'] = 'text/html; charset=utf-8' self.response.out.write(TEMPLATE.substitute({ 'url': obj.get('url', ''), 'body': microformats2.json_to_html(mf2_json), 'title': self.get_title(obj), })) elif format == 'json': self.response.headers['Content-Type'] = 'application/json; charset=utf-8' self.response.out.write(json.dumps(mf2_json, indent=2))
def get_activities_response(self, **kwargs): # TODO: use batch API to get photos, events, etc in one request # https://developers.facebook.com/docs/graph-api/making-multiple-requests try: resp = self.gr_source.get_activities_response(group_id=SELF, **kwargs) # if it's requesting one specific activity, then we're done if 'activity_id' in kwargs: return resp # also get uploaded photos manually since facebook sometimes collapses # multiple photos into albums, and the album post object won't have the # post content, comments, etc. from the individual photo posts. # http://stackoverflow.com/questions/12785120 # # TODO: save and use ETag for all of these extra calls photos = self.get_data(API_PHOTOS) # also get events and RSVPs # https://developers.facebook.com/docs/graph-api/reference/user/events/ # https://developers.facebook.com/docs/graph-api/reference/event#edges # TODO: also fetch and use API_USER_RSVPS_DECLINED user_rsvps = self.get_data(API_USER_RSVPS) # have to re-fetch the events because the user rsvps response doesn't # include the event description, which we need for original post links. events = [self.gr_source.urlopen(API_EVENT % r['id']) for r in user_rsvps if r.get('id')] # also, only process events that the user is the owner of. avoids (but # doesn't prevent) processing big non-indieweb events with tons of # attendees that put us over app engine's instance memory limit. details: # https://github.com/snarfed/bridgy/issues/77 events_and_rsvps = [(e, self.get_data(API_EVENT_RSVPS % e['id'])) for e in events if e.get('owner', {}).get('id') == self.key.id()] except urllib2.HTTPError as e: # Facebook API error details: # https://developers.facebook.com/docs/graph-api/using-graph-api/#receiving-errorcodes # https://developers.facebook.com/docs/reference/api/errors/ exc_type, _, exc_traceback = sys.exc_info() body = e.read() exc_copy = exc_type(e.filename, e.code, e.msg, e.hdrs, cStringIO.StringIO(body)) try: body_json = json.loads(body) except: logging.exception('Non-JSON response body: %s', body) # response isn't JSON. ignore and re-raise the original exception raise exc_type, exc_copy, exc_traceback error = body_json.get('error', {}) if error.get('code') in (102, 190): subcode = error.get('error_subcode') if subcode == 458: # revoked raise models.DisableSource() elif subcode in (463, 460): # expired, changed password # ask the user to reauthenticate self.gr_source.create_notification( self.key.id(), "Brid.gy's access to your account has expired. Click here to renew it now!", 'https://www.brid.gy/facebook/start') raise models.DisableSource() # other error. re-raise original exception raise exc_type, exc_copy, exc_traceback # add photos. they show up as both a post and a photo, each with a separate # id. the post's object_id field points to the photo's id. de-dupe by # switching the post to use the fb_object_id when it's provided. activities = resp.setdefault('items', []) activities_by_fb_id = {} for activity in activities: obj = activity.get('object', {}) fb_id = obj.get('fb_object_id') if not fb_id: continue activities_by_fb_id[fb_id] = activity for x in activity, obj: parsed = util.parse_tag_uri(x.get('id', '')) if parsed: _, orig_id = parsed x['id'] = self.gr_source.tag_uri(fb_id) x['url'] = x.get('url', '').replace(orig_id, fb_id) # merge comments and likes from existing photo objects, and add new ones. 
for photo in photos: photo_activity = self.gr_source.post_to_activity(photo) existing = activities_by_fb_id.get(photo.get('id')) if existing: existing['object'].setdefault('replies', {}).setdefault('items', []).extend( photo_activity['object'].get('replies', {}).get('items', [])) existing['object'].setdefault('tags', []).extend( [t for t in photo_activity['object'].get('tags', []) if t.get('verb') == 'like']) else: activities.append(photo_activity) # add events activities += [self.gr_source.event_to_activity(e, rsvps=r) for e, r in events_and_rsvps] # TODO: remove once we're confident in our id parsing. (i'm going to canary # with just a few users before i do it for everyone.) # # discard objects with ids with colons in them. Background: # https://github.com/snarfed/bridgy/issues/305 def remove_bad_ids(objs, label): ret = [] for o in objs: id = util.parse_tag_uri(o.get('id') or o.get('object', {}).get('id') or '') if id and ':' in id[1]: logging.warning('Cowardly ignoring %s with bad id: %s', label, id[1]) else: ret.append(o) return ret resp['items'] = remove_bad_ids(activities, 'activity') for activity in resp['items']: obj = activity.get('object', {}) obj['tags'] = remove_bad_ids(obj.setdefault('tags', []), 'tag/like') replies = obj.get('replies', {}) items = replies.get('items') if items: replies['items'] = remove_bad_ids(items, 'comment') replies['totalItems'] = len(replies['items']) return util.trim_nulls(resp)
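A small illustration of the fb_object_id de-dupe in the snippet above: the post activity's id and url are rewritten to use the photo's own id, so the separate photo post collapses into it. The ids, URL, and tag_uri() helper below are stand-ins, not values from the source.

# stand-in for self.gr_source.tag_uri(); ids and URL are made up
def tag_uri(name):
  return 'tag:facebook.com,2013:%s' % name

activity = {'id': tag_uri('10101234'),
            'url': 'https://www.facebook.com/10101234',
            'object': {'fb_object_id': '999888777'}}

fb_id = activity['object']['fb_object_id']
orig_id = '10101234'
activity['id'] = tag_uri(fb_id)
activity['url'] = activity['url'].replace(orig_id, fb_id)
# activity now keys off the photo's id, so the duplicate photo post merges into it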
class Poll(webapp2.RequestHandler): """Task handler that fetches and processes new responses from a single source. Request parameters: source_key: string key of source entity last_polled: timestamp, YYYY-MM-DD-HH-MM-SS Inserts a propagate task for each response that hasn't been seen before. """ def post(self, *path_args): logging.debug('Params: %s', self.request.params) key = self.request.params['source_key'] source = ndb.Key(urlsafe=key).get() if not source or source.status == 'disabled' or 'listen' not in source.features: logging.error('Source not found or disabled. Dropping task.') return logging.info('Source: %s %s, %s', source.label(), source.key.string_id(), source.bridgy_url(self)) last_polled = self.request.params['last_polled'] if last_polled != source.last_polled.strftime( util.POLL_TASK_DATETIME_FORMAT): logging.warning( 'duplicate poll task! deferring to the other task.') return logging.info('Last poll: %s/log?start_time=%s&key=%s', self.request.host_url, calendar.timegm(source.last_poll_attempt.utctimetuple()), source.key.urlsafe()) # mark this source as polling source.updates = { 'poll_status': 'polling', 'last_poll_attempt': util.now_fn(), } source = models.Source.put_updates(source) source.updates = {} try: self.poll(source) except models.DisableSource: # the user deauthorized the bridgy app, so disable this source. # let the task complete successfully so that it's not retried. source.updates['status'] = 'disabled' logging.warning('Disabling source!') except: source.updates['poll_status'] = 'error' raise finally: source = models.Source.put_updates(source) # add new poll task. randomize task ETA to within +/- 20% to try to spread # out tasks and prevent thundering herds. task_countdown = source.poll_period().total_seconds() * random.uniform( .8, 1.2) util.add_poll_task(source, countdown=task_countdown) # feeble attempt to avoid hitting the instance memory limit source = None gc.collect() def poll(self, source): """Actually runs the poll. Stores property names and values to update in source.updates. """ if source.last_activities_etag or source.last_activity_id: logging.debug('Using ETag %s, last activity id %s', source.last_activities_etag, source.last_activity_id) # # Step 1: fetch activities: # * posts by the user # * search all posts for the user's domain URLs to find links # cache = util.CacheDict() if source.last_activities_cache_json: cache.update(json.loads(source.last_activities_cache_json)) try: # search for links first so that the user's activities and responses # override them if they overlap links = source.search_for_links() # this user's own activities (and user mentions) resp = source.get_activities_response( fetch_replies=True, fetch_likes=True, fetch_shares=True, fetch_mentions=True, count=50, etag=source.last_activities_etag, min_id=source.last_activity_id, cache=cache) etag = resp.get('etag') # used later user_activities = resp.get('items', []) # these map ids to AS objects responses = {a['id']: a for a in links} activities = {a['id']: a for a in links + user_activities} except Exception, e: code, body = util.interpret_http_exception(e) if code == '401': msg = 'Unauthorized error: %s' % e logging.warning(msg, exc_info=True) source.updates['poll_status'] = 'ok' raise models.DisableSource(msg) elif code in util.HTTP_RATE_LIMIT_CODES: logging.warning( 'Rate limited. Marking as error and finishing. 
%s', e) source.updates.update({ 'poll_status': 'error', 'rate_limited': True }) return elif (code and int(code) / 100 == 5) or util.is_connection_failure(e): logging.error( 'API call failed. Marking as error and finishing. %s: %s\n%s', code, body, e) self.abort(ERROR_HTTP_RETURN_CODE) else: raise # extract silo activity ids, update last_activity_id silo_activity_ids = set() last_activity_id = source.last_activity_id for id, activity in activities.items(): # maybe replace stored last activity id parsed = util.parse_tag_uri(id) if parsed: id = parsed[1] silo_activity_ids.add(id) try: # try numeric comparison first greater = int(id) > int(last_activity_id) except (TypeError, ValueError): greater = id > last_activity_id if greater: last_activity_id = id if last_activity_id and last_activity_id != source.last_activity_id: source.updates['last_activity_id'] = last_activity_id # trim cache to just the returned activity ids, so that it doesn't grow # without bound. (WARNING: depends on get_activities_response()'s cache key # format, e.g. 'PREFIX ACTIVITY_ID'!) source.updates['last_activities_cache_json'] = json.dumps({ k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids }) # Cache to make sure we only fetch the author's h-feed(s) the # first time we see it fetched_hfeeds = set() # narrow down to just public activities public = {} private = {} for id, activity in activities.items(): (public if source.is_activity_public(activity) else private)[id] = activity logging.info('Found %d public activities: %s', len(public), public.keys()) logging.info('Found %d private activities: %s', len(private), private.keys()) last_public_post = (source.last_public_post or util.EPOCH).isoformat() public_published = util.trim_nulls( [a.get('published') for a in public.values()]) if public_published: max_published = max(public_published) if max_published > last_public_post: last_public_post = max_published source.updates['last_public_post'] = \ util.as_utc(util.parse_iso8601(max_published)) source.updates['recent_private_posts'] = \ len([a for a in private.values() if a.get('published', util.EPOCH_ISO) > last_public_post]) # # Step 2: extract responses, store their activities in response['activities'] # # WARNING: this creates circular references in link posts found by search # queries in step 1, since they are their own activity. We use # prune_activity() and prune_response() in step 4 to remove these before # serializing to JSON. 
# for id, activity in public.items(): obj = activity.get('object') or activity # handle user mentions user_id = source.user_tag_id() if obj.get('author', {}).get('id') != user_id: for tag in obj.get('tags', []): urls = tag.get('urls') if tag.get('objectType') == 'person' and tag.get( 'id') == user_id and urls: activity['originals'], activity['mentions'] = \ original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds) activity['mentions'].update( u.get('value') for u in urls) responses[id] = activity break # handle quote mentions for att in obj.get('attachments', []): if (att.get('objectType') in ('note', 'article') and att.get( 'author', {}).get('id') == source.user_tag_id()): # now that we've confirmed that one exists, OPD will dig # into the actual attachments if 'originals' not in activity or 'mentions' not in activity: activity['originals'], activity['mentions'] = \ original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds) responses[id] = activity break # extract replies, likes, reactions, reposts, and rsvps replies = obj.get('replies', {}).get('items', []) tags = obj.get('tags', []) likes = [t for t in tags if Response.get_type(t) == 'like'] reactions = [t for t in tags if Response.get_type(t) == 'react'] reposts = [t for t in tags if Response.get_type(t) == 'repost'] rsvps = Source.get_rsvps_from_event(obj) # coalesce responses. drop any without ids for resp in replies + likes + reactions + reposts + rsvps: id = resp.get('id') if not id: logging.error('Skipping response without id: %s', json.dumps(resp, indent=2)) continue resp.setdefault('activities', []).append(activity) # when we find two responses with the same id, the earlier one may have # come from a link post or user mention, and this one is probably better # since it probably came from the user's activity, so prefer this one. # background: https://github.com/snarfed/bridgy/issues/533 existing = responses.get(id) if existing: if source.gr_source.activity_changed(resp, existing, log=True): logging.warning( 'Got two different versions of same response!\n%s\n%s', existing, resp) resp['activities'].extend(existing.get('activities', [])) responses[id] = resp # # Step 3: filter out responses we've already seen # # seen responses (JSON objects) for each source are stored in its entity. unchanged_responses = [] if source.seen_responses_cache_json: for seen in json.loads(source.seen_responses_cache_json): id = seen['id'] resp = responses.get(id) if resp and not source.gr_source.activity_changed( seen, resp, log=True): unchanged_responses.append(seen) del responses[id] # # Step 4: store new responses and enqueue propagate tasks # pruned_responses = [] for id, resp in responses.items(): resp_type = Response.get_type(resp) activities = resp.pop('activities', []) if not activities and resp_type == 'post': activities = [resp] too_long = set() urls_to_activity = {} for i, activity in enumerate(activities): # we'll usually have multiple responses for the same activity, and the # objects in resp['activities'] are shared, so cache each activity's # discovered webmention targets inside its object. 
if 'originals' not in activity or 'mentions' not in activity: activity['originals'], activity['mentions'] = \ original_post_discovery.discover( source, activity, fetch_hfeed=True, include_redirect_sources=False, already_fetched_hfeeds=fetched_hfeeds) targets = original_post_discovery.targets_for_response( resp, originals=activity['originals'], mentions=activity['mentions']) if targets: logging.info('%s has %d webmention target(s): %s', activity.get('url'), len(targets), ' '.join(targets)) for t in targets: if len(t) <= _MAX_STRING_LENGTH: urls_to_activity[t] = i else: logging.warning( 'Giving up on target URL over %s chars! %s', _MAX_STRING_LENGTH, t) too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...') # store/update response entity. the prune_*() calls are important to # remove circular references in link responses, which are their own # activities. details in the step 2 comment above. pruned_response = util.prune_response(resp) pruned_responses.append(pruned_response) resp_entity = Response(id=id, source=source.key, activities_json=[ json.dumps( util.prune_activity(a, source)) for a in activities ], response_json=json.dumps(pruned_response), type=resp_type, unsent=list(urls_to_activity.keys()), failed=list(too_long), original_posts=resp.get('originals', [])) if urls_to_activity and len(activities) > 1: resp_entity.urls_to_activity = json.dumps(urls_to_activity) resp_entity.get_or_save(source) # update cache if pruned_responses: source.updates['seen_responses_cache_json'] = json.dumps( pruned_responses + unchanged_responses) source.updates.update({ 'last_polled': source.last_poll_attempt, 'poll_status': 'ok' }) if etag and etag != source.last_activities_etag: source.updates['last_activities_etag'] = etag # # Step 5. possibly refetch updated syndication urls # # if the author has added syndication urls since the first time # original_post_discovery ran, we'll miss them. this cleanup task will # periodically check for updated urls. only kicks in if the author has # *ever* published a rel=syndication url if source.should_refetch(): logging.info('refetching h-feed for source %s', source.label()) relationships = original_post_discovery.refetch(source) now = util.now_fn() source.updates['last_hfeed_refetch'] = now if relationships: logging.info( 'refetch h-feed found new rel=syndication relationships: %s', relationships) try: self.repropagate_old_responses(source, relationships) except BaseException, e: if (isinstance(e, (datastore_errors.BadRequestError, datastore_errors.Timeout)) or util.is_connection_failure(e)): logging.info('Timeout while repropagating responses.', exc_info=True) else: raise
def poll(self, source): """Actually runs the poll. Stores property names and values to update in source.updates. """ if source.last_activities_etag or source.last_activity_id: logging.debug('Using ETag %s, last activity id %s', source.last_activities_etag, source.last_activity_id) # # Step 1: fetch activities: # * posts by the user # * search all posts for the user's domain URLs to find links # cache = util.CacheDict() if source.last_activities_cache_json: cache.update(json.loads(source.last_activities_cache_json)) # search for links first so that the user's activities and responses # override them if they overlap links = source.search_for_links() # this user's own activities (and user mentions) resp = source.get_activities_response( fetch_replies=True, fetch_likes=True, fetch_shares=True, fetch_mentions=True, count=50, etag=source.last_activities_etag, min_id=source.last_activity_id, cache=cache) etag = resp.get('etag') # used later user_activities = resp.get('items', []) # these map ids to AS objects responses = {a['id']: a for a in links} activities = {a['id']: a for a in links + user_activities} # extract silo activity ids, update last_activity_id silo_activity_ids = set() last_activity_id = source.last_activity_id for id, activity in activities.items(): # maybe replace stored last activity id parsed = util.parse_tag_uri(id) if parsed: id = parsed[1] silo_activity_ids.add(id) try: # try numeric comparison first greater = int(id) > int(last_activity_id) except (TypeError, ValueError): greater = id > last_activity_id if greater: last_activity_id = id if last_activity_id and last_activity_id != source.last_activity_id: source.updates['last_activity_id'] = last_activity_id # trim cache to just the returned activity ids, so that it doesn't grow # without bound. (WARNING: depends on get_activities_response()'s cache key # format, e.g. 'PREFIX ACTIVITY_ID'!) source.updates['last_activities_cache_json'] = json.dumps( {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids}) self.backfeed(source, responses, activities=activities) source.updates.update({'last_polled': source.last_poll_attempt, 'poll_status': 'ok'}) if etag and etag != source.last_activities_etag: source.updates['last_activities_etag'] = etag # # Possibly refetch updated syndication urls. # # if the author has added syndication urls since the first time # original_post_discovery ran, we'll miss them. this cleanup task will # periodically check for updated urls. only kicks in if the author has # *ever* published a rel=syndication url if source.should_refetch(): logging.info('refetching h-feed for source %s', source.label()) relationships = original_post_discovery.refetch(source) now = util.now_fn() source.updates['last_hfeed_refetch'] = now if relationships: logging.info('refetch h-feed found new rel=syndication relationships: %s', relationships) try: self.repropagate_old_responses(source, relationships) except BaseException, e: if (isinstance(e, (datastore_errors.BadRequestError, datastore_errors.Timeout)) or util.is_connection_failure(e)): logging.info('Timeout while repropagating responses.', exc_info=True) else: raise
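The numeric-first comparison in the poll variants above is worth spelling out: plain string comparison would order '9' after '10', so ids that look like numbers are compared as ints first and only fall back to string comparison otherwise. A standalone illustration with made-up ids:

def newer(candidate, current):
  try:
    return int(candidate) > int(current)   # numeric ids: 10 > 9
  except (TypeError, ValueError):
    return candidate > current             # non-numeric ids compare lexically

assert newer('10', '9') is True    # as strings, '10' < '9' would give the wrong answer
assert newer('abc', 'abb') is True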
class ItemHandler(webapp2.RequestHandler): """Fetches a post, repost, like, or comment and serves it as mf2 HTML or JSON. """ handle_exception = handlers.handle_exception source = None VALID_ID = re.compile(r'^[\w.+:@-]+$') def head(self, *args): """Return an empty 200 with no caching directives.""" def get_item(self, id): """Fetches and returns an object from the given source. To be implemented by subclasses. Args: source: bridgy.Source subclass id: string Returns: ActivityStreams object dict """ raise NotImplementedError() def get_title(self, obj): """Returns the string to be used in the <title> tag. Args: obj: ActivityStreams object """ return obj.get('title') or obj.get('content') or 'Bridgy Response' def get_post(self, id, **kwargs): """Fetch a post. Args: id: string, site-specific post id is_event: bool kwargs: passed through to get_activities Returns: ActivityStreams object dict """ try: posts = self.source.get_activities(activity_id=id, user_id=self.source.key.id(), **kwargs) if posts: return posts[0] logging.warning('Source post %s not found', id) except Exception as e: util.interpret_http_exception(e) def get(self, type, source_short_name, string_id, *ids): source_cls = models.sources.get(source_short_name) if not source_cls: self.abort( 400, "Source type '%s' not found. Known sources: %s" % (source_short_name, filter(None, models.sources.keys()))) self.source = source_cls.get_by_id(string_id) if not self.source: self.abort( 400, 'Source %s %s not found' % (source_short_name, string_id)) format = self.request.get('format', 'html') if format not in ('html', 'json'): self.abort(400, 'Invalid format %s, expected html or json' % format) for id in ids: if not self.VALID_ID.match(id): self.abort(404, 'Invalid id %s' % id) label = '%s:%s %s %s' % (source_short_name, string_id, type, ids) cache_key = 'H ' + label obj = memcache.get(cache_key) if obj: logging.info('Using cached object for %s', label) else: logging.info('Fetching %s', label) try: obj = self.get_item(*ids) except Exception, e: # pass through all API HTTP errors if we can identify them code, body = util.interpret_http_exception(e) if not code and util.is_connection_failure(e): code = 503 body = str(e) if code: self.response.status_int = int(code) self.response.headers['Content-Type'] = 'text/plain' self.response.write('%s error:\n%s' % (self.source.GR_CLASS.NAME, body)) return else: raise memcache.set(cache_key, obj, time=CACHE_TIME) if not obj: self.abort(404, label) # use https for profile pictures so we don't cause SSL mixed mode errors # when serving over https. author = obj.get('author', {}) image = author.get('image', {}) url = image.get('url') if url: image['url'] = util.update_scheme(url, self) mf2_json = microformats2.object_to_json(obj, synthesize_content=False) # try to include the author's silo profile url author = first_props(mf2_json.get('properties', {})).get('author', {}) author_uid = first_props(author.get('properties', {})).get('uid', '') if author_uid: parsed = util.parse_tag_uri(author_uid) if parsed: silo_url = self.source.gr_source.user_url(parsed[1]) urls = author.get('properties', {}).setdefault('url', []) if silo_url not in microformats2.get_string_urls(urls): urls.append(silo_url) # write the response! 
self.response.headers['Access-Control-Allow-Origin'] = '*' if format == 'html': self.response.headers['Content-Type'] = 'text/html; charset=utf-8' self.response.out.write( TEMPLATE.substitute({ 'url': obj.get('url', ''), 'body': microformats2.json_to_html(mf2_json), 'title': self.get_title(obj), })) elif format == 'json': self.response.headers[ 'Content-Type'] = 'application/json; charset=utf-8' self.response.out.write(json.dumps(mf2_json, indent=2))
def dispatch_request(self, site, key_id, **kwargs):
  """Handle HTTP request."""
  source_cls = models.sources.get(site)
  if not source_cls:
    error(f"Source type '{site}' not found. Known sources: {[s for s in models.sources.keys() if s]}")

  self.source = source_cls.get_by_id(key_id)
  if not self.source:
    error(f'Source {site} {key_id} not found')
  elif (self.source.status == 'disabled'
        or 'listen' not in self.source.features):
    error(f'Source {self.source.bridgy_path()} is disabled for backfeed')

  format = request.values.get('format', 'html')
  if format not in ('html', 'json'):
    error(f'Invalid format {format}, expected html or json')

  for id in kwargs.values():
    if not self.VALID_ID.match(id):
      error(f'Invalid id {id}', 404)

  try:
    obj = self.get_item(**kwargs)
  except models.DisableSource:
    error("Bridgy's access to your account has expired. Please visit https://brid.gy/ to refresh it!", 401)
  except ValueError as e:
    error(f'{self.source.GR_CLASS.NAME} error: {e}')

  if not obj:
    error(f'Not found: {site}:{key_id} {kwargs}', 404)

  if self.source.is_blocked(obj):
    error('That user is currently blocked', 410)

  # use https for profile pictures so we don't cause SSL mixed mode errors
  # when serving over https.
  author = obj.get('author', {})
  image = author.get('image', {})
  url = image.get('url')
  if url:
    image['url'] = util.update_scheme(url, request)

  mf2_json = microformats2.object_to_json(obj, synthesize_content=False)

  # try to include the author's silo profile url
  author = first_props(mf2_json.get('properties', {})).get('author', {})
  author_uid = first_props(author.get('properties', {})).get('uid', '')
  if author_uid:
    parsed = util.parse_tag_uri(author_uid)
    if parsed:
      urls = author.get('properties', {}).setdefault('url', [])
      try:
        silo_url = self.source.gr_source.user_url(parsed[1])
        if silo_url not in microformats2.get_string_urls(urls):
          urls.append(silo_url)
      except NotImplementedError:  # from gr_source.user_url()
        pass

  # write the response!
  if format == 'html':
    url = obj.get('url', '')
    return TEMPLATE.substitute({
      'refresh': (f'<meta http-equiv="refresh" content="0;url={url}">'
                  if url else ''),
      'url': url,
      'body': microformats2.json_to_html(mf2_json),
      'title': obj.get('title') or obj.get('content') or 'Bridgy Response',
    })
  elif format == 'json':
    return mf2_json
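A hypothetical sketch of what the author-url injection above does to the mf2 JSON: the author's uid is a tag URI, its name part is mapped to a silo profile URL via user_url(), and that URL is appended alongside any existing author URLs. All property values below are made up.

mf2_json = {'properties': {'author': [{
  'properties': {
    'uid': ['tag:twitter.com,2013:example_user'],
    'url': ['https://example.com/'],
  },
}]}}

author = mf2_json['properties']['author'][0]
# parse_tag_uri('tag:twitter.com,2013:example_user') -> ('twitter.com', 'example_user');
# user_url() (silo-specific) might map that to a profile URL like the one below.
author['properties']['url'].append('https://twitter.com/example_user')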
# use https for profile pictures so we don't cause SSL mixed mode errors
# when serving over https.
author = obj.get('author', {})
image = author.get('image', {})
url = image.get('url')
if url:
  image['url'] = util.update_scheme(url, self)

mf2_json = microformats2.object_to_json(obj)

# try to include the author's silo profile url
author = first_props(mf2_json.get('properties', {})).get('author', {})
author_uid = first_props(author.get('properties', {})).get('uid', '')
if author_uid:
  parsed = util.parse_tag_uri(author_uid)
  if parsed:
    silo_url = self.source.gr_source.user_url(parsed[1])
    urls = author.get('properties', {}).setdefault('url', [])
    if silo_url not in microformats2.get_string_urls(urls):
      urls.append(silo_url)

# write the response!
self.response.headers['Access-Control-Allow-Origin'] = '*'

if format == 'html':
  self.response.headers['Content-Type'] = 'text/html; charset=utf-8'
  self.response.out.write(TEMPLATE.substitute({
    'url': obj.get('url', ''),
    'body': microformats2.json_to_html(mf2_json),
    'title': obj.get('title', obj.get('content', 'Bridgy Response')),
  }))