def test_is_connection_failure(self):
  for e in (socket.timeout(), socket.error(),
            requests.ConnectionError(), httplib.NotConnected(),
            urllib2.URLError(socket.gaierror('foo bar')),
            urllib3.exceptions.TimeoutError()):
    assert util.is_connection_failure(e), e

  for e in (None, 3, 'asdf', IOError(), httplib.HTTPException('unknown'),
            urllib2.URLError('asdf'),
            urllib2.HTTPError('url', 403, 'msg', {}, None),
            ):
    assert not util.is_connection_failure(e), e
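# A minimal sketch of the predicate the test above pins down. This is only an
# illustration consistent with those assertions, not the real
# util.is_connection_failure(), which may differ in detail.
import httplib
import socket
import urllib2

import requests
import urllib3


def is_connection_failure_sketch(exception):
  """Returns True if exception looks like a network-level failure."""
  if isinstance(exception, (socket.timeout, socket.error,
                            requests.ConnectionError, httplib.NotConnected,
                            urllib3.exceptions.TimeoutError)):
    return True
  # urllib2.URLError wraps its cause in .reason. Only DNS failures
  # (socket.gaierror) count, so plain URLErrors and HTTPErrors return False.
  if isinstance(exception, urllib2.URLError):
    return isinstance(getattr(exception, 'reason', None), socket.gaierror)
  return False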
def handle_exception(self, e, debug):
  """A webapp2 exception handler that propagates HTTP exceptions into the response.

  Use this as a :meth:`webapp2.RequestHandler.handle_exception()` method by
  adding this line to your handler class definition::

    handle_exception = handlers.handle_exception

  I originally tried to put this in a :class:`webapp2.RequestHandler` subclass,
  but it gave me this exception::

    File ".../webapp2-2.5.1/webapp2_extras/local.py", line 136, in _get_current_object
      raise RuntimeError('no object bound to %s' % self.__name__)
    RuntimeError: no object bound to app

  These are probably related:

  * http://eemyop.blogspot.com/2013/05/digging-around-in-webapp2-finding-out.html
  * http://code.google.com/p/webapp-improved/source/detail?r=d962ac4625ce3c43a3e59fd7fc07daf8d7b7c46a
  """
  code, body = util.interpret_http_exception(e)
  if code:
    self.response.set_status(int(code))
    self.response.write('HTTP Error %s: %s' % (code, body))
  elif util.is_connection_failure(e):
    self.response.set_status(502)
    self.response.write('Upstream server request failed: %s' % e)
  else:
    raise
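# Usage sketch: the docstring above describes exactly this wiring. The handler
# class and its fetch helper here are hypothetical, not from the source.
import webapp2


class ProfileHandler(webapp2.RequestHandler):
  handle_exception = handlers.handle_exception

  def get(self):
    # any HTTP error or connection failure raised in here is turned into an
    # HTTP error response by handle_exception() instead of a bare 500
    self.response.write(fetch_remote_profile())  # hypothetical helper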
def post(self):
  source = self.load_source(param='key')
  kind = source.key.kind()
  feature = util.get_required_param(self, 'feature')
  state = util.encode_oauth_state({
    'operation': 'delete',
    'feature': feature,
    'source': source.key.urlsafe().decode(),
    'callback': self.request.get('callback'),
  })

  # Blogger doesn't support redirect_url() yet
  if kind == 'Blogger':
    return self.redirect('/blogger/delete/start?state=%s' % state)

  path = ('/reddit/callback' if kind == 'Reddit'
          else '/wordpress/add' if kind == 'WordPress'
          else '/%s/delete/finish' % source.SHORT_NAME)
  kwargs = {}
  if kind == 'Twitter':
    kwargs['access_type'] = 'read' if feature == 'listen' else 'write'

  handler = source.OAUTH_START_HANDLER.to(path, **kwargs)(self.request, self.response)
  try:
    self.redirect(handler.redirect_url(state=state))
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    if not code and util.is_connection_failure(e):
      code = '-'
      body = str(e)
    if code:
      self.messages.add('%s API error %s: %s' % (source.GR_CLASS.NAME, code, body))
      self.redirect(source.bridgy_url(self))
    else:
      raise
def post(self):
  logging.debug('Params: %s', self.request.params)

  key = util.get_required_param(self, 'source_key')
  source = ndb.Key(urlsafe=key).get()
  if not source or source.status == 'disabled' or 'listen' not in source.features:
    logging.error('Source not found or disabled. Dropping task.')
    return
  logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
               source.bridgy_url(self))

  post_id = util.get_required_param(self, 'post_id')
  source.updates = {}

  try:
    activities = source.get_activities(
      fetch_replies=True, fetch_likes=True, fetch_shares=True,
      activity_id=post_id, user_id=source.key.id())
    if not activities:
      logging.info('Post %s not found.', post_id)
      return
    assert len(activities) == 1
    self.backfeed(source, activities={activities[0]['id']: activities[0]})
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    if (code and (code in util.HTTP_RATE_LIMIT_CODES or code == '400' or
                  int(code) // 100 == 5)
        or util.is_connection_failure(e)):
      logging.error('API call failed; giving up. %s: %s\n%s', code, body, e)
      self.abort(util.ERROR_HTTP_RETURN_CODE)
    else:
      raise
def post(self):
  source = self.load_source(param='key')
  module = self.OAUTH_MODULES[source.key.kind()]
  feature = util.get_required_param(self, 'feature')
  state = util.encode_oauth_state({
    'operation': 'delete',
    'feature': feature,
    'source': source.key.urlsafe(),
    'callback': self.request.get('callback'),
  })

  # Blogger doesn't support redirect_url() yet
  if module is oauth_blogger_v2:
    return self.redirect('/blogger/delete/start?state=%s' % state)

  path = ('/instagram/callback' if module is indieauth
          else '/wordpress/add' if module is oauth_wordpress_rest
          else '/%s/delete/finish' % source.SHORT_NAME)
  kwargs = {}
  if module is oauth_twitter:
    kwargs['access_type'] = 'read' if feature == 'listen' else 'write'

  handler = module.StartHandler.to(path, **kwargs)(self.request, self.response)
  try:
    self.redirect(handler.redirect_url(state=state))
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    if not code and util.is_connection_failure(e):
      code = '-'
      body = unicode(e)
    if code:
      self.messages.add('%s API error %s: %s' % (source.GR_CLASS.NAME, code, body))
      self.redirect(source.bridgy_url(self))
    else:
      raise
def get(self, type, source_short_name, string_id, *ids):
  source_cls = models.sources.get(source_short_name)
  if not source_cls:
    self.abort(400, "Source type '%s' not found. Known sources: %s" %
               (source_short_name, filter(None, models.sources.keys())))

  self.source = source_cls.get_by_id(string_id)
  if not self.source:
    self.abort(400, 'Source %s %s not found' % (source_short_name, string_id))

  format = self.request.get('format', 'html')
  if format not in ('html', 'json'):
    self.abort(400, 'Invalid format %s, expected html or json' % format)

  for id in ids:
    if not self.VALID_ID.match(id):
      self.abort(404, 'Invalid id %s' % id)

  label = '%s:%s %s %s' % (source_short_name, string_id, type, ids)
  logging.info('Fetching %s', label)
  try:
    obj = self.get_item(*ids)
  except Exception as e:
    # pass through all API HTTP errors if we can identify them
    code, body = util.interpret_http_exception(e)
    if not code and util.is_connection_failure(e):
      code = 503
      body = str(e)
    if code:
      self.response.status_int = int(code)
      self.response.headers['Content-Type'] = 'text/plain'
      self.response.write('%s error:\n%s' % (self.source.GR_CLASS.NAME, body))
      return
    else:
      raise
def post(self):
  try:
    self.redirect(self.redirect_url(state=util.get_required_param(self, 'token')))
  except Exception as e:
    if util.is_connection_failure(e) or util.interpret_http_exception(e)[0]:
      self.messages.add("Couldn't fetch your web site: %s" % e)
      return self.redirect('/')
    raise
def post(self, *path_args):
  logging.debug('Params: %s', self.request.params)

  key = self.request.params['source_key']
  source = ndb.Key(urlsafe=key).get()
  if not source or source.status == 'disabled' or 'listen' not in source.features:
    logging.error('Source not found or disabled. Dropping task.')
    return
  logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
               source.bridgy_url(self))

  last_polled = self.request.params['last_polled']
  if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT):
    logging.warning('duplicate poll task! deferring to the other task.')
    return

  logging.info('Last poll: %s', self._last_poll_url(source))

  # mark this source as polling
  source.updates = {
    'poll_status': 'polling',
    'last_poll_attempt': util.now_fn(),
    'rate_limited': False,
  }
  source = models.Source.put_updates(source)

  source.updates = {}
  try:
    self.poll(source)
  except Exception as e:
    source.updates['poll_status'] = 'error'
    code, body = util.interpret_http_exception(e)
    if code == '401' or isinstance(e, models.DisableSource):
      # the user deauthorized the bridgy app, so disable this source.
      # let the task complete successfully so that it's not retried.
      logging.warning('Disabling source due to: %s' % e, exc_info=True)
      source.updates.update({
        'status': 'disabled',
        'poll_status': 'ok',
      })
      body = '%s\nLast poll: %s' % (source.bridgy_url(self),
                                    self._last_poll_url(source))
      if source.is_beta_user():
        util.email_me(subject='Bridgy: disabled %s' % source.label(), body=body)
    elif code in util.HTTP_RATE_LIMIT_CODES:
      logging.info('Rate limited. Marking as error and finishing. %s', e)
      source.updates['rate_limited'] = True
    elif ((code and int(code) // 100 == 5) or
          (code == '400' and isinstance(source, flickr.Flickr)) or
          util.is_connection_failure(e)):
      logging.error('API call failed. Marking as error and finishing. %s: %s\n%s',
                    code, body, e)
      self.abort(util.ERROR_HTTP_RETURN_CODE)
    else:
      raise
def post(self, *path_args):
  logging.debug('Params: %s', self.request.params)

  key = self.request.params['source_key']
  source = ndb.Key(urlsafe=key).get()
  if not source or source.status == 'disabled' or 'listen' not in source.features:
    logging.error('Source not found or disabled. Dropping task.')
    return
  logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
               source.bridgy_url(self))

  last_polled = self.request.params['last_polled']
  if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT):
    logging.warning('duplicate poll task! deferring to the other task.')
    return

  logging.info('Last poll: %s', self._last_poll_url(source))

  # mark this source as polling
  source.updates = {
    'poll_status': 'polling',
    'last_poll_attempt': util.now_fn(),
    'rate_limited': False,
  }
  source = models.Source.put_updates(source)

  source.updates = {}
  try:
    self.poll(source)
  except Exception as e:
    source.updates['poll_status'] = 'error'
    code, body = util.interpret_http_exception(e)
    if code in source.DISABLE_HTTP_CODES or isinstance(e, models.DisableSource):
      # the user deauthorized the bridgy app, so disable this source.
      # let the task complete successfully so that it's not retried.
      logging.warning('Disabling source due to: %s' % e, exc_info=True)
      source.updates.update({
        'status': 'disabled',
        'poll_status': 'ok',
      })
      body = '%s\nLast poll: %s' % (source.bridgy_url(self),
                                    self._last_poll_url(source))
      if source.is_beta_user():
        util.email_me(subject='Bridgy: disabled %s' % source.label(), body=body)
    elif code in source.RATE_LIMIT_HTTP_CODES:
      logging.info('Rate limited. Marking as error and finishing. %s', e)
      source.updates['rate_limited'] = True
    elif ((code and int(code) // 100 == 5) or
          (code == '400' and isinstance(source, flickr.Flickr)) or
          util.is_connection_failure(e)):
      logging.error('API call failed. Marking as error and finishing. %s: %s\n%s',
                    code, body, e)
      self.abort(util.ERROR_HTTP_RETURN_CODE)
    else:
      raise
def post(self):
  ia_start = util.oauth_starter(indieauth.StartHandler).to('/instagram/callback')(
    self.request, self.response)
  try:
    self.redirect(ia_start.redirect_url(me=util.get_required_param(self, 'user_url')))
  except Exception as e:
    if util.is_connection_failure(e) or util.interpret_http_exception(e)[0]:
      self.messages.add("Couldn't fetch your web site: %s" % e)
      return self.redirect('/')
    raise
def poll(self, source):
  """Actually runs the poll.

  Stores property names and values to update in source.updates.
  """
  if source.last_activities_etag or source.last_activity_id:
    logging.debug('Using ETag %s, last activity id %s',
                  source.last_activities_etag, source.last_activity_id)

  #
  # Step 1: fetch activities:
  # * posts by the user
  # * search all posts for the user's domain URLs to find links
  #
  cache = util.CacheDict()
  if source.last_activities_cache_json:
    cache.update(json.loads(source.last_activities_cache_json))

  try:
    # search for links first so that the user's activities and responses
    # override them if they overlap
    links = source.search_for_links()

    # this user's own activities (and user mentions)
    resp = source.get_activities_response(
      fetch_replies=True, fetch_likes=True, fetch_shares=True,
      fetch_mentions=True, count=50,
      etag=source.last_activities_etag, min_id=source.last_activity_id,
      cache=cache)
    etag = resp.get('etag')  # used later
    user_activities = resp.get('items', [])

    # these map ids to AS objects
    responses = {a['id']: a for a in links}
    activities = {a['id']: a for a in links + user_activities}
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    if code == '401':
      msg = 'Unauthorized error: %s' % e
      logging.warning(msg, exc_info=True)
      source.updates['poll_status'] = 'ok'
      raise models.DisableSource(msg)
    elif code in util.HTTP_RATE_LIMIT_CODES:
      logging.warning('Rate limited. Marking as error and finishing. %s', e)
      source.updates.update({'poll_status': 'error', 'rate_limited': True})
      return
    elif (code and int(code) // 100 == 5) or util.is_connection_failure(e):
      logging.error('API call failed. Marking as error and finishing. %s: %s\n%s',
                    code, body, e)
      self.abort(ERROR_HTTP_RETURN_CODE)
    else:
      raise
def post(self):
  logging.debug('Params: %s', self.request.params)

  type = self.request.get('type')
  if type:
    assert type in ('event',)

  source = util.load_source(self)
  if not source or source.status == 'disabled' or 'listen' not in source.features:
    logging.error('Source not found or disabled. Dropping task.')
    return
  logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
               source.bridgy_url(self))

  post_id = util.get_required_param(self, 'post_id')
  source.updates = {}

  try:
    if type == 'event':
      activities = [source.gr_source.get_event(post_id)]
    else:
      activities = source.get_activities(
        fetch_replies=True, fetch_likes=True, fetch_shares=True,
        activity_id=post_id, user_id=source.key.id())
    if not activities or not activities[0]:
      logging.info('Post %s not found.', post_id)
      return
    assert len(activities) == 1, activities
    self.backfeed(source, activities={activities[0]['id']: activities[0]})

    obj = activities[0].get('object') or activities[0]
    in_reply_to = util.get_first(obj, 'inReplyTo')
    if in_reply_to:
      parsed = util.parse_tag_uri(in_reply_to.get('id', ''))  # TODO: fall back to url
      if parsed:
        util.add_discover_task(source, parsed[1])
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    if (code and (code in source.RATE_LIMIT_HTTP_CODES or code in ('400', '404') or
                  int(code) // 100 == 5)
        or util.is_connection_failure(e)):
      logging.error('API call failed; giving up. %s: %s\n%s', code, body, e)
      self.abort(util.ERROR_HTTP_RETURN_CODE)
    else:
      raise
def dispatch_request(self):
  token = request.form['token']

  try:
    to_url = self.redirect_url(state=token)
  except Exception as e:
    if util.is_connection_failure(e) or util.interpret_http_exception(e)[0]:
      flash(f"Couldn't fetch your web site: {e}")
      return redirect('/')
    raise

  return redirect(to_url)
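# Usage sketch: dispatch_request() above is defined on a flask.views.View
# subclass. The app, class name, route, and endpoint name below are
# assumptions, not from the source.
from flask import Flask, redirect
from flask.views import View


class Start(View):
  methods = ['POST']

  def dispatch_request(self):
    # stand-in; the real implementation is the function above
    return redirect('/')


app = Flask(__name__)
app.add_url_rule('/indieauth/start', view_func=Start.as_view('indieauth_start'))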
def post(self):
  logging.debug('Params: %s', self.request.params)

  type = self.request.get('type')
  if type:
    assert type in ('event',)

  key = util.get_required_param(self, 'source_key')
  source = ndb.Key(urlsafe=key).get()
  if not source or source.status == 'disabled' or 'listen' not in source.features:
    logging.error('Source not found or disabled. Dropping task.')
    return
  logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
               source.bridgy_url(self))

  post_id = util.get_required_param(self, 'post_id')
  source.updates = {}

  try:
    if type == 'event':
      activities = [source.gr_source.get_event(post_id)]
    else:
      activities = source.get_activities(
        fetch_replies=True, fetch_likes=True, fetch_shares=True,
        activity_id=post_id, user_id=source.key.id())
    if not activities or not activities[0]:
      logging.info('Post %s not found.', post_id)
      return
    assert len(activities) == 1, activities
    self.backfeed(source, activities={activities[0]['id']: activities[0]})

    in_reply_to = util.get_first(activities[0]['object'], 'inReplyTo')
    if in_reply_to:
      parsed = util.parse_tag_uri(in_reply_to.get('id', ''))  # TODO: fall back to url
      if parsed:
        util.add_discover_task(source, parsed[1])
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    if (code and (code in util.HTTP_RATE_LIMIT_CODES or code in ('400', '404') or
                  int(code) // 100 == 5)
        or util.is_connection_failure(e)):
      logging.error('API call failed; giving up. %s: %s\n%s', code, body, e)
      self.abort(util.ERROR_HTTP_RETURN_CODE)
    else:
      raise
def post(self):
  key = ndb.Key(urlsafe=util.get_required_param(self, 'key'))
  module = self.OAUTH_MODULES[key.kind()]
  feature = util.get_required_param(self, 'feature')
  state = util.encode_oauth_state({
    'operation': 'delete',
    'feature': feature,
    'source': key.urlsafe(),
    'callback': self.request.get('callback'),
  })

  # Google+ and Blogger don't support redirect_url() yet
  if module is oauth_googleplus:
    return self.redirect('/googleplus/delete/start?state=%s' % state)
  if module is oauth_blogger_v2:
    return self.redirect('/blogger/delete/start?state=%s' % state)

  source = key.get()
  path = ('/instagram/callback' if module is indieauth
          else '/wordpress/add' if module is oauth_wordpress_rest
          else '/%s/delete/finish' % source.SHORT_NAME)
  kwargs = {}
  if module is oauth_twitter:
    kwargs['access_type'] = 'read' if feature == 'listen' else 'write'

  handler = module.StartHandler.to(path, **kwargs)(self.request, self.response)
  try:
    self.redirect(handler.redirect_url(state=state))
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    if not code and util.is_connection_failure(e):
      code = '-'
      body = unicode(e)
    if code:
      self.messages.add('%s API error %s: %s' % (source.GR_CLASS.NAME, code, body))
      self.redirect(source.bridgy_url(self))
    else:
      raise
def get(self, type, source_short_name, string_id, *ids):
  source_cls = models.sources.get(source_short_name)
  if not source_cls:
    self.abort(400, "Source type '%s' not found. Known sources: %s" %
               (source_short_name, filter(None, models.sources.keys())))

  self.source = source_cls.get_by_id(string_id)
  if not self.source:
    self.abort(400, 'Source %s %s not found' % (source_short_name, string_id))

  format = self.request.get('format', 'html')
  if format not in ('html', 'json'):
    self.abort(400, 'Invalid format %s, expected html or json' % format)

  for id in ids:
    if not self.VALID_ID.match(id):
      self.abort(404, 'Invalid id %s' % id)

  label = '%s:%s %s %s' % (source_short_name, string_id, type, ids)
  cache_key = 'H ' + label
  obj = memcache.get(cache_key)
  if obj:
    logging.info('Using cached object for %s', label)
  else:
    logging.info('Fetching %s', label)
    try:
      obj = self.get_item(*ids)
    except Exception as e:
      # pass through all API HTTP errors if we can identify them
      code, body = util.interpret_http_exception(e)
      if not code and util.is_connection_failure(e):
        code = 503
        body = str(e)
      if code:
        self.response.status_int = int(code)
        self.response.headers['Content-Type'] = 'text/plain'
        self.response.write('%s error:\n%s' % (self.source.GR_CLASS.NAME, body))
        return
      else:
        raise
    memcache.set(cache_key, obj, time=CACHE_TIME)
def post(self, *path_args): logging.debug("Params: %s", self.request.params) key = self.request.params["source_key"] source = ndb.Key(urlsafe=key).get() if not source or source.status == "disabled" or "listen" not in source.features: logging.error("Source not found or disabled. Dropping task.") return logging.info("Source: %s %s, %s", source.label(), source.key.string_id(), source.bridgy_url(self)) last_polled = self.request.params["last_polled"] if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT): logging.warning("duplicate poll task! deferring to the other task.") return logging.info( "Last poll: %s/log?start_time=%s&key=%s", self.request.host_url, calendar.timegm(source.last_poll_attempt.utctimetuple()), source.key.urlsafe(), ) # mark this source as polling source.updates = {"poll_status": "polling", "last_poll_attempt": util.now_fn()} source = models.Source.put_updates(source) source.updates = {} try: self.poll(source) except Exception, e: source.updates["poll_status"] = "error" code, body = util.interpret_http_exception(e) if code == "401" or isinstance(e, models.DisableSource): # the user deauthorized the bridgy app, so disable this source. # let the task complete successfully so that it's not retried. logging.warning("Disabling source due to: %s" % e, exc_info=True) source.updates.update({"status": "disabled", "poll_status": "ok"}) elif code in util.HTTP_RATE_LIMIT_CODES: logging.warning("Rate limited. Marking as error and finishing. %s", e) source.updates["rate_limited"] = True elif (code and int(code) / 100 == 5) or util.is_connection_failure(e): logging.error("API call failed. Marking as error and finishing. %s: %s\n%s", code, body, e) self.abort(ERROR_HTTP_RETURN_CODE) else: raise
def background_handle_exception(e):
  """Common exception handler for background tasks.

  Catches failed outbound HTTP requests and returns HTTP 304.
  """
  if isinstance(e, HTTPException):
    # raised by this app itself, pass it through
    return str(e), e.code

  transients = getattr(g, 'TRANSIENT_ERROR_HTTP_CODES', ())
  source = getattr(g, 'source', None)
  if source:
    transients += source.RATE_LIMIT_HTTP_CODES + source.TRANSIENT_ERROR_HTTP_CODES

  code, body = util.interpret_http_exception(e)
  if ((code and int(code) // 100 == 5) or code in transients or
      util.is_connection_failure(e)):
    logger.error(f'Marking as error and finishing. {code}: {body}\n{e}')
    return '', util.ERROR_HTTP_RETURN_CODE

  raise e
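# Usage sketch: registering the handler above app-wide so that any uncaught
# exception in a background task route goes through it. The `app` object is
# an assumption; the source doesn't show how it's wired up.
from flask import Flask

app = Flask(__name__)
app.register_error_handler(Exception, background_handle_exception)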
def delete_start():
  source = util.load_source()
  kind = source.key.kind()
  feature = request.form['feature']
  state = util.encode_oauth_state({
    'operation': 'delete',
    'feature': feature,
    'source': source.key.urlsafe().decode(),
    'callback': request.values.get('callback'),
  })

  # Blogger doesn't support redirect_url() yet
  if kind == 'Blogger':
    return redirect(f'/blogger/delete/start?state={state}')

  path = ('/reddit/callback' if kind == 'Reddit'
          else '/wordpress/add' if kind == 'WordPress'
          else f'/{source.SHORT_NAME}/delete/finish')
  kwargs = {}
  if kind == 'Twitter':
    kwargs['access_type'] = 'read' if feature == 'listen' else 'write'

  try:
    return redirect(source.OAUTH_START(path).redirect_url(state=state))
  except werkzeug.exceptions.HTTPException:
    # raised by us, probably via self.error()
    raise
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    if not code and util.is_connection_failure(e):
      code = '-'
      body = str(e)
    if code:
      flash(f'{source.GR_CLASS.NAME} API error {code}: {body}')
      return redirect(source.bridgy_url())
    else:
      raise
def post(self, source_short_name):
  logging.info('Params: %s', self.request.params.items())

  # strip fragments from source and target url
  self.source_url = urlparse.urldefrag(util.get_required_param(self, 'source'))[0]
  self.target_url = urlparse.urldefrag(util.get_required_param(self, 'target'))[0]

  # follow target url through any redirects, strip utm_* query params
  resp = util.follow_redirects(self.target_url)
  redirected_target_urls = [r.url for r in resp.history]
  self.target_url = util.clean_url(resp.url)

  # parse and validate target URL
  domain = util.domain_from_link(self.target_url)
  if not domain:
    return self.error('Could not parse target URL %s' % self.target_url)

  # look up source by domain
  source_cls = models.sources[source_short_name]
  domain = domain.lower()
  self.source = (source_cls.query()
                 .filter(source_cls.domains == domain)
                 .filter(source_cls.features == 'webmention')
                 .filter(source_cls.status == 'enabled')
                 .get())
  if not self.source:
    return self.error(
      'Could not find %s account for %s. Is it registered with Bridgy?' %
      (source_cls.GR_CLASS.NAME, domain))

  if urlparse.urlparse(self.target_url).path in ('', '/'):
    return self.error('Home page webmentions are not currently supported.')

  # create BlogWebmention entity
  id = u'%s %s' % (self.source_url, self.target_url)
  self.entity = BlogWebmention.get_or_insert(
    id, source=self.source.key, redirected_target_urls=redirected_target_urls)
  if self.entity.status == 'complete':
    # TODO: response message saying update isn't supported
    self.response.write(self.entity.published)
    return
  logging.debug("BlogWebmention entity: '%s'", self.entity.key.urlsafe())

  # fetch source page
  resp = self.fetch_mf2(self.source_url)
  if not resp:
    return
  self.fetched, data = resp

  item = self.find_mention_item(data)
  if not item:
    return self.error(
      'Could not find target URL %s in source page %s' %
      (self.target_url, self.fetched.url),
      data=data, log_exception=False)

  # default author to target domain
  author_name = domain
  author_url = 'http://%s/' % domain

  # extract author name and URL from h-card, if any
  props = item['properties']
  author = first_value(props, 'author')
  if author:
    if isinstance(author, basestring):
      author_name = author
    else:
      author_props = author.get('properties', {})
      author_name = first_value(author_props, 'name')
      author_url = first_value(author_props, 'url')

  # if present, u-url overrides source url
  u_url = first_value(props, 'url')
  if u_url:
    self.entity.u_url = u_url

  # generate content
  content = props['content'][0]  # find_mention_item() guaranteed this is here
  text = (content.get('html') or content.get('value')).strip()
  source_url = self.entity.source_url()
  text += ' <br /> <a href="%s">via %s</a>' % (
    source_url, util.domain_from_link(source_url))

  # write comment
  try:
    self.entity.published = self.source.create_comment(
      self.target_url, author_name, author_url, text)
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    msg = 'Error: %s %s; %s' % (code, e, body)
    if code == '401':
      logging.warning('Disabling source due to: %s' % e, exc_info=True)
      self.source.status = 'disabled'
      self.source.put()
      return self.error(msg, status=code, mail=self.source.is_beta_user())
    elif code == '404':
      # post is gone
      return self.error(msg, status=code, mail=False)
    elif util.is_connection_failure(e) or (code and int(code) // 100 == 5):
      return self.error(msg, status=util.ERROR_HTTP_RETURN_CODE, mail=False)
    elif code or body:
      return self.error(msg, status=code, mail=True)
    else:
      raise

  # write results to datastore
  self.entity.status = 'complete'
  self.entity.put()
  self.response.write(json.dumps(self.entity.published))
def poll(self, source):
  """Actually runs the poll.

  Stores property names and values to update in source.updates.
  """
  if source.last_activities_etag or source.last_activity_id:
    logging.debug('Using ETag %s, last activity id %s',
                  source.last_activities_etag, source.last_activity_id)

  #
  # Step 1: fetch activities:
  # * posts by the user
  # * search all posts for the user's domain URLs to find links
  #
  cache = util.CacheDict()
  if source.last_activities_cache_json:
    cache.update(json_loads(source.last_activities_cache_json))

  # search for links first so that the user's activities and responses
  # override them if they overlap
  links = source.search_for_links()

  # this user's own activities (and user mentions)
  resp = source.get_activities_response(
    fetch_replies=True, fetch_likes=True, fetch_shares=True,
    fetch_mentions=True, count=50,
    etag=source.last_activities_etag, min_id=source.last_activity_id,
    cache=cache)
  etag = resp.get('etag')  # used later
  user_activities = resp.get('items', [])

  # these map ids to AS objects
  responses = {a['id']: a for a in links}
  activities = {a['id']: a for a in links + user_activities}

  # extract silo activity ids, update last_activity_id
  silo_activity_ids = set()
  last_activity_id = source.last_activity_id
  for id, activity in activities.items():
    # maybe replace stored last activity id
    parsed = util.parse_tag_uri(id)
    if parsed:
      id = parsed[1]
    silo_activity_ids.add(id)
    try:
      # try numeric comparison first
      greater = int(id) > int(last_activity_id)
    except (TypeError, ValueError):
      greater = str(id) > str(last_activity_id)
    if greater:
      last_activity_id = id

  if last_activity_id and last_activity_id != source.last_activity_id:
    source.updates['last_activity_id'] = last_activity_id

  # trim cache to just the returned activity ids, so that it doesn't grow
  # without bound. (WARNING: depends on get_activities_response()'s cache key
  # format, e.g. 'PREFIX ACTIVITY_ID'! see the short illustration after this
  # function.)
  source.updates['last_activities_cache_json'] = json_dumps(
    {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids})

  self.backfeed(source, responses, activities=activities)

  source.updates.update({
    'last_polled': source.last_poll_attempt,
    'poll_status': 'ok',
  })
  if etag and etag != source.last_activities_etag:
    source.updates['last_activities_etag'] = etag

  #
  # Possibly refetch updated syndication urls.
  #
  # if the author has added syndication urls since the first time
  # original_post_discovery ran, we'll miss them. this cleanup task will
  # periodically check for updated urls. only kicks in if the author has
  # *ever* published a rel=syndication url
  if source.should_refetch():
    logging.info('refetching h-feed for source %s', source.label())
    relationships = original_post_discovery.refetch(source)

    now = util.now_fn()
    source.updates['last_hfeed_refetch'] = now

    if relationships:
      logging.info('refetch h-feed found new rel=syndication relationships: %s',
                   relationships)
      try:
        self.repropagate_old_responses(source, relationships)
      except BaseException as e:
        if ('BadRequestError' in str(e.__class__) or
            'Timeout' in str(e.__class__) or
            util.is_connection_failure(e)):
          logging.info('Timeout while repropagating responses.', stack_info=True)
        else:
          raise
  else:
    logging.info(
      'skipping refetch h-feed. last-syndication-url %s, last-refetch %s',
      source.last_syndication_url, source.last_hfeed_refetch)
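# Illustration of the cache-trimming step above, under the assumption stated
# in its WARNING comment: cache keys look like 'PREFIX ACTIVITY_ID', so
# k.split()[-1] recovers the activity id. The keys and values here are made up.
cache = {'AGR 123': 'cached a', 'AGR 456': 'cached b'}
silo_activity_ids = {'123'}
trimmed = {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids}
assert trimmed == {'AGR 123': 'cached a'}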
def _run(self):
  """Returns CreationResult on success, None otherwise."""
  logging.info('Params: %s', self.request.params.items())
  assert self.PREVIEW in (True, False)

  # parse and validate target URL
  try:
    parsed = urlparse.urlparse(self.target_url())
  except BaseException:
    return self.error('Could not parse target URL %s' % self.target_url())

  domain = parsed.netloc
  path_parts = parsed.path.rsplit('/', 1)
  source_cls = SOURCE_NAMES.get(path_parts[-1])
  if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080') or
      len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
    return self.error(
      'Target must be brid.gy/publish/{facebook,flickr,twitter,instagram}')
  elif source_cls == GooglePlusPage:
    return self.error('Sorry, %s is not yet supported.' %
                      source_cls.GR_CLASS.NAME)

  # resolve source URL
  url, domain, ok = util.get_webmention_target(
    self.source_url(), replace_test_domains=False)
  # show nice error message if they're trying to publish a silo post
  if domain in SOURCE_DOMAINS:
    return self.error(
      "Looks like that's a %s URL. Try one from your web site instead!" %
      SOURCE_DOMAINS[domain].GR_CLASS.NAME)
  elif not ok:
    return self.error('Unsupported source URL %s' % url)
  elif not domain:
    return self.error('Could not parse source URL %s' % url)

  # look up source by domain
  domain = domain.lower()
  sources = source_cls.query().filter(source_cls.domains == domain).fetch(100)
  if not sources:
    return self.error(
      "Could not find <b>%(type)s</b> account for <b>%(domain)s</b>. Check that "
      "your %(type)s profile has %(domain)s in its <em>web site</em> or "
      "<em>link</em> field, then try signing up again." %
      {'type': source_cls.GR_CLASS.NAME, 'domain': domain})

  current_url = ''
  for source in sources:
    logging.info('Source: %s , features %s, status %s, poll status %s',
                 source.bridgy_url(self), source.features, source.status,
                 source.poll_status)
    if source.status != 'disabled' and 'publish' in source.features:
      # use a source that has a domain_url matching the url provided.
      # look through each source to find the one with the closest match.
      schemeless_url = util.schemeless(url.lower()).strip('/')
      for domain_url in source.domain_urls:
        schemeless_domain_url = util.schemeless(domain_url.lower()).strip('/')
        if (schemeless_url.startswith(schemeless_domain_url) and
            len(domain_url) > len(current_url)):
          self.source = source
          current_url = domain_url

  if not self.source:
    return self.error(
      'Publish is not enabled for your account. '
      'Please visit https://brid.gy and sign up!')

  content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
  if content_param in self.request.params:
    return self.error('The %s parameter is not supported' % content_param)

  # show nice error message if they're trying to publish their home page
  for domain_url in self.source.domain_urls:
    domain_url_parts = urlparse.urlparse(domain_url)
    source_url_parts = urlparse.urlparse(self.source_url())
    if (source_url_parts.netloc == domain_url_parts.netloc and
        source_url_parts.path.strip('/') == domain_url_parts.path.strip('/') and
        not source_url_parts.query):
      return self.error(
        "Looks like that's your home page. Try one of your posts instead!")

  # done with the sanity checks, ready to fetch the source url. create the
  # Publish entity so we can store the result.
  entity = self.get_or_add_publish_entity(url)
  if (entity.status == 'complete' and entity.type != 'preview' and
      not self.PREVIEW and not appengine_config.DEBUG):
    return self.error(
      "Sorry, you've already published that page, and Bridgy Publish doesn't "
      "yet support updating or deleting existing posts. Ping Ryan if you want "
      "that feature!")
  self.entity = entity

  # fetch source page
  resp = self.fetch_mf2(url)
  if not resp:
    return
  self.fetched, data = resp

  # find rel-shortlink, if any
  # http://microformats.org/wiki/rel-shortlink
  # https://github.com/snarfed/bridgy/issues/173
  soup = util.beautifulsoup_parse(self.fetched.text)
  shortlinks = (soup.find_all('link', rel='shortlink') +
                soup.find_all('a', rel='shortlink') +
                soup.find_all('a', class_='shortlink'))
  if shortlinks:
    self.shortlink = shortlinks[0]['href']

  # loop through each item and its children and try to preview/create it. if
  # it fails, try the next one. break after the first one that works.
  result = None
  types = set()
  queue = collections.deque(data.get('items', []))
  while queue:
    item = queue.popleft()
    item_types = set(item.get('type'))
    if 'h-feed' in item_types and 'h-entry' not in item_types:
      queue.extend(item.get('children', []))
      continue
    elif not item_types & PUBLISHABLE_TYPES:
      continue

    try:
      result = self.attempt_single_item(item)
      if self.entity.published:
        break
      if result.abort:
        if result.error_plain:
          self.error(result.error_plain, html=result.error_html, data=item)
        return
      # try the next item
      for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                       'like-of', 'in-reply-to'):
        if embedded in item.get('properties', []):
          item_types.add(embedded)
      logging.info('Object type(s) %s not supported; error=%s; trying next.',
                   item_types, result.error_plain)
      types = types.union(item_types)
      queue.extend(item.get('children', []))
    except BaseException as e:
      code, body = util.interpret_http_exception(e)
      mail = True
      if (not code or code == 500) and util.is_connection_failure(e):
        code = 502
        mail = False
      msg = '%s API error: %s %s' % (self.source.GR_CLASS.NAME, body or '', e)
      return self.error(msg, status=code or 500, mail=mail)
def poll(self, source):
  """Actually runs the poll.

  Stores property names and values to update in source.updates.
  """
  if source.last_activities_etag or source.last_activity_id:
    logging.debug("Using ETag %s, last activity id %s",
                  source.last_activities_etag, source.last_activity_id)

  #
  # Step 1: fetch activities:
  # * posts by the user
  # * search all posts for the user's domain URLs to find links
  #
  cache = util.CacheDict()
  if source.last_activities_cache_json:
    cache.update(json.loads(source.last_activities_cache_json))

  # search for links first so that the user's activities and responses
  # override them if they overlap
  links = source.search_for_links()

  # this user's own activities (and user mentions)
  resp = source.get_activities_response(
    fetch_replies=True, fetch_likes=True, fetch_shares=True,
    fetch_mentions=True, count=50,
    etag=source.last_activities_etag, min_id=source.last_activity_id,
    cache=cache,
  )
  etag = resp.get("etag")  # used later
  user_activities = resp.get("items", [])

  # these map ids to AS objects
  responses = {a["id"]: a for a in links}
  activities = {a["id"]: a for a in links + user_activities}

  # extract silo activity ids, update last_activity_id
  silo_activity_ids = set()
  last_activity_id = source.last_activity_id
  for id, activity in activities.items():
    # maybe replace stored last activity id
    parsed = util.parse_tag_uri(id)
    if parsed:
      id = parsed[1]
    silo_activity_ids.add(id)
    try:
      # try numeric comparison first
      greater = int(id) > int(last_activity_id)
    except (TypeError, ValueError):
      greater = id > last_activity_id
    if greater:
      last_activity_id = id

  if last_activity_id and last_activity_id != source.last_activity_id:
    source.updates["last_activity_id"] = last_activity_id

  # trim cache to just the returned activity ids, so that it doesn't grow
  # without bound. (WARNING: depends on get_activities_response()'s cache key
  # format, e.g. 'PREFIX ACTIVITY_ID'!)
  source.updates["last_activities_cache_json"] = json.dumps(
    {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids})

  # Cache to make sure we only fetch the author's h-feed(s) the
  # first time we see it
  fetched_hfeeds = set()

  # narrow down to just public activities
  public = {}
  private = {}
  for id, activity in activities.items():
    (public if source.is_activity_public(activity) else private)[id] = activity
  logging.info("Found %d public activities: %s", len(public), public.keys())
  logging.info("Found %d private activities: %s", len(private), private.keys())

  last_public_post = (source.last_public_post or util.EPOCH).isoformat()
  public_published = util.trim_nulls([a.get("published") for a in public.values()])
  if public_published:
    max_published = max(public_published)
    if max_published > last_public_post:
      last_public_post = max_published
      source.updates["last_public_post"] = util.as_utc(
        util.parse_iso8601(max_published))

  source.updates["recent_private_posts"] = len(
    [a for a in private.values()
     if a.get("published", util.EPOCH_ISO) > last_public_post])

  #
  # Step 2: extract responses, store their activities in response['activities']
  #
  # WARNING: this creates circular references in link posts found by search
  # queries in step 1, since they are their own activity. We use
  # prune_activity() and prune_response() in step 4 to remove these before
  # serializing to JSON.
  #
  for id, activity in public.items():
    obj = activity.get("object") or activity

    # handle user mentions
    user_id = source.user_tag_id()
    if obj.get("author", {}).get("id") != user_id:
      for tag in obj.get("tags", []):
        urls = tag.get("urls")
        if tag.get("objectType") == "person" and tag.get("id") == user_id and urls:
          activity["originals"], activity["mentions"] = \
            original_post_discovery.discover(
              source, activity, fetch_hfeed=True,
              include_redirect_sources=False,
              already_fetched_hfeeds=fetched_hfeeds)
          activity["mentions"].update(u.get("value") for u in urls)
          responses[id] = activity
          break

    # handle quote mentions
    for att in obj.get("attachments", []):
      if (att.get("objectType") in ("note", "article") and
          att.get("author", {}).get("id") == source.user_tag_id()):
        # now that we've confirmed that one exists, OPD will dig
        # into the actual attachments
        if "originals" not in activity or "mentions" not in activity:
          activity["originals"], activity["mentions"] = \
            original_post_discovery.discover(
              source, activity, fetch_hfeed=True,
              include_redirect_sources=False,
              already_fetched_hfeeds=fetched_hfeeds)
        responses[id] = activity
        break

    # extract replies, likes, reactions, reposts, and rsvps
    replies = obj.get("replies", {}).get("items", [])
    tags = obj.get("tags", [])
    likes = [t for t in tags if Response.get_type(t) == "like"]
    reactions = [t for t in tags if Response.get_type(t) == "react"]
    reposts = [t for t in tags if Response.get_type(t) == "repost"]
    rsvps = Source.get_rsvps_from_event(obj)

    # coalesce responses. drop any without ids
    for resp in replies + likes + reactions + reposts + rsvps:
      id = resp.get("id")
      if not id:
        logging.error("Skipping response without id: %s",
                      json.dumps(resp, indent=2))
        continue

      resp.setdefault("activities", []).append(activity)

      # when we find two responses with the same id, the earlier one may have
      # come from a link post or user mention, and this one is probably better
      # since it probably came from the user's activity, so prefer this one.
      # background: https://github.com/snarfed/bridgy/issues/533
      existing = responses.get(id)
      if existing:
        if source.gr_source.activity_changed(resp, existing, log=True):
          logging.warning("Got two different versions of same response!\n%s\n%s",
                          existing, resp)
        resp["activities"].extend(existing.get("activities", []))

      responses[id] = resp

  #
  # Step 3: filter out responses we've already seen
  #
  # seen responses (JSON objects) for each source are stored in its entity.
  unchanged_responses = []
  if source.seen_responses_cache_json:
    for seen in json.loads(source.seen_responses_cache_json):
      id = seen["id"]
      resp = responses.get(id)
      if resp and not source.gr_source.activity_changed(seen, resp, log=True):
        unchanged_responses.append(seen)
        del responses[id]

  #
  # Step 4: store new responses and enqueue propagate tasks
  #
  pruned_responses = []
  for id, resp in responses.items():
    resp_type = Response.get_type(resp)
    activities = resp.pop("activities", [])
    if not activities and resp_type == "post":
      activities = [resp]
    too_long = set()
    urls_to_activity = {}
    for i, activity in enumerate(activities):
      # we'll usually have multiple responses for the same activity, and the
      # objects in resp['activities'] are shared, so cache each activity's
      # discovered webmention targets inside its object.
      if "originals" not in activity or "mentions" not in activity:
        activity["originals"], activity["mentions"] = \
          original_post_discovery.discover(
            source, activity, fetch_hfeed=True,
            include_redirect_sources=False,
            already_fetched_hfeeds=fetched_hfeeds)

      targets = original_post_discovery.targets_for_response(
        resp, originals=activity["originals"], mentions=activity["mentions"])
      if targets:
        logging.info("%s has %d webmention target(s): %s",
                     activity.get("url"), len(targets), " ".join(targets))
      for t in targets:
        if len(t) <= _MAX_STRING_LENGTH:
          urls_to_activity[t] = i
        else:
          logging.warning("Giving up on target URL over %s chars! %s",
                          _MAX_STRING_LENGTH, t)
          too_long.add(t[:_MAX_STRING_LENGTH - 4] + "...")

    # store/update response entity. the prune_*() calls are important to
    # remove circular references in link responses, which are their own
    # activities. details in the step 2 comment above.
    pruned_response = util.prune_response(resp)
    pruned_responses.append(pruned_response)
    resp_entity = Response(
      id=id,
      source=source.key,
      activities_json=[json.dumps(util.prune_activity(a, source))
                       for a in activities],
      response_json=json.dumps(pruned_response),
      type=resp_type,
      unsent=list(urls_to_activity.keys()),
      failed=list(too_long),
      original_posts=resp.get("originals", []),
    )
    if urls_to_activity and len(activities) > 1:
      resp_entity.urls_to_activity = json.dumps(urls_to_activity)
    resp_entity.get_or_save(source)

  # update cache
  if pruned_responses:
    source.updates["seen_responses_cache_json"] = json.dumps(
      pruned_responses + unchanged_responses)

  source.updates.update({"last_polled": source.last_poll_attempt,
                         "poll_status": "ok"})
  if etag and etag != source.last_activities_etag:
    source.updates["last_activities_etag"] = etag

  #
  # Step 5. possibly refetch updated syndication urls
  #
  # if the author has added syndication urls since the first time
  # original_post_discovery ran, we'll miss them. this cleanup task will
  # periodically check for updated urls. only kicks in if the author has
  # *ever* published a rel=syndication url
  if source.should_refetch():
    logging.info("refetching h-feed for source %s", source.label())
    relationships = original_post_discovery.refetch(source)

    now = util.now_fn()
    source.updates["last_hfeed_refetch"] = now

    if relationships:
      logging.info("refetch h-feed found new rel=syndication relationships: %s",
                   relationships)
      try:
        self.repropagate_old_responses(source, relationships)
      except BaseException as e:
        if (isinstance(e, (datastore_errors.BadRequestError,
                           datastore_errors.Timeout)) or
            util.is_connection_failure(e)):
          logging.info("Timeout while repropagating responses.", exc_info=True)
        else:
          raise
def post(self, source_short_name):
  logging.info('Params: %s', list(self.request.params.items()))

  # strip fragments from source and target url
  self.source_url = urllib.parse.urldefrag(util.get_required_param(self, 'source'))[0]
  self.target_url = urllib.parse.urldefrag(util.get_required_param(self, 'target'))[0]

  # follow target url through any redirects, strip utm_* query params
  resp = util.follow_redirects(self.target_url)
  redirected_target_urls = [r.url for r in resp.history]
  self.target_url = util.clean_url(resp.url)

  # parse and validate target URL
  domain = util.domain_from_link(self.target_url)
  if not domain:
    return self.error('Could not parse target URL %s' % self.target_url)

  # look up source by domain
  source_cls = models.sources[source_short_name]
  domain = domain.lower()
  self.source = (source_cls.query()
                 .filter(source_cls.domains == domain)
                 .filter(source_cls.features == 'webmention')
                 .filter(source_cls.status == 'enabled')
                 .get())
  if not self.source:
    # check for a rel-canonical link. Blogger uses these when it serves a post
    # from multiple domains, e.g country TLDs like epeus.blogspot.co.uk vs
    # epeus.blogspot.com.
    # https://github.com/snarfed/bridgy/issues/805
    mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
    if not mf2:
      # fetch_mf2() already wrote the error response
      return
    domains = util.dedupe_urls(
      util.domain_from_link(url) for url in mf2[1]['rels'].get('canonical', []))
    if domains:
      self.source = (source_cls.query()
                     .filter(source_cls.domains.IN(domains))
                     .filter(source_cls.features == 'webmention')
                     .filter(source_cls.status == 'enabled')
                     .get())

  if not self.source:
    return self.error(
      'Could not find %s account for %s. Is it registered with Bridgy?' %
      (source_cls.GR_CLASS.NAME, domain))

  # check that the target URL path is supported
  target_path = urllib.parse.urlparse(self.target_url).path
  if target_path in ('', '/'):
    return self.error('Home page webmentions are not currently supported.',
                      status=202)
  for pattern in self.source.PATH_BLOCKLIST:
    if pattern.match(target_path):
      return self.error('%s webmentions are not supported for URL path: %s' %
                        (self.source.GR_CLASS.NAME, target_path), status=202)

  # create BlogWebmention entity
  id = '%s %s' % (self.source_url, self.target_url)
  self.entity = BlogWebmention.get_or_insert(
    id, source=self.source.key, redirected_target_urls=redirected_target_urls)
  if self.entity.status == 'complete':
    # TODO: response message saying update isn't supported
    self.response.write(self.entity.published)
    return
  logging.debug("BlogWebmention entity: '%s'", self.entity.key.urlsafe().decode())

  # fetch source page
  fetched = self.fetch_mf2(self.source_url)
  if not fetched:
    return
  resp, mf2 = fetched

  item = self.find_mention_item(mf2.get('items', []))
  if not item:
    return self.error(
      'Could not find target URL %s in source page %s' %
      (self.target_url, resp.url),
      data=mf2, log_exception=False)

  # default author to target domain
  author_name = domain
  author_url = 'http://%s/' % domain

  # extract author name and URL from h-card, if any
  props = item['properties']
  author = first_value(props, 'author')
  if author:
    if isinstance(author, str):
      author_name = author
    else:
      author_props = author.get('properties', {})
      author_name = first_value(author_props, 'name')
      author_url = first_value(author_props, 'url')

  # if present, u-url overrides source url
  u_url = first_value(props, 'url')
  if u_url:
    self.entity.u_url = u_url

  # generate content
  content = props['content'][0]  # find_mention_item() guaranteed this is here
  text = (content.get('html') or content.get('value')).strip()
  source_url = self.entity.source_url()
  text += ' <br /> <a href="%s">via %s</a>' % (
    source_url, util.domain_from_link(source_url))

  # write comment
  try:
    self.entity.published = self.source.create_comment(
      self.target_url, author_name, author_url, text)
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    msg = 'Error: %s %s; %s' % (code, e, body)
    if code == '401':
      logging.warning('Disabling source due to: %s' % e, stack_info=True)
      self.source.status = 'disabled'
      self.source.put()
      return self.error(msg, status=code, report=self.source.is_beta_user())
    elif code == '404':
      # post is gone
      return self.error(msg, status=code, report=False)
    elif util.is_connection_failure(e) or (code and int(code) // 100 == 5):
      return self.error(msg, status=util.ERROR_HTTP_RETURN_CODE, report=False)
    elif code or body:
      return self.error(msg, status=code, report=True)
    else:
      raise

  # write results to datastore
  self.entity.status = 'complete'
  self.entity.put()
  self.response.write(json_dumps(self.entity.published))
class Poll(webapp2.RequestHandler):
  """Task handler that fetches and processes new responses from a single source.

  Request parameters:
    source_key: string key of source entity
    last_polled: timestamp, YYYY-MM-DD-HH-MM-SS

  Inserts a propagate task for each response that hasn't been seen before.
  """

  def post(self, *path_args):
    logging.debug('Params: %s', self.request.params)

    key = self.request.params['source_key']
    source = ndb.Key(urlsafe=key).get()
    if not source or source.status == 'disabled' or 'listen' not in source.features:
      logging.error('Source not found or disabled. Dropping task.')
      return
    logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
                 source.bridgy_url(self))

    last_polled = self.request.params['last_polled']
    if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT):
      logging.warning('duplicate poll task! deferring to the other task.')
      return

    logging.info('Last poll: %s/log?start_time=%s&key=%s',
                 self.request.host_url,
                 calendar.timegm(source.last_poll_attempt.utctimetuple()),
                 source.key.urlsafe())

    # mark this source as polling
    source.updates = {
      'poll_status': 'polling',
      'last_poll_attempt': util.now_fn(),
    }
    source = models.Source.put_updates(source)

    source.updates = {}
    try:
      self.poll(source)
    except models.DisableSource:
      # the user deauthorized the bridgy app, so disable this source.
      # let the task complete successfully so that it's not retried.
      source.updates['status'] = 'disabled'
      logging.warning('Disabling source!')
    except:
      source.updates['poll_status'] = 'error'
      raise
    finally:
      source = models.Source.put_updates(source)

    # add new poll task. randomize task ETA to within +/- 20% to try to spread
    # out tasks and prevent thundering herds.
    task_countdown = source.poll_period().total_seconds() * random.uniform(.8, 1.2)
    util.add_poll_task(source, countdown=task_countdown)

    # feeble attempt to avoid hitting the instance memory limit
    source = None
    gc.collect()

  def poll(self, source):
    """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
    if source.last_activities_etag or source.last_activity_id:
      logging.debug('Using ETag %s, last activity id %s',
                    source.last_activities_etag, source.last_activity_id)

    #
    # Step 1: fetch activities:
    # * posts by the user
    # * search all posts for the user's domain URLs to find links
    #
    cache = util.CacheDict()
    if source.last_activities_cache_json:
      cache.update(json.loads(source.last_activities_cache_json))

    try:
      # search for links first so that the user's activities and responses
      # override them if they overlap
      links = source.search_for_links()

      # this user's own activities (and user mentions)
      resp = source.get_activities_response(
        fetch_replies=True, fetch_likes=True, fetch_shares=True,
        fetch_mentions=True, count=50,
        etag=source.last_activities_etag, min_id=source.last_activity_id,
        cache=cache)
      etag = resp.get('etag')  # used later
      user_activities = resp.get('items', [])

      # these map ids to AS objects
      responses = {a['id']: a for a in links}
      activities = {a['id']: a for a in links + user_activities}
    except Exception as e:
      code, body = util.interpret_http_exception(e)
      if code == '401':
        msg = 'Unauthorized error: %s' % e
        logging.warning(msg, exc_info=True)
        source.updates['poll_status'] = 'ok'
        raise models.DisableSource(msg)
      elif code in util.HTTP_RATE_LIMIT_CODES:
        logging.warning('Rate limited. Marking as error and finishing. %s', e)
        source.updates.update({
          'poll_status': 'error',
          'rate_limited': True,
        })
        return
      elif (code and int(code) // 100 == 5) or util.is_connection_failure(e):
        logging.error('API call failed. Marking as error and finishing. %s: %s\n%s',
                      code, body, e)
        self.abort(ERROR_HTTP_RETURN_CODE)
      else:
        raise

    # extract silo activity ids, update last_activity_id
    silo_activity_ids = set()
    last_activity_id = source.last_activity_id
    for id, activity in activities.items():
      # maybe replace stored last activity id
      parsed = util.parse_tag_uri(id)
      if parsed:
        id = parsed[1]
      silo_activity_ids.add(id)
      try:
        # try numeric comparison first
        greater = int(id) > int(last_activity_id)
      except (TypeError, ValueError):
        greater = id > last_activity_id
      if greater:
        last_activity_id = id

    if last_activity_id and last_activity_id != source.last_activity_id:
      source.updates['last_activity_id'] = last_activity_id

    # trim cache to just the returned activity ids, so that it doesn't grow
    # without bound. (WARNING: depends on get_activities_response()'s cache key
    # format, e.g. 'PREFIX ACTIVITY_ID'!)
    source.updates['last_activities_cache_json'] = json.dumps(
      {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids})

    # Cache to make sure we only fetch the author's h-feed(s) the
    # first time we see it
    fetched_hfeeds = set()

    # narrow down to just public activities
    public = {}
    private = {}
    for id, activity in activities.items():
      (public if source.is_activity_public(activity) else private)[id] = activity
    logging.info('Found %d public activities: %s', len(public), public.keys())
    logging.info('Found %d private activities: %s', len(private), private.keys())

    last_public_post = (source.last_public_post or util.EPOCH).isoformat()
    public_published = util.trim_nulls(
      [a.get('published') for a in public.values()])
    if public_published:
      max_published = max(public_published)
      if max_published > last_public_post:
        last_public_post = max_published
        source.updates['last_public_post'] = \
          util.as_utc(util.parse_iso8601(max_published))

    source.updates['recent_private_posts'] = \
      len([a for a in private.values()
           if a.get('published', util.EPOCH_ISO) > last_public_post])

    #
    # Step 2: extract responses, store their activities in response['activities']
    #
    # WARNING: this creates circular references in link posts found by search
    # queries in step 1, since they are their own activity. We use
    # prune_activity() and prune_response() in step 4 to remove these before
    # serializing to JSON.
    #
    for id, activity in public.items():
      obj = activity.get('object') or activity

      # handle user mentions
      user_id = source.user_tag_id()
      if obj.get('author', {}).get('id') != user_id:
        for tag in obj.get('tags', []):
          urls = tag.get('urls')
          if tag.get('objectType') == 'person' and tag.get('id') == user_id and urls:
            activity['originals'], activity['mentions'] = \
              original_post_discovery.discover(
                source, activity, fetch_hfeed=True,
                include_redirect_sources=False,
                already_fetched_hfeeds=fetched_hfeeds)
            activity['mentions'].update(u.get('value') for u in urls)
            responses[id] = activity
            break

      # handle quote mentions
      for att in obj.get('attachments', []):
        if (att.get('objectType') in ('note', 'article') and
            att.get('author', {}).get('id') == source.user_tag_id()):
          # now that we've confirmed that one exists, OPD will dig
          # into the actual attachments
          if 'originals' not in activity or 'mentions' not in activity:
            activity['originals'], activity['mentions'] = \
              original_post_discovery.discover(
                source, activity, fetch_hfeed=True,
                include_redirect_sources=False,
                already_fetched_hfeeds=fetched_hfeeds)
          responses[id] = activity
          break

      # extract replies, likes, reactions, reposts, and rsvps
      replies = obj.get('replies', {}).get('items', [])
      tags = obj.get('tags', [])
      likes = [t for t in tags if Response.get_type(t) == 'like']
      reactions = [t for t in tags if Response.get_type(t) == 'react']
      reposts = [t for t in tags if Response.get_type(t) == 'repost']
      rsvps = Source.get_rsvps_from_event(obj)

      # coalesce responses. drop any without ids
      for resp in replies + likes + reactions + reposts + rsvps:
        id = resp.get('id')
        if not id:
          logging.error('Skipping response without id: %s',
                        json.dumps(resp, indent=2))
          continue

        resp.setdefault('activities', []).append(activity)

        # when we find two responses with the same id, the earlier one may have
        # come from a link post or user mention, and this one is probably better
        # since it probably came from the user's activity, so prefer this one.
        # background: https://github.com/snarfed/bridgy/issues/533
        existing = responses.get(id)
        if existing:
          if source.gr_source.activity_changed(resp, existing, log=True):
            logging.warning('Got two different versions of same response!\n%s\n%s',
                            existing, resp)
          resp['activities'].extend(existing.get('activities', []))

        responses[id] = resp

    #
    # Step 3: filter out responses we've already seen
    #
    # seen responses (JSON objects) for each source are stored in its entity.
    unchanged_responses = []
    if source.seen_responses_cache_json:
      for seen in json.loads(source.seen_responses_cache_json):
        id = seen['id']
        resp = responses.get(id)
        if resp and not source.gr_source.activity_changed(seen, resp, log=True):
          unchanged_responses.append(seen)
          del responses[id]

    #
    # Step 4: store new responses and enqueue propagate tasks
    #
    pruned_responses = []
    for id, resp in responses.items():
      resp_type = Response.get_type(resp)
      activities = resp.pop('activities', [])
      if not activities and resp_type == 'post':
        activities = [resp]
      too_long = set()
      urls_to_activity = {}
      for i, activity in enumerate(activities):
        # we'll usually have multiple responses for the same activity, and the
        # objects in resp['activities'] are shared, so cache each activity's
        # discovered webmention targets inside its object.
        if 'originals' not in activity or 'mentions' not in activity:
          activity['originals'], activity['mentions'] = \
            original_post_discovery.discover(
              source, activity, fetch_hfeed=True,
              include_redirect_sources=False,
              already_fetched_hfeeds=fetched_hfeeds)

        targets = original_post_discovery.targets_for_response(
          resp, originals=activity['originals'], mentions=activity['mentions'])
        if targets:
          logging.info('%s has %d webmention target(s): %s',
                       activity.get('url'), len(targets), ' '.join(targets))
        for t in targets:
          if len(t) <= _MAX_STRING_LENGTH:
            urls_to_activity[t] = i
          else:
            logging.warning('Giving up on target URL over %s chars! %s',
                            _MAX_STRING_LENGTH, t)
            too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

      # store/update response entity. the prune_*() calls are important to
      # remove circular references in link responses, which are their own
      # activities. details in the step 2 comment above.
      pruned_response = util.prune_response(resp)
      pruned_responses.append(pruned_response)
      resp_entity = Response(
        id=id,
        source=source.key,
        activities_json=[json.dumps(util.prune_activity(a, source))
                         for a in activities],
        response_json=json.dumps(pruned_response),
        type=resp_type,
        unsent=list(urls_to_activity.keys()),
        failed=list(too_long),
        original_posts=resp.get('originals', []))
      if urls_to_activity and len(activities) > 1:
        resp_entity.urls_to_activity = json.dumps(urls_to_activity)
      resp_entity.get_or_save(source)

    # update cache
    if pruned_responses:
      source.updates['seen_responses_cache_json'] = json.dumps(
        pruned_responses + unchanged_responses)

    source.updates.update({
      'last_polled': source.last_poll_attempt,
      'poll_status': 'ok',
    })
    if etag and etag != source.last_activities_etag:
      source.updates['last_activities_etag'] = etag

    #
    # Step 5. possibly refetch updated syndication urls
    #
    # if the author has added syndication urls since the first time
    # original_post_discovery ran, we'll miss them. this cleanup task will
    # periodically check for updated urls. only kicks in if the author has
    # *ever* published a rel=syndication url
    if source.should_refetch():
      logging.info('refetching h-feed for source %s', source.label())
      relationships = original_post_discovery.refetch(source)

      now = util.now_fn()
      source.updates['last_hfeed_refetch'] = now

      if relationships:
        logging.info('refetch h-feed found new rel=syndication relationships: %s',
                     relationships)
        try:
          self.repropagate_old_responses(source, relationships)
        except BaseException as e:
          if (isinstance(e, (datastore_errors.BadRequestError,
                             datastore_errors.Timeout)) or
              util.is_connection_failure(e)):
            logging.info('Timeout while repropagating responses.', exc_info=True)
          else:
            raise
def get(self, type, source_short_name, string_id, *ids):
  source_cls = models.sources.get(source_short_name)
  if not source_cls:
    self.abort(400, "Source type '%s' not found. Known sources: %s" %
               (source_short_name, filter(None, models.sources.keys())))

  self.source = source_cls.get_by_id(string_id)
  if not self.source:
    self.abort(400, 'Source %s %s not found' % (source_short_name, string_id))
  elif (self.source.status == 'disabled' or
        'listen' not in self.source.features):
    self.abort(400, 'Source %s is disabled for backfeed' %
               self.source.bridgy_path())

  format = self.request.get('format', 'html')
  if format not in ('html', 'json'):
    self.abort(400, 'Invalid format %s, expected html or json' % format)

  for id in ids:
    if not self.VALID_ID.match(id):
      self.abort(404, 'Invalid id %s' % id)

  label = '%s:%s %s %s' % (source_short_name, string_id, type, ids)
  cache_key = 'H ' + label
  obj = memcache.get(cache_key)
  if obj:
    logging.info('Using cached object for %s', label)
  else:
    logging.info('Fetching %s', label)
    try:
      obj = self.get_item(*ids)
    except models.DisableSource as e:
      self.abort(401, "Bridgy's access to your account has expired. "
                 'Please visit https://brid.gy/ to refresh it!')
    except Exception as e:
      # pass through all API HTTP errors if we can identify them
      code, body = util.interpret_http_exception(e)
      if not code and util.is_connection_failure(e):
        code = 503
        body = str(e)
      if code:
        self.response.status_int = int(code)
        self.response.headers['Content-Type'] = 'text/plain'
        self.response.write('%s error:\n%s' % (self.source.GR_CLASS.NAME, body))
        return
      else:
        raise
    memcache.set(cache_key, obj, time=CACHE_TIME)

  if not obj:
    self.abort(404, label)

  # use https for profile pictures so we don't cause SSL mixed mode errors
  # when serving over https.
  author = obj.get('author', {})
  image = author.get('image', {})
  url = image.get('url')
  if url:
    image['url'] = util.update_scheme(url, self)

  mf2_json = microformats2.object_to_json(obj, synthesize_content=False)

  # try to include the author's silo profile url
  author = first_props(mf2_json.get('properties', {})).get('author', {})
  author_uid = first_props(author.get('properties', {})).get('uid', '')
  if author_uid:
    parsed = util.parse_tag_uri(author_uid)
    if parsed:
      silo_url = self.source.gr_source.user_url(parsed[1])
      urls = author.get('properties', {}).setdefault('url', [])
      if silo_url not in microformats2.get_string_urls(urls):
        urls.append(silo_url)

  # write the response!
  self.response.headers['Access-Control-Allow-Origin'] = '*'
  if format == 'html':
    self.response.headers['Content-Type'] = 'text/html; charset=utf-8'
    self.response.out.write(TEMPLATE.substitute({
      'url': obj.get('url', ''),
      'body': microformats2.json_to_html(mf2_json),
      'title': self.get_title(obj),
    }))
  elif format == 'json':
    self.response.headers['Content-Type'] = 'application/json; charset=utf-8'
    self.response.out.write(json.dumps(mf2_json, indent=2))
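
# The caching above is a plain read-through memcache pattern. A minimal
# sketch, assuming CACHE_TIME is the module-level TTL used by the handler;
# fetch_fn is a hypothetical stand-in for self.get_item().

def cached_fetch(label, fetch_fn):
  key = 'H ' + label
  obj = memcache.get(key)
  if obj is not None:
    logging.info('Using cached object for %s', label)
    return obj
  logging.info('Fetching %s', label)
  obj = fetch_fn()
  memcache.set(key, obj, time=CACHE_TIME)
  return obj
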
def poll(self, source):
  """Actually runs the poll.

  Stores property names and values to update in source.updates.
  """
  if source.last_activities_etag or source.last_activity_id:
    logging.debug('Using ETag %s, last activity id %s',
                  source.last_activities_etag, source.last_activity_id)

  #
  # Step 1: fetch activities:
  # * posts by the user
  # * search all posts for the user's domain URLs to find links
  #
  cache = util.CacheDict()
  if source.last_activities_cache_json:
    cache.update(json.loads(source.last_activities_cache_json))

  try:
    # search for links first so that the user's activities and responses
    # override them if they overlap
    links = source.search_for_links()

    # this user's own activities (and user mentions)
    resp = source.get_activities_response(
      fetch_replies=True, fetch_likes=True, fetch_shares=True,
      fetch_mentions=True, count=50, etag=source.last_activities_etag,
      min_id=source.last_activity_id, cache=cache)
    etag = resp.get('etag')  # used later
    user_activities = resp.get('items', [])

    # these map ids to AS objects
    responses = {a['id']: a for a in links}
    activities = {a['id']: a for a in links + user_activities}

  except Exception as e:
    code, body = util.interpret_http_exception(e)
    if code == '401':
      msg = 'Unauthorized error: %s' % e
      logging.warning(msg, exc_info=True)
      source.updates['poll_status'] = 'ok'
      raise models.DisableSource(msg)
    elif code in util.HTTP_RATE_LIMIT_CODES:
      logging.warning('Rate limited. Marking as error and finishing. %s', e)
      source.updates.update({'poll_status': 'error', 'rate_limited': True})
      return
    elif (code and int(code) // 100 == 5) or util.is_connection_failure(e):
      logging.error('API call failed. Marking as error and finishing. %s: %s\n%s',
                    code, body, e)
      self.abort(ERROR_HTTP_RETURN_CODE)
    else:
      raise
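
# For reference, the error triage in the except clause above, reduced to a
# sketch. interpret_http_exception() returns a (status code string, body)
# pair for HTTP errors it recognizes, and (None, None) otherwise.

def triage_poll_error(e):
  """Returns 'disable', 'rate_limited', or 'retry'; None means re-raise."""
  code, _ = util.interpret_http_exception(e)
  if code == '401':
    return 'disable'          # credentials revoked: disable the source
  elif code in util.HTTP_RATE_LIMIT_CODES:
    return 'rate_limited'     # back off until the next poll
  elif (code and int(code) // 100 == 5) or util.is_connection_failure(e):
    return 'retry'            # transient: abort so the task queue retries
  return None
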
def post(self, *path_args):
  logging.debug('Params: %s', self.request.params)

  key = self.request.params['source_key']
  source = ndb.Key(urlsafe=key).get()
  if not source or source.status == 'disabled' or 'listen' not in source.features:
    logging.error('Source not found or disabled. Dropping task.')
    return
  logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
               source.bridgy_url(self))

  last_polled = self.request.params['last_polled']
  if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT):
    logging.warning('duplicate poll task! deferring to the other task.')
    return

  logging.info('Last poll: %s', self._last_poll_url(source))

  # mark this source as polling
  source.updates = {
    'poll_status': 'polling',
    'last_poll_attempt': util.now_fn(),
    'rate_limited': False,
  }
  source = models.Source.put_updates(source)

  source.updates = {}
  try:
    self.poll(source)
  except Exception as e:
    source.updates['poll_status'] = 'error'
    code, body = util.interpret_http_exception(e)
    if code in source.DISABLE_HTTP_CODES or isinstance(e, models.DisableSource):
      # the user deauthorized the bridgy app, so disable this source.
      # let the task complete successfully so that it's not retried.
      logging.warning('Disabling source due to: %s', e, exc_info=True)
      source.updates.update({
        'status': 'disabled',
        'poll_status': 'ok',
      })
      body = '%s\nLast poll: %s' % (source.bridgy_url(self),
                                    self._last_poll_url(source))
    elif code in source.RATE_LIMIT_HTTP_CODES:
      logging.info('Rate limited. Marking as error and finishing. %s', e)
      source.updates['rate_limited'] = True
    elif ((code and int(code) // 100 == 5) or
          code in source.TRANSIENT_ERROR_HTTP_CODES or
          util.is_connection_failure(e)):
      logging.error('API call failed. Marking as error and finishing. %s: %s\n%s',
                    code, body, e)
      self.abort(util.ERROR_HTTP_RETURN_CODE)
    else:
      raise
  finally:
    source = models.Source.put_updates(source)

  # add new poll task. randomize task ETA to within +/- 20% to try to spread
  # out tasks and prevent thundering herds.
  task_countdown = source.poll_period().total_seconds() * random.uniform(.8, 1.2)
  util.add_poll_task(source, countdown=task_countdown)

  # feeble attempt to avoid hitting the instance memory limit
  source = None
  gc.collect()
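
# The randomized poll ETA above, in isolation: a +/-20% jitter on the
# source's poll period, so e.g. a 30 minute period yields a countdown
# between 24 and 36 minutes. The default period here is illustrative.

import random
from datetime import timedelta

def jittered_countdown(period=timedelta(minutes=30)):
  """Returns a countdown in seconds, jittered to spread out poll tasks."""
  return period.total_seconds() * random.uniform(.8, 1.2)
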
def post(self, source_short_name):
  logging.info('Params: %s', self.request.params.items())

  # strip fragments from source and target url
  self.source_url = urlparse.urldefrag(util.get_required_param(self, 'source'))[0]
  self.target_url = urlparse.urldefrag(util.get_required_param(self, 'target'))[0]

  # follow target url through any redirects, strip utm_* query params
  resp = util.follow_redirects(self.target_url)
  redirected_target_urls = [r.url for r in resp.history]
  self.target_url = util.clean_url(resp.url)

  # parse and validate target URL
  domain = util.domain_from_link(self.target_url)
  if not domain:
    return self.error('Could not parse target URL %s' % self.target_url)

  # look up source by domain
  source_cls = models.sources[source_short_name]
  domain = domain.lower()
  self.source = (source_cls.query()
                 .filter(source_cls.domains == domain)
                 .filter(source_cls.features == 'webmention')
                 .filter(source_cls.status == 'enabled')
                 .get())
  if not self.source:
    # check for a rel-canonical link. Blogger uses these when it serves a post
    # from multiple domains, e.g. country TLDs like epeus.blogspot.co.uk vs
    # epeus.blogspot.com.
    # https://github.com/snarfed/bridgy/issues/805
    mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
    if not mf2:
      # fetch_mf2() already wrote the error response
      return
    domains = util.dedupe_urls(
      util.domain_from_link(url)
      for url in mf2[1].get('rels', {}).get('canonical', []))
    if domains:
      self.source = (source_cls.query()
                     .filter(source_cls.domains.IN(domains))
                     .filter(source_cls.features == 'webmention')
                     .filter(source_cls.status == 'enabled')
                     .get())

  if not self.source:
    return self.error(
      'Could not find %s account for %s. Is it registered with Bridgy?' %
      (source_cls.GR_CLASS.NAME, domain))

  # check that the target URL path is supported
  target_path = urlparse.urlparse(self.target_url).path
  if target_path in ('', '/'):
    return self.error('Home page webmentions are not currently supported.',
                      status=202)
  for pattern in self.source.PATH_BLACKLIST:
    if pattern.match(target_path):
      return self.error('%s webmentions are not supported for URL path: %s' %
                        (self.source.GR_CLASS.NAME, target_path), status=202)

  # create BlogWebmention entity
  id = '%s %s' % (self.source_url, self.target_url)
  self.entity = BlogWebmention.get_or_insert(
    id, source=self.source.key, redirected_target_urls=redirected_target_urls)
  if self.entity.status == 'complete':
    # TODO: response message saying update isn't supported
    self.response.write(self.entity.published)
    return
  logging.debug("BlogWebmention entity: '%s'", self.entity.key.urlsafe())

  # fetch source page
  resp = self.fetch_mf2(self.source_url)
  if not resp:
    return
  self.fetched, data = resp

  item = self.find_mention_item(data.get('items', []))
  if not item:
    return self.error('Could not find target URL %s in source page %s' %
                      (self.target_url, self.fetched.url),
                      data=data, log_exception=False)

  # default author to target domain
  author_name = domain
  author_url = 'http://%s/' % domain

  # extract author name and URL from h-card, if any
  props = item['properties']
  author = first_value(props, 'author')
  if author:
    if isinstance(author, basestring):
      author_name = author
    else:
      author_props = author.get('properties', {})
      author_name = first_value(author_props, 'name')
      author_url = first_value(author_props, 'url')

  # if present, u-url overrides source url
  u_url = first_value(props, 'url')
  if u_url:
    self.entity.u_url = u_url

  # generate content
  content = props['content'][0]  # find_mention_item() guaranteed this is here
  text = (content.get('html') or content.get('value')).strip()
  source_url = self.entity.source_url()
  text += ' <br /> <a href="%s">via %s</a>' % (
    source_url, util.domain_from_link(source_url))

  # write comment
  try:
    self.entity.published = self.source.create_comment(
      self.target_url, author_name, author_url, text)
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    msg = 'Error: %s %s; %s' % (code, e, body)
    if code == '401':
      logging.warning('Disabling source due to: %s', e, exc_info=True)
      self.source.status = 'disabled'
      self.source.put()
      return self.error(msg, status=code, mail=self.source.is_beta_user())
    elif code == '404':
      # post is gone
      return self.error(msg, status=code, mail=False)
    elif util.is_connection_failure(e) or (code and int(code) // 100 == 5):
      return self.error(msg, status=util.ERROR_HTTP_RETURN_CODE, mail=False)
    elif code or body:
      return self.error(msg, status=code, mail=True)
    else:
      raise

  # write results to datastore
  self.entity.status = 'complete'
  self.entity.put()
  self.response.write(json.dumps(self.entity.published))
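
# The h-card author fallback above, restated as a standalone sketch over
# plain mf2 dicts (author_from_item and its list handling are illustrative,
# not from the source). mf2 property values are lists; author may be a bare
# string or a nested h-card, and missing pieces fall back to the target
# domain.

def author_from_item(item, domain):
  author_name, author_url = domain, 'http://%s/' % domain
  author = (item.get('properties', {}).get('author') or [None])[0]
  if isinstance(author, basestring):
    author_name = author
  elif author:
    props = author.get('properties', {})
    author_name = (props.get('name') or [author_name])[0]
    author_url = (props.get('url') or [author_url])[0]
  return author_name, author_url
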
def poll(self, source):
  """Actually runs the poll.

  Stores property names and values to update in source.updates.
  """
  if source.last_activities_etag or source.last_activity_id:
    logging.debug('Using ETag %s, last activity id %s',
                  source.last_activities_etag, source.last_activity_id)

  #
  # Step 1: fetch activities:
  # * posts by the user
  # * search all posts for the user's domain URLs to find links
  #
  cache = util.CacheDict()
  if source.last_activities_cache_json:
    cache.update(json.loads(source.last_activities_cache_json))

  # search for links first so that the user's activities and responses
  # override them if they overlap
  links = source.search_for_links()

  # this user's own activities (and user mentions)
  resp = source.get_activities_response(
    fetch_replies=True, fetch_likes=True, fetch_shares=True,
    fetch_mentions=True, count=50, etag=source.last_activities_etag,
    min_id=source.last_activity_id, cache=cache)
  etag = resp.get('etag')  # used later
  user_activities = resp.get('items', [])

  # these map ids to AS objects
  responses = {a['id']: a for a in links}
  activities = {a['id']: a for a in links + user_activities}

  # extract silo activity ids, update last_activity_id
  silo_activity_ids = set()
  last_activity_id = source.last_activity_id
  for id, activity in activities.items():
    # maybe replace stored last activity id
    parsed = util.parse_tag_uri(id)
    if parsed:
      id = parsed[1]
    silo_activity_ids.add(id)
    try:
      # try numeric comparison first
      greater = int(id) > int(last_activity_id)
    except (TypeError, ValueError):
      greater = id > last_activity_id
    if greater:
      last_activity_id = id

  if last_activity_id and last_activity_id != source.last_activity_id:
    source.updates['last_activity_id'] = last_activity_id

  # trim cache to just the returned activity ids, so that it doesn't grow
  # without bound. (WARNING: depends on get_activities_response()'s cache key
  # format, e.g. 'PREFIX ACTIVITY_ID'!)
  source.updates['last_activities_cache_json'] = json.dumps(
    {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids})

  self.backfeed(source, responses, activities=activities)

  source.updates.update({'last_polled': source.last_poll_attempt,
                         'poll_status': 'ok'})
  if etag and etag != source.last_activities_etag:
    source.updates['last_activities_etag'] = etag

  #
  # Possibly refetch updated syndication urls.
  #
  # if the author has added syndication urls since the first time
  # original_post_discovery ran, we'll miss them. this cleanup task will
  # periodically check for updated urls. only kicks in if the author has
  # *ever* published a rel=syndication url
  if source.should_refetch():
    logging.info('refetching h-feed for source %s', source.label())
    relationships = original_post_discovery.refetch(source)

    now = util.now_fn()
    source.updates['last_hfeed_refetch'] = now

    if relationships:
      logging.info('refetch h-feed found new rel=syndication relationships: %s',
                   relationships)
      try:
        self.repropagate_old_responses(source, relationships)
      except BaseException as e:
        if (isinstance(e, (datastore_errors.BadRequestError,
                           datastore_errors.Timeout)) or
            util.is_connection_failure(e)):
          logging.info('Timeout while repropagating responses.', exc_info=True)
        else:
          raise
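
# The last activity id comparison above, pulled out for clarity: silo ids
# are compared numerically when both parse as ints, lexicographically
# otherwise, since e.g. '9' > '10' as strings but 9 < 10 as numbers.

def is_newer_activity_id(id, last_id):
  try:
    return int(id) > int(last_id)
  except (TypeError, ValueError):
    return id > last_id
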
def dispatch_request(self, site):
  logger.info(f'Params: {list(request.values.items())}')

  # strip fragments from source and target url
  self.source_url = urllib.parse.urldefrag(request.form['source'])[0]
  self.target_url = urllib.parse.urldefrag(request.form['target'])[0]

  # follow target url through any redirects, strip utm_* query params
  resp = util.follow_redirects(self.target_url)
  redirected_target_urls = [r.url for r in resp.history]
  self.target_url = util.clean_url(resp.url)

  # parse and validate target URL
  domain = util.domain_from_link(self.target_url)
  if not domain:
    self.error(f'Could not parse target URL {self.target_url}')

  # look up source by domain
  source_cls = models.sources[site]
  domain = domain.lower()
  self.source = (source_cls.query()
                 .filter(source_cls.domains == domain)
                 .filter(source_cls.features == 'webmention')
                 .filter(source_cls.status == 'enabled')
                 .get())
  if not self.source:
    # check for a rel-canonical link. Blogger uses these when it serves a post
    # from multiple domains, e.g. country TLDs like epeus.blogspot.co.uk vs
    # epeus.blogspot.com.
    # https://github.com/snarfed/bridgy/issues/805
    mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
    if not mf2:
      # fetch_mf2() already wrote the error response
      return
    domains = util.dedupe_urls(
      util.domain_from_link(url)
      for url in mf2[1]['rels'].get('canonical', []))
    if domains:
      self.source = (source_cls.query()
                     .filter(source_cls.domains.IN(domains))
                     .filter(source_cls.features == 'webmention')
                     .filter(source_cls.status == 'enabled')
                     .get())

  if not self.source:
    self.error(
      f'Could not find {source_cls.GR_CLASS.NAME} account for {domain}. '
      'Is it registered with Bridgy?')

  # check that the target URL path is supported
  target_path = urllib.parse.urlparse(self.target_url).path
  if target_path in ('', '/'):
    msg = 'Home page webmentions are not currently supported.'
    logger.info(msg)
    return {'error': msg}, 202
  for pattern in self.source.PATH_BLOCKLIST:
    if pattern.match(target_path):
      msg = f'{self.source.GR_CLASS.NAME} webmentions are not supported for URL path: {target_path}'
      logger.info(msg)
      return {'error': msg}, 202

  # create BlogWebmention entity
  id = f'{self.source_url} {self.target_url}'
  self.entity = BlogWebmention.get_or_insert(
    id, source=self.source.key, redirected_target_urls=redirected_target_urls)
  if self.entity.status == 'complete':
    # TODO: response message saying update isn't supported
    return self.entity.published
  logger.debug(f'BlogWebmention entity: {self.entity.key.urlsafe().decode()}')

  # fetch source page
  fetched = self.fetch_mf2(self.source_url)
  if not fetched:
    return
  resp, mf2 = fetched

  item = self.find_mention_item(mf2.get('items', []))
  if not item:
    self.error(f'Could not find target URL {self.target_url} in source page {resp.url}',
               data=mf2, log_exception=False)

  # default author to target domain
  author_name = domain
  author_url = f'http://{domain}/'

  # extract author name and URL from h-card, if any
  props = item['properties']
  author = get_first(props, 'author')
  if author:
    if isinstance(author, str):
      author_name = author
    else:
      author_props = author.get('properties', {})
      author_name = get_first(author_props, 'name')
      author_url = get_first(author_props, 'url')

  # if present, u-url overrides source url
  u_url = get_first(props, 'url')
  if u_url:
    self.entity.u_url = u_url

  # generate content
  content = props['content'][0]  # find_mention_item() guaranteed this is here
  text = (content.get('html') or content.get('value')).strip()
  source_url = self.entity.source_url()
  text += f' <br /> <a href="{source_url}">via {util.domain_from_link(source_url)}</a>'

  # write comment
  try:
    self.entity.published = self.source.create_comment(
      self.target_url, author_name, author_url, text)
  except Exception as e:
    code, body = util.interpret_http_exception(e)
    msg = f'Error: {code}: {e}; {body}'
    if code == '401':
      logger.warning(f'Disabling source due to: {e}', exc_info=True)
      self.source.status = 'disabled'
      self.source.put()
      self.error(msg, status=code, report=self.source.is_beta_user())
    elif code == '404':
      # post is gone
      self.error(msg, status=code, report=False)
    elif util.is_connection_failure(e) or (code and int(code) // 100 == 5):
      self.error(msg, status=502, report=False)
    elif code or body:
      self.error(msg, status=code, report=True)
    else:
      raise

  # write results to datastore
  self.entity.status = 'complete'
  self.entity.put()
  return self.entity.published
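
# Hypothetical client-side example of exercising this endpoint: a standard
# form-encoded webmention POST. The /webmention/<site> route is an assumption
# based on dispatch_request(site)'s signature; the URLs are placeholders.

import requests

resp = requests.post('https://brid.gy/webmention/wordpress', data={
  'source': 'https://author.example/reply-to-post',
  'target': 'https://blog.example/2021/01/some-post',
})
print(resp.status_code, resp.text)
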