Example #1
  def test_is_connection_failure(self):
    for e in (socket.timeout(), socket.error(), requests.ConnectionError(),
              httplib.NotConnected(), urllib2.URLError(socket.gaierror('foo bar')),
              urllib3.exceptions.TimeoutError()):
      assert util.is_connection_failure(e), e

    for e in (None, 3, 'asdf', IOError(), httplib.HTTPException('unknown'),
              urllib2.URLError('asdf'),
              urllib2.HTTPError('url', 403, 'msg', {}, None),
              ):
      assert not util.is_connection_failure(e), e
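
Taken together, these assertions pin down the contract: transport-level failures (timeouts, refused connections, DNS errors) count as connection failures, while HTTP-level errors and arbitrary non-exception values do not. A minimal Python 2 sketch that satisfies this test might look like the following; it illustrates the contract only and is not the actual oauth-dropins/webutil implementation:

import httplib
import socket
import urllib2

import requests
import urllib3


def is_connection_failure(exception):
  # A sketch only: unwrap URLError to inspect its underlying reason, then
  # check against well-known transport-level exception types. HTTPError is a
  # URLError subclass whose .reason is just a message string, so HTTP errors
  # fall through to False here.
  if isinstance(exception, urllib2.URLError):
    exception = exception.reason
  return isinstance(exception, (
      socket.timeout,
      socket.error,  # socket.gaierror is a subclass; a bare IOError is not
      requests.ConnectionError,
      httplib.NotConnected,
      urllib3.exceptions.TimeoutError,
  ))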
Example #2
def handle_exception(self, e, debug):
  """A webapp2 exception handler that propagates HTTP exceptions into the response.

  Use this as a :meth:`webapp2.RequestHandler.handle_exception()` method by
  adding this line to your handler class definition::

    handle_exception = handlers.handle_exception

  I originally tried to put this in a :class:`webapp2.RequestHandler` subclass,
  but it gave me this exception::

    File ".../webapp2-2.5.1/webapp2_extras/local.py", line 136, in _get_current_object
      raise RuntimeError('no object bound to %s' % self.__name__)
    RuntimeError: no object bound to app

  These are probably related:

  * http://eemyop.blogspot.com/2013/05/digging-around-in-webapp2-finding-out.html
  * http://code.google.com/p/webapp-improved/source/detail?r=d962ac4625ce3c43a3e59fd7fc07daf8d7b7c46a

  """
  code, body = util.interpret_http_exception(e)
  if code:
    self.response.set_status(int(code))
    self.response.write('HTTP Error %s: %s' % (code, body))
  elif util.is_connection_failure(e):
    self.response.set_status(502)
    self.response.write('Upstream server request failed: %s' % e)
  else:
    raise
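
As the docstring above describes, this function is meant to be assigned into a handler class as a method. A hypothetical handler wired up that way might look like this; fetch_upstream() is an assumed stand-in for whatever upstream call can raise:

import webapp2

import handlers  # assumed: the module that defines handle_exception() above


class UpstreamProxy(webapp2.RequestHandler):
  # install the shared exception handler, per the docstring above
  handle_exception = handlers.handle_exception

  def get(self):
    # fetch_upstream() is hypothetical; an HTTPError or connection failure
    # raised here is converted into an HTTP response by handle_exception()
    self.response.write(fetch_upstream())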
Example #3
  def post(self):
    source = self.load_source(param='key')
    kind = source.key.kind()
    feature = util.get_required_param(self, 'feature')
    state = util.encode_oauth_state({
      'operation': 'delete',
      'feature': feature,
      'source': source.key.urlsafe().decode(),
      'callback': self.request.get('callback'),
    })

    # Blogger doesn't support redirect_url() yet
    if kind == 'Blogger':
      return self.redirect('/blogger/delete/start?state=%s' % state)

    path = ('/reddit/callback' if kind == 'Reddit'
            else '/wordpress/add' if kind == 'WordPress'
            else '/%s/delete/finish' % source.SHORT_NAME)
    kwargs = {}
    if kind == 'Twitter':
      kwargs['access_type'] = 'read' if feature == 'listen' else 'write'

    handler = source.OAUTH_START_HANDLER.to(path, **kwargs)(self.request, self.response)
    try:
      self.redirect(handler.redirect_url(state=state))
    except Exception as e:
      code, body = util.interpret_http_exception(e)
      if not code and util.is_connection_failure(e):
        code = '-'
        body = str(e)
      if code:
        self.messages.add('%s API error %s: %s' % (source.GR_CLASS.NAME, code, body))
        self.redirect(source.bridgy_url(self))
      else:
        raise
Example #4
  def post(self):
    logging.debug('Params: %s', self.request.params)

    key = util.get_required_param(self, 'source_key')
    source = ndb.Key(urlsafe=key).get()
    if not source or source.status == 'disabled' or 'listen' not in source.features:
      logging.error('Source not found or disabled. Dropping task.')
      return
    logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
                 source.bridgy_url(self))

    post_id = util.get_required_param(self, 'post_id')
    source.updates = {}

    try:
      activities = source.get_activities(
        fetch_replies=True, fetch_likes=True, fetch_shares=True,
        activity_id=post_id, user_id=source.key.id())
      if not activities:
        logging.info('Post %s not found.', post_id)
        return
      assert len(activities) == 1
      self.backfeed(source, activities={activities[0]['id']: activities[0]})
    except Exception as e:
      code, body = util.interpret_http_exception(e)
      if (code and (code in util.HTTP_RATE_LIMIT_CODES or code == '400' or
                    int(code) // 100 == 5)
            or util.is_connection_failure(e)):
        logging.error('API call failed; giving up. %s: %s\n%s', code, body, e)
        self.abort(util.ERROR_HTTP_RETURN_CODE)
      else:
        raise
Example #5
  def post(self):
    source = self.load_source(param='key')
    module = self.OAUTH_MODULES[source.key.kind()]
    feature = util.get_required_param(self, 'feature')
    state = util.encode_oauth_state({
      'operation': 'delete',
      'feature': feature,
      'source': source.key.urlsafe(),
      'callback': self.request.get('callback'),
    })

    # Blogger doesn't support redirect_url() yet
    if module is oauth_blogger_v2:
      return self.redirect('/blogger/delete/start?state=%s' % state)

    path = ('/instagram/callback' if module is indieauth
            else '/wordpress/add' if module is oauth_wordpress_rest
            else '/%s/delete/finish' % source.SHORT_NAME)
    kwargs = {}
    if module is oauth_twitter:
      kwargs['access_type'] = 'read' if feature == 'listen' else 'write'

    handler = module.StartHandler.to(path, **kwargs)(self.request, self.response)
    try:
      self.redirect(handler.redirect_url(state=state))
    except Exception as e:
      code, body = util.interpret_http_exception(e)
      if not code and util.is_connection_failure(e):
        code = '-'
        body = unicode(e)
      if code:
        self.messages.add('%s API error %s: %s' % (source.GR_CLASS.NAME, code, body))
        self.redirect(source.bridgy_url(self))
      else:
        raise
Example #6
  def get(self, type, source_short_name, string_id, *ids):
    source_cls = models.sources.get(source_short_name)
    if not source_cls:
      self.abort(400, "Source type '%s' not found. Known sources: %s" %
                 (source_short_name, filter(None, models.sources.keys())))

    self.source = source_cls.get_by_id(string_id)
    if not self.source:
      self.abort(400, 'Source %s %s not found' % (source_short_name, string_id))

    format = self.request.get('format', 'html')
    if format not in ('html', 'json'):
      self.abort(400, 'Invalid format %s, expected html or json' % format)

    for id in ids:
      if not self.VALID_ID.match(id):
        self.abort(404, 'Invalid id %s' % id)

    label = '%s:%s %s %s' % (source_short_name, string_id, type, ids)
    logging.info('Fetching %s', label)
    try:
      obj = self.get_item(*ids)
    except Exception, e:
      # pass through all API HTTP errors if we can identify them
      code, body = util.interpret_http_exception(e)
      if not code and util.is_connection_failure(e):
        code = 503
        body = str(e)
      if code:
        self.response.status_int = int(code)
        self.response.headers['Content-Type'] = 'text/plain'
        self.response.write('%s error:\n%s' % (self.source.GR_CLASS.NAME, body))
        return
      else:
        raise
Example #7
  def post(self):
    try:
      self.redirect(self.redirect_url(state=util.get_required_param(self, 'token')))
    except Exception as e:
      if util.is_connection_failure(e) or util.interpret_http_exception(e)[0]:
        self.messages.add("Couldn't fetch your web site: %s" % e)
        return self.redirect('/')
      raise
Example #8
  def post(self, *path_args):
    logging.debug('Params: %s', self.request.params)

    key = self.request.params['source_key']
    source = ndb.Key(urlsafe=key).get()
    if not source or source.status == 'disabled' or 'listen' not in source.features:
      logging.error('Source not found or disabled. Dropping task.')
      return
    logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
                 source.bridgy_url(self))

    last_polled = self.request.params['last_polled']
    if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT):
      logging.warning('duplicate poll task! deferring to the other task.')
      return

    logging.info('Last poll: %s', self._last_poll_url(source))

    # mark this source as polling
    source.updates = {
      'poll_status': 'polling',
      'last_poll_attempt': util.now_fn(),
      'rate_limited': False,
    }
    source = models.Source.put_updates(source)

    source.updates = {}
    try:
      self.poll(source)
    except Exception as e:
      source.updates['poll_status'] = 'error'
      code, body = util.interpret_http_exception(e)
      if code == '401' or isinstance(e, models.DisableSource):
        # the user deauthorized the bridgy app, so disable this source.
        # let the task complete successfully so that it's not retried.
        logging.warning('Disabling source due to: %s' % e, exc_info=True)
        source.updates.update({
          'status': 'disabled',
          'poll_status': 'ok',
        })
        body = '%s\nLast poll: %s' % (source.bridgy_url(self),
                                      self._last_poll_url(source))
        if source.is_beta_user():
          util.email_me(subject='Bridgy: disabled %s' % source.label(), body=body)

      elif code in util.HTTP_RATE_LIMIT_CODES:
        logging.info('Rate limited. Marking as error and finishing. %s', e)
        source.updates['rate_limited'] = True
      elif ((code and int(code) // 100 == 5) or
            (code == '400' and isinstance(source, flickr.Flickr)) or
            util.is_connection_failure(e)):
        logging.error('API call failed. Marking as error and finishing. %s: %s\n%s',
                      code, body, e)
        self.abort(util.ERROR_HTTP_RETURN_CODE)
      else:
        raise
Example #9
  def post(self, *path_args):
    logging.debug('Params: %s', self.request.params)

    key = self.request.params['source_key']
    source = ndb.Key(urlsafe=key).get()
    if not source or source.status == 'disabled' or 'listen' not in source.features:
      logging.error('Source not found or disabled. Dropping task.')
      return
    logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
                 source.bridgy_url(self))

    last_polled = self.request.params['last_polled']
    if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT):
      logging.warning('duplicate poll task! deferring to the other task.')
      return

    logging.info('Last poll: %s', self._last_poll_url(source))

    # mark this source as polling
    source.updates = {
      'poll_status': 'polling',
      'last_poll_attempt': util.now_fn(),
      'rate_limited': False,
    }
    source = models.Source.put_updates(source)

    source.updates = {}
    try:
      self.poll(source)
    except Exception as e:
      source.updates['poll_status'] = 'error'
      code, body = util.interpret_http_exception(e)
      if code in source.DISABLE_HTTP_CODES or isinstance(e, models.DisableSource):
        # the user deauthorized the bridgy app, so disable this source.
        # let the task complete successfully so that it's not retried.
        logging.warning('Disabling source due to: %s' % e, exc_info=True)
        source.updates.update({
          'status': 'disabled',
          'poll_status': 'ok',
        })
        body = '%s\nLast poll: %s' % (source.bridgy_url(self),
                                      self._last_poll_url(source))
        if source.is_beta_user():
          util.email_me(subject='Bridgy: disabled %s' % source.label(), body=body)

      elif code in source.RATE_LIMIT_HTTP_CODES:
        logging.info('Rate limited. Marking as error and finishing. %s', e)
        source.updates['rate_limited'] = True
      elif ((code and int(code) // 100 == 5) or
            (code == '400' and isinstance(source, flickr.Flickr)) or
            util.is_connection_failure(e)):
        logging.error('API call failed. Marking as error and finishing. %s: %s\n%s',
                      code, body, e)
        self.abort(util.ERROR_HTTP_RETURN_CODE)
      else:
        raise
Example #10
  def post(self):
    ia_start = util.oauth_starter(indieauth.StartHandler).to('/instagram/callback')(
      self.request, self.response)

    try:
      self.redirect(ia_start.redirect_url(me=util.get_required_param(self, 'user_url')))
    except Exception as e:
      if util.is_connection_failure(e) or util.interpret_http_exception(e)[0]:
        self.messages.add("Couldn't fetch your web site: %s" % e)
        return self.redirect('/')
      raise
Example #11
  def poll(self, source):
    """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
    if source.last_activities_etag or source.last_activity_id:
      logging.debug('Using ETag %s, last activity id %s',
                    source.last_activities_etag, source.last_activity_id)

    #
    # Step 1: fetch activities:
    # * posts by the user
    # * search all posts for the user's domain URLs to find links
    #
    cache = util.CacheDict()
    if source.last_activities_cache_json:
      cache.update(json.loads(source.last_activities_cache_json))

    try:
      # search for links first so that the user's activities and responses
      # override them if they overlap
      links = source.search_for_links()

      # this user's own activities (and user mentions)
      resp = source.get_activities_response(
        fetch_replies=True, fetch_likes=True, fetch_shares=True,
        fetch_mentions=True, count=50, etag=source.last_activities_etag,
        min_id=source.last_activity_id, cache=cache)
      etag = resp.get('etag')  # used later
      user_activities = resp.get('items', [])

      # these map ids to AS objects
      responses = {a['id']: a for a in links}
      activities = {a['id']: a for a in links + user_activities}

    except Exception as e:
      code, body = util.interpret_http_exception(e)
      if code == '401':
        msg = 'Unauthorized error: %s' % e
        logging.warning(msg, exc_info=True)
        source.updates['poll_status'] = 'ok'
        raise models.DisableSource(msg)
      elif code in util.HTTP_RATE_LIMIT_CODES:
        logging.warning('Rate limited. Marking as error and finishing. %s', e)
        source.updates.update({'poll_status': 'error', 'rate_limited': True})
        return
      elif (code and int(code) // 100 == 5) or util.is_connection_failure(e):
        logging.error('API call failed. Marking as error and finishing. %s: %s\n%s',
                      code, body, e)
        self.abort(ERROR_HTTP_RETURN_CODE)
      else:
        raise
Example #12
    def post(self):
        logging.debug('Params: %s', self.request.params)

        type = self.request.get('type')
        if type:
            assert type in ('event', )

        source = util.load_source(self)
        if not source or source.status == 'disabled' or 'listen' not in source.features:
            logging.error('Source not found or disabled. Dropping task.')
            return
        logging.info('Source: %s %s, %s', source.label(),
                     source.key.string_id(), source.bridgy_url(self))

        post_id = util.get_required_param(self, 'post_id')
        source.updates = {}

        try:
            if type == 'event':
                activities = [source.gr_source.get_event(post_id)]
            else:
                activities = source.get_activities(fetch_replies=True,
                                                   fetch_likes=True,
                                                   fetch_shares=True,
                                                   activity_id=post_id,
                                                   user_id=source.key.id())

            if not activities or not activities[0]:
                logging.info('Post %s not found.', post_id)
                return
            assert len(activities) == 1, activities
            self.backfeed(source,
                          activities={activities[0]['id']: activities[0]})

            obj = activities[0].get('object') or activities[0]
            in_reply_to = util.get_first(obj, 'inReplyTo')
            if in_reply_to:
                parsed = util.parse_tag_uri(in_reply_to.get(
                    'id', ''))  # TODO: fall back to url
                if parsed:
                    util.add_discover_task(source, parsed[1])

        except Exception as e:
            code, body = util.interpret_http_exception(e)
            if (code and (code in source.RATE_LIMIT_HTTP_CODES
                          or code in ('400', '404') or int(code) // 100 == 5)
                    or util.is_connection_failure(e)):
                logging.error('API call failed; giving up. %s: %s\n%s', code,
                              body, e)
                self.abort(util.ERROR_HTTP_RETURN_CODE)
            else:
                raise
Example #13
    def dispatch_request(self):
        token = request.form['token']

        try:
            to_url = self.redirect_url(state=token)
        except Exception as e:
            if util.is_connection_failure(e) or util.interpret_http_exception(
                    e)[0]:
                flash(f"Couldn't fetch your web site: {e}")
                return redirect('/')
            raise

        return redirect(to_url)
Example #14
  def post(self):
    logging.debug('Params: %s', self.request.params)

    type = self.request.get('type')
    if type:
      assert type in ('event',)

    key = util.get_required_param(self, 'source_key')
    source = ndb.Key(urlsafe=key).get()
    if not source or source.status == 'disabled' or 'listen' not in source.features:
      logging.error('Source not found or disabled. Dropping task.')
      return
    logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
                 source.bridgy_url(self))

    post_id = util.get_required_param(self, 'post_id')
    source.updates = {}

    try:
      if type == 'event':
        activities = [source.gr_source.get_event(post_id)]
      else:
        activities = source.get_activities(
          fetch_replies=True, fetch_likes=True, fetch_shares=True,
          activity_id=post_id, user_id=source.key.id())

      if not activities or not activities[0]:
        logging.info('Post %s not found.', post_id)
        return
      assert len(activities) == 1, activities
      self.backfeed(source, activities={activities[0]['id']: activities[0]})

      in_reply_to = util.get_first(activities[0]['object'], 'inReplyTo')
      if in_reply_to:
        parsed = util.parse_tag_uri(in_reply_to.get('id', ''))  # TODO: fall back to url
        if parsed:
          util.add_discover_task(source, parsed[1])

    except Exception as e:
      code, body = util.interpret_http_exception(e)
      if (code and (code in util.HTTP_RATE_LIMIT_CODES or
                    code in ('400', '404') or
                    int(code) // 100 == 5)
            or util.is_connection_failure(e)):
        logging.error('API call failed; giving up. %s: %s\n%s', code, body, e)
        self.abort(util.ERROR_HTTP_RETURN_CODE)
      else:
        raise
Example #15
    def post(self):
        key = ndb.Key(urlsafe=util.get_required_param(self, 'key'))
        module = self.OAUTH_MODULES[key.kind()]
        feature = util.get_required_param(self, 'feature')
        state = util.encode_oauth_state({
            'operation': 'delete',
            'feature': feature,
            'source': key.urlsafe(),
            'callback': self.request.get('callback'),
        })

        # Google+ and Blogger don't support redirect_url() yet
        if module is oauth_googleplus:
            return self.redirect('/googleplus/delete/start?state=%s' % state)

        if module is oauth_blogger_v2:
            return self.redirect('/blogger/delete/start?state=%s' % state)

        source = key.get()
        path = ('/instagram/callback' if module is indieauth else
                '/wordpress/add' if module is oauth_wordpress_rest else
                '/%s/delete/finish' % source.SHORT_NAME)
        kwargs = {}
        if module is oauth_twitter:
            kwargs['access_type'] = 'read' if feature == 'listen' else 'write'

        handler = module.StartHandler.to(path, **kwargs)(self.request,
                                                         self.response)
        try:
            self.redirect(handler.redirect_url(state=state))
        except Exception as e:
            code, body = util.interpret_http_exception(e)
            if not code and util.is_connection_failure(e):
                code = '-'
                body = unicode(e)
            if code:
                self.messages.add('%s API error %s: %s' %
                                  (source.GR_CLASS.NAME, code, body))
                self.redirect(source.bridgy_url(self))
            else:
                raise
Example #16
    def get(self, type, source_short_name, string_id, *ids):
        source_cls = models.sources.get(source_short_name)
        if not source_cls:
            self.abort(
                400, "Source type '%s' not found. Known sources: %s" %
                (source_short_name, filter(None, models.sources.keys())))

        self.source = source_cls.get_by_id(string_id)
        if not self.source:
            self.abort(
                400, 'Source %s %s not found' % (source_short_name, string_id))

        format = self.request.get('format', 'html')
        if format not in ('html', 'json'):
            self.abort(400,
                       'Invalid format %s, expected html or json' % format)

        for id in ids:
            if not self.VALID_ID.match(id):
                self.abort(404, 'Invalid id %s' % id)

        label = '%s:%s %s %s' % (source_short_name, string_id, type, ids)
        cache_key = 'H ' + label
        obj = memcache.get(cache_key)
        if obj:
            logging.info('Using cached object for %s', label)
        else:
            logging.info('Fetching %s', label)
            try:
                obj = self.get_item(*ids)
            except Exception as e:
                # pass through all API HTTP errors if we can identify them
                code, body = util.interpret_http_exception(e)
                if not code and util.is_connection_failure(e):
                    code = 503
                    body = str(e)
                if code:
                    self.response.status_int = int(code)
                    self.response.headers['Content-Type'] = 'text/plain'
                    self.response.write('%s error:\n%s' %
                                        (self.source.GR_CLASS.NAME, body))
                    return
                else:
                    raise
            memcache.set(cache_key, obj, time=CACHE_TIME)
Example #17
    def post(self, *path_args):
        logging.debug("Params: %s", self.request.params)

        key = self.request.params["source_key"]
        source = ndb.Key(urlsafe=key).get()
        if not source or source.status == "disabled" or "listen" not in source.features:
            logging.error("Source not found or disabled. Dropping task.")
            return
        logging.info("Source: %s %s, %s", source.label(), source.key.string_id(), source.bridgy_url(self))

        last_polled = self.request.params["last_polled"]
        if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT):
            logging.warning("duplicate poll task! deferring to the other task.")
            return

        logging.info(
            "Last poll: %s/log?start_time=%s&key=%s",
            self.request.host_url,
            calendar.timegm(source.last_poll_attempt.utctimetuple()),
            source.key.urlsafe(),
        )

        # mark this source as polling
        source.updates = {"poll_status": "polling", "last_poll_attempt": util.now_fn()}
        source = models.Source.put_updates(source)

        source.updates = {}
        try:
            self.poll(source)
        except Exception as e:
            source.updates["poll_status"] = "error"
            code, body = util.interpret_http_exception(e)
            if code == "401" or isinstance(e, models.DisableSource):
                # the user deauthorized the bridgy app, so disable this source.
                # let the task complete successfully so that it's not retried.
                logging.warning("Disabling source due to: %s" % e, exc_info=True)
                source.updates.update({"status": "disabled", "poll_status": "ok"})
            elif code in util.HTTP_RATE_LIMIT_CODES:
                logging.warning("Rate limited. Marking as error and finishing. %s", e)
                source.updates["rate_limited"] = True
            elif (code and int(code) // 100 == 5) or util.is_connection_failure(e):
                logging.error("API call failed. Marking as error and finishing. %s: %s\n%s", code, body, e)
                self.abort(ERROR_HTTP_RETURN_CODE)
            else:
                raise
Example #18
def background_handle_exception(e):
    """Common exception handler for background tasks.

  Catches failed outbound HTTP requests and returns HTTP 304.
  """
    if isinstance(e, HTTPException):
        # raised by this app itself, pass it through
        return str(e), e.code

    transients = getattr(g, 'TRANSIENT_ERROR_HTTP_CODES', ())
    source = getattr(g, 'source', None)
    if source:
        transients += source.RATE_LIMIT_HTTP_CODES + source.TRANSIENT_ERROR_HTTP_CODES

    code, body = util.interpret_http_exception(e)
    if ((code and int(code) // 100 == 5) or code in transients
            or util.is_connection_failure(e)):
        logger.error(f'Marking as error and finishing. {code}: {body}\n{e}')
        return '', util.ERROR_HTTP_RETURN_CODE

    raise e
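
A plausible way to install this handler on a Flask app, sketched here with an assumed app object for the background-task service:

from flask import Flask

app = Flask(__name__)

# route uncaught exceptions from task handlers through the common handler
app.register_error_handler(Exception, background_handle_exception)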
Example #19
def delete_start():
    source = util.load_source()
    kind = source.key.kind()
    feature = request.form['feature']
    state = util.encode_oauth_state({
        'operation': 'delete',
        'feature': feature,
        'source': source.key.urlsafe().decode(),
        'callback': request.values.get('callback'),
    })

    # Blogger doesn't support redirect_url() yet
    if kind == 'Blogger':
        return redirect(f'/blogger/delete/start?state={state}')

    path = ('/reddit/callback' if kind == 'Reddit' else '/wordpress/add'
            if kind == 'WordPress' else f'/{source.SHORT_NAME}/delete/finish')
    kwargs = {}
    if kind == 'Twitter':
        kwargs['access_type'] = 'read' if feature == 'listen' else 'write'

    try:
        return redirect(source.OAUTH_START(path).redirect_url(state=state))
    except werkzeug.exceptions.HTTPException:
        # raised by us, probably via self.error()
        raise
    except Exception as e:
        code, body = util.interpret_http_exception(e)
        if not code and util.is_connection_failure(e):
            code = '-'
            body = str(e)
        if code:
            flash(f'{source.GR_CLASS.NAME} API error {code}: {body}')
            return redirect(source.bridgy_url())
        else:
            raise
Example #20
    def post(self, source_short_name):
        logging.info('Params: %s', self.request.params.items())
        # strip fragments from source and target url
        self.source_url = urlparse.urldefrag(
            util.get_required_param(self, 'source'))[0]
        self.target_url = urlparse.urldefrag(
            util.get_required_param(self, 'target'))[0]

        # follow target url through any redirects, strip utm_* query params
        resp = util.follow_redirects(self.target_url)
        redirected_target_urls = [r.url for r in resp.history]
        self.target_url = util.clean_url(resp.url)

        # parse and validate target URL
        domain = util.domain_from_link(self.target_url)
        if not domain:
            return self.error('Could not parse target URL %s' %
                              self.target_url)

        # look up source by domain
        source_cls = models.sources[source_short_name]
        domain = domain.lower()
        self.source = (source_cls.query().filter(
            source_cls.domains == domain).filter(
                source_cls.features == 'webmention').filter(
                    source_cls.status == 'enabled').get())
        if not self.source:
            return self.error(
                'Could not find %s account for %s. Is it registered with Bridgy?'
                % (source_cls.GR_CLASS.NAME, domain))

        if urlparse.urlparse(self.target_url).path in ('', '/'):
            return self.error(
                'Home page webmentions are not currently supported.')

        # create BlogWebmention entity
        id = u'%s %s' % (self.source_url, self.target_url)
        self.entity = BlogWebmention.get_or_insert(
            id,
            source=self.source.key,
            redirected_target_urls=redirected_target_urls)
        if self.entity.status == 'complete':
            # TODO: response message saying update isn't supported
            self.response.write(self.entity.published)
            return
        logging.debug("BlogWebmention entity: '%s'", self.entity.key.urlsafe())

        # fetch source page
        resp = self.fetch_mf2(self.source_url)
        if not resp:
            return
        self.fetched, data = resp

        item = self.find_mention_item(data)
        if not item:
            return self.error(
                'Could not find target URL %s in source page %s' %
                (self.target_url, self.fetched.url),
                data=data,
                log_exception=False)

        # default author to target domain
        author_name = domain
        author_url = 'http://%s/' % domain

        # extract author name and URL from h-card, if any
        props = item['properties']
        author = first_value(props, 'author')
        if author:
            if isinstance(author, basestring):
                author_name = author
            else:
                author_props = author.get('properties', {})
                author_name = first_value(author_props, 'name')
                author_url = first_value(author_props, 'url')

        # if present, u-url overrides source url
        u_url = first_value(props, 'url')
        if u_url:
            self.entity.u_url = u_url

        # generate content
        # find_mention_item() guaranteed this is here
        content = props['content'][0]
        text = (content.get('html') or content.get('value')).strip()
        source_url = self.entity.source_url()
        text += ' <br /> <a href="%s">via %s</a>' % (
            source_url, util.domain_from_link(source_url))

        # write comment
        try:
            self.entity.published = self.source.create_comment(
                self.target_url, author_name, author_url, text)
        except Exception as e:
            code, body = util.interpret_http_exception(e)
            msg = 'Error: %s %s; %s' % (code, e, body)
            if code == '401':
                logging.warning('Disabling source due to: %s' % e,
                                exc_info=True)
                self.source.status = 'disabled'
                self.source.put()
                return self.error(msg,
                                  status=code,
                                  mail=self.source.is_beta_user())
            elif code == '404':
                # post is gone
                return self.error(msg, status=code, mail=False)
            elif util.is_connection_failure(e) or (code
                                                   and int(code) // 100 == 5):
                return self.error(msg,
                                  status=util.ERROR_HTTP_RETURN_CODE,
                                  mail=False)
            elif code or body:
                return self.error(msg, status=code, mail=True)
            else:
                raise

        # write results to datastore
        self.entity.status = 'complete'
        self.entity.put()
        self.response.write(json.dumps(self.entity.published))
Example #21
    def poll(self, source):
        """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
        if source.last_activities_etag or source.last_activity_id:
            logging.debug('Using ETag %s, last activity id %s',
                          source.last_activities_etag, source.last_activity_id)

        #
        # Step 1: fetch activities:
        # * posts by the user
        # * search all posts for the user's domain URLs to find links
        #
        cache = util.CacheDict()
        if source.last_activities_cache_json:
            cache.update(json_loads(source.last_activities_cache_json))

        # search for links first so that the user's activities and responses
        # override them if they overlap
        links = source.search_for_links()

        # this user's own activities (and user mentions)
        resp = source.get_activities_response(fetch_replies=True,
                                              fetch_likes=True,
                                              fetch_shares=True,
                                              fetch_mentions=True,
                                              count=50,
                                              etag=source.last_activities_etag,
                                              min_id=source.last_activity_id,
                                              cache=cache)
        etag = resp.get('etag')  # used later
        user_activities = resp.get('items', [])

        # these map ids to AS objects
        responses = {a['id']: a for a in links}
        activities = {a['id']: a for a in links + user_activities}

        # extract silo activity ids, update last_activity_id
        silo_activity_ids = set()
        last_activity_id = source.last_activity_id
        for id, activity in activities.items():
            # maybe replace stored last activity id
            parsed = util.parse_tag_uri(id)
            if parsed:
                id = parsed[1]
            silo_activity_ids.add(id)
            try:
                # try numeric comparison first
                greater = int(id) > int(last_activity_id)
            except (TypeError, ValueError):
                greater = str(id) > str(last_activity_id)
            if greater:
                last_activity_id = id

        if last_activity_id and last_activity_id != source.last_activity_id:
            source.updates['last_activity_id'] = last_activity_id

        # trim cache to just the returned activity ids, so that it doesn't grow
        # without bound. (WARNING: depends on get_activities_response()'s cache key
        # format, e.g. 'PREFIX ACTIVITY_ID'!)
        source.updates['last_activities_cache_json'] = json_dumps({
            k: v
            for k, v in cache.items() if k.split()[-1] in silo_activity_ids
        })

        self.backfeed(source, responses, activities=activities)

        source.updates.update({
            'last_polled': source.last_poll_attempt,
            'poll_status': 'ok'
        })
        if etag and etag != source.last_activities_etag:
            source.updates['last_activities_etag'] = etag

        #
        # Possibly refetch updated syndication urls.
        #
        # if the author has added syndication urls since the first time
        # original_post_discovery ran, we'll miss them. this cleanup task will
        # periodically check for updated urls. only kicks in if the author has
        # *ever* published a rel=syndication url
        if source.should_refetch():
            logging.info('refetching h-feed for source %s', source.label())
            relationships = original_post_discovery.refetch(source)

            now = util.now_fn()
            source.updates['last_hfeed_refetch'] = now

            if relationships:
                logging.info(
                    'refetch h-feed found new rel=syndication relationships: %s',
                    relationships)
                try:
                    self.repropagate_old_responses(source, relationships)
                except BaseException as e:
                    if ('BadRequestError' in str(e.__class__)
                            or 'Timeout' in str(e.__class__)
                            or util.is_connection_failure(e)):
                        logging.info('Timeout while repropagating responses.',
                                     stack_info=True)
                    else:
                        raise
        else:
            logging.info(
                'skipping refetch h-feed. last-syndication-url %s, last-refetch %s',
                source.last_syndication_url, source.last_hfeed_refetch)
Example #22
  def _run(self):
    """Returns CreationResult on success, None otherwise."""
    logging.info('Params: %s', self.request.params.items())
    assert self.PREVIEW in (True, False)

    # parse and validate target URL
    try:
      parsed = urlparse.urlparse(self.target_url())
    except BaseException:
      return self.error('Could not parse target URL %s' % self.target_url())

    domain = parsed.netloc
    path_parts = parsed.path.rsplit('/', 1)
    source_cls = SOURCE_NAMES.get(path_parts[-1])
    if (domain not in ('brid.gy', 'www.brid.gy', 'localhost:8080') or
        len(path_parts) != 2 or path_parts[0] != '/publish' or not source_cls):
      return self.error(
        'Target must be brid.gy/publish/{facebook,flickr,twitter,instagram}')
    elif source_cls == GooglePlusPage:
      return self.error('Sorry, %s is not yet supported.' %
                        source_cls.GR_CLASS.NAME)

    # resolve source URL
    url, domain, ok = util.get_webmention_target(
      self.source_url(), replace_test_domains=False)
    # show nice error message if they're trying to publish a silo post
    if domain in SOURCE_DOMAINS:
      return self.error(
        "Looks like that's a %s URL. Try one from your web site instead!" %
        SOURCE_DOMAINS[domain].GR_CLASS.NAME)
    elif not ok:
      return self.error('Unsupported source URL %s' % url)
    elif not domain:
      return self.error('Could not parse source URL %s' % url)

    # look up source by domain
    domain = domain.lower()
    sources = source_cls.query().filter(source_cls.domains == domain).fetch(100)
    if not sources:
      return self.error("Could not find <b>%(type)s</b> account for <b>%(domain)s</b>. Check that your %(type)s profile has %(domain)s in its <em>web site</em> or <em>link</em> field, then try signing up again." %
        {'type': source_cls.GR_CLASS.NAME, 'domain': domain})

    current_url = ''
    for source in sources:
      logging.info('Source: %s , features %s, status %s, poll status %s',
                   source.bridgy_url(self), source.features, source.status,
                   source.poll_status)
      if source.status != 'disabled' and 'publish' in source.features:
        # use a source that has a domain_url matching the url provided.
        # look through each source to find the one with the closest match.
        schemeless_url = util.schemeless(url.lower()).strip('/')
        for domain_url in source.domain_urls:
          schemeless_domain_url = util.schemeless(domain_url.lower()).strip('/')
          if (schemeless_url.startswith(schemeless_domain_url) and
              len(domain_url) > len(current_url)):
            self.source = source
            current_url = domain_url

    if not self.source:
      return self.error(
        'Publish is not enabled for your account. Please visit https://brid.gy and sign up!')

    content_param = 'bridgy_%s_content' % self.source.SHORT_NAME
    if content_param in self.request.params:
      return self.error('The %s parameter is not supported' % content_param)

    # show nice error message if they're trying to publish their home page
    for domain_url in self.source.domain_urls:
      domain_url_parts = urlparse.urlparse(domain_url)
      source_url_parts = urlparse.urlparse(self.source_url())
      if (source_url_parts.netloc == domain_url_parts.netloc and
          source_url_parts.path.strip('/') == domain_url_parts.path.strip('/') and
          not source_url_parts.query):
        return self.error(
          "Looks like that's your home page. Try one of your posts instead!")

    # done with the sanity checks, ready to fetch the source url. create the
    # Publish entity so we can store the result.
    entity = self.get_or_add_publish_entity(url)
    if (entity.status == 'complete' and entity.type != 'preview' and
        not self.PREVIEW and not appengine_config.DEBUG):
      return self.error("Sorry, you've already published that page, and Bridgy Publish doesn't yet support updating or deleting existing posts. Ping Ryan if you want that feature!")
    self.entity = entity

    # fetch source page
    resp = self.fetch_mf2(url)
    if not resp:
      return
    self.fetched, data = resp

    # find rel-shortlink, if any
    # http://microformats.org/wiki/rel-shortlink
    # https://github.com/snarfed/bridgy/issues/173
    soup = util.beautifulsoup_parse(self.fetched.text)
    shortlinks = (soup.find_all('link', rel='shortlink') +
                  soup.find_all('a', rel='shortlink') +
                  soup.find_all('a', class_='shortlink'))
    if shortlinks:
      self.shortlink = shortlinks[0]['href']

    # loop through each item and its children and try to preview/create it. if
    # it fails, try the next one. break after the first one that works.
    result = None
    types = set()
    queue = collections.deque(data.get('items', []))
    while queue:
      item = queue.popleft()
      item_types = set(item.get('type'))
      if 'h-feed' in item_types and 'h-entry' not in item_types:
        queue.extend(item.get('children', []))
        continue
      elif not item_types & PUBLISHABLE_TYPES:
        continue

      try:
        result = self.attempt_single_item(item)
        if self.entity.published:
          break
        if result.abort:
          if result.error_plain:
            self.error(result.error_plain, html=result.error_html, data=item)
          return
        # try the next item
        for embedded in ('rsvp', 'invitee', 'repost', 'repost-of', 'like',
                         'like-of', 'in-reply-to'):
          if embedded in item.get('properties', []):
            item_types.add(embedded)
        logging.info(
          'Object type(s) %s not supported; error=%s; trying next.',
          item_types, result.error_plain)
        types = types.union(item_types)
        queue.extend(item.get('children', []))
      except BaseException as e:
        code, body = util.interpret_http_exception(e)
        mail = True
        if (not code or code == '500') and util.is_connection_failure(e):
          code = 502
          mail = False
        msg = '%s API error: %s %s' % (self.source.GR_CLASS.NAME, body or '', e)
        return self.error(msg, status=code or 500, mail=mail)
Example #23
    def poll(self, source):
        """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
        if source.last_activities_etag or source.last_activity_id:
            logging.debug("Using ETag %s, last activity id %s", source.last_activities_etag, source.last_activity_id)

        #
        # Step 1: fetch activities:
        # * posts by the user
        # * search all posts for the user's domain URLs to find links
        #
        cache = util.CacheDict()
        if source.last_activities_cache_json:
            cache.update(json.loads(source.last_activities_cache_json))

        # search for links first so that the user's activities and responses
        # override them if they overlap
        links = source.search_for_links()

        # this user's own activities (and user mentions)
        resp = source.get_activities_response(
            fetch_replies=True,
            fetch_likes=True,
            fetch_shares=True,
            fetch_mentions=True,
            count=50,
            etag=source.last_activities_etag,
            min_id=source.last_activity_id,
            cache=cache,
        )
        etag = resp.get("etag")  # used later
        user_activities = resp.get("items", [])

        # these map ids to AS objects
        responses = {a["id"]: a for a in links}
        activities = {a["id"]: a for a in links + user_activities}

        # extract silo activity ids, update last_activity_id
        silo_activity_ids = set()
        last_activity_id = source.last_activity_id
        for id, activity in activities.items():
            # maybe replace stored last activity id
            parsed = util.parse_tag_uri(id)
            if parsed:
                id = parsed[1]
            silo_activity_ids.add(id)
            try:
                # try numeric comparison first
                greater = int(id) > int(last_activity_id)
            except (TypeError, ValueError):
                greater = id > last_activity_id
            if greater:
                last_activity_id = id

        if last_activity_id and last_activity_id != source.last_activity_id:
            source.updates["last_activity_id"] = last_activity_id

        # trim cache to just the returned activity ids, so that it doesn't grow
        # without bound. (WARNING: depends on get_activities_response()'s cache key
        # format, e.g. 'PREFIX ACTIVITY_ID'!)
        source.updates["last_activities_cache_json"] = json.dumps(
            {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids}
        )

        # Cache to make sure we only fetch the author's h-feed(s) the
        # first time we see it
        fetched_hfeeds = set()

        # narrow down to just public activities
        public = {}
        private = {}
        for id, activity in activities.items():
            (public if source.is_activity_public(activity) else private)[id] = activity
        logging.info("Found %d public activities: %s", len(public), public.keys())
        logging.info("Found %d private activities: %s", len(private), private.keys())

        last_public_post = (source.last_public_post or util.EPOCH).isoformat()
        public_published = util.trim_nulls([a.get("published") for a in public.values()])
        if public_published:
            max_published = max(public_published)
            if max_published > last_public_post:
                last_public_post = max_published
                source.updates["last_public_post"] = util.as_utc(util.parse_iso8601(max_published))

        source.updates["recent_private_posts"] = len(
            [a for a in private.values() if a.get("published", util.EPOCH_ISO) > last_public_post]
        )

        #
        # Step 2: extract responses, store their activities in response['activities']
        #
        # WARNING: this creates circular references in link posts found by search
        # queries in step 1, since they are their own activity. We use
        # prune_activity() and prune_response() in step 4 to remove these before
        # serializing to JSON.
        #
        for id, activity in public.items():
            obj = activity.get("object") or activity

            # handle user mentions
            user_id = source.user_tag_id()
            if obj.get("author", {}).get("id") != user_id:
                for tag in obj.get("tags", []):
                    urls = tag.get("urls")
                    if tag.get("objectType") == "person" and tag.get("id") == user_id and urls:
                        activity["originals"], activity["mentions"] = original_post_discovery.discover(
                            source,
                            activity,
                            fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds,
                        )
                        activity["mentions"].update(u.get("value") for u in urls)
                        responses[id] = activity
                        break

            # handle quote mentions
            for att in obj.get("attachments", []):
                if (
                    att.get("objectType") in ("note", "article")
                    and att.get("author", {}).get("id") == source.user_tag_id()
                ):
                    # now that we've confirmed that one exists, OPD will dig
                    # into the actual attachments
                    if "originals" not in activity or "mentions" not in activity:
                        activity["originals"], activity["mentions"] = original_post_discovery.discover(
                            source,
                            activity,
                            fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds,
                        )
                    responses[id] = activity
                    break

            # extract replies, likes, reactions, reposts, and rsvps
            replies = obj.get("replies", {}).get("items", [])
            tags = obj.get("tags", [])
            likes = [t for t in tags if Response.get_type(t) == "like"]
            reactions = [t for t in tags if Response.get_type(t) == "react"]
            reposts = [t for t in tags if Response.get_type(t) == "repost"]
            rsvps = Source.get_rsvps_from_event(obj)

            # coalesce responses. drop any without ids
            for resp in replies + likes + reactions + reposts + rsvps:
                id = resp.get("id")
                if not id:
                    logging.error("Skipping response without id: %s", json.dumps(resp, indent=2))
                    continue

                resp.setdefault("activities", []).append(activity)

                # when we find two responses with the same id, the earlier one may have
                # come from a link post or user mention, and this one is probably better
                # since it probably came from the user's activity, so prefer this one.
                # background: https://github.com/snarfed/bridgy/issues/533
                existing = responses.get(id)
                if existing:
                    if source.gr_source.activity_changed(resp, existing, log=True):
                        logging.warning("Got two different versions of same response!\n%s\n%s", existing, resp)
                    resp["activities"].extend(existing.get("activities", []))

                responses[id] = resp

        #
        # Step 3: filter out responses we've already seen
        #
        # seen responses (JSON objects) for each source are stored in its entity.
        unchanged_responses = []
        if source.seen_responses_cache_json:
            for seen in json.loads(source.seen_responses_cache_json):
                id = seen["id"]
                resp = responses.get(id)
                if resp and not source.gr_source.activity_changed(seen, resp, log=True):
                    unchanged_responses.append(seen)
                    del responses[id]

        #
        # Step 4: store new responses and enqueue propagate tasks
        #
        pruned_responses = []
        for id, resp in responses.items():
            resp_type = Response.get_type(resp)
            activities = resp.pop("activities", [])
            if not activities and resp_type == "post":
                activities = [resp]
            too_long = set()
            urls_to_activity = {}
            for i, activity in enumerate(activities):
                # we'll usually have multiple responses for the same activity, and the
                # objects in resp['activities'] are shared, so cache each activity's
                # discovered webmention targets inside its object.
                if "originals" not in activity or "mentions" not in activity:
                    activity["originals"], activity["mentions"] = original_post_discovery.discover(
                        source,
                        activity,
                        fetch_hfeed=True,
                        include_redirect_sources=False,
                        already_fetched_hfeeds=fetched_hfeeds,
                    )

                targets = original_post_discovery.targets_for_response(
                    resp, originals=activity["originals"], mentions=activity["mentions"]
                )
                if targets:
                    logging.info(
                        "%s has %d webmention target(s): %s", activity.get("url"), len(targets), " ".join(targets)
                    )
                for t in targets:
                    if len(t) <= _MAX_STRING_LENGTH:
                        urls_to_activity[t] = i
                    else:
                        logging.warning("Giving up on target URL over %s chars! %s", _MAX_STRING_LENGTH, t)
                        too_long.add(t[: _MAX_STRING_LENGTH - 4] + "...")

            # store/update response entity. the prune_*() calls are important to
            # remove circular references in link responses, which are their own
            # activities. details in the step 2 comment above.
            pruned_response = util.prune_response(resp)
            pruned_responses.append(pruned_response)
            resp_entity = Response(
                id=id,
                source=source.key,
                activities_json=[json.dumps(util.prune_activity(a, source)) for a in activities],
                response_json=json.dumps(pruned_response),
                type=resp_type,
                unsent=list(urls_to_activity.keys()),
                failed=list(too_long),
                original_posts=resp.get("originals", []),
            )
            if urls_to_activity and len(activities) > 1:
                resp_entity.urls_to_activity = json.dumps(urls_to_activity)
            resp_entity.get_or_save(source)

        # update cache
        if pruned_responses:
            source.updates["seen_responses_cache_json"] = json.dumps(pruned_responses + unchanged_responses)

        source.updates.update({"last_polled": source.last_poll_attempt, "poll_status": "ok"})
        if etag and etag != source.last_activities_etag:
            source.updates["last_activities_etag"] = etag

        #
        # Step 5. possibly refetch updated syndication urls
        #
        # if the author has added syndication urls since the first time
        # original_post_discovery ran, we'll miss them. this cleanup task will
        # periodically check for updated urls. only kicks in if the author has
        # *ever* published a rel=syndication url
        if source.should_refetch():
            logging.info("refetching h-feed for source %s", source.label())
            relationships = original_post_discovery.refetch(source)

            now = util.now_fn()
            source.updates["last_hfeed_refetch"] = now

            if relationships:
                logging.info("refetch h-feed found new rel=syndication relationships: %s", relationships)
                try:
                    self.repropagate_old_responses(source, relationships)
                except BaseException as e:
                    if isinstance(
                        e, (datastore_errors.BadRequestError, datastore_errors.Timeout)
                    ) or util.is_connection_failure(e):
                        logging.info("Timeout while repropagating responses.", exc_info=True)
                    else:
                        raise
Example #24
    def post(self, source_short_name):
        logging.info('Params: %s', list(self.request.params.items()))
        # strip fragments from source and target url
        self.source_url = urllib.parse.urldefrag(
            util.get_required_param(self, 'source'))[0]
        self.target_url = urllib.parse.urldefrag(
            util.get_required_param(self, 'target'))[0]

        # follow target url through any redirects, strip utm_* query params
        resp = util.follow_redirects(self.target_url)
        redirected_target_urls = [r.url for r in resp.history]
        self.target_url = util.clean_url(resp.url)

        # parse and validate target URL
        domain = util.domain_from_link(self.target_url)
        if not domain:
            return self.error('Could not parse target URL %s' %
                              self.target_url)

        # look up source by domain
        source_cls = models.sources[source_short_name]
        domain = domain.lower()
        self.source = (source_cls.query().filter(
            source_cls.domains == domain).filter(
                source_cls.features == 'webmention').filter(
                    source_cls.status == 'enabled').get())
        if not self.source:
            # check for a rel-canonical link. Blogger uses these when it serves a post
            # from multiple domains, e.g. country TLDs like epeus.blogspot.co.uk vs
            # epeus.blogspot.com.
            # https://github.com/snarfed/bridgy/issues/805
            mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
            if not mf2:
                # fetch_mf2() already wrote the error response
                return
            domains = util.dedupe_urls(
                util.domain_from_link(url)
                for url in mf2[1]['rels'].get('canonical', []))
            if domains:
                self.source = (source_cls.query().filter(
                    source_cls.domains.IN(domains)).filter(
                        source_cls.features == 'webmention').filter(
                            source_cls.status == 'enabled').get())

        if not self.source:
            return self.error(
                'Could not find %s account for %s. Is it registered with Bridgy?'
                % (source_cls.GR_CLASS.NAME, domain))

        # check that the target URL path is supported
        target_path = urllib.parse.urlparse(self.target_url).path
        if target_path in ('', '/'):
            return self.error(
                'Home page webmentions are not currently supported.',
                status=202)
        for pattern in self.source.PATH_BLOCKLIST:
            if pattern.match(target_path):
                return self.error(
                    '%s webmentions are not supported for URL path: %s' %
                    (self.source.GR_CLASS.NAME, target_path),
                    status=202)

        # create BlogWebmention entity
        id = '%s %s' % (self.source_url, self.target_url)
        self.entity = BlogWebmention.get_or_insert(
            id,
            source=self.source.key,
            redirected_target_urls=redirected_target_urls)
        if self.entity.status == 'complete':
            # TODO: response message saying update isn't supported
            self.response.write(self.entity.published)
            return
        logging.debug("BlogWebmention entity: '%s'",
                      self.entity.key.urlsafe().decode())

        # fetch source page
        fetched = self.fetch_mf2(self.source_url)
        if not fetched:
            return
        resp, mf2 = fetched

        item = self.find_mention_item(mf2.get('items', []))
        if not item:
            return self.error(
                'Could not find target URL %s in source page %s' %
                (self.target_url, resp.url),
                data=mf2,
                log_exception=False)

        # default author to target domain
        author_name = domain
        author_url = 'http://%s/' % domain

        # extract author name and URL from h-card, if any
        props = item['properties']
        author = first_value(props, 'author')
        if author:
            if isinstance(author, str):
                author_name = author
            else:
                author_props = author.get('properties', {})
                author_name = first_value(author_props, 'name')
                author_url = first_value(author_props, 'url')

        # if present, u-url overrides source url
        u_url = first_value(props, 'url')
        if u_url:
            self.entity.u_url = u_url

        # generate content
        # find_mention_item() guaranteed this is here
        content = props['content'][0]
        text = (content.get('html') or content.get('value')).strip()
        source_url = self.entity.source_url()
        text += ' <br /> <a href="%s">via %s</a>' % (
            source_url, util.domain_from_link(source_url))

        # write comment
        try:
            self.entity.published = self.source.create_comment(
                self.target_url, author_name, author_url, text)
        except Exception as e:
            code, body = util.interpret_http_exception(e)
            msg = 'Error: %s %s; %s' % (code, e, body)
            if code == '401':
                logging.warning('Disabling source due to: %s' % e,
                                stack_info=True)
                self.source.status = 'disabled'
                self.source.put()
                return self.error(msg,
                                  status=code,
                                  report=self.source.is_beta_user())
            elif code == '404':
                # post is gone
                return self.error(msg, status=code, report=False)
            elif util.is_connection_failure(e) or (code
                                                   and int(code) // 100 == 5):
                return self.error(msg,
                                  status=util.ERROR_HTTP_RETURN_CODE,
                                  report=False)
            elif code or body:
                return self.error(msg, status=code, report=True)
            else:
                raise

        # write results to datastore
        self.entity.status = 'complete'
        self.entity.put()
        self.response.write(json_dumps(self.entity.published))
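
The (code, body) pair used throughout these handlers comes from util.interpret_http_exception(). A rough sketch of what such a helper could look like for requests-style errors; the real helper understands more exception types, so treat this as illustrative only:

import requests

def interpret_http_exception(e):
    """Returns (code, body) as strings for an HTTP error, else (None, None)."""
    if isinstance(e, requests.HTTPError) and e.response is not None:
        # e.g. a 429 from the silo API yields ('429', '<response body>')
        return str(e.response.status_code), e.response.text
    return None, None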
Beispiel #26
0
class Poll(webapp2.RequestHandler):
    """Task handler that fetches and processes new responses from a single source.

  Request parameters:
    source_key: string key of source entity
    last_polled: timestamp, YYYY-MM-DD-HH-MM-SS

  Inserts a propagate task for each response that hasn't been seen before.
  """
    def post(self, *path_args):
        logging.debug('Params: %s', self.request.params)

        key = self.request.params['source_key']
        source = ndb.Key(urlsafe=key).get()
        if not source or source.status == 'disabled' or 'listen' not in source.features:
            logging.error('Source not found or disabled. Dropping task.')
            return
        logging.info('Source: %s %s, %s', source.label(),
                     source.key.string_id(), source.bridgy_url(self))

        last_polled = self.request.params['last_polled']
        if last_polled != source.last_polled.strftime(
                util.POLL_TASK_DATETIME_FORMAT):
            logging.warning(
                'duplicate poll task! deferring to the other task.')
            return

        logging.info('Last poll: %s/log?start_time=%s&key=%s',
                     self.request.host_url,
                     calendar.timegm(source.last_poll_attempt.utctimetuple()),
                     source.key.urlsafe())

        # mark this source as polling
        source.updates = {
            'poll_status': 'polling',
            'last_poll_attempt': util.now_fn(),
        }
        source = models.Source.put_updates(source)

        source.updates = {}
        try:
            self.poll(source)
        except models.DisableSource:
            # the user deauthorized the bridgy app, so disable this source.
            # let the task complete successfully so that it's not retried.
            source.updates['status'] = 'disabled'
            logging.warning('Disabling source!')
        except:
            source.updates['poll_status'] = 'error'
            raise
        finally:
            source = models.Source.put_updates(source)

        # add new poll task. randomize task ETA to within +/- 20% to try to spread
        # out tasks and prevent thundering herds.
        task_countdown = source.poll_period().total_seconds() * random.uniform(
            .8, 1.2)
        util.add_poll_task(source, countdown=task_countdown)

        # feeble attempt to avoid hitting the instance memory limit
        source = None
        gc.collect()

    def poll(self, source):
        """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
        if source.last_activities_etag or source.last_activity_id:
            logging.debug('Using ETag %s, last activity id %s',
                          source.last_activities_etag, source.last_activity_id)

        #
        # Step 1: fetch activities:
        # * posts by the user
        # * search all posts for the user's domain URLs to find links
        #
        cache = util.CacheDict()
        if source.last_activities_cache_json:
            cache.update(json.loads(source.last_activities_cache_json))

        try:
            # search for links first so that the user's activities and responses
            # override them if they overlap
            links = source.search_for_links()

            # this user's own activities (and user mentions)
            resp = source.get_activities_response(
                fetch_replies=True,
                fetch_likes=True,
                fetch_shares=True,
                fetch_mentions=True,
                count=50,
                etag=source.last_activities_etag,
                min_id=source.last_activity_id,
                cache=cache)
            etag = resp.get('etag')  # used later
            user_activities = resp.get('items', [])

            # these map ids to AS objects
            responses = {a['id']: a for a in links}
            activities = {a['id']: a for a in links + user_activities}

        except Exception as e:
            code, body = util.interpret_http_exception(e)
            if code == '401':
                msg = 'Unauthorized error: %s' % e
                logging.warning(msg, exc_info=True)
                source.updates['poll_status'] = 'ok'
                raise models.DisableSource(msg)
            elif code in util.HTTP_RATE_LIMIT_CODES:
                logging.warning(
                    'Rate limited. Marking as error and finishing. %s', e)
                source.updates.update({
                    'poll_status': 'error',
                    'rate_limited': True
                })
                return
            elif ((code and int(code) // 100 == 5)
                  or util.is_connection_failure(e)):
                logging.error(
                    'API call failed. Marking as error and finishing. %s: %s\n%s',
                    code, body, e)
                self.abort(ERROR_HTTP_RETURN_CODE)
            else:
                raise

        # extract silo activity ids, update last_activity_id
        silo_activity_ids = set()
        last_activity_id = source.last_activity_id
        for id, activity in activities.items():
            # maybe replace stored last activity id
            parsed = util.parse_tag_uri(id)
            if parsed:
                id = parsed[1]
            silo_activity_ids.add(id)
            try:
                # try numeric comparison first
                greater = int(id) > int(last_activity_id)
            except (TypeError, ValueError):
                greater = id > last_activity_id
            if greater:
                last_activity_id = id

        if last_activity_id and last_activity_id != source.last_activity_id:
            source.updates['last_activity_id'] = last_activity_id

        # trim cache to just the returned activity ids, so that it doesn't grow
        # without bound. (WARNING: depends on get_activities_response()'s cache key
        # format, e.g. 'PREFIX ACTIVITY_ID'!)
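        # (illustrative: a cached key 'AB 12345' survives this trim only if
        # id '12345' was among the silo activity ids returned by this poll)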
        source.updates['last_activities_cache_json'] = json.dumps({
            k: v
            for k, v in cache.items() if k.split()[-1] in silo_activity_ids
        })

        # Cache to make sure we only fetch the author's h-feed(s) the
        # first time we see it
        fetched_hfeeds = set()

        # narrow down to just public activities
        public = {}
        private = {}
        for id, activity in activities.items():
            (public if source.is_activity_public(activity) else
             private)[id] = activity
        logging.info('Found %d public activities: %s', len(public),
                     public.keys())
        logging.info('Found %d private activities: %s', len(private),
                     private.keys())

        last_public_post = (source.last_public_post or util.EPOCH).isoformat()
        public_published = util.trim_nulls(
            [a.get('published') for a in public.values()])
        if public_published:
            max_published = max(public_published)
            if max_published > last_public_post:
                last_public_post = max_published
                source.updates['last_public_post'] = \
                  util.as_utc(util.parse_iso8601(max_published))

        source.updates['recent_private_posts'] = \
          len([a for a in private.values()
               if a.get('published', util.EPOCH_ISO) > last_public_post])

        #
        # Step 2: extract responses, store their activities in response['activities']
        #
        # WARNING: this creates circular references in link posts found by search
        # queries in step 1, since they are their own activity. We use
        # prune_activity() and prune_response() in step 4 to remove these before
        # serializing to JSON.
        #
        for id, activity in public.items():
            obj = activity.get('object') or activity

            # handle user mentions
            user_id = source.user_tag_id()
            if obj.get('author', {}).get('id') != user_id:
                for tag in obj.get('tags', []):
                    urls = tag.get('urls')
                    if tag.get('objectType') == 'person' and tag.get(
                            'id') == user_id and urls:
                        activity['originals'], activity['mentions'] = \
                          original_post_discovery.discover(
                            source, activity, fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds)
                        activity['mentions'].update(
                            u.get('value') for u in urls)
                        responses[id] = activity
                        break

            # handle quote mentions
            for att in obj.get('attachments', []):
                if (att.get('objectType') in ('note', 'article') and att.get(
                        'author', {}).get('id') == source.user_tag_id()):
                    # now that we've confirmed that one exists, OPD will dig
                    # into the actual attachments
                    if 'originals' not in activity or 'mentions' not in activity:
                        activity['originals'], activity['mentions'] = \
                          original_post_discovery.discover(
                            source, activity, fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds)
                    responses[id] = activity
                    break

            # extract replies, likes, reactions, reposts, and rsvps
            replies = obj.get('replies', {}).get('items', [])
            tags = obj.get('tags', [])
            likes = [t for t in tags if Response.get_type(t) == 'like']
            reactions = [t for t in tags if Response.get_type(t) == 'react']
            reposts = [t for t in tags if Response.get_type(t) == 'repost']
            rsvps = Source.get_rsvps_from_event(obj)

            # coalesce responses. drop any without ids
            for resp in replies + likes + reactions + reposts + rsvps:
                id = resp.get('id')
                if not id:
                    logging.error('Skipping response without id: %s',
                                  json.dumps(resp, indent=2))
                    continue

                resp.setdefault('activities', []).append(activity)

                # when we find two responses with the same id, the earlier one may have
                # come from a link post or user mention, and this one is probably better
                # since it probably came from the user's activity, so prefer this one.
                # background: https://github.com/snarfed/bridgy/issues/533
                existing = responses.get(id)
                if existing:
                    if source.gr_source.activity_changed(resp,
                                                         existing,
                                                         log=True):
                        logging.warning(
                            'Got two different versions of same response!\n%s\n%s',
                            existing, resp)
                    resp['activities'].extend(existing.get('activities', []))

                responses[id] = resp

        #
        # Step 3: filter out responses we've already seen
        #
        # seen responses (JSON objects) for each source are stored in its entity.
        unchanged_responses = []
        if source.seen_responses_cache_json:
            for seen in json.loads(source.seen_responses_cache_json):
                id = seen['id']
                resp = responses.get(id)
                if resp and not source.gr_source.activity_changed(
                        seen, resp, log=True):
                    unchanged_responses.append(seen)
                    del responses[id]

        #
        # Step 4: store new responses and enqueue propagate tasks
        #
        pruned_responses = []
        for id, resp in responses.items():
            resp_type = Response.get_type(resp)
            activities = resp.pop('activities', [])
            if not activities and resp_type == 'post':
                activities = [resp]
            too_long = set()
            urls_to_activity = {}
            for i, activity in enumerate(activities):
                # we'll usually have multiple responses for the same activity, and the
                # objects in resp['activities'] are shared, so cache each activity's
                # discovered webmention targets inside its object.
                if 'originals' not in activity or 'mentions' not in activity:
                    activity['originals'], activity['mentions'] = \
                      original_post_discovery.discover(
                        source, activity, fetch_hfeed=True,
                        include_redirect_sources=False,
                        already_fetched_hfeeds=fetched_hfeeds)

                targets = original_post_discovery.targets_for_response(
                    resp,
                    originals=activity['originals'],
                    mentions=activity['mentions'])
                if targets:
                    logging.info('%s has %d webmention target(s): %s',
                                 activity.get('url'), len(targets),
                                 ' '.join(targets))
                for t in targets:
                    if len(t) <= _MAX_STRING_LENGTH:
                        urls_to_activity[t] = i
                    else:
                        logging.warning(
                            'Giving up on target URL over %s chars! %s',
                            _MAX_STRING_LENGTH, t)
                        too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

            # store/update response entity. the prune_*() calls are important to
            # remove circular references in link responses, which are their own
            # activities. details in the step 2 comment above.
            pruned_response = util.prune_response(resp)
            pruned_responses.append(pruned_response)
            resp_entity = Response(id=id,
                                   source=source.key,
                                   activities_json=[
                                       json.dumps(
                                           util.prune_activity(a, source))
                                       for a in activities
                                   ],
                                   response_json=json.dumps(pruned_response),
                                   type=resp_type,
                                   unsent=list(urls_to_activity.keys()),
                                   failed=list(too_long),
                                   original_posts=resp.get('originals', []))
            if urls_to_activity and len(activities) > 1:
                resp_entity.urls_to_activity = json.dumps(urls_to_activity)
            resp_entity.get_or_save(source)

        # update cache
        if pruned_responses:
            source.updates['seen_responses_cache_json'] = json.dumps(
                pruned_responses + unchanged_responses)

        source.updates.update({
            'last_polled': source.last_poll_attempt,
            'poll_status': 'ok'
        })
        if etag and etag != source.last_activities_etag:
            source.updates['last_activities_etag'] = etag

        #
        # Step 5. possibly refetch updated syndication urls
        #
        # if the author has added syndication urls since the first time
        # original_post_discovery ran, we'll miss them. this cleanup task will
        # periodically check for updated urls. only kicks in if the author has
        # *ever* published a rel=syndication url
        if source.should_refetch():
            logging.info('refetching h-feed for source %s', source.label())
            relationships = original_post_discovery.refetch(source)

            now = util.now_fn()
            source.updates['last_hfeed_refetch'] = now

            if relationships:
                logging.info(
                    'refetch h-feed found new rel=syndication relationships: %s',
                    relationships)
                try:
                    self.repropagate_old_responses(source, relationships)
                except BaseException as e:
                    if (isinstance(e, (datastore_errors.BadRequestError,
                                       datastore_errors.Timeout))
                            or util.is_connection_failure(e)):
                        logging.info('Timeout while repropagating responses.',
                                     exc_info=True)
                    else:
                        raise
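
should_refetch() gates step 5 so the author's h-feed is only recrawled periodically. A simplified sketch of such a throttle, assuming a last_hfeed_refetch timestamp and a fixed interval; Bridgy's real policy also depends on whether the author ever published a rel=syndication link:

import datetime

REFETCH_PERIOD = datetime.timedelta(hours=2)  # illustrative interval

def should_refetch(source):
    """Returns True once enough time has passed since the last refetch."""
    return util.now_fn() - source.last_hfeed_refetch >= REFETCH_PERIOD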
Beispiel #27
0
  def get(self, type, source_short_name, string_id, *ids):
    source_cls = models.sources.get(source_short_name)
    if not source_cls:
      self.abort(400, "Source type '%s' not found. Known sources: %s" %
                 (source_short_name, filter(None, models.sources.keys())))

    self.source = source_cls.get_by_id(string_id)
    if not self.source:
      self.abort(400, 'Source %s %s not found' % (source_short_name, string_id))
    elif self.source.status == 'disabled' or 'listen' not in self.source.features:
      self.abort(400, 'Source %s is disabled for backfeed' % self.source.bridgy_path())

    format = self.request.get('format', 'html')
    if format not in ('html', 'json'):
      self.abort(400, 'Invalid format %s, expected html or json' % format)

    for id in ids:
      if not self.VALID_ID.match(id):
        self.abort(404, 'Invalid id %s' % id)

    label = '%s:%s %s %s' % (source_short_name, string_id, type, ids)
    cache_key = 'H ' + label
    obj = memcache.get(cache_key)
    if obj:
      logging.info('Using cached object for %s', label)
    else:
      logging.info('Fetching %s', label)
      try:
        obj = self.get_item(*ids)
      except models.DisableSource as e:
        self.abort(401, "Bridgy's access to your account has expired. Please visit https://brid.gy/ to refresh it!")
      except Exception as e:
        # pass through all API HTTP errors if we can identify them
        code, body = util.interpret_http_exception(e)
        if not code and util.is_connection_failure(e):
          code = 503
          body = str(e)
        if code:
          self.response.status_int = int(code)
          self.response.headers['Content-Type'] = 'text/plain'
          self.response.write('%s error:\n%s' % (self.source.GR_CLASS.NAME, body))
          return
        else:
          raise
      memcache.set(cache_key, obj, time=CACHE_TIME)

    if not obj:
      self.abort(404, label)

    # use https for profile pictures so we don't cause SSL mixed mode errors
    # when serving over https.
    author = obj.get('author', {})
    image = author.get('image', {})
    url = image.get('url')
    if url:
      image['url'] = util.update_scheme(url, self)

    mf2_json = microformats2.object_to_json(obj, synthesize_content=False)

    # try to include the author's silo profile url
    author = first_props(mf2_json.get('properties', {})).get('author', {})
    author_uid = first_props(author.get('properties', {})).get('uid', '')
    if author_uid:
      parsed = util.parse_tag_uri(author_uid)
      if parsed:
        silo_url = self.source.gr_source.user_url(parsed[1])
        urls = author.get('properties', {}).setdefault('url', [])
        if silo_url not in microformats2.get_string_urls(urls):
          urls.append(silo_url)

    # write the response!
    self.response.headers['Access-Control-Allow-Origin'] = '*'
    if format == 'html':
      self.response.headers['Content-Type'] = 'text/html; charset=utf-8'
      self.response.out.write(TEMPLATE.substitute({
            'url': obj.get('url', ''),
            'body': microformats2.json_to_html(mf2_json),
            'title': self.get_title(obj),
            }))
    elif format == 'json':
      self.response.headers['Content-Type'] = 'application/json; charset=utf-8'
      self.response.out.write(json.dumps(mf2_json, indent=2))
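
For reference, a client fetches this handler over plain GET with an optional format query parameter. A minimal sketch with requests; the URL layout here is an assumption based on the handler's (type, source_short_name, string_id, *ids) signature:

import requests

# hypothetical path: /<type>/<source short name>/<user id>/<item ids...>
resp = requests.get('https://brid.gy/comment/twitter/snarfed/12345/67890',
                    params={'format': 'json'})
resp.raise_for_status()
mf2_json = resp.json()  # microformats2 JSON for the backfeed item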
Beispiel #28
0
    def poll(self, source):
        """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
        if source.last_activities_etag or source.last_activity_id:
            logging.debug('Using ETag %s, last activity id %s',
                          source.last_activities_etag, source.last_activity_id)

        #
        # Step 1: fetch activities:
        # * posts by the user
        # * search all posts for the user's domain URLs to find links
        #
        cache = util.CacheDict()
        if source.last_activities_cache_json:
            cache.update(json.loads(source.last_activities_cache_json))

        try:
            # search for links first so that the user's activities and responses
            # override them if they overlap
            links = source.search_for_links()

            # this user's own activities (and user mentions)
            resp = source.get_activities_response(
                fetch_replies=True,
                fetch_likes=True,
                fetch_shares=True,
                fetch_mentions=True,
                count=50,
                etag=source.last_activities_etag,
                min_id=source.last_activity_id,
                cache=cache)
            etag = resp.get('etag')  # used later
            user_activities = resp.get('items', [])

            # these map ids to AS objects
            responses = {a['id']: a for a in links}
            activities = {a['id']: a for a in links + user_activities}

        except Exception as e:
            code, body = util.interpret_http_exception(e)
            if code == '401':
                msg = 'Unauthorized error: %s' % e
                logging.warning(msg, exc_info=True)
                source.updates['poll_status'] = 'ok'
                raise models.DisableSource(msg)
            elif code in util.HTTP_RATE_LIMIT_CODES:
                logging.warning(
                    'Rate limited. Marking as error and finishing. %s', e)
                source.updates.update({
                    'poll_status': 'error',
                    'rate_limited': True
                })
                return
            elif ((code and int(code) // 100 == 5)
                  or util.is_connection_failure(e)):
                logging.error(
                    'API call failed. Marking as error and finishing. %s: %s\n%s',
                    code, body, e)
                self.abort(ERROR_HTTP_RETURN_CODE)
            else:
                raise
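
The except clause above is a four-way triage: disable on auth failure, flag rate limiting, swallow transient 5xx/network errors, and re-raise everything else. The same ladder sketched as a standalone helper (name and return values are illustrative):

def classify_poll_error(e):
    """Maps a poll exception to 'disable', 'rate_limit', 'transient', or 'raise'."""
    code, _ = util.interpret_http_exception(e)
    if code == '401':
        return 'disable'
    elif code in util.HTTP_RATE_LIMIT_CODES:
        return 'rate_limit'
    elif (code and int(code) // 100 == 5) or util.is_connection_failure(e):
        return 'transient'
    return 'raise'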
Beispiel #29
0
  def post(self, *path_args):
    logging.debug('Params: %s', self.request.params)

    key = self.request.params['source_key']
    source = ndb.Key(urlsafe=key).get()
    if not source or source.status == 'disabled' or 'listen' not in source.features:
      logging.error('Source not found or disabled. Dropping task.')
      return
    logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
                 source.bridgy_url(self))

    last_polled = self.request.params['last_polled']
    if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT):
      logging.warning('duplicate poll task! deferring to the other task.')
      return

    logging.info('Last poll: %s', self._last_poll_url(source))

    # mark this source as polling
    source.updates = {
      'poll_status': 'polling',
      'last_poll_attempt': util.now_fn(),
      'rate_limited': False,
    }
    source = models.Source.put_updates(source)

    source.updates = {}
    try:
      self.poll(source)
    except Exception as e:
      source.updates['poll_status'] = 'error'
      code, body = util.interpret_http_exception(e)
      if code in source.DISABLE_HTTP_CODES or isinstance(e, models.DisableSource):
        # the user deauthorized the bridgy app, so disable this source.
        # let the task complete successfully so that it's not retried.
        logging.warning('Disabling source due to: %s' % e, exc_info=True)
        source.updates.update({
          'status': 'disabled',
          'poll_status': 'ok',
        })
        body = '%s\nLast poll: %s' % (source.bridgy_url(self),
                                      self._last_poll_url(source))
      elif code in source.RATE_LIMIT_HTTP_CODES:
        logging.info('Rate limited. Marking as error and finishing. %s', e)
        source.updates['rate_limited'] = True
      elif ((code and int(code) // 100 == 5) or
            code in source.TRANSIENT_ERROR_HTTP_CODES or
            util.is_connection_failure(e)):
        logging.error('API call failed. Marking as error and finishing. %s: %s\n%s',
                      code, body, e)
        self.abort(util.ERROR_HTTP_RETURN_CODE)
      else:
        raise
    finally:
      source = models.Source.put_updates(source)

    # add new poll task. randomize task ETA to within +/- 20% to try to spread
    # out tasks and prevent thundering herds.
    task_countdown = source.poll_period().total_seconds() * random.uniform(.8, 1.2)
    util.add_poll_task(source, countdown=task_countdown)

    # feeble attempt to avoid hitting the instance memory limit
    source = None
    gc.collect()
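
The ±20% jitter on the next poll's ETA is what spreads tasks out. As a worked example, a 30 minute poll period yields a countdown anywhere between 24 and 36 minutes:

import random

def jittered_countdown(period_secs, spread=0.2):
    """Scales period_secs by a uniform random factor in [1 - spread, 1 + spread]."""
    return period_secs * random.uniform(1 - spread, 1 + spread)

countdown = jittered_countdown(30 * 60)  # somewhere in [1440, 2160] seconds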
Beispiel #30
0
  def post(self, source_short_name):
    logging.info('Params: %s', self.request.params.items())
    # strip fragments from source and target url
    self.source_url = urlparse.urldefrag(util.get_required_param(self, 'source'))[0]
    self.target_url = urlparse.urldefrag(util.get_required_param(self, 'target'))[0]

    # follow target url through any redirects, strip utm_* query params
    resp = util.follow_redirects(self.target_url)
    redirected_target_urls = [r.url for r in resp.history]
    self.target_url = util.clean_url(resp.url)

    # parse and validate target URL
    domain = util.domain_from_link(self.target_url)
    if not domain:
      return self.error('Could not parse target URL %s' % self.target_url)

    # look up source by domain
    source_cls = models.sources[source_short_name]
    domain = domain.lower()
    self.source = (source_cls.query()
                   .filter(source_cls.domains == domain)
                   .filter(source_cls.features == 'webmention')
                   .filter(source_cls.status == 'enabled')
                   .get())
    if not self.source:
      # check for a rel-canonical link. Blogger uses these when it serves a post
      # from multiple domains, e.g. country TLDs like epeus.blogspot.co.uk vs
      # epeus.blogspot.com.
      # https://github.com/snarfed/bridgy/issues/805
      mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
      if not mf2:
        # fetch_mf2() already wrote the error response
        return
      domains = util.dedupe_urls(
        util.domain_from_link(url)
        for url in mf2[1].get('rels', {}).get('canonical', []))
      if domains:
        self.source = (source_cls.query()
                       .filter(source_cls.domains.IN(domains))
                       .filter(source_cls.features == 'webmention')
                       .filter(source_cls.status == 'enabled')
                       .get())

    if not self.source:
      return self.error(
        'Could not find %s account for %s. Is it registered with Bridgy?' %
        (source_cls.GR_CLASS.NAME, domain))

    # check that the target URL path is supported
    target_path = urlparse.urlparse(self.target_url).path
    if target_path in ('', '/'):
      return self.error('Home page webmentions are not currently supported.',
                        status=202)
    for pattern in self.source.PATH_BLACKLIST:
      if pattern.match(target_path):
        return self.error('%s webmentions are not supported for URL path: %s' %
                          (self.source.GR_CLASS.NAME, target_path), status=202)

    # create BlogWebmention entity
    id = '%s %s' % (self.source_url, self.target_url)
    self.entity = BlogWebmention.get_or_insert(
      id, source=self.source.key, redirected_target_urls=redirected_target_urls)
    if self.entity.status == 'complete':
      # TODO: response message saying update isn't supported
      self.response.write(self.entity.published)
      return
    logging.debug("BlogWebmention entity: '%s'", self.entity.key.urlsafe())

    # fetch source page
    resp = self.fetch_mf2(self.source_url)
    if not resp:
      return
    self.fetched, data = resp

    item = self.find_mention_item(data.get('items', []))
    if not item:
      return self.error('Could not find target URL %s in source page %s' %
                        (self.target_url, self.fetched.url),
                        data=data, log_exception=False)

    # default author to target domain
    author_name = domain
    author_url = 'http://%s/' % domain

    # extract author name and URL from h-card, if any
    props = item['properties']
    author = first_value(props, 'author')
    if author:
      if isinstance(author, basestring):
        author_name = author
      else:
        author_props = author.get('properties', {})
        author_name = first_value(author_props, 'name')
        author_url = first_value(author_props, 'url')

    # if present, u-url overrides source url
    u_url = first_value(props, 'url')
    if u_url:
      self.entity.u_url = u_url

    # generate content
    content = props['content'][0]  # find_mention_item() guaranteed this is here
    text = (content.get('html') or content.get('value')).strip()
    source_url = self.entity.source_url()
    text += ' <br /> <a href="%s">via %s</a>' % (
      source_url, util.domain_from_link(source_url))

    # write comment
    try:
      self.entity.published = self.source.create_comment(
        self.target_url, author_name, author_url, text)
    except Exception as e:
      code, body = util.interpret_http_exception(e)
      msg = 'Error: %s %s; %s' % (code, e, body)
      if code == '401':
        logging.warning('Disabling source due to: %s' % e, exc_info=True)
        self.source.status = 'disabled'
        self.source.put()
        return self.error(msg, status=code, mail=self.source.is_beta_user())
      elif code == '404':
        # post is gone
        return self.error(msg, status=code, mail=False)
      elif util.is_connection_failure(e) or (code and int(code) // 100 == 5):
        return self.error(msg, status=util.ERROR_HTTP_RETURN_CODE, mail=False)
      elif code or body:
        return self.error(msg, status=code, mail=True)
      else:
        raise

    # write results to datastore
    self.entity.status = 'complete'
    self.entity.put()
    self.response.write(json.dumps(self.entity.published))
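
first_value() pulls the first entry out of a microformats2 property list. A plausible minimal implementation, assuming mf2 properties are always lists (the actual helper may do more normalization):

def first_value(props, name):
  """Returns the first value of the mf2 property, or None if absent/empty."""
  return next(iter(props.get(name, [])), None)

# e.g. first_value({'url': ['https://a.com', 'https://b.com']}, 'url')
# returns 'https://a.com'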
Beispiel #31
0
  def poll(self, source):
    """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
    if source.last_activities_etag or source.last_activity_id:
      logging.debug('Using ETag %s, last activity id %s',
                    source.last_activities_etag, source.last_activity_id)

    #
    # Step 1: fetch activities:
    # * posts by the user
    # * search all posts for the user's domain URLs to find links
    #
    cache = util.CacheDict()
    if source.last_activities_cache_json:
      cache.update(json.loads(source.last_activities_cache_json))

    # search for links first so that the user's activities and responses
    # override them if they overlap
    links = source.search_for_links()

    # this user's own activities (and user mentions)
    resp = source.get_activities_response(
      fetch_replies=True, fetch_likes=True, fetch_shares=True,
      fetch_mentions=True, count=50, etag=source.last_activities_etag,
      min_id=source.last_activity_id, cache=cache)
    etag = resp.get('etag')  # used later
    user_activities = resp.get('items', [])

    # these map ids to AS objects
    responses = {a['id']: a for a in links}
    activities = {a['id']: a for a in links + user_activities}

    # extract silo activity ids, update last_activity_id
    silo_activity_ids = set()
    last_activity_id = source.last_activity_id
    for id, activity in activities.items():
      # maybe replace stored last activity id
      parsed = util.parse_tag_uri(id)
      if parsed:
        id = parsed[1]
      silo_activity_ids.add(id)
      try:
        # try numeric comparison first
        greater = int(id) > int(last_activity_id)
      except (TypeError, ValueError):
        greater = id > last_activity_id
      if greater:
        last_activity_id = id

    if last_activity_id and last_activity_id != source.last_activity_id:
      source.updates['last_activity_id'] = last_activity_id

    # trim cache to just the returned activity ids, so that it doesn't grow
    # without bound. (WARNING: depends on get_activities_response()'s cache key
    # format, e.g. 'PREFIX ACTIVITY_ID'!)
    source.updates['last_activities_cache_json'] = json.dumps(
      {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids})

    self.backfeed(source, responses, activities=activities)

    source.updates.update({'last_polled': source.last_poll_attempt,
                           'poll_status': 'ok'})
    if etag and etag != source.last_activities_etag:
      source.updates['last_activities_etag'] = etag

    #
    # Possibly refetch updated syndication urls.
    #
    # if the author has added syndication urls since the first time
    # original_post_discovery ran, we'll miss them. this cleanup task will
    # periodically check for updated urls. only kicks in if the author has
    # *ever* published a rel=syndication url
    if source.should_refetch():
      logging.info('refetching h-feed for source %s', source.label())
      relationships = original_post_discovery.refetch(source)

      now = util.now_fn()
      source.updates['last_hfeed_refetch'] = now

      if relationships:
        logging.info('refetch h-feed found new rel=syndication relationships: %s',
                     relationships)
        try:
          self.repropagate_old_responses(source, relationships)
        except BaseException as e:
          if (isinstance(e, (datastore_errors.BadRequestError,
                             datastore_errors.Timeout)) or
              util.is_connection_failure(e)):
            logging.info('Timeout while repropagating responses.', exc_info=True)
          else:
            raise
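
util.parse_tag_uri() splits RFC 4151 tag URIs like tag:twitter.com,2013:12345 into a (domain, id) pair, which the id-comparison loop above then orders numerically when possible. A minimal sketch of such a parser (the real one may be stricter):

import re

_TAG_URI = re.compile(r'^tag:([^,]+)(?:,[^:]*)?:(.+)$')

def parse_tag_uri(uri):
  """Returns (domain, id) for a tag: URI, or None if it doesn't parse."""
  match = _TAG_URI.match(uri)
  return match.groups() if match else None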
Beispiel #32
0
  def dispatch_request(self, site):
    logger.info(f'Params: {list(request.values.items())}')
    # strip fragments from source and target url
    self.source_url = urllib.parse.urldefrag(request.form['source'])[0]
    self.target_url = urllib.parse.urldefrag(request.form['target'])[0]

    # follow target url through any redirects, strip utm_* query params
    resp = util.follow_redirects(self.target_url)
    redirected_target_urls = [r.url for r in resp.history]
    self.target_url = util.clean_url(resp.url)

    # parse and validate target URL
    domain = util.domain_from_link(self.target_url)
    if not domain:
      self.error(f'Could not parse target URL {self.target_url}')

    # look up source by domain
    source_cls = models.sources[site]
    domain = domain.lower()
    self.source = (source_cls.query()
                   .filter(source_cls.domains == domain)
                   .filter(source_cls.features == 'webmention')
                   .filter(source_cls.status == 'enabled')
                   .get())
    if not self.source:
      # check for a rel-canonical link. Blogger uses these when it serves a post
      # from multiple domains, e.g. country TLDs like epeus.blogspot.co.uk vs
      # epeus.blogspot.com.
      # https://github.com/snarfed/bridgy/issues/805
      mf2 = self.fetch_mf2(self.target_url, require_mf2=False)
      if not mf2:
        # fetch_mf2() already wrote the error response
        return
      domains = util.dedupe_urls(
        util.domain_from_link(url)
        for url in mf2[1]['rels'].get('canonical', []))
      if domains:
        self.source = (source_cls.query()
                       .filter(source_cls.domains.IN(domains))
                       .filter(source_cls.features == 'webmention')
                       .filter(source_cls.status == 'enabled')
                       .get())

    if not self.source:
      self.error(
        f'Could not find {source_cls.GR_CLASS.NAME} account for {domain}. Is it registered with Bridgy?')

    # check that the target URL path is supported
    target_path = urllib.parse.urlparse(self.target_url).path
    if target_path in ('', '/'):
      msg = 'Home page webmentions are not currently supported.'
      logger.info(msg)
      return {'error': msg}, 202
    for pattern in self.source.PATH_BLOCKLIST:
      if pattern.match(target_path):
        msg = f'{self.source.GR_CLASS.NAME} webmentions are not supported for URL path: {target_path}'
        logger.info(msg)
        return {'error': msg}, 202

    # create BlogWebmention entity
    id = f'{self.source_url} {self.target_url}'
    self.entity = BlogWebmention.get_or_insert(
      id, source=self.source.key, redirected_target_urls=redirected_target_urls)
    if self.entity.status == 'complete':
      # TODO: response message saying update isn't supported
      return self.entity.published
    logger.debug(f'BlogWebmention entity: {self.entity.key.urlsafe().decode()}')

    # fetch source page
    fetched = self.fetch_mf2(self.source_url)
    if not fetched:
      return
    resp, mf2 = fetched

    item = self.find_mention_item(mf2.get('items', []))
    if not item:
      self.error(f'Could not find target URL {self.target_url} in source page {resp.url}', data=mf2, log_exception=False)

    # default author to target domain
    author_name = domain
    author_url = f'http://{domain}/'

    # extract author name and URL from h-card, if any
    props = item['properties']
    author = get_first(props, 'author')
    if author:
      if isinstance(author, str):
        author_name = author
      else:
        author_props = author.get('properties', {})
        author_name = get_first(author_props, 'name')
        author_url = get_first(author_props, 'url')

    # if present, u-url overrides source url
    u_url = get_first(props, 'url')
    if u_url:
      self.entity.u_url = u_url

    # generate content
    content = props['content'][0]  # find_mention_item() guaranteed this is here
    text = (content.get('html') or content.get('value')).strip()
    source_url = self.entity.source_url()
    text += f' <br /> <a href="{source_url}">via {util.domain_from_link(source_url)}</a>'

    # write comment
    try:
      self.entity.published = self.source.create_comment(
        self.target_url, author_name, author_url, text)
    except Exception as e:
      code, body = util.interpret_http_exception(e)
      msg = f'Error: {code}: {e}; {body}'
      if code == '401':
        logger.warning(f'Disabling source due to: {e}', exc_info=True)
        self.source.status = 'disabled'
        self.source.put()
        self.error(msg, status=code, report=self.source.is_beta_user())
      elif code == '404':
        # post is gone
        self.error(msg, status=code, report=False)
      elif util.is_connection_failure(e) or (code and int(code) // 100 == 5):
        self.error(msg, status=502, report=False)
      elif code or body:
        self.error(msg, status=code, report=True)
      else:
        raise

    # write results to datastore
    self.entity.status = 'complete'
    self.entity.put()

    return self.entity.published
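
For completeness, a webmention sender exercises this endpoint with a form-encoded POST of source and target URLs. A minimal client sketch; the mount point /webmention/<site> is an assumption:

import requests

resp = requests.post(
  'https://brid.gy/webmention/wordpress',  # hypothetical mount point
  data={'source': 'https://example.com/reply-to-post',
        'target': 'https://myblog.example.com/2021/01/some-post/'})
print(resp.status_code, resp.text)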