Code example #1
    def lease(self, key):
        """Attempts to acquire and lease the :class:`models.Webmentions` entity.

    Returns True on success, False or None otherwise.

    TODO: unify with :meth:`complete()`

    Args:
      key: :class:`ndb.Key`
    """
        self.entity = key.get()

        if self.entity is None:
            self.fail('no entity!')
        elif self.entity.status == 'complete':
            # let this task return 200 and finish
            logging.warning('duplicate task already propagated this')
        elif (self.entity.status == 'processing'
              and util.now_fn() < self.entity.leased_until):
            self.fail('duplicate task is currently processing!')
        else:
            assert self.entity.status in ('new', 'processing',
                                          'error'), self.entity.status
            self.entity.status = 'processing'
            self.entity.leased_until = util.now_fn() + self.LEASE_LENGTH
            self.entity.put()
            return True
Code example #2
File: tasks.py Project: snarfed/bridgy
    def lease(self, key):
        """Attempts to acquire and lease the :class:`models.Webmentions` entity.

    Returns True on success, False or None otherwise.

    TODO: unify with :meth:`complete()`

    Args:
      key: :class:`ndb.Key`
    """
        self.entity = key.get()

        if self.entity is None:
            self.fail("no entity!")
        elif self.entity.status == "complete":
            # let this task return 200 and finish
            logging.warning("duplicate task already propagated this")
        elif self.entity.status == "processing" and util.now_fn() < self.entity.leased_until:
            self.fail("duplicate task is currently processing!")
        else:
            assert self.entity.status in ("new", "processing", "error"), self.entity.status
            self.entity.status = "processing"
            self.entity.leased_until = util.now_fn() + self.LEASE_LENGTH
            self.entity.put()
            return True
Code example #3
  def setup_instagram(self, batch_size=None, weekday=0):
    if batch_size:
      self.mox.stubs.Set(cron.UpdateInstagramPictures, 'BATCH', batch_size)

    self.mox.StubOutWithMock(util, 'now_fn')
    # 2017-01-02 is a Monday, which datetime.weekday() returns 0 for
    util.now_fn().AndReturn(datetime.datetime(2017, 1, 2 + weekday))
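
The test helper above pins the clock by stubbing util.now_fn with mox. That only works because the production code in these examples never calls datetime.datetime.now() directly; it always goes through the now_fn indirection. A minimal sketch of that indirection, assuming bridgy's util module follows the common pattern (the real definition may differ):

# util.py (sketch): a module-level alias for "now" so tests can swap it out.
# Production code calls util.now_fn() everywhere; a test can then replace
# util.now_fn with a stub that returns a fixed datetime, as the mox setup
# above does.
import datetime

now_fn = datetime.datetime.now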
Code example #4
File: tasks.py Project: tantek/bridgy
  def lease(self, key):
    """Attempts to acquire and lease the Webmentions entity.

    Returns True on success, False or None otherwise.

    TODO: unify with complete()

    Args:
      key: ndb.Key
    """
    self.entity = key.get()

    if self.entity is None:
      self.fail('no entity!')
    elif self.entity.status == 'complete':
      # let this task return 200 and finish
      logging.warning('duplicate task already propagated this')
    elif (self.entity.status == 'processing' and
          util.now_fn() < self.entity.leased_until):
      self.fail('duplicate task is currently processing!')
    else:
      assert self.entity.status in ('new', 'processing', 'error')
      self.entity.status = 'processing'
      self.entity.leased_until = util.now_fn() + self.LEASE_LENGTH
      self.entity.put()
      return True
Code example #5
  def setup_instagram(self, batch_size=None, weekday=0):
    self.mox.stubs.Set(appengine_config, 'INSTAGRAM_SESSIONID_COOKIE', None)
    if batch_size:
      self.mox.stubs.Set(cron.UpdateInstagramPictures, 'BATCH', batch_size)

    self.mox.StubOutWithMock(util, 'now_fn')
    # 2017-01-02 is a Monday, which datetime.weekday() returns 0 for
    util.now_fn().AndReturn(datetime.datetime(2017, 1, 2 + weekday))
Code example #6
def replace_poll_tasks():
    """Finds sources missing their poll tasks and adds new ones."""
    queries = [
        cls.query(Source.features == 'listen', Source.status == 'enabled',
                  Source.last_poll_attempt < util.now_fn() - timedelta(days=2))
        for cls in models.sources.values() if cls.AUTO_POLL
    ]
    for source in itertools.chain(*queries):
        age = util.now_fn() - source.last_poll_attempt
        logger.info(
            f'{source.bridgy_url()} last polled {age} ago. Adding new poll task.'
        )
        util.add_poll_task(source)

    return ''
Code example #7
File: pages.py Project: snarfed/bridgy
def discover():
    source = util.load_source()

    # validate URL, find silo post
    url = request.form['url']
    domain = util.domain_from_link(url)
    path = urllib.parse.urlparse(url).path
    msg = 'Discovering now. Refresh in a minute to see the results!'

    gr_source = source.gr_source
    if domain == gr_source.DOMAIN:
        post_id = gr_source.post_id(url)
        if post_id:
            type = 'event' if path.startswith('/events/') else None
            util.add_discover_task(source, post_id, type=type)
        else:
            msg = f"Sorry, that doesn't look like a {gr_source.NAME} post URL."

    elif util.domain_or_parent_in(domain, source.domains):
        synd_links = original_post_discovery.process_entry(
            source, url, {}, False, [])
        if synd_links:
            for link in synd_links:
                util.add_discover_task(source, gr_source.post_id(link))
            source.updates = {'last_syndication_url': util.now_fn()}
            models.Source.put_updates(source)
        else:
            msg = f'Failed to fetch {util.pretty_link(url)} or find a {gr_source.NAME} syndication link.'

    else:
        msg = f'Please enter a URL on either your web site or {gr_source.NAME}.'

    flash(msg)
    return redirect(source.bridgy_url())
Code example #8
File: app.py Project: snarfed/bridgy
  def post(self):
    source = self.load_source()

    # validate URL, find silo post
    url = util.get_required_param(self, 'url')
    domain = util.domain_from_link(url)
    path = urlparse.urlparse(url).path
    msg = 'Discovering now. Refresh in a minute to see the results!'

    if domain == source.GR_CLASS.DOMAIN:
      post_id = source.GR_CLASS.post_id(url)
      if post_id:
        type = 'event' if path.startswith('/events/') else None
        util.add_discover_task(source, post_id, type=type)
      else:
        msg = "Sorry, that doesn't look like a %s post URL." % source.GR_CLASS.NAME

    elif util.domain_or_parent_in(domain, source.domains):
      synd_links = original_post_discovery.process_entry(source, url, {}, False, [])
      if synd_links:
        for link in synd_links:
          util.add_discover_task(source, source.GR_CLASS.post_id(link))
        source.updates = {'last_syndication_url': util.now_fn()}
        models.Source.put_updates(source)
      else:
        msg = 'Failed to fetch %s or find a %s syndication link.' % (
          util.pretty_link(url), source.GR_CLASS.NAME)

    else:
      msg = 'Please enter a URL on either your web site or %s.' % source.GR_CLASS.NAME

    self.messages.add(msg)
    self.redirect(source.bridgy_url(self))
Code example #9
        def check_params(params):
            req = params['app_engine_http_request']
            if not check_queue(req['relative_uri']):
                return False

            # convert model objects and keys to url-safe key strings for comparison
            for name, val in kwargs.items():
                if isinstance(val, ndb.Model):
                    kwargs[name] = val.key.urlsafe().decode()
                elif isinstance(val, ndb.Key):
                    kwargs[name] = val.urlsafe().decode()

            got = set(urllib.parse.parse_qsl(req['body'].decode()))
            expected = set(kwargs.items())
            if got != expected:
                # print('expect_task: expected %s, got %s' % (expected, got))
                return False

            if eta_seconds is not None:
                got = params['schedule_time'].seconds - util.to_utc_timestamp(
                    util.now_fn())
                delta = eta_seconds * .2 + 10
                if not (got + delta >= eta_seconds >= got - delta):
                    # print('expect_task: expected schedule_time %r, got %r' % (eta_seconds, got))
                    return False

            return True
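
The schedule_time comparison above tolerates queueing slop: with delta = eta_seconds * .2 + 10, the observed countdown may differ from the expected ETA by up to 20% plus ten seconds before the check fails. A small worked example; the numbers are arbitrary:

# Illustration of the ETA tolerance check from check_params() above.
eta_seconds = 300                # caller expects the task roughly 5 minutes out
got = 245                        # countdown actually observed on the queued task
delta = eta_seconds * .2 + 10    # 70 seconds of slack
print(got + delta >= eta_seconds >= got - delta)   # True: 245 is within 300 +/- 70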
Code example #10
File: test_app.py Project: snarfed/bridgy
  def test_discover_url_site_post_syndication_links(self):
    self.expect_requests_get('http://si.te/123', """
<div class="h-entry">
  foo
  <a class="u-syndication" href="http://fa.ke/222"></a>
  <a class="u-syndication" href="http://other/silo"></a>
  <a class="u-syndication" href="http://fa.ke/post/444"></a>
</div>""")
    self.mox.ReplayAll()

    self.assertEqual(0, SyndicatedPost.query().count())
    self.check_discover('http://si.te/123',
        'Discovering now. Refresh in a minute to see the results!')

    self.assertItemsEqual([
      {'https://fa.ke/222': 'http://si.te/123'},
      {'https://fa.ke/post/444': 'http://si.te/123'},
      ], [{sp.syndication: sp.original} for sp in models.SyndicatedPost.query()])

    tasks = self.taskqueue_stub.GetTasks('discover')
    key = self.source.key.urlsafe()
    self.assertEqual([
      {'source_key': key, 'post_id': '222'},
      {'source_key': key, 'post_id': '444'},
    ], [testutil.get_task_params(task) for task in tasks])

    now = util.now_fn()
    source = self.source.key.get()
    self.assertEqual(now, source.last_syndication_url)
Code example #11
    def post(self):
        source = self.load_source()

        # validate URL, find silo post
        url = util.get_required_param(self, 'url')
        domain = util.domain_from_link(url)
        path = urllib.parse.urlparse(url).path
        msg = 'Discovering now. Refresh in a minute to see the results!'

        if domain == source.GR_CLASS.DOMAIN:
            post_id = source.GR_CLASS.post_id(url)
            if post_id:
                type = 'event' if path.startswith('/events/') else None
                util.add_discover_task(source, post_id, type=type)
            else:
                msg = "Sorry, that doesn't look like a %s post URL." % source.GR_CLASS.NAME

        elif util.domain_or_parent_in(domain, source.domains):
            synd_links = original_post_discovery.process_entry(
                source, url, {}, False, [])
            if synd_links:
                for link in synd_links:
                    util.add_discover_task(source,
                                           source.GR_CLASS.post_id(link))
                source.updates = {'last_syndication_url': util.now_fn()}
                models.Source.put_updates(source)
            else:
                msg = 'Failed to fetch %s or find a %s syndication link.' % (
                    util.pretty_link(url), source.GR_CLASS.NAME)

        else:
            msg = 'Please enter a URL on either your web site or %s.' % source.GR_CLASS.NAME

        self.messages.add(msg)
        self.redirect(source.bridgy_url(self))
Code example #12
def refetch(source):
  """Refetch the author's URLs and look for new or updated syndication
  links that might not have been there the first time we looked.

  Args:
    source: models.Source subclass. Changes to property values (e.g. domains,
      domain_urls, last_syndication_url) are stored in source.updates; they
      should be updated transactionally later.

  Return:
    a dict of syndicated_url to a list of new models.SyndicatedPosts
  """
  if not source.updates:
    source.updates = {}

  logging.debug('attempting to refetch h-feed for %s', source.label())
  results = {}
  for url in _get_author_urls(source):
    results.update(_process_author(source, url, refetch=True))

  now = util.now_fn()
  logging.debug('updating source last_hfeed_fetch %s', now)
  source.updates['last_hfeed_fetch'] = now

  return results
Code example #13
  def test_discover_url_site_post_syndication_links(self):
    self.expect_requests_get('http://si.te/123', """
<div class="h-entry">
  foo
  <a class="u-syndication" href="http://fa.ke/222"></a>
  <a class="u-syndication" href="http://other/silo"></a>
  <a class="u-syndication" href="http://fa.ke/post/444"></a>
</div>""")
    self.mox.ReplayAll()

    self.assertEqual(0, SyndicatedPost.query().count())
    self.check_discover('http://si.te/123',
        'Discovering now. Refresh in a minute to see the results!')

    self.assertItemsEqual([
      {'https://fa.ke/222': 'http://si.te/123'},
      {'https://fa.ke/post/444': 'http://si.te/123'},
      ], [{sp.syndication: sp.original} for sp in models.SyndicatedPost.query()])

    tasks = self.taskqueue_stub.GetTasks('discover')
    key = self.source.key.urlsafe()
    self.assertEqual([
      {'source_key': key, 'post_id': '222'},
      {'source_key': key, 'post_id': '444'},
    ], [testutil.get_task_params(task) for task in tasks])

    now = util.now_fn()
    source = self.source.key.get()
    self.assertEqual(now, source.last_syndication_url)
Code example #14
File: test_cron.py Project: snarfed/bridgy
  def test_replace_poll_tasks(self):
    now = util.now_fn()

    # a bunch of sources, one needs a new poll task
    five_min_ago = now - datetime.timedelta(minutes=5)
    day_and_half_ago = now - datetime.timedelta(hours=36)
    month_ago = now - datetime.timedelta(days=30)
    defaults = {
      'features': ['listen'],
      'last_webmention_sent': day_and_half_ago,
      }

    self.clear_datastore()
    sources = [
      # doesn't need a new poll task
      FakeSource.new(last_poll_attempt=now, **defaults).put(),
      FakeSource.new(last_poll_attempt=five_min_ago, **defaults).put(),
      FakeSource.new(status='disabled', **defaults).put(),
      FakeSource.new(status='disabled', **defaults).put(),
      # need a new poll task
      FakeSource.new(status='enabled', **defaults).put(),
      # not signed up for listen
      FakeSource.new(last_webmention_sent=day_and_half_ago).put(),
      # never sent a webmention, past grace period. last polled is older than 2x
      # fast poll, but within 2x slow poll.
      FakeSource.new(features=['listen'], created=month_ago,
                     last_poll_attempt=day_and_half_ago).put(),
      ]

    self.expect_task('poll', source_key=sources[4], last_polled='1970-01-01-00-00-00')
    self.mox.ReplayAll()

    resp = self.client.get('/cron/replace_poll_tasks')
    self.assertEqual(200, resp.status_code)
Code example #15
File: test_pages.py Project: snarfed/bridgy
    def test_discover_url_site_post_syndication_links(self):
        self.expect_requests_get(
            'http://si.te/123', """
<div class="h-entry">
  foo
  <a class="u-syndication" href="http://fa.ke/222"></a>
  <a class="u-syndication" href="http://other/silo"></a>
  <a class="u-syndication" href="http://fa.ke/post/444"></a>
</div>""")

        self.expect_task('discover', source_key=self.source, post_id='222')
        self.expect_task('discover', source_key=self.source, post_id='444')
        self.mox.ReplayAll()

        self.assertEqual(0, SyndicatedPost.query().count())
        self.check_discover(
            'http://si.te/123',
            'Discovering now. Refresh in a minute to see the results!')

        self.assertCountEqual([
            {
                'https://fa.ke/222': 'http://si.te/123'
            },
            {
                'https://fa.ke/post/444': 'http://si.te/123'
            },
        ], [{
            sp.syndication: sp.original
        } for sp in models.SyndicatedPost.query()])

        now = util.now_fn()
        source = self.source.key.get()
        self.assertEqual(now, source.last_syndication_url)
Code example #16
    def test_discover_url_site_post_last_feed_syndication_url(self):
        now = util.now_fn()
        self.source.last_feed_syndication_url = now
        self.source.put()

        self.expect_requests_get(
            'http://si.te/123', """
<div class="h-entry">
  <a class="u-syndication" href="http://fa.ke/222"></a>
</div>""")
        self.mox.ReplayAll()

        self.check_discover(
            'http://si.te/123',
            'Discovering now. Refresh in a minute to see the results!')

        tasks = self.taskqueue_stub.GetTasks('discover')
        key = self.source.key.urlsafe()
        self.assertEqual([{
            'source_key': key,
            'post_id': '222'
        }], [testutil.get_task_params(task) for task in tasks])

        source = self.source.key.get()
        self.assertEqual(now, source.last_syndication_url)
Code example #17
def _posse_post_discovery(source, activity, syndication_url, fetch_hfeed):
  """Performs the actual meat of the posse-post-discover.

  Args:
    source: models.Source subclass
    activity: activity dict
    syndication_url: url of the syndicated copy for which we are
                     trying to find an original
    fetch_hfeed: boolean, whether or not to fetch and parse the
                 author's feed if we don't have a previously stored
                 relationship.

  Return:
    sequence of string original post urls, possibly empty
  """
  logging.info('starting posse post discovery with syndicated %s', syndication_url)
  relationships = SyndicatedPost.query(
    SyndicatedPost.syndication == syndication_url,
    ancestor=source.key).fetch()
  if not relationships and fetch_hfeed:
    # a syndicated post we haven't seen before! fetch the author's URLs to see
    # if we can find it.
    #
    # TODO: Consider using the actor's url, with get_author_urls() as the
    # fallback in the future to support content from non-Bridgy users.
    results = {}
    for url in _get_author_urls(source):
      results.update(_process_author(source, url))
    relationships = results.get(syndication_url, [])

    now = util.now_fn()
    logging.debug('updating source last_hfeed_fetch %s', now)
    source.updates['last_hfeed_fetch'] = util.now_fn()

  if not relationships:
    # No relationships were found. Remember that we've seen this
    # syndicated post to avoid reprocessing it every time
    logging.debug('posse post discovery found no relationship for %s',
                  syndication_url)
    if fetch_hfeed:
      SyndicatedPost.insert_syndication_blank(source, syndication_url)

  originals = [r.original for r in relationships if r.original]
  if originals:
    logging.debug('posse post discovery found relationship(s) %s -> %s',
                  syndication_url, originals)
  return originals
Code example #18
File: cron.py Project: mblaney/bridgy
  def source_query(self):
    now = util.now_fn()
    since_sun = (now.weekday() * datetime.timedelta(days=1) +
                 (now - now.replace(hour=0, minute=0, second=0)))
    batch = float(Instagram.query().count()) / self.BATCH
    offset = batch * float(since_sun.total_seconds()) / self.FREQUENCY.total_seconds()
    return Instagram.query().fetch(offset=int(math.floor(offset)),
                                   limit=int(math.ceil(batch)))
Code example #19
  def source_query(self):
    now = util.now_fn()
    since_sun = (now.weekday() * datetime.timedelta(days=1) +
                 (now - now.replace(hour=0, minute=0, second=0)))
    batch = float(Instagram.query().count()) / self.BATCH
    offset = batch * float(since_sun.total_seconds()) / self.FREQUENCY.total_seconds()
    return Instagram.query().fetch(offset=int(math.floor(offset)),
                                   limit=int(math.ceil(batch)))
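
In the two source_query() variants above, since_sun is how far we are into the current week (datetime.weekday() returns 0 on Monday), batch is how many Instagram sources each run should cover, and offset advances that window by one batch for every FREQUENCY of elapsed week time, so a cron job that runs once per FREQUENCY walks through the whole table over the week. A rough worked example; the figures below are purely illustrative, not bridgy's real constants:

import datetime
import math

count = 700                               # pretend total number of Instagram sources
BATCH = 7 * 24                            # assumed: one batch per hour for a week
FREQUENCY = datetime.timedelta(hours=1)   # assumed interval between cron runs

now = datetime.datetime(2017, 1, 4, 12, 0)   # Wednesday noon
since_sun = (now.weekday() * datetime.timedelta(days=1) +
             (now - now.replace(hour=0, minute=0, second=0)))   # 2 days 12 hours

batch = float(count) / BATCH              # ~4.17 sources per run
offset = batch * since_sun.total_seconds() / FREQUENCY.total_seconds()
# since_sun spans 60 FREQUENCY intervals, so the window starts ~60 batches in.
print(int(math.floor(offset)), int(math.ceil(batch)))   # 250 5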
Code example #20
File: tasks.py Project: mblaney/bridgy
  def post(self, *path_args):
    logging.debug('Params: %s', self.request.params)

    key = self.request.params['source_key']
    source = ndb.Key(urlsafe=key).get()
    if not source or source.status == 'disabled' or 'listen' not in source.features:
      logging.error('Source not found or disabled. Dropping task.')
      return
    logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
                 source.bridgy_url(self))

    last_polled = self.request.params['last_polled']
    if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT):
      logging.warning('duplicate poll task! deferring to the other task.')
      return

    logging.info('Last poll: %s', self._last_poll_url(source))

    # mark this source as polling
    source.updates = {
      'poll_status': 'polling',
      'last_poll_attempt': util.now_fn(),
      'rate_limited': False,
    }
    source = models.Source.put_updates(source)

    source.updates = {}
    try:
      self.poll(source)
    except Exception, e:
      source.updates['poll_status'] = 'error'
      code, body = util.interpret_http_exception(e)
      if code == '401' or isinstance(e, models.DisableSource):
        # the user deauthorized the bridgy app, so disable this source.
        # let the task complete successfully so that it's not retried.
        logging.warning('Disabling source due to: %s' % e, exc_info=True)
        source.updates.update({
          'status': 'disabled',
          'poll_status': 'ok',
        })
        body = '%s\nLast poll: %s' % (source.bridgy_url(self),
                                      self._last_poll_url(source))
        if source.is_beta_user():
          util.email_me(subject='Bridgy: disabled %s' % source.label(), body=body)

      elif code in util.HTTP_RATE_LIMIT_CODES:
        logging.info('Rate limited. Marking as error and finishing. %s', e)
        source.updates['rate_limited'] = True
      elif ((code and int(code) / 100 == 5) or
            (code == '400' and isinstance(source, flickr.Flickr)) or
            util.is_connection_failure(e)):
        logging.error('API call failed. Marking as error and finishing. %s: %s\n%s',
                      code, body, e)
        self.abort(util.ERROR_HTTP_RETURN_CODE)
      else:
        raise
Code example #21
File: tasks.py Project: murindwaz/bridgy
  def post(self, *path_args):
    logging.debug('Params: %s', self.request.params)

    key = self.request.params['source_key']
    source = ndb.Key(urlsafe=key).get()
    if not source or source.status == 'disabled' or 'listen' not in source.features:
      logging.error('Source not found or disabled. Dropping task.')
      return
    logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
                 source.bridgy_url(self))

    last_polled = self.request.params['last_polled']
    if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT):
      logging.warning('duplicate poll task! deferring to the other task.')
      return

    logging.info('Last poll: %s', self._last_poll_url(source))

    # mark this source as polling
    source.updates = {
      'poll_status': 'polling',
      'last_poll_attempt': util.now_fn(),
      'rate_limited': False,
    }
    source = models.Source.put_updates(source)

    source.updates = {}
    try:
      self.poll(source)
    except Exception, e:
      source.updates['poll_status'] = 'error'
      code, body = util.interpret_http_exception(e)
      if code in source.DISABLE_HTTP_CODES or isinstance(e, models.DisableSource):
        # the user deauthorized the bridgy app, so disable this source.
        # let the task complete successfully so that it's not retried.
        logging.warning('Disabling source due to: %s' % e, exc_info=True)
        source.updates.update({
          'status': 'disabled',
          'poll_status': 'ok',
        })
        body = '%s\nLast poll: %s' % (source.bridgy_url(self),
                                      self._last_poll_url(source))
        if source.is_beta_user():
          util.email_me(subject='Bridgy: disabled %s' % source.label(), body=body)

      elif code in source.RATE_LIMIT_HTTP_CODES:
        logging.info('Rate limited. Marking as error and finishing. %s', e)
        source.updates['rate_limited'] = True
      elif ((code and int(code) / 100 == 5) or
            (code == '400' and isinstance(source, flickr.Flickr)) or
            util.is_connection_failure(e)):
        logging.error('API call failed. Marking as error and finishing. %s: %s\n%s',
                      code, body, e)
        self.abort(util.ERROR_HTTP_RETURN_CODE)
      else:
        raise
Code example #22
File: tasks.py Project: jamietanna/bridgy
  def post(self, *path_args):
    self.request.headers['Content-Type'] = 'application/x-www-form-urlencoded'
    logging.debug('Params: %s', list(self.request.params.items()))

    key = self.request.params['source_key']
    source = self.source = ndb.Key(urlsafe=key).get()
    if not source or source.status == 'disabled' or 'listen' not in source.features:
      logging.error('Source not found or disabled. Dropping task.')
      return
    logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
                 source.bridgy_url(self))

    last_polled = self.request.params['last_polled']
    if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT):
      logging.warning('duplicate poll task! deferring to the other task.')
      return

    logging.info('Last poll: %s', self._last_poll_url(source))

    # mark this source as polling
    source.updates = {
      'poll_status': 'polling',
      'last_poll_attempt': util.now_fn(),
      'rate_limited': False,
    }
    source = models.Source.put_updates(source)

    source.updates = {}
    try:
      self.poll(source)
    except Exception as e:
      source.updates['poll_status'] = 'error'
      code, body = util.interpret_http_exception(e)
      if code in source.DISABLE_HTTP_CODES or isinstance(e, models.DisableSource):
        # the user deauthorized the bridgy app, so disable this source.
        # let the task complete successfully so that it's not retried.
        logging.warning('Disabling source due to: %s' % e, stack_info=True)
        source.updates.update({
          'status': 'disabled',
          'poll_status': 'ok',
        })
        body = '%s\nLast poll: %s' % (source.bridgy_url(self),
                                      self._last_poll_url(source))
      elif code in source.RATE_LIMIT_HTTP_CODES:
        logging.info('Rate limited. Marking as error and finishing. %s', e)
        source.updates['rate_limited'] = True
      else:
        raise
    finally:
      source = models.Source.put_updates(source)

    util.add_poll_task(source)

    # feeble attempt to avoid hitting the instance memory limit
    source = None
    gc.collect()
Code example #23
File: tasks.py Project: LennonFlores/bridgy
    def post(self, *path_args):
        logging.debug('Params: %s', self.request.params)

        key = self.request.params['source_key']
        source = ndb.Key(urlsafe=key).get()
        if not source or source.status == 'disabled' or 'listen' not in source.features:
            logging.error('Source not found or disabled. Dropping task.')
            return
        logging.info('Source: %s %s, %s', source.label(),
                     source.key.string_id(), source.bridgy_url(self))

        last_polled = self.request.params['last_polled']
        if last_polled != source.last_polled.strftime(
                util.POLL_TASK_DATETIME_FORMAT):
            logging.warning(
                'duplicate poll task! deferring to the other task.')
            return

        logging.info('Last poll: %s/log?start_time=%s&key=%s',
                     self.request.host_url,
                     calendar.timegm(source.last_poll_attempt.utctimetuple()),
                     source.key.urlsafe())

        # mark this source as polling
        source.updates = {
            'poll_status': 'polling',
            'last_poll_attempt': util.now_fn(),
        }
        source = models.Source.put_updates(source)

        source.updates = {}
        try:
            self.poll(source)
        except models.DisableSource:
            # the user deauthorized the bridgy app, so disable this source.
            # let the task complete successfully so that it's not retried.
            source.updates['status'] = 'disabled'
            logging.warning('Disabling source!')
        except:
            source.updates['poll_status'] = 'error'
            raise
        finally:
            source = models.Source.put_updates(source)

        # add new poll task. randomize task ETA to within +/- 20% to try to spread
        # out tasks and prevent thundering herds.
        task_countdown = source.poll_period().total_seconds() * random.uniform(
            .8, 1.2)
        util.add_poll_task(source, countdown=task_countdown)

        # feeble attempt to avoid hitting the instance memory limit
        source = None
        gc.collect()
Code example #24
  def should_refetch(self):
    """Returns True if we should run OPD refetch on this source now."""
    now = util.now_fn()
    if self.last_hfeed_refetch == REFETCH_HFEED_TRIGGER:
      return True
    elif not self.last_syndication_url:
      return False

    period = (self.FAST_REFETCH
              if self.last_syndication_url > now - timedelta(days=14)
              else self.SLOW_REFETCH)
    return self.last_poll_attempt >= self.last_hfeed_refetch + period
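
Aside from the REFETCH_HFEED_TRIGGER and never-seen-a-syndication-link cases handled first, should_refetch() above fires once at least one refetch period has elapsed between the last h-feed refetch and the most recent poll attempt, using the shorter period while a syndication link has been seen in the past two weeks. A small sketch of that decision with assumed constants (the real FAST_REFETCH and SLOW_REFETCH live on the Source class and may differ):

from datetime import datetime, timedelta

FAST_REFETCH = timedelta(hours=6)    # illustrative values only
SLOW_REFETCH = timedelta(days=2)

now = datetime(2023, 6, 15, 12, 0)
last_syndication_url = now - timedelta(days=3)    # syndication link seen recently
last_hfeed_refetch = now - timedelta(hours=8)
last_poll_attempt = now - timedelta(minutes=5)

period = (FAST_REFETCH if last_syndication_url > now - timedelta(days=14)
          else SLOW_REFETCH)
print(last_poll_attempt >= last_hfeed_refetch + period)   # True: 8h since refetch > 6h period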
Code example #25
    def lease(self, key):
        """Attempts to acquire and lease the :class:`models.Webmentions` entity.

    Also loads and sets `g.source`, and returns False if the source doesn't
    exist or is disabled.

    TODO: unify with :meth:`complete()`

    Args:
      key: :class:`ndb.Key`

    Returns: True on success, False or None otherwise
    """
        self.entity = key.get()

        if self.entity is None:
            return self.fail('no entity!')
        elif self.entity.status == 'complete':
            # let this task return 200 and finish
            logger.warning('duplicate task already propagated this')
            return
        elif (self.entity.status == 'processing'
              and util.now_fn() < self.entity.leased_until):
            return self.fail('duplicate task is currently processing!')

        g.source = self.entity.source.get()
        if not g.source or g.source.status == 'disabled':
            logger.error('Source not found or disabled. Dropping task.')
            return False
        logger.info(
            f'Source: {g.source.label()} {g.source.key_id()}, {g.source.bridgy_url()}'
        )

        assert self.entity.status in ('new', 'processing',
                                      'error'), self.entity.status
        self.entity.status = 'processing'
        self.entity.leased_until = util.now_fn() + self.LEASE_LENGTH
        self.entity.put()
        return True
Code example #26
File: tasks.py Project: tantek/bridgy
  def post(self, *path_args):
    logging.debug('Params: %s', self.request.params)

    key = self.request.params['source_key']
    source = ndb.Key(urlsafe=key).get()
    if not source or source.status == 'disabled' or 'listen' not in source.features:
      logging.error('Source not found or disabled. Dropping task.')
      return
    logging.info('Source: %s %s, %s', source.label(), source.key.string_id(),
                 source.bridgy_url(self))

    last_polled = self.request.params['last_polled']
    if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT):
      logging.warning('duplicate poll task! deferring to the other task.')
      return

    logging.info('Last poll: %s/log?start_time=%s&key=%s',
                 self.request.host_url,
                 calendar.timegm(source.last_poll_attempt.utctimetuple()),
                 source.key.urlsafe())

    # mark this source as polling
    source.updates = {
      'poll_status': 'polling',
      'last_poll_attempt': util.now_fn(),
    }
    source = models.Source.put_updates(source)

    source.updates = {}
    try:
      self.poll(source)
    except models.DisableSource:
      # the user deauthorized the bridgy app, so disable this source.
      # let the task complete successfully so that it's not retried.
      source.updates['status'] = 'disabled'
      logging.warning('Disabling source!')
    except:
      source.updates['poll_status'] = 'error'
      raise
    finally:
      source = models.Source.put_updates(source)

    # add new poll task. randomize task ETA to within +/- 20% to try to spread
    # out tasks and prevent thundering herds.
    task_countdown = source.poll_period().total_seconds() * random.uniform(.8, 1.2)
    util.add_poll_task(source, countdown=task_countdown)

    # feeble attempt to avoid hitting the instance memory limit
    source = None
    gc.collect()
Code example #27
File: tasks.py Project: snarfed/bridgy
    def post(self, *path_args):
        logging.debug("Params: %s", self.request.params)

        key = self.request.params["source_key"]
        source = ndb.Key(urlsafe=key).get()
        if not source or source.status == "disabled" or "listen" not in source.features:
            logging.error("Source not found or disabled. Dropping task.")
            return
        logging.info("Source: %s %s, %s", source.label(), source.key.string_id(), source.bridgy_url(self))

        last_polled = self.request.params["last_polled"]
        if last_polled != source.last_polled.strftime(util.POLL_TASK_DATETIME_FORMAT):
            logging.warning("duplicate poll task! deferring to the other task.")
            return

        logging.info(
            "Last poll: %s/log?start_time=%s&key=%s",
            self.request.host_url,
            calendar.timegm(source.last_poll_attempt.utctimetuple()),
            source.key.urlsafe(),
        )

        # mark this source as polling
        source.updates = {"poll_status": "polling", "last_poll_attempt": util.now_fn()}
        source = models.Source.put_updates(source)

        source.updates = {}
        try:
            self.poll(source)
        except Exception, e:
            source.updates["poll_status"] = "error"
            code, body = util.interpret_http_exception(e)
            if code == "401" or isinstance(e, models.DisableSource):
                # the user deauthorized the bridgy app, so disable this source.
                # let the task complete successfully so that it's not retried.
                logging.warning("Disabling source due to: %s" % e, exc_info=True)
                source.updates.update({"status": "disabled", "poll_status": "ok"})
            elif code in util.HTTP_RATE_LIMIT_CODES:
                logging.warning("Rate limited. Marking as error and finishing. %s", e)
                source.updates["rate_limited"] = True
            elif (code and int(code) / 100 == 5) or util.is_connection_failure(e):
                logging.error("API call failed. Marking as error and finishing. %s: %s\n%s", code, body, e)
                self.abort(ERROR_HTTP_RETURN_CODE)
            else:
                raise
Code example #28
File: tasks.py Project: jamietanna/bridgy
  def record_source_webmention(self, mention):
    """Sets this source's last_webmention_sent and maybe webmention_endpoint.

    Args:
      mention: :class:`webmentiontools.send.WebmentionSend`
    """
    self.source = self.source.key.get()
    logging.info('Setting last_webmention_sent')
    self.source.last_webmention_sent = util.now_fn()

    if (mention.receiver_endpoint != self.source.webmention_endpoint and
        util.domain_from_link(mention.target_url) in self.source.domains):
      logging.info('Also setting webmention_endpoint to %s (discovered in %s; was %s)',
                   mention.receiver_endpoint, mention.target_url,
                   self.source.webmention_endpoint)
      self.source.webmention_endpoint = mention.receiver_endpoint

    self.source.put()
Code example #29
    def test_poll_period(self):
        source = FakeSource.new()
        source.put()

        self.assertEqual(source.FAST_POLL, source.poll_period())

        source.created = datetime(2000, 1, 1, tzinfo=timezone.utc)
        self.assertEqual(source.SLOW_POLL, source.poll_period())

        now = util.now_fn()
        source.last_webmention_sent = now - timedelta(days=8)
        self.assertEqual(source.FAST_POLL * 10, source.poll_period())

        source.last_webmention_sent = now
        self.assertEqual(source.FAST_POLL, source.poll_period())

        source.rate_limited = True
        self.assertEqual(source.RATE_LIMITED_POLL, source.poll_period())
Code example #30
File: tasks.py Project: tantek/bridgy
  def record_source_webmention(self, mention):
    """Sets this source's last_webmention_sent and maybe webmention_endpoint.

    Args:
      mention: webmentiontools.send.WebmentionSend
    """
    self.source = self.source.key.get()
    logging.info('Setting last_webmention_sent')
    self.source.last_webmention_sent = util.now_fn()

    if (mention.receiver_endpoint != self.source.webmention_endpoint and
        util.domain_from_link(mention.target_url) in self.source.domains):
      logging.info('Also setting webmention_endpoint to %s (discovered in %s; was %s)',
                   mention.receiver_endpoint, mention.target_url,
                   self.source.webmention_endpoint)
      self.source.webmention_endpoint = mention.receiver_endpoint

    self.source.put()
Code example #31
    def record_source_webmention(self, endpoint, target):
        """Sets this source's last_webmention_sent and maybe webmention_endpoint.

    Args:
      endpoint: str, URL
      target: str, URL
    """
        g.source = g.source.key.get()
        logger.info('Setting last_webmention_sent')
        g.source.last_webmention_sent = util.now_fn()

        if (endpoint != g.source.webmention_endpoint
                and util.domain_from_link(target) in g.source.domains):
            logger.info(
                f'Also setting webmention_endpoint to {endpoint} (discovered in {target}; was {g.source.webmention_endpoint})'
            )
            g.source.webmention_endpoint = endpoint

        g.source.put()
Code example #32
File: tasks.py Project: Tiamat-Tech/bridgy
    def record_source_webmention(self, endpoint, target):
        """Sets this source's last_webmention_sent and maybe webmention_endpoint.

    Args:
      endpoint: str, URL
      target: str, URL
    """
        self.source = self.source.key.get()
        logging.info('Setting last_webmention_sent')
        self.source.last_webmention_sent = util.now_fn()

        if (endpoint != self.source.webmention_endpoint
                and util.domain_from_link(target) in self.source.domains):
            logging.info(
                'Also setting webmention_endpoint to %s (discovered in %s; was %s)',
                endpoint, target, self.source.webmention_endpoint)
            self.source.webmention_endpoint = endpoint

        self.source.put()
Code example #33
  def poll_period(self):
    """Returns the poll frequency for this source, as a :class:`datetime.timedelta`.

    Defaults to ~15m, depending on silo. If we've never sent a webmention for
    this source, or the last one we sent was over a month ago, we drop them down
    to ~1d after a week long grace period.
    """
    now = util.now_fn()
    if self.rate_limited:
      return self.RATE_LIMITED_POLL
    elif now < self.created + self.FAST_POLL_GRACE_PERIOD:
      return self.FAST_POLL
    elif not self.last_webmention_sent:
      return self.SLOW_POLL
    elif self.last_webmention_sent > now - timedelta(days=7):
      return self.FAST_POLL
    elif self.last_webmention_sent > now - timedelta(days=30):
      return self.FAST_POLL * 10
    else:
      return self.SLOW_POLL
Code example #34
File: test_pages.py Project: snarfed/bridgy
    def test_discover_url_site_post_last_feed_syndication_url(self):
        now = util.now_fn()
        self.source.last_feed_syndication_url = now
        self.source.put()

        self.expect_requests_get(
            'http://si.te/123', """
<div class="h-entry">
  <a class="u-syndication" href="http://fa.ke/222"></a>
</div>""")

        self.expect_task('discover', source_key=self.source, post_id='222')
        self.mox.ReplayAll()

        self.check_discover(
            'http://si.te/123',
            'Discovering now. Refresh in a minute to see the results!')

        source = self.source.key.get()
        self.assertEqual(now, source.last_syndication_url)
Code example #35
File: test_app.py Project: snarfed/bridgy
  def test_discover_url_site_post_last_feed_syndication_url(self):
    now = util.now_fn()
    self.source.last_feed_syndication_url = now
    self.source.put()

    self.expect_requests_get('http://si.te/123', """
<div class="h-entry">
  <a class="u-syndication" href="http://fa.ke/222"></a>
</div>""")
    self.mox.ReplayAll()

    self.check_discover('http://si.te/123',
        'Discovering now. Refresh in a minute to see the results!')

    tasks = self.taskqueue_stub.GetTasks('discover')
    key = self.source.key.urlsafe()
    self.assertEqual([{'source_key': key, 'post_id': '222'}],
                     [testutil.get_task_params(task) for task in tasks])

    source = self.source.key.get()
    self.assertEqual(now, source.last_syndication_url)
Code example #36
File: admin.py Project: snarfed/bridgy
def responses():
  """Find the most recently attempted responses and blog posts with error URLs."""
  entities = []

  for cls in (Response,):  # BlogPost
    for e in cls.query().order(-cls.updated):
      if (len(entities) >= NUM_ENTITIES or
          e.updated < util.now_fn() - datetime.timedelta(hours=1)):
        break
      elif (not e.error and not e.unsent) or e.status == 'complete':
        continue

      e.links = [util.pretty_link(u, new_tab=True) for u in e.error + e.failed]
      if e.key.kind() == 'Response':
        e.response = json_loads(e.response_json)
        e.activities = [json_loads(a) for a in e.activities_json]
      else:
        e.response = {'content': '[BlogPost]'}
        e.activities = [{'url': e.key.id()}]

      entities.append(e)

  return render_template('admin_responses.html', responses=entities, logs=logs)
Code example #37
File: testutil.py Project: kevincox/bridgy
        def check_task(task):
            if not task.parent.endswith('/' + queue):
                # These can help for debugging, but can also be misleading, since many
                # tests insert multiple tasks, so check_task() runs on all of them (due
                # to InAnyOrder() below) until it finds one that matches.
                # print("expect_task: %s doesn't end with /%s!" % (task.parent, queue))
                return False

            req = task.task.app_engine_http_request
            if not req.relative_uri.endswith('/' + queue):
                # print("expect_task: relative_uri %s doesn't end with /%s!" % (
                #   req.relative_uri, queue))
                return False

            # convert model objects and keys to url-safe key strings for comparison
            for name, val in kwargs.items():
                if isinstance(val, ndb.Model):
                    kwargs[name] = val.key.urlsafe().decode()
                elif isinstance(val, ndb.Key):
                    kwargs[name] = val.urlsafe().decode()

            got = set(urllib.parse.parse_qsl(req.body.decode()))
            expected = set(kwargs.items())
            if got != expected:
                # print('expect_task: expected %s, got %s' % (expected, got))
                return False

            if eta_seconds is not None:
                got = (util.to_utc_timestamp(task.task.schedule_time) -
                       util.to_utc_timestamp(util.now_fn()))
                delta = eta_seconds * .2 + 10
                if not (got + delta >= eta_seconds >= got - delta):
                    # print('expect_task: expected schedule_time %r, got %r' % (eta_seconds, got))
                    return False

            return True
Code example #38
def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  try:
    logging.debug('fetching author url %s', author_url)
    author_resp = util.requests_get(author_url)
    # TODO for error codes that indicate a temporary error, should we make
    # a certain number of retries before giving up forever?
    author_resp.raise_for_status()
    author_dom = util.beautifulsoup_parse(author_resp.text)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logging.info('Could not fetch author url %s', author_url, exc_info=True)
    return {}

  feeditems = _find_feed_items(author_url, author_dom)

  # look for all other feed urls using rel='feed', type='text/html'
  feed_urls = set()
  for rel_feed_node in (author_dom.find_all('link', rel='feed')
                        + author_dom.find_all('a', rel='feed')):
    feed_url = rel_feed_node.get('href')
    if not feed_url:
      continue

    feed_url = urlparse.urljoin(author_url, feed_url)
    feed_type = rel_feed_node.get('type')
    if feed_type and feed_type != 'text/html':
      feed_ok = False
    else:
      # double check that it's text/html, not too big, etc
      feed_url, _, feed_ok = util.get_webmention_target(feed_url)

    if feed_url == author_url:
      logging.debug('author url is the feed url, ignoring')
    elif not feed_ok:
      logging.debug('skipping feed of type %s', feed_type)
    else:
      feed_urls.add(feed_url)

  for feed_url in feed_urls:
    try:
      logging.debug("fetching author's rel-feed %s", feed_url)
      feed_resp = util.requests_get(feed_url)
      feed_resp.raise_for_status()
      logging.debug("author's rel-feed fetched successfully %s", feed_url)
      feeditems = _merge_hfeeds(feeditems,
                                _find_feed_items(feed_url, feed_resp.text))

      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logging.info('rel-feed found new domain %s! adding to source', domain)
          domains.append(domain)

    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logging.info('Could not fetch h-feed url %s.', feed_url, exc_info=True)

  # sort by dt-updated/dt-published
  def updated_or_published(item):
    props = microformats2.first_props(item.get('properties'))
    return props.get('updated') or props.get('published')

  feeditems.sort(key=updated_or_published, reverse=True)

  permalink_to_entry = collections.OrderedDict()
  for child in feeditems:
    if 'h-entry' in child['type']:
      permalinks = child['properties'].get('url', [])
      if not permalinks:
        logging.debug('ignoring h-entry with no u-url!')
      for permalink in permalinks:
        if isinstance(permalink, basestring):
          permalink_to_entry[permalink] = child
        else:
          logging.warn('unexpected non-string "url" property: %s', permalink)

    max = (MAX_PERMALINK_FETCHES_BETA if source.is_beta_user()
           else MAX_PERMALINK_FETCHES)
    if len(permalink_to_entry) >= max:
      logging.info('Hit cap of %d permalinks. Stopping.', max)
      break

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(
      SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
      ancestor=source.key)
    for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  results = {}
  for permalink, entry in permalink_to_entry.iteritems():
    logging.debug('processing permalink: %s', permalink)
    new_results = process_entry(
      source, permalink, entry, refetch, preexisting.get(permalink, []),
      store_blanks=store_blanks)
    for key, value in new_results.iteritems():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    source.updates['last_syndication_url'] = util.now_fn()

  return results
Code example #39
def process_entry(source, permalink, feed_entry, refetch, preexisting,
                  store_blanks=True):
  """Fetch and process an h-entry and save a new :class:`models.SyndicatedPost`.

  Args:
    source:
    permalink: url of the unprocessed post
    feed_entry: the h-feed version of the h-entry dict, often contains
      a partial version of the h-entry at the permalink
    refetch: boolean, whether to refetch and process entries we've seen before
    preexisting: list of previously discovered :class:`models.SyndicatedPost`\ s
      for this permalink
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Returns:
    a dict from syndicated url to a list of new :class:`models.SyndicatedPost`\ s
  """
  # if the post has already been processed, do not add to the results
  # since this method only returns *newly* discovered relationships.
  if preexisting:
    # if we're refetching and this one is blank, do not return.
    # if there is a blank entry, it should be the one and only entry,
    # but go ahead and check 'all' of them to be safe.
    if not refetch:
      return {}
    synds = [s.syndication for s in preexisting if s.syndication]
    if synds:
      logging.debug('previously found relationship(s) for original %s: %s',
                    permalink, synds)

  # first try with the h-entry from the h-feed. if we find the syndication url
  # we're looking for, we don't have to fetch the permalink
  permalink, _, type_ok = util.get_webmention_target(permalink)
  usynd = feed_entry.get('properties', {}).get('syndication', [])
  if usynd:
    logging.debug('u-syndication links on the h-feed h-entry: %s', usynd)
  results = _process_syndication_urls(source, permalink, set(
    url for url in usynd if isinstance(url, basestring)), preexisting)
  success = True

  if results:
    source.updates['last_feed_syndication_url'] = util.now_fn()
  elif not source.last_feed_syndication_url or not feed_entry:
    # fetch the full permalink page if we think it might have more details
    parsed = None
    try:
      logging.debug('fetching post permalink %s', permalink)
      if type_ok:
        resp = util.requests_get(permalink)
        resp.raise_for_status()
        parsed = util.mf2py_parse(resp.text, permalink)
    except AssertionError:
      raise  # for unit tests
    except BaseException:
      # TODO limit the number of allowed failures
      logging.info('Could not fetch permalink %s', permalink, exc_info=True)
      success = False

    if parsed:
      syndication_urls = set()
      relsynd = parsed.get('rels').get('syndication', [])
      if relsynd:
        logging.debug('rel-syndication links: %s', relsynd)
      syndication_urls.update(url for url in relsynd
                              if isinstance(url, basestring))
      # there should only be one h-entry on a permalink page, but
      # we'll check all of them just in case.
      for hentry in (item for item in parsed['items']
                     if 'h-entry' in item['type']):
        usynd = hentry.get('properties', {}).get('syndication', [])
        if usynd:
          logging.debug('u-syndication links: %s', usynd)
        syndication_urls.update(url for url in usynd
                                if isinstance(url, basestring))
      results = _process_syndication_urls(
        source, permalink, syndication_urls, preexisting)

  # detect and delete SyndicatedPosts that were removed from the site
  if success:
    result_syndposts = itertools.chain(*results.values())
    for syndpost in list(preexisting):
      if syndpost.syndication and syndpost not in result_syndposts:
        logging.info('deleting relationship that disappeared: %s', syndpost)
        syndpost.key.delete()
        preexisting.remove(syndpost)

  if not results:
    logging.debug('no syndication links from %s to current source %s.',
                  permalink, source.label())
    results = {}
    if store_blanks and not preexisting:
      # remember that this post doesn't have syndication links for this
      # particular source
      logging.debug('saving empty relationship so that %s will not be '
                    'searched again', permalink)
      SyndicatedPost.insert_original_blank(source, permalink)

  # only return results that are not in the preexisting list
  new_results = {}
  for syndurl, syndposts_for_url in results.iteritems():
    for syndpost in syndposts_for_url:
      if syndpost not in preexisting:
        new_results.setdefault(syndurl, []).append(syndpost)

  if new_results:
    logging.debug('discovered relationships %s', new_results)
  return new_results
Code example #40
def process_entry(source,
                  permalink,
                  feed_entry,
                  refetch,
                  preexisting,
                  store_blanks=True):
    """Fetch and process an h-entry and save a new :class:`models.SyndicatedPost`.

  Args:
    source:
    permalink: url of the unprocessed post
    feed_entry: the h-feed version of the h-entry dict, often contains
      a partial version of the h-entry at the permalink
    refetch: boolean, whether to refetch and process entries we've seen before
    preexisting: list of previously discovered :class:`models.SyndicatedPost`\ s
      for this permalink
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Returns:
    a dict from syndicated url to a list of new :class:`models.SyndicatedPost`\ s
  """
    # if the post has already been processed, do not add to the results
    # since this method only returns *newly* discovered relationships.
    if preexisting:
        # if we're refetching and this one is blank, do not return.
        # if there is a blank entry, it should be the one and only entry,
        # but go ahead and check 'all' of them to be safe.
        if not refetch:
            return {}
        synds = [s.syndication for s in preexisting if s.syndication]
        if synds:
            logger.debug(
                f'previously found relationship(s) for original {permalink}: {synds}'
            )

    # first try with the h-entry from the h-feed. if we find the syndication url
    # we're looking for, we don't have to fetch the permalink
    permalink, _, type_ok = util.get_webmention_target(permalink)
    usynd = feed_entry.get('properties', {}).get('syndication', [])
    usynd_urls = {url for url in usynd if isinstance(url, str)}
    if usynd_urls:
        logger.debug(
            f'u-syndication links on the h-feed h-entry: {usynd_urls}')
    results = _process_syndication_urls(source, permalink, usynd_urls,
                                        preexisting)
    success = True

    if results:
        source.updates['last_feed_syndication_url'] = util.now_fn()
    elif not source.last_feed_syndication_url or not feed_entry:
        # fetch the full permalink page if we think it might have more details
        mf2 = None
        try:
            if type_ok:
                logger.debug(f'fetching post permalink {permalink}')
                mf2 = util.fetch_mf2(permalink)
        except AssertionError:
            raise  # for unit tests
        except BaseException:
            # TODO limit the number of allowed failures
            logger.info(f'Could not fetch permalink {permalink}',
                        exc_info=True)
            success = False

        if mf2:
            syndication_urls = set()
            relsynd = mf2['rels'].get('syndication', [])
            if relsynd:
                logger.debug(f'rel-syndication links: {relsynd}')
            syndication_urls.update(url for url in relsynd
                                    if isinstance(url, str))
            # there should only be one h-entry on a permalink page, but
            # we'll check all of them just in case.
            for hentry in (item for item in mf2['items']
                           if 'h-entry' in item['type']):
                usynd = hentry.get('properties', {}).get('syndication', [])
                if usynd:
                    logger.debug(f'u-syndication links: {usynd}')
                syndication_urls.update(url for url in usynd
                                        if isinstance(url, str))
            results = _process_syndication_urls(source, permalink,
                                                syndication_urls, preexisting)

    # detect and delete SyndicatedPosts that were removed from the site
    if success:
        result_syndposts = list(itertools.chain(*results.values()))
        for syndpost in preexisting:
            if syndpost.syndication and syndpost not in result_syndposts:
                logger.info(
                    f'deleting relationship that disappeared: {syndpost}')
                syndpost.key.delete()
                preexisting.remove(syndpost)

    if not results:
        logger.debug(
            f'no syndication links from {permalink} to current source {source.label()}.'
        )
        results = {}
        if store_blanks and not preexisting:
            # remember that this post doesn't have syndication links for this
            # particular source
            logger.debug(
                f'saving empty relationship so that {permalink} will not be searched again'
            )
            SyndicatedPost.insert_original_blank(source, permalink)

    # only return results that are not in the preexisting list
    new_results = {}
    for syndurl, syndposts_for_url in results.items():
        for syndpost in syndposts_for_url:
            if syndpost not in preexisting:
                new_results.setdefault(syndurl, []).append(syndpost)

    if new_results:
        logger.debug(f'discovered relationships {new_results}')
    return new_results
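
process_entry() only reports relationships that were not already stored for the permalink. A standalone sketch of that final filtering step, with plain strings standing in for the SyndicatedPost entities so it runs without the datastore:

def only_new(results, preexisting):
    """Keep only the result values that aren't already in the preexisting list."""
    new_results = {}
    for synd_url, posts in results.items():
        for post in posts:
            if post not in preexisting:
                new_results.setdefault(synd_url, []).append(post)
    return new_results

# toy stand-ins for SyndicatedPost entities
preexisting = ['rel-a']
results = {'https://silo.example/status/123': ['rel-a', 'rel-b']}
assert only_new(results, preexisting) == {'https://silo.example/status/123': ['rel-b']}
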
Code Example #41
0
def _process_author(source, author_url, refetch=False, store_blanks=True):
    """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Returns:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
    # for now use whether the url is a valid webmention target
    # as a proxy for whether it's worth searching it.
    author_url, _, ok = util.get_webmention_target(author_url)
    if not ok:
        return {}

    logger.debug(f'fetching author url {author_url}')
    try:
        author_mf2 = util.fetch_mf2(author_url)
    except AssertionError:
        raise  # for unit tests
    except BaseException:
        # TODO limit allowed failures, cache the author's h-feed url
        # or the # of times we've failed to fetch it
        logger.info(f'Could not fetch author url {author_url}', exc_info=True)
        return {}

    feeditems = _find_feed_items(author_mf2)

    # try rel=feeds and rel=alternates
    feed_urls = set()
    candidates = (author_mf2['rels'].get('feed', []) + [
        a.get('url') for a in author_mf2.get('alternates', [])
        if a.get('type') == MF2_HTML_MIME_TYPE
    ])
    for feed_url in candidates:
        # check that it's html, not too big, etc
        feed_url, _, feed_ok = util.get_webmention_target(feed_url)
        if feed_url == author_url:
            logger.debug('author url is the feed url, ignoring')
        elif not feed_ok:
            logger.debug("skipping feed since it's not HTML or otherwise bad")
        else:
            feed_urls.add(feed_url)

    for feed_url in feed_urls:
        try:
            logger.debug(f"fetching author's rel-feed {feed_url}")
            feed_mf2 = util.fetch_mf2(feed_url)
            feeditems = _merge_hfeeds(feeditems, _find_feed_items(feed_mf2))
            domain = util.domain_from_link(feed_url)
            if source.updates is not None and domain not in source.domains:
                domains = source.updates.setdefault('domains', source.domains)
                if domain not in domains:
                    logger.info(
                        f'rel-feed found new domain {domain}! adding to source'
                    )
                    domains.append(domain)

        except AssertionError:
            raise  # reraise assertions for unit tests
        except BaseException:
            logger.info(f'Could not fetch h-feed url {feed_url}.',
                        exc_info=True)

    # sort by dt-updated/dt-published
    def updated_or_published(item):
        props = microformats2.first_props(item.get('properties'))
        return props.get('updated') or props.get('published') or ''

    feeditems.sort(key=updated_or_published, reverse=True)

    permalink_to_entry = collections.OrderedDict()
    for child in feeditems:
        if 'h-entry' in child['type']:
            permalinks = child['properties'].get('url', [])
            if not permalinks:
                logger.debug('ignoring h-entry with no u-url!')
            for permalink in permalinks:
                if isinstance(permalink, str):
                    permalink_to_entry[permalink] = child
                else:
                    logger.warning(
                        f'unexpected non-string "url" property: {permalink}')

        max = (MAX_PERMALINK_FETCHES_BETA
               if source.is_beta_user() else MAX_PERMALINK_FETCHES)
        if len(permalink_to_entry) >= max:
            logger.info(f'Hit cap of {max} permalinks. Stopping.')
            break

    # query all preexisting permalinks at once, instead of once per link
    permalinks_list = list(permalink_to_entry.keys())
    # fetch the maximum allowed entries (currently 30) at a time
    preexisting_list = itertools.chain.from_iterable(
        SyndicatedPost.query(SyndicatedPost.original.IN(
            permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
                             ancestor=source.key)
        for i in range(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
    preexisting = {}
    for r in preexisting_list:
        preexisting.setdefault(r.original, []).append(r)

    results = {}
    for permalink, entry in permalink_to_entry.items():
        logger.debug(f'processing permalink: {permalink}')
        new_results = process_entry(source,
                                    permalink,
                                    entry,
                                    refetch,
                                    preexisting.get(permalink, []),
                                    store_blanks=store_blanks)
        for key, value in new_results.items():
            results.setdefault(key, []).extend(value)

    if source.updates is not None and results:
        # keep track of the last time we've seen rel=syndication urls for
        # this author. this helps us decide whether to refetch periodically
        # and look for updates.
        # Source will be saved at the end of each round of polling
        source.updates['last_syndication_url'] = util.now_fn()

    return results
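
The preexisting-permalink lookup above fans a single list of permalinks out over several IN queries, MAX_ALLOWABLE_QUERIES at a time, then chains the results back together. The chunking pattern on its own, with a fake query function in place of SyndicatedPost.query():

import itertools

MAX_ALLOWABLE_QUERIES = 30  # mirrors the constant used above

def fake_query(permalinks):
    # stands in for SyndicatedPost.query(SyndicatedPost.original.IN(...), ancestor=...)
    return ['row for %s' % p for p in permalinks]

permalinks = ['https://example.com/post/%d' % n for n in range(70)]
rows = list(itertools.chain.from_iterable(
    fake_query(permalinks[i:i + MAX_ALLOWABLE_QUERIES])
    for i in range(0, len(permalinks), MAX_ALLOWABLE_QUERIES)))
assert len(rows) == 70  # issued as three queries: 30 + 30 + 10
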
Code Example #42
0
  def template_vars(self):
    vars = super(UserHandler, self).template_vars()
    vars.update({
        'source': self.source,
        'EPOCH': util.EPOCH,
        'REFETCH_HFEED_TRIGGER': models.REFETCH_HFEED_TRIGGER,
        'RECENT_PRIVATE_POSTS_THRESHOLD': RECENT_PRIVATE_POSTS_THRESHOLD,
        })
    if not self.source:
      return vars

    if isinstance(self.source, instagram.Instagram):
      auth = self.source.auth_entity
      vars['indieauth_me'] = (
        auth.id if isinstance(auth, indieauth.IndieAuth)
        else self.source.domain_urls[0] if self.source.domain_urls
        else None)

    # Blog webmention promos
    if 'webmention' not in self.source.features:
      if self.source.SHORT_NAME in ('blogger', 'medium', 'tumblr', 'wordpress'):
        vars[self.source.SHORT_NAME + '_promo'] = True
      else:
        for domain in self.source.domains:
          if ('.blogspot.' in domain and  # Blogger uses country TLDs
              not Blogger.query(Blogger.domains == domain).get()):
            vars['blogger_promo'] = True
          elif (domain.endswith('tumblr.com') and
                not Tumblr.query(Tumblr.domains == domain).get()):
            vars['tumblr_promo'] = True
          elif (domain.endswith('wordpress.com') and
                not WordPress.query(WordPress.domains == domain).get()):
            vars['wordpress_promo'] = True

    # Responses
    if 'listen' in self.source.features:
      vars['responses'] = []
      query = Response.query().filter(Response.source == self.source.key)

      # if there's a paging param (responses_before or responses_after), update
      # query with it
      def get_paging_param(param):
        val = self.request.get(param)
        try:
          return util.parse_iso8601(val) if val else None
        except:
          msg = "Couldn't parse %s %r as ISO8601" % (param, val)
          logging.exception(msg)
          self.abort(400, msg)

      before = get_paging_param('responses_before')
      after = get_paging_param('responses_after')
      if before and after:
        self.abort(400, "can't handle both responses_before and responses_after")
      elif after:
        query = query.filter(Response.updated > after).order(Response.updated)
      elif before:
        query = query.filter(Response.updated < before).order(-Response.updated)
      else:
        query = query.order(-Response.updated)

      query_iter = query.iter()
      for i, r in enumerate(query_iter):
        r.response = json.loads(r.response_json)
        r.activities = [json.loads(a) for a in r.activities_json]

        if (not self.source.is_activity_public(r.response) or
            not all(self.source.is_activity_public(a) for a in r.activities)):
          continue
        elif r.type == 'post':
          r.activities = []

        r.actor = r.response.get('author') or r.response.get('actor', {})

        for a in r.activities + [r.response]:
          if not a.get('content'):
            a['content'] = a.get('object', {}).get('content')

        if not r.response.get('content'):
          phrases = {
            'like': 'liked this',
            'repost': 'reposted this',
            'rsvp-yes': 'is attending',
            'rsvp-no': 'is not attending',
            'rsvp-maybe': 'might attend',
            'rsvp-interested': 'is interested',
            'invite': 'is invited',
          }
          r.response['content'] = '%s %s.' % (
            r.actor.get('displayName') or '',
            phrases.get(r.type) or phrases.get(r.response.get('verb')))

        # convert image URL to https if we're serving over SSL
        image_url = r.actor.setdefault('image', {}).get('url')
        if image_url:
          r.actor['image']['url'] = util.update_scheme(image_url, self)

        # generate original post links
        r.links = self.process_webmention_links(r)
        r.original_links = [util.pretty_link(url, new_tab=True)
                            for url in r.original_posts]

        vars['responses'].append(r)
        if len(vars['responses']) >= 10 or i > 200:
          break

      vars['responses'].sort(key=lambda r: r.updated, reverse=True)

      # calculate new paging param(s)
      new_after = (
        before if before else
        vars['responses'][0].updated if
          vars['responses'] and query_iter.probably_has_next() and (before or after)
        else None)
      if new_after:
        vars['responses_after_link'] = ('?responses_after=%s#responses' %
                                         new_after.isoformat())

      new_before = (
        after if after else
        vars['responses'][-1].updated if
          vars['responses'] and query_iter.probably_has_next()
        else None)
      if new_before:
        vars['responses_before_link'] = ('?responses_before=%s#responses' %
                                         new_before.isoformat())

      vars['next_poll'] = max(
        self.source.last_poll_attempt + self.source.poll_period(),
        # lower bound is 90 seconds from now
        util.now_fn() + datetime.timedelta(seconds=90))

    # Publishes
    if 'publish' in self.source.features:
      publishes = Publish.query().filter(Publish.source == self.source.key)\
                                 .order(-Publish.updated)\
                                 .fetch(10)
      for p in publishes:
        p.pretty_page = util.pretty_link(
          p.key.parent().id().decode('utf-8'),
          attrs={'class': 'original-post u-url u-name'},
          new_tab=True)

      vars['publishes'] = publishes

    if 'webmention' in self.source.features:
      # Blog posts
      blogposts = BlogPost.query().filter(BlogPost.source == self.source.key)\
                                  .order(-BlogPost.created)\
                                  .fetch(10)
      for b in blogposts:
        b.links = self.process_webmention_links(b)
        try:
          text = b.feed_item.get('title')
        except ValueError:
          text = None
        b.pretty_url = util.pretty_link(
          b.key.id(), text=text, attrs={'class': 'original-post u-url u-name'},
          max_length=40, new_tab=True)

      # Blog webmentions
      webmentions = BlogWebmention.query()\
          .filter(BlogWebmention.source == self.source.key)\
          .order(-BlogWebmention.updated)\
          .fetch(10)
      for w in webmentions:
        w.pretty_source = util.pretty_link(
          w.source_url(), attrs={'class': 'original-post'}, new_tab=True)
        try:
          target_is_source = (urlparse.urlparse(w.target_url()).netloc in
                              self.source.domains)
        except BaseException:
          target_is_source = False
        w.pretty_target = util.pretty_link(
          w.target_url(), attrs={'class': 'original-post'}, new_tab=True,
          keep_host=target_is_source)

      vars.update({'blogposts': blogposts, 'webmentions': webmentions})

    return vars
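
The responses_before / responses_after paging above reduces to parsing an ISO 8601 timestamp, flipping the query order, and emitting the opposite link for the next page. A framework-free sketch of just the link construction, with datetimes standing in for the Response.updated values:

import datetime

def paging_links(updated_times, before=None, after=None, has_next=True):
    """Build responses_after/responses_before links the same way as above."""
    links = {}
    new_after = (before if before else
                 updated_times[0] if updated_times and has_next and (before or after)
                 else None)
    if new_after:
        links['responses_after_link'] = '?responses_after=%s#responses' % new_after.isoformat()
    new_before = (after if after else
                  updated_times[-1] if updated_times and has_next
                  else None)
    if new_before:
        links['responses_before_link'] = '?responses_before=%s#responses' % new_before.isoformat()
    return links

updated = [datetime.datetime(2024, 1, d) for d in (5, 4, 3)]  # newest first
print(paging_links(updated))  # first page: only a responses_before link
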
Code Example #43
0
File: tasks.py Project: mblaney/bridgy
  def poll(self, source):
    """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
    if source.last_activities_etag or source.last_activity_id:
      logging.debug('Using ETag %s, last activity id %s',
                    source.last_activities_etag, source.last_activity_id)

    #
    # Step 1: fetch activities:
    # * posts by the user
    # * search all posts for the user's domain URLs to find links
    #
    cache = util.CacheDict()
    if source.last_activities_cache_json:
      cache.update(json.loads(source.last_activities_cache_json))

    # search for links first so that the user's activities and responses
    # override them if they overlap
    links = source.search_for_links()

    # this user's own activities (and user mentions)
    resp = source.get_activities_response(
      fetch_replies=True, fetch_likes=True, fetch_shares=True,
      fetch_mentions=True, count=50, etag=source.last_activities_etag,
      min_id=source.last_activity_id, cache=cache)
    etag = resp.get('etag')  # used later
    user_activities = resp.get('items', [])

    # these map ids to AS objects
    responses = {a['id']: a for a in links}
    activities = {a['id']: a for a in links + user_activities}

    # extract silo activity ids, update last_activity_id
    silo_activity_ids = set()
    last_activity_id = source.last_activity_id
    for id, activity in activities.items():
      # maybe replace stored last activity id
      parsed = util.parse_tag_uri(id)
      if parsed:
        id = parsed[1]
      silo_activity_ids.add(id)
      try:
        # try numeric comparison first
        greater = int(id) > int(last_activity_id)
      except (TypeError, ValueError):
        greater = id > last_activity_id
      if greater:
        last_activity_id = id

    if last_activity_id and last_activity_id != source.last_activity_id:
      source.updates['last_activity_id'] = last_activity_id

    # trim cache to just the returned activity ids, so that it doesn't grow
    # without bound. (WARNING: depends on get_activities_response()'s cache key
    # format, e.g. 'PREFIX ACTIVITY_ID'!)
    source.updates['last_activities_cache_json'] = json.dumps(
      {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids})

    self.backfeed(source, responses, activities=activities)

    source.updates.update({'last_polled': source.last_poll_attempt,
                           'poll_status': 'ok'})
    if etag and etag != source.last_activities_etag:
      source.updates['last_activities_etag'] = etag

    #
    # Possibly refetch updated syndication urls.
    #
    # if the author has added syndication urls since the first time
    # original_post_discovery ran, we'll miss them. this cleanup task will
    # periodically check for updated urls. only kicks in if the author has
    # *ever* published a rel=syndication url
    if source.should_refetch():
      logging.info('refetching h-feed for source %s', source.label())
      relationships = original_post_discovery.refetch(source)

      now = util.now_fn()
      source.updates['last_hfeed_refetch'] = now

      if relationships:
        logging.info('refetch h-feed found new rel=syndication relationships: %s',
                     relationships)
        try:
          self.repropagate_old_responses(source, relationships)
        except BaseException as e:
          if (isinstance(e, (datastore_errors.BadRequestError,
                             datastore_errors.Timeout)) or
              util.is_connection_failure(e)):
            logging.info('Timeout while repropagating responses.', exc_info=True)
          else:
            raise
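
The last_activity_id bookkeeping above compares ids numerically when both parse as integers and falls back to string comparison otherwise. A small sketch of that rule (using the str() fallback from the newer version of this code so it also behaves on Python 3):

def newer_activity_id(current, candidate):
    """Return whichever id is larger, preferring numeric comparison."""
    try:
        greater = int(candidate) > int(current)
    except (TypeError, ValueError):
        greater = str(candidate) > str(current)
    return candidate if greater else current

assert newer_activity_id('9', '10') == '10'      # numeric: 10 > 9
assert newer_activity_id('abc', 'abd') == 'abd'  # non-numeric: falls back to string order
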
Code Example #44
0
File: tasks.py Project: snarfed/bridgy
    def poll(self, source):
        """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
        if source.last_activities_etag or source.last_activity_id:
            logging.debug("Using ETag %s, last activity id %s", source.last_activities_etag, source.last_activity_id)

        #
        # Step 1: fetch activities:
        # * posts by the user
        # * search all posts for the user's domain URLs to find links
        #
        cache = util.CacheDict()
        if source.last_activities_cache_json:
            cache.update(json.loads(source.last_activities_cache_json))

        # search for links first so that the user's activities and responses
        # override them if they overlap
        links = source.search_for_links()

        # this user's own activities (and user mentions)
        resp = source.get_activities_response(
            fetch_replies=True,
            fetch_likes=True,
            fetch_shares=True,
            fetch_mentions=True,
            count=50,
            etag=source.last_activities_etag,
            min_id=source.last_activity_id,
            cache=cache,
        )
        etag = resp.get("etag")  # used later
        user_activities = resp.get("items", [])

        # these map ids to AS objects
        responses = {a["id"]: a for a in links}
        activities = {a["id"]: a for a in links + user_activities}

        # extract silo activity ids, update last_activity_id
        silo_activity_ids = set()
        last_activity_id = source.last_activity_id
        for id, activity in activities.items():
            # maybe replace stored last activity id
            parsed = util.parse_tag_uri(id)
            if parsed:
                id = parsed[1]
            silo_activity_ids.add(id)
            try:
                # try numeric comparison first
                greater = int(id) > int(last_activity_id)
            except (TypeError, ValueError):
                greater = id > last_activity_id
            if greater:
                last_activity_id = id

        if last_activity_id and last_activity_id != source.last_activity_id:
            source.updates["last_activity_id"] = last_activity_id

        # trim cache to just the returned activity ids, so that it doesn't grow
        # without bound. (WARNING: depends on get_activities_response()'s cache key
        # format, e.g. 'PREFIX ACTIVITY_ID'!)
        source.updates["last_activities_cache_json"] = json.dumps(
            {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids}
        )

        # Cache to make sure we only fetch the author's h-feed(s) the
        # first time we see it
        fetched_hfeeds = set()

        # narrow down to just public activities
        public = {}
        private = {}
        for id, activity in activities.items():
            (public if source.is_activity_public(activity) else private)[id] = activity
        logging.info("Found %d public activities: %s", len(public), public.keys())
        logging.info("Found %d private activities: %s", len(private), private.keys())

        last_public_post = (source.last_public_post or util.EPOCH).isoformat()
        public_published = util.trim_nulls([a.get("published") for a in public.values()])
        if public_published:
            max_published = max(public_published)
            if max_published > last_public_post:
                last_public_post = max_published
                source.updates["last_public_post"] = util.as_utc(util.parse_iso8601(max_published))

        source.updates["recent_private_posts"] = len(
            [a for a in private.values() if a.get("published", util.EPOCH_ISO) > last_public_post]
        )

        #
        # Step 2: extract responses, store their activities in response['activities']
        #
        # WARNING: this creates circular references in link posts found by search
        # queries in step 1, since they are their own activity. We use
        # prune_activity() and prune_response() in step 4 to remove these before
        # serializing to JSON.
        #
        for id, activity in public.items():
            obj = activity.get("object") or activity

            # handle user mentions
            user_id = source.user_tag_id()
            if obj.get("author", {}).get("id") != user_id:
                for tag in obj.get("tags", []):
                    urls = tag.get("urls")
                    if tag.get("objectType") == "person" and tag.get("id") == user_id and urls:
                        activity["originals"], activity["mentions"] = original_post_discovery.discover(
                            source,
                            activity,
                            fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds,
                        )
                        activity["mentions"].update(u.get("value") for u in urls)
                        responses[id] = activity
                        break

            # handle quote mentions
            for att in obj.get("attachments", []):
                if (
                    att.get("objectType") in ("note", "article")
                    and att.get("author", {}).get("id") == source.user_tag_id()
                ):
                    # now that we've confirmed that one exists, OPD will dig
                    # into the actual attachments
                    if "originals" not in activity or "mentions" not in activity:
                        activity["originals"], activity["mentions"] = original_post_discovery.discover(
                            source,
                            activity,
                            fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds,
                        )
                    responses[id] = activity
                    break

            # extract replies, likes, reactions, reposts, and rsvps
            replies = obj.get("replies", {}).get("items", [])
            tags = obj.get("tags", [])
            likes = [t for t in tags if Response.get_type(t) == "like"]
            reactions = [t for t in tags if Response.get_type(t) == "react"]
            reposts = [t for t in tags if Response.get_type(t) == "repost"]
            rsvps = Source.get_rsvps_from_event(obj)

            # coalesce responses. drop any without ids
            for resp in replies + likes + reactions + reposts + rsvps:
                id = resp.get("id")
                if not id:
                    logging.error("Skipping response without id: %s", json.dumps(resp, indent=2))
                    continue

                resp.setdefault("activities", []).append(activity)

                # when we find two responses with the same id, the earlier one may have
                # come from a link post or user mention, and this one is probably better
                # since it probably came from the user's activity, so prefer this one.
                # background: https://github.com/snarfed/bridgy/issues/533
                existing = responses.get(id)
                if existing:
                    if source.gr_source.activity_changed(resp, existing, log=True):
                        logging.warning("Got two different versions of same response!\n%s\n%s", existing, resp)
                    resp["activities"].extend(existing.get("activities", []))

                responses[id] = resp

        #
        # Step 3: filter out responses we've already seen
        #
        # seen responses (JSON objects) for each source are stored in its entity.
        unchanged_responses = []
        if source.seen_responses_cache_json:
            for seen in json.loads(source.seen_responses_cache_json):
                id = seen["id"]
                resp = responses.get(id)
                if resp and not source.gr_source.activity_changed(seen, resp, log=True):
                    unchanged_responses.append(seen)
                    del responses[id]

        #
        # Step 4: store new responses and enqueue propagate tasks
        #
        pruned_responses = []
        for id, resp in responses.items():
            resp_type = Response.get_type(resp)
            activities = resp.pop("activities", [])
            if not activities and resp_type == "post":
                activities = [resp]
            too_long = set()
            urls_to_activity = {}
            for i, activity in enumerate(activities):
                # we'll usually have multiple responses for the same activity, and the
                # objects in resp['activities'] are shared, so cache each activity's
                # discovered webmention targets inside its object.
                if "originals" not in activity or "mentions" not in activity:
                    activity["originals"], activity["mentions"] = original_post_discovery.discover(
                        source,
                        activity,
                        fetch_hfeed=True,
                        include_redirect_sources=False,
                        already_fetched_hfeeds=fetched_hfeeds,
                    )

                targets = original_post_discovery.targets_for_response(
                    resp, originals=activity["originals"], mentions=activity["mentions"]
                )
                if targets:
                    logging.info(
                        "%s has %d webmention target(s): %s", activity.get("url"), len(targets), " ".join(targets)
                    )
                for t in targets:
                    if len(t) <= _MAX_STRING_LENGTH:
                        urls_to_activity[t] = i
                    else:
                        logging.warning("Giving up on target URL over %s chars! %s", _MAX_STRING_LENGTH, t)
                        too_long.add(t[: _MAX_STRING_LENGTH - 4] + "...")

            # store/update response entity. the prune_*() calls are important to
            # remove circular references in link responses, which are their own
            # activities. details in the step 2 comment above.
            pruned_response = util.prune_response(resp)
            pruned_responses.append(pruned_response)
            resp_entity = Response(
                id=id,
                source=source.key,
                activities_json=[json.dumps(util.prune_activity(a, source)) for a in activities],
                response_json=json.dumps(pruned_response),
                type=resp_type,
                unsent=list(urls_to_activity.keys()),
                failed=list(too_long),
                original_posts=resp.get("originals", []),
            )
            if urls_to_activity and len(activities) > 1:
                resp_entity.urls_to_activity = json.dumps(urls_to_activity)
            resp_entity.get_or_save(source)

        # update cache
        if pruned_responses:
            source.updates["seen_responses_cache_json"] = json.dumps(pruned_responses + unchanged_responses)

        source.updates.update({"last_polled": source.last_poll_attempt, "poll_status": "ok"})
        if etag and etag != source.last_activities_etag:
            source.updates["last_activities_etag"] = etag

        #
        # Step 5. possibly refetch updated syndication urls
        #
        # if the author has added syndication urls since the first time
        # original_post_discovery ran, we'll miss them. this cleanup task will
        # periodically check for updated urls. only kicks in if the author has
        # *ever* published a rel=syndication url
        if source.should_refetch():
            logging.info("refetching h-feed for source %s", source.label())
            relationships = original_post_discovery.refetch(source)

            now = util.now_fn()
            source.updates["last_hfeed_refetch"] = now

            if relationships:
                logging.info("refetch h-feed found new rel=syndication relationships: %s", relationships)
                try:
                    self.repropagate_old_responses(source, relationships)
                except BaseException as e:
                    if isinstance(
                        e, (datastore_errors.BadRequestError, datastore_errors.Timeout)
                    ) or util.is_connection_failure(e):
                        logging.info("Timeout while repropagating responses.", exc_info=True)
                    else:
                        raise
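
The cache trim in step 1 relies on the 'PREFIX ACTIVITY_ID' key format that get_activities_response() uses, keeping only entries whose trailing id was returned by this poll. The same dict comprehension on made-up keys:

def trim_cache(cache, silo_activity_ids):
    """Drop entries whose key doesn't end in a still-current activity id."""
    return {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids}

# hypothetical cache keys in the 'PREFIX ACTIVITY_ID' shape
cache = {'replies 123': 4, 'replies 456': 7, 'likes 789': 2}
print(trim_cache(cache, {'123', '789'}))  # {'replies 123': 4, 'likes 789': 2}
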
Code Example #45
0
File: app.py Project: LennonFlores/bridgy
  def template_vars(self):
    vars = super(UserHandler, self).template_vars()
    vars.update({
        'source': self.source,
        'EPOCH': util.EPOCH,
        'REFETCH_HFEED_TRIGGER': models.REFETCH_HFEED_TRIGGER,
        'RECENT_PRIVATE_POSTS_THRESHOLD': RECENT_PRIVATE_POSTS_THRESHOLD,
        })
    if not self.source:
      return vars

    if isinstance(self.source, instagram.Instagram):
      auth = self.source.auth_entity
      vars['indieauth_me'] = (
        auth.id if isinstance(auth, indieauth.IndieAuth)
        else self.source.domain_urls[0] if self.source.domain_urls
        else None)

    # Blog webmention promos
    if 'webmention' not in self.source.features:
      if self.source.SHORT_NAME in ('blogger', 'tumblr', 'wordpress'):
        vars[self.source.SHORT_NAME + '_promo'] = True
      else:
        for domain in self.source.domains:
          if ('.blogspot.' in domain and  # Blogger uses country TLDs
              not Blogger.query(Blogger.domains == domain).get()):
            vars['blogger_promo'] = True
          elif (domain.endswith('tumblr.com') and
                not Tumblr.query(Tumblr.domains == domain).get()):
            vars['tumblr_promo'] = True
          elif (domain.endswith('wordpress.com') and
                not WordPress.query(WordPress.domains == domain).get()):
            vars['wordpress_promo'] = True

    # Responses
    if 'listen' in self.source.features:
      vars['responses'] = []
      query = Response.query().filter(Response.source == self.source.key)

      # if there's a paging param (responses_before or responses_after), update
      # query with it
      def get_paging_param(param):
        val = self.request.get(param)
        try:
          return util.parse_iso8601(val) if val else None
        except:
          msg = "Couldn't parse %s %r as ISO8601" % (param, val)
          logging.exception(msg)
          self.abort(400, msg)

      before = get_paging_param('responses_before')
      after = get_paging_param('responses_after')
      if before and after:
        self.abort(400, "can't handle both responses_before and responses_after")
      elif after:
        query = query.filter(Response.updated > after).order(Response.updated)
      elif before:
        query = query.filter(Response.updated < before).order(-Response.updated)
      else:
        query = query.order(-Response.updated)

      query_iter = query.iter()
      for i, r in enumerate(query_iter):
        r.response = json.loads(r.response_json)
        r.activities = [json.loads(a) for a in r.activities_json]

        if (not self.source.is_activity_public(r.response) or
            not all(self.source.is_activity_public(a) for a in r.activities)):
          continue
        elif r.type == 'post':
          r.activities = []

        r.actor = r.response.get('author') or r.response.get('actor', {})

        for a in r.activities + [r.response]:
          if not a.get('content'):
            a['content'] = a.get('object', {}).get('content')

        if not r.response.get('content'):
          phrases = {
            'like': 'liked this',
            'repost': 'reposted this',
            'rsvp-yes': 'is attending',
            'rsvp-no': 'is not attending',
            'rsvp-maybe': 'might attend',
            'rsvp-interested': 'is interested',
            'invite': 'is invited',
          }
          r.response['content'] = '%s %s.' % (
            r.actor.get('displayName') or '',
            phrases.get(r.type) or phrases.get(r.response.get('verb')))

        # convert image URL to https if we're serving over SSL
        image_url = r.actor.setdefault('image', {}).get('url')
        if image_url:
          r.actor['image']['url'] = util.update_scheme(image_url, self)

        # generate original post links
        r.links = self.process_webmention_links(r)
        r.original_links = [util.pretty_link(url, new_tab=True)
                            for url in r.original_posts]

        vars['responses'].append(r)
        if len(vars['responses']) >= 10 or i > 200:
          break

      vars['responses'].sort(key=lambda r: r.updated, reverse=True)

      # calculate new paging param(s)
      new_after = (
        before if before else
        vars['responses'][0].updated if
          vars['responses'] and query_iter.probably_has_next() and (before or after)
        else None)
      if new_after:
        vars['responses_after_link'] = ('?responses_after=%s#responses' %
                                         new_after.isoformat())

      new_before = (
        after if after else
        vars['responses'][-1].updated if
          vars['responses'] and query_iter.probably_has_next()
        else None)
      if new_before:
        vars['responses_before_link'] = ('?responses_before=%s#responses' %
                                         new_before.isoformat())

      vars['next_poll'] = max(
        self.source.last_poll_attempt + self.source.poll_period(),
        # lower bound is 90 seconds from now
        util.now_fn() + datetime.timedelta(seconds=90))

    # Publishes
    if 'publish' in self.source.features:
      publishes = Publish.query().filter(Publish.source == self.source.key)\
                                 .order(-Publish.updated)\
                                 .fetch(10)
      for p in publishes:
        p.pretty_page = util.pretty_link(
          p.key.parent().id(), attrs={'class': 'original-post u-url u-name'},
          new_tab=True)

      vars['publishes'] = publishes

    if 'webmention' in self.source.features:
      # Blog posts
      blogposts = BlogPost.query().filter(BlogPost.source == self.source.key)\
                                  .order(-BlogPost.created)\
                                  .fetch(10)
      for b in blogposts:
        b.links = self.process_webmention_links(b)
        try:
          text = b.feed_item.get('title')
        except ValueError:
          text = None
        b.pretty_url = util.pretty_link(
          b.key.id(), text=text, attrs={'class': 'original-post u-url u-name'},
          max_length=40, new_tab=True)

      # Blog webmentions
      webmentions = BlogWebmention.query()\
          .filter(BlogWebmention.source == self.source.key)\
          .order(-BlogWebmention.updated)\
          .fetch(10)
      for w in webmentions:
        w.pretty_source = util.pretty_link(
          w.source_url(), attrs={'class': 'original-post'}, new_tab=True)
        try:
          target_is_source = (urlparse.urlparse(w.target_url()).netloc in
                              self.source.domains)
        except BaseException:
          target_is_source = False
        w.pretty_target = util.pretty_link(
          w.target_url(), attrs={'class': 'original-post'}, new_tab=True,
          keep_host=target_is_source)

      vars.update({'blogposts': blogposts, 'webmentions': webmentions})

    return vars
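
When a response has no content of its own, the handler above synthesizes one from the actor's display name plus a verb phrase keyed on the response type (falling back to the ActivityStreams verb). The lookup on its own:

PHRASES = {
    'like': 'liked this',
    'repost': 'reposted this',
    'rsvp-yes': 'is attending',
    'rsvp-no': 'is not attending',
    'rsvp-maybe': 'might attend',
    'rsvp-interested': 'is interested',
    'invite': 'is invited',
}

def placeholder_content(actor_name, resp_type, verb=None):
    """Build the fallback content string the same way as above."""
    return '%s %s.' % (actor_name or '', PHRASES.get(resp_type) or PHRASES.get(verb))

print(placeholder_content('Alice', 'like'))              # Alice liked this.
print(placeholder_content('Bob', 'rsvp', 'rsvp-maybe'))  # Bob might attend.
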
Code Example #46
0
    def poll(self, source):
        """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
        if source.last_activities_etag or source.last_activity_id:
            logging.debug('Using ETag %s, last activity id %s',
                          source.last_activities_etag, source.last_activity_id)

        #
        # Step 1: fetch activities:
        # * posts by the user
        # * search all posts for the user's domain URLs to find links
        #
        cache = util.CacheDict()
        if source.last_activities_cache_json:
            cache.update(json_loads(source.last_activities_cache_json))

        # search for links first so that the user's activities and responses
        # override them if they overlap
        links = source.search_for_links()

        # this user's own activities (and user mentions)
        resp = source.get_activities_response(fetch_replies=True,
                                              fetch_likes=True,
                                              fetch_shares=True,
                                              fetch_mentions=True,
                                              count=50,
                                              etag=source.last_activities_etag,
                                              min_id=source.last_activity_id,
                                              cache=cache)
        etag = resp.get('etag')  # used later
        user_activities = resp.get('items', [])

        # these map ids to AS objects
        responses = {a['id']: a for a in links}
        activities = {a['id']: a for a in links + user_activities}

        # extract silo activity ids, update last_activity_id
        silo_activity_ids = set()
        last_activity_id = source.last_activity_id
        for id, activity in activities.items():
            # maybe replace stored last activity id
            parsed = util.parse_tag_uri(id)
            if parsed:
                id = parsed[1]
            silo_activity_ids.add(id)
            try:
                # try numeric comparison first
                greater = int(id) > int(last_activity_id)
            except (TypeError, ValueError):
                greater = str(id) > str(last_activity_id)
            if greater:
                last_activity_id = id

        if last_activity_id and last_activity_id != source.last_activity_id:
            source.updates['last_activity_id'] = last_activity_id

        # trim cache to just the returned activity ids, so that it doesn't grow
        # without bound. (WARNING: depends on get_activities_response()'s cache key
        # format, e.g. 'PREFIX ACTIVITY_ID'!)
        source.updates['last_activities_cache_json'] = json_dumps({
            k: v
            for k, v in cache.items() if k.split()[-1] in silo_activity_ids
        })

        self.backfeed(source, responses, activities=activities)

        source.updates.update({
            'last_polled': source.last_poll_attempt,
            'poll_status': 'ok'
        })
        if etag and etag != source.last_activities_etag:
            source.updates['last_activities_etag'] = etag

        #
        # Possibly refetch updated syndication urls.
        #
        # if the author has added syndication urls since the first time
        # original_post_discovery ran, we'll miss them. this cleanup task will
        # periodically check for updated urls. only kicks in if the author has
        # *ever* published a rel=syndication url
        if source.should_refetch():
            logging.info('refetching h-feed for source %s', source.label())
            relationships = original_post_discovery.refetch(source)

            now = util.now_fn()
            source.updates['last_hfeed_refetch'] = now

            if relationships:
                logging.info(
                    'refetch h-feed found new rel=syndication relationships: %s',
                    relationships)
                try:
                    self.repropagate_old_responses(source, relationships)
                except BaseException as e:
                    if ('BadRequestError' in str(e.__class__)
                            or 'Timeout' in str(e.__class__)
                            or util.is_connection_failure(e)):
                        logging.info('Timeout while repropagating responses.',
                                     stack_info=True)
                    else:
                        raise
        else:
            logging.info(
                'skipping refetch h-feed. last-syndication-url %s, last-refetch %s',
                source.last_syndication_url, source.last_hfeed_refetch)
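
This version avoids importing the datastore exception classes and instead matches their class names as strings before swallowing transient failures. A sketch of that check, with a stand-in exception class since the real ones need the datastore client:

def is_transient(e):
    """Mirror the class-name test above; the real code also checks util.is_connection_failure()."""
    return 'BadRequestError' in str(e.__class__) or 'Timeout' in str(e.__class__)

class Timeout(Exception):
    """Stand-in for the datastore Timeout class, purely for this sketch."""

try:
    raise Timeout('deadline exceeded')
except BaseException as e:
    print(is_transient(e))  # True: log and continue instead of re-raising
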
Code Example #47
0
def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of models.Source
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank SyndicatedPosts when
      we don't find a relationship

  Returns:
    a dict of syndicated_url to a list of new models.SyndicatedPost
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  # TODO skip sites we know don't have microformats2 markup
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  try:
    logging.debug('fetching author url %s', author_url)
    author_resp = util.requests_get(author_url)
    # TODO for error codes that indicate a temporary error, should we make
    # a certain number of retries before giving up forever?
    author_resp.raise_for_status()
    author_dom = BeautifulSoup(author_resp.text)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logging.warning('Could not fetch author url %s', author_url, exc_info=True)
    return {}

  feeditems = _find_feed_items(author_url, author_dom)

  # look for all other feed urls using rel='feed', type='text/html'
  feed_urls = set()
  for rel_feed_node in (author_dom.find_all('link', rel='feed')
                        + author_dom.find_all('a', rel='feed')):
    feed_url = rel_feed_node.get('href')
    if not feed_url:
      continue

    feed_url = urlparse.urljoin(author_url, feed_url)
    feed_type = rel_feed_node.get('type')
    if not feed_type:
      # type is not specified, use this to confirm that it's text/html
      feed_url, _, feed_type_ok = util.get_webmention_target(feed_url)
    else:
      feed_type_ok = feed_type == 'text/html'

    if feed_url == author_url:
      logging.debug('author url is the feed url, ignoring')
    elif not feed_type_ok:
      logging.debug('skipping feed of type %s', feed_type)
    else:
      feed_urls.add(feed_url)

  for feed_url in feed_urls:
    try:
      logging.debug("fetching author's rel-feed %s", feed_url)
      feed_resp = util.requests_get(feed_url)
      feed_resp.raise_for_status()
      logging.debug("author's rel-feed fetched successfully %s", feed_url)
      feeditems = _merge_hfeeds(feeditems,
                                _find_feed_items(feed_url, feed_resp.text))

      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logging.info('rel-feed found new domain %s! adding to source', domain)
          domains.append(domain)

    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logging.warning('Could not fetch h-feed url %s.', feed_url,
                      exc_info=True)

  permalink_to_entry = {}
  for child in feeditems:
    if 'h-entry' in child['type']:
      # TODO maybe limit to first ~30 entries? (do that here rather than,
      # below because we want the *first* n entries)
      for permalink in child['properties'].get('url', []):
        if isinstance(permalink, basestring):
          permalink_to_entry[permalink] = child
        else:
          logging.warn('unexpected non-string "url" property: %s', permalink)

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(
      SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
      ancestor=source.key)
    for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  results = {}
  for permalink, entry in permalink_to_entry.iteritems():
    logging.debug('processing permalink: %s', permalink)
    new_results = _process_entry(
      source, permalink, entry, refetch, preexisting.get(permalink, []),
      store_blanks=store_blanks)
    for key, value in new_results.iteritems():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    now = util.now_fn()
    logging.debug('updating source last_syndication_url %s', now)
    source.updates['last_syndication_url'] = now

  return results
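
The rel=feed discovery in this older version works directly on the fetched HTML. A standalone Python 3 sketch of just that collection step, assuming BeautifulSoup 4 and skipping the get_webmention_target() call (an unspecified type is simply accepted here); the URLs and HTML are made up:

from urllib.parse import urljoin
from bs4 import BeautifulSoup

HTML = '''
<link rel="feed" type="text/html" href="/articles">
<a rel="feed" type="application/atom+xml" href="https://feeds.example.com/all.atom">atom</a>
'''

def find_html_feeds(author_url, html):
    """Collect rel=feed URLs that are (or default to) text/html."""
    soup = BeautifulSoup(html, 'html.parser')
    feeds = set()
    for node in soup.find_all('link', rel='feed') + soup.find_all('a', rel='feed'):
        href = node.get('href')
        if not href:
            continue
        url = urljoin(author_url, href)
        feed_type = node.get('type')
        if url != author_url and (not feed_type or feed_type == 'text/html'):
            feeds.add(url)
    return feeds

print(find_html_feeds('https://example.com/', HTML))  # {'https://example.com/articles'}
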
Code Example #48
0
File: tasks.py Project: LennonFlores/bridgy
class Poll(webapp2.RequestHandler):
    """Task handler that fetches and processes new responses from a single source.

  Request parameters:
    source_key: string key of source entity
    last_polled: timestamp, YYYY-MM-DD-HH-MM-SS

  Inserts a propagate task for each response that hasn't been seen before.
  """
    def post(self, *path_args):
        logging.debug('Params: %s', self.request.params)

        key = self.request.params['source_key']
        source = ndb.Key(urlsafe=key).get()
        if not source or source.status == 'disabled' or 'listen' not in source.features:
            logging.error('Source not found or disabled. Dropping task.')
            return
        logging.info('Source: %s %s, %s', source.label(),
                     source.key.string_id(), source.bridgy_url(self))

        last_polled = self.request.params['last_polled']
        if last_polled != source.last_polled.strftime(
                util.POLL_TASK_DATETIME_FORMAT):
            logging.warning(
                'duplicate poll task! deferring to the other task.')
            return

        logging.info('Last poll: %s/log?start_time=%s&key=%s',
                     self.request.host_url,
                     calendar.timegm(source.last_poll_attempt.utctimetuple()),
                     source.key.urlsafe())

        # mark this source as polling
        source.updates = {
            'poll_status': 'polling',
            'last_poll_attempt': util.now_fn(),
        }
        source = models.Source.put_updates(source)

        source.updates = {}
        try:
            self.poll(source)
        except models.DisableSource:
            # the user deauthorized the bridgy app, so disable this source.
            # let the task complete successfully so that it's not retried.
            source.updates['status'] = 'disabled'
            logging.warning('Disabling source!')
        except:
            source.updates['poll_status'] = 'error'
            raise
        finally:
            source = models.Source.put_updates(source)

        # add new poll task. randomize task ETA to within +/- 20% to try to spread
        # out tasks and prevent thundering herds.
        task_countdown = source.poll_period().total_seconds() * random.uniform(
            .8, 1.2)
        util.add_poll_task(source, countdown=task_countdown)

        # feeble attempt to avoid hitting the instance memory limit
        source = None
        gc.collect()

    def poll(self, source):
        """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
        if source.last_activities_etag or source.last_activity_id:
            logging.debug('Using ETag %s, last activity id %s',
                          source.last_activities_etag, source.last_activity_id)

        #
        # Step 1: fetch activities:
        # * posts by the user
        # * search all posts for the user's domain URLs to find links
        #
        cache = util.CacheDict()
        if source.last_activities_cache_json:
            cache.update(json.loads(source.last_activities_cache_json))

        try:
            # search for links first so that the user's activities and responses
            # override them if they overlap
            links = source.search_for_links()

            # this user's own activities (and user mentions)
            resp = source.get_activities_response(
                fetch_replies=True,
                fetch_likes=True,
                fetch_shares=True,
                fetch_mentions=True,
                count=50,
                etag=source.last_activities_etag,
                min_id=source.last_activity_id,
                cache=cache)
            etag = resp.get('etag')  # used later
            user_activities = resp.get('items', [])

            # these map ids to AS objects
            responses = {a['id']: a for a in links}
            activities = {a['id']: a for a in links + user_activities}

        except Exception as e:
            code, body = util.interpret_http_exception(e)
            if code == '401':
                msg = 'Unauthorized error: %s' % e
                logging.warning(msg, exc_info=True)
                source.updates['poll_status'] = 'ok'
                raise models.DisableSource(msg)
            elif code in util.HTTP_RATE_LIMIT_CODES:
                logging.warning(
                    'Rate limited. Marking as error and finishing. %s', e)
                source.updates.update({
                    'poll_status': 'error',
                    'rate_limited': True
                })
                return
            elif (code
                  and int(code) / 100 == 5) or util.is_connection_failure(e):
                logging.error(
                    'API call failed. Marking as error and finishing. %s: %s\n%s',
                    code, body, e)
                self.abort(ERROR_HTTP_RETURN_CODE)
            else:
                raise

        # extract silo activity ids, update last_activity_id
        silo_activity_ids = set()
        last_activity_id = source.last_activity_id
        for id, activity in activities.items():
            # maybe replace stored last activity id
            parsed = util.parse_tag_uri(id)
            if parsed:
                id = parsed[1]
            silo_activity_ids.add(id)
            try:
                # try numeric comparison first
                greater = int(id) > int(last_activity_id)
            except (TypeError, ValueError):
                greater = id > last_activity_id
            if greater:
                last_activity_id = id

        if last_activity_id and last_activity_id != source.last_activity_id:
            source.updates['last_activity_id'] = last_activity_id

        # trim cache to just the returned activity ids, so that it doesn't grow
        # without bound. (WARNING: depends on get_activities_response()'s cache key
        # format, e.g. 'PREFIX ACTIVITY_ID'!)
        source.updates['last_activities_cache_json'] = json.dumps({
            k: v
            for k, v in cache.items() if k.split()[-1] in silo_activity_ids
        })

        # Cache to make sure we only fetch the author's h-feed(s) the
        # first time we see it
        fetched_hfeeds = set()

        # narrow down to just public activities
        public = {}
        private = {}
        for id, activity in activities.items():
            (public if source.is_activity_public(activity) else
             private)[id] = activity
        logging.info('Found %d public activities: %s', len(public),
                     public.keys())
        logging.info('Found %d private activities: %s', len(private),
                     private.keys())

        last_public_post = (source.last_public_post or util.EPOCH).isoformat()
        public_published = util.trim_nulls(
            [a.get('published') for a in public.values()])
        if public_published:
            max_published = max(public_published)
            if max_published > last_public_post:
                last_public_post = max_published
                source.updates['last_public_post'] = \
                  util.as_utc(util.parse_iso8601(max_published))

        source.updates['recent_private_posts'] = \
          len([a for a in private.values()
               if a.get('published', util.EPOCH_ISO) > last_public_post])

        #
        # Step 2: extract responses, store their activities in response['activities']
        #
        # WARNING: this creates circular references in link posts found by search
        # queries in step 1, since they are their own activity. We use
        # prune_activity() and prune_response() in step 4 to remove these before
        # serializing to JSON.
        #
        for id, activity in public.items():
            obj = activity.get('object') or activity

            # handle user mentions
            user_id = source.user_tag_id()
            if obj.get('author', {}).get('id') != user_id:
                for tag in obj.get('tags', []):
                    urls = tag.get('urls')
                    if tag.get('objectType') == 'person' and tag.get(
                            'id') == user_id and urls:
                        activity['originals'], activity['mentions'] = \
                          original_post_discovery.discover(
                            source, activity, fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds)
                        activity['mentions'].update(
                            u.get('value') for u in urls)
                        responses[id] = activity
                        break

            # handle quote mentions
            for att in obj.get('attachments', []):
                if (att.get('objectType') in ('note', 'article') and att.get(
                        'author', {}).get('id') == source.user_tag_id()):
                    # now that we've confirmed that one exists, OPD will dig
                    # into the actual attachments
                    if 'originals' not in activity or 'mentions' not in activity:
                        activity['originals'], activity['mentions'] = \
                          original_post_discovery.discover(
                            source, activity, fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds)
                    responses[id] = activity
                    break

            # extract replies, likes, reactions, reposts, and rsvps
            replies = obj.get('replies', {}).get('items', [])
            tags = obj.get('tags', [])
            likes = [t for t in tags if Response.get_type(t) == 'like']
            reactions = [t for t in tags if Response.get_type(t) == 'react']
            reposts = [t for t in tags if Response.get_type(t) == 'repost']
            rsvps = Source.get_rsvps_from_event(obj)

            # coalesce responses. drop any without ids
            for resp in replies + likes + reactions + reposts + rsvps:
                id = resp.get('id')
                if not id:
                    logging.error('Skipping response without id: %s',
                                  json.dumps(resp, indent=2))
                    continue

                resp.setdefault('activities', []).append(activity)

                # when we find two responses with the same id, the earlier one may have
                # come from a link post or user mention; this one more likely came from
                # the user's own activity, so prefer it.
                # background: https://github.com/snarfed/bridgy/issues/533
                existing = responses.get(id)
                if existing:
                    if source.gr_source.activity_changed(resp,
                                                         existing,
                                                         log=True):
                        logging.warning(
                            'Got two different versions of same response!\n%s\n%s',
                            existing, resp)
                    resp['activities'].extend(existing.get('activities', []))

                responses[id] = resp

        #
        # Step 3: filter out responses we've already seen
        #
        # seen responses (JSON objects) for each source are stored in its entity.
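        # responses that match their cached copy are dropped from this poll's
        # work, but kept in unchanged_responses so the refreshed cache below
        # stays complete.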
        unchanged_responses = []
        if source.seen_responses_cache_json:
            for seen in json.loads(source.seen_responses_cache_json):
                id = seen['id']
                resp = responses.get(id)
                if resp and not source.gr_source.activity_changed(
                        seen, resp, log=True):
                    unchanged_responses.append(seen)
                    del responses[id]

        #
        # Step 4: store new responses and enqueue propagate tasks
        #
        pruned_responses = []
        for id, resp in responses.items():
            resp_type = Response.get_type(resp)
            activities = resp.pop('activities', [])
            if not activities and resp_type == 'post':
                activities = [resp]
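            # urls_to_activity maps each webmention target URL to the index of
            # the activity it was discovered in; too_long collects target URLs
            # over _MAX_STRING_LENGTH, which are recorded as failed instead of
            # unsent.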
            too_long = set()
            urls_to_activity = {}
            for i, activity in enumerate(activities):
                # we'll usually have multiple responses for the same activity, and the
                # objects in resp['activities'] are shared, so cache each activity's
                # discovered webmention targets inside its object.
                if 'originals' not in activity or 'mentions' not in activity:
                    activity['originals'], activity['mentions'] = \
                      original_post_discovery.discover(
                        source, activity, fetch_hfeed=True,
                        include_redirect_sources=False,
                        already_fetched_hfeeds=fetched_hfeeds)

                targets = original_post_discovery.targets_for_response(
                    resp,
                    originals=activity['originals'],
                    mentions=activity['mentions'])
                if targets:
                    logging.info('%s has %d webmention target(s): %s',
                                 activity.get('url'), len(targets),
                                 ' '.join(targets))
                for t in targets:
                    if len(t) <= _MAX_STRING_LENGTH:
                        urls_to_activity[t] = i
                    else:
                        logging.warning(
                            'Giving up on target URL over %s chars! %s',
                            _MAX_STRING_LENGTH, t)
                        too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

            # store/update response entity. the prune_*() calls are important to
            # remove circular references in link responses, which are their own
            # activities. details in the step 2 comment above.
            pruned_response = util.prune_response(resp)
            pruned_responses.append(pruned_response)
            resp_entity = Response(id=id,
                                   source=source.key,
                                   activities_json=[
                                       json.dumps(
                                           util.prune_activity(a, source))
                                       for a in activities
                                   ],
                                   response_json=json.dumps(pruned_response),
                                   type=resp_type,
                                   unsent=list(urls_to_activity.keys()),
                                   failed=list(too_long),
                                   original_posts=resp.get('originals', []))
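            # only persist the URL-to-activity index map when there are multiple
            # activities; with a single activity the mapping is unambiguous.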
            if urls_to_activity and len(activities) > 1:
                resp_entity.urls_to_activity = json.dumps(urls_to_activity)
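            # get_or_save() stores the entity and, for new or updated responses,
            # enqueues the propagate task that actually sends the webmentions.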
            resp_entity.get_or_save(source)

        # update cache
        if pruned_responses:
            source.updates['seen_responses_cache_json'] = json.dumps(
                pruned_responses + unchanged_responses)

        source.updates.update({
            'last_polled': source.last_poll_attempt,
            'poll_status': 'ok'
        })
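        # remember the latest etag so the next poll can send it to the silo API
        # and skip re-fetching activities that haven't changed.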
        if etag and etag != source.last_activities_etag:
            source.updates['last_activities_etag'] = etag

        #
        # Step 5. possibly refetch updated syndication urls
        #
        # if the author has added syndication urls since the first time
        # original_post_discovery ran, we'll miss them. this cleanup task will
        # periodically check for updated urls. only kicks in if the author has
        # *ever* published a rel=syndication url
        if source.should_refetch():
            logging.info('refetching h-feed for source %s', source.label())
            relationships = original_post_discovery.refetch(source)

            now = util.now_fn()
            source.updates['last_hfeed_refetch'] = now

            if relationships:
                logging.info(
                    'refetch h-feed found new rel=syndication relationships: %s',
                    relationships)
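                # swallow datastore timeouts and connection failures so the poll
                # still finishes; anything else is re-raised.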
                try:
                    self.repropagate_old_responses(source, relationships)
                except BaseException as e:
                    if (isinstance(e, (datastore_errors.BadRequestError,
                                       datastore_errors.Timeout))
                            or util.is_connection_failure(e)):
                        logging.info('Timeout while repropagating responses.',
                                     exc_info=True)
                    else:
                        raise