def test_rel_feed_anchor(self):
        """Check that we follow the rel=feed when it's in an <a> tag instead of <link>
    """
        self.expect_requests_get(
            "http://author",
            """
    <html>
      <head>
        <link rel="alternate" type="application/xml" href="not_this.html">
        <link rel="alternate" type="application/xml" href="nor_this.html">
      </head>
      <body>
        <a href="try_this.html" rel="feed">full unfiltered feed</a>
      </body>
    </html>""",
        )

        self.expect_requests_get(
            "http://author/try_this.html",
            """
    <html class="h-feed">
      <body>
        <div class="h-entry">Hi</div>
      </body>
    </html>""",
        )

        self.mox.ReplayAll()
        discover(self.source, self.activity)
    def test_rel_feed_link(self):
        """Check that we follow the rel=feed link when looking for the
    author's full feed URL
    """
        self.expect_requests_get(
            "http://author",
            """
    <html>
      <head>
        <link rel="feed" type="text/html" href="try_this.html">
        <link rel="alternate" type="application/xml" href="not_this.html">
        <link rel="alternate" type="application/xml" href="nor_this.html">
      </head>
    </html>""",
        )

        self.expect_requests_get(
            "http://author/try_this.html",
            """
    <html class="h-feed">
      <body>
        <div class="h-entry">Hi</div>
      </body>
    </html>""",
        )

        self.mox.ReplayAll()
        discover(self.source, self.activity)
  def test_avoid_permalink_with_bad_content_type(self):
    """Confirm that we don't follow u-url's that lead to anything that
    isn't text/html (e.g., PDF)
    """
    source = self.sources[0]
    source.domain_urls = ['http://author']
    activity = self.activities[0]
    activity['object']['url'] = 'https://fa.ke/post/url'
    activity['object']['content'] = 'content without links'

    # head request to follow redirects on the post url
    self.expect_requests_head(activity['object']['url'])

    self.expect_requests_head('http://author')
    self.expect_requests_get('http://author', """
    <html>
      <body>
        <div class="h-entry">
          <a href="http://scholarly.com/paper.pdf">An interesting paper</a>
        </div>
      </body>
    </html>
    """)

    # and to check the content-type of the article
    self.expect_requests_head('http://scholarly.com/paper.pdf',
                              response_headers={
                                'content-type': 'application/pdf'
                              })

    # call to requests.get for permalink should be skipped

    self.mox.ReplayAll()
    original_post_discovery.discover(source, activity)
Example #4
def get_webmention_targets(source, activity):
  """Returns a set of string target URLs to attempt to send webmentions to.

  Side effect: runs the original post discovery algorithm on the activity and
  adds the resulting URLs to the activity as tags, in place.

  Args:
   source: models.Source subclass
   activity: activity dict
  """
  original_post_discovery.discover(source, activity)

  targets = set()
  obj = activity.get('object') or activity

  for tag in obj.get('tags', []):
    url = tag.get('url')
    if url and tag.get('objectType') == 'article':
      url, domain, send = util.get_webmention_target(url)
      tag['url'] = url
      if send:
        targets.add(url)

  for url in obj.get('upstreamDuplicates', []):
    url, domain, send = util.get_webmention_target(url)
    if send:
      targets.add(url)

  return targets
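# A minimal, self-contained sketch of the tag/upstreamDuplicates filtering done
# in get_webmention_targets() above. util.get_webmention_target() is stubbed out
# here, since its real implementation lives in Bridgy's util module; the stub's
# (url, domain, send) return shape only mirrors how it's unpacked above, and the
# "only http://author/ URLs are sendable" rule is an assumption for illustration.
def _stub_get_webmention_target(url):
  # stand-in: pretend only http://author/ URLs are valid webmention targets
  return url, 'author', url.startswith('http://author/')

def extract_targets(activity):
  obj = activity.get('object') or activity
  targets = set()
  for tag in obj.get('tags', []):
    url = tag.get('url')
    if url and tag.get('objectType') == 'article':
      url, _, send = _stub_get_webmention_target(url)
      if send:
        targets.add(url)
  for url in obj.get('upstreamDuplicates', []):
    url, _, send = _stub_get_webmention_target(url)
    if send:
      targets.add(url)
  return targets

# prints {'http://author/post/permalink'}; the fa.ke URLs are filtered out by the stub
print(extract_targets({'object': {
  'tags': [{'objectType': 'article', 'url': 'http://author/post/permalink'},
           {'objectType': 'article', 'url': 'https://fa.ke/ignored'}],
  'upstreamDuplicates': ['https://fa.ke/post/url'],
}}))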
    def _test_failed_post_permalink_fetch(self, raise_exception):
        """Make sure something reasonable happens when we're unable to fetch
    the permalink of an entry linked in the h-feed
    """
        self.expect_requests_get(
            "http://author",
            """
    <html class="h-feed">
      <article class="h-entry">
        <a class="u-url" href="nonexistent.html"></a>
      </article>
    </html>
    """,
        )

        if raise_exception:
            self.expect_requests_get("http://author/nonexistent.html").AndRaise(HTTPError())
        else:
            self.expect_requests_get("http://author/nonexistent.html", status_code=410)

        self.mox.ReplayAll()
        discover(self.source, self.activity)
        # we should have saved placeholders to prevent us from trying the
        # syndication url or permalink again
        self.assert_syndicated_posts(("http://author/nonexistent.html", None), (None, "https://fa.ke/post/url"))
  def test_existing_syndicated_posts(self):
    """Confirm that no additional requests are made if we already have a
    SyndicatedPost in the DB.
    """
    original_url = 'http://author/notes/2014/04/24/1'
    syndication_url = 'https://fa.ke/post/url'

    source = self.sources[0]
    source.domain_urls = ['http://author']
    activity = self.activities[0]
    activity['object']['url'] = syndication_url
    activity['object']['content'] = 'content without links'

    # save the syndicated post ahead of time (as if it had been
    # discovered previously)
    SyndicatedPost(parent=source.key, original=original_url,
                   syndication=syndication_url).put()

    self.mox.ReplayAll()

    logging.debug('Original post discovery %s -> %s', source, activity)
    original_post_discovery.discover(source, activity)

    # should append the author note url, with no addt'l requests
    self.assertEquals([original_url], activity['object']['upstreamDuplicates'])
  def test_match_facebook_username_url(self):
    """Facebook URLs use username and user id interchangeably, and one
    does not redirect to the other. Make sure we can still find the
    relationship if author's publish syndication links using their
    username
    """
    source = FacebookPage.new(self.handler, auth_entity=self.auth_entity)
    source.domain_urls = ['http://author']
    activity = self.activities[0]
    # facebook activity comes to us with the numeric id
    activity['object']['url'] = 'http://facebook.com/212038/posts/314159'
    activity['object']['content'] = 'content without links'

    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author/post/permalink"></a>
      </div>
    </html>""")

    # user sensibly publishes syndication link using their username
    self.expect_requests_get('http://author/post/permalink', """
    <html class="h-entry">
      <a class="u-url" href="http://author/post/permalink"></a>
      <a class="u-syndication" href="http://facebook.com/snarfed.org/posts/314159"></a>
    </html>""")

    self.mox.ReplayAll()
    original_post_discovery.discover(source, activity)

    self.assertEquals(['http://author/post/permalink'],
                      activity['object']['upstreamDuplicates'])
  def _test_failed_rel_feed_link_fetch(self, raise_exception):
    """An author page with an invalid rel=feed link. We should recover and
    use any h-entries on the main url as a fallback.
    """
    source = self.sources[0]
    source.domain_urls = ['http://author']
    activity = self.activities[0]

    self.expect_requests_get('http://author', """
    <html>
      <head>
        <link rel="feed" type="text/html" href="try_this.html">
        <link rel="alternate" type="application/xml" href="not_this.html">
        <link rel="alternate" type="application/xml" href="nor_this.html">
      </head>
      <body>
        <div class="h-entry">
          <a class="u-url" href="recover_and_fetch_this.html"></a>
        </div>
      </body>
    </html>""")

    # try to do this and fail
    if raise_exception:
      self.expect_requests_get('http://author/try_this.html').AndRaise(HTTPError())
    else:
      self.expect_requests_get('http://author/try_this.html', status_code=404)

    # despite the error, should fallback on the main page's h-entries and
    # check the permalink
    self.expect_requests_get('http://author/recover_and_fetch_this.html')

    self.mox.ReplayAll()
    logging.debug('Original post discovery %s -> %s', source, activity)
    original_post_discovery.discover(source, activity)
  def test_match_facebook_username_url(self):
    """Facebook URLs use username and user id interchangeably, and one
    does not redirect to the other. Make sure we can still find the
    relationship if author's publish syndication links using their
    username
    """
    auth_entity = oauth_facebook.FacebookAuth(
      id='my_string_id', auth_code='my_code', access_token_str='my_token',
      user_json=json.dumps({'id': '212038', 'username': '******'}))
    auth_entity.put()

    source = FacebookPage.new(self.handler, auth_entity=auth_entity,
                              domain_urls=['http://author'])
    # facebook activity comes to us with the numeric id
    self.activity['object']['url'] = 'http://facebook.com/212038/posts/314159'

    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author/post/permalink"></a>
      </div>
    </html>""")

    # user sensibly publishes syndication link using their username
    self.expect_requests_get('http://author/post/permalink', """
    <html class="h-entry">
      <a class="u-url" href="http://author/post/permalink"></a>
      <a class="u-syndication" href="http://facebook.com/snarfed.org/posts/314159"></a>
    </html>""")

    self.mox.ReplayAll()
    original_post_discovery.discover(source, self.activity)

    self.assertEquals(['http://author/post/permalink'],
                      self.activity['object']['upstreamDuplicates'])
    def test_avoid_permalink_with_bad_content_type(self):
        """Confirm that we don't follow u-url's that lead to anything that
    isn't text/html (e.g., PDF)
    """
        # head request to follow redirects on the post url
        self.expect_requests_head(self.activity["object"]["url"])
        self.expect_requests_head("http://author")
        self.expect_requests_get(
            "http://author",
            """
    <html>
      <body>
        <div class="h-entry">
          <a href="http://scholarly.com/paper.pdf">An interesting paper</a>
        </div>
      </body>
    </html>
    """,
        )

        # and to check the content-type of the article
        self.expect_requests_head(
            "http://scholarly.com/paper.pdf", response_headers={"content-type": "application/pdf"}
        )

        # call to requests.get for permalink should be skipped
        self.mox.ReplayAll()
        discover(self.source, self.activity)
  def test_no_h_entries(self):
    """Make sure nothing bad happens when fetching a feed without
    h-entries
    """
    activity = self.activities[0]
    activity['object']['content'] = 'post content without backlink'
    activity['object']['url'] = 'https://fa.ke/post/url'

    # silo domain is fa.ke
    source = self.sources[0]
    source.domain_urls = ['http://author']

    self.expect_requests_get('http://author', """
    <html class="h-feed">
    <p>under construction</p>
    </html>""")

    self.mox.ReplayAll()
    logging.debug('Original post discovery %s -> %s', source, activity)
    original_post_discovery.discover(source, activity)

    self.assert_equals(
      [(None, 'https://fa.ke/post/url')],
      [(relationship.original, relationship.syndication)
       for relationship in SyndicatedPost.query(ancestor=source.key)])
  def test_do_not_fetch_hfeed(self):
    """Confirms behavior of discover() when fetch_hfeed=False.
    Discovery should only check the database for previously discovered matches.
    It should not make any GET requests.
    """
    discover(self.source, self.activity, fetch_hfeed=False)
    self.assertFalse(SyndicatedPost.query(ancestor=self.source.key).get())
  def test_rel_feed_link(self):
    """Check that we follow the rel=feed link when looking for the
    author's full feed URL
    """
    source = self.sources[0]
    source.domain_urls = ['http://author']
    activity = self.activities[0]

    self.expect_requests_get('http://author', """
    <html>
      <head>
        <link rel="feed" type="text/html" href="try_this.html">
        <link rel="alternate" type="application/xml" href="not_this.html">
        <link rel="alternate" type="application/xml" href="nor_this.html">
      </head>
    </html>""")

    self.expect_requests_get('http://author/try_this.html', """
    <html class="h-feed">
      <body>
        <div class="h-entry">Hi</div>
      </body>
    </html>""")

    self.mox.ReplayAll()
    logging.debug('Original post discovery %s -> %s', source, activity)
    original_post_discovery.discover(source, activity)
  def test_refetch_multiple_responses_same_activity(self):
    """Ensure that refetching a post that has several replies does not
    generate duplicate original -> None blank entries in the
    database. See https://github.com/snarfed/bridgy/issues/259 for
    details
    """
    for activity in self.activities:
        activity['object']['content'] = 'post content without backlinks'
        activity['object']['url'] = 'https://fa.ke/post/url'

    author_feed = """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author/post/permalink"></a>
      </div>
    </html>"""

    author_entry = """
    <html class="h-entry">
      <a class="u-url" href="http://author/post/permalink"></a>
    </html>"""

    # original
    self.expect_requests_get('http://author', author_feed)
    self.expect_requests_get('http://author/post/permalink', author_entry)
    # refetch
    self.expect_requests_get('http://author', author_feed)
    self.expect_requests_get('http://author/post/permalink', author_entry)
    self.mox.ReplayAll()

    for activity in self.activities:
      discover(self.source, activity)
    refetch(self.source)
    self.assert_syndicated_posts(('http://author/post/permalink', None),
                                 (None, 'https://fa.ke/post/url'))
  def test_feed_head_request_failed(self):
    """Confirm that we don't follow rel=feeds explicitly marked as
    application/xml.
    """
    self.expect_requests_get('http://author', """
    <html>
      <head>
        <link rel="feed" href="/updates">
      </head>
      <body>
        <article class="h-entry">
          <a class="u-url" href="permalink"></a>
        </article>
      </body>
    </html>
    """)

    # head request to follow redirects on the post url
    self.expect_requests_head(self.activity['object']['url'])

    # and for the author url
    self.expect_requests_head('http://author')

    # try and fail to get the feed
    self.expect_requests_head('http://author/updates', status_code=400)
    self.expect_requests_get('http://author/updates', status_code=400)

    # fall back on the original page, and fetch the post permalink
    self.expect_requests_head('http://author/permalink')
    self.expect_requests_get('http://author/permalink', '<html></html>')

    self.mox.ReplayAll()
    discover(self.source, self.activity)
  def test_rel_feed_link_error(self):
    """Author page has an h-feed link that raises an exception. We should
    recover and use the main page's h-entries as a fallback."""
    self.expect_requests_get('http://author', """
    <html>
      <head>
        <link rel="feed" type="text/html" href="try_this.html">
        <link rel="alternate" type="application/xml" href="not_this.html">
        <link rel="alternate" type="application/xml" href="nor_this.html">
      </head>
      <body>
        <div class="h-entry">
          <a class="u-url" href="recover_and_fetch_this.html"></a>
        </div>
      </body>
    </html>""")

    # try to do this and fail
    self.expect_requests_get('http://author/try_this.html', 'nope',
                             status_code=404)

    # despite the error, should fallback on the main page's h-entries and
    # check the permalink
    self.expect_requests_get('http://author/recover_and_fetch_this.html', 'ok')

    self.mox.ReplayAll()
    discover(self.source, self.activity)
  def test_rel_feed_anchor(self):
    """Check that we follow the rel=feed when it's in an <a> tag instead of <link>
    """
    source = self.sources[0]
    source.domain_urls = ['http://author']
    activity = self.activities[0]

    self.expect_requests_get('http://author', """
    <html>
      <head>
        <link rel="alternate" type="application/xml" href="not_this.html">
        <link rel="alternate" type="application/xml" href="nor_this.html">
      </head>
      <body>
        <a href="try_this.html" rel="feed">full unfiltered feed</a>
      </body>
    </html>""")

    self.expect_requests_get('http://author/try_this.html', """
    <html class="h-feed">
      <body>
        <div class="h-entry">Hi</div>
      </body>
    </html>""")

    self.mox.ReplayAll()
    logging.debug('Original post discovery %s -> %s', source, activity)
    original_post_discovery.discover(source, activity)
  def _test_failed_post_permalink_fetch(self, raise_exception):
    """Make sure something reasonable happens when we're unable to fetch
    the permalink of an entry linked in the h-feed
    """
    source = self.sources[0]
    source.domain_urls = ['http://author']
    activity = self.activities[0]
    activity['object']['url'] = 'https://fa.ke/post/url'
    activity['object']['content'] = 'content without links'

    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <article class="h-entry">
        <a class="u-url" href="nonexistent.html"></a>
      </article>
    </html>
    """)

    if raise_exception:
      self.expect_requests_get('http://author/nonexistent.html').AndRaise(HTTPError())
    else:
      self.expect_requests_get('http://author/nonexistent.html', status_code=410)

    self.mox.ReplayAll()
    original_post_discovery.discover(source, activity)

    # we should have saved placeholders to prevent us from trying the
    # syndication url or permalink again
    self.assert_equals(
      set([('http://author/nonexistent.html', None), (None, 'https://fa.ke/post/url')]),
      set((relationship.original, relationship.syndication)
          for relationship in SyndicatedPost.query(ancestor=source.key)))
  def test_syndication_url_in_hfeed(self):
    """Like test_single_post, but because the syndication URL is given in
    the h-feed we skip fetching the permalink. New behavior as of
    2014-11-08
    """
    self.activity['object']['upstreamDuplicates'] = ['existing uD']

    # silo domain is fa.ke
    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author/post/permalink"></a>
        <a class="u-syndication" href="http://fa.ke/post/url">
      </div>
    </html>""")

    self.mox.ReplayAll()
    logging.debug('Original post discovery %s -> %s', self.source, self.activity)
    original_post_discovery.discover(self.source, self.activity)

    # upstreamDuplicates = 1 original + 1 discovered
    self.assertEquals(['existing uD', 'http://author/post/permalink'],
                      self.activity['object']['upstreamDuplicates'])

    origurls = [r.original for r in SyndicatedPost.query(ancestor=self.source.key)]
    self.assertEquals([u'http://author/post/permalink'], origurls)

    # for now only syndicated posts belonging to this source are stored
    syndurls = list(r.syndication for r
                    in SyndicatedPost.query(ancestor=self.source.key))

    self.assertEquals([u'https://fa.ke/post/url'], syndurls)
  def test_no_author_url(self):
    """Make sure something reasonable happens when the author doesn't have
    a url at all.
    """
    self.source.domain_urls = []
    discover(self.source, self.activity)
    # nothing attempted, and no SyndicatedPost saved
    self.assertFalse(SyndicatedPost.query(ancestor=self.source.key).get())
  def test_multiple_refetches(self):
    """Ensure that multiple refetches of the same post (with and without
    u-syndication) does not generate duplicate blank entries in the
    database. See https://github.com/snarfed/bridgy/issues/259 for details
    """
    self.activities[0]['object'].update({
      'content': 'post content without backlinks',
      'url': 'https://fa.ke/post/url',
    })

    hfeed = """<html class="h-feed">
    <a class="h-entry" href="/permalink"></a>
    </html>"""

    unsyndicated = """<html class="h-entry">
    <a class="u-url" href="/permalink"></a>
    </html>"""

    syndicated = """<html class="h-entry">
    <a class="u-url" href="/permalink"></a>
    <a class="u-syndication" href="https://fa.ke/post/url"></a>
    </html>"""

    # first attempt, no syndication url yet
    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', unsyndicated)

    # refetch, still no syndication url
    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', unsyndicated)

    # second refetch, has a syndication url this time
    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', syndicated)

    self.mox.ReplayAll()
    original_post_discovery.discover(self.source, self.activities[0])
    original_post_discovery.refetch(self.source)

    relations = list(
      SyndicatedPost.query(
        SyndicatedPost.original == 'http://author/permalink',
        ancestor=self.source.key).fetch())

    self.assertEquals(1, len(relations))
    self.assertEquals('http://author/permalink', relations[0].original)
    self.assertIsNone(relations[0].syndication)

    original_post_discovery.refetch(self.source)

    relations = list(
      SyndicatedPost.query(
        SyndicatedPost.original == 'http://author/permalink',
        ancestor=self.source.key).fetch())

    self.assertEquals(1, len(relations))
    self.assertEquals('http://author/permalink', relations[0].original)
    self.assertEquals('https://fa.ke/post/url', relations[0].syndication)
  def test_merge_front_page_and_h_feed(self):
    """Make sure we are correctly merging the front page and rel-feed by
    checking that we visit h-entries that appear only on the front page or
    only on the rel-feed page.
    """
    activity = self.activities[0]
    activity['object'].update({
        'content': 'post content without backlink',
        'url': 'https://fa.ke/post/url',
        'upstreamDuplicates': ['existing uD'],
    })

    # silo domain is fa.ke
    source = self.sources[0]
    source.domain_urls = ['http://author']

    self.expect_requests_get('http://author', """
    <link rel="feed" href="/feed">
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author/only-on-frontpage"></a>
      </div>
      <div class="h-entry">
        <a class="u-url" href="http://author/on-both"></a>
      </div>
    </html>""")

    self.expect_requests_get('http://author/feed', """
    <link rel="feed" href="/feed">
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author/on-both"></a>
      </div>
      <div class="h-entry">
        <a class="u-url" href="http://author/only-on-feed"></a>
      </div>
    </html>""")

    for orig in ('/only-on-frontpage', '/on-both', '/only-on-feed'):
      self.expect_requests_get('http://author%s' % orig,
                               """<div class="h-entry">
                                 <a class="u-url" href="%s"></a>
                               </div>""" % orig).InAnyOrder()

    self.mox.ReplayAll()
    logging.debug('Original post discovery %s -> %s', source, activity)
    original_post_discovery.discover(source, activity)

    # should be three blank SyndicatedPosts now
    for orig in ('http://author/only-on-frontpage',
                 'http://author/on-both',
                 'http://author/only-on-feed'):
      logging.debug('checking %s', orig)
      sp = SyndicatedPost.query(
        SyndicatedPost.original == orig,
        ancestor=source.key).get()
      self.assertTrue(sp)
      self.assertIsNone(sp.syndication)
    def test_feed_type_unknown(self):
        """Confirm that we look for an h-feed with type=text/html even when
    the type is not given in <link>, and keep looking until we find one.
    """
        self.expect_requests_get(
            "http://author",
            """
    <html>
      <head>
        <link rel="feed" href="/updates.atom">
        <link rel="feed" href="/updates.html">
        <link rel="feed" href="/updates.rss">
      </head>
    </html>""",
        )

        # head request to follow redirects on the post url
        self.expect_requests_head(self.activity["object"]["url"])

        # and for the author url
        self.expect_requests_head("http://author")

        # try to get the atom feed first
        self.expect_requests_head("http://author/updates.atom", content_type="application/xml")

        # keep looking for an html feed
        self.expect_requests_head("http://author/updates.html")

        # look at the rss feed last
        self.expect_requests_head("http://author/updates.rss", content_type="application/xml")

        # now fetch the html feed
        self.expect_requests_get(
            "http://author/updates.html",
            """
    <html class="h-feed">
      <article class="h-entry">
        <a class="u-url" href="/permalink">should follow this</a>
      </article>
    </html>""",
        )

        # should not try to get the rss feed at this point
        # but we will follow the post permalink

        # head request to follow redirects on the permalink
        self.expect_requests_head("http://author/permalink")
        self.expect_requests_get(
            "http://author/permalink",
            """
    <html class="h-entry">
      <p class="p-name">Title</p>
    </html>""",
        )

        self.mox.ReplayAll()
        discover(self.source, self.activity)
    def test_feed_type_unknown(self):
        """Confirm that we look for an h-feed with type=text/html even when
    the type is not given in <link>, and keep looking until we find one.
    """
        source = self.sources[0]
        source.domain_urls = ['http://author']
        activity = self.activities[0]
        activity['object']['url'] = 'http://fa.ke/post/url'
        activity['object']['content'] = 'content without links'

        self.mox.StubOutWithMock(requests, 'head', use_mock_anything=True)

        self.expect_requests_get(
            'http://author', """
    <html>
      <head>
        <link rel="feed" href="/updates.atom">
        <link rel="feed" href="/updates.html">
        <link rel="feed" href="/updates.rss">
      </head>
    </html>""")

        # head request to follow redirects on the post url
        self.expect_requests_head(activity['object']['url'])

        # and for the author url
        self.expect_requests_head('http://author')

        # try to get the atom feed first
        self.expect_requests_head(
            'http://author/updates.atom', content_type='application/xml')

        # keep looking for an html feed
        self.expect_requests_head('http://author/updates.html')

        # now fetch the html feed
        self.expect_requests_get(
            'http://author/updates.html', """
    <html class="h-feed">
      <article class="h-entry">
        <a class="u-url" href="/permalink">should follow this</a>
      </article>
    </html>""")

        # should not try to get the rss feed at this point
        # but we will follow the post permalink

        # head request to follow redirects on the permalink
        self.expect_requests_head('http://author/permalink')
        self.expect_requests_get(
            'http://author/permalink', """
    <html class="h-entry">
      <p class="p-name">Title</p>
    </html>""")

        self.mox.ReplayAll()
        original_post_discovery.discover(source, activity)
  def test_multiple_rel_feeds(self):
    """Make sure that we follow all rel=feed links, e.g. if notes and
    articles are in separate feeds."""

    self.expect_requests_get('http://author', """
    <html>
      <head>
        <link rel="feed" href="/articles" type="text/html">
        <link rel="feed" href="/notes" type="text/html">
      </head>
    </html>""")

    # fetches all feeds first
    self.expect_requests_get('http://author/articles', """
    <html class="h-feed">
      <article class="h-entry">
        <a class="u-url" href="/article-permalink"></a>
      </article>
    </html>""").InAnyOrder('feed')

    self.expect_requests_get('http://author/notes', """
    <html class="h-feed">
      <article class="h-entry">
        <a class="u-url" href="/note-permalink"></a>
      </article>
    </html>""").InAnyOrder('feed')

    # then the permalinks (in any order since they are hashed to
    # remove duplicates)
    self.expect_requests_get('http://author/article-permalink', """
    <html class="h-entry">
      <a class="u-url" href="/article-permalink"></a>
      <a class="u-syndication" href="https://fa.ke/article"></a>
    </html>""").InAnyOrder('permalink')

    self.expect_requests_get('http://author/note-permalink', """
    <html class="h-entry">
      <a class="u-url" href="/note-permalink"></a>
      <a class="u-syndication" href="https://fa.ke/note"></a>
    </html>""").InAnyOrder('permalink')

    self.mox.ReplayAll()
    original_post_discovery.discover(self.source, self.activity)

    note_rels = SyndicatedPost.query(
      SyndicatedPost.original == 'http://author/note-permalink',
      ancestor=self.source.key).fetch()

    self.assertEqual(1, len(note_rels))
    self.assertEqual('https://fa.ke/note', note_rels[0].syndication)

    article_rels = SyndicatedPost.query(
      SyndicatedPost.original == 'http://author/article-permalink',
      ancestor=self.source.key).fetch()

    self.assertEqual(1, len(article_rels))
    self.assertEqual('https://fa.ke/article', article_rels[0].syndication)
    def test_avoid_author_page_with_bad_content_type(self):
        """Confirm that we check the author page's content type before
    fetching and parsing it
    """
        # head request to follow redirects on the post url
        self.expect_requests_head(self.activity["object"]["url"])
        self.expect_requests_head("http://author", response_headers={"content-type": "application/xml"})

        # give up
        self.mox.ReplayAll()
        discover(self.source, self.activity)
  def test_invalid_webmention_target(self):
    """Confirm that no additional requests are made if the author url is
    an invalid webmention target. Right now this pretty much just
    means they're on the blacklist. Eventually we want to filter out
    targets that don't have certain features, like a webmention
    endpoint or microformats.
    """
    self.source.domain_urls = ['http://amazon.com']
    discover(self.source, self.activity)
    # nothing attempted, but we should have saved a placeholder to prevent us
    # from trying again
    self.assert_syndicated_posts((None, 'https://fa.ke/post/url'))
  def test_no_h_entries(self):
    """Make sure nothing bad happens when fetching a feed without h-entries.
    """
    self.expect_requests_get('http://author', """
    <html class="h-feed">
    <p>under construction</p>
    </html>""")

    self.mox.ReplayAll()
    logging.debug('Original post discovery %s -> %s', self.source, self.activity)
    original_post_discovery.discover(self.source, self.activity)
    self.assert_syndicated_posts((None, 'https://fa.ke/post/url'))
Example #29
  def add_original_post_urls(self, post_id, obj, prop):
    """Extracts original post URLs and adds them to an object, in place.

    If the post object has upstreamDuplicates, *only* they are considered
    original post URLs and added as tags with objectType 'article', and the
    post's own links and 'article' tags are added with objectType 'mention'.

    Args:
      post_id: string post id
      obj: ActivityStreams post object
      prop: string property name in obj to add the original post URLs to
    """
    post = None
    try:
      post = self.source.get_post(post_id)
    except:
      logging.warning('Error fetching source post %s', post_id, exc_info=True)
      return
    if not post:
      logging.warning('Source post %s not found', post_id)
      return

    original_post_discovery.discover(self.source, post, fetch_hfeed=False)
    tags = [tag for tag in post['object'].get('tags', [])
            if 'url' in tag and tag['objectType'] == 'article']
    upstreams = post['object'].get('upstreamDuplicates', [])

    if not isinstance(obj.setdefault(prop, []), list):
      obj[prop] = [obj[prop]]
    if upstreams:
      obj[prop] += [{'url': url, 'objectType': 'article'} for url in upstreams]
      obj.setdefault('tags', []).extend(
        [{'url': tag.get('url'), 'objectType': 'mention'} for tag in tags])
    else:
      obj[prop] += tags

    # check for redirects, and if there are any follow them and add final urls
    # in addition to the initial urls.
    seen = set()
    for url_list in obj[prop], obj.get('tags', []):
      for url_obj in url_list:
        url = util.clean_webmention_url(url_obj.get('url', ''))
        if not url or url in seen:
          continue
        seen.add(url)
        # when debugging locally, replace my (snarfed.org) URLs with localhost
        url_obj['url'] = url = util.replace_test_domains_with_localhost(url)
        resolved, _, send = util.get_webmention_target(url)
        if send and resolved != url and resolved not in seen:
          seen.add(resolved)
          url_list.append({'url': resolved, 'objectType': url_obj.get('objectType')})

    logging.info('After original post discovery, urls are: %s', seen)
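# Illustrative sketch of the upstreamDuplicates branch in add_original_post_urls()
# above, using made-up post data (the URLs and the 'inReplyTo' property are
# assumptions, not real Bridgy output). It shows how upstream URLs end up on the
# target property as 'article' objects while the post's own article tags are
# demoted to 'mention' tags.
obj = {'inReplyTo': []}
prop = 'inReplyTo'
tags = [{'url': 'http://author/linked-page', 'objectType': 'article'}]
upstreams = ['http://author/original-post']

if upstreams:
  obj[prop] += [{'url': url, 'objectType': 'article'} for url in upstreams]
  obj.setdefault('tags', []).extend(
    [{'url': tag.get('url'), 'objectType': 'mention'} for tag in tags])
else:
  obj[prop] += tags

# obj is now:
# {'inReplyTo': [{'url': 'http://author/original-post', 'objectType': 'article'}],
#  'tags': [{'url': 'http://author/linked-page', 'objectType': 'mention'}]}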
  def test_rel_feed_adds_to_domains(self):
    """rel=feed discovery should update Source.domains."""
    self.expect_requests_get('http://author', """
    <html>
      <head>
        <link rel="feed" type="text/html" href="http://other/domain">
      </head>
    </html>""")
    self.expect_requests_get('http://other/domain', 'foo')
    self.mox.ReplayAll()

    discover(self.source, self.activity)
    self.assertEquals(['author', 'other'], self.source.updates['domains'])
Example #31
def retry():
    entity = util.load_source()
    if not isinstance(entity, Webmentions):
        error(f'Unexpected key kind {entity.key.kind()}')

    source = entity.source.get()

    # run OPD to pick up any new SyndicatedPosts. note that we don't refetch
    # their h-feed, so if they've added a syndication URL since we last crawled,
    # retry won't make us pick it up. background in #524.
    if entity.key.kind() == 'Response':
        source = entity.source.get()
        for activity in [json_loads(a) for a in entity.activities_json]:
            originals, mentions = original_post_discovery.discover(
                source,
                activity,
                fetch_hfeed=False,
                include_redirect_sources=False)
            entity.unsent += original_post_discovery.targets_for_response(
                json_loads(entity.response_json),
                originals=originals,
                mentions=mentions)

    entity.restart()
    flash('Retrying. Refresh in a minute to see the results!')
    return redirect(request.values.get('redirect_to') or source.bridgy_url())
Example #32
    def post(self):
        entity = self.load_source(param='key')
        if not isinstance(entity, Webmentions):
            self.abort(400, 'Unexpected key kind %s', entity.key.kind())

        # run OPD to pick up any new SyndicatedPosts. note that we don't refetch
        # their h-feed, so if they've added a syndication URL since we last crawled,
        # retry won't make us pick it up. background in #524.
        if entity.key.kind() == 'Response':
            source = entity.source.get()
            for activity in [json_loads(a) for a in entity.activities_json]:
                originals, mentions = original_post_discovery.discover(
                    source,
                    activity,
                    fetch_hfeed=False,
                    include_redirect_sources=False)
                entity.unsent += original_post_discovery.targets_for_response(
                    json_loads(entity.response_json),
                    originals=originals,
                    mentions=mentions)

        entity.restart()
        self.messages.add('Retrying. Refresh in a minute to see the results!')
        self.redirect(
            self.request.get('redirect_to')
            or entity.source.get().bridgy_url(self))
Example #33
  def get_item(self, post_id, user_id, reaction_id):
    post = self.get_post(post_id)
    reaction = self.source.gr_source.get_reaction(
      self.source.key_id(), post_id, user_id, reaction_id, activity=post)
    if post:
      originals, mentions = original_post_discovery.discover(
        self.source, post, fetch_hfeed=False)
      self.merge_urls(reaction, 'object', originals)
    return reaction
Example #34
  def get_item(self, event_id, user_id):
    event = self.source.gr_source.get_event(event_id)
    rsvp = self.source.gr_source.get_rsvp(
      self.source.key_id(), event_id, user_id, event=event)
    if event:
      originals, mentions = original_post_discovery.discover(
        self.source, event, fetch_hfeed=False)
      self.merge_urls(rsvp, 'inReplyTo', originals)
    return rsvp
Example #35
  def get_item(self, post_id, user_id):
    post = self.get_post(post_id, fetch_likes=True)
    like = self.source.get_like(self.source.key_id(), post_id, user_id,
                                activity=post)
    if post:
      originals, mentions = original_post_discovery.discover(
        self.source, post, fetch_hfeed=False)
      self.merge_urls(like, 'object', originals)
    return like
Example #36
  def get_item(self, post_id, id):
    post = self.get_post(post_id, fetch_replies=True)
    cmt = self.source.get_comment(
      id, activity_id=post_id, activity_author_id=self.source.key_id(),
      activity=post)
    if post:
      originals, mentions = original_post_discovery.discover(
        self.source, post, fetch_hfeed=False)
      self.merge_urls(cmt, 'inReplyTo', originals)
      self.merge_urls(cmt, 'tags', mentions, object_type='mention')
    return cmt
Example #37
  def get_item(self, id):
    posts = self.source.get_activities(activity_id=id, user_id=self.source.key_id())
    if not posts:
      return None

    post = posts[0]
    originals, mentions = original_post_discovery.discover(
      self.source, post, fetch_hfeed=False)
    obj = post['object']
    obj['upstreamDuplicates'] = list(
      set(util.get_list(obj, 'upstreamDuplicates')) | originals)
    self.merge_urls(obj, 'tags', mentions, object_type='mention')
    return obj
Example #38
  def get_item(self, post_id, share_id):
    post = self.get_post(post_id, fetch_shares=True)
    repost = self.source.gr_source.get_share(
      self.source.key_id(), post_id, share_id, activity=post)
    # webmention receivers don't want to see their own post in their
    # comments, so remove attachments before rendering.
    if repost and 'attachments' in repost:
      del repost['attachments']
    if post:
      originals, mentions = original_post_discovery.discover(
        self.source, post, fetch_hfeed=False)
      self.merge_urls(repost, 'object', originals)
    return repost
Example #39
  def get_item(self, post_id, id):
    fetch_replies = not self.source.gr_source.OPTIMIZED_COMMENTS
    post = self.get_post(post_id, fetch_replies=fetch_replies)
    cmt = self.source.get_comment(id,
                                  activity_id=post_id,
                                  activity_author_id=self.source.key_id(),
                                  activity=post if fetch_replies else None)
    if post:
      originals, mentions = original_post_discovery.discover(
        self.source, post, fetch_hfeed=False)
      self.merge_urls(cmt, 'inReplyTo', originals)
      self.merge_urls(cmt, 'tags', mentions, object_type='mention')
    return cmt
Example #40
  def post(self):
    entity = ndb.Key(urlsafe=util.get_required_param(self, 'key')).get()
    if not entity:
      self.abort(400, 'key not found')

    # start all target URLs over
    if entity.status == 'complete':
      entity.status = 'new'

    targets = set(entity.unsent + entity.sent + entity.skipped + entity.error +
                  entity.failed)
    entity.sent = entity.skipped = entity.error = entity.failed = []

    # run OPD to pick up any new SyndicatedPosts. note that we don't refetch
    # their h-feed, so if they've added a syndication URL since we last crawled,
    # retry won't make us pick it up. background in #524.
    if entity.key.kind() == 'Response':
      source = entity.source.get()
      for activity in [json.loads(a) for a in entity.activities_json]:
        originals, mentions = original_post_discovery.discover(
          source, activity, fetch_hfeed=False, include_redirect_sources=False)
        targets |= original_post_discovery.targets_for_response(
          json.loads(entity.response_json), originals=originals, mentions=mentions)

    entity.unsent = targets
    entity.put()

    # clear any cached webmention endpoints
    memcache.delete_multi(util.webmention_endpoint_cache_key(url) for url in targets)

    if entity.key.kind() == 'Response':
      util.add_propagate_task(entity)
    elif entity.key.kind() == 'BlogPost':
      util.add_propagate_blogpost_task(entity)
    else:
      self.abort(400, 'Unexpected key kind %s', entity.key.kind())

    self.messages.add('Retrying. Refresh in a minute to see the results!')
    self.redirect(self.request.get('redirect_to').encode('utf-8') or
                  entity.source.get().bridgy_url(self))
Example #41
    def receive(self, email):
        addr = self.request.path.split('/')[-1]
        message_id = email.original.get('message-id').strip('<>')
        sender = getattr(email, 'sender', None)
        to = getattr(email, 'to', None)
        cc = getattr(email, 'cc', None)
        subject = getattr(email, 'subject', None)
        logging.info('Received %s from %s to %s (%s) cc %s: %s', message_id,
                     sender, to, addr, cc, subject)

        addr = self.request.path.split('/')[-1]
        user = addr.split('@')[0]
        source = FacebookEmailAccount.query(
            FacebookEmailAccount.email_user == user).get()
        logging.info('Source for %s is %s', user, source)

        util.email_me(subject='New email from %s: %s' % (sender, subject),
                      body='Source: %s' %
                      (source.bridgy_url(self) if source else None))

        htmls = list(body.decode() for _, body in email.bodies('text/html'))
        fbe = FacebookEmail.get_or_insert(
            message_id, source=source.key if source else None, htmls=htmls)
        logging.info('FacebookEmail created %s: %s', fbe.created,
                     fbe.key.urlsafe())

        if not source:
            self.response.status_code = 404
            self.response.write(
                'No Facebook email user found with address %s' % addr)
            return

        for html in htmls:
            obj = gr_facebook.Facebook.email_to_object(html)
            if obj:
                break
        else:
            self.response.status_code = 400
            self.response.write('No HTML body could be parsed')
            return
        logging.info('Converted to AS1: %s', json.dumps(obj, indent=2))

        base_obj = source.gr_source.base_object(obj)
        # note that this ignores the id query param (the post's user id) and uses
        # the source object's user id instead.
        base_obj['url'] = source.canonicalize_url(base_obj['url'])
        # also note that base_obj['id'] is not a tag URI, it's the raw Facebook post
        # id, eg '104790764108207'. we don't use it from activities_json much,
        # though, just in PropagateResponse.source_url(), which handles this fine.

        original_post_discovery.refetch(source)
        targets, mentions = original_post_discovery.discover(source,
                                                             base_obj,
                                                             fetch_hfeed=False)
        logging.info('Got targets %s mentions %s', targets, mentions)

        resp = Response(id=obj['id'],
                        source=source.key,
                        type=Response.get_type(obj),
                        response_json=json.dumps(obj),
                        activities_json=[json.dumps(base_obj)],
                        unsent=targets)
        resp.get_or_save(source, restart=True)

        fbe.response = resp.key
        fbe.put()
Example #42
    def backfeed(self, source, responses=None, activities=None):
        """Processes responses and activities and generates propagate tasks.

    Stores property names and values to update in source.updates.

    Args:
      source: Source
      responses: dict mapping AS response id to AS object
      activities: dict mapping AS activity id to AS object
    """
        if responses is None:
            responses = {}
        if activities is None:
            activities = {}

        # Cache to make sure we only fetch the author's h-feed(s) the
        # first time we see it
        fetched_hfeeds = set()

        # narrow down to just public activities
        public = {}
        private = {}
        for id, activity in activities.items():
            (public if source.is_activity_public(activity) else
             private)[id] = activity
        logging.info('Found %d public activities: %s', len(public),
                     public.keys())
        logging.info('Found %d private activities: %s', len(private),
                     private.keys())

        last_public_post = (source.last_public_post or util.EPOCH).isoformat()
        public_published = util.trim_nulls(
            [a.get('published') for a in public.values()])
        if public_published:
            max_published = max(public_published)
            if max_published > last_public_post:
                last_public_post = max_published
                source.updates['last_public_post'] = \
                  util.as_utc(util.parse_iso8601(max_published))

        source.updates['recent_private_posts'] = \
          len([a for a in private.values()
               if a.get('published', util.EPOCH_ISO) > last_public_post])

        #
        # Step 2: extract responses, store their activities in response['activities']
        #
        # WARNING: this creates circular references in link posts found by search
        # queries in step 1, since they are their own activity. We use
        # prune_activity() and prune_response() in step 4 to remove these before
        # serializing to JSON.
        #
        for id, activity in public.items():
            obj = activity.get('object') or activity

            # handle user mentions
            user_id = source.user_tag_id()
            if obj.get(
                    'author',
                {}).get('id') != user_id and activity.get('verb') != 'share':
                for tag in obj.get('tags', []):
                    urls = tag.get('urls')
                    if tag.get('objectType') == 'person' and tag.get(
                            'id') == user_id and urls:
                        activity['originals'], activity['mentions'] = \
                          original_post_discovery.discover(
                            source, activity, fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds)
                        activity['mentions'].update(
                            u.get('value') for u in urls)
                        responses[id] = activity
                        break

            # handle quote mentions
            for att in obj.get('attachments', []):
                if (att.get('objectType') in ('note', 'article') and att.get(
                        'author', {}).get('id') == source.user_tag_id()):
                    # now that we've confirmed that one exists, OPD will dig
                    # into the actual attachments
                    if 'originals' not in activity or 'mentions' not in activity:
                        activity['originals'], activity['mentions'] = \
                          original_post_discovery.discover(
                            source, activity, fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds)
                    responses[id] = activity
                    break

            # extract replies, likes, reactions, reposts, and rsvps
            replies = obj.get('replies', {}).get('items', [])
            tags = obj.get('tags', [])
            likes = [t for t in tags if Response.get_type(t) == 'like']
            reactions = [t for t in tags if Response.get_type(t) == 'react']
            reposts = [t for t in tags if Response.get_type(t) == 'repost']
            rsvps = Source.get_rsvps_from_event(obj)

            # coalesce responses. drop any without ids
            for resp in replies + likes + reactions + reposts + rsvps:
                id = resp.get('id')
                if not id:
                    logging.error('Skipping response without id: %s',
                                  json_dumps(resp, indent=2))
                    continue

                if source.is_blocked(resp):
                    logging.info(
                        'Skipping response by blocked user: %s',
                        json_dumps(resp.get('author') or resp.get('actor'),
                                   indent=2))
                    continue

                resp.setdefault('activities', []).append(activity)

                # when we find two responses with the same id, the earlier one may have
                # come from a link post or user mention, and this one is probably better
                # since it probably came from the user's activity, so prefer this one.
                # background: https://github.com/snarfed/bridgy/issues/533
                existing = responses.get(id)
                if existing:
                    if source.gr_source.activity_changed(resp,
                                                         existing,
                                                         log=True):
                        logging.warning(
                            'Got two different versions of same response!\n%s\n%s',
                            existing, resp)
                    resp['activities'].extend(existing.get('activities', []))

                responses[id] = resp

        #
        # Step 3: filter out responses we've already seen
        #
        # seen responses (JSON objects) for each source are stored in its entity.
        unchanged_responses = []
        if source.seen_responses_cache_json:
            for seen in json_loads(source.seen_responses_cache_json):
                id = seen['id']
                resp = responses.get(id)
                if resp and not source.gr_source.activity_changed(
                        seen, resp, log=True):
                    unchanged_responses.append(seen)
                    del responses[id]

        #
        # Step 4: store new responses and enqueue propagate tasks
        #
        pruned_responses = []
        source.blocked_ids = None

        for id, resp in responses.items():
            resp_type = Response.get_type(resp)
            activities = resp.pop('activities', [])
            if not activities and resp_type == 'post':
                activities = [resp]
            too_long = set()
            urls_to_activity = {}
            for i, activity in enumerate(activities):
                # we'll usually have multiple responses for the same activity, and the
                # objects in resp['activities'] are shared, so cache each activity's
                # discovered webmention targets inside its object.
                if 'originals' not in activity or 'mentions' not in activity:
                    activity['originals'], activity['mentions'] = \
                      original_post_discovery.discover(
                        source, activity, fetch_hfeed=True,
                        include_redirect_sources=False,
                        already_fetched_hfeeds=fetched_hfeeds)

                targets = original_post_discovery.targets_for_response(
                    resp,
                    originals=activity['originals'],
                    mentions=activity['mentions'])
                if targets:
                    logging.info('%s has %d webmention target(s): %s',
                                 activity.get('url'), len(targets),
                                 ' '.join(targets))
                    # new response to propagate! load block list if we haven't already
                    if source.blocked_ids is None:
                        source.load_blocklist()

                for t in targets:
                    if len(t) <= _MAX_STRING_LENGTH:
                        urls_to_activity[t] = i
                    else:
                        logging.info(
                            'Giving up on target URL over %s chars! %s',
                            _MAX_STRING_LENGTH, t)
                        too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

            # store/update response entity. the prune_*() calls are important to
            # remove circular references in link responses, which are their own
            # activities. details in the step 2 comment above.
            pruned_response = util.prune_response(resp)
            pruned_responses.append(pruned_response)
            resp_entity = Response(id=id,
                                   source=source.key,
                                   activities_json=[
                                       json_dumps(
                                           util.prune_activity(a, source))
                                       for a in activities
                                   ],
                                   response_json=json_dumps(pruned_response),
                                   type=resp_type,
                                   unsent=list(urls_to_activity.keys()),
                                   failed=list(too_long),
                                   original_posts=resp.get('originals', []))
            if urls_to_activity and len(activities) > 1:
                resp_entity.urls_to_activity = json_dumps(urls_to_activity)
            resp_entity.get_or_save(source,
                                    restart=self.RESTART_EXISTING_TASKS)

        # update cache
        if pruned_responses:
            source.updates['seen_responses_cache_json'] = json_dumps(
                pruned_responses + unchanged_responses)
Example #43
class Poll(webapp2.RequestHandler):
    """Task handler that fetches and processes new responses from a single source.

  Request parameters:
    source_key: string key of source entity
    last_polled: timestamp, YYYY-MM-DD-HH-MM-SS

  Inserts a propagate task for each response that hasn't been seen before.
  """
    def post(self, *path_args):
        logging.debug('Params: %s', self.request.params)

        key = self.request.params['source_key']
        source = ndb.Key(urlsafe=key).get()
        if not source or source.status == 'disabled' or 'listen' not in source.features:
            logging.error('Source not found or disabled. Dropping task.')
            return
        logging.info('Source: %s %s, %s', source.label(),
                     source.key.string_id(), source.bridgy_url(self))

        last_polled = self.request.params['last_polled']
        if last_polled != source.last_polled.strftime(
                util.POLL_TASK_DATETIME_FORMAT):
            logging.warning(
                'duplicate poll task! deferring to the other task.')
            return

        logging.info('Last poll: %s/log?start_time=%s&key=%s',
                     self.request.host_url,
                     calendar.timegm(source.last_poll_attempt.utctimetuple()),
                     source.key.urlsafe())

        # mark this source as polling
        source.updates = {
            'poll_status': 'polling',
            'last_poll_attempt': util.now_fn(),
        }
        source = models.Source.put_updates(source)

        source.updates = {}
        try:
            self.poll(source)
        except models.DisableSource:
            # the user deauthorized the bridgy app, so disable this source.
            # let the task complete successfully so that it's not retried.
            source.updates['status'] = 'disabled'
            logging.warning('Disabling source!')
        except:
            source.updates['poll_status'] = 'error'
            raise
        finally:
            source = models.Source.put_updates(source)

        # add new poll task. randomize task ETA to within +/- 20% to try to spread
        # out tasks and prevent thundering herds.
        task_countdown = source.poll_period().total_seconds() * random.uniform(
            .8, 1.2)
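        # e.g. a 30 minute poll period yields a countdown between 24 and 36 minutes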
        util.add_poll_task(source, countdown=task_countdown)

        # feeble attempt to avoid hitting the instance memory limit
        source = None
        gc.collect()

    def poll(self, source):
        """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
        if source.last_activities_etag or source.last_activity_id:
            logging.debug('Using ETag %s, last activity id %s',
                          source.last_activities_etag, source.last_activity_id)

        #
        # Step 1: fetch activities:
        # * posts by the user
        # * search all posts for the user's domain URLs to find links
        #
        cache = util.CacheDict()
        if source.last_activities_cache_json:
            cache.update(json.loads(source.last_activities_cache_json))

        try:
            # search for links first so that the user's activities and responses
            # override them if they overlap
            links = source.search_for_links()

            # this user's own activities (and user mentions)
            resp = source.get_activities_response(
                fetch_replies=True,
                fetch_likes=True,
                fetch_shares=True,
                fetch_mentions=True,
                count=50,
                etag=source.last_activities_etag,
                min_id=source.last_activity_id,
                cache=cache)
            etag = resp.get('etag')  # used later
            user_activities = resp.get('items', [])

            # these map ids to AS objects
            responses = {a['id']: a for a in links}
            activities = {a['id']: a for a in links + user_activities}

        except Exception as e:
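            # interpret_http_exception() normalizes exceptions from the underlying
            # HTTP libraries into a (status code string, response body) pair; code
            # may be None if the exception wasn't an HTTP error.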
            code, body = util.interpret_http_exception(e)
            if code == '401':
                msg = 'Unauthorized error: %s' % e
                logging.warning(msg, exc_info=True)
                source.updates['poll_status'] = 'ok'
                raise models.DisableSource(msg)
            elif code in util.HTTP_RATE_LIMIT_CODES:
                logging.warning(
                    'Rate limited. Marking as error and finishing. %s', e)
                source.updates.update({
                    'poll_status': 'error',
                    'rate_limited': True
                })
                return
            elif (code
                  and int(code) // 100 == 5) or util.is_connection_failure(e):
                logging.error(
                    'API call failed. Marking as error and finishing. %s: %s\n%s',
                    code, body, e)
                self.abort(ERROR_HTTP_RETURN_CODE)
            else:
                raise

        # extract silo activity ids, update last_activity_id
        silo_activity_ids = set()
        last_activity_id = source.last_activity_id
        for id, activity in activities.items():
            # maybe replace stored last activity id
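            # ids are often tag URIs, e.g. 'tag:twitter.com,2013:123';
            # parse_tag_uri() splits them into (domain, id), so parsed[1] is the
            # bare silo id. other id formats pass through unchanged.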
            parsed = util.parse_tag_uri(id)
            if parsed:
                id = parsed[1]
            silo_activity_ids.add(id)
            try:
                # try numeric comparison first
                greater = int(id) > int(last_activity_id)
            except (TypeError, ValueError):
                greater = id > last_activity_id
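                # non-numeric ids (or no stored id yet) fall back to string comparison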
            if greater:
                last_activity_id = id

        if last_activity_id and last_activity_id != source.last_activity_id:
            source.updates['last_activity_id'] = last_activity_id

        # trim cache to just the returned activity ids, so that it doesn't grow
        # without bound. (WARNING: depends on get_activities_response()'s cache key
        # format, e.g. 'PREFIX ACTIVITY_ID'!)
        source.updates['last_activities_cache_json'] = json.dumps({
            k: v
            for k, v in cache.items() if k.split()[-1] in silo_activity_ids
        })

        # Cache to make sure we only fetch the author's h-feed(s) the
        # first time we see it
        fetched_hfeeds = set()

        # narrow down to just public activities
        public = {}
        private = {}
        for id, activity in activities.items():
            (public if source.is_activity_public(activity) else
             private)[id] = activity
        logging.info('Found %d public activities: %s', len(public),
                     public.keys())
        logging.info('Found %d private activities: %s', len(private),
                     private.keys())

        last_public_post = (source.last_public_post or util.EPOCH).isoformat()
        public_published = util.trim_nulls(
            [a.get('published') for a in public.values()])
        if public_published:
            max_published = max(public_published)
            if max_published > last_public_post:
                last_public_post = max_published
                source.updates['last_public_post'] = \
                  util.as_utc(util.parse_iso8601(max_published))

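        # count private posts published after the most recent public post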
        source.updates['recent_private_posts'] = \
          len([a for a in private.values()
               if a.get('published', util.EPOCH_ISO) > last_public_post])

        #
        # Step 2: extract responses, store their activities in response['activities']
        #
        # WARNING: this creates circular references in link posts found by search
        # queries in step 1, since they are their own activity. We use
        # prune_activity() and prune_response() in step 4 to remove these before
        # serializing to JSON.
        #
        for id, activity in public.items():
            obj = activity.get('object') or activity

            # handle user mentions
            user_id = source.user_tag_id()
            if obj.get('author', {}).get('id') != user_id:
                for tag in obj.get('tags', []):
                    urls = tag.get('urls')
                    if tag.get('objectType') == 'person' and tag.get(
                            'id') == user_id and urls:
                        activity['originals'], activity['mentions'] = \
                          original_post_discovery.discover(
                            source, activity, fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds)
                        activity['mentions'].update(
                            u.get('value') for u in urls)
                        responses[id] = activity
                        break

            # handle quote mentions
            for att in obj.get('attachments', []):
                if (att.get('objectType') in ('note', 'article') and att.get(
                        'author', {}).get('id') == source.user_tag_id()):
                    # now that we've confirmed that one exists, OPD will dig
                    # into the actual attachments
                    if 'originals' not in activity or 'mentions' not in activity:
                        activity['originals'], activity['mentions'] = \
                          original_post_discovery.discover(
                            source, activity, fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds)
                    responses[id] = activity
                    break

            # extract replies, likes, reactions, reposts, and rsvps
            replies = obj.get('replies', {}).get('items', [])
            tags = obj.get('tags', [])
            likes = [t for t in tags if Response.get_type(t) == 'like']
            reactions = [t for t in tags if Response.get_type(t) == 'react']
            reposts = [t for t in tags if Response.get_type(t) == 'repost']
            rsvps = Source.get_rsvps_from_event(obj)

            # coalesce responses. drop any without ids
            for resp in replies + likes + reactions + reposts + rsvps:
                id = resp.get('id')
                if not id:
                    logging.error('Skipping response without id: %s',
                                  json.dumps(resp, indent=2))
                    continue

                resp.setdefault('activities', []).append(activity)

                # when we find two responses with the same id, the earlier one may have
                # come from a link post or user mention, and this one is probably better
                # since it probably came from the user's activity, so prefer this one.
                # background: https://github.com/snarfed/bridgy/issues/533
                existing = responses.get(id)
                if existing:
                    if source.gr_source.activity_changed(resp,
                                                         existing,
                                                         log=True):
                        logging.warning(
                            'Got two different versions of same response!\n%s\n%s',
                            existing, resp)
                    resp['activities'].extend(existing.get('activities', []))

                responses[id] = resp

        #
        # Step 3: filter out responses we've already seen
        #
        # seen responses (JSON objects) for each source are stored in its entity.
        unchanged_responses = []
        if source.seen_responses_cache_json:
            for seen in json.loads(source.seen_responses_cache_json):
                id = seen['id']
                resp = responses.get(id)
                if resp and not source.gr_source.activity_changed(
                        seen, resp, log=True):
                    unchanged_responses.append(seen)
                    del responses[id]

        #
        # Step 4: store new responses and enqueue propagate tasks
        #
        pruned_responses = []
        for id, resp in responses.items():
            resp_type = Response.get_type(resp)
            activities = resp.pop('activities', [])
            if not activities and resp_type == 'post':
                activities = [resp]
            too_long = set()
            urls_to_activity = {}
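            # urls_to_activity maps each webmention target URL to the index of the
            # activity it was discovered in, matching the order of activities_json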
            for i, activity in enumerate(activities):
                # we'll usually have multiple responses for the same activity, and the
                # objects in resp['activities'] are shared, so cache each activity's
                # discovered webmention targets inside its object.
                if 'originals' not in activity or 'mentions' not in activity:
                    activity['originals'], activity['mentions'] = \
                      original_post_discovery.discover(
                        source, activity, fetch_hfeed=True,
                        include_redirect_sources=False,
                        already_fetched_hfeeds=fetched_hfeeds)

                targets = original_post_discovery.targets_for_response(
                    resp,
                    originals=activity['originals'],
                    mentions=activity['mentions'])
                if targets:
                    logging.info('%s has %d webmention target(s): %s',
                                 activity.get('url'), len(targets),
                                 ' '.join(targets))
                for t in targets:
                    if len(t) <= _MAX_STRING_LENGTH:
                        urls_to_activity[t] = i
                    else:
                        logging.warning(
                            'Giving up on target URL over %s chars! %s',
                            _MAX_STRING_LENGTH, t)
                        too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

            # store/update response entity. the prune_*() calls are important to
            # remove circular references in link responses, which are their own
            # activities. details in the step 2 comment above.
            pruned_response = util.prune_response(resp)
            pruned_responses.append(pruned_response)
            resp_entity = Response(id=id,
                                   source=source.key,
                                   activities_json=[
                                       json.dumps(
                                           util.prune_activity(a, source))
                                       for a in activities
                                   ],
                                   response_json=json.dumps(pruned_response),
                                   type=resp_type,
                                   unsent=list(urls_to_activity.keys()),
                                   failed=list(too_long),
                                   original_posts=resp.get('originals', []))
            if urls_to_activity and len(activities) > 1:
                resp_entity.urls_to_activity = json.dumps(urls_to_activity)
            resp_entity.get_or_save(source)

        # update cache
        if pruned_responses:
            source.updates['seen_responses_cache_json'] = json.dumps(
                pruned_responses + unchanged_responses)

        source.updates.update({
            'last_polled': source.last_poll_attempt,
            'poll_status': 'ok'
        })
        if etag and etag != source.last_activities_etag:
            source.updates['last_activities_etag'] = etag

        #
        # Step 5. possibly refetch updated syndication urls
        #
        # if the author has added syndication urls since the first time
        # original_post_discovery ran, we'll miss them. this cleanup task will
        # periodically check for updated urls. only kicks in if the author has
        # *ever* published a rel=syndication url
        if source.should_refetch():
            logging.info('refetching h-feed for source %s', source.label())
            relationships = original_post_discovery.refetch(source)

            now = util.now_fn()
            source.updates['last_hfeed_refetch'] = now

            if relationships:
                logging.info(
                    'refetch h-feed found new rel=syndication relationships: %s',
                    relationships)
                try:
                    self.repropagate_old_responses(source, relationships)
                except BaseException as e:
                    if (isinstance(e, (datastore_errors.BadRequestError,
                                       datastore_errors.Timeout))
                            or util.is_connection_failure(e)):
                        logging.info('Timeout while repropagating responses.',
                                     exc_info=True)
                    else:
                        raise