def test_refetch_hfeed(self):
        """refetch should grab resources again, even if they were previously
        marked with a blank SyndicatedPost
        """
        # refetch 1 and 3 to see if they've been updated, 2 has already
        # been resolved for this source
        SyndicatedPost(parent=self.source.key, original="http://author/permalink1", syndication=None).put()

        SyndicatedPost(
            parent=self.source.key, original="http://author/permalink2", syndication="https://fa.ke/post/url2"
        ).put()

        SyndicatedPost(parent=self.source.key, original="http://author/permalink3", syndication=None).put()

        self.expect_requests_get(
            "http://author",
            """
      <html class="h-feed">
        <a class="h-entry" href="/permalink1"></a>
        <a class="h-entry" href="/permalink2"></a>
        <a class="h-entry" href="/permalink3"></a>
      </html>""",
        )

        # yay, permalink1 has an updated syndication url
        self.expect_requests_get(
            "http://author/permalink1",
            """
      <html class="h-entry">
        <a class="u-url" href="/permalink1"></a>
        <a class="u-syndication" href="https://fa.ke/post/url1"></a>
      </html>""",
        ).InAnyOrder()

        # permalink2 hasn't changed since we first checked it
        self.expect_requests_get(
            "http://author/permalink2",
            """
      <html class="h-entry">
        <a class="u-url" href="/permalink2"></a>
        <a class="u-syndication" href="https://fa.ke/post/url2"></a>
      </html>""",
        ).InAnyOrder()

        # permalink3 hasn't changed since we first checked it
        self.expect_requests_get(
            "http://author/permalink3",
            """
      <html class="h-entry">
        <a class="u-url" href="/permalink3"></a>
      </html>""",
        ).InAnyOrder()

        self.mox.ReplayAll()
        refetch(self.source)
        self.assert_syndicated_posts(
            ("http://author/permalink1", "https://fa.ke/post/url1"),
            ("http://author/permalink2", "https://fa.ke/post/url2"),
            ("http://author/permalink3", None),
        )
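
# assert_syndicated_posts() isn't shown in this listing. A minimal sketch of
# what a helper like it might check, assuming it compares the expected
# (original, syndication) pairs against every SyndicatedPost stored for the
# test's source; the body below is an assumption, not Bridgy's actual helper.
def assert_syndicated_posts(self, *expected):
  actual = [(r.original, r.syndication)
            for r in SyndicatedPost.query(ancestor=self.source.key)]
  self.assertItemsEqual(expected, actual)  # order-insensitive comparison
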
  def test_refetch_multiple_responses_same_activity(self):
    """Ensure that refetching a post that has several replies does not
    generate duplicate original -> None blank entries in the
    database. See https://github.com/snarfed/bridgy/issues/259 for
    details
    """
    for activity in self.activities:
        activity['object']['content'] = 'post content without backlinks'
        activity['object']['url'] = 'https://fa.ke/post/url'

    author_feed = """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author/post/permalink"></a>
      </div>
    </html>"""

    author_entry = """
    <html class="h-entry">
      <a class="u-url" href="http://author/post/permalink"></a>
    </html>"""

    # original
    self.expect_requests_get('http://author', author_feed)
    self.expect_requests_get('http://author/post/permalink', author_entry)
    # refetch
    self.expect_requests_get('http://author', author_feed)
    self.expect_requests_get('http://author/post/permalink', author_entry)
    self.mox.ReplayAll()

    for activity in self.activities:
      discover(self.source, activity)
    refetch(self.source)
    self.assert_syndicated_posts(('http://author/post/permalink', None),
                                 (None, 'https://fa.ke/post/url'))
  def test_multiple_refetches(self):
    """Ensure that multiple refetches of the same post (with and without
    u-syndication) does not generate duplicate blank entries in the
    database. See https://github.com/snarfed/bridgy/issues/259 for details
    """
    self.activities[0]['object'].update({
      'content': 'post content without backlinks',
      'url': 'https://fa.ke/post/url',
    })

    hfeed = """<html class="h-feed">
    <a class="h-entry" href="/permalink"></a>
    </html>"""

    unsyndicated = """<html class="h-entry">
    <a class="u-url" href="/permalink"></a>
    </html>"""

    syndicated = """<html class="h-entry">
    <a class="u-url" href="/permalink"></a>
    <a class="u-syndication" href="https://fa.ke/post/url"></a>
    </html>"""

    # first attempt, no syndication url yet
    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', unsyndicated)

    # refetch, still no syndication url
    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', unsyndicated)

    # second refetch, has a syndication url this time
    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', syndicated)

    self.mox.ReplayAll()
    original_post_discovery.discover(self.source, self.activities[0])
    original_post_discovery.refetch(self.source)

    relations = list(
      SyndicatedPost.query(
        SyndicatedPost.original == 'http://author/permalink',
        ancestor=self.source.key).fetch())

    self.assertEquals(1, len(relations))
    self.assertEquals('http://author/permalink', relations[0].original)
    self.assertIsNone(relations[0].syndication)

    original_post_discovery.refetch(self.source)

    relations = list(
      SyndicatedPost.query(
        SyndicatedPost.original == 'http://author/permalink',
        ancestor=self.source.key).fetch())

    self.assertEquals(1, len(relations))
    self.assertEquals('http://author/permalink', relations[0].original)
    self.assertEquals('https://fa.ke/post/url', relations[0].syndication)
  def test_refetch_multiple_responses_same_activity(self):
    """Ensure that refetching a post that has several replies does not
    generate duplicate original -> None blank entries in the
    database. See https://github.com/snarfed/bridgy/issues/259 for
    details
    """
    source = self.sources[0]
    source.domain_urls = ['http://author']

    for activity in self.activities:
        activity['object']['content'] = 'post content without backlinks'
        activity['object']['url'] = 'https://fa.ke/post/url'

    author_feed = """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author/post/permalink"></a>
      </div>
    </html>"""

    author_entry = """
    <html class="h-entry">
      <a class="u-url" href="http://author/post/permalink"></a>
    </html>"""

    # original
    self.expect_requests_get('http://author', author_feed)
    self.expect_requests_get('http://author/post/permalink', author_entry)
    # refetch
    self.expect_requests_get('http://author', author_feed)
    self.expect_requests_get('http://author/post/permalink', author_entry)
    self.mox.ReplayAll()

    for activity in self.activities:
      original_post_discovery.discover(source, activity)

    original_post_discovery.refetch(source)

    rels_by_original = list(
      SyndicatedPost.query(SyndicatedPost.original == 'http://author/post/permalink',
                           ancestor=source.key).fetch())

    self.assertEquals(1, len(rels_by_original))
    self.assertIsNone(rels_by_original[0].syndication)

    rels_by_syndication = list(
      SyndicatedPost.query(SyndicatedPost.syndication == 'https://fa.ke/post/url',
                           ancestor=source.key).fetch())

    self.assertEquals(1, len(rels_by_syndication))
    self.assertIsNone(rels_by_syndication[0].original)
    def test_refetch_deleted_syndication(self):
        """Deleted syndication links that have disappeared since our last fetch."""
        SyndicatedPost(
            parent=self.source.key, original="http://author/permalink", syndication="https://fa.ke/post/url"
        ).put()
        self.expect_requests_get(
            "http://author",
            """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="/permalink"></a>
      </div>
    </html>""",
        )
        self.expect_requests_get(
            "http://author/permalink",
            """
      <html class="h-entry">
        <a class="u-url" href="/permalink"></a>
      </html>""",
        )

        self.mox.ReplayAll()
        self.assert_equals({}, refetch(self.source))
        self.assert_syndicated_posts(("http://author/permalink", None))
  def test_refetch_multiple_domain_urls(self):
    """We should refetch all of a source's URLs."""
    self._expect_multiple_domain_url_fetches()
    result = refetch(self.source)
    self.assert_equals(['https://fa.ke/A', 'https://fa.ke/B'], result.keys())
    self.assert_syndicated_posts(('http://author1/A', 'https://fa.ke/A'),
                                 ('http://author3/B', 'https://fa.ke/B'))
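
# _expect_multiple_domain_url_fetches() isn't shown in this listing. A hedged
# sketch of the kind of setup it implies, assuming the source gets several
# domain urls and each author h-feed is fetched (and mox replay started)
# before refetch runs; the urls and markup below are illustrative guesses
# based on the assertions above, not the real helper.
def _expect_multiple_domain_url_fetches(self):
  self.source.domain_urls = ['http://author1', 'http://author3']
  self.expect_requests_get('http://author1', """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author1/A"></a>
        <a class="u-syndication" href="https://fa.ke/A"></a>
      </div>
    </html>""")
  self.expect_requests_get('http://author3', """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author3/B"></a>
        <a class="u-syndication" href="https://fa.ke/B"></a>
      </div>
    </html>""")
  self.mox.ReplayAll()
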
  def test_refetch_unchanged_syndication(self):
    """We should preserve unchanged SyndicatedPosts during refetches."""
    synd = SyndicatedPost(parent=self.source.key,
                          original='http://author/permalink',
                          syndication='https://fa.ke/post/url')
    synd.put()
    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="/permalink"></a>
        <a class="u-syndication" href="https://fa.ke/post/url"></a>
      </div>
    </html>""")

    self.mox.ReplayAll()
    refetch(self.source)
    self.assert_entities_equal([synd], list(SyndicatedPost.query()))
Example #8
  def refetch_hfeed(self, source):
    """refetch and reprocess the author's url, looking for
    new or updated syndication urls that we may have missed the first
    time we looked for them.
    """
    logging.debug('refetching h-feed for source %s', source.label())
    relationships = original_post_discovery.refetch(source)
    if not relationships:
      return

    logging.debug('refetch h-feed found %d new rel=syndication relationships',
                  len(relationships))

    # grab the Responses and see if any of them have a syndication
    # url matching one of the newly discovered relationships. We'll
    # check each response until we've seen all of them or until
    # the 60s timer runs out.
    # TODO maybe add a (canonicalized) url field to Response so we can
    # query by it instead of iterating over all of them (see the sketch
    # after this method)
    for response in (Response.query(Response.source == source.key)
                     .order(-Response.created)):
      if response.activity_json:  # handle old entities
        response.activities_json.append(response.activity_json)
        response.activity_json = None

      new_orig_urls = set()
      for activity_json in response.activities_json:
        activity = json.loads(activity_json)
        activity_url = activity.get('url') or activity.get('object', {}).get('url')
        if not activity_url:
          logging.warning('activity has no url %s', activity_json)
          continue

        activity_url = source.canonicalize_syndication_url(activity_url)
        # look for activity url in the newly discovered list of relationships
        for relationship in relationships.get(activity_url, []):
          # no need to re-propagate if the discovered original is already
          # among the targets this response has been sent to
          if relationship.original in response.sent:
            logging.info(
              '%s found a new rel=syndication link %s -> %s, but the '
              'relationship had already been discovered by another method',
              response.label(), relationship.original,
              relationship.syndication)
          else:
            logging.info(
              '%s found a new rel=syndication link %s -> %s, and '
              'will be repropagated with a new target!',
              response.label(), relationship.original,
              relationship.syndication)
            new_orig_urls.add(relationship.original)

      if new_orig_urls:
        # re-open a previously 'complete' propagate task
        response.status = 'new'
        response.unsent.extend(list(new_orig_urls))
        response.put()
        response.add_task()
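
# The TODO above suggests storing a canonicalized url on each Response so that
# refetch_hfeed() could query for matching responses directly instead of
# scanning every Response for the source. A minimal, hypothetical sketch of
# that idea using App Engine's ndb; ResponseWithUrls, canonical_urls, and
# responses_for_syndication_url() are illustrative names, not Bridgy's models.
from google.appengine.ext import ndb

class ResponseWithUrls(ndb.Model):
  """Hypothetical Response variant that also stores canonicalized activity urls."""
  source = ndb.KeyProperty()
  canonical_urls = ndb.StringProperty(repeated=True)

def responses_for_syndication_url(source, url):
  # with the urls indexed on the entity, only the matching Responses would be
  # loaded, instead of iterating over all of them before the 60s timer runs out
  canonical = source.canonicalize_syndication_url(url)
  return ResponseWithUrls.query(
    ResponseWithUrls.source == source.key,
    ResponseWithUrls.canonical_urls == canonical).fetch()
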
  def test_multiple_refetches(self):
    """Ensure that multiple refetches of the same post (with and without
    u-syndication) does not generate duplicate blank entries in the
    database. See https://github.com/snarfed/bridgy/issues/259 for details
    """
    self.activities[0]['object'].update({
      'content': 'post content without backlinks',
      'url': 'https://fa.ke/post/url',
    })

    hfeed = """<html class="h-feed">
    <a class="h-entry" href="/permalink"></a>
    </html>"""

    unsyndicated = """<html class="h-entry">
    <a class="u-url" href="/permalink"></a>
    </html>"""

    syndicated = """<html class="h-entry">
    <a class="u-url" href="/permalink"></a>
    <a class="u-syndication" href="https://fa.ke/post/url"></a>
    </html>"""

    # first attempt, no syndication url yet
    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', unsyndicated)

    # refetch, still no syndication url
    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', unsyndicated)

    # second refetch, has a syndication url this time
    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', syndicated)

    self.mox.ReplayAll()
    discover(self.source, self.activities[0])
    refetch(self.source)
    self.assert_syndicated_posts(('http://author/permalink', None),
                                 (None, u'https://fa.ke/post/url'))

    refetch(self.source)
    self.assert_syndicated_posts(('http://author/permalink', 'https://fa.ke/post/url'))
  def test_refetch_two_permalinks_same_syndication(self):
    """
    This causes a problem if refetch assumes that syndication-url is
    unique under a given source.
    """
    source = self.sources[0]
    source.domain_urls = ['http://author']

    self.activities[0]['object'].update({
      'content': 'post content without backlinks',
      'url': 'https://fa.ke/post/url',
    })

    hfeed = """<html class="h-feed">
    <a class="h-entry" href="/post1"></a>
    <a class="h-entry" href="/post2"></a>
    </html>"""

    self.expect_requests_get('http://author', hfeed)

    for i in range(2):
      self.expect_requests_get(
        'http://author/post%d' % (i + 1),
        """<html class="h-entry">
        <a class="u-url" href="/post%d"></a>
        <a class="u-syndication" href="https://fa.ke/post/url"></a>
        </html>""" % (i + 1))

    # refetch should only grab the feed
    self.expect_requests_get('http://author', hfeed)

    self.mox.ReplayAll()
    activity = original_post_discovery.discover(source, self.activities[0])
    self.assertItemsEqual(['http://author/post1', 'http://author/post2'],
                          activity['object'].get('upstreamDuplicates'))

    relations = SyndicatedPost.query(ancestor=source.key).fetch()
    self.assertItemsEqual([('http://author/post1', 'https://fa.ke/post/url'),
                           ('http://author/post2', 'https://fa.ke/post/url')],
                          [(relation.original, relation.syndication)
                           for relation in relations])

    # discover should have already handled all relationships, refetch should
    # not find anything
    refetch_result = original_post_discovery.refetch(source)
    self.assertFalse(refetch_result)
    def test_refetch_two_permalinks_same_syndication(self):
        """
    This causes a problem if refetch assumes that syndication-url is
    unique under a given source.
    """
        self.activities[0]["object"].update(
            {"content": "post content without backlinks", "url": "https://fa.ke/post/url"}
        )

        hfeed = """<html class="h-feed">
    <a class="h-entry" href="/post1"></a>
    <a class="h-entry" href="/post2"></a>
    </html>"""

        hentries = [
            (
                "http://author/post%d" % (i + 1),
                """<html class="h-entry">
       <a class="u-url" href="/post%d"></a>
       <a class="u-syndication" href="https://fa.ke/post/url"></a>
       </html>"""
                % (i + 1),
            )
            for i in range(2)
        ]

        self.expect_requests_get("http://author", hfeed)
        for permalink, content in hentries:
            self.expect_requests_get(permalink, content)

        # refetch
        self.expect_requests_get("http://author", hfeed)
        for permalink, content in hentries:
            self.expect_requests_get(permalink, content)

        self.mox.ReplayAll()
        self.assert_discover(["http://author/post1", "http://author/post2"])
        self.assert_syndicated_posts(
            ("http://author/post1", "https://fa.ke/post/url"), ("http://author/post2", "https://fa.ke/post/url")
        )

        # discover should have already handled all relationships, refetch should
        # not find anything
        self.assertFalse(refetch(self.source))
  def test_refetch_permalink_with_two_syndications(self):
    """Test one permalink with two syndicated posts. Make sure that
    refetch doesn't have a problem with two entries for the same
    original URL.
    """
    for idx, activity in enumerate(self.activities):
      activity['object'].update({
        'content': 'post content without backlinks',
        'url': 'https://fa.ke/post/url%d' % (idx + 1),
      })

    hfeed = """<html class="h-feed">
    <a class="h-entry" href="/permalink"></a>
    </html>"""
    hentry = """<html class="h-entry">
    <a class="u-url" href="/permalink"/>
    <a class="u-syndication" href="https://fa.ke/post/url1"/>
    <a class="u-syndication" href="https://fa.ke/post/url3"/>
    <a class="u-syndication" href="https://fa.ke/post/url5"/>
    </html>"""

    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', hentry)

    # refetch
    self.expect_requests_get('http://author', hfeed)
    # refetch grabs posts that it's seen before in case there have
    # been updates
    self.expect_requests_get('http://author/permalink', hentry)

    self.mox.ReplayAll()

    original_post_discovery.discover(self.source, self.activities[0])
    relations = SyndicatedPost.query(
      SyndicatedPost.original == 'http://author/permalink',
      ancestor=self.source.key).fetch()
    self.assertItemsEqual(
      [('http://author/permalink', 'https://fa.ke/post/url1'),
       ('http://author/permalink', 'https://fa.ke/post/url3'),
       ('http://author/permalink', 'https://fa.ke/post/url5')],
      [(r.original, r.syndication) for r in relations])

    results = original_post_discovery.refetch(self.source)
    self.assertFalse(results)
  def test_refetch_changed_syndication(self):
    """Update syndication links that have changed since our last fetch."""
    SyndicatedPost(parent=self.source.key,
                   original='http://author/permalink',
                   syndication='https://fa.ke/post/url').put()
    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="/permalink"></a>
        <a class="u-syndication" href="http://fa.ke/changed/url"></a>
      </div>
    </html>""")

    self.mox.ReplayAll()
    results = refetch(self.source)
    self.assert_syndicated_posts(
      ('http://author/permalink', 'https://fa.ke/changed/url'))
    self.assert_equals({'https://fa.ke/changed/url': list(SyndicatedPost.query())},
                       results)
  def test_refetch_blank_syndication(self):
    """We should preserve blank SyndicatedPosts during refetches."""
    blank = SyndicatedPost(parent=self.source.key,
                           original='http://author/permalink',
                           syndication=None)
    blank.put()
    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="/permalink"></a>
      </div>
    </html>""")
    self.expect_requests_get('http://author/permalink', """
      <html class="h-entry">
        <a class="u-url" href="/permalink"></a>
      </html>""")

    self.mox.ReplayAll()
    self.assert_equals({}, refetch(self.source))
    self.assert_syndicated_posts(('http://author/permalink', None))
  def test_refetch_permalink_with_two_syndications(self):
    """Test one permalink with two syndicated posts. Make sure that
    refetch doesn't have a problem with two entries for the same
    original URL.
    """
    for idx, activity in enumerate(self.activities):
      activity['object'].update({
        'content': 'post content without backlinks',
        'url': 'https://fa.ke/post/url%d' % (idx + 1),
      })

    hfeed = """<html class="h-feed">
    <a class="h-entry" href="/permalink"></a>
    </html>"""
    hentry = """<html class="h-entry">
    <a class="u-url" href="/permalink"></a>
    <a class="u-syndication" href="https://fa.ke/post/url1"></a>
    <a class="u-syndication" href="https://fa.ke/post/url3"></a>
    <a class="u-syndication" href="https://fa.ke/post/url5"></a>
    </html>"""

    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', hentry)

    # refetch
    self.expect_requests_get('http://author', hfeed)
    # refetch grabs posts that it's seen before in case there have been updates
    self.expect_requests_get('http://author/permalink', hentry)

    self.mox.ReplayAll()
    discover(self.source, self.activities[0])
    self.assert_syndicated_posts(
      ('http://author/permalink', 'https://fa.ke/post/url1'),
      ('http://author/permalink', 'https://fa.ke/post/url3'),
      ('http://author/permalink', 'https://fa.ke/post/url5'))
    self.assertFalse(refetch(self.source))
  def test_refetch_hfeed(self):
    """refetch should grab resources again, even if they were previously
    marked with a blank SyndicatedPost
    """
    source = self.sources[0]
    source.domain_urls = ['http://author']

    # refetch 1 and 3 to see if they've been updated, 2 has already
    # been resolved for this source
    SyndicatedPost(parent=source.key,
                   original='http://author/permalink1',
                   syndication=None).put()

    SyndicatedPost(parent=source.key,
                   original='http://author/permalink2',
                   syndication='https://fa.ke/post/url2').put()

    SyndicatedPost(parent=source.key,
                   original='http://author/permalink3',
                   syndication=None).put()

    self.expect_requests_get('http://author', """
      <html class="h-feed">
        <a class="h-entry" href="/permalink1"></a>
        <a class="h-entry" href="/permalink2"></a>
        <a class="h-entry" href="/permalink3"></a>
      </html>""")

    # yay, permalink1 has an updated syndication url
    self.expect_requests_get('http://author/permalink1', """
      <html class="h-entry">
        <a class="u-url" href="/permalink1"></a>
        <a class="u-syndication" href="https://fa.ke/post/url1"></a>
      </html>""").InAnyOrder()

    # permalink3 hasn't changed since we first checked it
    self.expect_requests_get('http://author/permalink3', """
      <html class="h-entry">
        <a class="u-url" href="/permalink3"></a>
      </html>""").InAnyOrder()

    self.mox.ReplayAll()
    original_post_discovery.refetch(source)

    relationships1 = SyndicatedPost.query(
      SyndicatedPost.original == 'http://author/permalink1',
      ancestor=source.key).fetch()

    self.assertTrue(relationships1)
    self.assertEquals('https://fa.ke/post/url1', relationships1[0].syndication)

    relationships2 = SyndicatedPost.query(
      SyndicatedPost.original == 'http://author/permalink2',
      ancestor=source.key).fetch()

    # this shouldn't have changed
    self.assertTrue(relationships2)
    self.assertEquals('https://fa.ke/post/url2', relationships2[0].syndication)

    relationships3 = SyndicatedPost.query(
      SyndicatedPost.original == 'http://author/permalink3',
      ancestor=source.key).fetch()

    self.assertTrue(relationships3)
    self.assertIsNone(relationships3[0].syndication)
  def test_refetch_with_updated_permalink(self):
    """Permalinks can change (e.g., if a stub is added or modified).

    This causes a problem if refetch assumes that syndication-url is
    unique under a given source.
    """
    source = self.sources[0]
    source.domain_urls = ['http://author']

    self.activities[0]['object'].update({
      'content': 'post content without backlinks',
      'url': 'https://fa.ke/post/url',
    })

    # first attempt, no stub yet
    self.expect_requests_get('http://author', """
    <html class="h-feed">
    <a class="h-entry" href="/2014/08/09"></a>
    </html>""")
    self.expect_requests_get('http://author/2014/08/09', """
    <html class="h-entry">
    <a class="u-url" href="/2014/08/09"></a>
    <a class="u-syndication" href="https://fa.ke/post/url"></a>
    </html>""")

    # refetch, permalink has a stub now
    self.expect_requests_get('http://author', """
    <html class="h-feed">
    <a class="h-entry" href="/2014/08/09/this-is-a-stub"></a>
    </html>""")

    self.expect_requests_get('http://author/2014/08/09/this-is-a-stub', """
    <html class="h-entry">
    <a class="u-url" href="/2014/08/09/this-is-a-stub"></a>
    <a class="u-syndication" href="https://fa.ke/post/url"></a>
    </html>""")

    # refetch again (feed-only this time)
    self.expect_requests_get('http://author', """
    <html class="h-feed">
    <a class="h-entry" href="/2014/08/09/this-is-a-stub"></a>
    </html>""")

    self.mox.ReplayAll()
    activity = original_post_discovery.discover(source, self.activities[0])

    # modified activity should have /2014/08/09 as an upstreamDuplicate now
    self.assertEquals(['http://author/2014/08/09'],
                      activity['object']['upstreamDuplicates'])

    # refetch should find the updated original url -> syndication url.
    # it should *not* find the previously discovered relationship.
    first_results = original_post_discovery.refetch(source)
    self.assertEquals(1, len(first_results))
    new_relations = first_results.get('https://fa.ke/post/url')
    self.assertEquals(1, len(new_relations))
    self.assertEquals('https://fa.ke/post/url', new_relations[0].syndication)
    self.assertEquals('http://author/2014/08/09/this-is-a-stub',
                      new_relations[0].original)

    # second refetch should find nothing because nothing has changed
    # since the previous refetch.
    second_results = original_post_discovery.refetch(source)
    self.assertFalse(second_results)
Example #18
class Poll(webapp2.RequestHandler):
    """Task handler that fetches and processes new responses from a single source.

  Request parameters:
    source_key: string key of source entity
    last_polled: timestamp, YYYY-MM-DD-HH-MM-SS

  Inserts a propagate task for each response that hasn't been seen before.
  """
    def post(self, *path_args):
        logging.debug('Params: %s', self.request.params)

        key = self.request.params['source_key']
        source = ndb.Key(urlsafe=key).get()
        if not source or source.status == 'disabled' or 'listen' not in source.features:
            logging.error('Source not found or disabled. Dropping task.')
            return
        logging.info('Source: %s %s, %s', source.label(),
                     source.key.string_id(), source.bridgy_url(self))

        last_polled = self.request.params['last_polled']
        if last_polled != source.last_polled.strftime(
                util.POLL_TASK_DATETIME_FORMAT):
            logging.warning(
                'duplicate poll task! deferring to the other task.')
            return

        logging.info('Last poll: %s/log?start_time=%s&key=%s',
                     self.request.host_url,
                     calendar.timegm(source.last_poll_attempt.utctimetuple()),
                     source.key.urlsafe())

        # mark this source as polling
        source.updates = {
            'poll_status': 'polling',
            'last_poll_attempt': util.now_fn(),
        }
        source = models.Source.put_updates(source)

        source.updates = {}
        try:
            self.poll(source)
        except models.DisableSource:
            # the user deauthorized the bridgy app, so disable this source.
            # let the task complete successfully so that it's not retried.
            source.updates['status'] = 'disabled'
            logging.warning('Disabling source!')
        except:
            source.updates['poll_status'] = 'error'
            raise
        finally:
            source = models.Source.put_updates(source)

        # add new poll task. randomize task ETA to within +/- 20% to try to spread
        # out tasks and prevent thundering herds.
        task_countdown = source.poll_period().total_seconds() * random.uniform(
            .8, 1.2)
        util.add_poll_task(source, countdown=task_countdown)

        # feeble attempt to avoid hitting the instance memory limit
        source = None
        gc.collect()

    def poll(self, source):
        """Actually runs the poll.

        Stores property names and values to update in source.updates.
        """
        if source.last_activities_etag or source.last_activity_id:
            logging.debug('Using ETag %s, last activity id %s',
                          source.last_activities_etag, source.last_activity_id)

        #
        # Step 1: fetch activities:
        # * posts by the user
        # * search all posts for the user's domain URLs to find links
        #
        cache = util.CacheDict()
        if source.last_activities_cache_json:
            cache.update(json.loads(source.last_activities_cache_json))

        try:
            # search for links first so that the user's activities and responses
            # override them if they overlap
            links = source.search_for_links()

            # this user's own activities (and user mentions)
            resp = source.get_activities_response(
                fetch_replies=True,
                fetch_likes=True,
                fetch_shares=True,
                fetch_mentions=True,
                count=50,
                etag=source.last_activities_etag,
                min_id=source.last_activity_id,
                cache=cache)
            etag = resp.get('etag')  # used later
            user_activities = resp.get('items', [])

            # these map ids to AS objects
            responses = {a['id']: a for a in links}
            activities = {a['id']: a for a in links + user_activities}

        except Exception, e:
            code, body = util.interpret_http_exception(e)
            if code == '401':
                msg = 'Unauthorized error: %s' % e
                logging.warning(msg, exc_info=True)
                source.updates['poll_status'] = 'ok'
                raise models.DisableSource(msg)
            elif code in util.HTTP_RATE_LIMIT_CODES:
                logging.warning(
                    'Rate limited. Marking as error and finishing. %s', e)
                source.updates.update({
                    'poll_status': 'error',
                    'rate_limited': True
                })
                return
            elif (code
                  and int(code) / 100 == 5) or util.is_connection_failure(e):
                logging.error(
                    'API call failed. Marking as error and finishing. %s: %s\n%s',
                    code, body, e)
                self.abort(ERROR_HTTP_RETURN_CODE)
            else:
                raise

        # extract silo activity ids, update last_activity_id
        silo_activity_ids = set()
        last_activity_id = source.last_activity_id
        for id, activity in activities.items():
            # maybe replace stored last activity id
            parsed = util.parse_tag_uri(id)
            if parsed:
                id = parsed[1]
            silo_activity_ids.add(id)
            try:
                # try numeric comparison first
                greater = int(id) > int(last_activity_id)
            except (TypeError, ValueError):
                greater = id > last_activity_id
            if greater:
                last_activity_id = id

        if last_activity_id and last_activity_id != source.last_activity_id:
            source.updates['last_activity_id'] = last_activity_id

        # trim cache to just the returned activity ids, so that it doesn't grow
        # without bound. (WARNING: depends on get_activities_response()'s cache key
        # format, e.g. 'PREFIX ACTIVITY_ID'!)
        source.updates['last_activities_cache_json'] = json.dumps({
            k: v
            for k, v in cache.items() if k.split()[-1] in silo_activity_ids
        })

        # Cache to make sure we only fetch the author's h-feed(s) the
        # first time we see it
        fetched_hfeeds = set()

        # narrow down to just public activities
        public = {}
        private = {}
        for id, activity in activities.items():
            (public if source.is_activity_public(activity) else
             private)[id] = activity
        logging.info('Found %d public activities: %s', len(public),
                     public.keys())
        logging.info('Found %d private activities: %s', len(private),
                     private.keys())

        last_public_post = (source.last_public_post or util.EPOCH).isoformat()
        public_published = util.trim_nulls(
            [a.get('published') for a in public.values()])
        if public_published:
            max_published = max(public_published)
            if max_published > last_public_post:
                last_public_post = max_published
                source.updates['last_public_post'] = \
                  util.as_utc(util.parse_iso8601(max_published))

        source.updates['recent_private_posts'] = \
          len([a for a in private.values()
               if a.get('published', util.EPOCH_ISO) > last_public_post])

        #
        # Step 2: extract responses, store their activities in response['activities']
        #
        # WARNING: this creates circular references in link posts found by search
        # queries in step 1, since they are their own activity. We use
        # prune_activity() and prune_response() in step 4 to remove these before
        # serializing to JSON.
        #
        for id, activity in public.items():
            obj = activity.get('object') or activity

            # handle user mentions
            user_id = source.user_tag_id()
            if obj.get('author', {}).get('id') != user_id:
                for tag in obj.get('tags', []):
                    urls = tag.get('urls')
                    if tag.get('objectType') == 'person' and tag.get(
                            'id') == user_id and urls:
                        activity['originals'], activity['mentions'] = \
                          original_post_discovery.discover(
                            source, activity, fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds)
                        activity['mentions'].update(
                            u.get('value') for u in urls)
                        responses[id] = activity
                        break

            # handle quote mentions
            for att in obj.get('attachments', []):
                if (att.get('objectType') in ('note', 'article') and att.get(
                        'author', {}).get('id') == source.user_tag_id()):
                    # now that we've confirmed that one exists, OPD will dig
                    # into the actual attachments
                    if 'originals' not in activity or 'mentions' not in activity:
                        activity['originals'], activity['mentions'] = \
                          original_post_discovery.discover(
                            source, activity, fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds)
                    responses[id] = activity
                    break

            # extract replies, likes, reactions, reposts, and rsvps
            replies = obj.get('replies', {}).get('items', [])
            tags = obj.get('tags', [])
            likes = [t for t in tags if Response.get_type(t) == 'like']
            reactions = [t for t in tags if Response.get_type(t) == 'react']
            reposts = [t for t in tags if Response.get_type(t) == 'repost']
            rsvps = Source.get_rsvps_from_event(obj)

            # coalesce responses. drop any without ids
            for resp in replies + likes + reactions + reposts + rsvps:
                id = resp.get('id')
                if not id:
                    logging.error('Skipping response without id: %s',
                                  json.dumps(resp, indent=2))
                    continue

                resp.setdefault('activities', []).append(activity)

                # when we find two responses with the same id, the earlier one may have
                # come from a link post or user mention, and this one is probably better
                # since it probably came from the user's activity, so prefer this one.
                # background: https://github.com/snarfed/bridgy/issues/533
                existing = responses.get(id)
                if existing:
                    if source.gr_source.activity_changed(resp,
                                                         existing,
                                                         log=True):
                        logging.warning(
                            'Got two different versions of same response!\n%s\n%s',
                            existing, resp)
                    resp['activities'].extend(existing.get('activities', []))

                responses[id] = resp

        #
        # Step 3: filter out responses we've already seen
        #
        # seen responses (JSON objects) for each source are stored in its entity.
        unchanged_responses = []
        if source.seen_responses_cache_json:
            for seen in json.loads(source.seen_responses_cache_json):
                id = seen['id']
                resp = responses.get(id)
                if resp and not source.gr_source.activity_changed(
                        seen, resp, log=True):
                    unchanged_responses.append(seen)
                    del responses[id]

        #
        # Step 4: store new responses and enqueue propagate tasks
        #
        pruned_responses = []
        for id, resp in responses.items():
            resp_type = Response.get_type(resp)
            activities = resp.pop('activities', [])
            if not activities and resp_type == 'post':
                activities = [resp]
            too_long = set()
            urls_to_activity = {}
            for i, activity in enumerate(activities):
                # we'll usually have multiple responses for the same activity, and the
                # objects in resp['activities'] are shared, so cache each activity's
                # discovered webmention targets inside its object.
                if 'originals' not in activity or 'mentions' not in activity:
                    activity['originals'], activity['mentions'] = \
                      original_post_discovery.discover(
                        source, activity, fetch_hfeed=True,
                        include_redirect_sources=False,
                        already_fetched_hfeeds=fetched_hfeeds)

                targets = original_post_discovery.targets_for_response(
                    resp,
                    originals=activity['originals'],
                    mentions=activity['mentions'])
                if targets:
                    logging.info('%s has %d webmention target(s): %s',
                                 activity.get('url'), len(targets),
                                 ' '.join(targets))
                for t in targets:
                    if len(t) <= _MAX_STRING_LENGTH:
                        urls_to_activity[t] = i
                    else:
                        logging.warning(
                            'Giving up on target URL over %s chars! %s',
                            _MAX_STRING_LENGTH, t)
                        too_long.add(t[:_MAX_STRING_LENGTH - 4] + '...')

            # store/update response entity. the prune_*() calls are important to
            # remove circular references in link responses, which are their own
            # activities. details in the step 2 comment above.
            pruned_response = util.prune_response(resp)
            pruned_responses.append(pruned_response)
            resp_entity = Response(id=id,
                                   source=source.key,
                                   activities_json=[
                                       json.dumps(
                                           util.prune_activity(a, source))
                                       for a in activities
                                   ],
                                   response_json=json.dumps(pruned_response),
                                   type=resp_type,
                                   unsent=list(urls_to_activity.keys()),
                                   failed=list(too_long),
                                   original_posts=resp.get('originals', []))
            if urls_to_activity and len(activities) > 1:
                resp_entity.urls_to_activity = json.dumps(urls_to_activity)
            resp_entity.get_or_save(source)

        # update cache
        if pruned_responses:
            source.updates['seen_responses_cache_json'] = json.dumps(
                pruned_responses + unchanged_responses)

        source.updates.update({
            'last_polled': source.last_poll_attempt,
            'poll_status': 'ok'
        })
        if etag and etag != source.last_activities_etag:
            source.updates['last_activities_etag'] = etag

        #
        # Step 5. possibly refetch updated syndication urls
        #
        # if the author has added syndication urls since the first time
        # original_post_discovery ran, we'll miss them. this cleanup task will
        # periodically check for updated urls. only kicks in if the author has
        # *ever* published a rel=syndication url
        if source.should_refetch():
            logging.info('refetching h-feed for source %s', source.label())
            relationships = original_post_discovery.refetch(source)

            now = util.now_fn()
            source.updates['last_hfeed_refetch'] = now

            if relationships:
                logging.info(
                    'refetch h-feed found new rel=syndication relationships: %s',
                    relationships)
                try:
                    self.repropagate_old_responses(source, relationships)
                except BaseException, e:
                    if (isinstance(e, (datastore_errors.BadRequestError,
                                       datastore_errors.Timeout))
                            or util.is_connection_failure(e)):
                        logging.info('Timeout while repropagating responses.',
                                     exc_info=True)
                    else:
                        raise
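
# source.should_refetch() isn't shown in this listing. A hedged sketch of the
# condition the step 5 comment describes, assuming the source records the last
# rel=syndication url it has seen and the time of its last h-feed refetch;
# REFETCH_PERIOD and last_syndication_url are assumptions, not Bridgy's API.
import datetime

REFETCH_PERIOD = datetime.timedelta(hours=2)

def should_refetch(source, now):
  # only kicks in if the author has *ever* published a rel=syndication url,
  # and the h-feed hasn't been refetched too recently
  return bool(source.last_syndication_url) and \
      now - source.last_hfeed_refetch >= REFETCH_PERIOD
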
Example #19
  def poll(self, source):
    """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
    if source.last_activities_etag or source.last_activity_id:
      logging.debug('Using ETag %s, last activity id %s',
                    source.last_activities_etag, source.last_activity_id)

    #
    # Step 1: fetch activities:
    # * posts by the user
    # * search all posts for the user's domain URLs to find links
    #
    cache = util.CacheDict()
    if source.last_activities_cache_json:
      cache.update(json.loads(source.last_activities_cache_json))

    # search for links first so that the user's activities and responses
    # override them if they overlap
    links = source.search_for_links()

    # this user's own activities (and user mentions)
    resp = source.get_activities_response(
      fetch_replies=True, fetch_likes=True, fetch_shares=True,
      fetch_mentions=True, count=50, etag=source.last_activities_etag,
      min_id=source.last_activity_id, cache=cache)
    etag = resp.get('etag')  # used later
    user_activities = resp.get('items', [])

    # these map ids to AS objects
    responses = {a['id']: a for a in links}
    activities = {a['id']: a for a in links + user_activities}

    # extract silo activity ids, update last_activity_id
    silo_activity_ids = set()
    last_activity_id = source.last_activity_id
    for id, activity in activities.items():
      # maybe replace stored last activity id
      parsed = util.parse_tag_uri(id)
      if parsed:
        id = parsed[1]
      silo_activity_ids.add(id)
      try:
        # try numeric comparison first
        greater = int(id) > int(last_activity_id)
      except (TypeError, ValueError):
        greater = id > last_activity_id
      if greater:
        last_activity_id = id

    if last_activity_id and last_activity_id != source.last_activity_id:
      source.updates['last_activity_id'] = last_activity_id

    # trim cache to just the returned activity ids, so that it doesn't grow
    # without bound. (WARNING: depends on get_activities_response()'s cache key
    # format, e.g. 'PREFIX ACTIVITY_ID'!)
    source.updates['last_activities_cache_json'] = json.dumps(
      {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids})

    self.backfeed(source, responses, activities=activities)

    source.updates.update({'last_polled': source.last_poll_attempt,
                           'poll_status': 'ok'})
    if etag and etag != source.last_activities_etag:
      source.updates['last_activities_etag'] = etag

    #
    # Possibly refetch updated syndication urls.
    #
    # if the author has added syndication urls since the first time
    # original_post_discovery ran, we'll miss them. this cleanup task will
    # periodically check for updated urls. only kicks in if the author has
    # *ever* published a rel=syndication url
    if source.should_refetch():
      logging.info('refetching h-feed for source %s', source.label())
      relationships = original_post_discovery.refetch(source)

      now = util.now_fn()
      source.updates['last_hfeed_refetch'] = now

      if relationships:
        logging.info('refetch h-feed found new rel=syndication relationships: %s',
                     relationships)
        try:
          self.repropagate_old_responses(source, relationships)
        except BaseException, e:
          if (isinstance(e, (datastore_errors.BadRequestError,
                             datastore_errors.Timeout)) or
              util.is_connection_failure(e)):
            logging.info('Timeout while repropagating responses.', exc_info=True)
          else:
            raise
    def test_refetch_with_updated_permalink(self):
        """Permalinks can change (e.g., if a stub is added or modified).

        This causes a problem if refetch assumes that syndication-url is
        unique under a given source.
        """
        self.activities[0]["object"].update(
            {"content": "post content without backlinks", "url": "https://fa.ke/post/url"}
        )

        # first attempt, no stub yet
        self.expect_requests_get(
            "http://author",
            """
    <html class="h-feed">
    <a class="h-entry" href="/2014/08/09"></a>
    </html>""",
        )
        self.expect_requests_get(
            "http://author/2014/08/09",
            """
    <html class="h-entry">
    <a class="u-url" href="/2014/08/09"></a>
    <a class="u-syndication" href="https://fa.ke/post/url"></a>
    </html>""",
        )

        # refetch, permalink has a stub now
        self.expect_requests_get(
            "http://author",
            """
    <html class="h-feed">
    <a class="h-entry" href="/2014/08/09/this-is-a-stub"></a>
    </html>""",
        )

        self.expect_requests_get(
            "http://author/2014/08/09/this-is-a-stub",
            """
    <html class="h-entry">
    <a class="u-url" href="/2014/08/09/this-is-a-stub"></a>
    <a class="u-syndication" href="https://fa.ke/post/url"></a>
    </html>""",
        )

        # refetch again
        self.expect_requests_get(
            "http://author",
            """
    <html class="h-feed">
    <a class="h-entry" href="/2014/08/09/this-is-a-stub"></a>
    </html>""",
        )

        # permalink hasn't changed
        self.expect_requests_get(
            "http://author/2014/08/09/this-is-a-stub",
            """
    <html class="h-entry">
    <a class="u-url" href="/2014/08/09/this-is-a-stub"></a>
    <a class="u-syndication" href="https://fa.ke/post/url"></a>
    </html>""",
        )

        self.mox.ReplayAll()
        # modified activity should have /2014/08/09 as an upstreamDuplicate now
        self.assert_discover(["http://author/2014/08/09"])

        # refetch should find the updated original url -> syndication url.
        # it should *not* find the previously discovered relationship.
        first_results = refetch(self.source)
        self.assertEquals(1, len(first_results))
        new_relations = first_results.get("https://fa.ke/post/url")
        self.assertEquals(1, len(new_relations))
        self.assertEquals("https://fa.ke/post/url", new_relations[0].syndication)
        self.assertEquals("http://author/2014/08/09/this-is-a-stub", new_relations[0].original)

        # second refetch should find nothing because nothing has changed
        # since the previous refetch.
        self.assertFalse(refetch(self.source))
Example #21
    def poll(self, source):
        """Actually runs the poll.

        Stores property names and values to update in source.updates.
        """
        if source.last_activities_etag or source.last_activity_id:
            logging.debug("Using ETag %s, last activity id %s", source.last_activities_etag, source.last_activity_id)

        #
        # Step 1: fetch activities:
        # * posts by the user
        # * search all posts for the user's domain URLs to find links
        #
        cache = util.CacheDict()
        if source.last_activities_cache_json:
            cache.update(json.loads(source.last_activities_cache_json))

        # search for links first so that the user's activities and responses
        # override them if they overlap
        links = source.search_for_links()

        # this user's own activities (and user mentions)
        resp = source.get_activities_response(
            fetch_replies=True,
            fetch_likes=True,
            fetch_shares=True,
            fetch_mentions=True,
            count=50,
            etag=source.last_activities_etag,
            min_id=source.last_activity_id,
            cache=cache,
        )
        etag = resp.get("etag")  # used later
        user_activities = resp.get("items", [])

        # these map ids to AS objects
        responses = {a["id"]: a for a in links}
        activities = {a["id"]: a for a in links + user_activities}

        # extract silo activity ids, update last_activity_id
        silo_activity_ids = set()
        last_activity_id = source.last_activity_id
        for id, activity in activities.items():
            # maybe replace stored last activity id
            parsed = util.parse_tag_uri(id)
            if parsed:
                id = parsed[1]
            silo_activity_ids.add(id)
            try:
                # try numeric comparison first
                greater = int(id) > int(last_activity_id)
            except (TypeError, ValueError):
                greater = id > last_activity_id
            if greater:
                last_activity_id = id

        if last_activity_id and last_activity_id != source.last_activity_id:
            source.updates["last_activity_id"] = last_activity_id

        # trim cache to just the returned activity ids, so that it doesn't grow
        # without bound. (WARNING: depends on get_activities_response()'s cache key
        # format, e.g. 'PREFIX ACTIVITY_ID'!)
        source.updates["last_activities_cache_json"] = json.dumps(
            {k: v for k, v in cache.items() if k.split()[-1] in silo_activity_ids}
        )

        # Cache to make sure we only fetch the author's h-feed(s) the
        # first time we see it
        fetched_hfeeds = set()

        # narrow down to just public activities
        public = {}
        private = {}
        for id, activity in activities.items():
            (public if source.is_activity_public(activity) else private)[id] = activity
        logging.info("Found %d public activities: %s", len(public), public.keys())
        logging.info("Found %d private activities: %s", len(private), private.keys())

        last_public_post = (source.last_public_post or util.EPOCH).isoformat()
        public_published = util.trim_nulls([a.get("published") for a in public.values()])
        if public_published:
            max_published = max(public_published)
            if max_published > last_public_post:
                last_public_post = max_published
                source.updates["last_public_post"] = util.as_utc(util.parse_iso8601(max_published))

        source.updates["recent_private_posts"] = len(
            [a for a in private.values() if a.get("published", util.EPOCH_ISO) > last_public_post]
        )

        #
        # Step 2: extract responses, store their activities in response['activities']
        #
        # WARNING: this creates circular references in link posts found by search
        # queries in step 1, since they are their own activity. We use
        # prune_activity() and prune_response() in step 4 to remove these before
        # serializing to JSON.
        #
        for id, activity in public.items():
            obj = activity.get("object") or activity

            # handle user mentions
            user_id = source.user_tag_id()
            if obj.get("author", {}).get("id") != user_id:
                for tag in obj.get("tags", []):
                    urls = tag.get("urls")
                    if tag.get("objectType") == "person" and tag.get("id") == user_id and urls:
                        activity["originals"], activity["mentions"] = original_post_discovery.discover(
                            source,
                            activity,
                            fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds,
                        )
                        activity["mentions"].update(u.get("value") for u in urls)
                        responses[id] = activity
                        break

            # handle quote mentions
            for att in obj.get("attachments", []):
                if (
                    att.get("objectType") in ("note", "article")
                    and att.get("author", {}).get("id") == source.user_tag_id()
                ):
                    # now that we've confirmed that one exists, OPD will dig
                    # into the actual attachments
                    if "originals" not in activity or "mentions" not in activity:
                        activity["originals"], activity["mentions"] = original_post_discovery.discover(
                            source,
                            activity,
                            fetch_hfeed=True,
                            include_redirect_sources=False,
                            already_fetched_hfeeds=fetched_hfeeds,
                        )
                    responses[id] = activity
                    break

            # extract replies, likes, reactions, reposts, and rsvps
            replies = obj.get("replies", {}).get("items", [])
            tags = obj.get("tags", [])
            likes = [t for t in tags if Response.get_type(t) == "like"]
            reactions = [t for t in tags if Response.get_type(t) == "react"]
            reposts = [t for t in tags if Response.get_type(t) == "repost"]
            rsvps = Source.get_rsvps_from_event(obj)

            # coalesce responses. drop any without ids
            for resp in replies + likes + reactions + reposts + rsvps:
                id = resp.get("id")
                if not id:
                    logging.error("Skipping response without id: %s", json.dumps(resp, indent=2))
                    continue

                resp.setdefault("activities", []).append(activity)

                # when two responses share the same id, the earlier one may have
                # come from a link post or user mention, while this one probably
                # came from the user's own activity, so prefer this one.
                # background: https://github.com/snarfed/bridgy/issues/533
                existing = responses.get(id)
                if existing:
                    if source.gr_source.activity_changed(resp, existing, log=True):
                        logging.warning("Got two different versions of same response!\n%s\n%s", existing, resp)
                    resp["activities"].extend(existing.get("activities", []))

                responses[id] = resp

        #
        # Step 3: filter out responses we've already seen
        #
        # seen responses (JSON objects) for each source are stored in its entity.
        unchanged_responses = []
        if source.seen_responses_cache_json:
            for seen in json.loads(source.seen_responses_cache_json):
                id = seen["id"]
                resp = responses.get(id)
                if resp and not source.gr_source.activity_changed(seen, resp, log=True):
                    unchanged_responses.append(seen)
                    del responses[id]

        #
        # Step 4: store new responses and enqueue propagate tasks
        #
        pruned_responses = []
        for id, resp in responses.items():
            resp_type = Response.get_type(resp)
            activities = resp.pop("activities", [])
            if not activities and resp_type == "post":
                activities = [resp]
            too_long = set()
            urls_to_activity = {}
            for i, activity in enumerate(activities):
                # we'll usually have multiple responses for the same activity, and the
                # objects in resp['activities'] are shared, so cache each activity's
                # discovered webmention targets inside its object.
                if "originals" not in activity or "mentions" not in activity:
                    activity["originals"], activity["mentions"] = original_post_discovery.discover(
                        source,
                        activity,
                        fetch_hfeed=True,
                        include_redirect_sources=False,
                        already_fetched_hfeeds=fetched_hfeeds,
                    )

                targets = original_post_discovery.targets_for_response(
                    resp, originals=activity["originals"], mentions=activity["mentions"]
                )
                if targets:
                    logging.info(
                        "%s has %d webmention target(s): %s", activity.get("url"), len(targets), " ".join(targets)
                    )
                for t in targets:
                    if len(t) <= _MAX_STRING_LENGTH:
                        urls_to_activity[t] = i
                    else:
                        logging.warning("Giving up on target URL over %s chars! %s", _MAX_STRING_LENGTH, t)
                        too_long.add(t[: _MAX_STRING_LENGTH - 4] + "...")

            # store/update response entity. the prune_*() calls are important to
            # remove circular references in link responses, which are their own
            # activities. details in the step 2 comment above.
            pruned_response = util.prune_response(resp)
            pruned_responses.append(pruned_response)
            resp_entity = Response(
                id=id,
                source=source.key,
                activities_json=[json.dumps(util.prune_activity(a, source)) for a in activities],
                response_json=json.dumps(pruned_response),
                type=resp_type,
                unsent=list(urls_to_activity.keys()),
                failed=list(too_long),
                original_posts=resp.get("originals", []),
            )
            if urls_to_activity and len(activities) > 1:
                resp_entity.urls_to_activity = json.dumps(urls_to_activity)
            resp_entity.get_or_save(source)

        # update cache
        if pruned_responses:
            source.updates["seen_responses_cache_json"] = json.dumps(pruned_responses + unchanged_responses)

        source.updates.update({"last_polled": source.last_poll_attempt, "poll_status": "ok"})
        if etag and etag != source.last_activities_etag:
            source.updates["last_activities_etag"] = etag

        #
        # Step 5: possibly refetch updated syndication urls
        #
        # if the author has added syndication urls since the first time
        # original_post_discovery ran, we'll miss them. this cleanup task will
        # periodically check for updated urls. only kicks in if the author has
        # *ever* published a rel=syndication url
        if source.should_refetch():
            logging.info("refetching h-feed for source %s", source.label())
            relationships = original_post_discovery.refetch(source)

            now = util.now_fn()
            source.updates["last_hfeed_refetch"] = now

            if relationships:
                logging.info("refetch h-feed found new rel=syndication relationships: %s", relationships)
                try:
                    self.repropagate_old_responses(source, relationships)
                except BaseException as e:
                    if isinstance(
                        e, (datastore_errors.BadRequestError, datastore_errors.Timeout)
                    ) or util.is_connection_failure(e):
                        logging.info("Timeout while repropagating responses.", exc_info=True)
                    else:
                        raise
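
# A minimal standalone sketch (not part of the Bridgy example above) of the
# coalescing in step 2 and the "already seen" filter in step 3: responses are
# keyed by id, duplicates merge their activity lists with the later copy
# winning, and anything matching the seen-responses cache unchanged is
# dropped. activity_changed() is a stand-in for the source-specific
# source.gr_source.activity_changed() check.
import json


def activity_changed(old, new):
    # stub: treat any difference in content as a meaningful change
    return old.get('content') != new.get('content')


def coalesce(responses, seen_cache_json=None):
    """Coalesce responses by id and drop ones unchanged since the last poll."""
    by_id = {}
    for resp in responses:
        id = resp.get('id')
        if not id:
            continue  # drop responses without ids
        existing = by_id.get(id)
        if existing:
            # merge activity lists and prefer the later copy of the response
            resp.setdefault('activities', []).extend(existing.get('activities', []))
        by_id[id] = resp

    # filter out responses already in the seen cache that haven't changed
    for seen in json.loads(seen_cache_json or '[]'):
        resp = by_id.get(seen['id'])
        if resp and not activity_changed(seen, resp):
            del by_id[seen['id']]

    return by_id


if __name__ == '__main__':
    print(coalesce(
        [{'id': 'tag:fa.ke,2013:1', 'content': 'hi', 'activities': [{'id': 'a1'}]},
         {'id': 'tag:fa.ke,2013:1', 'content': 'hi', 'activities': [{'id': 'a2'}]},
         {'content': 'dropped: no id'}],
        seen_cache_json='[]'))
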
Example #22
    def poll(self, source):
        """Actually runs the poll.

    Stores property names and values to update in source.updates.
    """
        if source.last_activities_etag or source.last_activity_id:
            logging.debug('Using ETag %s, last activity id %s',
                          source.last_activities_etag, source.last_activity_id)

        #
        # Step 1: fetch activities:
        # * posts by the user
        # * search all posts for the user's domain URLs to find links
        #
        cache = util.CacheDict()
        if source.last_activities_cache_json:
            cache.update(json_loads(source.last_activities_cache_json))

        # search for links first so that the user's activities and responses
        # override them if they overlap
        links = source.search_for_links()

        # this user's own activities (and user mentions)
        resp = source.get_activities_response(fetch_replies=True,
                                              fetch_likes=True,
                                              fetch_shares=True,
                                              fetch_mentions=True,
                                              count=50,
                                              etag=source.last_activities_etag,
                                              min_id=source.last_activity_id,
                                              cache=cache)
        etag = resp.get('etag')  # used later
        user_activities = resp.get('items', [])

        # these map ids to AS objects
        responses = {a['id']: a for a in links}
        activities = {a['id']: a for a in links + user_activities}

        # extract silo activity ids, update last_activity_id
        silo_activity_ids = set()
        last_activity_id = source.last_activity_id
        for id, activity in activities.items():
            # maybe replace stored last activity id
            parsed = util.parse_tag_uri(id)
            if parsed:
                id = parsed[1]
            silo_activity_ids.add(id)
            try:
                # try numeric comparison first
                greater = int(id) > int(last_activity_id)
            except (TypeError, ValueError):
                greater = str(id) > str(last_activity_id)
            if greater:
                last_activity_id = id

        if last_activity_id and last_activity_id != source.last_activity_id:
            source.updates['last_activity_id'] = last_activity_id

        # trim cache to just the returned activity ids, so that it doesn't grow
        # without bound. (WARNING: depends on get_activities_response()'s cache key
        # format, e.g. 'PREFIX ACTIVITY_ID'!)
        source.updates['last_activities_cache_json'] = json_dumps({
            k: v
            for k, v in cache.items() if k.split()[-1] in silo_activity_ids
        })

        self.backfeed(source, responses, activities=activities)

        source.updates.update({
            'last_polled': source.last_poll_attempt,
            'poll_status': 'ok'
        })
        if etag and etag != source.last_activities_etag:
            source.updates['last_activities_etag'] = etag

        #
        # Possibly refetch updated syndication urls.
        #
        # if the author has added syndication urls since the first time
        # original_post_discovery ran, we'll miss them. this cleanup task will
        # periodically check for updated urls. only kicks in if the author has
        # *ever* published a rel=syndication url
        if source.should_refetch():
            logging.info('refetching h-feed for source %s', source.label())
            relationships = original_post_discovery.refetch(source)

            now = util.now_fn()
            source.updates['last_hfeed_refetch'] = now

            if relationships:
                logging.info(
                    'refetch h-feed found new rel=syndication relationships: %s',
                    relationships)
                try:
                    self.repropagate_old_responses(source, relationships)
                except BaseException as e:
                    if ('BadRequestError' in str(e.__class__)
                            or 'Timeout' in str(e.__class__)
                            or util.is_connection_failure(e)):
                        logging.info('Timeout while repropagating responses.',
                                     stack_info=True)
                    else:
                        raise
        else:
            logging.info(
                'skipping refetch h-feed. last-syndication-url %s, last-refetch %s',
                source.last_syndication_url, source.last_hfeed_refetch)
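
# Standalone sketch (not part of the example above) of two details from
# step 1: picking the newest activity id via a numeric comparison that falls
# back to string comparison, and trimming the activities cache to keys whose
# last space-separated token is a returned activity id (the
# 'PREFIX ACTIVITY_ID' key format the comment above depends on).
# parse_tag_uri() is a simplified stand-in for util.parse_tag_uri().
def parse_tag_uri(id):
    # simplified stub: 'tag:fa.ke,2013:123' -> ('fa.ke', '123')
    if id.startswith('tag:'):
        domain, _, name = id[len('tag:'):].partition(':')
        return (domain.split(',')[0], name)
    return None


def newest_activity_id(ids, last_activity_id):
    for id in ids:
        parsed = parse_tag_uri(id)
        if parsed:
            id = parsed[1]
        try:
            # try numeric comparison first, fall back to lexicographic
            greater = int(id) > int(last_activity_id)
        except (TypeError, ValueError):
            greater = str(id) > str(last_activity_id)
        if greater:
            last_activity_id = id
    return last_activity_id


def trim_cache(cache, silo_activity_ids):
    # keep only entries whose 'PREFIX ACTIVITY_ID' key ends in a returned id
    return {k: v for k, v in cache.items()
            if k.split()[-1] in silo_activity_ids}


if __name__ == '__main__':
    ids = ['tag:fa.ke,2013:9', 'tag:fa.ke,2013:12']
    print(newest_activity_id(ids, '3'))                       # '12'
    print(trim_cache({'A 12': {}, 'A 7': {}}, {'12', '9'}))   # {'A 12': {}}
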
Example #23
    def receive(self, email):
        addr = self.request.path.split('/')[-1]
        message_id = email.original.get('message-id').strip('<>')
        sender = getattr(email, 'sender', None)
        to = getattr(email, 'to', None)
        cc = getattr(email, 'cc', None)
        subject = getattr(email, 'subject', None)
        logging.info('Received %s from %s to %s (%s) cc %s: %s', message_id,
                     sender, to, addr, cc, subject)

        user = addr.split('@')[0]
        source = FacebookEmailAccount.query(
            FacebookEmailAccount.email_user == user).get()
        logging.info('Source for %s is %s', user, source)

        util.email_me(subject='New email from %s: %s' % (sender, subject),
                      body='Source: %s' %
                      (source.bridgy_url(self) if source else None))

        htmls = list(body.decode() for _, body in email.bodies('text/html'))
        fbe = FacebookEmail.get_or_insert(
            message_id, source=source.key if source else None, htmls=htmls)
        logging.info('FacebookEmail created %s: %s', fbe.created,
                     fbe.key.urlsafe())

        if not source:
            self.response.status_code = 404
            self.response.write(
                'No Facebook email user found with address %s' % addr)
            return

        # take the first HTML body that converts to an AS1 object; the
        # for/else falls through to a 400 below if none of them parse
        for html in htmls:
            obj = gr_facebook.Facebook.email_to_object(html)
            if obj:
                break
        else:
            self.response.status_code = 400
            self.response.write('No HTML body could be parsed')
            return
        logging.info('Converted to AS1: %s', json.dumps(obj, indent=2))

        base_obj = source.gr_source.base_object(obj)
        # note that this ignores the id query param (the post's user id) and uses
        # the source object's user id instead.
        base_obj['url'] = source.canonicalize_url(base_obj['url'])
        # also note that base_obj['id'] is not a tag URI, it's the raw Facebook post
        # id, eg '104790764108207'. we don't use it from activities_json much,
        # though, just in PropagateResponse.source_url(), which handles this fine.

        original_post_discovery.refetch(source)
        targets, mentions = original_post_discovery.discover(source,
                                                             base_obj,
                                                             fetch_hfeed=False)
        logging.info('Got targets %s mentions %s', targets, mentions)

        resp = Response(id=obj['id'],
                        source=source.key,
                        type=Response.get_type(obj),
                        response_json=json.dumps(obj),
                        activities_json=[json.dumps(base_obj)],
                        unsent=targets)
        resp.get_or_save(source, restart=True)

        fbe.response = resp.key
        fbe.put()
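
# Standalone sketch (not part of the example above) of the for/else pattern
# used to pick the first HTML body that converts to an object and to bail out
# when none of them do. parse_html() is a hypothetical stand-in for
# gr_facebook.Facebook.email_to_object().
def parse_html(html):
    # hypothetical parser: succeeds only when the body contains an h-entry
    return {'content': html} if 'h-entry' in html else None


def first_parsed(htmls):
    for html in htmls:
        obj = parse_html(html)
        if obj:
            break
    else:
        # runs only if the loop never hit break, i.e. nothing parsed
        raise ValueError('No HTML body could be parsed')
    return obj


if __name__ == '__main__':
    print(first_parsed(['<p>junk</p>', '<div class="h-entry">hi</div>']))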