Example #1
  def test_syndication_url_in_hfeed(self):
    """Like test_single_post, but because the syndication URL is given in
    the h-feed we skip fetching the permalink. New behavior as of
    2014-11-08
    """
    self.activity['object']['upstreamDuplicates'] = ['existing uD']

    # silo domain is fa.ke
    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author/post/permalink"></a>
        <a class="u-syndication" href="http://fa.ke/post/url">
      </div>
    </html>""")

    self.mox.ReplayAll()
    logging.debug('Original post discovery %s -> %s', self.source, self.activity)
    original_post_discovery.discover(self.source, self.activity)

    # upstreamDuplicates = 1 original + 1 discovered
    self.assertEquals(['existing uD', 'http://author/post/permalink'],
                      self.activity['object']['upstreamDuplicates'])

    origurls = [r.original for r in SyndicatedPost.query(ancestor=self.source.key)]
    self.assertEquals([u'http://author/post/permalink'], origurls)

    # for now only syndicated posts belonging to this source are stored
    syndurls = list(r.syndication for r
                    in SyndicatedPost.query(ancestor=self.source.key))

    self.assertEquals([u'https://fa.ke/post/url'], syndurls)
Example #2
  def test_multiple_refetches(self):
    """Ensure that multiple refetches of the same post (with and without
    u-syndication) does not generate duplicate blank entries in the
    database. See https://github.com/snarfed/bridgy/issues/259 for details
    """
    self.activities[0]['object'].update({
      'content': 'post content without backlinks',
      'url': 'https://fa.ke/post/url',
    })

    hfeed = """<html class="h-feed">
    <a class="h-entry" href="/permalink"></a>
    </html>"""

    unsyndicated = """<html class="h-entry">
    <a class="u-url" href="/permalink"></a>
    </html>"""

    syndicated = """<html class="h-entry">
    <a class="u-url" href="/permalink"></a>
    <a class="u-syndication" href="https://fa.ke/post/url"></a>
    </html>"""

    # first attempt, no syndication url yet
    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', unsyndicated)

    # refetch, still no syndication url
    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', unsyndicated)

    # second refetch, has a syndication url this time
    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', syndicated)

    self.mox.ReplayAll()
    original_post_discovery.discover(self.source, self.activities[0])
    original_post_discovery.refetch(self.source)

    relations = list(
      SyndicatedPost.query(
        SyndicatedPost.original == 'http://author/permalink',
        ancestor=self.source.key).fetch())

    self.assertEquals(1, len(relations))
    self.assertEquals('http://author/permalink', relations[0].original)
    self.assertIsNone(relations[0].syndication)

    original_post_discovery.refetch(self.source)

    relations = list(
      SyndicatedPost.query(
        SyndicatedPost.original == 'http://author/permalink',
        ancestor=self.source.key).fetch())

    self.assertEquals(1, len(relations))
    self.assertEquals('http://author/permalink', relations[0].original)
    self.assertEquals('https://fa.ke/post/url', relations[0].syndication)
Example #3
  def test_multiple_rel_feeds(self):
    """Make sure that we follow all rel=feed links, e.g. if notes and
    articles are in separate feeds."""

    self.expect_requests_get('http://author', """
    <html>
      <head>
        <link rel="feed" href="/articles" type="text/html">
        <link rel="feed" href="/notes" type="text/html">
      </head>
    </html>""")

    # fetches all feeds first
    self.expect_requests_get('http://author/articles', """
    <html class="h-feed">
      <article class="h-entry">
        <a class="u-url" href="/article-permalink"></a>
      </article>
    </html>""").InAnyOrder('feed')

    self.expect_requests_get('http://author/notes', """
    <html class="h-feed">
      <article class="h-entry">
        <a class="u-url" href="/note-permalink"></a>
      </article>
    </html>""").InAnyOrder('feed')

    # then the permalinks (in any order since they are hashed to
    # remove duplicates)
    self.expect_requests_get('http://author/article-permalink', """
    <html class="h-entry">
      <a class="u-url" href="/article-permalink"></a>
      <a class="u-syndication" href="https://fa.ke/article"></a>
    </html>""").InAnyOrder('permalink')

    self.expect_requests_get('http://author/note-permalink', """
    <html class="h-entry">
      <a class="u-url" href="/note-permalink"></a>
      <a class="u-syndication" href="https://fa.ke/note"></a>
    </html>""").InAnyOrder('permalink')

    self.mox.ReplayAll()
    original_post_discovery.discover(self.source, self.activity)

    note_rels = SyndicatedPost.query(
      SyndicatedPost.original == 'http://author/note-permalink',
      ancestor=self.source.key).fetch()

    self.assertEqual(1, len(note_rels))
    self.assertEqual('https://fa.ke/note', note_rels[0].syndication)

    article_rels = SyndicatedPost.query(
      SyndicatedPost.original == 'http://author/article-permalink',
      ancestor=self.source.key).fetch()

    self.assertEqual(1, len(article_rels))
    self.assertEqual('https://fa.ke/article', article_rels[0].syndication)
Example #4
  def test_refetch_multiple_responses_same_activity(self):
    """Ensure that refetching a post that has several replies does not
    generate duplicate original -> None blank entries in the
    database. See https://github.com/snarfed/bridgy/issues/259 for
    details
    """
    source = self.sources[0]
    source.domain_urls = ['http://author']

    for activity in self.activities:
        activity['object']['content'] = 'post content without backlinks'
        activity['object']['url'] = 'https://fa.ke/post/url'

    author_feed = """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author/post/permalink"></a>
      </div>
    </html>"""

    author_entry = """
    <html class="h-entry">
      <a class="u-url" href="http://author/post/permalink"></a>
    </html>"""

    # original
    self.expect_requests_get('http://author', author_feed)
    self.expect_requests_get('http://author/post/permalink', author_entry)
    # refetch
    self.expect_requests_get('http://author', author_feed)
    self.expect_requests_get('http://author/post/permalink', author_entry)
    self.mox.ReplayAll()

    for activity in self.activities:
      original_post_discovery.discover(source, activity)

    original_post_discovery.refetch(source)

    rels_by_original = list(
      SyndicatedPost.query(SyndicatedPost.original == 'http://author/post/permalink',
                           ancestor=source.key).fetch())

    self.assertEquals(1, len(rels_by_original))
    self.assertIsNone(rels_by_original[0].syndication)

    rels_by_syndication = list(
      SyndicatedPost.query(SyndicatedPost.syndication == 'https://fa.ke/post/url',
                           ancestor=source.key).fetch())

    self.assertEquals(1, len(rels_by_syndication))
    self.assertIsNone(rels_by_syndication[0].original)
Example #5
    def test_single_post(self):
        """Test that original post discovery does the reverse lookup to scan
    author's h-feed for rel=syndication links
    """
        activity = self.activities[0]
        activity['object'].update({
            'content': 'post content without backlink',
            'url': 'http://fa.ke/post/url',
            'upstreamDuplicates': ['existing uD'],
        })

        # silo domain is fa.ke
        source = self.sources[0]
        source.domain_urls = ['http://author']

        self.expect_requests_get(
            'http://author', """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author/post/permalink"></a>
      </div>
    </html>""")

        # syndicated to two places
        self.expect_requests_get(
            'http://author/post/permalink', """
    <link rel="syndication" href="http://not.real/statuses/postid">
    <link rel="syndication" href="http://fa.ke/post/url">
    <div class="h-entry">
      <a class="u-url" href="http://author/post/permalink"></a>
    </div>""")

        self.mox.ReplayAll()
        logging.debug('Original post discovery %s -> %s', source, activity)
        original_post_discovery.discover(source, activity)

        # upstreamDuplicates = 1 original + 1 discovered
        self.assertEquals(['existing uD', 'http://author/post/permalink'],
                          activity['object']['upstreamDuplicates'])

        origurls = [
            r.original for r in SyndicatedPost.query(ancestor=source.key)
        ]
        self.assertEquals([u'http://author/post/permalink'], origurls)

        # for now only syndicated posts belonging to this source are stored
        syndurls = list(
            r.syndication for r in SyndicatedPost.query(ancestor=source.key))

        self.assertEquals([u'https://fa.ke/post/url'], syndurls)
Example #6
  def test_discover_url_site_post_syndication_links(self):
    self.expect_requests_get('http://si.te/123', """
<div class="h-entry">
  foo
  <a class="u-syndication" href="http://fa.ke/222"></a>
  <a class="u-syndication" href="http://other/silo"></a>
  <a class="u-syndication" href="http://fa.ke/post/444"></a>
</div>""")
    self.mox.ReplayAll()

    self.assertEqual(0, SyndicatedPost.query().count())
    self.check_discover('http://si.te/123',
        'Discovering now. Refresh in a minute to see the results!')

    self.assertItemsEqual([
      {'https://fa.ke/222': 'http://si.te/123'},
      {'https://fa.ke/post/444': 'http://si.te/123'},
      ], [{sp.syndication: sp.original} for sp in models.SyndicatedPost.query()])

    tasks = self.taskqueue_stub.GetTasks('discover')
    key = self.source.key.urlsafe()
    self.assertEqual([
      {'source_key': key, 'post_id': '222'},
      {'source_key': key, 'post_id': '444'},
    ], [testutil.get_task_params(task) for task in tasks])

    now = util.now_fn()
    source = self.source.key.get()
    self.assertEqual(now, source.last_syndication_url)
Example #7
  def _test_failed_post_permalink_fetch(self, raise_exception):
    """Make sure something reasonable happens when we're unable to fetch
    the permalink of an entry linked in the h-feed
    """
    source = self.sources[0]
    source.domain_urls = ['http://author']
    activity = self.activities[0]
    activity['object']['url'] = 'https://fa.ke/post/url'
    activity['object']['content'] = 'content without links'

    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <article class="h-entry">
        <a class="u-url" href="nonexistent.html"></a>
      </article>
    </html>
    """)

    if raise_exception:
      self.expect_requests_get('http://author/nonexistent.html').AndRaise(HTTPError())
    else:
      self.expect_requests_get('http://author/nonexistent.html', status_code=410)

    self.mox.ReplayAll()
    original_post_discovery.discover(source, activity)

    # we should have saved placeholders to prevent us from trying the
    # syndication url or permalink again
    self.assert_equals(
      set([('http://author/nonexistent.html', None), (None, 'https://fa.ke/post/url')]),
      set((relationship.original, relationship.syndication)
          for relationship in SyndicatedPost.query(ancestor=source.key)))
Example #8
    def test_discover_url_site_post_syndication_links(self):
        self.expect_requests_get(
            'http://si.te/123', """
<div class="h-entry">
  foo
  <a class="u-syndication" href="http://fa.ke/222"></a>
  <a class="u-syndication" href="http://other/silo"></a>
  <a class="u-syndication" href="http://fa.ke/post/444"></a>
</div>""")

        self.expect_task('discover', source_key=self.source, post_id='222')
        self.expect_task('discover', source_key=self.source, post_id='444')
        self.mox.ReplayAll()

        self.assertEqual(0, SyndicatedPost.query().count())
        self.check_discover(
            'http://si.te/123',
            'Discovering now. Refresh in a minute to see the results!')

        self.assertCountEqual([
            {
                'https://fa.ke/222': 'http://si.te/123'
            },
            {
                'https://fa.ke/post/444': 'http://si.te/123'
            },
        ], [{
            sp.syndication: sp.original
        } for sp in models.SyndicatedPost.query()])

        now = util.now_fn()
        source = self.source.key.get()
        self.assertEqual(now, source.last_syndication_url)
Example #9
  def test_no_h_entries(self):
    """Make sure nothing bad happens when fetching a feed without
    h-entries
    """
    activity = self.activities[0]
    activity['object']['content'] = 'post content without backlink'
    activity['object']['url'] = 'https://fa.ke/post/url'

    # silo domain is fa.ke
    source = self.sources[0]
    source.domain_urls = ['http://author']

    self.expect_requests_get('http://author', """
    <html class="h-feed">
    <p>under construction</p>
    </html>""")

    self.mox.ReplayAll()
    logging.debug('Original post discovery %s -> %s', source, activity)
    original_post_discovery.discover(source, activity)

    self.assert_equals(
      [(None, 'https://fa.ke/post/url')],
      [(relationship.original, relationship.syndication)
       for relationship in SyndicatedPost.query(ancestor=source.key)])
Example #10
  def test_discover_url_site_post_syndication_links(self):
    self.expect_requests_get('http://si.te/123', """
<div class="h-entry">
  foo
  <a class="u-syndication" href="http://fa.ke/222"></a>
  <a class="u-syndication" href="http://other/silo"></a>
  <a class="u-syndication" href="http://fa.ke/post/444"></a>
</div>""")
    self.mox.ReplayAll()

    self.assertEqual(0, SyndicatedPost.query().count())
    self.check_discover('http://si.te/123',
        'Discovering now. Refresh in a minute to see the results!')

    self.assertItemsEqual([
      {'https://fa.ke/222': 'http://si.te/123'},
      {'https://fa.ke/post/444': 'http://si.te/123'},
      ], [{sp.syndication: sp.original} for sp in models.SyndicatedPost.query()])

    tasks = self.taskqueue_stub.GetTasks('discover')
    key = self.source.key.urlsafe()
    self.assertEqual([
      {'source_key': key, 'post_id': '222'},
      {'source_key': key, 'post_id': '444'},
    ], [testutil.get_task_params(task) for task in tasks])

    now = util.now_fn()
    source = self.source.key.get()
    self.assertEqual(now, source.last_syndication_url)
Example #11
 def test_do_not_fetch_hfeed(self):
   """Confirms behavior of discover() when fetch_hfeed=False.
   Discovery should only check the database for previously discovered matches.
   It should not make any GET requests
   """
   discover(self.source, self.activity, fetch_hfeed=False)
   self.assertFalse(SyndicatedPost.query(ancestor=self.source.key).get())
Example #12
 def test_no_author_url(self):
   """Make sure something reasonable happens when the author doesn't have
   a url at all.
   """
   self.source.domain_urls = []
   discover(self.source, self.activity)
   # nothing attempted, and no SyndicatedPost saved
   self.assertFalse(SyndicatedPost.query(ancestor=self.source.key).get())
Example #13
  def test_merge_front_page_and_h_feed(self):
    """Make sure we are correctly merging the front page and rel-feed by
    checking that we visit h-entries that are only the front page or
    only the rel-feed page.
    """
    activity = self.activities[0]
    activity['object'].update({
        'content': 'post content without backlink',
        'url': 'https://fa.ke/post/url',
        'upstreamDuplicates': ['existing uD'],
    })

    # silo domain is fa.ke
    source = self.sources[0]
    source.domain_urls = ['http://author']

    self.expect_requests_get('http://author', """
    <link rel="feed" href="/feed">
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author/only-on-frontpage"></a>
      </div>
      <div class="h-entry">
        <a class="u-url" href="http://author/on-both"></a>
      </div>
    </html>""")

    self.expect_requests_get('http://author/feed', """
    <link rel="feed" href="/feed">
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author/on-both"></a>
      </div>
      <div class="h-entry">
        <a class="u-url" href="http://author/only-on-feed"></a>
      </div>
    </html>""")

    for orig in ('/only-on-frontpage', '/on-both', '/only-on-feed'):
      self.expect_requests_get('http://author%s' % orig,
                               """<div class="h-entry">
                                 <a class="u-url" href="%s"></a>
                               </div>""" % orig).InAnyOrder()

    self.mox.ReplayAll()
    logging.debug('Original post discovery %s -> %s', source, activity)
    original_post_discovery.discover(source, activity)

    # should be three blank SyndicatedPosts now
    for orig in ('http://author/only-on-frontpage',
                 'http://author/on-both',
                 'http://author/only-on-feed'):
      logging.debug('checking %s', orig)
      sp = SyndicatedPost.query(
        SyndicatedPost.original == orig,
        ancestor=source.key).get()
      self.assertTrue(sp)
      self.assertIsNone(sp.syndication)
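
The test above exercises _merge_hfeeds, which is called by the _process_author variants in Examples #36-#38 but not included in this listing. Based only on the behavior asserted here (front-page and rel-feed entries are combined, and an entry that appears in both is processed once), a minimal sketch of such a helper might look like this; the mf2 item structure ('properties'/'url') is assumed from standard microformats2 parse output, not taken from the real code.

def _merge_hfeeds(feed1, feed2):
  # Hypothetical sketch, not the real helper: combine two lists of mf2
  # h-entry items, skipping items from feed2 whose u-url we've already seen.
  seen = set()
  for item in feed1:
    seen.update(item.get('properties', {}).get('url', []))

  merged = list(feed1)
  for item in feed2:
    urls = item.get('properties', {}).get('url', [])
    if not urls or not seen.intersection(urls):
      merged.append(item)
      seen.update(urls)
  return merged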
Example #14
    def test_insert_replaces_blanks(self):
        """Make sure we replace original=None with original=something
    when it is discovered"""

        # add a blank for the original too
        SyndicatedPost.insert_original_blank(
            self.source, 'http://original/newly-discovered')

        self.assertTrue(
            SyndicatedPost.query(
                SyndicatedPost.syndication == 'http://silo/no-original',
                SyndicatedPost.original == None,
                ancestor=self.source.key).get())

        self.assertTrue(
            SyndicatedPost.query(
                SyndicatedPost.original == 'http://original/newly-discovered',
                SyndicatedPost.syndication == None,
                ancestor=self.source.key).get())

        r = SyndicatedPost.insert(self.source, 'http://silo/no-original',
                                  'http://original/newly-discovered')
        self.assertIsNotNone(r)
        self.assertEqual('http://original/newly-discovered', r.original)

        # make sure it's in NDB
        rs = SyndicatedPost.query(
            SyndicatedPost.syndication == 'http://silo/no-original',
            ancestor=self.source.key).fetch()
        self.assertEqual(1, len(rs))
        self.assertEqual('http://original/newly-discovered', rs[0].original)
        self.assertEqual('http://silo/no-original', rs[0].syndication)

        # and the blanks have been removed
        self.assertFalse(
            SyndicatedPost.query(
                SyndicatedPost.syndication == 'http://silo/no-original',
                SyndicatedPost.original == None,
                ancestor=self.source.key).get())

        self.assertFalse(
            SyndicatedPost.query(
                SyndicatedPost.original == 'http://original/newly-discovered',
                SyndicatedPost.syndication == None,
                ancestor=self.source.key).get())
Example #15
  def test_insert_replaces_blanks(self):
    """Make sure we replace original=None with original=something
    when it is discovered"""

    # add a blank for the original too
    SyndicatedPost.insert_original_blank(
      self.source, 'http://original/newly-discovered')

    self.assertTrue(
      SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/no-original',
        SyndicatedPost.original == None, ancestor=self.source.key).get())

    self.assertTrue(
      SyndicatedPost.query(
        SyndicatedPost.original == 'http://original/newly-discovered',
        SyndicatedPost.syndication == None, ancestor=self.source.key).get())

    r = SyndicatedPost.insert(
        self.source, 'http://silo/no-original',
        'http://original/newly-discovered')
    self.assertIsNotNone(r)
    self.assertEquals('http://original/newly-discovered', r.original)

    # make sure it's in NDB
    rs = SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/no-original',
        ancestor=self.source.key
    ).fetch()
    self.assertEquals(1, len(rs))
    self.assertEquals('http://original/newly-discovered', rs[0].original)
    self.assertEquals('http://silo/no-original', rs[0].syndication)

    # and the blanks have been removed
    self.assertFalse(
      SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/no-original',
        SyndicatedPost.original == None, ancestor=self.source.key).get())

    self.assertFalse(
      SyndicatedPost.query(
        SyndicatedPost.original == 'http://original/newly-discovered',
        SyndicatedPost.syndication == None, ancestor=self.source.key).get())
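
Examples #14/#15, #22/#24, and #28/#32 together pin down the semantics of SyndicatedPost.insert: blank placeholder rows are replaced once a real pairing is discovered, exact duplicates are not re-inserted, and additional originals for the same syndication URL are kept. A hedged sketch of a classmethod with those semantics follows; it illustrates the asserted behavior only, is not Bridgy's actual model code, and assumes the usual App Engine import (from google.appengine.ext import ndb).

  @classmethod
  def insert(cls, source, syndication, original):
    # Sketch only: drop blank placeholders that this pairing supersedes.
    blanks = cls.query(cls.syndication == syndication, cls.original == None,
                       ancestor=source.key).fetch(keys_only=True)
    blanks += cls.query(cls.original == original, cls.syndication == None,
                        ancestor=source.key).fetch(keys_only=True)
    ndb.delete_multi(blanks)

    # Reuse an existing row for this exact pair instead of duplicating it;
    # other originals stored for the same syndication URL are left alone.
    existing = cls.query(cls.syndication == syndication,
                         cls.original == original,
                         ancestor=source.key).get()
    if existing:
      return existing

    new = cls(parent=source.key, original=original, syndication=syndication)
    new.put()
    return new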
Example #16
def _posse_post_discovery(source, activity, syndication_url, fetch_hfeed,
                          already_fetched_hfeeds):
  """Performs the actual meat of the posse-post-discover.

  Args:
    source: :class:`models.Source` subclass
    activity: activity dict
    syndication_url: url of the syndicated copy for which we are
      trying to find an original
    fetch_hfeed: boolean, whether or not to fetch and parse the
      author's feed if we don't have a previously stored
      relationship
    already_fetched_hfeeds: set, URLs we've already fetched in a
      previous iteration

  Return:
    sequence of string original post urls, possibly empty
  """
  logging.info('starting posse post discovery with syndicated %s',
               syndication_url)

  relationships = SyndicatedPost.query(
    SyndicatedPost.syndication == syndication_url,
    ancestor=source.key).fetch()

  if not relationships and fetch_hfeed:
    # a syndicated post we haven't seen before! fetch the author's URLs to see
    # if we can find it.
    #
    # TODO: Consider using the actor's url, with get_author_urls() as the
    # fallback in the future to support content from non-Bridgy users.
    results = {}
    for url in _get_author_urls(source):
      if url not in already_fetched_hfeeds:
        results.update(_process_author(source, url))
        already_fetched_hfeeds.add(url)
      else:
        logging.debug('skipping %s, already fetched this round', url)

    relationships = results.get(syndication_url, [])

  if not relationships:
    # No relationships were found. Remember that we've seen this
    # syndicated post to avoid reprocessing it every time
    logging.debug('posse post discovery found no relationship for %s',
                  syndication_url)
    if fetch_hfeed:
      SyndicatedPost.insert_syndication_blank(source, syndication_url)

  originals = [r.original for r in relationships if r.original]
  if originals:
    logging.debug('posse post discovery found relationship(s) %s -> %s',
                  syndication_url, originals)
  return originals
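
This version returns a plain list of original URLs and leaves it to the caller to attach them to the activity. As a rough sketch of what such a caller could do (a hypothetical helper, not Bridgy's actual discover(); the upstreamDuplicates handling mirrors the older variants in Examples #20 and #23 below):

def _attach_originals(source, activity, syndication_urls, fetch_hfeed=True):
  # Hypothetical caller sketch: resolve each syndication URL to its originals
  # and record them as upstreamDuplicates on the activity object.
  already_fetched_hfeeds = set()
  obj = activity.get('object') or activity
  for syndication_url in syndication_urls:
    originals = _posse_post_discovery(source, activity, syndication_url,
                                      fetch_hfeed, already_fetched_hfeeds)
    obj.setdefault('upstreamDuplicates', []).extend(originals)
  return activity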
Example #17
def _posse_post_discovery(source, activity, syndication_url, fetch_hfeed,
                          already_fetched_hfeeds):
    """Performs the actual meat of the posse-post-discover.

  Args:
    source: :class:`models.Source` subclass
    activity: activity dict
    syndication_url: url of the syndicated copy for which we are
      trying to find an original
    fetch_hfeed: boolean, whether or not to fetch and parse the
      author's feed if we don't have a previously stored
      relationship
    already_fetched_hfeeds: set, URLs we've already fetched in a
      previous iteration

  Return:
    sequence of string original post urls, possibly empty
  """
    logging.info('starting posse post discovery with syndicated %s',
                 syndication_url)

    relationships = SyndicatedPost.query(
        SyndicatedPost.syndication == syndication_url,
        ancestor=source.key).fetch()

    if not relationships and fetch_hfeed:
        # a syndicated post we haven't seen before! fetch the author's URLs to see
        # if we can find it.
        #
        # TODO: Consider using the actor's url, with get_author_urls() as the
        # fallback in the future to support content from non-Bridgy users.
        results = {}
        for url in _get_author_urls(source):
            if url not in already_fetched_hfeeds:
                results.update(_process_author(source, url))
                already_fetched_hfeeds.add(url)
            else:
                logging.debug('skipping %s, already fetched this round', url)

        relationships = results.get(syndication_url, [])

    if not relationships:
        # No relationships were found. Remember that we've seen this
        # syndicated post to avoid reprocessing it every time
        logging.debug('posse post discovery found no relationship for %s',
                      syndication_url)
        if fetch_hfeed:
            SyndicatedPost.insert_syndication_blank(source, syndication_url)

    originals = [r.original for r in relationships if r.original]
    if originals:
        logging.debug('posse post discovery found relationship(s) %s -> %s',
                      syndication_url, originals)
    return originals
Example #18
    def test_get_or_insert_by_syndication_do_not_duplicate_blanks(self):
        """Make sure we don't insert duplicate blank entries"""

        SyndicatedPost.insert_syndication_blank(self.source,
                                                'http://silo/no-original')

        # make sure there's only one in the DB
        rs = SyndicatedPost.query(
            SyndicatedPost.syndication == 'http://silo/no-original',
            ancestor=self.source.key).fetch()

        self.assertCountEqual([None], [rel.original for rel in rs])
Example #19
  def test_get_or_insert_by_syndication_do_not_duplicate_blanks(self):
    """Make sure we don't insert duplicate blank entries"""

    SyndicatedPost.insert_syndication_blank(
      self.source, 'http://silo/no-original')

    # make sure there's only one in the DB
    rs = SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/no-original',
        ancestor=self.source.key
    ).fetch()

    self.assertItemsEqual([None], [rel.original for rel in rs])
Example #20
def _posse_post_discovery(source, activity, syndication_url, fetch_hfeed):
  """Performs the actual meat of the posse-post-discover.

  Args:
    source: models.Source subclass
    activity: activity dict
    syndication_url: url of the syndicated copy for which we are
                     trying to find an original
    fetch_hfeed: boolean, whether or not to fetch and parse the
                 author's feed if we don't have a previously stored
                 relationship.

  Return:
    the activity, updated with original post urls if any are found
  """
  logging.info('starting posse post discovery with syndicated %s', syndication_url)
  relationships = SyndicatedPost.query(
    SyndicatedPost.syndication == syndication_url,
    ancestor=source.key).fetch()
  if not relationships and fetch_hfeed:
    # a syndicated post we haven't seen before! fetch the author's URLs to see
    # if we can find it.
    #
    # Use source.domain_urls for now; it seems more reliable than the
    # activity.actor.url (which depends on getting the right data back from
    # various APIs). Consider using the actor's url, with domain_urls as the
    # fallback in the future to support content from non-Bridgy users.
    results = {}
    for url in source.get_author_urls():
      results.update(_process_author(source, url))
    relationships = results.get(syndication_url)

  if not relationships:
    # No relationships were found. Remember that we've seen this
    # syndicated post to avoid reprocessing it every time
    logging.debug('posse post discovery found no relationship for %s',
                  syndication_url)
    if fetch_hfeed:
      SyndicatedPost.insert_syndication_blank(source, syndication_url)
    return activity

  logging.debug('posse post discovery found relationship(s) %s -> %s',
                syndication_url,
                '; '.join(unicode(r.original) for r in relationships))

  obj = activity.get('object') or activity
  obj.setdefault('upstreamDuplicates', []).extend(
    r.original for r in relationships if r.original)

  return activity
Example #21
  def test_no_author_url(self):
    """Make sure something reasonable happens when the author doesn't have
    a url at all.
    """
    source = self.sources[0]
    source.domain_urls = []
    activity = self.activities[0]
    activity['object']['url'] = 'https://fa.ke/post/url'
    activity['object']['content'] = 'content without links'

    self.mox.ReplayAll()
    original_post_discovery.discover(source, activity)

    # nothing attempted, and no SyndicatedPost saved
    self.assertFalse(SyndicatedPost.query(ancestor=source.key).get())
Example #22
    def test_insert_no_duplicates(self):
        """Make sure we don't insert duplicate entries"""

        r = SyndicatedPost.insert(self.source, 'http://silo/post/url',
                                  'http://original/post/url')
        self.assertIsNotNone(r)
        self.assertEqual('http://original/post/url', r.original)

        # make sure there's only one in the DB
        rs = SyndicatedPost.query(
            SyndicatedPost.syndication == 'http://silo/post/url',
            SyndicatedPost.original == 'http://original/post/url',
            ancestor=self.source.key).fetch()

        self.assertEqual(1, len(rs))
Example #23
def _posse_post_discovery(source, activity, author_url, syndication_url,
                          fetch_hfeed):
  """Performs the actual meat of the posse-post-discover. It was split
  out from discover() so that it can be done inside of a transaction.

  Args:
    source: models.Source subclass
    activity: activity dict
    author_url: author's url configured in their silo profile
    syndication_url: url of the syndicated copy for which we are
                     trying to find an original
    fetch_hfeed: boolean, whether or not to fetch and parse the
                 author's feed if we don't have a previously stored
                 relationship.

  Return:
    the activity, updated with original post urls if any are found
  """
  logging.info(
      'starting posse post discovery with author %s and syndicated %s',
      author_url, syndication_url)

  relationships = SyndicatedPost.query(
    SyndicatedPost.syndication == syndication_url,
    ancestor=source.key).fetch()
  if not relationships and fetch_hfeed:
    # a syndicated post we haven't seen before! fetch the author's
    # h-feed to see if we can find it.
    results = _process_author(source, author_url)
    relationships = results.get(syndication_url)

  if not relationships:
    # No relationships were found. Remember that we've seen this
    # syndicated post to avoid reprocessing it every time
    logging.debug('posse post discovery found no relationship for %s',
                  syndication_url)
    SyndicatedPost.insert_syndication_blank(source, syndication_url)
    return activity

  logging.debug('posse post discovery found relationship(s) %s -> %s',
                syndication_url,
                '; '.join(str(r.original) for r in relationships))

  obj = activity.get('object') or activity
  obj.setdefault('upstreamDuplicates', []).extend(
    r.original for r in relationships if r.original)

  return activity
Example #24
  def test_insert_no_duplicates(self):
    """Make sure we don't insert duplicate entries"""

    r = SyndicatedPost.insert(
      self.source, 'http://silo/post/url', 'http://original/post/url')
    self.assertIsNotNone(r)
    self.assertEqual('http://original/post/url', r.original)

    # make sure there's only one in the DB
    rs = SyndicatedPost.query(
      SyndicatedPost.syndication == 'http://silo/post/url',
      SyndicatedPost.original == 'http://original/post/url',
      ancestor=self.source.key
    ).fetch()

    self.assertEqual(1, len(rs))
Example #25
  def test_get_or_insert_by_syndication_replace(self):
    """Make sure we replace original=None with original=something
    when it is discovered"""
    r = SyndicatedPost.get_or_insert_by_syndication_url(
        self.source, 'http://silo/no-original',
        'http://original/newly-discovered')
    self.assertIsNotNone(r)
    self.assertEquals('http://original/newly-discovered', r.original)

    # make sure it's in NDB
    rs = SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/no-original',
        ancestor=self.source.key
    ).fetch()
    self.assertEquals(1, len(rs))
    self.assertEquals('http://original/newly-discovered', rs[0].original)
    self.assertEquals('http://silo/no-original', rs[0].syndication)
Example #26
  def test_refetch_unchanged_syndication(self):
    """We should preserve unchanged SyndicatedPosts during refetches."""
    synd = SyndicatedPost(parent=self.source.key,
                          original='http://author/permalink',
                          syndication='https://fa.ke/post/url')
    synd.put()
    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="/permalink"></a>
        <a class="u-syndication" href="https://fa.ke/post/url"></a>
      </div>
    </html>""")

    self.mox.ReplayAll()
    refetch(self.source)
    self.assert_entities_equal([synd], list(SyndicatedPost.query()))
Example #27
  def test_refetch_two_permalinks_same_syndication(self):
    """
    This causes a problem if refetch assumes that syndication-url is
    unique under a given source.
    """
    source = self.sources[0]
    source.domain_urls = ['http://author']

    self.activities[0]['object'].update({
      'content': 'post content without backlinks',
      'url': 'https://fa.ke/post/url',
    })

    hfeed = """<html class="h-feed">
    <a class="h-entry" href="/post1"></a>
    <a class="h-entry" href="/post2"></a>
    </html>"""

    self.expect_requests_get('http://author', hfeed)

    for i in range(2):
      self.expect_requests_get(
        'http://author/post%d' % (i + 1),
        """<html class="h-entry">
        <a class="u-url" href="/post%d"></a>
        <a class="u-syndication" href="https://fa.ke/post/url"></a>
        </html>""" % (i + 1))

    # refetch should only grab the feed
    self.expect_requests_get('http://author', hfeed)

    self.mox.ReplayAll()
    activity = original_post_discovery.discover(source, self.activities[0])
    self.assertItemsEqual(['http://author/post1', 'http://author/post2'],
                          activity['object'].get('upstreamDuplicates'))

    relations = SyndicatedPost.query(ancestor=source.key).fetch()
    self.assertItemsEqual([('http://author/post1', 'https://fa.ke/post/url'),
                           ('http://author/post2', 'https://fa.ke/post/url')],
                          [(relation.original, relation.syndication)
                           for relation in relations])

    # discover should have already handled all relationships, refetch should
    # not find anything
    refetch_result = original_post_discovery.refetch(source)
    self.assertFalse(refetch_result)
Example #28
    def test_insert_auguments_existing(self):
        """Make sure we add newly discovered urls for a given syndication url,
    rather than overwrite them
    """
        r = SyndicatedPost.insert(self.source, 'http://silo/post/url',
                                  'http://original/different/url')
        self.assertIsNotNone(r)
        self.assertEqual('http://original/different/url', r.original)

        # make sure they're both in the DB
        rs = SyndicatedPost.query(
            SyndicatedPost.syndication == 'http://silo/post/url',
            ancestor=self.source.key).fetch()

        self.assertCountEqual([
            'http://original/post/url', 'http://original/another/post',
            'http://original/different/url'
        ], [rel.original for rel in rs])
Example #29
  def test_refetch_permalink_with_two_syndications(self):
    """Test one permalink with two syndicated posts. Make sure that
    refetch doesn't have a problem with two entries for the same
    original URL.
    """
    for idx, activity in enumerate(self.activities):
      activity['object'].update({
        'content': 'post content without backlinks',
        'url': 'https://fa.ke/post/url%d' % (idx + 1),
      })

    hfeed = """<html class="h-feed">
    <a class="h-entry" href="/permalink"></a>
    </html>"""
    hentry = """<html class="h-entry">
    <a class="u-url" href="/permalink"/>
    <a class="u-syndication" href="https://fa.ke/post/url1"/>
    <a class="u-syndication" href="https://fa.ke/post/url3"/>
    <a class="u-syndication" href="https://fa.ke/post/url5"/>
    </html>"""

    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', hentry)

    # refetch
    self.expect_requests_get('http://author', hfeed)
    # refetch grabs posts that it's seen before in case there have
    # been updates
    self.expect_requests_get('http://author/permalink', hentry)

    self.mox.ReplayAll()

    original_post_discovery.discover(self.source, self.activities[0])
    relations = SyndicatedPost.query(
      SyndicatedPost.original == 'http://author/permalink',
      ancestor=self.source.key).fetch()
    self.assertItemsEqual(
      [('http://author/permalink', 'https://fa.ke/post/url1'),
       ('http://author/permalink', 'https://fa.ke/post/url3'),
       ('http://author/permalink', 'https://fa.ke/post/url5')],
      [(r.original, r.syndication) for r in relations])

    results = original_post_discovery.refetch(self.source)
    self.assertFalse(results)
Example #30
  def test_refetch_changed_syndication(self):
    """Update syndication links that have changed since our last fetch."""
    SyndicatedPost(parent=self.source.key,
                   original='http://author/permalink',
                   syndication='https://fa.ke/post/url').put()
    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="/permalink"></a>
        <a class="u-syndication" href="http://fa.ke/changed/url"></a>
      </div>
    </html>""")

    self.mox.ReplayAll()
    results = refetch(self.source)
    self.assert_syndicated_posts(
      ('http://author/permalink', 'https://fa.ke/changed/url'))
    self.assert_equals({'https://fa.ke/changed/url': list(SyndicatedPost.query())},
                       results)
Example #31
  def test_get_or_insert_by_syndication_do_not_replace(self):
    """Make sure we don't replace original=something with
    original=something else (in practice, that would mean another task
    is running discovery concurrently and found a different url)
    """
    r = SyndicatedPost.get_or_insert_by_syndication_url(
        self.source, 'http://silo/post/url',
        'http://original/different/url')
    self.assertIsNotNone(r)
    self.assertEquals('http://original/post/url', r.original)

    # make sure it's unchanged in NDB
    rs = SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/post/url',
        ancestor=self.source.key
    ).fetch()

    self.assertEquals(1, len(rs))
    self.assertEquals('http://original/post/url', rs[0].original)
    self.assertEquals('http://silo/post/url', rs[0].syndication)
Example #32
  def test_insert_auguments_existing(self):
    """Make sure we add newly discovered urls for a given syndication url,
    rather than overwrite them
    """
    r = SyndicatedPost.insert(
        self.source, 'http://silo/post/url',
        'http://original/different/url')
    self.assertIsNotNone(r)
    self.assertEquals('http://original/different/url', r.original)

    # make sure they're both in the DB
    rs = SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/post/url',
        ancestor=self.source.key
    ).fetch()

    self.assertItemsEqual(['http://original/post/url',
                           'http://original/another/post',
                           'http://original/different/url'],
                          [rel.original for rel in rs])
Example #33
  def test_refetch_blank_syndication(self):
    """We should preserve blank SyndicatedPosts during refetches."""
    blank = SyndicatedPost(parent=self.source.key,
                           original='http://author/permalink',
                           syndication=None)
    blank.put()
    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="/permalink" />
      </div>
    </html>""")
    self.expect_requests_get('http://author/permalink', """
      <html class="h-entry">
        <a class="u-url" href="/permalink"></a>
      </html>""")

    self.mox.ReplayAll()
    self.assert_equals({}, original_post_discovery.refetch(self.source))
    self.assert_syndicated_posts(('http://author/permalink', None))
    self.assert_entities_equal([blank], list(SyndicatedPost.query()))
Example #34
  def test_invalid_webmention_target(self):
    """Confirm that no additional requests are made if the author url is
    an invalid webmention target. Right now this pretty much just
    means they're on the blacklist. Eventually we want to filter out
    targets that don't have certain features, like a webmention
    endpoint or microformats.
    """
    source = self.sources[0]
    source.domain_urls = ['http://amazon.com']
    activity = self.activities[0]
    activity['object']['url'] = 'https://fa.ke/post/url'
    activity['object']['content'] = 'content without links'

    self.mox.ReplayAll()

    logging.debug('Original post discovery %s -> %s', source, activity)
    original_post_discovery.discover(source, activity)

    # nothing attempted, but we should have saved a placeholder to prevent us
    # from trying again
    self.assert_equals(
      [(None, 'https://fa.ke/post/url')],
      [(relationship.original, relationship.syndication)
       for relationship in SyndicatedPost.query(ancestor=source.key)])
Example #35
  def _test_failed_domain_url_fetch(self, raise_exception):
    """Make sure something reasonable happens when the author's domain url
    gives an unexpected response
    """
    source = self.sources[0]
    source.domain_urls = ['http://author']
    activity = self.activities[0]
    activity['object']['url'] = 'https://fa.ke/post/url'
    activity['object']['content'] = 'content without links'

    if raise_exception:
      self.expect_requests_get('http://author').AndRaise(HTTPError())
    else:
      self.expect_requests_get('http://author', status_code=404)

    self.mox.ReplayAll()
    original_post_discovery.discover(source, activity)

    # nothing attempted, but we should have saved a placeholder to prevent us
    # from trying again
    self.assert_equals(
      [(None, 'https://fa.ke/post/url')],
      [(relationship.original, relationship.syndication)
       for relationship in SyndicatedPost.query(ancestor=source.key)])
Example #36
def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of models.Source
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank SyndicatedPosts when
      we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new models.SyndicatedPost
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  # TODO skip sites we know don't have microformats2 markup
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  try:
    logging.debug('fetching author url %s', author_url)
    author_resp = util.requests_get(author_url)
    # TODO for error codes that indicate a temporary error, should we make
    # a certain number of retries before giving up forever?
    author_resp.raise_for_status()
    author_dom = BeautifulSoup(author_resp.text)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logging.warning('Could not fetch author url %s', author_url, exc_info=True)
    return {}

  feeditems = _find_feed_items(author_url, author_dom)

  # look for all other feed urls using rel='feed', type='text/html'
  feed_urls = set()
  for rel_feed_node in (author_dom.find_all('link', rel='feed')
                        + author_dom.find_all('a', rel='feed')):
    feed_url = rel_feed_node.get('href')
    if not feed_url:
      continue

    feed_url = urlparse.urljoin(author_url, feed_url)
    feed_type = rel_feed_node.get('type')
    if not feed_type:
      # type is not specified, use this to confirm that it's text/html
      feed_url, _, feed_type_ok = util.get_webmention_target(feed_url)
    else:
      feed_type_ok = feed_type == 'text/html'

    if feed_url == author_url:
      logging.debug('author url is the feed url, ignoring')
    elif not feed_type_ok:
      logging.debug('skipping feed of type %s', feed_type)
    else:
      feed_urls.add(feed_url)

  for feed_url in feed_urls:
    try:
      logging.debug("fetching author's rel-feed %s", feed_url)
      feed_resp = util.requests_get(feed_url)
      feed_resp.raise_for_status()
      logging.debug("author's rel-feed fetched successfully %s", feed_url)
      feeditems = _merge_hfeeds(feeditems,
                                _find_feed_items(feed_url, feed_resp.text))

      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logging.info('rel-feed found new domain %s! adding to source', domain)
          domains.append(domain)

    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logging.warning('Could not fetch h-feed url %s.', feed_url,
                      exc_info=True)

  permalink_to_entry = {}
  for child in feeditems:
    if 'h-entry' in child['type']:
      # TODO maybe limit to first ~30 entries? (do that here rather than,
      # below because we want the *first* n entries)
      for permalink in child['properties'].get('url', []):
        if isinstance(permalink, basestring):
          permalink_to_entry[permalink] = child
        else:
          logging.warn('unexpected non-string "url" property: %s', permalink)

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(
      SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
      ancestor=source.key)
    for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  results = {}
  for permalink, entry in permalink_to_entry.iteritems():
    logging.debug('processing permalink: %s', permalink)
    new_results = _process_entry(
      source, permalink, entry, refetch, preexisting.get(permalink, []),
      store_blanks=store_blanks)
    for key, value in new_results.iteritems():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    now = util.now_fn()
    logging.debug('updating source last_syndication_url %s', now)
    source.updates['last_syndication_url'] = now

  return results
Example #37
def _process_author(source, author_url, refetch=False, store_blanks=True):
    """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
    # for now use whether the url is a valid webmention target
    # as a proxy for whether it's worth searching it.
    author_url, _, ok = util.get_webmention_target(author_url)
    if not ok:
        return {}

    logger.debug(f'fetching author url {author_url}')
    try:
        author_mf2 = util.fetch_mf2(author_url)
    except AssertionError:
        raise  # for unit tests
    except BaseException:
        # TODO limit allowed failures, cache the author's h-feed url
        # or the # of times we've failed to fetch it
        logger.info(f'Could not fetch author url {author_url}', exc_info=True)
        return {}

    feeditems = _find_feed_items(author_mf2)

    # try rel=feeds and rel=alternates
    feed_urls = set()
    candidates = (author_mf2['rels'].get('feed', []) + [
        a.get('url') for a in author_mf2.get('alternates', [])
        if a.get('type') == MF2_HTML_MIME_TYPE
    ])
    for feed_url in candidates:
        # check that it's html, not too big, etc
        feed_url, _, feed_ok = util.get_webmention_target(feed_url)
        if feed_url == author_url:
            logger.debug('author url is the feed url, ignoring')
        elif not feed_ok:
            logger.debug("skipping feed since it's not HTML or otherwise bad")
        else:
            feed_urls.add(feed_url)

    for feed_url in feed_urls:
        try:
            logger.debug(f"fetching author's rel-feed {feed_url}")
            feed_mf2 = util.fetch_mf2(feed_url)
            feeditems = _merge_hfeeds(feeditems, _find_feed_items(feed_mf2))
            domain = util.domain_from_link(feed_url)
            if source.updates is not None and domain not in source.domains:
                domains = source.updates.setdefault('domains', source.domains)
                if domain not in domains:
                    logger.info(
                        f'rel-feed found new domain {domain}! adding to source'
                    )
                    domains.append(domain)

        except AssertionError:
            raise  # reraise assertions for unit tests
        except BaseException:
            logger.info(f'Could not fetch h-feed url {feed_url}.',
                        exc_info=True)

    # sort by dt-updated/dt-published
    def updated_or_published(item):
        props = microformats2.first_props(item.get('properties'))
        return props.get('updated') or props.get('published') or ''

    feeditems.sort(key=updated_or_published, reverse=True)

    permalink_to_entry = collections.OrderedDict()
    for child in feeditems:
        if 'h-entry' in child['type']:
            permalinks = child['properties'].get('url', [])
            if not permalinks:
                logger.debug('ignoring h-entry with no u-url!')
            for permalink in permalinks:
                if isinstance(permalink, str):
                    permalink_to_entry[permalink] = child
                else:
                    logger.warning(
                        f'unexpected non-string "url" property: {permalink}')

        max = (MAX_PERMALINK_FETCHES_BETA
               if source.is_beta_user() else MAX_PERMALINK_FETCHES)
        if len(permalink_to_entry) >= max:
            logger.info(f'Hit cap of {max} permalinks. Stopping.')
            break

    # query all preexisting permalinks at once, instead of once per link
    permalinks_list = list(permalink_to_entry.keys())
    # fetch the maximum allowed entries (currently 30) at a time
    preexisting_list = itertools.chain.from_iterable(
        SyndicatedPost.query(SyndicatedPost.original.IN(
            permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
                             ancestor=source.key)
        for i in range(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
    preexisting = {}
    for r in preexisting_list:
        preexisting.setdefault(r.original, []).append(r)

    results = {}
    for permalink, entry in permalink_to_entry.items():
        logger.debug(f'processing permalink: {permalink}')
        new_results = process_entry(source,
                                    permalink,
                                    entry,
                                    refetch,
                                    preexisting.get(permalink, []),
                                    store_blanks=store_blanks)
        for key, value in new_results.items():
            results.setdefault(key, []).extend(value)

    if source.updates is not None and results:
        # keep track of the last time we've seen rel=syndication urls for
        # this author. this helps us decide whether to refetch periodically
        # and look for updates.
        # Source will be saved at the end of each round of polling
        source.updates['last_syndication_url'] = util.now_fn()

    return results
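
_process_author returns a dict mapping each syndication URL to the newly created SyndicatedPosts, which is also the shape refetch() is asserted to return in test_refetch_changed_syndication (Example #30). A minimal sketch of aggregating those per-URL results over all of the author's pages follows, assuming the _get_author_urls helper seen in Example #16; the real refetch() may differ.

def refetch_sketch(source):
  # Hedged sketch: re-crawl the author's pages with refetch=True and merge the
  # per-URL results into one {syndication url: [SyndicatedPost, ...]} dict.
  results = {}
  for url in _get_author_urls(source):
    for syndication_url, relationships in _process_author(
        source, url, refetch=True).items():
      results.setdefault(syndication_url, []).extend(relationships)
  return results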
Example #38
def _process_author(source, author_url, refetch=False, store_blanks=True):
    """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
    # for now use whether the url is a valid webmention target
    # as a proxy for whether it's worth searching it.
    author_url, _, ok = util.get_webmention_target(author_url)
    if not ok:
        return {}

    try:
        logging.debug('fetching author url %s', author_url)
        author_resp = util.requests_get(author_url)
        # TODO for error codes that indicate a temporary error, should we make
        # a certain number of retries before giving up forever?
        author_resp.raise_for_status()
        author_dom = util.beautifulsoup_parse(author_resp.text)
    except AssertionError:
        raise  # for unit tests
    except BaseException:
        # TODO limit allowed failures, cache the author's h-feed url
        # or the # of times we've failed to fetch it
        logging.info('Could not fetch author url %s',
                     author_url,
                     exc_info=True)
        return {}

    feeditems = _find_feed_items(author_url, author_dom)

    # look for all other feed urls using rel='feed', type='text/html'
    feed_urls = set()
    for rel_feed_node in (author_dom.find_all('link', rel='feed') +
                          author_dom.find_all('a', rel='feed')):
        feed_url = rel_feed_node.get('href')
        if not feed_url:
            continue

        feed_url = urlparse.urljoin(author_url, feed_url)
        feed_type = rel_feed_node.get('type')
        if feed_type and feed_type != 'text/html':
            feed_ok = False
        else:
            # double check that it's text/html, not too big, etc
            feed_url, _, feed_ok = util.get_webmention_target(feed_url)

        if feed_url == author_url:
            logging.debug('author url is the feed url, ignoring')
        elif not feed_ok:
            logging.debug('skipping feed of type %s', feed_type)
        else:
            feed_urls.add(feed_url)

    for feed_url in feed_urls:
        try:
            logging.debug("fetching author's rel-feed %s", feed_url)
            feed_resp = util.requests_get(feed_url)
            feed_resp.raise_for_status()
            logging.debug("author's rel-feed fetched successfully %s",
                          feed_url)
            feeditems = _merge_hfeeds(
                feeditems, _find_feed_items(feed_url, feed_resp.text))

            domain = util.domain_from_link(feed_url)
            if source.updates is not None and domain not in source.domains:
                domains = source.updates.setdefault('domains', source.domains)
                if domain not in domains:
                    logging.info(
                        'rel-feed found new domain %s! adding to source',
                        domain)
                    domains.append(domain)

        except AssertionError:
            raise  # reraise assertions for unit tests
        except BaseException:
            logging.info('Could not fetch h-feed url %s.',
                         feed_url,
                         exc_info=True)

    # sort by dt-updated/dt-published
    def updated_or_published(item):
        props = microformats2.first_props(item.get('properties'))
        return props.get('updated') or props.get('published')

    feeditems.sort(key=updated_or_published, reverse=True)

    permalink_to_entry = collections.OrderedDict()
    for child in feeditems:
        if 'h-entry' in child['type']:
            permalinks = child['properties'].get('url', [])
            if not permalinks:
                logging.debug('ignoring h-entry with no u-url!')
            for permalink in permalinks:
                if isinstance(permalink, basestring):
                    permalink_to_entry[permalink] = child
                else:
                    logging.warn('unexpected non-string "url" property: %s',
                                 permalink)

        max = (MAX_PERMALINK_FETCHES_BETA
               if source.is_beta_user() else MAX_PERMALINK_FETCHES)
        if len(permalink_to_entry) >= max:
            logging.info('Hit cap of %d permalinks. Stopping.', max)
            break

    # query all preexisting permalinks at once, instead of once per link
    permalinks_list = list(permalink_to_entry.keys())
    # fetch the maximum allowed entries (currently 30) at a time
    preexisting_list = itertools.chain.from_iterable(
        SyndicatedPost.query(SyndicatedPost.original.IN(
            permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
                             ancestor=source.key)
        for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
    preexisting = {}
    for r in preexisting_list:
        preexisting.setdefault(r.original, []).append(r)

    results = {}
    for permalink, entry in permalink_to_entry.iteritems():
        logging.debug('processing permalink: %s', permalink)
        new_results = process_entry(source,
                                    permalink,
                                    entry,
                                    refetch,
                                    preexisting.get(permalink, []),
                                    store_blanks=store_blanks)
        for key, value in new_results.iteritems():
            results.setdefault(key, []).extend(value)

    if source.updates is not None and results:
        # keep track of the last time we've seen rel=syndication urls for
        # this author. this helps us decide whether to refetch periodically
        # and look for updates.
        # Source will be saved at the end of each round of polling
        source.updates['last_syndication_url'] = util.now_fn()

    return results
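
Both _process_author variants above delegate feed parsing to _find_feed_items, which isn't included in this listing. Judging from the two call signatures, the older version takes the page URL plus a parsed DOM while the newer one takes an mf2 dict; below is a hedged sketch of the newer, single-argument form, where the 'h-feed'/'children' handling is an assumption based on standard microformats2 parse output rather than the actual code.

def _find_feed_items(mf2):
  # Hypothetical sketch: if the page has an h-feed, use its child entries;
  # otherwise fall back to any top-level h-entry items.
  items = mf2.get('items', [])
  for item in items:
    if 'h-feed' in item.get('type', []):
      return item.get('children', [])
  return [item for item in items if 'h-entry' in item.get('type', [])]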
Example #39
  def test_refetch_hfeed(self):
    """refetch should grab resources again, even if they were previously
    marked with a blank SyndicatedPost
    """
    source = self.sources[0]
    source.domain_urls = ['http://author']

    # refetch 1 and 3 to see if they've been updated, 2 has already
    # been resolved for this source
    SyndicatedPost(parent=source.key,
                   original='http://author/permalink1',
                   syndication=None).put()

    SyndicatedPost(parent=source.key,
                   original='http://author/permalink2',
                   syndication='https://fa.ke/post/url2').put()

    SyndicatedPost(parent=source.key,
                   original='http://author/permalink3',
                   syndication=None).put()

    self.expect_requests_get('http://author', """
      <html class="h-feed">
        <a class="h-entry" href="/permalink1"></a>
        <a class="h-entry" href="/permalink2"></a>
        <a class="h-entry" href="/permalink3"></a>
      </html>""")

    # yay, permalink1 has an updated syndication url
    self.expect_requests_get('http://author/permalink1', """
      <html class="h-entry">
        <a class="u-url" href="/permalink1"></a>
        <a class="u-syndication" href="https://fa.ke/post/url1"></a>
      </html>""").InAnyOrder()

    # permalink3 hasn't changed since we first checked it
    self.expect_requests_get('http://author/permalink3', """
      <html class="h-entry">
        <a class="u-url" href="/permalink3"></a>
      </html>""").InAnyOrder()

    self.mox.ReplayAll()
    original_post_discovery.refetch(source)

    relationships1 = SyndicatedPost.query(
      SyndicatedPost.original == 'http://author/permalink1',
      ancestor=source.key).fetch()

    self.assertTrue(relationships1)
    self.assertEquals('https://fa.ke/post/url1', relationships1[0].syndication)

    relationships2 = SyndicatedPost.query(
      SyndicatedPost.original == 'http://author/permalink2',
      ancestor=source.key).fetch()

    # this shouldn't have changed
    self.assertTrue(relationships2)
    self.assertEquals('https://fa.ke/post/url2', relationships2[0].syndication)

    relationships3 = SyndicatedPost.query(
      SyndicatedPost.original == 'http://author/permalink3',
      ancestor=source.key).fetch()

    self.assertTrue(relationships3)
    self.assertIsNone(relationships3[0].syndication)