Example #1
    def test_get_or_save_restart(self):
        source = self.sources[0]
        response = self.responses[0]

        # new. should add one propagate task total.
        saved = response.get_or_save(source, restart=True)
        self.assert_propagate_task()

        # existing. should add one more propagate task.
        saved.get_or_save(source, restart=True)
        self.assert_propagate_task()

        # new syndication URL. should add two propagate tasks.
        synd = source.canonicalize_url(self.activities[0]['url'])
        SyndicatedPost(parent=source.key,
                       original='http://or/ig',
                       syndication=synd).put()
        SyndicatedPost(
            parent=source.key, original=None,
            syndication=synd).put()  # check that we don't die on blanks

        final = response.get_or_save(source, restart=True)
        self.assert_propagate_task()
        self.assert_equals(['http://or/ig', 'http://target1/post/url'],
                           final.unsent)

        # no activity URLs. should skip SyndicatedPost query.
        response.activities_json = []
        response.put()
        response.get_or_save(source, restart=True)
        self.assert_propagate_task()
Example #2
    def setUp(self):
        super().setUp()

        self.source = FakeSource.new()
        self.source.put()

        self.relationships = []
        self.relationships.append(
            SyndicatedPost(parent=self.source.key,
                           original='http://original/post/url',
                           syndication='http://silo/post/url'))
        # two syndications for the same original
        self.relationships.append(
            SyndicatedPost(parent=self.source.key,
                           original='http://original/post/url',
                           syndication='http://silo/another/url'))
        # two originals for the same syndication
        self.relationships.append(
            SyndicatedPost(parent=self.source.key,
                           original='http://original/another/post',
                           syndication='http://silo/post/url'))
        self.relationships.append(
            SyndicatedPost(parent=self.source.key,
                           original=None,
                           syndication='http://silo/no-original'))
        self.relationships.append(
            SyndicatedPost(parent=self.source.key,
                           original='http://original/no-syndication',
                           syndication=None))

        for r in self.relationships:
            r.put()
  def test_syndication_url_in_hfeed(self):
    """Like test_single_post, but because the syndication URL is given in
    the h-feed we skip fetching the permalink. New behavior as of
    2014-11-08
    """
    self.activity['object']['upstreamDuplicates'] = ['existing uD']

    # silo domain is fa.ke
    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author/post/permalink"></a>
        <a class="u-syndication" href="http://fa.ke/post/url">
      </div>
    </html>""")

    self.mox.ReplayAll()
    logging.debug('Original post discovery %s -> %s', self.source, self.activity)
    original_post_discovery.discover(self.source, self.activity)

    # upstreamDuplicates = 1 original + 1 discovered
    self.assertEquals(['existing uD', 'http://author/post/permalink'],
                      self.activity['object']['upstreamDuplicates'])

    origurls = [r.original for r in SyndicatedPost.query(ancestor=self.source.key)]
    self.assertEquals([u'http://author/post/permalink'], origurls)

    # for now only syndicated posts belonging to this source are stored
    syndurls = list(r.syndication for r
                    in SyndicatedPost.query(ancestor=self.source.key))

    self.assertEquals([u'https://fa.ke/post/url'], syndurls)
  def test_multiple_refetches(self):
    """Ensure that multiple refetches of the same post (with and without
    u-syndication) do not generate duplicate blank entries in the
    database. See https://github.com/snarfed/bridgy/issues/259 for details.
    """
    self.activities[0]['object'].update({
      'content': 'post content without backlinks',
      'url': 'https://fa.ke/post/url',
    })

    hfeed = """<html class="h-feed">
    <a class="h-entry" href="/permalink"></a>
    </html>"""

    unsyndicated = """<html class="h-entry">
    <a class="u-url" href="/permalink"></a>
    </html>"""

    syndicated = """<html class="h-entry">
    <a class="u-url" href="/permalink"></a>
    <a class="u-syndication" href="https://fa.ke/post/url"></a>
    </html>"""

    # first attempt, no syndication url yet
    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', unsyndicated)

    # refetch, still no syndication url
    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', unsyndicated)

    # second refetch, has a syndication url this time
    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', syndicated)

    self.mox.ReplayAll()
    original_post_discovery.discover(self.source, self.activities[0])
    original_post_discovery.refetch(self.source)

    relations = list(
      SyndicatedPost.query(
        SyndicatedPost.original == 'http://author/permalink',
        ancestor=self.source.key).fetch())

    self.assertEquals(1, len(relations))
    self.assertEquals('http://author/permalink', relations[0].original)
    self.assertIsNone(relations[0].syndication)

    original_post_discovery.refetch(self.source)

    relations = list(
      SyndicatedPost.query(
        SyndicatedPost.original == 'http://author/permalink',
        ancestor=self.source.key).fetch())

    self.assertEquals(1, len(relations))
    self.assertEquals('http://author/permalink', relations[0].original)
    self.assertEquals('https://fa.ke/post/url', relations[0].syndication)
  def test_multiple_rel_feeds(self):
    """Make sure that we follow all rel=feed links, e.g. if notes and
    articles are in separate feeds."""

    self.expect_requests_get('http://author', """
    <html>
      <head>
        <link rel="feed" href="/articles" type="text/html">
        <link rel="feed" href="/notes" type="text/html">
      </head>
    </html>""")

    # fetches all feeds first
    self.expect_requests_get('http://author/articles', """
    <html class="h-feed">
      <article class="h-entry">
        <a class="u-url" href="/article-permalink"></a>
      </article>
    </html>""").InAnyOrder('feed')

    self.expect_requests_get('http://author/notes', """
    <html class="h-feed">
      <article class="h-entry">
        <a class="u-url" href="/note-permalink"></a>
      </article>
    </html>""").InAnyOrder('feed')

    # then the permalinks (in any order since they are hashed to
    # remove duplicates)
    self.expect_requests_get('http://author/article-permalink', """
    <html class="h-entry">
      <a class="u-url" href="/article-permalink"></a>
      <a class="u-syndication" href="https://fa.ke/article"></a>
    </html>""").InAnyOrder('permalink')

    self.expect_requests_get('http://author/note-permalink', """
    <html class="h-entry">
      <a class="u-url" href="/note-permalink"></a>
      <a class="u-syndication" href="https://fa.ke/note"></a>
    </html>""").InAnyOrder('permalink')

    self.mox.ReplayAll()
    original_post_discovery.discover(self.source, self.activity)

    note_rels = SyndicatedPost.query(
      SyndicatedPost.original == 'http://author/note-permalink',
      ancestor=self.source.key).fetch()

    self.assertEqual(1, len(note_rels))
    self.assertEqual('https://fa.ke/note', note_rels[0].syndication)

    article_rels = SyndicatedPost.query(
      SyndicatedPost.original == 'http://author/article-permalink',
      ancestor=self.source.key).fetch()

    self.assertEqual(1, len(article_rels))
    self.assertEqual('https://fa.ke/article', article_rels[0].syndication)
Example #6
  def test_query_by_original_url(self):
    """Simply testing the query helper"""
    r = SyndicatedPost.query_by_original(
        self.source, 'http://original/post/url')
    self.assertIsNotNone(r)
    self.assertEquals('http://silo/post/url', r.syndication)

    r = SyndicatedPost.query_by_original(
        self.source, 'http://original/no-syndication')
    self.assertIsNotNone(r)
    self.assertIsNone(r.syndication)
def _posse_post_discovery(source, activity, syndication_url, fetch_hfeed,
                          already_fetched_hfeeds):
    """Performs the actual meat of the posse-post-discover.

  Args:
    source: :class:`models.Source` subclass
    activity: activity dict
    syndication_url: url of the syndicated copy for which we are
      trying to find an original
    fetch_hfeed: boolean, whether or not to fetch and parse the
      author's feed if we don't have a previously stored
      relationship
    already_fetched_hfeeds: set, URLs we've already fetched in a
      previous iteration

  Returns:
    sequence of string original post urls, possibly empty
  """
    logging.info('starting posse post discovery with syndicated %s',
                 syndication_url)

    relationships = SyndicatedPost.query(
        SyndicatedPost.syndication == syndication_url,
        ancestor=source.key).fetch()

    if not relationships and fetch_hfeed:
        # a syndicated post we haven't seen before! fetch the author's URLs to see
        # if we can find it.
        #
        # TODO: Consider using the actor's url, with get_author_urls() as the
        # fallback in the future to support content from non-Bridgy users.
        results = {}
        for url in _get_author_urls(source):
            if url not in already_fetched_hfeeds:
                results.update(_process_author(source, url))
                already_fetched_hfeeds.add(url)
            else:
                logging.debug('skipping %s, already fetched this round', url)

        relationships = results.get(syndication_url, [])

    if not relationships:
        # No relationships were found. Remember that we've seen this
        # syndicated post to avoid reprocessing it every time
        logging.debug('posse post discovery found no relationship for %s',
                      syndication_url)
        if fetch_hfeed:
            SyndicatedPost.insert_syndication_blank(source, syndication_url)

    originals = [r.original for r in relationships if r.original]
    if originals:
        logging.debug('posse post discovery found relationship(s) %s -> %s',
                      syndication_url, originals)
    return originals
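A minimal usage sketch for the function above. The wrapper below is an assumption for illustration, not the actual Bridgy call site; it shows how a caller such as discover() could share one already_fetched_hfeeds set across an activity's syndication URLs so each author h-feed is fetched at most once per round.

# Hypothetical wrapper (assumed, for illustration only; not the real
# discover() body). Sharing already_fetched across calls means each
# author h-feed is fetched at most once per discovery round.
def _discover_originals(source, activity, syndication_urls):
    already_fetched = set()
    originals = set()
    for url in syndication_urls:
        originals.update(_posse_post_discovery(
            source, activity, url, fetch_hfeed=True,
            already_fetched_hfeeds=already_fetched))
    return originals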
Example #9
    def test_get_or_insert_by_syndication_do_not_duplicate_blanks(self):
        """Make sure we don't insert duplicate blank entries"""

        SyndicatedPost.insert_syndication_blank(self.source,
                                                'http://silo/no-original')

        # make sure there's only one in the DB
        rs = SyndicatedPost.query(
            SyndicatedPost.syndication == 'http://silo/no-original',
            ancestor=self.source.key).fetch()

        self.assertCountEqual([None], [rel.original for rel in rs])
Example #10
  def test_get_or_insert_by_syndication_do_not_duplicate_blanks(self):
    """Make sure we don't insert duplicate blank entries"""

    SyndicatedPost.insert_syndication_blank(
      self.source, 'http://silo/no-original')

    # make sure there's only one in the DB
    rs = SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/no-original',
        ancestor=self.source.key
    ).fetch()

    self.assertItemsEqual([None], [rel.original for rel in rs])
  def test_refetch_multiple_responses_same_activity(self):
    """Ensure that refetching a post that has several replies does not
    generate duplicate original -> None blank entries in the
    database. See https://github.com/snarfed/bridgy/issues/259 for
    details
    """
    source = self.sources[0]
    source.domain_urls = ['http://author']

    for activity in self.activities:
      activity['object']['content'] = 'post content without backlinks'
      activity['object']['url'] = 'https://fa.ke/post/url'

    author_feed = """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author/post/permalink"></a>
      </div>
    </html>"""

    author_entry = """
    <html class="h-entry">
      <a class="u-url" href="http://author/post/permalink"></a>
    </html>"""

    # original
    self.expect_requests_get('http://author', author_feed)
    self.expect_requests_get('http://author/post/permalink', author_entry)
    # refetch
    self.expect_requests_get('http://author', author_feed)
    self.expect_requests_get('http://author/post/permalink', author_entry)
    self.mox.ReplayAll()

    for activity in self.activities:
      original_post_discovery.discover(source, activity)

    original_post_discovery.refetch(source)

    rels_by_original = list(
      SyndicatedPost.query(SyndicatedPost.original == 'http://author/post/permalink',
                           ancestor=source.key).fetch())

    self.assertEquals(1, len(rels_by_original))
    self.assertIsNone(rels_by_original[0].syndication)

    rels_by_syndication = list(
      SyndicatedPost.query(SyndicatedPost.syndication == 'https://fa.ke/post/url',
                           ancestor=source.key).fetch())

    self.assertEquals(1, len(rels_by_syndication))
    self.assertIsNone(rels_by_syndication[0].original)
    def test_single_post(self):
        """Test that original post discovery does the reverse lookup to scan
    the author's h-feed for rel=syndication links.
    """
        activity = self.activities[0]
        activity['object'].update({
            'content': 'post content without backlink',
            'url': 'http://fa.ke/post/url',
            'upstreamDuplicates': ['existing uD'],
        })

        # silo domain is fa.ke
        source = self.sources[0]
        source.domain_urls = ['http://author']

        self.expect_requests_get(
            'http://author', """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author/post/permalink"></a>
      </div>
    </html>""")

        # syndicated to two places
        self.expect_requests_get(
            'http://author/post/permalink', """
    <link rel="syndication" href="http://not.real/statuses/postid">
    <link rel="syndication" href="http://fa.ke/post/url">
    <div class="h-entry">
      <a class="u-url" href="http://author/post/permalink"></a>
    </div>""")

        self.mox.ReplayAll()
        logging.debug('Original post discovery %s -> %s', source, activity)
        original_post_discovery.discover(source, activity)

        # upstreamDuplicates = 1 original + 1 discovered
        self.assertEquals(['existing uD', 'http://author/post/permalink'],
                          activity['object']['upstreamDuplicates'])

        origurls = [
            r.original for r in SyndicatedPost.query(ancestor=source.key)
        ]
        self.assertEquals([u'http://author/post/permalink'], origurls)

        # for now only syndicated posts belonging to this source are stored
        syndurls = list(
            r.syndication for r in SyndicatedPost.query(ancestor=source.key))

        self.assertEquals([u'https://fa.ke/post/url'], syndurls)
def _posse_post_discovery(source, activity, syndication_url, fetch_hfeed):
  """Performs the actual meat of the posse-post-discover.

  Args:
    source: models.Source subclass
    activity: activity dict
    syndication_url: url of the syndicated copy for which we are
                     trying to find an original
    fetch_hfeed: boolean, whether or not to fetch and parse the
                 author's feed if we don't have a previously stored
                 relationship.

  Returns:
    the activity, updated with original post urls if any are found
  """
  logging.info('starting posse post discovery with syndicated %s', syndication_url)
  relationships = SyndicatedPost.query(
    SyndicatedPost.syndication == syndication_url,
    ancestor=source.key).fetch()
  if not relationships and fetch_hfeed:
    # a syndicated post we haven't seen before! fetch the author's URLs to see
    # if we can find it.
    #
    # Use source.domain_urls for now; it seems more reliable than the
    # activity.actor.url (which depends on getting the right data back from
    # various APIs). Consider using the actor's url, with domain_urls as the
    # fallback in the future to support content from non-Bridgy users.
    results = {}
    for url in source.get_author_urls():
      results.update(_process_author(source, url))
    relationships = results.get(syndication_url)

  if not relationships:
    # No relationships were found. Remember that we've seen this
    # syndicated post to avoid reprocessing it every time
    logging.debug('posse post discovery found no relationship for %s',
                  syndication_url)
    if fetch_hfeed:
      SyndicatedPost.insert_syndication_blank(source, syndication_url)
    return activity

  logging.debug('posse post discovery found relationship(s) %s -> %s',
                syndication_url,
                '; '.join(unicode(r.original) for r in relationships))

  obj = activity.get('object') or activity
  obj.setdefault('upstreamDuplicates', []).extend(
    r.original for r in relationships if r.original)

  return activity
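In this older variant the originals land on the activity itself rather than in the return value. A hedged sketch of reading them back afterward; the call site shown is assumed, not the actual discover() code:

# Assumed call site, for illustration only: the return value is the same
# activity dict, with any discovered originals appended to upstreamDuplicates.
activity = _posse_post_discovery(
    source, activity, 'https://fa.ke/post/url', fetch_hfeed=True)
obj = activity.get('object') or activity
originals = obj.get('upstreamDuplicates', [])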
Example #14
    def test_insert_no_duplicates(self):
        """Make sure we don't insert duplicate entries"""

        r = SyndicatedPost.insert(self.source, 'http://silo/post/url',
                                  'http://original/post/url')
        self.assertIsNotNone(r)
        self.assertEqual('http://original/post/url', r.original)

        # make sure there's only one in the DB
        rs = SyndicatedPost.query(
            SyndicatedPost.syndication == 'http://silo/post/url',
            SyndicatedPost.original == 'http://original/post/url',
            ancestor=self.source.key).fetch()

        self.assertEqual(1, len(rs))
def _posse_post_discovery(source, activity, author_url, syndication_url,
                          fetch_hfeed):
  """Performs the actual meat of the posse-post-discover. It was split
  out from discover() so that it can be done inside of a transaction.

  Args:
    source: models.Source subclass
    activity: activity dict
    author_url: author's url configured in their silo profile
    syndication_url: url of the syndicated copy for which we are
                     trying to find an original
    fetch_hfeed: boolean, whether or not to fetch and parse the
                 author's feed if we don't have a previously stored
                 relationship.

  Returns:
    the activity, updated with original post urls if any are found
  """
  logging.info(
      'starting posse post discovery with author %s and syndicated %s',
      author_url, syndication_url)

  relationships = SyndicatedPost.query(
    SyndicatedPost.syndication == syndication_url,
    ancestor=source.key).fetch()
  if not relationships and fetch_hfeed:
    # a syndicated post we haven't seen before! fetch the author's
    # h-feed to see if we can find it.
    results = _process_author(source, author_url)
    relationships = results.get(syndication_url)

  if not relationships:
    # No relationships were found. Remember that we've seen this
    # syndicated post to avoid reprocessing it every time
    logging.debug('posse post discovery found no relationship for %s',
                  syndication_url)
    SyndicatedPost.insert_syndication_blank(source, syndication_url)
    return activity

  logging.debug('posse post discovery found relationship(s) %s -> %s',
                syndication_url,
                '; '.join(str(r.original) for r in relationships))

  obj = activity.get('object') or activity
  obj.setdefault('upstreamDuplicates', []).extend(
    r.original for r in relationships if r.original)

  return activity
Example #16
  def test_discover_url_site_post_syndication_links(self):
    self.expect_requests_get('http://si.te/123', """
<div class="h-entry">
  foo
  <a class="u-syndication" href="http://fa.ke/222"></a>
  <a class="u-syndication" href="http://other/silo"></a>
  <a class="u-syndication" href="http://fa.ke/post/444"></a>
</div>""")
    self.mox.ReplayAll()

    self.assertEqual(0, SyndicatedPost.query().count())
    self.check_discover('http://si.te/123',
        'Discovering now. Refresh in a minute to see the results!')

    self.assertItemsEqual([
      {'https://fa.ke/222': 'http://si.te/123'},
      {'https://fa.ke/post/444': 'http://si.te/123'},
      ], [{sp.syndication: sp.original} for sp in models.SyndicatedPost.query()])

    tasks = self.taskqueue_stub.GetTasks('discover')
    key = self.source.key.urlsafe()
    self.assertEqual([
      {'source_key': key, 'post_id': '222'},
      {'source_key': key, 'post_id': '444'},
    ], [testutil.get_task_params(task) for task in tasks])

    now = util.now_fn()
    source = self.source.key.get()
    self.assertEqual(now, source.last_syndication_url)
Example #17
  def test_insert_no_duplicates(self):
    """Make sure we don't insert duplicate entries"""

    r = SyndicatedPost.insert(
      self.source, 'http://silo/post/url', 'http://original/post/url')
    self.assertIsNotNone(r)
    self.assertEqual('http://original/post/url', r.original)

    # make sure there's only one in the DB
    rs = SyndicatedPost.query(
      SyndicatedPost.syndication == 'http://silo/post/url',
      SyndicatedPost.original == 'http://original/post/url',
      ancestor=self.source.key
    ).fetch()

    self.assertEqual(1, len(rs))
Example #18
    def test_discover_url_site_post_syndication_links(self):
        self.expect_requests_get(
            'http://si.te/123', """
<div class="h-entry">
  foo
  <a class="u-syndication" href="http://fa.ke/222"></a>
  <a class="u-syndication" href="http://other/silo"></a>
  <a class="u-syndication" href="http://fa.ke/post/444"></a>
</div>""")

        self.expect_task('discover', source_key=self.source, post_id='222')
        self.expect_task('discover', source_key=self.source, post_id='444')
        self.mox.ReplayAll()

        self.assertEqual(0, SyndicatedPost.query().count())
        self.check_discover(
            'http://si.te/123',
            'Discovering now. Refresh in a minute to see the results!')

        self.assertCountEqual([
            {
                'https://fa.ke/222': 'http://si.te/123'
            },
            {
                'https://fa.ke/post/444': 'http://si.te/123'
            },
        ], [{
            sp.syndication: sp.original
        } for sp in models.SyndicatedPost.query()])

        now = util.now_fn()
        source = self.source.key.get()
        self.assertEqual(now, source.last_syndication_url)
def _process_syndication_urls(source, permalink, syndication_urls):
  """Process a list of syndication URLs looking for one that matches the
  current source.  If one is found, stores a new SyndicatedPost in the
  db.

  Args:
    source: a models.Source subclass
    permalink: a string. the current h-entry permalink
    syndication_urls: a collection of strings. the unfiltered list
      of syndication urls
  """

  results = {}
  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  for syndication_url in syndication_urls:
    # follow redirects to give us the canonical syndication url --
    # gives the best chance of finding a match.
    syndication_url = util.follow_redirects(syndication_url).url
    # source-specific logic to standardize the URL. (e.g., replace facebook
    # username with numeric id)
    syndication_url = source.canonicalize_syndication_url(syndication_url)
    # check that the syndicated url belongs to this source.
    # TODO: save future lookups by saving results for other sources too
    # (note: query the appropriate source subclass by author.domains,
    # rather than author.domain_urls)
    if util.domain_from_link(syndication_url) == source.AS_CLASS.DOMAIN:
      logging.debug('saving discovered relationship %s -> %s',
                    syndication_url, permalink)
      relationship = SyndicatedPost.insert(
        source, syndication=syndication_url, original=permalink)
      results.setdefault(syndication_url, []).append(relationship)
  return results
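For reference, a sketch of the mapping this returns. The example values and the fa.ke silo domain are assumptions borrowed from the tests above:

# Assumed example values, mirroring the tests above: the non-silo link is
# filtered out, and the canonicalized silo URL keys the result map.
results = _process_syndication_urls(
    source, 'http://author/post/permalink',
    ['http://fa.ke/post/url', 'http://other/silo'])
# results == {'https://fa.ke/post/url': [SyndicatedPost(
#     original='http://author/post/permalink',
#     syndication='https://fa.ke/post/url')]}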
  def test_do_not_fetch_hfeed(self):
    """Confirms behavior of discover() when fetch_hfeed=False.
    Discovery should only check the database for previously discovered matches.
    It should not make any GET requests.
    """
    discover(self.source, self.activity, fetch_hfeed=False)
    self.assertFalse(SyndicatedPost.query(ancestor=self.source.key).get())
  def test_no_h_entries(self):
    """Make sure nothing bad happens when fetching a feed without
    h-entries
    """
    activity = self.activities[0]
    activity['object']['content'] = 'post content without backlink'
    activity['object']['url'] = 'https://fa.ke/post/url'

    # silo domain is fa.ke
    source = self.sources[0]
    source.domain_urls = ['http://author']

    self.expect_requests_get('http://author', """
    <html class="h-feed">
    <p>under construction</p>
    </html>""")

    self.mox.ReplayAll()
    logging.debug('Original post discovery %s -> %s', source, activity)
    original_post_discovery.discover(source, activity)

    self.assert_equals(
      [(None, 'https://fa.ke/post/url')],
      [(relationship.original, relationship.syndication)
       for relationship in SyndicatedPost.query(ancestor=source.key)])
  def _test_failed_post_permalink_fetch(self, raise_exception):
    """Make sure something reasonable happens when we're unable to fetch
    the permalink of an entry linked in the h-feed
    """
    source = self.sources[0]
    source.domain_urls = ['http://author']
    activity = self.activities[0]
    activity['object']['url'] = 'https://fa.ke/post/url'
    activity['object']['content'] = 'content without links'

    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <article class="h-entry">
        <a class="u-url" href="nonexistent.html"></a>
      </article>
    </html>
    """)

    if raise_exception:
      self.expect_requests_get('http://author/nonexistent.html').AndRaise(HTTPError())
    else:
      self.expect_requests_get('http://author/nonexistent.html', status_code=410)

    self.mox.ReplayAll()
    original_post_discovery.discover(source, activity)

    # we should have saved placeholders to prevent us from trying the
    # syndication url or permalink again
    self.assert_equals(
      set([('http://author/nonexistent.html', None), (None, 'https://fa.ke/post/url')]),
      set((relationship.original, relationship.syndication)
          for relationship in SyndicatedPost.query(ancestor=source.key)))
Example #24
  def test_get_or_insert_by_syndication_replace(self):
    """Make sure we replace original=None with original=something
    when it is discovered"""
    r = SyndicatedPost.get_or_insert_by_syndication_url(
        self.source, 'http://silo/no-original',
        'http://original/newly-discovered')
    self.assertIsNotNone(r)
    self.assertEquals('http://original/newly-discovered', r.original)

    # make sure it's in NDB
    rs = SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/no-original',
        ancestor=self.source.key
    ).fetch()
    self.assertEquals(1, len(rs))
    self.assertEquals('http://original/newly-discovered', rs[0].original)
    self.assertEquals('http://silo/no-original', rs[0].syndication)
  def test_refetch_unchanged_syndication(self):
    """We should preserve unchanged SyndicatedPosts during refetches."""
    synd = SyndicatedPost(parent=self.source.key,
                          original='http://author/permalink',
                          syndication='https://fa.ke/post/url')
    synd.put()
    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="/permalink"></a>
        <a class="u-syndication" href="https://fa.ke/post/url"></a>
      </div>
    </html>""")

    self.mox.ReplayAll()
    refetch(self.source)
    self.assert_entities_equal([synd], list(SyndicatedPost.query()))
  def test_no_author_url(self):
    """Make sure something reasonable happens when the author doesn't have
    a url at all.
    """
    self.source.domain_urls = []
    discover(self.source, self.activity)
    # nothing attempted, and no SyndicatedPost saved
    self.assertFalse(SyndicatedPost.query(ancestor=self.source.key).get())
Example #27
  def test_get_or_save_restart_existing_new_synd_url(self):
    source = self.sources[0]
    response = self.responses[0]
    response.put()

    # new syndication URL. should add two unsent URLs.
    synd = source.canonicalize_url(self.activities[0]['url'])
    SyndicatedPost(parent=source.key, original='http://or/ig',
                   syndication=synd).put()
    SyndicatedPost(parent=source.key, original=None,
                   syndication=synd).put()  # check that we don't die on blanks

    self.expect_task('propagate', response_key=response)
    self.mox.ReplayAll()

    final = response.get_or_save(source, restart=True)
    self.assert_equals(['http://or/ig', 'http://target1/post/url'], final.unsent)
  def test_merge_front_page_and_h_feed(self):
    """Make sure we are correctly merging the front page and rel-feed by
    checking that we visit h-entries that appear only on the front page
    or only on the rel-feed page.
    """
    activity = self.activities[0]
    activity['object'].update({
        'content': 'post content without backlink',
        'url': 'https://fa.ke/post/url',
        'upstreamDuplicates': ['existing uD'],
    })

    # silo domain is fa.ke
    source = self.sources[0]
    source.domain_urls = ['http://author']

    self.expect_requests_get('http://author', """
    <link rel="feed" href="/feed">
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author/only-on-frontpage"></a>
      </div>
      <div class="h-entry">
        <a class="u-url" href="http://author/on-both"></a>
      </div>
    </html>""")

    self.expect_requests_get('http://author/feed', """
    <link rel="feed" href="/feed">
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author/on-both"></a>
      </div>
      <div class="h-entry">
        <a class="u-url" href="http://author/only-on-feed"></a>
      </div>
    </html>""")

    for orig in ('/only-on-frontpage', '/on-both', '/only-on-feed'):
      self.expect_requests_get('http://author%s' % orig,
                               """<div class="h-entry">
                                 <a class="u-url" href="%s"></a>
                               </div>""" % orig).InAnyOrder()

    self.mox.ReplayAll()
    logging.debug('Original post discovery %s -> %s', source, activity)
    original_post_discovery.discover(source, activity)

    # should be three blank SyndicatedPosts now
    for orig in ('http://author/only-on-frontpage',
                 'http://author/on-both',
                 'http://author/only-on-feed'):
      logging.debug('checking %s', orig)
      sp = SyndicatedPost.query(
        SyndicatedPost.original == orig,
        ancestor=source.key).get()
      self.assertTrue(sp)
      self.assertIsNone(sp.syndication)
Example #29
    def test_insert_auguments_existing(self):
        """Make sure we add newly discovered urls for a given syndication url,
    rather than overwrite them
    """
        r = SyndicatedPost.insert(self.source, 'http://silo/post/url',
                                  'http://original/different/url')
        self.assertIsNotNone(r)
        self.assertEqual('http://original/different/url', r.original)

        # make sure they're both in the DB
        rs = SyndicatedPost.query(
            SyndicatedPost.syndication == 'http://silo/post/url',
            ancestor=self.source.key).fetch()

        self.assertCountEqual([
            'http://original/post/url', 'http://original/another/post',
            'http://original/different/url'
        ], [rel.original for rel in rs])
Example #30
  def test_get_or_insert_by_syndication_do_not_replace(self):
    """Make sure we don't replace original=something with
    original=something else (in practice, that would mean another task
    is running discovery concurrently and found a different url)
    """
    r = SyndicatedPost.get_or_insert_by_syndication_url(
        self.source, 'http://silo/post/url',
        'http://original/different/url')
    self.assertIsNotNone(r)
    self.assertEquals('http://original/post/url', r.original)

    # make sure it's unchanged in NDB
    rs = SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/post/url',
        ancestor=self.source.key
    ).fetch()

    self.assertEquals(1, len(rs))
    self.assertEquals('http://original/post/url', rs[0].original)
    self.assertEquals('http://silo/post/url', rs[0].syndication)
  def test_refetch_blank_syndication(self):
    """We should preserve blank SyndicatedPosts during refetches."""
    blank = SyndicatedPost(parent=self.source.key,
                           original='http://author/permalink',
                           syndication=None)
    blank.put()
    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="/permalink"></a>
      </div>
    </html>""")
    self.expect_requests_get('http://author/permalink', """
      <html class="h-entry">
        <a class="u-url" href="/permalink"></a>
      </html>""")

    self.mox.ReplayAll()
    self.assert_equals({}, refetch(self.source))
    self.assert_syndicated_posts(('http://author/permalink', None))
Example #32
  def test_insert_auguments_existing(self):
    """Make sure we add newly discovered urls for a given syndication url,
    rather than overwrite them
    """
    r = SyndicatedPost.insert(
        self.source, 'http://silo/post/url',
        'http://original/different/url')
    self.assertIsNotNone(r)
    self.assertEquals('http://original/different/url', r.original)

    # make sure they're both in the DB
    rs = SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/post/url',
        ancestor=self.source.key
    ).fetch()

    self.assertItemsEqual(['http://original/post/url',
                           'http://original/another/post',
                           'http://original/different/url'],
                          [rel.original for rel in rs])
  def test_no_author_url(self):
    """Make sure something reasonable happens when the author doesn't have
    a url at all.
    """
    source = self.sources[0]
    source.domain_urls = []
    activity = self.activities[0]
    activity['object']['url'] = 'https://fa.ke/post/url'
    activity['object']['content'] = 'content without links'

    self.mox.ReplayAll()
    original_post_discovery.discover(source, activity)

    # nothing attempted, and no SyndicatedPost saved
    self.assertFalse(SyndicatedPost.query(ancestor=source.key).get())
Example #34
  def test_retry(self):
    self.assertEqual([], self.taskqueue_stub.GetTasks('propagate'))

    source = self.sources[0]
    source.domain_urls = ['http://orig']
    source.last_hfeed_refetch = last_hfeed_refetch = \
        testutil.NOW - datetime.timedelta(minutes=1)
    source.put()

    resp = self.responses[0]
    resp.status = 'complete'
    resp.unsent = ['http://unsent']
    resp.sent = ['http://sent']
    resp.error = ['http://error']
    resp.failed = ['http://failed']
    resp.skipped = ['https://skipped']

    # SyndicatedPost with new target URLs
    resp.activities_json = [
      json.dumps({'object': {'url': 'https://fa.ke/1'}}),
      json.dumps({'url': 'https://fa.ke/2', 'object': {'unused': 'ok'}}),
      json.dumps({'url': 'https://fa.ke/3'}),
    ]
    resp.put()
    SyndicatedPost.insert(source, 'https://fa.ke/1', 'https://orig/1')
    SyndicatedPost.insert(source, 'https://fa.ke/2', 'http://orig/2')
    SyndicatedPost.insert(source, 'https://fa.ke/3', 'http://orig/3')

    # cached webmention endpoint
    memcache.set('W https skipped /', 'asdf')

    key = resp.key.urlsafe()
    response = app.application.get_response(
      '/retry', method='POST', body=native_str(urllib.parse.urlencode({'key': key})))
    self.assertEquals(302, response.status_int)
    self.assertEquals(source.bridgy_url(self.handler),
                      response.headers['Location'].split('#')[0])
    params = testutil.get_task_params(self.taskqueue_stub.GetTasks('propagate')[0])
    self.assertEqual(key, params['response_key'])

    # status and URLs should be refreshed
    got = resp.key.get()
    self.assertEqual('new', got.status)
    self.assertItemsEqual(
      ['http://unsent/', 'http://sent/', 'https://skipped/', 'http://error/',
       'http://failed/', 'https://orig/1', 'http://orig/2', 'http://orig/3'],
      got.unsent)
    for field in got.sent, got.skipped, got.error, got.failed:
      self.assertEqual([], field)

    # webmention endpoints for URL domains should be refreshed
    self.assertIsNone(memcache.get('W https skipped /'))

    # shouldn't have refetched h-feed
    self.assertEqual(last_hfeed_refetch, source.key.get().last_hfeed_refetch)
Example #35
    def test_insert_replaces_blanks(self):
        """Make sure we replace original=None with original=something
    when it is discovered"""

        # add a blank for the original too
        SyndicatedPost.insert_original_blank(
            self.source, 'http://original/newly-discovered')

        self.assertTrue(
            SyndicatedPost.query(
                SyndicatedPost.syndication == 'http://silo/no-original',
                SyndicatedPost.original == None,
                ancestor=self.source.key).get())

        self.assertTrue(
            SyndicatedPost.query(
                SyndicatedPost.original == 'http://original/newly-discovered',
                SyndicatedPost.syndication == None,
                ancestor=self.source.key).get())

        r = SyndicatedPost.insert(self.source, 'http://silo/no-original',
                                  'http://original/newly-discovered')
        self.assertIsNotNone(r)
        self.assertEqual('http://original/newly-discovered', r.original)

        # make sure it's in NDB
        rs = SyndicatedPost.query(
            SyndicatedPost.syndication == 'http://silo/no-original',
            ancestor=self.source.key).fetch()
        self.assertEqual(1, len(rs))
        self.assertEqual('http://original/newly-discovered', rs[0].original)
        self.assertEqual('http://silo/no-original', rs[0].syndication)

        # and the blanks have been removed
        self.assertFalse(
            SyndicatedPost.query(
                SyndicatedPost.syndication == 'http://silo/no-original',
                SyndicatedPost.original == None,
                ancestor=self.source.key).get())

        self.assertFalse(
            SyndicatedPost.query(
                SyndicatedPost.original == 'http://original/newly-discovered',
                SyndicatedPost.syndication == None,
                ancestor=self.source.key).get())
Example #36
  def test_retry(self):
    self.assertEqual([], self.taskqueue_stub.GetTasks('propagate'))

    source = self.sources[0]
    source.domain_urls = ['http://orig']
    source.last_hfeed_refetch = last_hfeed_refetch = \
        testutil.NOW - datetime.timedelta(minutes=1)
    source.put()

    resp = self.responses[0]
    resp.status = 'complete'
    resp.unsent = ['http://unsent']
    resp.sent = ['http://sent']
    resp.error = ['http://error']
    resp.failed = ['http://failed']
    resp.skipped = ['https://skipped']

    # SyndicatedPost with new target URLs
    resp.activities_json = [
      json.dumps({'object': {'url': 'https://fa.ke/1'}}),
      json.dumps({'url': 'https://fa.ke/2', 'object': {'unused': 'ok'}}),
      json.dumps({'url': 'https://fa.ke/3'}),
    ]
    resp.put()
    SyndicatedPost.insert(source, 'https://fa.ke/1', 'https://orig/1')
    SyndicatedPost.insert(source, 'https://fa.ke/2', 'http://orig/2')
    SyndicatedPost.insert(source, 'https://fa.ke/3', 'http://orig/3')

    # cached webmention endpoint
    memcache.set('W https skipped /', 'asdf')

    key = resp.key.urlsafe()
    response = app.application.get_response(
      '/retry', method='POST', body=urllib.urlencode({'key': key}))
    self.assertEquals(302, response.status_int)
    self.assertEquals(source.bridgy_url(self.handler),
                      response.headers['Location'].split('#')[0])
    params = testutil.get_task_params(self.taskqueue_stub.GetTasks('propagate')[0])
    self.assertEqual(key, params['response_key'])

    # status and URLs should be refreshed
    got = resp.key.get()
    self.assertEqual('new', got.status)
    self.assertItemsEqual(
      ['http://unsent/', 'http://sent/', 'https://skipped/', 'http://error/',
       'http://failed/', 'https://orig/1', 'http://orig/2', 'http://orig/3'],
      got.unsent)
    for field in got.sent, got.skipped, got.error, got.failed:
      self.assertEqual([], field)

    # webmention endpoints for URL domains should be refreshed
    self.assertIsNone(memcache.get('W https skipped /'))

    # shouldn't have refetched h-feed
    self.assertEqual(last_hfeed_refetch, source.key.get().last_hfeed_refetch)
  def test_refetch_two_permalinks_same_syndication(self):
    """
    This causes a problem if refetch assumes that syndication-url is
    unique under a given source.
    """
    source = self.sources[0]
    source.domain_urls = ['http://author']

    self.activities[0]['object'].update({
      'content': 'post content without backlinks',
      'url': 'https://fa.ke/post/url',
    })

    hfeed = """<html class="h-feed">
    <a class="h-entry" href="/post1"></a>
    <a class="h-entry" href="/post2"></a>
    </html>"""

    self.expect_requests_get('http://author', hfeed)

    for i in range(2):
      self.expect_requests_get(
        'http://author/post%d' % (i + 1),
        """<html class="h-entry">
        <a class="u-url" href="/post%d"></a>
        <a class="u-syndication" href="https://fa.ke/post/url"></a>
        </html>""" % (i + 1))

    # refetch should only grab the feed
    self.expect_requests_get('http://author', hfeed)

    self.mox.ReplayAll()
    activity = original_post_discovery.discover(source, self.activities[0])
    self.assertItemsEqual(['http://author/post1', 'http://author/post2'],
                          activity['object'].get('upstreamDuplicates'))

    relations = SyndicatedPost.query(ancestor=source.key).fetch()
    self.assertItemsEqual([('http://author/post1', 'https://fa.ke/post/url'),
                           ('http://author/post2', 'https://fa.ke/post/url')],
                          [(relation.original, relation.syndication)
                           for relation in relations])

    # discover should have already handled all relationships, refetch should
    # not find anything
    refetch_result = original_post_discovery.refetch(source)
    self.assertFalse(refetch_result)
Example #38
  def test_insert_replaces_blanks(self):
    """Make sure we replace original=None with original=something
    when it is discovered"""

    # add a blank for the original too
    SyndicatedPost.insert_original_blank(
      self.source, 'http://original/newly-discovered')

    self.assertTrue(
      SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/no-original',
        SyndicatedPost.original == None, ancestor=self.source.key).get())

    self.assertTrue(
      SyndicatedPost.query(
        SyndicatedPost.original == 'http://original/newly-discovered',
        SyndicatedPost.syndication == None, ancestor=self.source.key).get())

    r = SyndicatedPost.insert(
        self.source, 'http://silo/no-original',
        'http://original/newly-discovered')
    self.assertIsNotNone(r)
    self.assertEquals('http://original/newly-discovered', r.original)

    # make sure it's in NDB
    rs = SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/no-original',
        ancestor=self.source.key
    ).fetch()
    self.assertEquals(1, len(rs))
    self.assertEquals('http://original/newly-discovered', rs[0].original)
    self.assertEquals('http://silo/no-original', rs[0].syndication)

    # and the blanks have been removed
    self.assertFalse(
      SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/no-original',
        SyndicatedPost.original == None, ancestor=self.source.key).get())

    self.assertFalse(
      SyndicatedPost.query(
        SyndicatedPost.original == 'http://original/newly-discovered',
        SyndicatedPost.syndication == None, ancestor=self.source.key).get())
  def test_refetch_permalink_with_two_syndications(self):
    """Test one permalink with two syndicated posts. Make sure that
    refetch doesn't have a problem with two entries for the same
    original URL.
    """
    for idx, activity in enumerate(self.activities):
      activity['object'].update({
        'content': 'post content without backlinks',
        'url': 'https://fa.ke/post/url%d' % (idx + 1),
      })

    hfeed = """<html class="h-feed">
    <a class="h-entry" href="/permalink"></a>
    </html>"""
    hentry = """<html class="h-entry">
    <a class="u-url" href="/permalink"/>
    <a class="u-syndication" href="https://fa.ke/post/url1"/>
    <a class="u-syndication" href="https://fa.ke/post/url3"/>
    <a class="u-syndication" href="https://fa.ke/post/url5"/>
    </html>"""

    self.expect_requests_get('http://author', hfeed)
    self.expect_requests_get('http://author/permalink', hentry)

    # refetch
    self.expect_requests_get('http://author', hfeed)
    # refetch grabs posts that it's seen before in case there have
    # been updates
    self.expect_requests_get('http://author/permalink', hentry)

    self.mox.ReplayAll()

    original_post_discovery.discover(self.source, self.activities[0])
    relations = SyndicatedPost.query(
      SyndicatedPost.original == 'http://author/permalink',
      ancestor=self.source.key).fetch()
    self.assertItemsEqual(
      [('http://author/permalink', 'https://fa.ke/post/url1'),
       ('http://author/permalink', 'https://fa.ke/post/url3'),
       ('http://author/permalink', 'https://fa.ke/post/url5')],
      [(r.original, r.syndication) for r in relations])

    results = original_post_discovery.refetch(self.source)
    self.assertFalse(results)
Example #40
def _process_syndication_urls(source, permalink, syndication_urls,
                              preexisting):
    """Process a list of syndication URLs looking for one that matches the
  current source. If one is found, stores a new :class:`models.SyndicatedPost`
  in the db.

  Args:
    source: a :class:`models.Source` subclass
    permalink: a string. the current h-entry permalink
    syndication_urls: a collection of strings. the unfiltered list
      of syndication urls
    preexisting: a list of previously discovered :class:`models.SyndicatedPost`\ s

  Returns:
    dict mapping string syndication url to list of :class:`models.SyndicatedPost`\ s
  """
    results = {}
    # save the results (or lack thereof) to the db, and put them in a
    # map for immediate use
    for url in syndication_urls:
        # source-specific logic to standardize the URL. (e.g., replace facebook
        # username with numeric id)
        url = source.canonicalize_url(url)
        if not url:
            continue

        # TODO: save future lookups by saving results for other sources too (note:
        # query the appropriate source subclass by author.domains, rather than
        # author.domain_urls)
        #
        # we may have already seen this relationship, save a DB lookup by
        # finding it in the preexisting list
        relationship = next(
            (sp for sp in preexisting
             if sp.syndication == url and sp.original == permalink), None)
        if not relationship:
            logging.debug('saving discovered relationship %s -> %s', url,
                          permalink)
            relationship = SyndicatedPost.insert(source,
                                                 syndication=url,
                                                 original=permalink)
        results.setdefault(url, []).append(relationship)

    return results
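A quick sketch of how the preexisting parameter avoids duplicate inserts during refetch; the query and values below are assumptions for illustration:

# Assumed usage, for illustration only: pass the SyndicatedPosts already
# loaded for this permalink so repeated refetches reuse them instead of
# re-inserting identical rows.
preexisting = SyndicatedPost.query(
    SyndicatedPost.original == 'http://author/permalink',
    ancestor=source.key).fetch()
results = _process_syndication_urls(
    source, 'http://author/permalink',
    ['https://fa.ke/post/url'], preexisting)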
  def test_refetch_changed_syndication(self):
    """Update syndication links that have changed since our last fetch."""
    SyndicatedPost(parent=self.source.key,
                   original='http://author/permalink',
                   syndication='https://fa.ke/post/url').put()
    self.expect_requests_get('http://author', """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="/permalink"></a>
        <a class="u-syndication" href="http://fa.ke/changed/url"></a>
      </div>
    </html>""")

    self.mox.ReplayAll()
    results = refetch(self.source)
    self.assert_syndicated_posts(
      ('http://author/permalink', 'https://fa.ke/changed/url'))
    self.assert_equals({'https://fa.ke/changed/url': list(SyndicatedPost.query())},
                       results)
Example #42
def _process_syndication_urls(source, permalink, syndication_urls,
                              preexisting):
  """Process a list of syndication URLs looking for one that matches the
  current source. If one is found, stores a new :class:`models.SyndicatedPost`
  in the db.

  Args:
    source: a :class:`models.Source` subclass
    permalink: a string. the current h-entry permalink
    syndication_urls: a collection of strings. the unfiltered list
      of syndication urls
    preexisting: a list of previously discovered :class:`models.SyndicatedPost`\ s

  Returns:
    dict mapping string syndication url to list of :class:`models.SyndicatedPost`\ s
  """
  results = {}
  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  for url in syndication_urls:
    # source-specific logic to standardize the URL. (e.g., replace facebook
    # username with numeric id)
    url = source.canonicalize_url(url)
    if not url:
      continue

    # TODO: save future lookups by saving results for other sources too (note:
    # query the appropriate source subclass by author.domains, rather than
    # author.domain_urls)
    #
    # we may have already seen this relationship, save a DB lookup by
    # finding it in the preexisting list
    relationship = next((sp for sp in preexisting
                         if sp.syndication == url
                         and sp.original == permalink), None)
    if not relationship:
      logging.debug('saving discovered relationship %s -> %s', url, permalink)
      relationship = SyndicatedPost.insert(
        source, syndication=url, original=permalink)
    results.setdefault(url, []).append(relationship)

  return results
Example #43
def _process_author(source, author_url, refetch=False, store_blanks=True):
    """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Returns:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
    # for now use whether the url is a valid webmention target
    # as a proxy for whether it's worth searching it.
    author_url, _, ok = util.get_webmention_target(author_url)
    if not ok:
        return {}

    try:
        logging.debug('fetching author url %s', author_url)
        author_resp = util.requests_get(author_url)
        # TODO for error codes that indicate a temporary error, should we make
        # a certain number of retries before giving up forever?
        author_resp.raise_for_status()
        author_dom = util.beautifulsoup_parse(author_resp.text)
    except AssertionError:
        raise  # for unit tests
    except BaseException:
        # TODO limit allowed failures, cache the author's h-feed url
        # or the # of times we've failed to fetch it
        logging.info('Could not fetch author url %s',
                     author_url,
                     exc_info=True)
        return {}

    feeditems = _find_feed_items(author_url, author_dom)

    # look for all other feed urls using rel='feed', type='text/html'
    feed_urls = set()
    for rel_feed_node in (author_dom.find_all('link', rel='feed') +
                          author_dom.find_all('a', rel='feed')):
        feed_url = rel_feed_node.get('href')
        if not feed_url:
            continue

        feed_url = urlparse.urljoin(author_url, feed_url)
        feed_type = rel_feed_node.get('type')
        if feed_type and feed_type != 'text/html':
            feed_ok = False
        else:
            # double check that it's text/html, not too big, etc
            feed_url, _, feed_ok = util.get_webmention_target(feed_url)

        if feed_url == author_url:
            logging.debug('author url is the feed url, ignoring')
        elif not feed_ok:
            logging.debug('skipping feed of type %s', feed_type)
        else:
            feed_urls.add(feed_url)

    for feed_url in feed_urls:
        try:
            logging.debug("fetching author's rel-feed %s", feed_url)
            feed_resp = util.requests_get(feed_url)
            feed_resp.raise_for_status()
            logging.debug("author's rel-feed fetched successfully %s",
                          feed_url)
            feeditems = _merge_hfeeds(
                feeditems, _find_feed_items(feed_url, feed_resp.text))

            domain = util.domain_from_link(feed_url)
            if source.updates is not None and domain not in source.domains:
                domains = source.updates.setdefault('domains', source.domains)
                if domain not in domains:
                    logging.info(
                        'rel-feed found new domain %s! adding to source',
                        domain)
                    domains.append(domain)

        except AssertionError:
            raise  # reraise assertions for unit tests
        except BaseException:
            logging.info('Could not fetch h-feed url %s.',
                         feed_url,
                         exc_info=True)

    # sort by dt-updated/dt-published
    def updated_or_published(item):
        props = microformats2.first_props(item.get('properties'))
        return props.get('updated') or props.get('published')

    feeditems.sort(key=updated_or_published, reverse=True)

    permalink_to_entry = collections.OrderedDict()
    for child in feeditems:
        if 'h-entry' in child['type']:
            permalinks = child['properties'].get('url', [])
            if not permalinks:
                logging.debug('ignoring h-entry with no u-url!')
            for permalink in permalinks:
                if isinstance(permalink, basestring):
                    permalink_to_entry[permalink] = child
                else:
                    logging.warning('unexpected non-string "url" property: %s',
                                    permalink)

        max = (MAX_PERMALINK_FETCHES_BETA
               if source.is_beta_user() else MAX_PERMALINK_FETCHES)
        if len(permalink_to_entry) >= max:
            logging.info('Hit cap of %d permalinks. Stopping.', max)
            break

    # query all preexisting permalinks at once, instead of once per link
    permalinks_list = list(permalink_to_entry.keys())
    # fetch the maximum allowed entries (currently 30) at a time
    preexisting_list = itertools.chain.from_iterable(
        SyndicatedPost.query(SyndicatedPost.original.IN(
            permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
                             ancestor=source.key)
        for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
    preexisting = {}
    for r in preexisting_list:
        preexisting.setdefault(r.original, []).append(r)

    results = {}
    for permalink, entry in permalink_to_entry.iteritems():
        logging.debug('processing permalink: %s', permalink)
        new_results = process_entry(source,
                                    permalink,
                                    entry,
                                    refetch,
                                    preexisting.get(permalink, []),
                                    store_blanks=store_blanks)
        for key, value in new_results.iteritems():
            results.setdefault(key, []).extend(value)

    if source.updates is not None and results:
        # keep track of the last time we've seen rel=syndication urls for
        # this author. this helps us decide whether to refetch periodically
        # and look for updates.
        # Source will be saved at the end of each round of polling
        source.updates['last_syndication_url'] = util.now_fn()

    return results
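
The batched datastore lookup in _process_author slices the permalink list into
MAX_ALLOWABLE_QUERIES-sized chunks so that each IN() filter stays under the
datastore's per-query limit (30, per the comment). A hedged sketch of just the
chunking, using a hypothetical helper rather than Bridgy's code:

def chunks(items, size):
    """Yield successive slices of at most `size` items."""
    for i in range(0, len(items), size):
        yield items[i:i + size]

permalinks = ['http://author/post/%d' % n for n in range(70)]
assert [len(batch) for batch in chunks(permalinks, 30)] == [30, 30, 10]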
Example #44
0
def process_entry(source,
                  permalink,
                  feed_entry,
                  refetch,
                  preexisting,
                  store_blanks=True):
    """Fetch and process an h-entry and save a new :class:`models.SyndicatedPost`.

  Args:
    source: a subclass of :class:`models.Source`
    permalink: url of the unprocessed post
    feed_entry: the h-feed version of the h-entry dict, often contains
      a partial version of the h-entry at the permalink
    refetch: boolean, whether to refetch and process entries we've seen before
    preexisting: list of previously discovered :class:`models.SyndicatedPost`\ s
      for this permalink
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Returns:
    a dict from syndicated url to a list of new :class:`models.SyndicatedPost`\ s
  """
    # if the post has already been processed, do not add to the results
    # since this method only returns *newly* discovered relationships.
    if preexisting:
        # when we're not refetching, bail out early. when we are refetching,
        # fall through even if the existing entries are blank; a blank entry
        # should be the one and only entry, but check all of them to be safe.
        if not refetch:
            return {}
        synds = [s.syndication for s in preexisting if s.syndication]
        if synds:
            logger.debug(
                f'previously found relationship(s) for original {permalink}: {synds}'
            )

    # first try with the h-entry from the h-feed. if we find the syndication url
    # we're looking for, we don't have to fetch the permalink
    permalink, _, type_ok = util.get_webmention_target(permalink)
    usynd = feed_entry.get('properties', {}).get('syndication', [])
    usynd_urls = {url for url in usynd if isinstance(url, str)}
    if usynd_urls:
        logger.debug(
            f'u-syndication links on the h-feed h-entry: {usynd_urls}')
    results = _process_syndication_urls(source, permalink, usynd_urls,
                                        preexisting)
    success = True

    if results:
        source.updates['last_feed_syndication_url'] = util.now_fn()
    elif not source.last_feed_syndication_url or not feed_entry:
        # fetch the full permalink page if we think it might have more details
        mf2 = None
        try:
            if type_ok:
                logger.debug(f'fetching post permalink {permalink}')
                mf2 = util.fetch_mf2(permalink)
        except AssertionError:
            raise  # for unit tests
        except BaseException:
            # TODO limit the number of allowed failures
            logger.info(f'Could not fetch permalink {permalink}',
                        exc_info=True)
            success = False

        if mf2:
            syndication_urls = set()
            relsynd = mf2['rels'].get('syndication', [])
            if relsynd:
                logger.debug(f'rel-syndication links: {relsynd}')
            syndication_urls.update(url for url in relsynd
                                    if isinstance(url, str))
            # there should only be one h-entry on a permalink page, but
            # we'll check all of them just in case.
            for hentry in (item for item in mf2['items']
                           if 'h-entry' in item['type']):
                usynd = hentry.get('properties', {}).get('syndication', [])
                if usynd:
                    logger.debug(f'u-syndication links: {usynd}')
                syndication_urls.update(url for url in usynd
                                        if isinstance(url, str))
            results = _process_syndication_urls(source, permalink,
                                                syndication_urls, preexisting)

    # detect and delete SyndicatedPosts that were removed from the site
    if success:
        result_syndposts = list(itertools.chain(*results.values()))
        for syndpost in list(preexisting):
            if syndpost.syndication and syndpost not in result_syndposts:
                logger.info(
                    f'deleting relationship that disappeared: {syndpost}')
                syndpost.key.delete()
                preexisting.remove(syndpost)

    if not results:
        logger.debug(
            f'no syndication links from {permalink} to current source {source.label()}.'
        )
        results = {}
        if store_blanks and not preexisting:
            # remember that this post doesn't have syndication links for this
            # particular source
            logger.debug(
                f'saving empty relationship so that {permalink} will not be searched again'
            )
            SyndicatedPost.insert_original_blank(source, permalink)

    # only return results that are not in the preexisting list
    new_results = {}
    for syndurl, syndposts_for_url in results.items():
        for syndpost in syndposts_for_url:
            if syndpost not in preexisting:
                new_results.setdefault(syndurl, []).append(syndpost)

    if new_results:
        logger.debug(f'discovered relationships {new_results}')
    return new_results
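
The final loop above is what makes process_entry return only *newly*
discovered relationships: anything already present in preexisting is filtered
out of the return value. A self-contained sketch, with plain strings standing
in for SyndicatedPost entities:

preexisting = ['sp-old']
results = {'http://silo/post/url': ['sp-old', 'sp-new']}

new_results = {}
for syndurl, syndposts_for_url in results.items():
    for syndpost in syndposts_for_url:
        if syndpost not in preexisting:
            new_results.setdefault(syndurl, []).append(syndpost)

assert new_results == {'http://silo/post/url': ['sp-new']}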
Example #45
0
def _process_author(source, author_url, refetch=False, store_blanks=True):
    """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Returns:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
    # for now use whether the url is a valid webmention target
    # as a proxy for whether it's worth searching it.
    author_url, _, ok = util.get_webmention_target(author_url)
    if not ok:
        return {}

    logger.debug(f'fetching author url {author_url}')
    try:
        author_mf2 = util.fetch_mf2(author_url)
    except AssertionError:
        raise  # for unit tests
    except BaseException:
        # TODO limit allowed failures, cache the author's h-feed url
        # or the # of times we've failed to fetch it
        logger.info(f'Could not fetch author url {author_url}', exc_info=True)
        return {}

    feeditems = _find_feed_items(author_mf2)

    # try rel=feeds and rel=alternates
    feed_urls = set()
    candidates = (author_mf2['rels'].get('feed', []) + [
        a.get('url') for a in author_mf2.get('alternates', [])
        if a.get('type') == MF2_HTML_MIME_TYPE
    ])
    for feed_url in candidates:
        # check that it's html, not too big, etc
        feed_url, _, feed_ok = util.get_webmention_target(feed_url)
        if feed_url == author_url:
            logger.debug('author url is the feed url, ignoring')
        elif not feed_ok:
            logger.debug("skipping feed since it's not HTML or otherwise bad")
        else:
            feed_urls.add(feed_url)

    for feed_url in feed_urls:
        try:
            logger.debug(f"fetching author's rel-feed {feed_url}")
            feed_mf2 = util.fetch_mf2(feed_url)
            feeditems = _merge_hfeeds(feeditems, _find_feed_items(feed_mf2))
            domain = util.domain_from_link(feed_url)
            if source.updates is not None and domain not in source.domains:
                domains = source.updates.setdefault('domains', source.domains)
                if domain not in domains:
                    logger.info(
                        f'rel-feed found new domain {domain}! adding to source'
                    )
                    domains.append(domain)

        except AssertionError:
            raise  # reraise assertions for unit tests
        except BaseException:
            logger.info(f'Could not fetch h-feed url {feed_url}.',
                        exc_info=True)

    # sort by dt-updated/dt-published
    def updated_or_published(item):
        props = microformats2.first_props(item.get('properties'))
        return props.get('updated') or props.get('published') or ''

    feeditems.sort(key=updated_or_published, reverse=True)
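
    # Note: the `or ''` fallback above is load-bearing in Python 3, where
    # sorting None against str raises TypeError when an entry lacks both
    # dt-updated and dt-published. A minimal stand-in demonstration:
    #   key = lambda p: p.get('updated') or p.get('published') or ''
    #   sorted([{'published': '2014'}, {}], key=key)  # drop `or ''`: raises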

    permalink_to_entry = collections.OrderedDict()
    for child in feeditems:
        if 'h-entry' in child['type']:
            permalinks = child['properties'].get('url', [])
            if not permalinks:
                logger.debug('ignoring h-entry with no u-url!')
            for permalink in permalinks:
                if isinstance(permalink, str):
                    permalink_to_entry[permalink] = child
                else:
                    logger.warning(
                        f'unexpected non-string "url" property: {permalink}')

        max = (MAX_PERMALINK_FETCHES_BETA
               if source.is_beta_user() else MAX_PERMALINK_FETCHES)
        if len(permalink_to_entry) >= max:
            logger.info(f'Hit cap of {max} permalinks. Stopping.')
            break

    # query all preexisting permalinks at once, instead of once per link
    permalinks_list = list(permalink_to_entry.keys())
    # fetch the maximum allowed entries (currently 30) at a time
    preexisting_list = itertools.chain.from_iterable(
        SyndicatedPost.query(SyndicatedPost.original.IN(
            permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
                             ancestor=source.key)
        for i in range(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
    preexisting = {}
    for r in preexisting_list:
        preexisting.setdefault(r.original, []).append(r)

    results = {}
    for permalink, entry in permalink_to_entry.items():
        logger.debug(f'processing permalink: {permalink}')
        new_results = process_entry(source,
                                    permalink,
                                    entry,
                                    refetch,
                                    preexisting.get(permalink, []),
                                    store_blanks=store_blanks)
        for key, value in new_results.items():
            results.setdefault(key, []).extend(value)

    if source.updates is not None and results:
        # keep track of the last time we've seen rel=syndication urls for
        # this author. this helps us decide whether to refetch periodically
        # and look for updates.
        # Source will be saved at the end of each round of polling
        source.updates['last_syndication_url'] = util.now_fn()

    return results
Example #46
0
def _process_entry(source,
                   permalink,
                   feed_entry,
                   refetch,
                   preexisting,
                   store_blanks=True):
    """Fetch and process an h-entry, saving a new SyndicatedPost to the
  DB if successful.

  Args:
    source: a subclass of models.Source
    permalink: url of the unprocessed post
    feed_entry: the h-feed version of the h-entry dict, often contains
      a partial version of the h-entry at the permalink
    refetch: boolean, whether to refetch and process entries we've seen before
    preexisting: a list of previously discovered models.SyndicatedPosts
      for this permalink
    store_blanks: boolean, whether we should store blank SyndicatedPosts when
      we don't find a relationship

  Returns:
    a dict from syndicated url to a list of new models.SyndicatedPosts
  """
    # if the post has already been processed, do not add to the results
    # since this method only returns *newly* discovered relationships.
    if preexisting:
        # when we're not refetching, bail out early. when we are refetching,
        # fall through even if the existing entries are blank; a blank entry
        # should be the one and only entry, but check all of them to be safe.
        if not refetch:
            return {}
        synds = [s.syndication for s in preexisting if s.syndication]
        if synds:
            logging.debug(
                'previously found relationship(s) for original %s: %s',
                permalink, synds)

    # first try with the h-entry from the h-feed. if we find the syndication url
    # we're looking for, we don't have to fetch the permalink
    permalink, _, type_ok = util.get_webmention_target(permalink)
    usynd = feed_entry.get('properties', {}).get('syndication', [])
    if usynd:
        logging.debug('u-syndication links on the h-feed h-entry: %s', usynd)
    results = _process_syndication_urls(
        source, permalink,
        set(url for url in usynd if isinstance(url, basestring)), preexisting)
    success = True

    if results:
        source.updates['last_feed_syndication_url'] = util.now_fn()
    elif not source.last_feed_syndication_url:
        # fetch the full permalink page if we think it might have more details
        parsed = None
        try:
            logging.debug('fetching post permalink %s', permalink)
            if type_ok:
                resp = util.requests_get(permalink)
                resp.raise_for_status()
                parsed = util.mf2py_parse(resp.text, permalink)
        except AssertionError:
            raise  # for unit tests
        except BaseException:
            # TODO limit the number of allowed failures
            logging.warning('Could not fetch permalink %s',
                            permalink,
                            exc_info=True)
            success = False

        if parsed:
            syndication_urls = set()
            relsynd = parsed.get('rels').get('syndication', [])
            if relsynd:
                logging.debug('rel-syndication links: %s', relsynd)
            syndication_urls.update(url for url in relsynd
                                    if isinstance(url, basestring))
            # there should only be one h-entry on a permalink page, but
            # we'll check all of them just in case.
            for hentry in (item for item in parsed['items']
                           if 'h-entry' in item['type']):
                usynd = hentry.get('properties', {}).get('syndication', [])
                if usynd:
                    logging.debug('u-syndication links: %s', usynd)
                syndication_urls.update(url for url in usynd
                                        if isinstance(url, basestring))
            results = _process_syndication_urls(source, permalink,
                                                syndication_urls, preexisting)

    # detect and delete SyndicatedPosts that were removed from the site
    if success:
        result_syndposts = list(itertools.chain(*results.values()))
        for syndpost in list(preexisting):
            if syndpost.syndication and syndpost not in result_syndposts:
                logging.info('deleting relationship that disappeared: %s',
                             syndpost)
                syndpost.key.delete()
                preexisting.remove(syndpost)

    if not results:
        logging.debug('no syndication links from %s to current source %s.',
                      permalink, source.label())
        results = {}
        if store_blanks and not preexisting:
            # remember that this post doesn't have syndication links for this
            # particular source
            logging.debug(
                'saving empty relationship so that %s will not be '
                'searched again', permalink)
            SyndicatedPost.insert_original_blank(source, permalink)

    # only return results that are not in the preexisting list
    new_results = {}
    for syndurl, syndposts_for_url in results.iteritems():
        for syndpost in syndposts_for_url:
            if syndpost not in preexisting:
                new_results.setdefault(syndurl, []).append(syndpost)

    if new_results:
        logging.debug('discovered relationships %s', new_results)
    return new_results
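
Both list() calls in the deletion pass above matter: copying preexisting
allows safe removal while iterating, and materializing the chain avoids
iterator exhaustion. A bare chain iterator is consumed by the first
membership test, so later loop iterations would see it as empty and wrongly
delete relationships that are still present. A quick demonstration:

import itertools

exhausted = itertools.chain(['a'], ['b'])
assert 'a' in exhausted      # consumes the iterator up to and including 'a'
assert 'a' not in exhausted  # 'a' is gone; repeated tests silently fail

materialized = list(itertools.chain(['a'], ['b']))
assert 'a' in materialized
assert 'a' in materialized   # a list can be re-tested safely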
Example #47
0
    def test_retry(self):
        source = self.sources[0]
        source.domain_urls = ['http://orig']
        source.last_hfeed_refetch = last_hfeed_refetch = testutil.NOW - timedelta(
            minutes=1)
        source.put()

        resp = self.responses[0]
        resp.status = 'complete'
        resp.unsent = ['http://unsent']
        resp.sent = ['http://sent']
        resp.error = ['http://error']
        resp.failed = ['http://failed']
        resp.skipped = ['https://skipped']

        # SyndicatedPost with new target URLs
        resp.activities_json = [
            json_dumps({'object': {
                'url': 'https://fa.ke/1'
            }}),
            json_dumps({
                'url': 'https://fa.ke/2',
                'object': {
                    'unused': 'ok'
                }
            }),
            json_dumps({'url': 'https://fa.ke/3'}),
        ]
        resp.put()
        SyndicatedPost.insert(source, 'https://fa.ke/1', 'https://orig/1')
        SyndicatedPost.insert(source, 'https://fa.ke/2', 'http://orig/2')
        SyndicatedPost.insert(source, 'https://fa.ke/3', 'http://orig/3')

        key = resp.key.urlsafe().decode()
        self.expect_task('propagate', response_key=key)
        self.mox.ReplayAll()

        # cached webmention endpoint
        util.webmention_endpoint_cache['W https skipped /'] = 'asdf'

        response = self.client.post('/retry', data={'key': key})
        self.assertEqual(302, response.status_code)
        self.assertEqual(self.source_bridgy_url, response.headers['Location'])

        # status and URLs should be refreshed
        got = resp.key.get()
        self.assertEqual('new', got.status)
        self.assertCountEqual([
            'http://unsent/', 'http://sent/', 'https://skipped/',
            'http://error/', 'http://failed/', 'https://orig/1',
            'http://orig/2', 'http://orig/3'
        ], got.unsent)
        for field in got.sent, got.skipped, got.error, got.failed:
            self.assertEqual([], field)

        # webmention endpoints for URL domains should be refreshed
        self.assertNotIn('W https skipped /', util.webmention_endpoint_cache)

        # shouldn't have refetched h-feed
        self.assertEqual(last_hfeed_refetch,
                         source.key.get().last_hfeed_refetch)
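
For reference, the consolidation this test expects from /retry can be
sketched without the app: every sent/skipped/error/failed URL moves back to
unsent, along with the originals recorded via SyndicatedPost. The plain-dict
stand-ins below are illustrative, not the actual handler:

resp = {'unsent': ['http://unsent/'], 'sent': ['http://sent/'],
        'error': ['http://error/'], 'failed': ['http://failed/'],
        'skipped': ['https://skipped/']}
originals = ['https://orig/1', 'http://orig/2', 'http://orig/3']

unsent = sorted(set(resp['unsent'] + resp['sent'] + resp['error'] +
                    resp['failed'] + resp['skipped'] + originals))
resp.update(unsent=unsent, sent=[], error=[], failed=[], skipped=[])
assert len(resp['unsent']) == 8
assert resp['sent'] == resp['skipped'] == []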