def test_get_or_save_restart(self): source = self.sources[0] response = self.responses[0] # new. should add one propagate task total. saved = response.get_or_save(source, restart=True) self.assert_propagate_task() # existing. should add one more propagate task. saved.get_or_save(source, restart=True) self.assert_propagate_task() # new syndication URL. should add two propagate tasks. synd = source.canonicalize_url(self.activities[0]['url']) SyndicatedPost(parent=source.key, original='http://or/ig', syndication=synd).put() SyndicatedPost( parent=source.key, original=None, syndication=synd).put() # check that we don't die on blanks final = response.get_or_save(source, restart=True) self.assert_propagate_task() self.assert_equals(['http://or/ig', 'http://target1/post/url'], final.unsent) # no activity URLs. should skip SyndicatedPost query. response.activities_json = [] response.put() response.get_or_save(source, restart=True) self.assert_propagate_task()
def setUp(self): super().setUp() self.source = FakeSource.new() self.source.put() self.relationships = [] self.relationships.append( SyndicatedPost(parent=self.source.key, original='http://original/post/url', syndication='http://silo/post/url')) # two syndication for the same original self.relationships.append( SyndicatedPost(parent=self.source.key, original='http://original/post/url', syndication='http://silo/another/url')) # two originals for the same syndication self.relationships.append( SyndicatedPost(parent=self.source.key, original='http://original/another/post', syndication='http://silo/post/url')) self.relationships.append( SyndicatedPost(parent=self.source.key, original=None, syndication='http://silo/no-original')) self.relationships.append( SyndicatedPost(parent=self.source.key, original='http://original/no-syndication', syndication=None)) for r in self.relationships: r.put()
def test_syndication_url_in_hfeed(self): """Like test_single_post, but because the syndication URL is given in the h-feed we skip fetching the permalink. New behavior as of 2014-11-08 """ self.activity['object']['upstreamDuplicates'] = ['existing uD'] # silo domain is fa.ke self.expect_requests_get('http://author', """ <html class="h-feed"> <div class="h-entry"> <a class="u-url" href="http://author/post/permalink"></a> <a class="u-syndication" href="http://fa.ke/post/url"> </div> </html>""") self.mox.ReplayAll() logging.debug('Original post discovery %s -> %s', self.source, self.activity) original_post_discovery.discover(self.source, self.activity) # upstreamDuplicates = 1 original + 1 discovered self.assertEquals(['existing uD', 'http://author/post/permalink'], self.activity['object']['upstreamDuplicates']) origurls = [r.original for r in SyndicatedPost.query(ancestor=self.source.key)] self.assertEquals([u'http://author/post/permalink'], origurls) # for now only syndicated posts belonging to this source are stored syndurls = list(r.syndication for r in SyndicatedPost.query(ancestor=self.source.key)) self.assertEquals([u'https://fa.ke/post/url'], syndurls)
def test_multiple_refetches(self): """Ensure that multiple refetches of the same post (with and without u-syndication) does not generate duplicate blank entries in the database. See https://github.com/snarfed/bridgy/issues/259 for details """ self.activities[0]['object'].update({ 'content': 'post content without backlinks', 'url': 'https://fa.ke/post/url', }) hfeed = """<html class="h-feed"> <a class="h-entry" href="/permalink"></a> </html>""" unsyndicated = """<html class="h-entry"> <a class="u-url" href="/permalink"></a> </html>""" syndicated = """<html class="h-entry"> <a class="u-url" href="/permalink"></a> <a class="u-syndication" href="https://fa.ke/post/url"></a> </html>""" # first attempt, no syndication url yet self.expect_requests_get('http://author', hfeed) self.expect_requests_get('http://author/permalink', unsyndicated) # refetch, still no syndication url self.expect_requests_get('http://author', hfeed) self.expect_requests_get('http://author/permalink', unsyndicated) # second refetch, has a syndication url this time self.expect_requests_get('http://author', hfeed) self.expect_requests_get('http://author/permalink', syndicated) self.mox.ReplayAll() original_post_discovery.discover(self.source, self.activities[0]) original_post_discovery.refetch(self.source) relations = list( SyndicatedPost.query( SyndicatedPost.original == 'http://author/permalink', ancestor=self.source.key).fetch()) self.assertEquals(1, len(relations)) self.assertEquals('http://author/permalink', relations[0].original) self.assertIsNone(relations[0].syndication) original_post_discovery.refetch(self.source) relations = list( SyndicatedPost.query( SyndicatedPost.original == 'http://author/permalink', ancestor=self.source.key).fetch()) self.assertEquals(1, len(relations)) self.assertEquals('http://author/permalink', relations[0].original) self.assertEquals('https://fa.ke/post/url', relations[0].syndication)
def test_multiple_rel_feeds(self): """Make sure that we follow all rel=feed links, e.g. if notes and articles are in separate feeds.""" self.expect_requests_get('http://author', """ <html> <head> <link rel="feed" href="/articles" type="text/html"> <link rel="feed" href="/notes" type="text/html"> </head> </html>""") # fetches all feeds first self.expect_requests_get('http://author/articles', """ <html class="h-feed"> <article class="h-entry"> <a class="u-url" href="/article-permalink"></a> </article> </html>""").InAnyOrder('feed') self.expect_requests_get('http://author/notes', """ <html class="h-feed"> <article class="h-entry"> <a class="u-url" href="/note-permalink"></a> </article> </html>""").InAnyOrder('feed') # then the permalinks (in any order since they are hashed to # remove duplicates) self.expect_requests_get('http://author/article-permalink', """ <html class="h-entry"> <a class="u-url" href="/article-permalink"></a> <a class="u-syndication" href="https://fa.ke/article"></a> </html>""").InAnyOrder('permalink') self.expect_requests_get('http://author/note-permalink', """ <html class="h-entry"> <a class="u-url" href="/note-permalink"></a> <a class="u-syndication" href="https://fa.ke/note"></a> </html>""").InAnyOrder('permalink') self.mox.ReplayAll() original_post_discovery.discover(self.source, self.activity) note_rels = SyndicatedPost.query( SyndicatedPost.original == 'http://author/note-permalink', ancestor=self.source.key).fetch() self.assertEqual(1, len(note_rels)) self.assertEqual('https://fa.ke/note', note_rels[0].syndication) article_rels = SyndicatedPost.query( SyndicatedPost.original == 'http://author/article-permalink', ancestor=self.source.key).fetch() self.assertEqual(1, len(article_rels)) self.assertEqual('https://fa.ke/article', article_rels[0].syndication)
def test_query_by_original_url(self): """Simply testing the query helper""" r = SyndicatedPost.query_by_original( self.source, 'http://original/post/url') self.assertIsNotNone(r) self.assertEquals('http://silo/post/url', r.syndication) r = SyndicatedPost.query_by_original( self.source, 'http://original/no-syndication') self.assertIsNotNone(r) self.assertIsNone(r.syndication)
def _posse_post_discovery(source, activity, syndication_url, fetch_hfeed,
                          already_fetched_hfeeds):
  """Performs the actual meat of the posse-post-discovery.

  Args:
    source: :class:`models.Source` subclass
    activity: activity dict
    syndication_url: url of the syndicated copy for which we are trying
      to find an original
    fetch_hfeed: boolean, whether or not to fetch and parse the author's
      feed if we don't have a previously stored relationship
    already_fetched_hfeeds: set, URLs we've already fetched in a previous
      iteration

  Return:
    sequence of string original post urls, possibly empty
  """
  logging.info('starting posse post discovery with syndicated %s',
               syndication_url)

  relationships = SyndicatedPost.query(
    SyndicatedPost.syndication == syndication_url,
    ancestor=source.key).fetch()

  if not relationships and fetch_hfeed:
    # a syndicated post we haven't seen before! fetch the author's URLs to see
    # if we can find it.
    #
    # TODO: Consider using the actor's url, with get_author_urls() as the
    # fallback in the future to support content from non-Bridgy users.
    results = {}
    for url in _get_author_urls(source):
      if url not in already_fetched_hfeeds:
        results.update(_process_author(source, url))
        already_fetched_hfeeds.add(url)
      else:
        logging.debug('skipping %s, already fetched this round', url)

    relationships = results.get(syndication_url, [])

  if not relationships:
    # No relationships were found. Remember that we've seen this
    # syndicated post to avoid reprocessing it every time
    logging.debug('posse post discovery found no relationship for %s',
                  syndication_url)
    if fetch_hfeed:
      SyndicatedPost.insert_syndication_blank(source, syndication_url)

  originals = [r.original for r in relationships if r.original]
  if originals:
    logging.debug('posse post discovery found relationship(s) %s -> %s',
                  syndication_url, originals)
  return originals
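# Hedged usage sketch (not part of the original module): one way a caller such
# as discover() might drive _posse_post_discovery() above, sharing a single
# already_fetched_hfeeds set so each author URL is fetched at most once per
# round. extract_syndication_urls() is a hypothetical stand-in for however the
# caller collects silo URLs from the activity.
def _discover_originals_sketch(source, activity, fetch_hfeed=True):
  already_fetched_hfeeds = set()
  originals = set()
  for syndication_url in extract_syndication_urls(activity):  # hypothetical helper
    originals.update(_posse_post_discovery(
      source, activity, syndication_url, fetch_hfeed, already_fetched_hfeeds))
  return originals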
def test_get_or_insert_by_syndication_do_not_duplicate_blanks(self): """Make sure we don't insert duplicate blank entries""" SyndicatedPost.insert_syndication_blank(self.source, 'http://silo/no-original') # make sure there's only one in the DB rs = SyndicatedPost.query( SyndicatedPost.syndication == 'http://silo/no-original', ancestor=self.source.key).fetch() self.assertCountEqual([None], [rel.original for rel in rs])
def test_get_or_insert_by_syndication_do_not_duplicate_blanks(self): """Make sure we don't insert duplicate blank entries""" SyndicatedPost.insert_syndication_blank( self.source, 'http://silo/no-original') # make sure there's only one in the DB rs = SyndicatedPost.query( SyndicatedPost.syndication == 'http://silo/no-original', ancestor=self.source.key ).fetch() self.assertItemsEqual([None], [rel.original for rel in rs])
def test_refetch_multiple_responses_same_activity(self): """Ensure that refetching a post that has several replies does not generate duplicate original -> None blank entries in the database. See https://github.com/snarfed/bridgy/issues/259 for details """ source = self.sources[0] source.domain_urls = ['http://author'] for activity in self.activities: activity['object']['content'] = 'post content without backlinks' activity['object']['url'] = 'https://fa.ke/post/url' author_feed = """ <html class="h-feed"> <div class="h-entry"> <a class="u-url" href="http://author/post/permalink"></a> </div> </html>""" author_entry = """ <html class="h-entry"> <a class="u-url" href="http://author/post/permalink"></a> </html>""" # original self.expect_requests_get('http://author', author_feed) self.expect_requests_get('http://author/post/permalink', author_entry) # refetch self.expect_requests_get('http://author', author_feed) self.expect_requests_get('http://author/post/permalink', author_entry) self.mox.ReplayAll() for activity in self.activities: original_post_discovery.discover(source, activity) original_post_discovery.refetch(source) rels_by_original = list( SyndicatedPost.query(SyndicatedPost.original == 'http://author/post/permalink', ancestor=source.key).fetch()) self.assertEquals(1, len(rels_by_original)) self.assertIsNone(rels_by_original[0].syndication) rels_by_syndication = list( SyndicatedPost.query(SyndicatedPost.syndication == 'https://fa.ke/post/url', ancestor=source.key).fetch()) self.assertEquals(1, len(rels_by_syndication)) self.assertIsNone(rels_by_syndication[0].original)
def test_single_post(self): """Test that original post discovery does the reverse lookup to scan author's h-feed for rel=syndication links """ activity = self.activities[0] activity['object'].update({ 'content': 'post content without backlink', 'url': 'http://fa.ke/post/url', 'upstreamDuplicates': ['existing uD'], }) # silo domain is fa.ke source = self.sources[0] source.domain_urls = ['http://author'] self.expect_requests_get( 'http://author', """ <html class="h-feed"> <div class="h-entry"> <a class="u-url" href="http://author/post/permalink"></a> </div> </html>""") # syndicated to two places self.expect_requests_get( 'http://author/post/permalink', """ <link rel="syndication" href="http://not.real/statuses/postid"> <link rel="syndication" href="http://fa.ke/post/url"> <div class="h-entry"> <a class="u-url" href="http://author/post/permalink"></a> </div>""") self.mox.ReplayAll() logging.debug('Original post discovery %s -> %s', source, activity) original_post_discovery.discover(source, activity) # upstreamDuplicates = 1 original + 1 discovered self.assertEquals(['existing uD', 'http://author/post/permalink'], activity['object']['upstreamDuplicates']) origurls = [ r.original for r in SyndicatedPost.query(ancestor=source.key) ] self.assertEquals([u'http://author/post/permalink'], origurls) # for now only syndicated posts belonging to this source are stored syndurls = list( r.syndication for r in SyndicatedPost.query(ancestor=source.key)) self.assertEquals([u'https://fa.ke/post/url'], syndurls)
def _posse_post_discovery(source, activity, syndication_url, fetch_hfeed):
  """Performs the actual meat of the posse-post-discovery.

  Args:
    source: models.Source subclass
    activity: activity dict
    syndication_url: url of the syndicated copy for which we are trying
      to find an original
    fetch_hfeed: boolean, whether or not to fetch and parse the author's
      feed if we don't have a previously stored relationship.

  Return:
    the activity, updated with original post urls if any are found
  """
  logging.info('starting posse post discovery with syndicated %s',
               syndication_url)

  relationships = SyndicatedPost.query(
    SyndicatedPost.syndication == syndication_url,
    ancestor=source.key).fetch()

  if not relationships and fetch_hfeed:
    # a syndicated post we haven't seen before! fetch the author's URLs to see
    # if we can find it.
    #
    # Use source.domain_urls for now; it seems more reliable than the
    # activity.actor.url (which depends on getting the right data back from
    # various APIs). Consider using the actor's url, with domain_urls as the
    # fallback in the future to support content from non-Bridgy users.
    results = {}
    for url in source.get_author_urls():
      results.update(_process_author(source, url))
    relationships = results.get(syndication_url)

  if not relationships:
    # No relationships were found. Remember that we've seen this
    # syndicated post to avoid reprocessing it every time
    logging.debug('posse post discovery found no relationship for %s',
                  syndication_url)
    if fetch_hfeed:
      SyndicatedPost.insert_syndication_blank(source, syndication_url)
    return activity

  logging.debug('posse post discovery found relationship(s) %s -> %s',
                syndication_url,
                '; '.join(unicode(r.original) for r in relationships))

  obj = activity.get('object') or activity
  obj.setdefault('upstreamDuplicates', []).extend(
    r.original for r in relationships if r.original)

  return activity
def test_insert_no_duplicates(self): """Make sure we don't insert duplicate entries""" r = SyndicatedPost.insert(self.source, 'http://silo/post/url', 'http://original/post/url') self.assertIsNotNone(r) self.assertEqual('http://original/post/url', r.original) # make sure there's only one in the DB rs = SyndicatedPost.query( SyndicatedPost.syndication == 'http://silo/post/url', SyndicatedPost.original == 'http://original/post/url', ancestor=self.source.key).fetch() self.assertEqual(1, len(rs))
def _posse_post_discovery(source, activity, author_url, syndication_url,
                          fetch_hfeed):
  """Performs the actual meat of the posse-post-discovery. It was split out
  from discover() so that it can be done inside of a transaction.

  Args:
    source: models.Source subclass
    activity: activity dict
    author_url: author's url configured in their silo profile
    syndication_url: url of the syndicated copy for which we are trying
      to find an original
    fetch_hfeed: boolean, whether or not to fetch and parse the author's
      feed if we don't have a previously stored relationship.

  Return:
    the activity, updated with original post urls if any are found
  """
  logging.info('starting posse post discovery with author %s and syndicated %s',
               author_url, syndication_url)

  relationships = SyndicatedPost.query(
    SyndicatedPost.syndication == syndication_url,
    ancestor=source.key).fetch()
  if not relationships and fetch_hfeed:
    # a syndicated post we haven't seen before! fetch the author's
    # h-feed to see if we can find it.
    results = _process_author(source, author_url)
    relationships = results.get(syndication_url)

  if not relationships:
    # No relationships were found. Remember that we've seen this
    # syndicated post to avoid reprocessing it every time
    logging.debug('posse post discovery found no relationship for %s',
                  syndication_url)
    SyndicatedPost.insert_syndication_blank(source, syndication_url)
    return activity

  logging.debug('posse post discovery found relationship(s) %s -> %s',
                syndication_url,
                '; '.join(str(r.original) for r in relationships))

  obj = activity.get('object') or activity
  obj.setdefault('upstreamDuplicates', []).extend(
    r.original for r in relationships if r.original)

  return activity
def test_discover_url_site_post_syndication_links(self): self.expect_requests_get('http://si.te/123', """ <div class="h-entry"> foo <a class="u-syndication" href="http://fa.ke/222"></a> <a class="u-syndication" href="http://other/silo"></a> <a class="u-syndication" href="http://fa.ke/post/444"></a> </div>""") self.mox.ReplayAll() self.assertEqual(0, SyndicatedPost.query().count()) self.check_discover('http://si.te/123', 'Discovering now. Refresh in a minute to see the results!') self.assertItemsEqual([ {'https://fa.ke/222': 'http://si.te/123'}, {'https://fa.ke/post/444': 'http://si.te/123'}, ], [{sp.syndication: sp.original} for sp in models.SyndicatedPost.query()]) tasks = self.taskqueue_stub.GetTasks('discover') key = self.source.key.urlsafe() self.assertEqual([ {'source_key': key, 'post_id': '222'}, {'source_key': key, 'post_id': '444'}, ], [testutil.get_task_params(task) for task in tasks]) now = util.now_fn() source = self.source.key.get() self.assertEqual(now, source.last_syndication_url)
def test_insert_no_duplicates(self): """Make sure we don't insert duplicate entries""" r = SyndicatedPost.insert( self.source, 'http://silo/post/url', 'http://original/post/url') self.assertIsNotNone(r) self.assertEqual('http://original/post/url', r.original) # make sure there's only one in the DB rs = SyndicatedPost.query( SyndicatedPost.syndication == 'http://silo/post/url', SyndicatedPost.original == 'http://original/post/url', ancestor=self.source.key ).fetch() self.assertEqual(1, len(rs))
def test_discover_url_site_post_syndication_links(self): self.expect_requests_get( 'http://si.te/123', """ <div class="h-entry"> foo <a class="u-syndication" href="http://fa.ke/222"></a> <a class="u-syndication" href="http://other/silo"></a> <a class="u-syndication" href="http://fa.ke/post/444"></a> </div>""") self.expect_task('discover', source_key=self.source, post_id='222') self.expect_task('discover', source_key=self.source, post_id='444') self.mox.ReplayAll() self.assertEqual(0, SyndicatedPost.query().count()) self.check_discover( 'http://si.te/123', 'Discovering now. Refresh in a minute to see the results!') self.assertCountEqual([ { 'https://fa.ke/222': 'http://si.te/123' }, { 'https://fa.ke/post/444': 'http://si.te/123' }, ], [{ sp.syndication: sp.original } for sp in models.SyndicatedPost.query()]) now = util.now_fn() source = self.source.key.get() self.assertEqual(now, source.last_syndication_url)
def _process_syndication_urls(source, permalink, syndication_urls):
  """Process a list of syndication URLs looking for one that matches the
  current source. If one is found, stores a new SyndicatedPost in the db.

  Args:
    source: a models.Source subclass
    permalink: a string. the current h-entry permalink
    syndication_urls: a collection of strings. the unfiltered list of
      syndication_urls
  """
  results = {}
  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  for syndication_url in syndication_urls:
    # follow redirects to give us the canonical syndication url --
    # gives the best chance of finding a match.
    syndication_url = util.follow_redirects(syndication_url).url
    # source-specific logic to standardize the URL. (e.g., replace facebook
    # username with numeric id)
    syndication_url = source.canonicalize_syndication_url(syndication_url)
    # check that the syndicated url belongs to this source TODO save future
    # lookups by saving results for other sources too (note: query the
    # appropriate source subclass by author.domains, rather than
    # author.domain_urls)
    if util.domain_from_link(syndication_url) == source.AS_CLASS.DOMAIN:
      logging.debug('saving discovered relationship %s -> %s',
                    syndication_url, permalink)
      relationship = SyndicatedPost.insert(
        source, syndication=syndication_url, original=permalink)
      results.setdefault(syndication_url, []).append(relationship)

  return results
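# Condensed usage sketch (illustrative only, not original code): this mirrors
# how the entry-processing code elsewhere in this module feeds
# _process_syndication_urls() above with both rel-syndication and
# u-syndication links from a parsed permalink page. `parsed` is assumed to be
# an mf2py-style dict with 'rels' and 'items' keys.
def _collect_and_store_syndication_sketch(source, permalink, parsed):
  urls = set(parsed.get('rels', {}).get('syndication', []))
  for item in parsed.get('items', []):
    if 'h-entry' in item.get('type', []):
      urls.update(item.get('properties', {}).get('syndication', []))
  # returns {syndication_url: [SyndicatedPost, ...]} for URLs on this source
  return _process_syndication_urls(source, permalink, urls)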
def test_do_not_fetch_hfeed(self): """Confirms behavior of discover() when fetch_hfeed=False. Discovery should only check the database for previously discovered matches. It should not make any GET requests """ discover(self.source, self.activity, fetch_hfeed=False) self.assertFalse(SyndicatedPost.query(ancestor=self.source.key).get())
def test_no_h_entries(self): """Make sure nothing bad happens when fetching a feed without h-entries """ activity = self.activities[0] activity['object']['content'] = 'post content without backlink' activity['object']['url'] = 'https://fa.ke/post/url' # silo domain is fa.ke source = self.sources[0] source.domain_urls = ['http://author'] self.expect_requests_get('http://author', """ <html class="h-feed"> <p>under construction</p> </html>""") self.mox.ReplayAll() logging.debug('Original post discovery %s -> %s', source, activity) original_post_discovery.discover(source, activity) self.assert_equals( [(None, 'https://fa.ke/post/url')], [(relationship.original, relationship.syndication) for relationship in SyndicatedPost.query(ancestor=source.key)])
def _test_failed_post_permalink_fetch(self, raise_exception): """Make sure something reasonable happens when we're unable to fetch the permalink of an entry linked in the h-feed """ source = self.sources[0] source.domain_urls = ['http://author'] activity = self.activities[0] activity['object']['url'] = 'https://fa.ke/post/url' activity['object']['content'] = 'content without links' self.expect_requests_get('http://author', """ <html class="h-feed"> <article class="h-entry"> <a class="u-url" href="nonexistent.html"></a> </article> </html> """) if raise_exception: self.expect_requests_get('http://author/nonexistent.html').AndRaise(HTTPError()) else: self.expect_requests_get('http://author/nonexistent.html', status_code=410) self.mox.ReplayAll() original_post_discovery.discover(source, activity) # we should have saved placeholders to prevent us from trying the # syndication url or permalink again self.assert_equals( set([('http://author/nonexistent.html', None), (None, 'https://fa.ke/post/url')]), set((relationship.original, relationship.syndication) for relationship in SyndicatedPost.query(ancestor=source.key)))
def test_get_or_insert_by_syndication_replace(self): """Make sure we replace original=None with original=something when it is discovered""" r = SyndicatedPost.get_or_insert_by_syndication_url( self.source, 'http://silo/no-original', 'http://original/newly-discovered') self.assertIsNotNone(r) self.assertEquals('http://original/newly-discovered', r.original) # make sure it's in NDB rs = SyndicatedPost.query( SyndicatedPost.syndication == 'http://silo/no-original', ancestor=self.source.key ).fetch() self.assertEquals(1, len(rs)) self.assertEquals('http://original/newly-discovered', rs[0].original) self.assertEquals('http://silo/no-original', rs[0].syndication)
def test_refetch_unchanged_syndication(self): """We should preserve unchanged SyndicatedPosts during refetches.""" synd = SyndicatedPost(parent=self.source.key, original='http://author/permalink', syndication='https://fa.ke/post/url') synd.put() self.expect_requests_get('http://author', """ <html class="h-feed"> <div class="h-entry"> <a class="u-url" href="/permalink"></a> <a class="u-syndication" href="https://fa.ke/post/url"></a> </div> </html>""") self.mox.ReplayAll() refetch(self.source) self.assert_entities_equal([synd], list(SyndicatedPost.query()))
def test_no_author_url(self): """Make sure something reasonable happens when the author doesn't have a url at all. """ self.source.domain_urls = [] discover(self.source, self.activity) # nothing attempted, and no SyndicatedPost saved self.assertFalse(SyndicatedPost.query(ancestor=self.source.key).get())
def test_get_or_save_restart_existing_new_synd_url(self): source = self.sources[0] response = self.responses[0] response.put() # new syndication URL. should add two unsent URLs. synd = source.canonicalize_url(self.activities[0]['url']) SyndicatedPost(parent=source.key, original='http://or/ig', syndication=synd).put() SyndicatedPost(parent=source.key, original=None, syndication=synd).put() # check that we don't die on blanks self.expect_task('propagate', response_key=response) self.mox.ReplayAll() final = response.get_or_save(source, restart=True) self.assert_equals(['http://or/ig', 'http://target1/post/url'], final.unsent)
def test_merge_front_page_and_h_feed(self): """Make sure we are correctly merging the front page and rel-feed by checking that we visit h-entries that are only the front page or only the rel-feed page. """ activity = self.activities[0] activity['object'].update({ 'content': 'post content without backlink', 'url': 'https://fa.ke/post/url', 'upstreamDuplicates': ['existing uD'], }) # silo domain is fa.ke source = self.sources[0] source.domain_urls = ['http://author'] self.expect_requests_get('http://author', """ <link rel="feed" href="/feed"> <html class="h-feed"> <div class="h-entry"> <a class="u-url" href="http://author/only-on-frontpage"></a> </div> <div class="h-entry"> <a class="u-url" href="http://author/on-both"></a> </div> </html>""") self.expect_requests_get('http://author/feed', """ <link rel="feed" href="/feed"> <html class="h-feed"> <div class="h-entry"> <a class="u-url" href="http://author/on-both"></a> </div> <div class="h-entry"> <a class="u-url" href="http://author/only-on-feed"></a> </div> </html>""") for orig in ('/only-on-frontpage', '/on-both', '/only-on-feed'): self.expect_requests_get('http://author%s' % orig, """<div class="h-entry"> <a class="u-url" href="%s"></a> </div>""" % orig).InAnyOrder() self.mox.ReplayAll() logging.debug('Original post discovery %s -> %s', source, activity) original_post_discovery.discover(source, activity) # should be three blank SyndicatedPosts now for orig in ('http://author/only-on-frontpage', 'http://author/on-both', 'http://author/only-on-feed'): logging.debug('checking %s', orig) sp = SyndicatedPost.query( SyndicatedPost.original == orig, ancestor=source.key).get() self.assertTrue(sp) self.assertIsNone(sp.syndication)
def test_insert_auguments_existing(self): """Make sure we add newly discovered urls for a given syndication url, rather than overwrite them """ r = SyndicatedPost.insert(self.source, 'http://silo/post/url', 'http://original/different/url') self.assertIsNotNone(r) self.assertEqual('http://original/different/url', r.original) # make sure they're both in the DB rs = SyndicatedPost.query( SyndicatedPost.syndication == 'http://silo/post/url', ancestor=self.source.key).fetch() self.assertCountEqual([ 'http://original/post/url', 'http://original/another/post', 'http://original/different/url' ], [rel.original for rel in rs])
def test_get_or_insert_by_syndication_do_not_replace(self): """Make sure we don't replace original=something with original=something else (in practice, that would mean another task is running discovery concurrently and found a different url) """ r = SyndicatedPost.get_or_insert_by_syndication_url( self.source, 'http://silo/post/url', 'http://original/different/url') self.assertIsNotNone(r) self.assertEquals('http://original/post/url', r.original) # make sure it's unchanged in NDB rs = SyndicatedPost.query( SyndicatedPost.syndication == 'http://silo/post/url', ancestor=self.source.key ).fetch() self.assertEquals(1, len(rs)) self.assertEquals('http://original/post/url', rs[0].original) self.assertEquals('http://silo/post/url', rs[0].syndication)
def test_refetch_blank_syndication(self): """We should preserve blank SyndicatedPosts during refetches.""" blank = SyndicatedPost(parent=self.source.key, original='http://author/permalink', syndication=None) blank.put() self.expect_requests_get('http://author', """ <html class="h-feed"> <div class="h-entry"> <a class="u-url" href="/permalink"></a> </div> </html>""") self.expect_requests_get('http://author/permalink', """ <html class="h-entry"> <a class="u-url" href="/permalink"></a> </html>""") self.mox.ReplayAll() self.assert_equals({}, refetch(self.source)) self.assert_syndicated_posts(('http://author/permalink', None))
def test_insert_auguments_existing(self): """Make sure we add newly discovered urls for a given syndication url, rather than overwrite them """ r = SyndicatedPost.insert( self.source, 'http://silo/post/url', 'http://original/different/url') self.assertIsNotNone(r) self.assertEquals('http://original/different/url', r.original) # make sure they're both in the DB rs = SyndicatedPost.query( SyndicatedPost.syndication == 'http://silo/post/url', ancestor=self.source.key ).fetch() self.assertItemsEqual(['http://original/post/url', 'http://original/another/post', 'http://original/different/url'], [rel.original for rel in rs])
def test_no_author_url(self): """Make sure something reasonable happens when the author doesn't have a url at all. """ source = self.sources[0] source.domain_urls = [] activity = self.activities[0] activity['object']['url'] = 'https://fa.ke/post/url' activity['object']['content'] = 'content without links' self.mox.ReplayAll() original_post_discovery.discover(source, activity) # nothing attempted, and no SyndicatedPost saved self.assertFalse(SyndicatedPost.query(ancestor=source.key).get())
def test_retry(self): self.assertEqual([], self.taskqueue_stub.GetTasks('propagate')) source = self.sources[0] source.domain_urls = ['http://orig'] source.last_hfeed_refetch = last_hfeed_refetch = \ testutil.NOW - datetime.timedelta(minutes=1) source.put() resp = self.responses[0] resp.status = 'complete' resp.unsent = ['http://unsent'] resp.sent = ['http://sent'] resp.error = ['http://error'] resp.failed = ['http://failed'] resp.skipped = ['https://skipped'] # SyndicatedPost with new target URLs resp.activities_json = [ json.dumps({'object': {'url': 'https://fa.ke/1'}}), json.dumps({'url': 'https://fa.ke/2', 'object': {'unused': 'ok'}}), json.dumps({'url': 'https://fa.ke/3'}), ] resp.put() SyndicatedPost.insert(source, 'https://fa.ke/1', 'https://orig/1') SyndicatedPost.insert(source, 'https://fa.ke/2', 'http://orig/2') SyndicatedPost.insert(source, 'https://fa.ke/3', 'http://orig/3') # cached webmention endpoint memcache.set('W https skipped /', 'asdf') key = resp.key.urlsafe() response = app.application.get_response( '/retry', method='POST', body=native_str(urllib.parse.urlencode({'key': key}))) self.assertEquals(302, response.status_int) self.assertEquals(source.bridgy_url(self.handler), response.headers['Location'].split('#')[0]) params = testutil.get_task_params(self.taskqueue_stub.GetTasks('propagate')[0]) self.assertEqual(key, params['response_key']) # status and URLs should be refreshed got = resp.key.get() self.assertEqual('new', got.status) self.assertItemsEqual( ['http://unsent/', 'http://sent/', 'https://skipped/', 'http://error/', 'http://failed/', 'https://orig/1', 'http://orig/2', 'http://orig/3'], got.unsent) for field in got.sent, got.skipped, got.error, got.failed: self.assertEqual([], field) # webmention endpoints for URL domains should be refreshed self.assertIsNone(memcache.get('W https skipped /')) # shouldn't have refetched h-feed self.assertEqual(last_hfeed_refetch, source.key.get().last_hfeed_refetch)
def test_insert_replaces_blanks(self): """Make sure we replace original=None with original=something when it is discovered""" # add a blank for the original too SyndicatedPost.insert_original_blank( self.source, 'http://original/newly-discovered') self.assertTrue( SyndicatedPost.query( SyndicatedPost.syndication == 'http://silo/no-original', SyndicatedPost.original == None, ancestor=self.source.key).get()) self.assertTrue( SyndicatedPost.query( SyndicatedPost.original == 'http://original/newly-discovered', SyndicatedPost.syndication == None, ancestor=self.source.key).get()) r = SyndicatedPost.insert(self.source, 'http://silo/no-original', 'http://original/newly-discovered') self.assertIsNotNone(r) self.assertEqual('http://original/newly-discovered', r.original) # make sure it's in NDB rs = SyndicatedPost.query( SyndicatedPost.syndication == 'http://silo/no-original', ancestor=self.source.key).fetch() self.assertEqual(1, len(rs)) self.assertEqual('http://original/newly-discovered', rs[0].original) self.assertEqual('http://silo/no-original', rs[0].syndication) # and the blanks have been removed self.assertFalse( SyndicatedPost.query( SyndicatedPost.syndication == 'http://silo/no-original', SyndicatedPost.original == None, ancestor=self.source.key).get()) self.assertFalse( SyndicatedPost.query( SyndicatedPost.original == 'http://original/newly-discovered', SyndicatedPost.syndication == None, ancestor=self.source.key).get())
def test_retry(self): self.assertEqual([], self.taskqueue_stub.GetTasks('propagate')) source = self.sources[0] source.domain_urls = ['http://orig'] source.last_hfeed_refetch = last_hfeed_refetch = \ testutil.NOW - datetime.timedelta(minutes=1) source.put() resp = self.responses[0] resp.status = 'complete' resp.unsent = ['http://unsent'] resp.sent = ['http://sent'] resp.error = ['http://error'] resp.failed = ['http://failed'] resp.skipped = ['https://skipped'] # SyndicatedPost with new target URLs resp.activities_json = [ json.dumps({'object': {'url': 'https://fa.ke/1'}}), json.dumps({'url': 'https://fa.ke/2', 'object': {'unused': 'ok'}}), json.dumps({'url': 'https://fa.ke/3'}), ] resp.put() SyndicatedPost.insert(source, 'https://fa.ke/1', 'https://orig/1') SyndicatedPost.insert(source, 'https://fa.ke/2', 'http://orig/2') SyndicatedPost.insert(source, 'https://fa.ke/3', 'http://orig/3') # cached webmention endpoint memcache.set('W https skipped /', 'asdf') key = resp.key.urlsafe() response = app.application.get_response( '/retry', method='POST', body=urllib.urlencode({'key': key})) self.assertEquals(302, response.status_int) self.assertEquals(source.bridgy_url(self.handler), response.headers['Location'].split('#')[0]) params = testutil.get_task_params(self.taskqueue_stub.GetTasks('propagate')[0]) self.assertEqual(key, params['response_key']) # status and URLs should be refreshed got = resp.key.get() self.assertEqual('new', got.status) self.assertItemsEqual( ['http://unsent/', 'http://sent/', 'https://skipped/', 'http://error/', 'http://failed/', 'https://orig/1', 'http://orig/2', 'http://orig/3'], got.unsent) for field in got.sent, got.skipped, got.error, got.failed: self.assertEqual([], field) # webmention endpoints for URL domains should be refreshed self.assertIsNone(memcache.get('W https skipped /')) # shouldn't have refetched h-feed self.assertEqual(last_hfeed_refetch, source.key.get().last_hfeed_refetch)
def test_refetch_two_permalinks_same_syndication(self): """ This causes a problem if refetch assumes that syndication-url is unique under a given source. """ source = self.sources[0] source.domain_urls = ['http://author'] self.activities[0]['object'].update({ 'content': 'post content without backlinks', 'url': 'https://fa.ke/post/url', }) hfeed = """<html class="h-feed"> <a class="h-entry" href="/post1"></a> <a class="h-entry" href="/post2"></a> </html>""" self.expect_requests_get('http://author', hfeed) for i in range(2): self.expect_requests_get( 'http://author/post%d' % (i + 1), """<html class="h-entry"> <a class="u-url" href="/post%d"></a> <a class="u-syndication" href="https://fa.ke/post/url"></a> </html>""" % (i + 1)) # refetch should only grab the feed self.expect_requests_get('http://author', hfeed) self.mox.ReplayAll() activity = original_post_discovery.discover(source, self.activities[0]) self.assertItemsEqual(['http://author/post1', 'http://author/post2'], activity['object'].get('upstreamDuplicates')) relations = SyndicatedPost.query(ancestor=source.key).fetch() self.assertItemsEqual([('http://author/post1', 'https://fa.ke/post/url'), ('http://author/post2', 'https://fa.ke/post/url')], [(relation.original, relation.syndication) for relation in relations]) # discover should have already handled all relationships, refetch should # not find anything refetch_result = original_post_discovery.refetch(source) self.assertFalse(refetch_result)
def test_insert_replaces_blanks(self): """Make sure we replace original=None with original=something when it is discovered""" # add a blank for the original too SyndicatedPost.insert_original_blank( self.source, 'http://original/newly-discovered') self.assertTrue( SyndicatedPost.query( SyndicatedPost.syndication == 'http://silo/no-original', SyndicatedPost.original == None, ancestor=self.source.key).get()) self.assertTrue( SyndicatedPost.query( SyndicatedPost.original == 'http://original/newly-discovered', SyndicatedPost.syndication == None, ancestor=self.source.key).get()) r = SyndicatedPost.insert( self.source, 'http://silo/no-original', 'http://original/newly-discovered') self.assertIsNotNone(r) self.assertEquals('http://original/newly-discovered', r.original) # make sure it's in NDB rs = SyndicatedPost.query( SyndicatedPost.syndication == 'http://silo/no-original', ancestor=self.source.key ).fetch() self.assertEquals(1, len(rs)) self.assertEquals('http://original/newly-discovered', rs[0].original) self.assertEquals('http://silo/no-original', rs[0].syndication) # and the blanks have been removed self.assertFalse( SyndicatedPost.query( SyndicatedPost.syndication == 'http://silo/no-original', SyndicatedPost.original == None, ancestor=self.source.key).get()) self.assertFalse( SyndicatedPost.query( SyndicatedPost.original == 'http://original/newly-discovered', SyndicatedPost.syndication == None, ancestor=self.source.key).get())
def test_refetch_permalink_with_two_syndications(self): """Test one permalink with two syndicated posts. Make sure that refetch doesn't have a problem with two entries for the same original URL. """ for idx, activity in enumerate(self.activities): activity['object'].update({ 'content': 'post content without backlinks', 'url': 'https://fa.ke/post/url%d' % (idx + 1), }) hfeed = """<html class="h-feed"> <a class="h-entry" href="/permalink"></a> </html>""" hentry = """<html class="h-entry"> <a class="u-url" href="/permalink"/> <a class="u-syndication" href="https://fa.ke/post/url1"/> <a class="u-syndication" href="https://fa.ke/post/url3"/> <a class="u-syndication" href="https://fa.ke/post/url5"/> </html>""" self.expect_requests_get('http://author', hfeed) self.expect_requests_get('http://author/permalink', hentry) # refetch self.expect_requests_get('http://author', hfeed) # refetch grabs posts that it's seen before in case there have # been updates self.expect_requests_get('http://author/permalink', hentry) self.mox.ReplayAll() original_post_discovery.discover(self.source, self.activities[0]) relations = SyndicatedPost.query( SyndicatedPost.original == 'http://author/permalink', ancestor=self.source.key).fetch() self.assertItemsEqual( [('http://author/permalink', 'https://fa.ke/post/url1'), ('http://author/permalink', 'https://fa.ke/post/url3'), ('http://author/permalink', 'https://fa.ke/post/url5')], [(r.original, r.syndication) for r in relations]) results = original_post_discovery.refetch(self.source) self.assertFalse(results)
def _process_syndication_urls(source, permalink, syndication_urls,
                              preexisting):
  """Process a list of syndication URLs looking for one that matches the
  current source. If one is found, stores a new :class:`models.SyndicatedPost`
  in the db.

  Args:
    source: a :class:`models.Source` subclass
    permalink: a string. the current h-entry permalink
    syndication_urls: a collection of strings. the unfiltered list of
      syndication urls
    preexisting: a list of previously discovered
      :class:`models.SyndicatedPost`\ s

  Returns:
    dict mapping string syndication url to list of
    :class:`models.SyndicatedPost`\ s
  """
  results = {}
  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  for url in syndication_urls:
    # source-specific logic to standardize the URL. (e.g., replace facebook
    # username with numeric id)
    url = source.canonicalize_url(url)
    if not url:
      continue

    # TODO: save future lookups by saving results for other sources too (note:
    # query the appropriate source subclass by author.domains, rather than
    # author.domain_urls)
    #
    # we may have already seen this relationship, save a DB lookup by
    # finding it in the preexisting list
    relationship = next(
      (sp for sp in preexisting
       if sp.syndication == url and sp.original == permalink), None)
    if not relationship:
      logging.debug('saving discovered relationship %s -> %s', url, permalink)
      relationship = SyndicatedPost.insert(source, syndication=url,
                                           original=permalink)
    results.setdefault(url, []).append(relationship)

  return results
def test_refetch_changed_syndication(self): """Update syndication links that have changed since our last fetch.""" SyndicatedPost(parent=self.source.key, original='http://author/permalink', syndication='https://fa.ke/post/url').put() self.expect_requests_get('http://author', """ <html class="h-feed"> <div class="h-entry"> <a class="u-url" href="/permalink"></a> <a class="u-syndication" href="http://fa.ke/changed/url"></a> </div> </html>""") self.mox.ReplayAll() results = refetch(self.source) self.assert_syndicated_posts( ('http://author/permalink', 'https://fa.ke/changed/url')) self.assert_equals({'https://fa.ke/changed/url': list(SyndicatedPost.query())}, results)
def _process_syndication_urls(source, permalink, syndication_urls, preexisting):
  """Process a list of syndication URLs looking for one that matches the
  current source. If one is found, stores a new :class:`models.SyndicatedPost`
  in the db.

  Args:
    source: a :class:`models.Source` subclass
    permalink: a string. the current h-entry permalink
    syndication_urls: a collection of strings. the unfiltered list of
      syndication urls
    preexisting: a list of previously discovered
      :class:`models.SyndicatedPost`\ s

  Returns:
    dict mapping string syndication url to list of
    :class:`models.SyndicatedPost`\ s
  """
  results = {}
  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  for url in syndication_urls:
    # source-specific logic to standardize the URL. (e.g., replace facebook
    # username with numeric id)
    url = source.canonicalize_url(url)
    if not url:
      continue

    # TODO: save future lookups by saving results for other sources too (note:
    # query the appropriate source subclass by author.domains, rather than
    # author.domain_urls)
    #
    # we may have already seen this relationship, save a DB lookup by
    # finding it in the preexisting list
    relationship = next((sp for sp in preexisting
                         if sp.syndication == url and sp.original == permalink),
                        None)
    if not relationship:
      logging.debug('saving discovered relationship %s -> %s', url, permalink)
      relationship = SyndicatedPost.insert(
        source, syndication=url, original=permalink)
    results.setdefault(url, []).append(relationship)

  return results
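# Illustrative sketch only: the preexisting argument above is typically built
# from a datastore query scoped to the permalink, which _process_author() does
# in batches. A minimal single-permalink version might look like this.
def _preexisting_for_permalink_sketch(source, permalink):
  return SyndicatedPost.query(SyndicatedPost.original == permalink,
                              ancestor=source.key).fetch()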
def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  try:
    logging.debug('fetching author url %s', author_url)
    author_resp = util.requests_get(author_url)
    # TODO for error codes that indicate a temporary error, should we make
    # a certain number of retries before giving up forever?
    author_resp.raise_for_status()
    author_dom = util.beautifulsoup_parse(author_resp.text)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logging.info('Could not fetch author url %s', author_url, exc_info=True)
    return {}

  feeditems = _find_feed_items(author_url, author_dom)

  # look for all other feed urls using rel='feed', type='text/html'
  feed_urls = set()
  for rel_feed_node in (author_dom.find_all('link', rel='feed') +
                        author_dom.find_all('a', rel='feed')):
    feed_url = rel_feed_node.get('href')
    if not feed_url:
      continue

    feed_url = urlparse.urljoin(author_url, feed_url)
    feed_type = rel_feed_node.get('type')
    if feed_type and feed_type != 'text/html':
      feed_ok = False
    else:
      # double check that it's text/html, not too big, etc
      feed_url, _, feed_ok = util.get_webmention_target(feed_url)

    if feed_url == author_url:
      logging.debug('author url is the feed url, ignoring')
    elif not feed_ok:
      logging.debug('skipping feed of type %s', feed_type)
    else:
      feed_urls.add(feed_url)

  for feed_url in feed_urls:
    try:
      logging.debug("fetching author's rel-feed %s", feed_url)
      feed_resp = util.requests_get(feed_url)
      feed_resp.raise_for_status()
      logging.debug("author's rel-feed fetched successfully %s", feed_url)
      feeditems = _merge_hfeeds(feeditems,
                                _find_feed_items(feed_url, feed_resp.text))

      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logging.info('rel-feed found new domain %s! adding to source', domain)
          domains.append(domain)

    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logging.info('Could not fetch h-feed url %s.', feed_url, exc_info=True)

  # sort by dt-updated/dt-published
  def updated_or_published(item):
    props = microformats2.first_props(item.get('properties'))
    return props.get('updated') or props.get('published')

  feeditems.sort(key=updated_or_published, reverse=True)

  permalink_to_entry = collections.OrderedDict()
  for child in feeditems:
    if 'h-entry' in child['type']:
      permalinks = child['properties'].get('url', [])
      if not permalinks:
        logging.debug('ignoring h-entry with no u-url!')

      for permalink in permalinks:
        if isinstance(permalink, basestring):
          permalink_to_entry[permalink] = child
        else:
          logging.warn('unexpected non-string "url" property: %s', permalink)

    max = (MAX_PERMALINK_FETCHES_BETA if source.is_beta_user()
           else MAX_PERMALINK_FETCHES)
    if len(permalink_to_entry) >= max:
      logging.info('Hit cap of %d permalinks. Stopping.', max)
      break

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(
      SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
      ancestor=source.key)
    for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  results = {}
  for permalink, entry in permalink_to_entry.iteritems():
    logging.debug('processing permalink: %s', permalink)
    new_results = process_entry(source, permalink, entry, refetch,
                                preexisting.get(permalink, []),
                                store_blanks=store_blanks)
    for key, value in new_results.iteritems():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    source.updates['last_syndication_url'] = util.now_fn()

  return results
def process_entry(source, permalink, feed_entry, refetch, preexisting,
                  store_blanks=True):
  """Fetch and process an h-entry and save a new :class:`models.SyndicatedPost`.

  Args:
    source:
    permalink: url of the unprocessed post
    feed_entry: the h-feed version of the h-entry dict, often contains
      a partial version of the h-entry at the permalink
    refetch: boolean, whether to refetch and process entries we've seen before
    preexisting: list of previously discovered
      :class:`models.SyndicatedPost`\ s for this permalink
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Returns:
    a dict from syndicated url to a list of new
    :class:`models.SyndicatedPost`\ s
  """
  # if the post has already been processed, do not add to the results
  # since this method only returns *newly* discovered relationships.
  if preexisting:
    # if we're refetching and this one is blank, do not return.
    # if there is a blank entry, it should be the one and only entry,
    # but go ahead and check 'all' of them to be safe.
    if not refetch:
      return {}
    synds = [s.syndication for s in preexisting if s.syndication]
    if synds:
      logger.debug(
        f'previously found relationship(s) for original {permalink}: {synds}')

  # first try with the h-entry from the h-feed. if we find the syndication url
  # we're looking for, we don't have to fetch the permalink
  permalink, _, type_ok = util.get_webmention_target(permalink)
  usynd = feed_entry.get('properties', {}).get('syndication', [])
  usynd_urls = {url for url in usynd if isinstance(url, str)}
  if usynd_urls:
    logger.debug(f'u-syndication links on the h-feed h-entry: {usynd_urls}')
  results = _process_syndication_urls(source, permalink, usynd_urls,
                                      preexisting)
  success = True

  if results:
    source.updates['last_feed_syndication_url'] = util.now_fn()
  elif not source.last_feed_syndication_url or not feed_entry:
    # fetch the full permalink page if we think it might have more details
    mf2 = None
    try:
      if type_ok:
        logger.debug(f'fetching post permalink {permalink}')
        mf2 = util.fetch_mf2(permalink)
    except AssertionError:
      raise  # for unit tests
    except BaseException:
      # TODO limit the number of allowed failures
      logger.info(f'Could not fetch permalink {permalink}', exc_info=True)
      success = False

    if mf2:
      syndication_urls = set()
      relsynd = mf2['rels'].get('syndication', [])
      if relsynd:
        logger.debug(f'rel-syndication links: {relsynd}')
      syndication_urls.update(url for url in relsynd if isinstance(url, str))
      # there should only be one h-entry on a permalink page, but
      # we'll check all of them just in case.
      for hentry in (item for item in mf2['items']
                     if 'h-entry' in item['type']):
        usynd = hentry.get('properties', {}).get('syndication', [])
        if usynd:
          logger.debug(f'u-syndication links: {usynd}')
        syndication_urls.update(url for url in usynd if isinstance(url, str))
      results = _process_syndication_urls(source, permalink, syndication_urls,
                                          preexisting)

  # detect and delete SyndicatedPosts that were removed from the site
  if success:
    result_syndposts = list(itertools.chain(*results.values()))
    for syndpost in list(preexisting):
      if syndpost.syndication and syndpost not in result_syndposts:
        logger.info(f'deleting relationship that disappeared: {syndpost}')
        syndpost.key.delete()
        preexisting.remove(syndpost)

  if not results:
    logger.debug(
      f'no syndication links from {permalink} to current source {source.label()}.')
    results = {}
    if store_blanks and not preexisting:
      # remember that this post doesn't have syndication links for this
      # particular source
      logger.debug(
        f'saving empty relationship so that {permalink} will not be searched again')
      SyndicatedPost.insert_original_blank(source, permalink)

  # only return results that are not in the preexisting list
  new_results = {}
  for syndurl, syndposts_for_url in results.items():
    for syndpost in syndposts_for_url:
      if syndpost not in preexisting:
        new_results.setdefault(syndurl, []).append(syndpost)

  if new_results:
    logger.debug(f'discovered relationships {new_results}')

  return new_results
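# Hedged example of calling process_entry() above. The call shape matches how
# _process_author() uses it; the literal permalink and feed_entry values are
# made up for illustration, and `source` is assumed to be a models.Source
# instance supplied by the caller. With refetch=False and a non-empty
# preexisting list it returns {} immediately; with refetch=True it re-checks
# the permalink and returns only *newly* discovered relationships.
new_by_syndication = process_entry(
  source, 'http://author/post/permalink',
  feed_entry={'type': ['h-entry'],
              'properties': {'url': ['http://author/post/permalink']}},
  refetch=True,
  preexisting=SyndicatedPost.query(
    SyndicatedPost.original == 'http://author/post/permalink',
    ancestor=source.key).fetch(),
  store_blanks=True)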
def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  logger.debug(f'fetching author url {author_url}')
  try:
    author_mf2 = util.fetch_mf2(author_url)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logger.info(f'Could not fetch author url {author_url}', exc_info=True)
    return {}

  feeditems = _find_feed_items(author_mf2)

  # try rel=feeds and rel=alternates
  feed_urls = set()
  candidates = (author_mf2['rels'].get('feed', []) +
                [a.get('url') for a in author_mf2.get('alternates', [])
                 if a.get('type') == MF2_HTML_MIME_TYPE])
  for feed_url in candidates:
    # check that it's html, not too big, etc
    feed_url, _, feed_ok = util.get_webmention_target(feed_url)
    if feed_url == author_url:
      logger.debug('author url is the feed url, ignoring')
    elif not feed_ok:
      logger.debug("skipping feed since it's not HTML or otherwise bad")
    else:
      feed_urls.add(feed_url)

  for feed_url in feed_urls:
    try:
      logger.debug(f"fetching author's rel-feed {feed_url}")
      feed_mf2 = util.fetch_mf2(feed_url)
      feeditems = _merge_hfeeds(feeditems, _find_feed_items(feed_mf2))
      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logger.info(f'rel-feed found new domain {domain}! adding to source')
          domains.append(domain)
    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logger.info(f'Could not fetch h-feed url {feed_url}.', exc_info=True)

  # sort by dt-updated/dt-published
  def updated_or_published(item):
    props = microformats2.first_props(item.get('properties'))
    return props.get('updated') or props.get('published') or ''

  feeditems.sort(key=updated_or_published, reverse=True)

  permalink_to_entry = collections.OrderedDict()
  for child in feeditems:
    if 'h-entry' in child['type']:
      permalinks = child['properties'].get('url', [])
      if not permalinks:
        logger.debug('ignoring h-entry with no u-url!')

      for permalink in permalinks:
        if isinstance(permalink, str):
          permalink_to_entry[permalink] = child
        else:
          logger.warning(f'unexpected non-string "url" property: {permalink}')

    max = (MAX_PERMALINK_FETCHES_BETA if source.is_beta_user()
           else MAX_PERMALINK_FETCHES)
    if len(permalink_to_entry) >= max:
      logger.info(f'Hit cap of {max} permalinks. Stopping.')
      break

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(
      SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
      ancestor=source.key)
    for i in range(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  results = {}
  for permalink, entry in permalink_to_entry.items():
    logger.debug(f'processing permalink: {permalink}')
    new_results = process_entry(source, permalink, entry, refetch,
                                preexisting.get(permalink, []),
                                store_blanks=store_blanks)
    for key, value in new_results.items():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    source.updates['last_syndication_url'] = util.now_fn()

  return results
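# Hedged sketch of a refetch()-style wrapper (the real refetch() is exercised
# by the tests above but isn't shown in this section). This only illustrates
# how _process_author(refetch=True) plugs in; source.get_author_urls() is
# assumed from the other _posse_post_discovery variant, and the aggregation
# shape mirrors what the refetch tests expect (syndication url -> posts).
def _refetch_sketch(source):
  results = {}
  for url in source.get_author_urls():
    for syndication_url, syndposts in _process_author(
        source, url, refetch=True).items():
      results.setdefault(syndication_url, []).extend(syndposts)
  return results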
def _process_entry(source, permalink, feed_entry, refetch, preexisting,
                   store_blanks=True):
  """Fetch and process an h-entry, saving a new SyndicatedPost to the
  DB if successful.

  Args:
    source:
    permalink: url of the unprocessed post
    feed_entry: the h-feed version of the h-entry dict, often contains
      a partial version of the h-entry at the permalink
    refetch: boolean, whether to refetch and process entries we've seen before
    preexisting: a list of previously discovered models.SyndicatedPosts
      for this permalink
    store_blanks: boolean, whether we should store blank SyndicatedPosts when
      we don't find a relationship

  Returns:
    a dict from syndicated url to a list of new models.SyndicatedPosts
  """
  # if the post has already been processed, do not add to the results
  # since this method only returns *newly* discovered relationships.
  if preexisting:
    # if we're refetching and this one is blank, do not return.
    # if there is a blank entry, it should be the one and only entry,
    # but go ahead and check 'all' of them to be safe.
    if not refetch:
      return {}
    synds = [s.syndication for s in preexisting if s.syndication]
    if synds:
      logging.debug('previously found relationship(s) for original %s: %s',
                    permalink, synds)

  # first try with the h-entry from the h-feed. if we find the syndication url
  # we're looking for, we don't have to fetch the permalink
  permalink, _, type_ok = util.get_webmention_target(permalink)
  usynd = feed_entry.get('properties', {}).get('syndication', [])
  if usynd:
    logging.debug('u-syndication links on the h-feed h-entry: %s', usynd)
  results = _process_syndication_urls(
    source, permalink,
    set(url for url in usynd if isinstance(url, basestring)), preexisting)
  success = True

  if results:
    source.updates['last_feed_syndication_url'] = util.now_fn()
  elif not source.last_feed_syndication_url:
    # fetch the full permalink page if we think it might have more details
    parsed = None
    try:
      logging.debug('fetching post permalink %s', permalink)
      if type_ok:
        resp = util.requests_get(permalink)
        resp.raise_for_status()
        parsed = util.mf2py_parse(resp.text, permalink)
    except AssertionError:
      raise  # for unit tests
    except BaseException:
      # TODO limit the number of allowed failures
      logging.warning('Could not fetch permalink %s', permalink, exc_info=True)
      success = False

    if parsed:
      syndication_urls = set()
      relsynd = parsed.get('rels').get('syndication', [])
      if relsynd:
        logging.debug('rel-syndication links: %s', relsynd)
      syndication_urls.update(url for url in relsynd
                              if isinstance(url, basestring))
      # there should only be one h-entry on a permalink page, but
      # we'll check all of them just in case.
      for hentry in (item for item in parsed['items']
                     if 'h-entry' in item['type']):
        usynd = hentry.get('properties', {}).get('syndication', [])
        if usynd:
          logging.debug('u-syndication links: %s', usynd)
        syndication_urls.update(url for url in usynd
                                if isinstance(url, basestring))
      results = _process_syndication_urls(source, permalink, syndication_urls,
                                          preexisting)

  # detect and delete SyndicatedPosts that were removed from the site
  if success:
    result_syndposts = list(itertools.chain(*results.values()))
    for syndpost in list(preexisting):
      if syndpost.syndication and syndpost not in result_syndposts:
        logging.info('deleting relationship that disappeared: %s', syndpost)
        syndpost.key.delete()
        preexisting.remove(syndpost)

  if not results:
    logging.debug('no syndication links from %s to current source %s.',
                  permalink, source.label())
    results = {}
    if store_blanks and not preexisting:
      # remember that this post doesn't have syndication links for this
      # particular source
      logging.debug('saving empty relationship so that %s will not be '
                    'searched again', permalink)
      SyndicatedPost.insert_original_blank(source, permalink)

  # only return results that are not in the preexisting list
  new_results = {}
  for syndurl, syndposts_for_url in results.iteritems():
    for syndpost in syndposts_for_url:
      if syndpost not in preexisting:
        new_results.setdefault(syndurl, []).append(syndpost)

  if new_results:
    logging.debug('discovered relationships %s', new_results)

  return new_results
def test_retry(self): source = self.sources[0] source.domain_urls = ['http://orig'] source.last_hfeed_refetch = last_hfeed_refetch = testutil.NOW - timedelta( minutes=1) source.put() resp = self.responses[0] resp.status = 'complete' resp.unsent = ['http://unsent'] resp.sent = ['http://sent'] resp.error = ['http://error'] resp.failed = ['http://failed'] resp.skipped = ['https://skipped'] # SyndicatedPost with new target URLs resp.activities_json = [ json_dumps({'object': { 'url': 'https://fa.ke/1' }}), json_dumps({ 'url': 'https://fa.ke/2', 'object': { 'unused': 'ok' } }), json_dumps({'url': 'https://fa.ke/3'}), ] resp.put() SyndicatedPost.insert(source, 'https://fa.ke/1', 'https://orig/1') SyndicatedPost.insert(source, 'https://fa.ke/2', 'http://orig/2') SyndicatedPost.insert(source, 'https://fa.ke/3', 'http://orig/3') key = resp.key.urlsafe().decode() self.expect_task('propagate', response_key=key) self.mox.ReplayAll() # cached webmention endpoint util.webmention_endpoint_cache['W https skipped /'] = 'asdf' response = self.client.post('/retry', data={'key': key}) self.assertEqual(302, response.status_code) self.assertEqual(self.source_bridgy_url, response.headers['Location']) # status and URLs should be refreshed got = resp.key.get() self.assertEqual('new', got.status) self.assertCountEqual([ 'http://unsent/', 'http://sent/', 'https://skipped/', 'http://error/', 'http://failed/', 'https://orig/1', 'http://orig/2', 'http://orig/3' ], got.unsent) for field in got.sent, got.skipped, got.error, got.failed: self.assertEqual([], field) # webmention endpoints for URL domains should be refreshed self.assertNotIn('W https skipped /', util.webmention_endpoint_cache) # shouldn't have refetched h-feed self.assertEqual(last_hfeed_refetch, source.key.get().last_hfeed_refetch)