def test_syndication_url_in_hfeed(self):
  """Like test_single_post, but because the syndication URL is given in
  the h-feed we skip fetching the permalink. New behavior as of
  2014-11-08
  """
  self.activity['object']['upstreamDuplicates'] = ['existing uD']

  # silo domain is fa.ke
  self.expect_requests_get('http://author', """
  <html class="h-feed">
    <div class="h-entry">
      <a class="u-url" href="http://author/post/permalink"></a>
      <a class="u-syndication" href="http://fa.ke/post/url">
    </div>
  </html>""")

  self.mox.ReplayAll()
  logging.debug('Original post discovery %s -> %s', self.source, self.activity)
  original_post_discovery.discover(self.source, self.activity)

  # upstreamDuplicates = 1 original + 1 discovered
  self.assertEquals(['existing uD', 'http://author/post/permalink'],
                    self.activity['object']['upstreamDuplicates'])

  origurls = [r.original for r in SyndicatedPost.query(ancestor=self.source.key)]
  self.assertEquals([u'http://author/post/permalink'], origurls)

  # for now only syndicated posts belonging to this source are stored
  syndurls = list(r.syndication for r in
                  SyndicatedPost.query(ancestor=self.source.key))
  self.assertEquals([u'https://fa.ke/post/url'], syndurls)

def test_multiple_refetches(self):
  """Ensure that multiple refetches of the same post (with and without
  u-syndication) do not generate duplicate blank entries in the database.

  See https://github.com/snarfed/bridgy/issues/259 for details.
  """
  self.activities[0]['object'].update({
    'content': 'post content without backlinks',
    'url': 'https://fa.ke/post/url',
  })

  hfeed = """<html class="h-feed">
  <a class="h-entry" href="/permalink"></a>
  </html>"""

  unsyndicated = """<html class="h-entry">
  <a class="u-url" href="/permalink"></a>
  </html>"""

  syndicated = """<html class="h-entry">
  <a class="u-url" href="/permalink"></a>
  <a class="u-syndication" href="https://fa.ke/post/url"></a>
  </html>"""

  # first attempt, no syndication url yet
  self.expect_requests_get('http://author', hfeed)
  self.expect_requests_get('http://author/permalink', unsyndicated)

  # refetch, still no syndication url
  self.expect_requests_get('http://author', hfeed)
  self.expect_requests_get('http://author/permalink', unsyndicated)

  # second refetch, has a syndication url this time
  self.expect_requests_get('http://author', hfeed)
  self.expect_requests_get('http://author/permalink', syndicated)

  self.mox.ReplayAll()
  original_post_discovery.discover(self.source, self.activities[0])
  original_post_discovery.refetch(self.source)

  relations = list(
    SyndicatedPost.query(
      SyndicatedPost.original == 'http://author/permalink',
      ancestor=self.source.key).fetch())
  self.assertEquals(1, len(relations))
  self.assertEquals('http://author/permalink', relations[0].original)
  self.assertIsNone(relations[0].syndication)

  original_post_discovery.refetch(self.source)

  relations = list(
    SyndicatedPost.query(
      SyndicatedPost.original == 'http://author/permalink',
      ancestor=self.source.key).fetch())
  self.assertEquals(1, len(relations))
  self.assertEquals('http://author/permalink', relations[0].original)
  self.assertEquals('https://fa.ke/post/url', relations[0].syndication)

def test_multiple_rel_feeds(self):
  """Make sure that we follow all rel=feed links, e.g. if notes and
  articles are in separate feeds."""

  self.expect_requests_get('http://author', """
  <html>
    <head>
      <link rel="feed" href="/articles" type="text/html">
      <link rel="feed" href="/notes" type="text/html">
    </head>
  </html>""")

  # fetches all feeds first
  self.expect_requests_get('http://author/articles', """
  <html class="h-feed">
    <article class="h-entry">
      <a class="u-url" href="/article-permalink"></a>
    </article>
  </html>""").InAnyOrder('feed')

  self.expect_requests_get('http://author/notes', """
  <html class="h-feed">
    <article class="h-entry">
      <a class="u-url" href="/note-permalink"></a>
    </article>
  </html>""").InAnyOrder('feed')

  # then the permalinks (in any order since they are hashed to
  # remove duplicates)
  self.expect_requests_get('http://author/article-permalink', """
  <html class="h-entry">
    <a class="u-url" href="/article-permalink"></a>
    <a class="u-syndication" href="https://fa.ke/article"></a>
  </html>""").InAnyOrder('permalink')

  self.expect_requests_get('http://author/note-permalink', """
  <html class="h-entry">
    <a class="u-url" href="/note-permalink"></a>
    <a class="u-syndication" href="https://fa.ke/note"></a>
  </html>""").InAnyOrder('permalink')

  self.mox.ReplayAll()
  original_post_discovery.discover(self.source, self.activity)

  note_rels = SyndicatedPost.query(
    SyndicatedPost.original == 'http://author/note-permalink',
    ancestor=self.source.key).fetch()
  self.assertEqual(1, len(note_rels))
  self.assertEqual('https://fa.ke/note', note_rels[0].syndication)

  article_rels = SyndicatedPost.query(
    SyndicatedPost.original == 'http://author/article-permalink',
    ancestor=self.source.key).fetch()
  self.assertEqual(1, len(article_rels))
  self.assertEqual('https://fa.ke/article', article_rels[0].syndication)

def test_refetch_multiple_responses_same_activity(self):
  """Ensure that refetching a post that has several replies does not
  generate duplicate original -> None blank entries in the database.

  See https://github.com/snarfed/bridgy/issues/259 for details.
  """
  source = self.sources[0]
  source.domain_urls = ['http://author']

  for activity in self.activities:
    activity['object']['content'] = 'post content without backlinks'
    activity['object']['url'] = 'https://fa.ke/post/url'

  author_feed = """
  <html class="h-feed">
    <div class="h-entry">
      <a class="u-url" href="http://author/post/permalink"></a>
    </div>
  </html>"""

  author_entry = """
  <html class="h-entry">
    <a class="u-url" href="http://author/post/permalink"></a>
  </html>"""

  # original
  self.expect_requests_get('http://author', author_feed)
  self.expect_requests_get('http://author/post/permalink', author_entry)
  # refetch
  self.expect_requests_get('http://author', author_feed)
  self.expect_requests_get('http://author/post/permalink', author_entry)
  self.mox.ReplayAll()

  for activity in self.activities:
    original_post_discovery.discover(source, activity)
  original_post_discovery.refetch(source)

  rels_by_original = list(
    SyndicatedPost.query(SyndicatedPost.original == 'http://author/post/permalink',
                         ancestor=source.key).fetch())
  self.assertEquals(1, len(rels_by_original))
  self.assertIsNone(rels_by_original[0].syndication)

  rels_by_syndication = list(
    SyndicatedPost.query(SyndicatedPost.syndication == 'https://fa.ke/post/url',
                         ancestor=source.key).fetch())
  self.assertEquals(1, len(rels_by_syndication))
  self.assertIsNone(rels_by_syndication[0].original)

def test_single_post(self):
  """Test that original post discovery does the reverse lookup to scan
  the author's h-feed for rel=syndication links.
  """
  activity = self.activities[0]
  activity['object'].update({
    'content': 'post content without backlink',
    'url': 'http://fa.ke/post/url',
    'upstreamDuplicates': ['existing uD'],
  })

  # silo domain is fa.ke
  source = self.sources[0]
  source.domain_urls = ['http://author']

  self.expect_requests_get('http://author', """
  <html class="h-feed">
    <div class="h-entry">
      <a class="u-url" href="http://author/post/permalink"></a>
    </div>
  </html>""")

  # syndicated to two places
  self.expect_requests_get('http://author/post/permalink', """
  <link rel="syndication" href="http://not.real/statuses/postid">
  <link rel="syndication" href="http://fa.ke/post/url">
  <div class="h-entry">
    <a class="u-url" href="http://author/post/permalink"></a>
  </div>""")

  self.mox.ReplayAll()
  logging.debug('Original post discovery %s -> %s', source, activity)
  original_post_discovery.discover(source, activity)

  # upstreamDuplicates = 1 original + 1 discovered
  self.assertEquals(['existing uD', 'http://author/post/permalink'],
                    activity['object']['upstreamDuplicates'])

  origurls = [r.original for r in SyndicatedPost.query(ancestor=source.key)]
  self.assertEquals([u'http://author/post/permalink'], origurls)

  # for now only syndicated posts belonging to this source are stored
  syndurls = list(r.syndication for r in
                  SyndicatedPost.query(ancestor=source.key))
  self.assertEquals([u'https://fa.ke/post/url'], syndurls)

def test_discover_url_site_post_syndication_links(self):
  self.expect_requests_get('http://si.te/123', """
  <div class="h-entry">
    foo
    <a class="u-syndication" href="http://fa.ke/222"></a>
    <a class="u-syndication" href="http://other/silo"></a>
    <a class="u-syndication" href="http://fa.ke/post/444"></a>
  </div>""")
  self.mox.ReplayAll()

  self.assertEqual(0, SyndicatedPost.query().count())
  self.check_discover('http://si.te/123',
                      'Discovering now. Refresh in a minute to see the results!')

  self.assertItemsEqual([
    {'https://fa.ke/222': 'http://si.te/123'},
    {'https://fa.ke/post/444': 'http://si.te/123'},
  ], [{sp.syndication: sp.original} for sp in models.SyndicatedPost.query()])

  tasks = self.taskqueue_stub.GetTasks('discover')
  key = self.source.key.urlsafe()
  self.assertEqual([
    {'source_key': key, 'post_id': '222'},
    {'source_key': key, 'post_id': '444'},
  ], [testutil.get_task_params(task) for task in tasks])

  now = util.now_fn()
  source = self.source.key.get()
  self.assertEqual(now, source.last_syndication_url)

def _test_failed_post_permalink_fetch(self, raise_exception):
  """Make sure something reasonable happens when we're unable to fetch
  the permalink of an entry linked in the h-feed.
  """
  source = self.sources[0]
  source.domain_urls = ['http://author']
  activity = self.activities[0]
  activity['object']['url'] = 'https://fa.ke/post/url'
  activity['object']['content'] = 'content without links'

  self.expect_requests_get('http://author', """
  <html class="h-feed">
    <article class="h-entry">
      <a class="u-url" href="nonexistent.html"></a>
    </article>
  </html>
  """)

  if raise_exception:
    self.expect_requests_get('http://author/nonexistent.html').AndRaise(HTTPError())
  else:
    self.expect_requests_get('http://author/nonexistent.html', status_code=410)

  self.mox.ReplayAll()
  original_post_discovery.discover(source, activity)

  # we should have saved placeholders to prevent us from trying the
  # syndication url or permalink again
  self.assert_equals(
    set([('http://author/nonexistent.html', None),
         (None, 'https://fa.ke/post/url')]),
    set((relationship.original, relationship.syndication)
        for relationship in SyndicatedPost.query(ancestor=source.key)))

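# The helper above is parameterized on the failure mode. A minimal sketch of how
# it would typically be driven, one wrapper per failure mode. The test names
# below are assumptions for illustration, not taken from this file.
def test_failed_post_permalink_error(self):
  # permalink fetch raises an HTTPError
  self._test_failed_post_permalink_fetch(raise_exception=True)

def test_failed_post_permalink_410(self):
  # permalink fetch returns a permanent error status code
  self._test_failed_post_permalink_fetch(raise_exception=False)
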
def test_discover_url_site_post_syndication_links(self):
  self.expect_requests_get('http://si.te/123', """
  <div class="h-entry">
    foo
    <a class="u-syndication" href="http://fa.ke/222"></a>
    <a class="u-syndication" href="http://other/silo"></a>
    <a class="u-syndication" href="http://fa.ke/post/444"></a>
  </div>""")
  self.expect_task('discover', source_key=self.source, post_id='222')
  self.expect_task('discover', source_key=self.source, post_id='444')
  self.mox.ReplayAll()

  self.assertEqual(0, SyndicatedPost.query().count())
  self.check_discover('http://si.te/123',
                      'Discovering now. Refresh in a minute to see the results!')

  self.assertCountEqual([
    {'https://fa.ke/222': 'http://si.te/123'},
    {'https://fa.ke/post/444': 'http://si.te/123'},
  ], [{sp.syndication: sp.original} for sp in models.SyndicatedPost.query()])

  now = util.now_fn()
  source = self.source.key.get()
  self.assertEqual(now, source.last_syndication_url)

def test_no_h_entries(self):
  """Make sure nothing bad happens when fetching a feed without h-entries.
  """
  activity = self.activities[0]
  activity['object']['content'] = 'post content without backlink'
  activity['object']['url'] = 'https://fa.ke/post/url'

  # silo domain is fa.ke
  source = self.sources[0]
  source.domain_urls = ['http://author']

  self.expect_requests_get('http://author', """
  <html class="h-feed">
    <p>under construction</p>
  </html>""")

  self.mox.ReplayAll()
  logging.debug('Original post discovery %s -> %s', source, activity)
  original_post_discovery.discover(source, activity)

  self.assert_equals(
    [(None, 'https://fa.ke/post/url')],
    [(relationship.original, relationship.syndication)
     for relationship in SyndicatedPost.query(ancestor=source.key)])

def test_do_not_fetch_hfeed(self):
  """Confirms behavior of discover() when fetch_hfeed=False.

  Discovery should only check the database for previously discovered
  matches. It should not make any GET requests.
  """
  discover(self.source, self.activity, fetch_hfeed=False)
  self.assertFalse(SyndicatedPost.query(ancestor=self.source.key).get())

def test_no_author_url(self):
  """Make sure something reasonable happens when the author doesn't have
  a url at all.
  """
  self.source.domain_urls = []
  discover(self.source, self.activity)
  # nothing attempted, and no SyndicatedPost saved
  self.assertFalse(SyndicatedPost.query(ancestor=self.source.key).get())

def test_merge_front_page_and_h_feed(self):
  """Make sure we are correctly merging the front page and rel-feed by
  checking that we visit h-entries that appear only on the front page or
  only on the rel-feed page.
  """
  activity = self.activities[0]
  activity['object'].update({
    'content': 'post content without backlink',
    'url': 'https://fa.ke/post/url',
    'upstreamDuplicates': ['existing uD'],
  })

  # silo domain is fa.ke
  source = self.sources[0]
  source.domain_urls = ['http://author']

  self.expect_requests_get('http://author', """
  <link rel="feed" href="/feed">
  <html class="h-feed">
    <div class="h-entry">
      <a class="u-url" href="http://author/only-on-frontpage"></a>
    </div>
    <div class="h-entry">
      <a class="u-url" href="http://author/on-both"></a>
    </div>
  </html>""")

  self.expect_requests_get('http://author/feed', """
  <link rel="feed" href="/feed">
  <html class="h-feed">
    <div class="h-entry">
      <a class="u-url" href="http://author/on-both"></a>
    </div>
    <div class="h-entry">
      <a class="u-url" href="http://author/only-on-feed"></a>
    </div>
  </html>""")

  for orig in ('/only-on-frontpage', '/on-both', '/only-on-feed'):
    self.expect_requests_get('http://author%s' % orig,
                             """<div class="h-entry">
                             <a class="u-url" href="%s"></a>
                             </div>""" % orig).InAnyOrder()

  self.mox.ReplayAll()
  logging.debug('Original post discovery %s -> %s', source, activity)
  original_post_discovery.discover(source, activity)

  # should be three blank SyndicatedPosts now
  for orig in ('http://author/only-on-frontpage',
               'http://author/on-both',
               'http://author/only-on-feed'):
    logging.debug('checking %s', orig)
    sp = SyndicatedPost.query(
      SyndicatedPost.original == orig,
      ancestor=source.key).get()
    self.assertTrue(sp)
    self.assertIsNone(sp.syndication)

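# The merge behavior exercised above is implemented by a _merge_hfeeds helper in
# original_post_discovery, which isn't shown in this file. A minimal sketch of
# the idea, assuming entries are deduplicated by their u-url properties (the
# name and exact dedup rule here are assumptions):
def _merge_hfeeds_sketch(feed1, feed2):
  """Combine the h-entries from two h-feeds, skipping items in feed2 whose
  u-url already appears in feed1."""
  seen = set()
  for item in feed1:
    seen.update(item.get('properties', {}).get('url', []))
  return feed1 + [item for item in feed2
                  if not seen.intersection(item.get('properties', {}).get('url', []))]
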
def test_insert_replaces_blanks(self):
  """Make sure we replace original=None with original=something
  when it is discovered."""
  # add a blank for the original too
  SyndicatedPost.insert_original_blank(
    self.source, 'http://original/newly-discovered')
  self.assertTrue(
    SyndicatedPost.query(
      SyndicatedPost.syndication == 'http://silo/no-original',
      SyndicatedPost.original == None, ancestor=self.source.key).get())
  self.assertTrue(
    SyndicatedPost.query(
      SyndicatedPost.original == 'http://original/newly-discovered',
      SyndicatedPost.syndication == None, ancestor=self.source.key).get())

  r = SyndicatedPost.insert(self.source, 'http://silo/no-original',
                            'http://original/newly-discovered')
  self.assertIsNotNone(r)
  self.assertEqual('http://original/newly-discovered', r.original)

  # make sure it's in NDB
  rs = SyndicatedPost.query(
    SyndicatedPost.syndication == 'http://silo/no-original',
    ancestor=self.source.key).fetch()
  self.assertEqual(1, len(rs))
  self.assertEqual('http://original/newly-discovered', rs[0].original)
  self.assertEqual('http://silo/no-original', rs[0].syndication)

  # and the blanks have been removed
  self.assertFalse(
    SyndicatedPost.query(
      SyndicatedPost.syndication == 'http://silo/no-original',
      SyndicatedPost.original == None, ancestor=self.source.key).get())
  self.assertFalse(
    SyndicatedPost.query(
      SyndicatedPost.original == 'http://original/newly-discovered',
      SyndicatedPost.syndication == None, ancestor=self.source.key).get())

def test_insert_replaces_blanks(self):
  """Make sure we replace original=None with original=something
  when it is discovered."""
  # add a blank for the original too
  SyndicatedPost.insert_original_blank(
    self.source, 'http://original/newly-discovered')
  self.assertTrue(
    SyndicatedPost.query(
      SyndicatedPost.syndication == 'http://silo/no-original',
      SyndicatedPost.original == None, ancestor=self.source.key).get())
  self.assertTrue(
    SyndicatedPost.query(
      SyndicatedPost.original == 'http://original/newly-discovered',
      SyndicatedPost.syndication == None, ancestor=self.source.key).get())

  r = SyndicatedPost.insert(
    self.source, 'http://silo/no-original',
    'http://original/newly-discovered')
  self.assertIsNotNone(r)
  self.assertEquals('http://original/newly-discovered', r.original)

  # make sure it's in NDB
  rs = SyndicatedPost.query(
    SyndicatedPost.syndication == 'http://silo/no-original',
    ancestor=self.source.key).fetch()
  self.assertEquals(1, len(rs))
  self.assertEquals('http://original/newly-discovered', rs[0].original)
  self.assertEquals('http://silo/no-original', rs[0].syndication)

  # and the blanks have been removed
  self.assertFalse(
    SyndicatedPost.query(
      SyndicatedPost.syndication == 'http://silo/no-original',
      SyndicatedPost.original == None, ancestor=self.source.key).get())
  self.assertFalse(
    SyndicatedPost.query(
      SyndicatedPost.original == 'http://original/newly-discovered',
      SyndicatedPost.syndication == None, ancestor=self.source.key).get())

def _posse_post_discovery(source, activity, syndication_url, fetch_hfeed,
                          already_fetched_hfeeds):
  """Performs the actual meat of the posse-post-discovery.

  Args:
    source: :class:`models.Source` subclass
    activity: activity dict
    syndication_url: url of the syndicated copy for which we are trying
      to find an original
    fetch_hfeed: boolean, whether or not to fetch and parse the author's
      feed if we don't have a previously stored relationship
    already_fetched_hfeeds: set, URLs we've already fetched in a
      previous iteration

  Return:
    sequence of string original post urls, possibly empty
  """
  logging.info('starting posse post discovery with syndicated %s',
               syndication_url)

  relationships = SyndicatedPost.query(
    SyndicatedPost.syndication == syndication_url,
    ancestor=source.key).fetch()

  if not relationships and fetch_hfeed:
    # a syndicated post we haven't seen before! fetch the author's URLs to see
    # if we can find it.
    #
    # TODO: Consider using the actor's url, with get_author_urls() as the
    # fallback in the future to support content from non-Bridgy users.
    results = {}
    for url in _get_author_urls(source):
      if url not in already_fetched_hfeeds:
        results.update(_process_author(source, url))
        already_fetched_hfeeds.add(url)
      else:
        logging.debug('skipping %s, already fetched this round', url)
    relationships = results.get(syndication_url, [])

  if not relationships:
    # No relationships were found. Remember that we've seen this
    # syndicated post to avoid reprocessing it every time
    logging.debug('posse post discovery found no relationship for %s',
                  syndication_url)
    if fetch_hfeed:
      SyndicatedPost.insert_syndication_blank(source, syndication_url)

  originals = [r.original for r in relationships if r.original]
  if originals:
    logging.debug('posse post discovery found relationship(s) %s -> %s',
                  syndication_url, originals)
  return originals

def test_get_or_insert_by_syndication_do_not_duplicate_blanks(self):
  """Make sure we don't insert duplicate blank entries."""
  SyndicatedPost.insert_syndication_blank(self.source, 'http://silo/no-original')

  # make sure there's only one in the DB
  rs = SyndicatedPost.query(
    SyndicatedPost.syndication == 'http://silo/no-original',
    ancestor=self.source.key).fetch()

  self.assertCountEqual([None], [rel.original for rel in rs])

def test_get_or_insert_by_syndication_do_not_duplicate_blanks(self):
  """Make sure we don't insert duplicate blank entries."""
  SyndicatedPost.insert_syndication_blank(
    self.source, 'http://silo/no-original')

  # make sure there's only one in the DB
  rs = SyndicatedPost.query(
    SyndicatedPost.syndication == 'http://silo/no-original',
    ancestor=self.source.key).fetch()

  self.assertItemsEqual([None], [rel.original for rel in rs])

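# For context, the "blank" helpers used in the two tests above record a
# syndication URL we've seen but couldn't pair with an original yet, so that
# refetches don't re-add it. The real implementation lives in models.py and
# isn't shown here; this is only a minimal sketch of the idea, assuming the
# write is skipped when a row for that syndication URL already exists.
def insert_syndication_blank_sketch(source, syndication):
  """Store syndication -> None at most once per source."""
  if not SyndicatedPost.query(SyndicatedPost.syndication == syndication,
                              ancestor=source.key).get():
    SyndicatedPost(parent=source.key, original=None, syndication=syndication).put()
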
def _posse_post_discovery(source, activity, syndication_url, fetch_hfeed):
  """Performs the actual meat of the posse-post-discovery.

  Args:
    source: models.Source subclass
    activity: activity dict
    syndication_url: url of the syndicated copy for which we are trying
      to find an original
    fetch_hfeed: boolean, whether or not to fetch and parse the author's
      feed if we don't have a previously stored relationship.

  Return:
    the activity, updated with original post urls if any are found
  """
  logging.info('starting posse post discovery with syndicated %s',
               syndication_url)

  relationships = SyndicatedPost.query(
    SyndicatedPost.syndication == syndication_url,
    ancestor=source.key).fetch()
  if not relationships and fetch_hfeed:
    # a syndicated post we haven't seen before! fetch the author's URLs to see
    # if we can find it.
    #
    # Use source.domain_urls for now; it seems more reliable than the
    # activity.actor.url (which depends on getting the right data back from
    # various APIs). Consider using the actor's url, with domain_urls as the
    # fallback in the future to support content from non-Bridgy users.
    results = {}
    for url in source.get_author_urls():
      results.update(_process_author(source, url))
    relationships = results.get(syndication_url)

  if not relationships:
    # No relationships were found. Remember that we've seen this
    # syndicated post to avoid reprocessing it every time
    logging.debug('posse post discovery found no relationship for %s',
                  syndication_url)
    if fetch_hfeed:
      SyndicatedPost.insert_syndication_blank(source, syndication_url)
    return activity

  logging.debug('posse post discovery found relationship(s) %s -> %s',
                syndication_url,
                '; '.join(unicode(r.original) for r in relationships))

  obj = activity.get('object') or activity
  obj.setdefault('upstreamDuplicates', []).extend(
    r.original for r in relationships if r.original)

  return activity

def test_no_author_url(self):
  """Make sure something reasonable happens when the author doesn't have
  a url at all.
  """
  source = self.sources[0]
  source.domain_urls = []
  activity = self.activities[0]
  activity['object']['url'] = 'https://fa.ke/post/url'
  activity['object']['content'] = 'content without links'
  self.mox.ReplayAll()

  original_post_discovery.discover(source, activity)

  # nothing attempted, and no SyndicatedPost saved
  self.assertFalse(SyndicatedPost.query(ancestor=source.key).get())

def test_insert_no_duplicates(self):
  """Make sure we don't insert duplicate entries."""
  r = SyndicatedPost.insert(self.source, 'http://silo/post/url',
                            'http://original/post/url')
  self.assertIsNotNone(r)
  self.assertEqual('http://original/post/url', r.original)

  # make sure there's only one in the DB
  rs = SyndicatedPost.query(
    SyndicatedPost.syndication == 'http://silo/post/url',
    SyndicatedPost.original == 'http://original/post/url',
    ancestor=self.source.key).fetch()
  self.assertEqual(1, len(rs))

def _posse_post_discovery(source, activity, author_url, syndication_url,
                          fetch_hfeed):
  """Performs the actual meat of the posse-post-discovery. It was split
  out from discover() so that it can be done inside of a transaction.

  Args:
    source: models.Source subclass
    activity: activity dict
    author_url: author's url configured in their silo profile
    syndication_url: url of the syndicated copy for which we are trying
      to find an original
    fetch_hfeed: boolean, whether or not to fetch and parse the author's
      feed if we don't have a previously stored relationship.

  Return:
    the activity, updated with original post urls if any are found
  """
  logging.info(
    'starting posse post discovery with author %s and syndicated %s',
    author_url, syndication_url)

  relationships = SyndicatedPost.query(
    SyndicatedPost.syndication == syndication_url,
    ancestor=source.key).fetch()
  if not relationships and fetch_hfeed:
    # a syndicated post we haven't seen before! fetch the author's
    # h-feed to see if we can find it.
    results = _process_author(source, author_url)
    relationships = results.get(syndication_url)

  if not relationships:
    # No relationships were found. Remember that we've seen this
    # syndicated post to avoid reprocessing it every time
    logging.debug('posse post discovery found no relationship for %s',
                  syndication_url)
    SyndicatedPost.insert_syndication_blank(source, syndication_url)
    return activity

  logging.debug('posse post discovery found relationship(s) %s -> %s',
                syndication_url,
                '; '.join(str(r.original) for r in relationships))

  obj = activity.get('object') or activity
  obj.setdefault('upstreamDuplicates', []).extend(
    r.original for r in relationships if r.original)

  return activity

def test_insert_no_duplicates(self):
  """Make sure we don't insert duplicate entries."""
  r = SyndicatedPost.insert(
    self.source, 'http://silo/post/url', 'http://original/post/url')
  self.assertIsNotNone(r)
  self.assertEqual('http://original/post/url', r.original)

  # make sure there's only one in the DB
  rs = SyndicatedPost.query(
    SyndicatedPost.syndication == 'http://silo/post/url',
    SyndicatedPost.original == 'http://original/post/url',
    ancestor=self.source.key).fetch()
  self.assertEqual(1, len(rs))

def test_get_or_insert_by_syndication_replace(self):
  """Make sure we replace original=None with original=something
  when it is discovered."""
  r = SyndicatedPost.get_or_insert_by_syndication_url(
    self.source, 'http://silo/no-original',
    'http://original/newly-discovered')
  self.assertIsNotNone(r)
  self.assertEquals('http://original/newly-discovered', r.original)

  # make sure it's in NDB
  rs = SyndicatedPost.query(
    SyndicatedPost.syndication == 'http://silo/no-original',
    ancestor=self.source.key).fetch()
  self.assertEquals(1, len(rs))
  self.assertEquals('http://original/newly-discovered', rs[0].original)
  self.assertEquals('http://silo/no-original', rs[0].syndication)

def test_refetch_unchanged_syndication(self):
  """We should preserve unchanged SyndicatedPosts during refetches."""
  synd = SyndicatedPost(parent=self.source.key,
                        original='http://author/permalink',
                        syndication='https://fa.ke/post/url')
  synd.put()
  self.expect_requests_get('http://author', """
  <html class="h-feed">
    <div class="h-entry">
      <a class="u-url" href="/permalink"></a>
      <a class="u-syndication" href="https://fa.ke/post/url"></a>
    </div>
  </html>""")

  self.mox.ReplayAll()
  refetch(self.source)
  self.assert_entities_equal([synd], list(SyndicatedPost.query()))

def test_refetch_two_permalinks_same_syndication(self):
  """This causes a problem if refetch assumes that syndication-url is
  unique under a given source.
  """
  source = self.sources[0]
  source.domain_urls = ['http://author']

  self.activities[0]['object'].update({
    'content': 'post content without backlinks',
    'url': 'https://fa.ke/post/url',
  })

  hfeed = """<html class="h-feed">
  <a class="h-entry" href="/post1"></a>
  <a class="h-entry" href="/post2"></a>
  </html>"""

  self.expect_requests_get('http://author', hfeed)

  for i in range(2):
    self.expect_requests_get(
      'http://author/post%d' % (i + 1),
      """<html class="h-entry">
      <a class="u-url" href="/post%d"></a>
      <a class="u-syndication" href="https://fa.ke/post/url"></a>
      </html>""" % (i + 1))

  # refetch should only grab the feed
  self.expect_requests_get('http://author', hfeed)
  self.mox.ReplayAll()

  activity = original_post_discovery.discover(source, self.activities[0])
  self.assertItemsEqual(['http://author/post1', 'http://author/post2'],
                        activity['object'].get('upstreamDuplicates'))

  relations = SyndicatedPost.query(ancestor=source.key).fetch()
  self.assertItemsEqual([('http://author/post1', 'https://fa.ke/post/url'),
                         ('http://author/post2', 'https://fa.ke/post/url')],
                        [(relation.original, relation.syndication)
                         for relation in relations])

  # discover should have already handled all relationships, refetch should
  # not find anything
  refetch_result = original_post_discovery.refetch(source)
  self.assertFalse(refetch_result)

def test_insert_auguments_existing(self):
  """Make sure we add newly discovered urls for a given syndication url,
  rather than overwrite them.
  """
  r = SyndicatedPost.insert(self.source, 'http://silo/post/url',
                            'http://original/different/url')
  self.assertIsNotNone(r)
  self.assertEqual('http://original/different/url', r.original)

  # make sure they're both in the DB
  rs = SyndicatedPost.query(
    SyndicatedPost.syndication == 'http://silo/post/url',
    ancestor=self.source.key).fetch()

  self.assertCountEqual(['http://original/post/url',
                         'http://original/another/post',
                         'http://original/different/url'],
                        [rel.original for rel in rs])

def test_refetch_permalink_with_two_syndications(self):
  """Test one permalink with two syndicated posts. Make sure that
  refetch doesn't have a problem with two entries for the same
  original URL.
  """
  for idx, activity in enumerate(self.activities):
    activity['object'].update({
      'content': 'post content without backlinks',
      'url': 'https://fa.ke/post/url%d' % (idx + 1),
    })

  hfeed = """<html class="h-feed">
  <a class="h-entry" href="/permalink"></a>
  </html>"""
  hentry = """<html class="h-entry">
  <a class="u-url" href="/permalink"/>
  <a class="u-syndication" href="https://fa.ke/post/url1"/>
  <a class="u-syndication" href="https://fa.ke/post/url3"/>
  <a class="u-syndication" href="https://fa.ke/post/url5"/>
  </html>"""

  self.expect_requests_get('http://author', hfeed)
  self.expect_requests_get('http://author/permalink', hentry)

  # refetch
  self.expect_requests_get('http://author', hfeed)
  # refetch grabs posts that it's seen before in case there have
  # been updates
  self.expect_requests_get('http://author/permalink', hentry)
  self.mox.ReplayAll()

  original_post_discovery.discover(self.source, self.activities[0])
  relations = SyndicatedPost.query(
    SyndicatedPost.original == 'http://author/permalink',
    ancestor=self.source.key).fetch()
  self.assertItemsEqual(
    [('http://author/permalink', 'https://fa.ke/post/url1'),
     ('http://author/permalink', 'https://fa.ke/post/url3'),
     ('http://author/permalink', 'https://fa.ke/post/url5')],
    [(r.original, r.syndication) for r in relations])

  results = original_post_discovery.refetch(self.source)
  self.assertFalse(results)

def test_refetch_changed_syndication(self):
  """Update syndication links that have changed since our last fetch."""
  SyndicatedPost(parent=self.source.key,
                 original='http://author/permalink',
                 syndication='https://fa.ke/post/url').put()
  self.expect_requests_get('http://author', """
  <html class="h-feed">
    <div class="h-entry">
      <a class="u-url" href="/permalink"></a>
      <a class="u-syndication" href="http://fa.ke/changed/url"></a>
    </div>
  </html>""")

  self.mox.ReplayAll()
  results = refetch(self.source)
  self.assert_syndicated_posts(
    ('http://author/permalink', 'https://fa.ke/changed/url'))
  self.assert_equals({'https://fa.ke/changed/url': list(SyndicatedPost.query())},
                     results)

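# assert_syndicated_posts above is a test helper that isn't shown in this file.
# A minimal sketch of what it could look like, assuming it compares the expected
# (original, syndication) pairs against everything stored for this source (the
# name suffix and exact comparison are assumptions):
def assert_syndicated_posts_sketch(self, *expected):
  """Assert the datastore holds exactly the given (original, syndication) pairs."""
  actual = [(sp.original, sp.syndication)
            for sp in SyndicatedPost.query(ancestor=self.source.key)]
  self.assertCountEqual(expected, actual)
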
def test_get_or_insert_by_syndication_do_not_replace(self):
  """Make sure we don't replace original=something with original=something
  else (in practice, that would mean another task is running discovery
  concurrently and found a different url).
  """
  r = SyndicatedPost.get_or_insert_by_syndication_url(
    self.source, 'http://silo/post/url', 'http://original/different/url')
  self.assertIsNotNone(r)
  self.assertEquals('http://original/post/url', r.original)

  # make sure it's unchanged in NDB
  rs = SyndicatedPost.query(
    SyndicatedPost.syndication == 'http://silo/post/url',
    ancestor=self.source.key).fetch()
  self.assertEquals(1, len(rs))
  self.assertEquals('http://original/post/url', rs[0].original)
  self.assertEquals('http://silo/post/url', rs[0].syndication)

def test_insert_auguments_existing(self):
  """Make sure we add newly discovered urls for a given syndication url,
  rather than overwrite them.
  """
  r = SyndicatedPost.insert(
    self.source, 'http://silo/post/url', 'http://original/different/url')
  self.assertIsNotNone(r)
  self.assertEquals('http://original/different/url', r.original)

  # make sure they're both in the DB
  rs = SyndicatedPost.query(
    SyndicatedPost.syndication == 'http://silo/post/url',
    ancestor=self.source.key).fetch()

  self.assertItemsEqual(['http://original/post/url',
                         'http://original/another/post',
                         'http://original/different/url'],
                        [rel.original for rel in rs])

def test_refetch_blank_syndication(self):
  """We should preserve blank SyndicatedPosts during refetches."""
  blank = SyndicatedPost(parent=self.source.key,
                         original='http://author/permalink',
                         syndication=None)
  blank.put()
  self.expect_requests_get('http://author', """
  <html class="h-feed">
    <div class="h-entry">
      <a class="u-url" href="/permalink" />
    </div>
  </html>""")
  self.expect_requests_get('http://author/permalink', """
  <html class="h-entry">
    <a class="u-url" href="/permalink"></a>
  </html>""")

  self.mox.ReplayAll()
  self.assert_equals({}, original_post_discovery.refetch(self.source))
  self.assert_syndicated_posts(('http://author/permalink', None))
  self.assert_entities_equal([blank], list(SyndicatedPost.query()))

def test_invalid_webmention_target(self):
  """Confirm that no additional requests are made if the author url is
  an invalid webmention target. Right now this pretty much just means
  they're on the blacklist. Eventually we want to filter out targets
  that don't have certain features, like a webmention endpoint or
  microformats.
  """
  source = self.sources[0]
  source.domain_urls = ['http://amazon.com']
  activity = self.activities[0]
  activity['object']['url'] = 'https://fa.ke/post/url'
  activity['object']['content'] = 'content without links'

  self.mox.ReplayAll()
  logging.debug('Original post discovery %s -> %s', source, activity)
  original_post_discovery.discover(source, activity)

  # nothing attempted, but we should have saved a placeholder to prevent us
  # from trying again
  self.assert_equals(
    [(None, 'https://fa.ke/post/url')],
    [(relationship.original, relationship.syndication)
     for relationship in SyndicatedPost.query(ancestor=source.key)])

def _test_failed_domain_url_fetch(self, raise_exception):
  """Make sure something reasonable happens when the author's domain url
  gives an unexpected response.
  """
  source = self.sources[0]
  source.domain_urls = ['http://author']
  activity = self.activities[0]
  activity['object']['url'] = 'https://fa.ke/post/url'
  activity['object']['content'] = 'content without links'

  if raise_exception:
    self.expect_requests_get('http://author').AndRaise(HTTPError())
  else:
    self.expect_requests_get('http://author', status_code=404)

  self.mox.ReplayAll()
  original_post_discovery.discover(source, activity)

  # nothing attempted, but we should have saved a placeholder to prevent us
  # from trying again
  self.assert_equals(
    [(None, 'https://fa.ke/post/url')],
    [(relationship.original, relationship.syndication)
     for relationship in SyndicatedPost.query(ancestor=source.key)])

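# As with the permalink helper earlier, this helper would typically be driven by
# two thin wrappers, one per failure mode. The test names below are assumptions
# for illustration, not taken from this file.
def test_failed_domain_url_fetch_error(self):
  # author page fetch raises an HTTPError
  self._test_failed_domain_url_fetch(raise_exception=True)

def test_failed_domain_url_fetch_404(self):
  # author page fetch returns an error status code
  self._test_failed_domain_url_fetch(raise_exception=False)
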
def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of models.Source
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank SyndicatedPosts when
      we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new models.SyndicatedPost
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  # TODO skip sites we know don't have microformats2 markup
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  try:
    logging.debug('fetching author url %s', author_url)
    author_resp = util.requests_get(author_url)
    # TODO for error codes that indicate a temporary error, should we make
    # a certain number of retries before giving up forever?
    author_resp.raise_for_status()
    author_dom = BeautifulSoup(author_resp.text)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logging.warning('Could not fetch author url %s', author_url, exc_info=True)
    return {}

  feeditems = _find_feed_items(author_url, author_dom)

  # look for all other feed urls using rel='feed', type='text/html'
  feed_urls = set()
  for rel_feed_node in (author_dom.find_all('link', rel='feed')
                        + author_dom.find_all('a', rel='feed')):
    feed_url = rel_feed_node.get('href')
    if not feed_url:
      continue

    feed_url = urlparse.urljoin(author_url, feed_url)
    feed_type = rel_feed_node.get('type')
    if not feed_type:
      # type is not specified, use this to confirm that it's text/html
      feed_url, _, feed_type_ok = util.get_webmention_target(feed_url)
    else:
      feed_type_ok = feed_type == 'text/html'

    if feed_url == author_url:
      logging.debug('author url is the feed url, ignoring')
    elif not feed_type_ok:
      logging.debug('skipping feed of type %s', feed_type)
    else:
      feed_urls.add(feed_url)

  for feed_url in feed_urls:
    try:
      logging.debug("fetching author's rel-feed %s", feed_url)
      feed_resp = util.requests_get(feed_url)
      feed_resp.raise_for_status()
      logging.debug("author's rel-feed fetched successfully %s", feed_url)
      feeditems = _merge_hfeeds(feeditems,
                                _find_feed_items(feed_url, feed_resp.text))

      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logging.info('rel-feed found new domain %s! adding to source', domain)
          domains.append(domain)

    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logging.warning('Could not fetch h-feed url %s.', feed_url, exc_info=True)

  permalink_to_entry = {}
  for child in feeditems:
    if 'h-entry' in child['type']:
      # TODO maybe limit to first ~30 entries? (do that here rather than
      # below because we want the *first* n entries)
      for permalink in child['properties'].get('url', []):
        if isinstance(permalink, basestring):
          permalink_to_entry[permalink] = child
        else:
          logging.warn('unexpected non-string "url" property: %s', permalink)

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(
      SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
      ancestor=source.key)
    for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  results = {}
  for permalink, entry in permalink_to_entry.iteritems():
    logging.debug('processing permalink: %s', permalink)
    new_results = _process_entry(
      source, permalink, entry, refetch, preexisting.get(permalink, []),
      store_blanks=store_blanks)
    for key, value in new_results.iteritems():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    now = util.now_fn()
    logging.debug('updating source last_syndication_url %s', now)
    source.updates['last_syndication_url'] = now

  return results

def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  logger.debug(f'fetching author url {author_url}')
  try:
    author_mf2 = util.fetch_mf2(author_url)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logger.info(f'Could not fetch author url {author_url}', exc_info=True)
    return {}

  feeditems = _find_feed_items(author_mf2)

  # try rel=feeds and rel=alternates
  feed_urls = set()
  candidates = (author_mf2['rels'].get('feed', []) +
                [a.get('url') for a in author_mf2.get('alternates', [])
                 if a.get('type') == MF2_HTML_MIME_TYPE])
  for feed_url in candidates:
    # check that it's html, not too big, etc
    feed_url, _, feed_ok = util.get_webmention_target(feed_url)
    if feed_url == author_url:
      logger.debug('author url is the feed url, ignoring')
    elif not feed_ok:
      logger.debug("skipping feed since it's not HTML or otherwise bad")
    else:
      feed_urls.add(feed_url)

  for feed_url in feed_urls:
    try:
      logger.debug(f"fetching author's rel-feed {feed_url}")
      feed_mf2 = util.fetch_mf2(feed_url)
      feeditems = _merge_hfeeds(feeditems, _find_feed_items(feed_mf2))
      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logger.info(f'rel-feed found new domain {domain}! adding to source')
          domains.append(domain)

    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logger.info(f'Could not fetch h-feed url {feed_url}.', exc_info=True)

  # sort by dt-updated/dt-published
  def updated_or_published(item):
    props = microformats2.first_props(item.get('properties'))
    return props.get('updated') or props.get('published') or ''

  feeditems.sort(key=updated_or_published, reverse=True)

  permalink_to_entry = collections.OrderedDict()
  for child in feeditems:
    if 'h-entry' in child['type']:
      permalinks = child['properties'].get('url', [])
      if not permalinks:
        logger.debug('ignoring h-entry with no u-url!')
      for permalink in permalinks:
        if isinstance(permalink, str):
          permalink_to_entry[permalink] = child
        else:
          logger.warning(f'unexpected non-string "url" property: {permalink}')

    max = (MAX_PERMALINK_FETCHES_BETA if source.is_beta_user()
           else MAX_PERMALINK_FETCHES)
    if len(permalink_to_entry) >= max:
      logger.info(f'Hit cap of {max} permalinks. Stopping.')
      break

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(
      SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
      ancestor=source.key)
    for i in range(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  results = {}
  for permalink, entry in permalink_to_entry.items():
    logger.debug(f'processing permalink: {permalink}')
    new_results = process_entry(source, permalink, entry, refetch,
                                preexisting.get(permalink, []),
                                store_blanks=store_blanks)
    for key, value in new_results.items():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    source.updates['last_syndication_url'] = util.now_fn()

  return results

def _process_author(source, author_url, refetch=False, store_blanks=True):
  """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
  # for now use whether the url is a valid webmention target
  # as a proxy for whether it's worth searching it.
  author_url, _, ok = util.get_webmention_target(author_url)
  if not ok:
    return {}

  try:
    logging.debug('fetching author url %s', author_url)
    author_resp = util.requests_get(author_url)
    # TODO for error codes that indicate a temporary error, should we make
    # a certain number of retries before giving up forever?
    author_resp.raise_for_status()
    author_dom = util.beautifulsoup_parse(author_resp.text)
  except AssertionError:
    raise  # for unit tests
  except BaseException:
    # TODO limit allowed failures, cache the author's h-feed url
    # or the # of times we've failed to fetch it
    logging.info('Could not fetch author url %s', author_url, exc_info=True)
    return {}

  feeditems = _find_feed_items(author_url, author_dom)

  # look for all other feed urls using rel='feed', type='text/html'
  feed_urls = set()
  for rel_feed_node in (author_dom.find_all('link', rel='feed')
                        + author_dom.find_all('a', rel='feed')):
    feed_url = rel_feed_node.get('href')
    if not feed_url:
      continue

    feed_url = urlparse.urljoin(author_url, feed_url)
    feed_type = rel_feed_node.get('type')
    if feed_type and feed_type != 'text/html':
      feed_ok = False
    else:
      # double check that it's text/html, not too big, etc
      feed_url, _, feed_ok = util.get_webmention_target(feed_url)

    if feed_url == author_url:
      logging.debug('author url is the feed url, ignoring')
    elif not feed_ok:
      logging.debug('skipping feed of type %s', feed_type)
    else:
      feed_urls.add(feed_url)

  for feed_url in feed_urls:
    try:
      logging.debug("fetching author's rel-feed %s", feed_url)
      feed_resp = util.requests_get(feed_url)
      feed_resp.raise_for_status()
      logging.debug("author's rel-feed fetched successfully %s", feed_url)
      feeditems = _merge_hfeeds(feeditems,
                                _find_feed_items(feed_url, feed_resp.text))

      domain = util.domain_from_link(feed_url)
      if source.updates is not None and domain not in source.domains:
        domains = source.updates.setdefault('domains', source.domains)
        if domain not in domains:
          logging.info('rel-feed found new domain %s! adding to source', domain)
          domains.append(domain)

    except AssertionError:
      raise  # reraise assertions for unit tests
    except BaseException:
      logging.info('Could not fetch h-feed url %s.', feed_url, exc_info=True)

  # sort by dt-updated/dt-published
  def updated_or_published(item):
    props = microformats2.first_props(item.get('properties'))
    return props.get('updated') or props.get('published')

  feeditems.sort(key=updated_or_published, reverse=True)

  permalink_to_entry = collections.OrderedDict()
  for child in feeditems:
    if 'h-entry' in child['type']:
      permalinks = child['properties'].get('url', [])
      if not permalinks:
        logging.debug('ignoring h-entry with no u-url!')
      for permalink in permalinks:
        if isinstance(permalink, basestring):
          permalink_to_entry[permalink] = child
        else:
          logging.warn('unexpected non-string "url" property: %s', permalink)

    max = (MAX_PERMALINK_FETCHES_BETA if source.is_beta_user()
           else MAX_PERMALINK_FETCHES)
    if len(permalink_to_entry) >= max:
      logging.info('Hit cap of %d permalinks. Stopping.', max)
      break

  # query all preexisting permalinks at once, instead of once per link
  permalinks_list = list(permalink_to_entry.keys())
  # fetch the maximum allowed entries (currently 30) at a time
  preexisting_list = itertools.chain.from_iterable(
    SyndicatedPost.query(
      SyndicatedPost.original.IN(permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
      ancestor=source.key)
    for i in xrange(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
  preexisting = {}
  for r in preexisting_list:
    preexisting.setdefault(r.original, []).append(r)

  results = {}
  for permalink, entry in permalink_to_entry.iteritems():
    logging.debug('processing permalink: %s', permalink)
    new_results = process_entry(source, permalink, entry, refetch,
                                preexisting.get(permalink, []),
                                store_blanks=store_blanks)
    for key, value in new_results.iteritems():
      results.setdefault(key, []).extend(value)

  if source.updates is not None and results:
    # keep track of the last time we've seen rel=syndication urls for
    # this author. this helps us decide whether to refetch periodically
    # and look for updates.
    # Source will be saved at the end of each round of polling
    source.updates['last_syndication_url'] = util.now_fn()

  return results

def test_refetch_hfeed(self):
  """refetch should grab resources again, even if they were previously
  marked with a blank SyndicatedPost.
  """
  source = self.sources[0]
  source.domain_urls = ['http://author']

  # refetch 1 and 3 to see if they've been updated, 2 has already
  # been resolved for this source
  SyndicatedPost(parent=source.key,
                 original='http://author/permalink1',
                 syndication=None).put()
  SyndicatedPost(parent=source.key,
                 original='http://author/permalink2',
                 syndication='https://fa.ke/post/url2').put()
  SyndicatedPost(parent=source.key,
                 original='http://author/permalink3',
                 syndication=None).put()

  self.expect_requests_get('http://author', """
  <html class="h-feed">
    <a class="h-entry" href="/permalink1"></a>
    <a class="h-entry" href="/permalink2"></a>
    <a class="h-entry" href="/permalink3"></a>
  </html>""")

  # yay, permalink1 has an updated syndication url
  self.expect_requests_get('http://author/permalink1', """
  <html class="h-entry">
    <a class="u-url" href="/permalink1"></a>
    <a class="u-syndication" href="https://fa.ke/post/url1"></a>
  </html>""").InAnyOrder()

  # permalink3 hasn't changed since we first checked it
  self.expect_requests_get('http://author/permalink3', """
  <html class="h-entry">
    <a class="u-url" href="/permalink3"></a>
  </html>""").InAnyOrder()

  self.mox.ReplayAll()
  original_post_discovery.refetch(source)

  relationships1 = SyndicatedPost.query(
    SyndicatedPost.original == 'http://author/permalink1',
    ancestor=source.key).fetch()
  self.assertTrue(relationships1)
  self.assertEquals('https://fa.ke/post/url1', relationships1[0].syndication)

  relationships2 = SyndicatedPost.query(
    SyndicatedPost.original == 'http://author/permalink2',
    ancestor=source.key).fetch()

  # this shouldn't have changed
  self.assertTrue(relationships2)
  self.assertEquals('https://fa.ke/post/url2', relationships2[0].syndication)

  relationships3 = SyndicatedPost.query(
    SyndicatedPost.original == 'http://author/permalink3',
    ancestor=source.key).fetch()
  self.assertTrue(relationships3)
  self.assertIsNone(relationships3[0].syndication)
