def test_query_by_syndication_url(self):
    """Simply testing the query helper.

    Covers both a syndication URL that maps to an original and one that
    was stored with a blank original.
    """
    # A syndication URL with a stored original should come back with it.
    mapped = SyndicatedPost.query_by_syndication(
        self.source, 'http://silo/post/url')
    self.assertIsNotNone(mapped)
    self.assertEquals('http://original/post/url', mapped.original)

    # A syndication URL recorded without an original still returns a
    # (blank) record rather than nothing.
    blank = SyndicatedPost.query_by_syndication(
        self.source, 'http://silo/no-original')
    self.assertIsNotNone(blank)
    self.assertIsNone(blank.original)
def _posse_post_discovery(source, activity, author_url, syndication_url,
                          fetch_hfeed):
    """Performs the actual meat of the posse-post-discover. It was split
    out from discover() so that it can be done inside of a transaction.

    Args:
      source: models.Source subclass
      activity: activity dict
      author_url: author's url configured in their silo profile
      syndication_url: url of the syndicated copy for which we are trying
        to find an original
      fetch_hfeed: boolean, whether or not to fetch and parse the author's
        feed if we don't have a previously stored relationship.

    Return:
      the activity, updated with original post urls if any are found
    """
    logging.info(
        'starting posse post discovery with author %s and syndicated %s',
        author_url, syndication_url)

    rel = SyndicatedPost.query_by_syndication(source, syndication_url)
    if not rel and fetch_hfeed:
        # a syndicated post we haven't seen before! fetch the author's
        # h-feed to see if we can find it.
        rel = _process_author(source, author_url).get(syndication_url)

    if not rel:
        # No relationship was found. Remember that we've seen this
        # syndicated post to avoid reprocessing it every time.
        logging.debug('posse post discovery found no relationship for %s',
                      syndication_url)
        SyndicatedPost.get_or_insert_by_syndication_url(
            source, syndication_url, None)
    else:
        logging.debug('posse post discovery found relationship %s -> %s',
                      syndication_url, rel.original)
        if rel.original:
            # surface the original as an upstream duplicate on the
            # activity's object (falling back to the activity itself).
            target = activity.get('object') or activity
            target.setdefault('upstreamDuplicates', []).append(rel.original)

    return activity
def test_additional_requests_do_not_require_rework(self):
    """Test that original post discovery fetches and stores all entries up
    front so that it does not have to reparse the author's h-feed for
    every new post. Test that original post discovery does the reverse
    lookup to scan author's h-feed for rel=syndication links
    """
    # Strip backlinks so discovery must rely on the author's h-feed, and
    # give each activity a distinct syndicated-post URL.
    for idx, activity in enumerate(self.activities):
        activity['object']['content'] = 'post content without backlinks'
        activity['object']['url'] = 'http://fa.ke/post/url%d' % (idx + 1)

    # Author h-feed with three permalinks; fetched once up front and once
    # more for the third (never-matched) activity.
    author_feed = """
    <html class="h-feed">
      <div class="h-entry">
        <a class="u-url" href="http://author/post/permalink1"></a>
      </div>
      <div class="h-entry">
        <a class="u-url" href="http://author/post/permalink2"></a>
      </div>
      <div class="h-entry">
        <a class="u-url" href="http://author/post/permalink3"></a>
      </div>
    </html>"""

    source = self.sources[0]
    source.domain_urls = ['http://author']

    self.expect_requests_get('http://author', author_feed)

    # first post is syndicated
    self.expect_requests_get(
        'http://author/post/permalink1',
        """
    <div class="h-entry">
      <a class="u-url" href="http://author/post/permalink1"></a>
      <a class="u-syndication" href="http://fa.ke/post/url1"></a>
    </div>""").InAnyOrder()

    # second post is syndicated
    self.expect_requests_get(
        'http://author/post/permalink2',
        """
    <div class="h-entry">
      <a class="u-url" href="http://author/post/permalink2"></a>
      <a class="u-syndication" href="http://fa.ke/post/url2"></a>
    </div>""").InAnyOrder()

    # third post is not syndicated
    self.expect_requests_get(
        'http://author/post/permalink3',
        """
    <div class="h-entry">
      <a class="u-url" href="http://author/post/permalink3"></a>
    </div>""").InAnyOrder()

    # the second activity lookup should not make any HTTP requests
    # the third activity lookup will fetch the author's h-feed one more time
    self.expect_requests_get('http://author', author_feed).InAnyOrder()

    self.mox.ReplayAll()

    # first activity should trigger all the lookups and storage
    original_post_discovery.discover(source, self.activities[0])

    self.assertEquals(['http://author/post/permalink1'],
                      self.activities[0]['object']['upstreamDuplicates'])

    # make sure things are where we want them
    # NOTE(review): stored URLs use http://fa.ke/... but queries below use
    # https://fa.ke/... — presumably the fake silo canonicalizes the scheme
    # when storing; confirm against the test fixture's source class.
    r = SyndicatedPost.query_by_original(source,
                                         'http://author/post/permalink1')
    self.assertEquals('https://fa.ke/post/url1', r.syndication)
    r = SyndicatedPost.query_by_syndication(source,
                                            'https://fa.ke/post/url1')
    self.assertEquals('http://author/post/permalink1', r.original)
    r = SyndicatedPost.query_by_original(source,
                                         'http://author/post/permalink2')
    self.assertEquals('https://fa.ke/post/url2', r.syndication)
    r = SyndicatedPost.query_by_syndication(source,
                                            'https://fa.ke/post/url2')
    self.assertEquals('http://author/post/permalink2', r.original)
    # third permalink had no syndication link, so a blank was stored
    r = SyndicatedPost.query_by_original(source,
                                         'http://author/post/permalink3')
    self.assertEquals(None, r.syndication)

    # second lookup should require no additional HTTP requests.
    # the second syndicated post should be linked up to the second permalink.
    original_post_discovery.discover(source, self.activities[1])
    self.assertEquals(['http://author/post/permalink2'],
                      self.activities[1]['object']['upstreamDuplicates'])

    # third activity lookup.
    # since we didn't find a back-link for the third syndicated post,
    # it should fetch the author's feed again, but seeing no new
    # posts, it should not follow any of the permalinks
    original_post_discovery.discover(source, self.activities[2])

    # should have found no new syndication link
    self.assertNotIn('upstreamDuplicates', self.activities[2]['object'])

    # should have saved a blank to prevent subsequent checks of this
    # syndicated post from fetching the h-feed again
    r = SyndicatedPost.query_by_syndication(source,
                                            'https://fa.ke/post/url3')
    self.assertEquals(None, r.original)

    # confirm that we do not fetch the h-feed again for the same
    # syndicated post
    original_post_discovery.discover(source, self.activities[2])
    # should be no new syndication link
    self.assertNotIn('upstreamDuplicates', self.activities[2]['object'])