def test_query_by_original_url(self):
    """Exercise the SyndicatedPost.query_by_original query helper.

    Looks up two original URLs for self.source: one that is expected to
    carry a syndication URL and one that is expected to have none. Both
    lookups must return an entity. The entities themselves are presumably
    stored by the test fixture setup, which is not part of this method.
    """
    # an original post that has a known syndicated copy
    r = SyndicatedPost.query_by_original(
        self.source, 'http://original/post/url')
    self.assertIsNotNone(r)
    self.assertEqual('http://silo/post/url', r.syndication)

    # an original post that was recorded without a syndication URL
    r = SyndicatedPost.query_by_original(
        self.source, 'http://original/no-syndication')
    self.assertIsNotNone(r)
    self.assertIsNone(r.syndication)
def test_refetch_hfeed(self):
    """refetch should grab resources again, even if they were previously
    marked with a blank SyndicatedPost.

    Three SyndicatedPost entities are stored for the source: permalink1 and
    permalink3 with no syndication URL, and permalink2 already resolved.
    The mocked author feed lists all three permalinks, but HTTP requests
    are only expected for permalink1 and permalink3. Afterwards permalink1
    must carry the newly published syndication URL, while the other two
    relationships are unchanged.
    """
    source = self.sources[0]
    source.domain_urls = ['http://author']

    # refetch 1 and 3 to see if they've been updated, 2 has already
    # been resolved for this source
    existing = (
        ('http://author/permalink1', None),
        ('http://author/permalink2', 'https://fa.ke/post/url2'),
        ('http://author/permalink3', None),
    )
    for original, syndication in existing:
        SyndicatedPost(parent=source.key, original=original,
                       syndication=syndication).put()

    # NOTE: the markup literals below are split with implicit string
    # concatenation; each piece keeps its leading space so the served
    # bodies stay exactly as they were.
    self.expect_requests_get(
        'http://author',
        ' <html class="h-feed">'
        ' <a class="h-entry" href="/permalink1"></a>'
        ' <a class="h-entry" href="/permalink2"></a>'
        ' <a class="h-entry" href="/permalink3"></a>'
        ' </html>')

    # yay, permalink1 has an updated syndication url
    self.expect_requests_get(
        'http://author/permalink1',
        ' <html class="h-entry">'
        ' <a class="u-url" href="/permalink1"></a>'
        ' <a class="u-syndication" href="http://fa.ke/post/url1"></a>'
        ' </html>').InAnyOrder()

    # permalink3 hasn't changed since we first checked it
    self.expect_requests_get(
        'http://author/permalink3',
        ' <html class="h-entry">'
        ' <a class="u-url" href="/permalink3"></a>'
        ' </html>').InAnyOrder()

    self.mox.ReplayAll()
    original_post_discovery.refetch(source)

    # the page links the http:// silo URL; the stored relationship is
    # expected to hold the https:// form
    relationship1 = SyndicatedPost.query_by_original(
        source, 'http://author/permalink1')
    self.assertIsNotNone(relationship1)
    self.assertEqual('https://fa.ke/post/url1', relationship1.syndication)

    # this shouldn't have changed
    relationship2 = SyndicatedPost.query_by_original(
        source, 'http://author/permalink2')
    self.assertIsNotNone(relationship2)
    self.assertEqual('https://fa.ke/post/url2', relationship2.syndication)

    # still blank: the refetched page has no syndication link
    relationship3 = SyndicatedPost.query_by_original(
        source, 'http://author/permalink3')
    self.assertIsNotNone(relationship3)
    self.assertIsNone(relationship3.syndication)
def test_additional_requests_do_not_require_rework(self):
    """Test that original post discovery fetches and stores all entries up
    front so that it does not have to reparse the author's h-feed for every
    new post.

    Also tests that original post discovery does the reverse lookup to scan
    the author's h-feed for rel=syndication links.

    Expected HTTP traffic, as recorded with mox below:
      * first discover(): the h-feed plus all three permalinks
      * second discover(): nothing
      * third discover(): the h-feed once more, but no permalinks
      * fourth discover() (same activity as the third): nothing
    """
    # none of the silo posts link back to the author's site
    for number, activity in enumerate(self.activities, 1):
        obj = activity['object']
        obj['content'] = 'post content without backlinks'
        obj['url'] = 'http://fa.ke/post/url%d' % number

    # NOTE: the markup literals below are split with implicit string
    # concatenation; each piece keeps its leading space so the served
    # bodies stay exactly as they were.
    author_feed = (
        ' <html class="h-feed">'
        ' <div class="h-entry">'
        ' <a class="u-url" href="http://author/post/permalink1"></a>'
        ' </div>'
        ' <div class="h-entry">'
        ' <a class="u-url" href="http://author/post/permalink2"></a>'
        ' </div>'
        ' <div class="h-entry">'
        ' <a class="u-url" href="http://author/post/permalink3"></a>'
        ' </div>'
        ' </html>'
    )

    source = self.sources[0]
    source.domain_urls = ['http://author']

    self.expect_requests_get('http://author', author_feed)

    # first post is syndicated
    self.expect_requests_get(
        'http://author/post/permalink1',
        ' <div class="h-entry">'
        ' <a class="u-url" href="http://author/post/permalink1"></a>'
        ' <a class="u-syndication" href="http://fa.ke/post/url1"></a>'
        ' </div>').InAnyOrder()

    # second post is syndicated
    self.expect_requests_get(
        'http://author/post/permalink2',
        ' <div class="h-entry">'
        ' <a class="u-url" href="http://author/post/permalink2"></a>'
        ' <a class="u-syndication" href="http://fa.ke/post/url2"></a>'
        ' </div>').InAnyOrder()

    # third post is not syndicated
    self.expect_requests_get(
        'http://author/post/permalink3',
        ' <div class="h-entry">'
        ' <a class="u-url" href="http://author/post/permalink3"></a>'
        ' </div>').InAnyOrder()

    # the second activity lookup should not make any HTTP requests

    # the third activity lookup will fetch the author's h-feed one more time
    self.expect_requests_get('http://author', author_feed).InAnyOrder()

    self.mox.ReplayAll()

    # first activity should trigger all the lookups and storage
    original_post_discovery.discover(source, self.activities[0])

    self.assertEqual(['http://author/post/permalink1'],
                     self.activities[0]['object']['upstreamDuplicates'])

    # make sure things are where we want them
    r = SyndicatedPost.query_by_original(
        source, 'http://author/post/permalink1')
    self.assertEqual('https://fa.ke/post/url1', r.syndication)
    r = SyndicatedPost.query_by_syndication(
        source, 'https://fa.ke/post/url1')
    self.assertEqual('http://author/post/permalink1', r.original)

    r = SyndicatedPost.query_by_original(
        source, 'http://author/post/permalink2')
    self.assertEqual('https://fa.ke/post/url2', r.syndication)
    r = SyndicatedPost.query_by_syndication(
        source, 'https://fa.ke/post/url2')
    self.assertEqual('http://author/post/permalink2', r.original)

    r = SyndicatedPost.query_by_original(
        source, 'http://author/post/permalink3')
    self.assertIsNone(r.syndication)

    # second lookup should require no additional HTTP requests.
    # the second syndicated post should be linked up to the second permalink.
    original_post_discovery.discover(source, self.activities[1])
    self.assertEqual(['http://author/post/permalink2'],
                     self.activities[1]['object']['upstreamDuplicates'])

    # third activity lookup.
    # since we didn't find a back-link for the third syndicated post,
    # it should fetch the author's feed again, but seeing no new
    # posts, it should not follow any of the permalinks
    original_post_discovery.discover(source, self.activities[2])

    # should have found no new syndication link
    self.assertNotIn('upstreamDuplicates', self.activities[2]['object'])

    # should have saved a blank to prevent subsequent checks of this
    # syndicated post from fetching the h-feed again
    r = SyndicatedPost.query_by_syndication(
        source, 'https://fa.ke/post/url3')
    self.assertIsNone(r.original)

    # confirm that we do not fetch the h-feed again for the same
    # syndicated post
    original_post_discovery.discover(source, self.activities[2])

    # should be no new syndication link
    self.assertNotIn('upstreamDuplicates', self.activities[2]['object'])