def _process_syndication_urls(source, permalink, syndication_urls):
  """Process a list of syndication URLs looking for one that matches the
  current source. If one is found, stores a new SyndicatedPost in the db.

  Args:
    source: a models.Source subclass
    permalink: a string. the current h-entry permalink
    syndication_urls: a collection of strings. the unfiltered list of
      syndication urls

  Returns:
    dict mapping string syndication url to list of SyndicatedPosts
  """
  results = {}
  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  for syndication_url in syndication_urls:
    # follow redirects to give us the canonical syndication url --
    # gives the best chance of finding a match.
    syndication_url = util.follow_redirects(syndication_url).url
    # source-specific logic to standardize the URL. (e.g., replace facebook
    # username with numeric id)
    syndication_url = source.canonicalize_syndication_url(syndication_url)
    # check that the syndicated url belongs to this source.
    # TODO: save future lookups by saving results for other sources too
    # (note: query the appropriate source subclass by author.domains, rather
    # than author.domain_urls)
    if util.domain_from_link(syndication_url) == source.AS_CLASS.DOMAIN:
      logging.debug('saving discovered relationship %s -> %s',
                    syndication_url, permalink)
      relationship = SyndicatedPost.insert(
          source, syndication=syndication_url, original=permalink)
      results.setdefault(syndication_url, []).append(relationship)

  return results
def test_insert_no_duplicates(self):
  """Make sure we don't insert duplicate entries"""
  silo = 'http://silo/post/url'
  orig = 'http://original/post/url'

  result = SyndicatedPost.insert(self.source, silo, orig)
  self.assertIsNotNone(result)
  self.assertEqual(orig, result.original)

  # the relationship already existed, so there should still be exactly
  # one matching row in the DB
  matches = SyndicatedPost.query(
      SyndicatedPost.syndication == silo,
      SyndicatedPost.original == orig,
      ancestor=self.source.key).fetch()
  self.assertEqual(1, len(matches))
def test_insert_no_duplicates(self):
  """Make sure we don't insert duplicate entries"""
  inserted = SyndicatedPost.insert(
      self.source, 'http://silo/post/url', 'http://original/post/url')
  self.assertIsNotNone(inserted)
  self.assertEqual('http://original/post/url', inserted.original)

  # make sure there's only one in the DB
  count = len(SyndicatedPost.query(
      SyndicatedPost.syndication == 'http://silo/post/url',
      SyndicatedPost.original == 'http://original/post/url',
      ancestor=self.source.key,
  ).fetch())
  self.assertEqual(1, count)
def test_retry(self):
  """POST /retry resets a complete response to 'new', refreshes its target
  URLs and cached webmention endpoint, and re-enqueues a propagate task,
  without refetching the author's h-feed."""
  self.assertEqual([], self.taskqueue_stub.GetTasks('propagate'))

  source = self.sources[0]
  source.domain_urls = ['http://orig']
  source.last_hfeed_refetch = last_hfeed_refetch = \
    testutil.NOW - datetime.timedelta(minutes=1)
  source.put()

  resp = self.responses[0]
  resp.status = 'complete'
  resp.unsent = ['http://unsent']
  resp.sent = ['http://sent']
  resp.error = ['http://error']
  resp.failed = ['http://failed']
  resp.skipped = ['https://skipped']
  # SyndicatedPost with new target URLs
  resp.activities_json = [
    json.dumps({'object': {'url': 'https://fa.ke/1'}}),
    json.dumps({'url': 'https://fa.ke/2', 'object': {'unused': 'ok'}}),
    json.dumps({'url': 'https://fa.ke/3'}),
  ]
  resp.put()

  SyndicatedPost.insert(source, 'https://fa.ke/1', 'https://orig/1')
  SyndicatedPost.insert(source, 'https://fa.ke/2', 'http://orig/2')
  SyndicatedPost.insert(source, 'https://fa.ke/3', 'http://orig/3')

  # cached webmention endpoint
  memcache.set('W https skipped /', 'asdf')

  key = resp.key.urlsafe()
  response = app.application.get_response(
      '/retry', method='POST',
      body=native_str(urllib.parse.urlencode({'key': key})))
  # assertEquals is a deprecated alias; use assertEqual consistently
  self.assertEqual(302, response.status_int)
  self.assertEqual(source.bridgy_url(self.handler),
                   response.headers['Location'].split('#')[0])
  params = testutil.get_task_params(self.taskqueue_stub.GetTasks('propagate')[0])
  self.assertEqual(key, params['response_key'])

  # status and URLs should be refreshed
  got = resp.key.get()
  self.assertEqual('new', got.status)
  # NOTE(review): assertItemsEqual is the py2 name (assertCountEqual on py3) --
  # assumes the test base class shims it during the 2-to-3 migration; confirm
  self.assertItemsEqual(
      ['http://unsent/', 'http://sent/', 'https://skipped/', 'http://error/',
       'http://failed/', 'https://orig/1', 'http://orig/2', 'http://orig/3'],
      got.unsent)
  for field in got.sent, got.skipped, got.error, got.failed:
    self.assertEqual([], field)

  # webmention endpoints for URL domains should be refreshed
  self.assertIsNone(memcache.get('W https skipped /'))

  # shouldn't have refetched h-feed
  self.assertEqual(last_hfeed_refetch, source.key.get().last_hfeed_refetch)
def test_retry(self):
  """POST /retry resets a complete response to 'new', refreshes its target
  URLs and cached webmention endpoint, and re-enqueues a propagate task,
  without refetching the author's h-feed."""
  self.assertEqual([], self.taskqueue_stub.GetTasks('propagate'))

  source = self.sources[0]
  source.domain_urls = ['http://orig']
  source.last_hfeed_refetch = last_hfeed_refetch = \
    testutil.NOW - datetime.timedelta(minutes=1)
  source.put()

  resp = self.responses[0]
  resp.status = 'complete'
  resp.unsent = ['http://unsent']
  resp.sent = ['http://sent']
  resp.error = ['http://error']
  resp.failed = ['http://failed']
  resp.skipped = ['https://skipped']
  # SyndicatedPost with new target URLs
  resp.activities_json = [
    json.dumps({'object': {'url': 'https://fa.ke/1'}}),
    json.dumps({'url': 'https://fa.ke/2', 'object': {'unused': 'ok'}}),
    json.dumps({'url': 'https://fa.ke/3'}),
  ]
  resp.put()

  SyndicatedPost.insert(source, 'https://fa.ke/1', 'https://orig/1')
  SyndicatedPost.insert(source, 'https://fa.ke/2', 'http://orig/2')
  SyndicatedPost.insert(source, 'https://fa.ke/3', 'http://orig/3')

  # cached webmention endpoint
  memcache.set('W https skipped /', 'asdf')

  key = resp.key.urlsafe()
  response = app.application.get_response(
      '/retry', method='POST', body=urllib.urlencode({'key': key}))
  # assertEquals is a deprecated alias; use assertEqual consistently
  self.assertEqual(302, response.status_int)
  self.assertEqual(source.bridgy_url(self.handler),
                   response.headers['Location'].split('#')[0])
  params = testutil.get_task_params(self.taskqueue_stub.GetTasks('propagate')[0])
  self.assertEqual(key, params['response_key'])

  # status and URLs should be refreshed
  got = resp.key.get()
  self.assertEqual('new', got.status)
  self.assertItemsEqual(
      ['http://unsent/', 'http://sent/', 'https://skipped/', 'http://error/',
       'http://failed/', 'https://orig/1', 'http://orig/2', 'http://orig/3'],
      got.unsent)
  for field in got.sent, got.skipped, got.error, got.failed:
    self.assertEqual([], field)

  # webmention endpoints for URL domains should be refreshed
  self.assertIsNone(memcache.get('W https skipped /'))

  # shouldn't have refetched h-feed
  self.assertEqual(last_hfeed_refetch, source.key.get().last_hfeed_refetch)
def test_insert_replaces_blanks(self):
  """Make sure we replace original=None with original=something
  when it is discovered"""
  silo = 'http://silo/no-original'
  discovered = 'http://original/newly-discovered'

  # add a blank for the original too
  SyndicatedPost.insert_original_blank(self.source, discovered)

  # NDB queries must use == None (not `is None`) to match unset properties
  blank_original = SyndicatedPost.query(
      SyndicatedPost.syndication == silo,
      SyndicatedPost.original == None,
      ancestor=self.source.key)
  blank_syndication = SyndicatedPost.query(
      SyndicatedPost.original == discovered,
      SyndicatedPost.syndication == None,
      ancestor=self.source.key)
  self.assertTrue(blank_original.get())
  self.assertTrue(blank_syndication.get())

  result = SyndicatedPost.insert(self.source, silo, discovered)
  self.assertIsNotNone(result)
  self.assertEqual(discovered, result.original)

  # make sure it's in NDB
  rows = SyndicatedPost.query(
      SyndicatedPost.syndication == silo,
      ancestor=self.source.key).fetch()
  self.assertEqual(1, len(rows))
  self.assertEqual(discovered, rows[0].original)
  self.assertEqual(silo, rows[0].syndication)

  # and the blanks have been removed
  self.assertFalse(blank_original.get())
  self.assertFalse(blank_syndication.get())
def test_insert_auguments_existing(self):
  """Make sure we add newly discovered urls for a given syndication url,
  rather than overwrite them
  """
  # (method name typo "auguments" kept: renaming would change test identity)
  syndication = 'http://silo/post/url'
  result = SyndicatedPost.insert(
      self.source, syndication, 'http://original/different/url')
  self.assertIsNotNone(result)
  self.assertEqual('http://original/different/url', result.original)

  # make sure they're both in the DB
  originals = [row.original for row in SyndicatedPost.query(
      SyndicatedPost.syndication == syndication,
      ancestor=self.source.key).fetch()]
  self.assertCountEqual(
      ['http://original/post/url', 'http://original/another/post',
       'http://original/different/url'],
      originals)
def _process_syndication_urls(source, permalink, syndication_urls,
                              preexisting):
  r"""Process a list of syndication URLs looking for one that matches the
  current source. If one is found, stores a new :class:`models.SyndicatedPost`
  in the db.

  Args:
    source: a :class:`models.Source` subclass
    permalink: a string. the current h-entry permalink
    syndication_urls: a collection of strings. the unfiltered list of
      syndication urls
    preexisting: a list of previously discovered
      :class:`models.SyndicatedPost`\ s

  Returns:
    dict mapping string syndication url to list of
    :class:`models.SyndicatedPost`\ s
  """
  # raw docstring: the sphinx `\ s` escape is not a valid Python escape
  # sequence and warns on modern interpreters otherwise.
  results = {}
  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  for url in syndication_urls:
    # source-specific logic to standardize the URL. (e.g., replace facebook
    # username with numeric id)
    url = source.canonicalize_url(url)
    if not url:
      continue

    # TODO: save future lookups by saving results for other sources too (note:
    # query the appropriate source subclass by author.domains, rather than
    # author.domain_urls)
    #
    # we may have already seen this relationship, save a DB lookup by
    # finding it in the preexisting list
    relationship = next(
      (sp for sp in preexisting
       if sp.syndication == url and sp.original == permalink), None)
    if not relationship:
      logging.debug('saving discovered relationship %s -> %s', url, permalink)
      relationship = SyndicatedPost.insert(source, syndication=url,
                                           original=permalink)

    results.setdefault(url, []).append(relationship)

  return results
def test_insert_replaces_blanks(self):
  """Make sure we replace original=None with original=something
  when it is discovered"""
  # add a blank for the original too
  SyndicatedPost.insert_original_blank(
      self.source, 'http://original/newly-discovered')
  # NDB queries must use == None (not `is None`) to match unset properties
  self.assertTrue(
      SyndicatedPost.query(
          SyndicatedPost.syndication == 'http://silo/no-original',
          SyndicatedPost.original == None,
          ancestor=self.source.key).get())
  self.assertTrue(
      SyndicatedPost.query(
          SyndicatedPost.original == 'http://original/newly-discovered',
          SyndicatedPost.syndication == None,
          ancestor=self.source.key).get())

  r = SyndicatedPost.insert(
      self.source, 'http://silo/no-original',
      'http://original/newly-discovered')
  self.assertIsNotNone(r)
  # assertEquals is a deprecated alias; use assertEqual consistently
  self.assertEqual('http://original/newly-discovered', r.original)

  # make sure it's in NDB
  rs = SyndicatedPost.query(
      SyndicatedPost.syndication == 'http://silo/no-original',
      ancestor=self.source.key
  ).fetch()
  self.assertEqual(1, len(rs))
  self.assertEqual('http://original/newly-discovered', rs[0].original)
  self.assertEqual('http://silo/no-original', rs[0].syndication)

  # and the blanks have been removed
  self.assertFalse(
      SyndicatedPost.query(
          SyndicatedPost.syndication == 'http://silo/no-original',
          SyndicatedPost.original == None,
          ancestor=self.source.key).get())
  self.assertFalse(
      SyndicatedPost.query(
          SyndicatedPost.original == 'http://original/newly-discovered',
          SyndicatedPost.syndication == None,
          ancestor=self.source.key).get())
def _process_syndication_urls(source, permalink, syndication_urls,
                              preexisting):
  r"""Process a list of syndication URLs looking for one that matches the
  current source. If one is found, stores a new :class:`models.SyndicatedPost`
  in the db.

  Args:
    source: a :class:`models.Source` subclass
    permalink: a string. the current h-entry permalink
    syndication_urls: a collection of strings. the unfiltered list of
      syndication urls
    preexisting: a list of previously discovered
      :class:`models.SyndicatedPost`\ s

  Returns:
    dict mapping string syndication url to list of
    :class:`models.SyndicatedPost`\ s
  """
  # raw docstring: the sphinx `\ s` escape is not a valid Python escape
  # sequence and warns on modern interpreters otherwise.
  results = {}
  # save the results (or lack thereof) to the db, and put them in a
  # map for immediate use
  for url in syndication_urls:
    # source-specific logic to standardize the URL. (e.g., replace facebook
    # username with numeric id)
    url = source.canonicalize_url(url)
    if not url:
      continue

    # TODO: save future lookups by saving results for other sources too (note:
    # query the appropriate source subclass by author.domains, rather than
    # author.domain_urls)
    #
    # we may have already seen this relationship, save a DB lookup by
    # finding it in the preexisting list
    relationship = next((sp for sp in preexisting
                         if sp.syndication == url
                         and sp.original == permalink), None)
    if not relationship:
      logging.debug('saving discovered relationship %s -> %s', url, permalink)
      relationship = SyndicatedPost.insert(
          source, syndication=url, original=permalink)

    results.setdefault(url, []).append(relationship)

  return results
def test_insert_auguments_existing(self):
  """Make sure we add newly discovered urls for a given syndication url,
  rather than overwrite them
  """
  r = SyndicatedPost.insert(
      self.source, 'http://silo/post/url', 'http://original/different/url')
  self.assertIsNotNone(r)
  # assertEquals is a deprecated alias; use assertEqual consistently
  self.assertEqual('http://original/different/url', r.original)

  # make sure they're both in the DB
  rs = SyndicatedPost.query(
      SyndicatedPost.syndication == 'http://silo/post/url',
      ancestor=self.source.key
  ).fetch()
  self.assertItemsEqual(['http://original/post/url',
                         'http://original/another/post',
                         'http://original/different/url'],
                        [rel.original for rel in rs])
def test_retry(self):
  """Retrying a complete response resets it to 'new', refreshes its target
  URLs and webmention endpoint cache, and re-enqueues a propagate task,
  without refetching the author's h-feed."""
  source = self.sources[0]
  source.domain_urls = ['http://orig']
  source.last_hfeed_refetch = last_hfeed_refetch = \
    testutil.NOW - timedelta(minutes=1)
  source.put()

  resp = self.responses[0]
  resp.status = 'complete'
  resp.unsent = ['http://unsent']
  resp.sent = ['http://sent']
  resp.error = ['http://error']
  resp.failed = ['http://failed']
  resp.skipped = ['https://skipped']
  # SyndicatedPost with new target URLs
  resp.activities_json = [
    json_dumps({'object': {'url': 'https://fa.ke/1'}}),
    json_dumps({'url': 'https://fa.ke/2', 'object': {'unused': 'ok'}}),
    json_dumps({'url': 'https://fa.ke/3'}),
  ]
  resp.put()

  for silo_url, original_url in (('https://fa.ke/1', 'https://orig/1'),
                                 ('https://fa.ke/2', 'http://orig/2'),
                                 ('https://fa.ke/3', 'http://orig/3')):
    SyndicatedPost.insert(source, silo_url, original_url)

  key = resp.key.urlsafe().decode()
  self.expect_task('propagate', response_key=key)
  self.mox.ReplayAll()

  # cached webmention endpoint
  util.webmention_endpoint_cache['W https skipped /'] = 'asdf'

  retry_response = self.client.post('/retry', data={'key': key})
  self.assertEqual(302, retry_response.status_code)
  self.assertEqual(self.source_bridgy_url, retry_response.headers['Location'])

  # status and URLs should be refreshed
  refreshed = resp.key.get()
  self.assertEqual('new', refreshed.status)
  self.assertCountEqual(
    ['http://unsent/', 'http://sent/', 'https://skipped/', 'http://error/',
     'http://failed/', 'https://orig/1', 'http://orig/2', 'http://orig/3'],
    refreshed.unsent)
  for field in (refreshed.sent, refreshed.skipped, refreshed.error,
                refreshed.failed):
    self.assertEqual([], field)

  # webmention endpoints for URL domains should be refreshed
  self.assertNotIn('W https skipped /', util.webmention_endpoint_cache)

  # shouldn't have refetched h-feed
  self.assertEqual(last_hfeed_refetch, source.key.get().last_hfeed_refetch)