def _posse_post_discovery(source, activity, syndication_url, fetch_hfeed,
                          already_fetched_hfeeds):
    """Finds the original post URLs for a single syndicated copy.

    Stored :class:`SyndicatedPost` relationships are consulted first. Only
    when none exist and ``fetch_hfeed`` is set are the author's URLs
    processed to look for new relationships.

    Args:
      source: :class:`models.Source` subclass
      activity: activity dict (not read by this function)
      syndication_url: url of the syndicated copy for which we are trying to
        find an original
      fetch_hfeed: boolean, whether or not to fetch and parse the author's
        feed if we don't have a previously stored relationship
      already_fetched_hfeeds: set, URLs we've already fetched in a previous
        iteration; every URL processed here is added to it

    Returns:
      sequence of string original post urls, possibly empty
    """
    logging.info('starting posse post discovery with syndicated %s',
                 syndication_url)

    stored_query = SyndicatedPost.query(
        SyndicatedPost.syndication == syndication_url, ancestor=source.key)
    relationships = stored_query.fetch()

    if fetch_hfeed and not relationships:
        # Never seen this syndicated post before: walk the author's URLs and
        # see whether any of them leads to it.
        #
        # TODO: Consider using the actor's url, with get_author_urls() as the
        # fallback in the future to support content from non-Bridgy users.
        discovered = {}
        for url in _get_author_urls(source):
            if url in already_fetched_hfeeds:
                logging.debug('skipping %s, already fetched this round', url)
                continue
            discovered.update(_process_author(source, url))
            already_fetched_hfeeds.add(url)
        relationships = discovered.get(syndication_url, [])

    if not relationships:
        # Nothing was found. Record a blank (only if we actually tried to
        # fetch) so this syndicated post isn't reprocessed every time.
        logging.debug('posse post discovery found no relationship for %s',
                      syndication_url)
        if fetch_hfeed:
            SyndicatedPost.insert_syndication_blank(source, syndication_url)

    # Blank relationships carry no original, so they are filtered out here.
    originals = [rel.original for rel in relationships if rel.original]
    if originals:
        logging.debug('posse post discovery found relationship(s) %s -> %s',
                      syndication_url, originals)
    return originals
def _posse_post_discovery(source, activity, syndication_url, fetch_hfeed,
                          already_fetched_hfeeds):
    """Resolves one syndicated URL to the original post URLs it copies.

    Args:
      source: :class:`models.Source` subclass
      activity: activity dict (unused in this function body)
      syndication_url: url of the syndicated copy for which we are trying to
        find an original
      fetch_hfeed: boolean, whether or not to fetch and parse the author's
        feed if we don't have a previously stored relationship
      already_fetched_hfeeds: set, URLs we've already fetched in a previous
        iteration. It is mutated: each author URL processed by this call is
        added.

    Returns:
      sequence of string original post urls, possibly empty
    """
    logging.info('starting posse post discovery with syndicated %s',
                 syndication_url)

    relationships = SyndicatedPost.query(
        SyndicatedPost.syndication == syndication_url,
        ancestor=source.key,
    ).fetch()

    needs_fetch = not relationships and fetch_hfeed
    if needs_fetch:
        # A syndicated post with no stored relationship: process each of the
        # author's URLs that hasn't been handled yet this round.
        #
        # TODO: Consider using the actor's url, with get_author_urls() as the
        # fallback in the future to support content from non-Bridgy users.
        by_syndication = {}
        for author_url in _get_author_urls(source):
            if author_url not in already_fetched_hfeeds:
                found = _process_author(source, author_url)
                by_syndication.update(found)
                already_fetched_hfeeds.add(author_url)
            else:
                logging.debug('skipping %s, already fetched this round',
                              author_url)
        relationships = by_syndication.get(syndication_url, [])

    nothing_found = not relationships
    if nothing_found:
        # Remember that this syndicated post has been seen, so it isn't
        # reprocessed every time. Skipped when no fetch was attempted.
        logging.debug('posse post discovery found no relationship for %s',
                      syndication_url)
        if fetch_hfeed:
            SyndicatedPost.insert_syndication_blank(source, syndication_url)

    # Collect only relationships that actually point at an original.
    originals = []
    for relationship in relationships:
        if relationship.original:
            originals.append(relationship.original)

    if originals:
        logging.debug('posse post discovery found relationship(s) %s -> %s',
                      syndication_url, originals)
    return originals
def test_get_or_insert_by_syndication_do_not_duplicate_blanks(self):
    """Inserting a syndication blank must not produce duplicate blank rows.

    NOTE(review): presumably the fixture already stores a blank for this
    URL, which is what makes the insert below a duplicate attempt -- confirm
    against setUp.
    """
    syndication = 'http://silo/no-original'
    SyndicatedPost.insert_syndication_blank(self.source, syndication)

    # Exactly one entry should be stored for the URL, and it has no original.
    stored = SyndicatedPost.query(
        SyndicatedPost.syndication == syndication,
        ancestor=self.source.key,
    ).fetch()
    originals = [rel.original for rel in stored]
    self.assertCountEqual([None], originals)
def test_get_or_insert_by_syndication_do_not_duplicate_blanks(self):
    """Make sure we don't insert duplicate blank entries.

    NOTE(review): presumably the fixture already stores a blank for this
    URL, so the insert below is the duplicate attempt -- confirm against
    setUp.
    """
    SyndicatedPost.insert_syndication_blank(
        self.source, 'http://silo/no-original')

    # make sure there's only one in the DB
    rs = SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/no-original',
        ancestor=self.source.key
    ).fetch()

    # assertItemsEqual only exists in Python 2's unittest; assertCountEqual
    # is its Python 3 name and performs the same order-insensitive check.
    self.assertCountEqual([None], [rel.original for rel in rs])
def _posse_post_discovery(source, activity, syndication_url, fetch_hfeed):
    """Performs the actual meat of the posse-post-discover.

    Looks up stored :class:`SyndicatedPost` relationships for the
    syndication URL. If there are none and ``fetch_hfeed`` is set, processes
    each of the source's author URLs to discover new ones. Any originals
    found are appended to the ``upstreamDuplicates`` list of the activity's
    object (or of the activity itself when it has no ``object``).

    Args:
      source: models.Source subclass
      activity: activity dict; modified in place
      syndication_url: url of the syndicated copy for which we are
        trying to find an original
      fetch_hfeed: boolean, whether or not to fetch and parse the
        author's feed if we don't have a previously stored
        relationship.

    Returns:
      the activity, updated with original post urls if any are found
    """
    logging.info('starting posse post discovery with syndicated %s',
                 syndication_url)

    relationships = SyndicatedPost.query(
        SyndicatedPost.syndication == syndication_url,
        ancestor=source.key).fetch()

    if not relationships and fetch_hfeed:
        # a syndicated post we haven't seen before! fetch the author's URLs
        # to see if we can find it.
        #
        # Use source.get_author_urls() for now; it seems more reliable than
        # the activity.actor.url (which depends on getting the right data
        # back from various APIs). Consider using the actor's url, with the
        # source's author urls as the fallback in the future to support
        # content from non-Bridgy users.
        results = {}
        for url in source.get_author_urls():
            results.update(_process_author(source, url))
        # May be None when no author URL led to this syndicated copy.
        relationships = results.get(syndication_url)

    if not relationships:
        # No relationships were found. Remember that we've seen this
        # syndicated post to avoid reprocessing it every time
        logging.debug('posse post discovery found no relationship for %s',
                      syndication_url)
        if fetch_hfeed:
            SyndicatedPost.insert_syndication_blank(source, syndication_url)
        return activity

    # '%s' formatting instead of the Python-2-only unicode() builtin: it
    # renders the same text and does not raise NameError on Python 3.
    logging.debug('posse post discovery found relationship(s) %s -> %s',
                  syndication_url,
                  '; '.join('%s' % r.original for r in relationships))

    # Blank relationships (no original) are skipped when extending the list.
    obj = activity.get('object') or activity
    obj.setdefault('upstreamDuplicates', []).extend(
        r.original for r in relationships if r.original)

    return activity
def _posse_post_discovery(source, activity, author_url, syndication_url,
                          fetch_hfeed):
    """Performs the actual meat of the posse-post-discover.

    It was split out from discover() so that it can be done inside of a
    transaction.

    Args:
      source: models.Source subclass
      activity: activity dict; modified in place
      author_url: author's url configured in their silo profile
      syndication_url: url of the syndicated copy for which we are
        trying to find an original
      fetch_hfeed: boolean, whether or not to fetch and parse the
        author's feed if we don't have a previously stored
        relationship.

    Returns:
      the activity, updated with original post urls if any are found
    """
    logging.info(
        'starting posse post discovery with author %s and syndicated %s',
        author_url, syndication_url)

    relationships = SyndicatedPost.query(
        SyndicatedPost.syndication == syndication_url,
        ancestor=source.key).fetch()

    if not relationships and fetch_hfeed:
        # a syndicated post we haven't seen before! fetch the author's
        # h-feed to see if we can find it.
        results = _process_author(source, author_url)
        # May be None when the h-feed did not mention this syndicated copy.
        relationships = results.get(syndication_url)

    if not relationships:
        # No relationships were found. Remember that we've seen this
        # syndicated post to avoid reprocessing it every time
        logging.debug('posse post discovery found no relationship for %s',
                      syndication_url)
        # Only store the blank when the h-feed was actually fetched.
        # Otherwise the blank would count as a stored relationship on the
        # next call and the fetch branch above would never run for this URL.
        if fetch_hfeed:
            SyndicatedPost.insert_syndication_blank(source, syndication_url)
        return activity

    logging.debug('posse post discovery found relationship(s) %s -> %s',
                  syndication_url,
                  '; '.join(str(r.original) for r in relationships))

    # Blank relationships (no original) are skipped when extending the list.
    obj = activity.get('object') or activity
    obj.setdefault('upstreamDuplicates', []).extend(
        r.original for r in relationships if r.original)

    return activity