def _posse_post_discovery(source, activity, syndication_url, fetch_hfeed,
                          already_fetched_hfeeds):
    """Find the original post URLs for one syndicated copy.

    Stored :class:`SyndicatedPost` entries for ``syndication_url`` under
    ``source`` are queried first. If there are none and ``fetch_hfeed`` is
    set, every author URL of the source that is not yet in
    ``already_fetched_hfeeds`` is processed and then added to that set. If
    nothing turns up and ``fetch_hfeed`` is set, a blank entry is inserted
    so the syndicated post is remembered.

    Args:
      source: :class:`models.Source` subclass
      activity: activity dict (not read by this function)
      syndication_url: url of the syndicated copy for which we are
        trying to find an original
      fetch_hfeed: boolean, whether or not to fetch and parse the
        author's feed if we don't have a previously stored
        relationship
      already_fetched_hfeeds: set, URLs we've already fetched in a
        previous iteration; updated in place

    Returns:
      list of string original post urls, possibly empty
    """
    logging.info('starting posse post discovery with syndicated %s',
                 syndication_url)

    known = SyndicatedPost.query(
        SyndicatedPost.syndication == syndication_url,
        ancestor=source.key).fetch()

    if fetch_hfeed and not known:
        # First time we see this syndicated post: go through the author's
        # URLs and see whether one of them leads to it.
        #
        # TODO: Consider using the actor's url, with get_author_urls() as the
        # fallback in the future to support content from non-Bridgy users.
        discovered = {}
        for author_url in _get_author_urls(source):
            if author_url in already_fetched_hfeeds:
                logging.debug('skipping %s, already fetched this round',
                              author_url)
                continue
            discovered.update(_process_author(source, author_url))
            already_fetched_hfeeds.add(author_url)

        known = discovered.get(syndication_url, [])

    if not known:
        # Nothing stored and nothing discovered. Record a blank (only when
        # we were allowed to fetch) so this post isn't reprocessed each time.
        logging.debug('posse post discovery found no relationship for %s',
                      syndication_url)
        if fetch_hfeed:
            SyndicatedPost.insert_syndication_blank(source, syndication_url)

    # Blank entries have no original, so they are filtered out here.
    originals = [rel.original for rel in known if rel.original]
    if originals:
        logging.debug('posse post discovery found relationship(s) %s -> %s',
                      syndication_url, originals)
    return originals
def _posse_post_discovery(source, activity, syndication_url, fetch_hfeed,
                          already_fetched_hfeeds):
  """Look up, and if allowed discover, the originals of a syndicated URL.

  The datastore is asked for :class:`SyndicatedPost` entries matching
  ``syndication_url`` under ``source``. When that comes back empty and
  ``fetch_hfeed`` is true, the source's author URLs are processed one by
  one, skipping (and logging) any already in ``already_fetched_hfeeds``
  and adding each processed URL to that set. When there is still no
  relationship and ``fetch_hfeed`` is true, a blank entry is inserted.

  Args:
    source: :class:`models.Source` subclass
    activity: activity dict (unused in this function body)
    syndication_url: url of the syndicated copy for which we are
      trying to find an original
    fetch_hfeed: boolean, whether or not to fetch and parse the
      author's feed if we don't have a previously stored
      relationship
    already_fetched_hfeeds: set, URLs we've already fetched in a
      previous iteration; mutated in place

  Returns:
    list of string original post urls, possibly empty
  """
  logging.info('starting posse post discovery with syndicated %s',
               syndication_url)

  syndication_filter = SyndicatedPost.syndication == syndication_url
  matches = SyndicatedPost.query(syndication_filter,
                                 ancestor=source.key).fetch()

  should_fetch = not matches and fetch_hfeed
  if should_fetch:
    # a syndicated post we haven't seen before! walk the author's URLs to
    # see if we can find it.
    #
    # TODO: Consider using the actor's url, with get_author_urls() as the
    # fallback in the future to support content from non-Bridgy users.
    by_syndication = {}
    for feed_url in _get_author_urls(source):
      if feed_url in already_fetched_hfeeds:
        logging.debug('skipping %s, already fetched this round', feed_url)
        continue
      by_syndication.update(_process_author(source, feed_url))
      already_fetched_hfeeds.add(feed_url)

    matches = by_syndication.get(syndication_url, [])

  if not matches:
    # Still empty: remember that we've seen this syndicated post so it is
    # not reprocessed every time (only when fetching was permitted).
    logging.debug('posse post discovery found no relationship for %s',
                  syndication_url)
    if fetch_hfeed:
      SyndicatedPost.insert_syndication_blank(source, syndication_url)

  # Entries without an original (blanks) contribute nothing to the result.
  originals = [match.original for match in matches if match.original]
  if originals:
    logging.debug('posse post discovery found relationship(s) %s -> %s',
                  syndication_url, originals)
  return originals
Example #3
0
    def test_get_or_insert_by_syndication_do_not_duplicate_blanks(self):
        """Check that blank entries for a syndication URL aren't duplicated."""
        silo_url = 'http://silo/no-original'
        SyndicatedPost.insert_syndication_blank(self.source, silo_url)

        # the datastore should hold exactly one entry for this URL, and it
        # should be a blank (original is None)
        stored = SyndicatedPost.query(
            SyndicatedPost.syndication == silo_url,
            ancestor=self.source.key).fetch()
        found_originals = [entry.original for entry in stored]

        self.assertCountEqual([None], found_originals)
Example #4
0
  def test_get_or_insert_by_syndication_do_not_duplicate_blanks(self):
    """Make sure we don't insert duplicate blank entries.

    Inserts a blank for a syndication URL, then checks that the query for
    that URL under ``self.source`` yields exactly one entry whose
    ``original`` is None.
    """
    SyndicatedPost.insert_syndication_blank(
      self.source, 'http://silo/no-original')

    # make sure there's only one in the DB
    rs = SyndicatedPost.query(
        SyndicatedPost.syndication == 'http://silo/no-original',
        ancestor=self.source.key
    ).fetch()

    # assertItemsEqual only exists on Python 2's TestCase; Python 3 renamed
    # it to assertCountEqual. Use whichever this interpreter provides so the
    # test doesn't die with AttributeError on Python 3.
    assert_same_items = (getattr(self, 'assertCountEqual', None)
                         or self.assertItemsEqual)
    assert_same_items([None], [rel.original for rel in rs])
def _posse_post_discovery(source, activity, syndication_url, fetch_hfeed):
  """Performs the actual meat of the posse-post-discover.

  Stored SyndicatedPost entries for ``syndication_url`` under ``source`` are
  queried first. If there are none and ``fetch_hfeed`` is set, each URL from
  ``source.get_author_urls()`` is processed. Any originals found are appended
  to the ``upstreamDuplicates`` list of the activity's ``object`` (or of the
  activity itself when it has no ``object``).

  Args:
    source: models.Source subclass
    activity: activity dict; modified in place
    syndication_url: url of the syndicated copy for which we are
                     trying to find an original
    fetch_hfeed: boolean, whether or not to fetch and parse the
                 author's feed if we don't have a previously stored
                 relationship.

  Returns:
    the activity, updated with original post urls if any are found
  """
  logging.info('starting posse post discovery with syndicated %s', syndication_url)
  relationships = SyndicatedPost.query(
    SyndicatedPost.syndication == syndication_url,
    ancestor=source.key).fetch()
  if not relationships and fetch_hfeed:
    # a syndicated post we haven't seen before! fetch the author's URLs to see
    # if we can find it.
    #
    # Use source.get_author_urls() for now; it seems more reliable than the
    # activity.actor.url (which depends on getting the right data back from
    # various APIs). Consider using the actor's url, with the source's author
    # urls as the fallback in the future to support content from non-Bridgy
    # users.
    results = {}
    for url in source.get_author_urls():
      results.update(_process_author(source, url))
    # may be None when the syndicated url wasn't found; handled just below
    relationships = results.get(syndication_url)

  if not relationships:
    # No relationships were found. Remember that we've seen this
    # syndicated post to avoid reprocessing it every time
    logging.debug('posse post discovery found no relationship for %s',
                  syndication_url)
    if fetch_hfeed:
      SyndicatedPost.insert_syndication_blank(source, syndication_url)
    return activity

  # u'%s' formatting instead of the unicode() builtin: same text on Python 2,
  # and no NameError on Python 3 where unicode() no longer exists.
  logging.debug('posse post discovery found relationship(s) %s -> %s',
                syndication_url,
                '; '.join(u'%s' % (r.original,) for r in relationships))

  # Blank entries (original is None) are skipped when recording originals.
  obj = activity.get('object') or activity
  obj.setdefault('upstreamDuplicates', []).extend(
    r.original for r in relationships if r.original)

  return activity
def _posse_post_discovery(source, activity, author_url, syndication_url,
                          fetch_hfeed):
  """Performs the actual meat of the posse-post-discover. It was split
  out from discover() so that it can be done inside of a transaction.

  Stored SyndicatedPost entries for ``syndication_url`` under ``source`` are
  queried first. If there are none and ``fetch_hfeed`` is set, ``author_url``
  is processed. Any originals found are appended to the
  ``upstreamDuplicates`` list of the activity's ``object`` (or of the
  activity itself when it has no ``object``).

  Args:
    source: models.Source subclass
    activity: activity dict; modified in place
    author_url: author's url configured in their silo profile
    syndication_url: url of the syndicated copy for which we are
                     trying to find an original
    fetch_hfeed: boolean, whether or not to fetch and parse the
                 author's feed if we don't have a previously stored
                 relationship.

  Returns:
    the activity, updated with original post urls if any are found
  """
  logging.info(
      'starting posse post discovery with author %s and syndicated %s',
      author_url, syndication_url)

  relationships = SyndicatedPost.query(
    SyndicatedPost.syndication == syndication_url,
    ancestor=source.key).fetch()
  if not relationships and fetch_hfeed:
    # a syndicated post we haven't seen before! fetch the author's
    # h-feed to see if we can find it.
    results = _process_author(source, author_url)
    # may be None when the syndicated url wasn't found; handled just below
    relationships = results.get(syndication_url)

  if not relationships:
    # No relationships were found. Remember that we've seen this
    # syndicated post to avoid reprocessing it every time
    logging.debug('posse post discovery found no relationship for %s',
                  syndication_url)
    # Only record the blank when we were allowed to look. A stored blank
    # makes the query above non-empty on later calls, which would skip the
    # h-feed fetch and keep the original from ever being discovered.
    if fetch_hfeed:
      SyndicatedPost.insert_syndication_blank(source, syndication_url)
    return activity

  logging.debug('posse post discovery found relationship(s) %s -> %s',
                syndication_url,
                '; '.join(str(r.original) for r in relationships))

  # Blank entries (original is None) are skipped when recording originals.
  obj = activity.get('object') or activity
  obj.setdefault('upstreamDuplicates', []).extend(
    r.original for r in relationships if r.original)

  return activity