Code example #1
File: models.py  Project: stedn/bridgy
  def resolve_profile_url(url, resolve=True):
    """Resolves a profile URL to be added to a source.

    Args:
      url: string
      resolve: boolean, whether to make HTTP requests to follow redirects, etc.

    Returns: string, resolved URL, or None
    """
    final, _, ok = util.get_webmention_target(url, resolve=resolve)
    if not ok:
      return None

    final = final.lower()
    if util.schemeless(final).startswith(util.schemeless(url.lower())):
      # redirected to a deeper path. use the original higher level URL. #652
      final = url

    # If final has a path segment, check whether the root page has a matching rel=me.
    match = re.match(r'^(https?://[^/]+)/.+', final)
    if match and resolve:
      root = match.group(1)
      try:
        mf2 = util.fetch_mf2(root)
        me_urls = mf2['rels'].get('me', [])
        if final in me_urls:
          final = root
      except requests.RequestException:
        logging.warning("Couldn't fetch %s, preserving path in %s",
                        root, final, stack_info=True)

    return final
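
A standalone sketch of the rel=me root check above, assuming the third-party mf2py library in place of bridgy's util.fetch_mf2; the function name and structure here are illustrative only:

import re

import mf2py  # assumed dependency for this sketch; bridgy wraps fetching in util.fetch_mf2

def collapse_to_root_if_rel_me(url):
  """Return the site root instead of url when the root page links back via rel=me."""
  match = re.match(r'^(https?://[^/]+)/.+', url)
  if not match:
    return url
  root = match.group(1)
  parsed = mf2py.parse(url=root)  # fetches and parses the root page
  if url in parsed.get('rels', {}).get('me', []):
    return root
  return url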
Code example #2
def process_entry(source,
                  permalink,
                  feed_entry,
                  refetch,
                  preexisting,
                  store_blanks=True):
    """Fetch and process an h-entry and save a new :class:`models.SyndicatedPost`.

  Args:
    source:
    permalink: url of the unprocessed post
    feed_entry: the h-feed version of the h-entry dict, often contains
      a partial version of the h-entry at the permalink
    refetch: boolean, whether to refetch and process entries we've seen before
    preexisting: list of previously discovered :class:`models.SyndicatedPost`\ s
      for this permalink
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Returns:
    a dict from syndicated url to a list of new :class:`models.SyndicatedPost`\ s
  """
    # if the post has already been processed, do not add to the results
    # since this method only returns *newly* discovered relationships.
    if preexisting:
        # if we're refetching and this one is blank, do not return.
        # if there is a blank entry, it should be the one and only entry,
        # but go ahead and check 'all' of them to be safe.
        if not refetch:
            return {}
        synds = [s.syndication for s in preexisting if s.syndication]
        if synds:
            logger.debug(
                f'previously found relationship(s) for original {permalink}: {synds}'
            )

    # first try with the h-entry from the h-feed. if we find the syndication url
    # we're looking for, we don't have to fetch the permalink
    permalink, _, type_ok = util.get_webmention_target(permalink)
    usynd = feed_entry.get('properties', {}).get('syndication', [])
    usynd_urls = {url for url in usynd if isinstance(url, str)}
    if usynd_urls:
        logger.debug(
            f'u-syndication links on the h-feed h-entry: {usynd_urls}')
    results = _process_syndication_urls(source, permalink, usynd_urls,
                                        preexisting)
    success = True

    if results:
        source.updates['last_feed_syndication_url'] = util.now_fn()
    elif not source.last_feed_syndication_url or not feed_entry:
        # fetch the full permalink page if we think it might have more details
        mf2 = None
        try:
            if type_ok:
                logger.debug(f'fetching post permalink {permalink}')
                mf2 = util.fetch_mf2(permalink)
        except AssertionError:
            raise  # for unit tests
        except BaseException:
            # TODO limit the number of allowed failures
            logger.info(f'Could not fetch permalink {permalink}',
                        exc_info=True)
            success = False

        if mf2:
            syndication_urls = set()
            relsynd = mf2['rels'].get('syndication', [])
            if relsynd:
                logger.debug(f'rel-syndication links: {relsynd}')
            syndication_urls.update(url for url in relsynd
                                    if isinstance(url, str))
            # there should only be one h-entry on a permalink page, but
            # we'll check all of them just in case.
            for hentry in (item for item in mf2['items']
                           if 'h-entry' in item['type']):
                usynd = hentry.get('properties', {}).get('syndication', [])
                if usynd:
                    logger.debug(f'u-syndication links: {usynd}')
                syndication_urls.update(url for url in usynd
                                        if isinstance(url, str))
            results = _process_syndication_urls(source, permalink,
                                                syndication_urls, preexisting)

    # detect and delete SyndicatedPosts that were removed from the site
    if success:
        result_syndposts = list(itertools.chain(*results.values()))
        for syndpost in preexisting:
            if syndpost.syndication and syndpost not in result_syndposts:
                logger.info(
                    f'deleting relationship that disappeared: {syndpost}')
                syndpost.key.delete()
                preexisting.remove(syndpost)

    if not results:
        logger.debug(
            f'no syndication links from {permalink} to current source {source.label()}.'
        )
        results = {}
        if store_blanks and not preexisting:
            # remember that this post doesn't have syndication links for this
            # particular source
            logger.debug(
                f'saving empty relationship so that {permalink} will not be searched again'
            )
            SyndicatedPost.insert_original_blank(source, permalink)

    # only return results that are not in the preexisting list
    new_results = {}
    for syndurl, syndposts_for_url in results.items():
        for syndpost in syndposts_for_url:
            if syndpost not in preexisting:
                new_results.setdefault(syndurl, []).append(syndpost)

    if new_results:
        logger.debug(f'discovered relationships {new_results}')
    return new_results
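
A minimal, self-contained sketch of the syndication-link gathering that the permalink branch above performs, assuming a dict already parsed into the usual mf2 shape ('rels' and 'items'); the helper name is hypothetical:

def collect_syndication_urls(mf2):
    """Gather rel=syndication links plus u-syndication from any h-entry items."""
    urls = set()
    urls.update(u for u in mf2.get('rels', {}).get('syndication', [])
                if isinstance(u, str))
    for item in mf2.get('items', []):
        if 'h-entry' in item.get('type', []):
            urls.update(u for u in item.get('properties', {}).get('syndication', [])
                        if isinstance(u, str))
    return urls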
Code example #3
def _process_author(source, author_url, refetch=False, store_blanks=True):
    """Fetch the author's domain URL, and look for syndicated posts.

  Args:
    source: a subclass of :class:`models.Source`
    author_url: the author's homepage URL
    refetch: boolean, whether to refetch and process entries we've seen before
    store_blanks: boolean, whether we should store blank
      :class:`models.SyndicatedPost`\ s when we don't find a relationship

  Return:
    a dict of syndicated_url to a list of new :class:`models.SyndicatedPost`\ s
  """
    # for now use whether the url is a valid webmention target
    # as a proxy for whether it's worth searching it.
    author_url, _, ok = util.get_webmention_target(author_url)
    if not ok:
        return {}

    logger.debug(f'fetching author url {author_url}')
    try:
        author_mf2 = util.fetch_mf2(author_url)
    except AssertionError:
        raise  # for unit tests
    except BaseException:
        # TODO limit allowed failures, cache the author's h-feed url
        # or the # of times we've failed to fetch it
        logger.info(f'Could not fetch author url {author_url}', exc_info=True)
        return {}

    feeditems = _find_feed_items(author_mf2)

    # try rel=feeds and rel=alternates
    feed_urls = set()
    candidates = (author_mf2['rels'].get('feed', []) + [
        a.get('url') for a in author_mf2.get('alternates', [])
        if a.get('type') == MF2_HTML_MIME_TYPE
    ])
    for feed_url in candidates:
        # check that it's html, not too big, etc
        feed_url, _, feed_ok = util.get_webmention_target(feed_url)
        if feed_url == author_url:
            logger.debug('author url is the feed url, ignoring')
        elif not feed_ok:
            logger.debug("skipping feed since it's not HTML or otherwise bad")
        else:
            feed_urls.add(feed_url)

    for feed_url in feed_urls:
        try:
            logger.debug(f"fetching author's rel-feed {feed_url}")
            feed_mf2 = util.fetch_mf2(feed_url)
            feeditems = _merge_hfeeds(feeditems, _find_feed_items(feed_mf2))
            domain = util.domain_from_link(feed_url)
            if source.updates is not None and domain not in source.domains:
                domains = source.updates.setdefault('domains', source.domains)
                if domain not in domains:
                    logger.info(
                        f'rel-feed found new domain {domain}! adding to source'
                    )
                    domains.append(domain)

        except AssertionError:
            raise  # reraise assertions for unit tests
        except BaseException:
            logger.info(f'Could not fetch h-feed url {feed_url}.',
                        exc_info=True)

    # sort by dt-updated/dt-published
    def updated_or_published(item):
        props = microformats2.first_props(item.get('properties'))
        return props.get('updated') or props.get('published') or ''

    feeditems.sort(key=updated_or_published, reverse=True)

    permalink_to_entry = collections.OrderedDict()
    for child in feeditems:
        if 'h-entry' in child['type']:
            permalinks = child['properties'].get('url', [])
            if not permalinks:
                logger.debug('ignoring h-entry with no u-url!')
            for permalink in permalinks:
                if isinstance(permalink, str):
                    permalink_to_entry[permalink] = child
                else:
                    logger.warning(
                        f'unexpected non-string "url" property: {permalink}')

        max = (MAX_PERMALINK_FETCHES_BETA
               if source.is_beta_user() else MAX_PERMALINK_FETCHES)
        if len(permalink_to_entry) >= max:
            logger.info(f'Hit cap of {max} permalinks. Stopping.')
            break

    # query all preexisting permalinks at once, instead of once per link
    permalinks_list = list(permalink_to_entry.keys())
    # fetch the maximum allowed entries (currently 30) at a time
    preexisting_list = itertools.chain.from_iterable(
        SyndicatedPost.query(SyndicatedPost.original.IN(
            permalinks_list[i:i + MAX_ALLOWABLE_QUERIES]),
                             ancestor=source.key)
        for i in range(0, len(permalinks_list), MAX_ALLOWABLE_QUERIES))
    preexisting = {}
    for r in preexisting_list:
        preexisting.setdefault(r.original, []).append(r)

    results = {}
    for permalink, entry in permalink_to_entry.items():
        logger.debug(f'processing permalink: {permalink}')
        new_results = process_entry(source,
                                    permalink,
                                    entry,
                                    refetch,
                                    preexisting.get(permalink, []),
                                    store_blanks=store_blanks)
        for key, value in new_results.items():
            results.setdefault(key, []).extend(value)

    if source.updates is not None and results:
        # keep track of the last time we've seen rel=syndication urls for
        # this author. this helps us decide whether to refetch periodically
        # and look for updates.
        # Source will be saved at the end of each round of polling
        source.updates['last_syndication_url'] = util.now_fn()

    return results
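
A small sketch of the chunked-query pattern used above to look up preexisting permalinks, with the ndb-specific call left as a comment; MAX_ALLOWABLE_QUERIES is assumed to be the 30-value cap mentioned in the code:

MAX_ALLOWABLE_QUERIES = 30  # assumed cap on values per IN() filter, as noted above

def chunked(seq, size):
    """Yield successive slices of seq, each at most size items long."""
    for i in range(0, len(seq), size):
        yield seq[i:i + size]

# the loop above then issues one ancestor query per chunk, roughly:
# for chunk in chunked(permalinks_list, MAX_ALLOWABLE_QUERIES):
#     SyndicatedPost.query(SyndicatedPost.original.IN(chunk), ancestor=source.key)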
Code example #4
  def expand_target_urls(self, activity):
    """Expand the inReplyTo or object fields of an ActivityStreams object
    by fetching the original and looking for rel=syndication URLs.

    This method modifies the dict in place.

    Args:
      activity: an ActivityStreams dict of the activity being published
    """
    for field in ('inReplyTo', 'object'):
      # microformats2.json_to_object de-dupes, no need to do it here
      objs = activity.get(field)
      if not objs:
        continue

      if isinstance(objs, dict):
        objs = [objs]

      augmented = list(objs)
      for obj in objs:
        url = obj.get('url')
        if not url:
          continue

        parsed = urllib.parse.urlparse(url)
        # ignore home pages. https://github.com/snarfed/bridgy/issues/760
        if parsed.path in ('', '/'):
          continue

        # get_webmention_target weeds out silos and non-HTML targets
        # that we wouldn't want to download and parse
        url, _, ok = util.get_webmention_target(url)
        if not ok:
          continue

        logging.debug('expand_target_urls fetching field=%s, url=%s', field, url)
        try:
          mf2 = util.fetch_mf2(url)
        except AssertionError:
          raise  # for unit tests
        except BaseException:
          # it's not a big deal if we can't fetch an in-reply-to url
          logging.info('expand_target_urls could not fetch field=%s, url=%s',
                       field, url, stack_info=True)
          continue

        synd_urls = mf2['rels'].get('syndication', [])

        # look for syndication urls in the first h-entry
        queue = collections.deque(mf2.get('items', []))
        while queue:
          item = queue.popleft()
          item_types = set(item.get('type', []))
          if 'h-feed' in item_types and 'h-entry' not in item_types:
            queue.extend(item.get('children', []))
            continue

          # these can be urls or h-cites
          synd_urls += microformats2.get_string_urls(
            item.get('properties', {}).get('syndication', []))

        logging.debug('expand_target_urls found rel=syndication for url=%s: %r', url, synd_urls)
        augmented += [{'url': u} for u in synd_urls]

      activity[field] = augmented
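
A condensed, self-contained version of the breadth-first walk above, assuming plain string syndication values instead of going through microformats2.get_string_urls; the function name is illustrative:

import collections

def first_level_syndication_urls(mf2):
  """Collect rel=syndication plus u-syndication from first-level h-entries,
  descending into h-feeds but not into nested children of entries."""
  urls = list(mf2.get('rels', {}).get('syndication', []))
  queue = collections.deque(mf2.get('items', []))
  while queue:
    item = queue.popleft()
    types = set(item.get('type', []))
    if 'h-feed' in types and 'h-entry' not in types:
      queue.extend(item.get('children', []))  # walk into the feed's children
      continue
    urls += [u for u in item.get('properties', {}).get('syndication', [])
             if isinstance(u, str)]
  return urls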