Example #1
def fetch_mf2_func(url):
    # for silo domains, return a minimal synthetic h-card instead of fetching
    if util.domain_or_parent_in(
            urllib.parse.urlparse(url).netloc, SILO_DOMAINS):
        return {
            'items': [{
                'type': ['h-card'],
                'properties': {
                    'url': [url]
                }
            }]
        }
    return util.fetch_mf2(url, gateway=True)
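All of these snippets lean on the same helper. As a minimal sketch of the semantics they assume (assumption: the real util.domain_or_parent_in may handle edge cases differently), the check passes when the domain itself, or any parent domain of it, appears in the given collection:

def domain_or_parent_in(domain, domains):
    """Sketch only: True if domain, or a parent domain of it, is in domains."""
    if not domain or not domains:
        return False
    if domain in domains:
        return True
    # 'sub.example.com' matches 'example.com', but not 'ample.com'
    return any(domain.endswith('.' + d) for d in domains)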
Example #2
# Python 2 era: urlparse became urllib.parse in Python 3. self is captured
# from the enclosing method, since this function is defined inside it.
def fetch_mf2_func(url):
    # for silo domains, return a minimal synthetic h-card instead of fetching
    if util.domain_or_parent_in(
            urlparse.urlparse(url).netloc, SILO_DOMAINS):
        return {
            'items': [{
                'type': ['h-card'],
                'properties': {
                    'url': [url]
                }
            }]
        }
    _, doc = self._fetch(url)
    return mf2py.parse(doc=doc, url=url)
Example #3
def host_url(path_query=None):
    domain = util.domain_from_link(request.host_url)
    base = (HOST_URL if util.domain_or_parent_in(domain, OTHER_DOMAINS) else
            request.host_url)
    return urllib.parse.urljoin(base, path_query)
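For illustration, assume HOST_URL = 'https://example.org/' and 'example.appspot.com' is in OTHER_DOMAINS (both are module globals not shown here; the values are made up). A request arriving at the alias domain is then canonicalized onto HOST_URL:

# hypothetical: request.host_url == 'https://example.appspot.com/'
host_url('/about')  # 'https://example.org/about'
host_url()          # 'https://example.org/' (urljoin returns base for a falsy path)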
Example #4
def in_webmention_blocklist(domain):
    """Returns True if the domain or its root domain is in BLOCKLIST."""
    domain = domain.lower()
    return (util.domain_or_parent_in(domain, BLOCKLIST)
            or (not LOCAL and domain in LOCAL_HOSTS))
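A quick illustration of the two branches (BLOCKLIST, LOCAL, and LOCAL_HOSTS are module globals in the original project; the values below are assumed):

BLOCKLIST = {'t.co', 'facebook.com'}
LOCAL = False
LOCAL_HOSTS = {'localhost', '127.0.0.1'}

in_webmention_blocklist('www.facebook.com')  # True: parent domain is blocked
in_webmention_blocklist('example.com')       # False
in_webmention_blocklist('localhost')         # True: local host, not running locally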
Example #5
def in_webmention_blacklist(domain):
  """Returns True if the domain or its root domain is in BLACKLIST."""
  return util.domain_or_parent_in(domain.lower(), BLACKLIST)
Example #6
def original_post_discovery(activity,
                            domains=None,
                            cache=None,
                            include_redirect_sources=True,
                            **kwargs):
    """Discovers original post links.

    This is a variation on http://indiewebcamp.com/original-post-discovery . It
    differs in that it finds multiple candidate links instead of one, and it
    doesn't bother looking for MF2 (etc) markup because the silos don't let you
    input it. More background:
    https://github.com/snarfed/bridgy/issues/51#issuecomment-136018857

    Original post candidates come from the upstreamDuplicates, attachments, and
    tags fields, as well as links and permashortlinks/permashortcitations in the
    text content.

    Args:
      activity: activity dict
      domains: optional sequence of domains. If provided, only links to these
        domains will be considered original and stored in upstreamDuplicates.
        (Permashortcitations are exempt.)
      cache: optional, a cache object for storing resolved URL redirects. Passed
        to follow_redirects().
      include_redirect_sources: boolean, whether to include URLs that redirect
        as well as their final destination URLs
      kwargs: passed to requests.head() when following redirects

    Returns:
      ([string original post URLs], [string mention URLs]) tuple
    """
    obj = activity.get('object') or activity
    content = obj.get('content', '').strip()

    # find all candidate URLs
    tags = [
        t.get('url')
        for t in obj.get('attachments', []) + obj.get('tags', [])
        if t.get('objectType') in ('article', 'mention', None)
    ]
    candidates = tags + util.extract_links(content) + obj.get(
        'upstreamDuplicates', [])

    # Permashortcitations (http://indiewebcamp.com/permashortcitation) are short
    # references to canonical copies of a given (usually syndicated) post, of
    # the form (DOMAIN PATH). We consider them an explicit original post link.
    candidates += [
        match.expand(r'http://\1/\2')
        for match in Source._PERMASHORTCITATION_RE.finditer(content)
    ]

    candidates = set(
        filter(
            None,
            (
                util.clean_url(url) for url in candidates
                # heuristic: ellipsized URLs are probably incomplete, so omit them.
                if url and not url.endswith('...')
                and not url.endswith(u'…'))))

    # check for redirects and add their final URLs
    redirects = {}  # maps final URL to original URL for redirects
    for url in list(candidates):
        resolved = util.follow_redirects(url, cache=cache, **kwargs)
        if (resolved.url != url and resolved.headers.get(
                'content-type', '').startswith('text/html')):
            redirects[resolved.url] = url
            candidates.add(resolved.url)

    # use domains to determine which URLs are original post links vs mentions
    originals = set()
    mentions = set()
    for url in util.dedupe_urls(candidates):
        if url in redirects.values():
            # this is a redirected original URL. postpone and handle it when we hit
            # its final URL so that we know the final domain.
            continue
        domain = util.domain_from_link(url)
        which = (originals if not domains or util.domain_or_parent_in(
            domain, domains) else mentions)
        which.add(url)
        redirected_from = redirects.get(url)
        if redirected_from and include_redirect_sources:
            which.add(redirected_from)

    logging.info(
        'Original post discovery found original posts %s, mentions %s',
        originals, mentions)
    return originals, mentions
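The permashortcitation step is the least obvious part. Source._PERMASHORTCITATION_RE isn't shown in this snippet, so the pattern below is a hypothetical stand-in that illustrates the (DOMAIN PATH) form and the match.expand() call:

import re

# hypothetical; the real Source._PERMASHORTCITATION_RE may differ
PSC_RE = re.compile(r'\(([^\s/)]+\.[a-z]{2,})[ /]([^\s)]+)\)')

content = 'liked this post (ttk.me t4_81)'
for match in PSC_RE.finditer(content):
    print(match.expand(r'http://\1/\2'))  # http://ttk.me/t4_81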
Example #7
def original_post_discovery(activity, domains=None, cache=None,
                            include_redirect_sources=True, **kwargs):
  """Discovers original post links.

  This is a variation on http://indiewebcamp.com/original-post-discovery . It
  differs in that it finds multiple candidate links instead of one, and it
  doesn't bother looking for MF2 (etc) markup because the silos don't let you
  input it. More background:
  https://github.com/snarfed/bridgy/issues/51#issuecomment-136018857

  Original post candidates come from the upstreamDuplicates, attachments, and
  tags fields, as well as links and permashortlinks/permashortcitations in the
  text content.

  Args:
    activity: activity dict
    domains: optional sequence of domains. If provided, only links to these
      domains will be considered original and stored in upstreamDuplicates.
      (Permashortcitations are exempt.)
    cache: optional, a cache object for storing resolved URL redirects. Passed
      to follow_redirects().
    include_redirect_sources: boolean, whether to include URLs that redirect
      as well as their final destination URLs
    kwargs: passed to requests.head() when following redirects

  Returns:
    ([string original post URLs], [string mention URLs]) tuple
  """
  obj = activity.get('object') or activity
  content = obj.get('content', '').strip()

  # find all candidate URLs
  tags = [t.get('url') for t in obj.get('attachments', []) + obj.get('tags', [])
          if t.get('objectType') in ('article', 'mention', None)]
  candidates = tags + util.extract_links(content) + obj.get('upstreamDuplicates', [])

  # Permashortcitations (http://indiewebcamp.com/permashortcitation) are short
  # references to canonical copies of a given (usually syndicated) post, of
  # the form (DOMAIN PATH). We consider them an explicit original post link.
  candidates += [match.expand(r'http://\1/\2') for match in
                 Source._PERMASHORTCITATION_RE.finditer(content)]

  candidates = set(util.dedupe_urls(
    util.clean_url(url) for url in candidates
    # heuristic: ellipsized URLs are probably incomplete, so omit them.
    if url and not url.endswith('...') and not url.endswith('…')))

  # check for redirects and add their final URLs
  redirects = {}  # maps final URL to original URL for redirects
  for url in candidates:
    resolved = util.follow_redirects(url, cache=cache, **kwargs)
    if (resolved.url != url and
        resolved.headers.get('content-type', '').startswith('text/html')):
      redirects[resolved.url] = url

  candidates.update(redirects.keys())

  # use domains to determine which URLs are original post links vs mentions
  originals = set()
  mentions = set()
  for url in util.dedupe_urls(candidates):
    if url in redirects.values():
      # this is a redirected original URL. postpone and handle it when we hit
      # its final URL so that we know the final domain.
      continue
    domain = util.domain_from_link(url)
    which = (originals if not domains or util.domain_or_parent_in(domain, domains)
             else mentions)
    which.add(url)
    redirected_from = redirects.get(url)
    if redirected_from and include_redirect_sources:
      which.add(redirected_from)

  logging.info('Original post discovery found original posts %s, mentions %s',
               originals, mentions)
  return originals, mentions
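Both versions do the same redirect bookkeeping: resolve each candidate once, remember final URL -> original URL pairs, and classify only final URLs so that the original-vs-mention decision uses the final domain. A compressed trace with made-up URLs:

# candidates = {'http://bit.ly/x'}, which resolves to an HTML page, so
# redirects == {'http://mysite.example/post': 'http://bit.ly/x'}
# the classification loop skips 'http://bit.ly/x' (a value in redirects),
# buckets 'http://mysite.example/post' by its domain, and, if
# include_redirect_sources is True, adds 'http://bit.ly/x' to the same bucket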
Example #8
def host_url(handler):
    domain = util.domain_from_link(handler.request.host_url)
    return (HOST_URL if util.domain_or_parent_in(domain, OTHER_DOMAINS) else
            handler.request.host_url)
Example #9
def fetch_mf2_func(url):
  # for silo domains, return a minimal synthetic h-card instead of fetching
  if util.domain_or_parent_in(urlparse.urlparse(url).netloc, SILO_DOMAINS):
    return {'items': [{'type': ['h-card'], 'properties': {'url': [url]}}]}
  _, doc = self._fetch(url)
  return mf2py.parse(doc=doc, url=url, img_with_alt=True)
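Unlike Example #2, this variant passes img_with_alt=True, which makes recent versions of mf2py represent an <img> that has alt text as a {'value': ..., 'alt': ...} dict instead of a bare URL string. A standalone usage sketch, assuming network access and an example URL:

import mf2py

parsed = mf2py.parse(url='https://example.com/', img_with_alt=True)
hcards = [item for item in parsed['items'] if 'h-card' in item['type']]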