# NOTE: these snippets assume the surrounding module's imports (logging,
# mimetypes, requests, urllib) and local helpers (util, memcache,
# appengine_config, Source, domain_from_link, the webmention blacklist/
# blocklist checks, etc.).

def get_webmention_target(url, resolve=True, replace_test_domains=True):
  """Resolves a URL and decides whether we should try to send it a webmention.

  Note that this ignores failed HTTP requests, ie the boolean in the returned
  tuple will be True! TODO: check callers and reconsider this.

  Args:
    url: string
    resolve: whether to follow redirects
    replace_test_domains: whether to replace test user domains with localhost

  Returns:
    (string url, string pretty domain, boolean) tuple. The boolean is True if
    we should send a webmention, False otherwise, e.g. if it's a bad URL, not
    text/html, or in the blacklist.
  """
  url = util.clean_url(url)
  try:
    domain = domain_from_link(url).lower()
  except BaseException:
    logging.info('Dropping bad URL %s.', url)
    return url, None, False

  send = True
  if resolve:
    # this follows *all* redirects, until the end
    resolved = follow_redirects(url, cache=memcache)
    send = resolved.headers.get('content-type', '').startswith('text/html')
    url, domain, _ = get_webmention_target(
      resolved.url, resolve=False, replace_test_domains=replace_test_domains)

  send = send and domain and not in_webmention_blacklist(domain)

  if replace_test_domains:
    url = replace_test_domains_with_localhost(url)

  return url, domain, send
def get_webmention_target(url, resolve=True, replace_test_domains=True):
  """Resolves a URL and decides whether we should try to send it a webmention.

  Note that this ignores failed HTTP requests, ie the boolean in the returned
  tuple will be True! TODO: check callers and reconsider this.

  Args:
    url: string
    resolve: whether to follow redirects
    replace_test_domains: whether to replace test user domains with localhost

  Returns:
    (string url, string pretty domain, boolean) tuple. The boolean is True if
    we should send a webmention, False otherwise, e.g. if it's a bad URL, not
    text/html, or in the blocklist.
  """
  url = util.clean_url(url)
  try:
    domain = domain_from_link(url).lower()
  except BaseException:
    logging.info('Dropping bad URL %s.', url)
    return url, None, False

  if domain in ('puzzleadventura.com', 'sweetgamesbox.com'):
    return url, domain, False

  send = True
  if resolve:
    # this follows *all* redirects, until the end
    resolved = follow_redirects(url)
    html = resolved.headers.get('content-type', '').startswith('text/html')
    send = html and resolved.status_code != util.HTTP_RESPONSE_TOO_BIG_STATUS_CODE
    url, domain, _ = get_webmention_target(
      resolved.url, resolve=False, replace_test_domains=replace_test_domains)

  scheme = urllib.parse.urlparse(url).scheme  # require http or https
  send = (send and domain and scheme in ('http', 'https') and
          not in_webmention_blocklist(domain))

  if replace_test_domains:
    url = replace_test_domains_with_localhost(url)

  return url, domain, send
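# --- Usage sketch (not from the original source): how a caller might consume
# get_webmention_target()'s (url, domain, send) tuple. The URLs here are
# hypothetical and this assumes example.com isn't blocklisted. With
# resolve=False no HTTP request is made; the mailto: URL is still rejected
# (it fails domain parsing and the http/https scheme check).
for candidate in ('https://example.com/post', 'mailto:someone@example.com'):
  url, domain, send = get_webmention_target(candidate, resolve=False)
  if send:
    print('would send a webmention to %s (domain %s)' % (url, domain))
  else:
    print('skipping %s' % candidate)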
def original_post_discovery(activity, domains=None, cache=None,
                            include_redirect_sources=True, **kwargs):
  """Discovers original post links.

  This is a variation on http://indiewebcamp.com/original-post-discovery .
  It differs in that it finds multiple candidate links instead of one, and it
  doesn't bother looking for MF2 (etc) markup because the silos don't let you
  input it. More background:
  https://github.com/snarfed/bridgy/issues/51#issuecomment-136018857

  Original post candidates come from the upstreamDuplicates, attachments, and
  tags fields, as well as links and permashortlinks/permashortcitations in the
  text content.

  Args:
    activity: activity dict
    domains: optional sequence of domains. If provided, only links to these
      domains will be considered original and stored in upstreamDuplicates.
      (Permashortcitations are exempt.)
    cache: optional, a cache object for storing resolved URL redirects. Passed
      to follow_redirects().
    include_redirect_sources: boolean, whether to include URLs that redirect as
      well as their final destination URLs
    kwargs: passed to requests.head() when following redirects

  Returns:
    ([string original post URLs], [string mention URLs]) tuple
  """
  obj = activity.get('object') or activity
  content = obj.get('content', '').strip()

  # find all candidate URLs
  tags = [t.get('url') for t in obj.get('attachments', []) + obj.get('tags', [])
          if t.get('objectType') in ('article', 'mention', None)]
  candidates = tags + util.extract_links(content) + obj.get(
    'upstreamDuplicates', [])

  # Permashortcitations (http://indiewebcamp.com/permashortcitation) are short
  # references to canonical copies of a given (usually syndicated) post, of
  # the form (DOMAIN PATH). We consider them an explicit original post link.
  candidates += [match.expand(r'http://\1/\2') for match in
                 Source._PERMASHORTCITATION_RE.finditer(content)]

  candidates = set(filter(None,
    (util.clean_url(url) for url in candidates
     # heuristic: ellipsized URLs are probably incomplete, so omit them.
     if url and not url.endswith('...') and not url.endswith(u'…'))))

  # check for redirect and add their final urls
  redirects = {}  # maps final URL to original URL for redirects
  for url in list(candidates):
    resolved = util.follow_redirects(url, cache=cache, **kwargs)
    if (resolved.url != url and
        resolved.headers.get('content-type', '').startswith('text/html')):
      redirects[resolved.url] = url
      candidates.add(resolved.url)

  # use domains to determine which URLs are original post links vs mentions
  originals = set()
  mentions = set()
  for url in util.dedupe_urls(candidates):
    if url in redirects.values():
      # this is a redirected original URL. postpone and handle it when we hit
      # its final URL so that we know the final domain.
      continue
    domain = util.domain_from_link(url)
    which = (originals if not domains or util.domain_or_parent_in(
      domain, domains) else mentions)
    which.add(url)
    redirected_from = redirects.get(url)
    if redirected_from and include_redirect_sources:
      which.add(redirected_from)

  logging.info('Original post discovery found original posts %s, mentions %s',
               originals, mentions)
  return originals, mentions
def original_post_discovery(
    activity, domains=None, cache=None, include_redirect_sources=True, **kwargs
):
    """Discovers original post links.

    This is a variation on http://indiewebcamp.com/original-post-discovery .
    It differs in that it finds multiple candidate links instead of one, and it
    doesn't bother looking for MF2 (etc) markup because the silos don't let you
    input it. More background:
    https://github.com/snarfed/bridgy/issues/51#issuecomment-136018857

    Original post candidates come from the upstreamDuplicates, attachments, and
    tags fields, as well as links and permashortlinks/permashortcitations in
    the text content.

    Args:
      activity: activity dict
      domains: optional sequence of domains. If provided, only links to these
        domains will be considered original and stored in upstreamDuplicates.
        (Permashortcitations are exempt.)
      cache: optional, a cache object for storing resolved URL redirects.
        Passed to follow_redirects().
      include_redirect_sources: boolean, whether to include URLs that redirect
        as well as their final destination URLs
      kwargs: passed to requests.head() when following redirects

    Returns:
      ([string original post URLs], [string mention URLs]) tuple
    """
    obj = activity.get("object") or activity
    content = obj.get("content", "").strip()

    # find all candidate URLs
    tags = [
        t.get("url")
        for t in obj.get("attachments", []) + obj.get("tags", [])
        if t.get("objectType") in ("article", "mention", None)
    ]
    candidates = tags + util.extract_links(content) + obj.get("upstreamDuplicates", [])

    # Permashortcitations (http://indiewebcamp.com/permashortcitation) are short
    # references to canonical copies of a given (usually syndicated) post, of
    # the form (DOMAIN PATH). We consider them an explicit original post link.
    candidates += [
        match.expand(r"http://\1/\2")
        for match in Source._PERMASHORTCITATION_RE.finditer(content)
    ]

    candidates = set(
        filter(
            None,
            (
                util.clean_url(url)
                for url in candidates
                # heuristic: ellipsized URLs are probably incomplete, so omit them.
                if url and not url.endswith("...") and not url.endswith(u"…")
            ),
        )
    )

    # check for redirect and add their final urls
    redirects = {}  # maps final URL to original URL for redirects
    for url in list(candidates):
        resolved = follow_redirects(url, cache=cache, **kwargs)
        if resolved.url != url and resolved.headers.get("content-type", "").startswith(
            "text/html"
        ):
            redirects[resolved.url] = url
            candidates.add(resolved.url)

    # use domains to determine which URLs are original post links vs mentions
    originals = set()
    mentions = set()
    for url in util.dedupe_urls(candidates):
        if url in redirects.values():
            # this is a redirected original URL. postpone and handle it when we
            # hit its final URL so that we know the final domain.
            continue
        which = (
            originals
            if not domains or util.domain_from_link(url) in domains
            else mentions
        )
        which.add(url)
        redirected_from = redirects.get(url)
        if redirected_from and include_redirect_sources:
            which.add(redirected_from)

    logging.info(
        "Original post discovery found original posts %s, mentions %s",
        originals,
        mentions,
    )
    return originals, mentions
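# --- Usage sketch (not from the original source): original_post_discovery()
# on a hypothetical activity. Note it makes HEAD requests to resolve redirects
# for every candidate URL, so actual results depend on what those URLs resolve
# to; the expected split below assumes neither URL redirects.
activity = {
    "object": {
        "content": "new post! https://example.com/post and http://other.site/x",
        "upstreamDuplicates": ["https://example.com/post"],
    },
}
originals, mentions = original_post_discovery(activity, domains=["example.com"])
# expected, assuming no redirects:
#   originals == {"https://example.com/post"}
#   mentions == {"http://other.site/x"}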
def follow_redirects(url, cache=None, **kwargs):
    # This fragment originally began mid-function, at the kwargs.setdefault()
    # line below. The signature, cache lookup, and opening try: are inferred
    # from the callers above and from the cache_key and except clauses used
    # later in the body.
    if cache is not None:
        cache_key = "R " + url
        resolved = cache.get(cache_key)
        if resolved is not None:
            return resolved

    try:
        kwargs.setdefault("timeout", appengine_config.HTTP_TIMEOUT)
        resolved = requests.head(url, allow_redirects=True, **kwargs)
        resolved.raise_for_status()
        if resolved.url != url:
            logging.debug("Resolved %s to %s", url, resolved.url)
        cache_time = 0  # forever
    except AssertionError:
        raise
    except BaseException as e:
        logging.warning("Couldn't resolve URL %s : %s", url, e)
        resolved = requests.Response()
        resolved.url = url
        resolved.status_code = 499  # not standard. i made this up.
        cache_time = FAILED_RESOLVE_URL_CACHE_TIME

    # if the server didn't say what this is, guess from the URL's extension
    content_type = resolved.headers.get("content-type")
    if not content_type:
        type, _ = mimetypes.guess_type(resolved.url)
        resolved.headers["content-type"] = type or "text/html"

    # follow HTML meta-refresh style redirects served as a header
    refresh = resolved.headers.get("refresh")
    if refresh:
        for part in refresh.split(";"):
            if part.strip().startswith("url="):
                return follow_redirects(part.strip()[4:], cache=cache, **kwargs)

    resolved.url = util.clean_url(resolved.url)
    if cache is not None:
        # cache under both the original and the final URL
        cache.set_multi(
            {cache_key: resolved, "R " + resolved.url: resolved}, time=cache_time
        )

    return resolved
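# --- Usage sketch (not from the original source): follow_redirects() only
# needs a memcache-style cache exposing get() and set_multi(mapping, time=...),
# which is all the function above calls. A minimal in-memory stand-in
# (hypothetical, for tests or local runs):
class DictCache(object):
    def __init__(self):
        self.store = {}

    def get(self, key):
        return self.store.get(key)

    def set_multi(self, mapping, time=0):
        # expiration (`time`, in seconds) is accepted but ignored here
        self.store.update(mapping)

cache = DictCache()
resolved = follow_redirects("http://example.com/short", cache=cache)
print(resolved.url, resolved.status_code)  # final URL after any redirects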