def redirect_unwrap(val):
    """Strips our redirect wrapping from a URL, or from URLs inside a container.

    Dicts and lists are walked recursively and rebuilt with each member
    unwrapped. Strings that don't start with this request's host URL, and
    values of any other type, are returned unchanged.

    Args:
      val: string URL, or a (possibly nested) dict or list containing them

    Returns:
      a value with the same shape as val, where each wrapped URL string has been
      replaced by the .url of the util.follow_redirects() result for it
    """
    if isinstance(val, list):
        return [redirect_unwrap(item) for item in val]
    if isinstance(val, dict):
        return {key: redirect_unwrap(item) for key, item in val.items()}
    if not isinstance(val, str):
        return val

    host_url = request.host_url
    wrapped_prefix = urllib.parse.urljoin(host_url, '/r/')

    if val.startswith(wrapped_prefix):
        # explicit /r/ wrapper: everything after the prefix is the target URL
        target = val[len(wrapped_prefix):]
        return util.follow_redirects(target).url

    if val.startswith(host_url):
        # any other URL on our host: resolve whatever util.domain_from_link()
        # extracts from its path
        path = urllib.parse.urlparse(val).path.strip('/')
        return util.follow_redirects(util.domain_from_link(path)).url

    return val
def follow_redirects(url, cache=True):
    """Calls :func:`oauth_dropins.webutil.util.follow_redirects` with our settings.

    Args:
      url: string, the URL to resolve
      cache: if truthy, memcache is passed as the redirect cache; otherwise no
        cache is passed

    Returns:
      whatever util.follow_redirects() returns
    """
    redirect_cache = memcache if cache else None
    headers = request_headers(url=url)
    return util.follow_redirects(url, cache=redirect_cache, headers=headers)
def follow_redirects(url, cache=True):
    """Calls util.follow_redirects() with our cache and user agent settings.

    Args:
      url: string, the URL to resolve
      cache: if truthy, memcache is passed as the redirect cache; otherwise no
        cache is passed

    Returns:
      whatever util.follow_redirects() returns
    """
    redirect_cache = memcache if cache else None
    return util.follow_redirects(
        url, cache=redirect_cache, headers=USER_AGENT_HEADER)
def redirect_unwrap(val):
    """Strips our redirect wrapping from a URL, or from URLs inside a container.

    Dicts and lists are walked recursively and rebuilt with each member
    unwrapped. Strings that aren't wrapped URLs, and values of any other type,
    are returned unchanged.

    Args:
      val: string URL, or a (possibly nested) dict or list containing them

    Returns:
      a value with the same shape as val, with wrapped URL strings unwrapped
    """
    if isinstance(val, list):
        return [redirect_unwrap(item) for item in val]
    if isinstance(val, dict):
        return {key: redirect_unwrap(item) for key, item in val.items()}
    if not isinstance(val, basestring):
        return val

    if val.startswith(REDIRECT_PREFIX):
        # explicit redirect wrapper: just drop the prefix, no network lookup
        return val[len(REDIRECT_PREFIX):]

    if val.startswith(appengine_config.HOST_URL):
        # any other URL on our host: resolve whatever util.domain_from_link()
        # extracts from its path, caching the result in memcache
        path = urlparse.urlparse(val).path.strip('/')
        resolved = util.follow_redirects(
            util.domain_from_link(path), cache=memcache)
        return resolved.url

    return val
def follow_redirects(url):
    """Calls :func:`oauth_dropins.webutil.util.follow_redirects` with our headers.

    Args:
      url: string, the URL to resolve

    Returns:
      whatever util.follow_redirects() returns
    """
    headers = request_headers(url=url)
    return util.follow_redirects(url, headers=headers)
def original_post_discovery(activity, domains=None, cache=None,
                            include_redirect_sources=True, **kwargs):
    """Discovers original post links and mentions in an activity.

    This is a variation on http://indiewebcamp.com/original-post-discovery .
    It collects multiple candidate links instead of a single one. More
    background:
    https://github.com/snarfed/bridgy/issues/51#issuecomment-136018857

    Candidates come from the object's upstreamDuplicates, attachments, and tags
    fields, plus links and permashortcitations found in its text content.

    Args:
      activity: activity dict. Its 'object' field is used if present and
        truthy, otherwise the activity itself.
      domains: optional sequence of domains. If provided, only URLs whose
        domain passes util.domain_or_parent_in() are returned as originals;
        all others are returned as mentions.
      cache: optional cache object, passed to util.follow_redirects()
      include_redirect_sources: boolean, whether to also return the URLs that
        redirected, alongside their final destination URLs
      kwargs: passed through to util.follow_redirects()

    Returns:
      (set of string original post URLs, set of string mention URLs) tuple
    """
    obj = activity.get('object') or activity
    content = obj.get('content', '').strip()

    # gather every candidate URL, in order: tags/attachments, links in the
    # text, then explicit upstream duplicates
    linked_items = obj.get('attachments', []) + obj.get('tags', [])
    links = [item.get('url') for item in linked_items
             if item.get('objectType') in ('article', 'mention', None)]
    links = links + util.extract_links(content) + obj.get(
        'upstreamDuplicates', [])

    # Permashortcitations (http://indiewebcamp.com/permashortcitation) are
    # short references to canonical copies of a given (usually syndicated)
    # post, of the form (DOMAIN PATH). They count as explicit original post
    # links.
    permashortcitations = [
        match.expand(r'http://\1/\2')
        for match in Source._PERMASHORTCITATION_RE.finditer(content)
    ]
    links += permashortcitations

    candidates = set()
    for link in links:
        # heuristic: ellipsized URLs are probably incomplete, so omit them.
        if not link or link.endswith(('...', u'…')):
            continue
        cleaned = util.clean_url(link)
        if cleaned:
            candidates.add(cleaned)

    # follow each candidate; remember redirects that land on an HTML page
    redirects = {}  # maps final URL to the URL that redirected to it
    for link in candidates:
        resolved = util.follow_redirects(link, cache=cache, **kwargs)
        if resolved.url == link:
            continue
        content_type = resolved.headers.get('content-type', '')
        if content_type.startswith('text/html'):
            redirects[resolved.url] = link
    candidates.update(redirects)

    # use domains to split the URLs into original post links vs mentions
    redirect_sources = set(redirects.values())
    originals = set()
    mentions = set()
    for link in util.dedupe_urls(candidates):
        if link in redirect_sources:
            # this URL redirected elsewhere. It's handled when we reach its
            # final URL, so that it's classified by the final domain.
            continue

        domain = util.domain_from_link(link)
        if not domains or util.domain_or_parent_in(domain, domains):
            bucket = originals
        else:
            bucket = mentions
        bucket.add(link)

        source = redirects.get(link)
        if source and include_redirect_sources:
            bucket.add(source)

    logging.info(
        'Original post discovery found original posts %s, mentions %s',
        originals, mentions)
    return originals, mentions
def original_post_discovery(activity, domains=None, cache=None,
                            include_redirect_sources=True, **kwargs):
    """Discovers original post links and mentions in an activity.

    This is a variation on http://indiewebcamp.com/original-post-discovery .
    It collects multiple candidate links instead of a single one. More
    background:
    https://github.com/snarfed/bridgy/issues/51#issuecomment-136018857

    Candidates come from the object's upstreamDuplicates, attachments, and tags
    fields, plus links and permashortcitations found in its text content.

    Args:
      activity: activity dict. Its 'object' field is used if present and
        truthy, otherwise the activity itself.
      domains: optional sequence of domains. If provided, only URLs whose
        domain passes util.domain_or_parent_in() are returned as originals;
        all others are returned as mentions.
      cache: optional cache object, passed to util.follow_redirects()
      include_redirect_sources: boolean, whether to also return the URLs that
        redirected, alongside their final destination URLs
      kwargs: passed through to util.follow_redirects()

    Returns:
      (set of string original post URLs, set of string mention URLs) tuple
    """
    obj = activity.get('object') or activity
    content = obj.get('content', '').strip()

    # gather every candidate URL, in order: tags/attachments, links in the
    # text, then explicit upstream duplicates
    linked_items = obj.get('attachments', []) + obj.get('tags', [])
    links = [item.get('url') for item in linked_items
             if item.get('objectType') in ('article', 'mention', None)]
    links = links + util.extract_links(content) + obj.get(
        'upstreamDuplicates', [])

    # Permashortcitations (http://indiewebcamp.com/permashortcitation) are
    # short references to canonical copies of a given (usually syndicated)
    # post, of the form (DOMAIN PATH). They count as explicit original post
    # links.
    permashortcitations = [
        match.expand(r'http://\1/\2')
        for match in Source._PERMASHORTCITATION_RE.finditer(content)
    ]
    links += permashortcitations

    # heuristic: ellipsized URLs are probably incomplete, so omit them.
    cleaned_links = (util.clean_url(link) for link in links
                     if link and not link.endswith(('...', '…')))
    candidates = set(util.dedupe_urls(cleaned_links))

    # follow each candidate; remember redirects that land on an HTML page
    redirects = {}  # maps final URL to the URL that redirected to it
    for link in candidates:
        resolved = util.follow_redirects(link, cache=cache, **kwargs)
        if resolved.url == link:
            continue
        content_type = resolved.headers.get('content-type', '')
        if content_type.startswith('text/html'):
            redirects[resolved.url] = link
    candidates |= set(redirects)

    # use domains to split the URLs into original post links vs mentions
    redirect_sources = set(redirects.values())
    originals = set()
    mentions = set()
    for link in util.dedupe_urls(candidates):
        if link in redirect_sources:
            # this URL redirected elsewhere. It's handled when we reach its
            # final URL, so that it's classified by the final domain.
            continue

        domain = util.domain_from_link(link)
        if not domains or util.domain_or_parent_in(domain, domains):
            bucket = originals
        else:
            bucket = mentions
        bucket.add(link)

        source = redirects.get(link)
        if source and include_redirect_sources:
            bucket.add(source)

    logging.info('Original post discovery found original posts %s, mentions %s',
                 originals, mentions)
    return originals, mentions