def infer_redirection(url): """ Function returning the url that the given url will redirect to. This is done by finding obvious hints in the GET parameters that the given url is in fact a redirection. Args: url (string): Target url. Returns: string: Redirected url or the original url if nothing was found. """ redirection_split = REDIRECTION_DOMAINS_RE.split(url, 1) if len(redirection_split) > 1: return infer_redirection('https://' + redirection_split[1]) obvious_redirect_match = re.search(OBVIOUS_REDIRECTS_RE, url) if obvious_redirect_match is not None: target = unquote(obvious_redirect_match.group(1)) if target.startswith('http://') or target.startswith('https://'): return target if target.startswith('/'): return urljoin(url, target) return url
def extract_url_from_facebook_link(url):
    """
    Function extracting the original url wrapped by a Facebook redirection
    link.

    Args:
        url (string): Target url.

    Returns:
        string: The extracted url, or `None` if nothing was found.
    """
    m = URL_EXTRACT_RE.search(url)

    if m is None:
        return None

    return unquote(m.group(1))
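
# A minimal usage sketch for the helper above, kept as an uncalled function.
# The wrapped url is hypothetical and assumes URL_EXTRACT_RE captures the
# quoted target carried by the `u` GET parameter of Facebook's `l.php` links,
# which is what its use here suggests.
def _example_extract_url_from_facebook_link():
    wrapped = 'https://l.facebook.com/l.php?u=https%3A%2F%2Fexample.com%2Fpost'

    # Expected to return the unquoted target, i.e. 'https://example.com/post',
    # or None when the link carries no extractable url.
    return extract_url_from_facebook_link(wrapped)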
def infer_redirection(url, recursive=True):
    """
    Function returning the url that the given url will redirect to. This is
    done by finding obvious hints in the GET parameters that the given url
    is in fact a redirection.

    Args:
        url (string): Target url.
        recursive (bool, optional): Whether to apply the function recursively
            until no further redirection can be inferred. Defaults to `True`.

    Returns:
        string: Redirected url or the original url if nothing was found.
    """
    redirection_split = REDIRECTION_DOMAINS_RE.split(url, 1)
    target = None

    if len(redirection_split) > 1:
        target = 'https://' + redirection_split[1]
    else:
        obvious_redirect_match = re.search(OBVIOUS_REDIRECTS_RE, url)

        if obvious_redirect_match is not None:
            potential_target = unquote(obvious_redirect_match.group(1))

            if potential_target.startswith('http://') or potential_target.startswith('https://'):
                target = potential_target

            if potential_target.startswith('/'):
                target = urljoin(url, potential_target)

    if target is None:
        return url

    if recursive:
        return infer_redirection(target, recursive=True)

    return target
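
# Illustrative sketch of the inference above, kept as an uncalled function.
# The wrapping url is hypothetical and assumes OBVIOUS_REDIRECTS_RE recognizes
# a `url=` GET parameter carrying a quoted http(s) target, as the docstring
# describes.
def _example_infer_redirection():
    wrapped = 'https://news.example.org/out?url=https%3A%2F%2Fexample.com%2Farticle'

    # With recursive=True (the default), inference is applied again on the
    # extracted target until no further redirection hint is found.
    # Expected result here: 'https://example.com/article'.
    return infer_redirection(wrapped)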
def normalize_url(url, unsplit=True, sort_query=True, strip_authentication=True,
                  strip_trailing_slash=True, strip_index=True, strip_protocol=True,
                  strip_irrelevant_subdomains=True, strip_lang_subdomains=False,
                  strip_lang_query_items=False, strip_fragment='except-routing',
                  normalize_amp=True, fix_common_mistakes=True,
                  infer_redirection=True, quoted=True):
    """
    Function normalizing the given url by stripping it of usually
    non-discriminant parts such as irrelevant query items or sub-domains etc.

    This is a very useful utility when attempting to match similar urls
    written slightly differently when shared on social media etc.

    Args:
        url (str): Target URL as a string.
        unsplit (bool, optional): Whether to join the result back into a
            string or return it as a `SplitResult`. Defaults to `True`.
        sort_query (bool, optional): Whether to sort query items or not.
            Defaults to `True`.
        strip_authentication (bool, optional): Whether to drop authentication.
            Defaults to `True`.
        strip_trailing_slash (bool, optional): Whether to drop the trailing
            slash. Defaults to `True`.
        strip_index (bool, optional): Whether to drop the trailing index at
            the end of the url. Defaults to `True`.
        strip_irrelevant_subdomains (bool, optional): Whether to strip
            irrelevant subdomains such as 'www' etc. Defaults to `True`.
        strip_lang_subdomains (bool, optional): Whether to drop language
            subdomains (ex: 'fr-FR.lemonde.fr' to only 'lemonde.fr' because
            'fr-FR' isn't a relevant subdomain, it only indicates the language
            and the country). Defaults to `False`.
        strip_lang_query_items (bool, optional): Whether to drop query items
            indicating the language of the page. Defaults to `False`.
        strip_protocol (bool, optional): Whether to strip the url's protocol.
            Defaults to `True`.
        strip_fragment (bool|str, optional): Whether to drop the fragment from
            the url. If set to `'except-routing'`, will only drop non-routing
            fragments (i.e. fragments that do not contain a "/"). Defaults to
            `'except-routing'`.
        normalize_amp (bool, optional): Whether to attempt to normalize Google
            AMP urls. Defaults to `True`.
        fix_common_mistakes (bool, optional): Whether to attempt to fix common
            mistakes. Defaults to `True`.
        infer_redirection (bool, optional): Whether to attempt resolving common
            redirects by leveraging well-known GET parameters. Defaults to
            `True`.
        quoted (bool, optional): Whether to normalize to the quoted or unquoted
            version of the url. Defaults to `True`.

    Returns:
        string: The normalized url.
""" original_url_arg = url if infer_redirection: url = resolve(url) if isinstance(url, SplitResult): has_protocol = bool(splitted.scheme) splitted = url else: has_protocol = PROTOCOL_RE.match(url) # Ensuring scheme so parsing works correctly if not has_protocol: url = 'http://' + url # Parsing try: splitted = urlsplit(url) except ValueError: return original_url_arg scheme, netloc, path, query, fragment = splitted # Fixing common mistakes if fix_common_mistakes: if query: query = re.sub(MISTAKES_RE, '&', query) # Handling punycode netloc = decode_punycode(netloc) # Dropping :80 & :443 if netloc.endswith(':80'): netloc = netloc[:-3] elif netloc.endswith(':443'): netloc = netloc[:-4] # Normalizing the path if path: trailing_slash = False if path.endswith('/') and len(path) > 1: trailing_slash = True path = normpath(path) if trailing_slash and not strip_trailing_slash: path = path + '/' # Handling Google AMP suffixes if normalize_amp: path = AMP_SUFFIXES_RE.sub('', path) # Dropping index: if strip_index: segments = path.rsplit('/', 1) if len(segments) != 0: last_segment = segments[-1] filename, ext = splitext(last_segment) if filename == 'index': segments.pop() path = '/'.join(segments) # Dropping irrelevant query items if query: domain_filter = None if splitted.hostname: domain_filter = next((f for d, f in PER_DOMAIN_QUERY_FILTERS if splitted.hostname.endswith(d)), None) qsl = parse_qsl(query, keep_blank_values=True) qsl = [ stringify_qs(item) for item in qsl if not should_strip_query_item( item, normalize_amp=normalize_amp, strip_lang_query_items=strip_lang_query_items, domain_filter=domain_filter) ] if sort_query: qsl = sorted(qsl) query = '&'.join(qsl) # Dropping fragment if it's not routing if fragment and strip_fragment: if strip_fragment is True or not should_strip_fragment(fragment): fragment = '' # Always dropping trailing slash with empty query & fragment if path == '/' and not fragment and not query: path = '' # Dropping irrelevant subdomains if strip_irrelevant_subdomains: netloc = re.sub( IRRELEVANT_SUBDOMAIN_AMP_RE if normalize_amp else IRRELEVANT_SUBDOMAIN_RE, '', netloc) # Dropping language as subdomains if strip_lang_subdomains: netloc = strip_lang_subdomains_from_netloc(netloc) # Dropping scheme if strip_protocol or not has_protocol: scheme = '' # Dropping authentication if strip_authentication: netloc = netloc.split('@', 1)[-1] # Normalizing AMP subdomains if normalize_amp and netloc.startswith('amp-'): netloc = netloc[4:] # Dropping trailing slash if strip_trailing_slash and path.endswith('/'): path = path.rstrip('/') # Quoting or not if quoted: path = quote(path) query = quote(query, RESERVED_CHARACTERS) fragment = quote(fragment, SAFE_CHARACTERS) else: path = unquote(path) query = unquote(query) fragment = unquote(fragment) # Result result = SplitResult(scheme, netloc.lower(), path, query, fragment) if not unsplit: return result # TODO: check if works with `unsplit=False` if strip_protocol or not has_protocol: result = urlunsplit(result)[2:] else: result = urlunsplit(result) return result