def get_normalized_hostname(url, normalize_amp=True, strip_lang_subdomains=False,
                            infer_redirection=True):
    """
    Function returning the normalized hostname of the given url, or None
    when the url cannot be parsed or does not have any hostname.

    Args:
        url (str|SplitResult): Target url, or an already split url.
        normalize_amp (bool, optional): Whether to use the AMP-aware
            irrelevant subdomain pattern and to drop a leading 'amp-'
            prefix. Defaults to `True`.
        strip_lang_subdomains (bool, optional): Whether to pass the hostname
            through `strip_lang_subdomains_from_netloc`.
            Defaults to `False`.
        infer_redirection (bool, optional): Whether to pass the url through
            `resolve` before anything else. Defaults to `True`.

    Returns:
        str|None: The normalized hostname, or None.
    """
    target = resolve(url) if infer_redirection else url

    # An already split url is used as is, anything else needs to be parsed
    if isinstance(target, SplitResult):
        parts = target
    else:
        try:
            parts = urlsplit(ensure_protocol(target))
        except ValueError:
            return None

    raw_hostname = parts.hostname

    if not raw_hostname:
        return None

    if normalize_amp:
        subdomain_re = IRRELEVANT_SUBDOMAIN_AMP_RE
    else:
        subdomain_re = IRRELEVANT_SUBDOMAIN_RE

    normalized = subdomain_re.sub('', raw_hostname.lower())

    if normalize_amp and normalized.startswith('amp-'):
        normalized = normalized[4:]

    normalized = decode_punycode(normalized)

    if not strip_lang_subdomains:
        return normalized

    return strip_lang_subdomains_from_netloc(normalized)
def convert_facebook_url_to_mobile(url):
    """
    Function rewriting the given facebook url so that it targets the mobile
    website. A url given without protocol is returned without protocol.

    Args:
        url (str): Facebook url to convert.

    Returns:
        str: The same url, with its netloc rewritten using
            MOBILE_REPLACE_RE and the 'm.facebook.' replacement.

    Raises:
        Exception: If 'facebook' cannot be found in the url's netloc.
    """
    url_with_protocol = ensure_protocol(url)
    parts = urlsplit(url_with_protocol)

    if 'facebook' not in parts.netloc:
        raise Exception(
            'ural.facebook.convert_facebook_url_to_mobile: %s is not a facebook url' % url)

    mobile_netloc = re.sub(MOBILE_REPLACE_RE, 'm.facebook.', parts.netloc)
    mobile_url = urlunsplit(
        (parts.scheme, mobile_netloc, parts.path, parts.query, parts.fragment))

    # If `ensure_protocol` changed the url, the protocol was only added for
    # parsing purposes and must not leak into the result
    if url_with_protocol != url:
        return mobile_url.split('://', 1)[-1]

    return mobile_url
def lru_stems(url, tld_aware=False):
    """
    Function returning the parts of the given url in the hierarchical order
    (lru).

    Args:
        url (str): Target URL as a string.
        tld_aware (bool, optional): Forwarded as is to
            `lru_stems_from_parsed_url`. Defaults to `False`.

    Returns:
        list: The lru, with a prefix identifying the type of each part.
    """
    # The url is given a protocol first, then split before being handed over
    parsed_url = urlsplit(ensure_protocol(url))

    return lru_stems_from_parsed_url(parsed_url, tld_aware=tld_aware)
def resolve_ampproject_redirect(splitted):
    """
    Function rebuilding the target of an ampproject.org url from its path.

    Args:
        splitted (SplitResult): Already split url.

    Returns:
        SplitResult: The given value untouched when its hostname does not
            end with '.ampproject.org' or when its path is not matched by
            AMPPROJECT_REDIRECTION_RE. Else a newly split 'https://' url
            built from the path stripped of what AMPPROJECT_REDIRECTION_RE
            matches, followed by the original query and fragment.
    """
    hostname = splitted.hostname

    if not hostname or not hostname.endswith('.ampproject.org'):
        return splitted

    if not AMPPROJECT_REDIRECTION_RE.search(splitted.path):
        return splitted

    pieces = ['https://', AMPPROJECT_REDIRECTION_RE.sub('', splitted.path)]

    if splitted.query:
        pieces.extend(('?', splitted.query))

    if splitted.fragment:
        pieces.extend(('#', splitted.fragment))

    return urlsplit(''.join(pieces))
def parse_youtube_url(url, fix_common_mistakes=True):
    """
    Function parsing the given url and returning either a YoutubeUser,
    YoutubeChannel, YoutubeVideo or None if nothing of information could be
    found.

    Args:
        url (str|SplitResult): Url to parse, or an already split url.
        fix_common_mistakes (bool, optional): Whether to fix common mistakes
            in Youtube urls as you can find them on the web. In practice
            this truncates video id candidates to their first 11 characters
            (except for ids found in the fragment). Defaults to `True`.

    Returns:
        YoutubeVideo|YoutubeUser|YoutubeChannel|None: The parsed entity.
    """

    # Inferring redirection
    url = infer_redirection(url)

    # The continuation regexes below can only search strings, so an already
    # split url is recomposed for them while being kept as is for parsing.
    if isinstance(url, SplitResult):
        parsed = url
        raw_url = url.geturl()
    else:
        parsed = None
        raw_url = url

    # Continuation urls
    m = NEXT_V_RE.search(raw_url) or NESTED_NEXT_V_RE.search(raw_url)

    if m:
        return YoutubeVideo(id=m.group(1))

    # Parsing
    if parsed is None:
        url = ensure_protocol(url)
        parsed = urlsplit(url)

    if not is_youtube_url(parsed):
        return None

    _, _, path, query, fragment = parsed

    # youtu.be
    if parsed.hostname.endswith('youtu.be'):
        if path.count('/') > 0:
            v = urlpathsplit(path)[0]

            if fix_common_mistakes:
                v = v[:11]

            if not is_youtube_video_id(v):
                return None

            return YoutubeVideo(id=v)

        return None

    # Hidden video in fragment
    if fragment:
        mv = FRAGMENT_V_RE.match(fragment)

        if mv:
            v = mv.group(1)

            if not is_youtube_video_id(v):
                return None

            return YoutubeVideo(id=v)

    # Typical video url
    if path == '/watch':
        mv = QUERY_V_RE.search(query)

        if mv:
            v = mv.group(1)

            if fix_common_mistakes:
                v = v[:11]

            if not is_youtube_video_id(v):
                return None

            return YoutubeVideo(id=v)

    # Video file
    elif (
        path.startswith('/v/') or
        path.startswith('/video/') or
        path.startswith('/embed/')
    ):
        v = urlpathsplit(path)[-1]

        if fix_common_mistakes:
            v = v[:11]

        if not is_youtube_video_id(v):
            return None

        return YoutubeVideo(id=v)

    # Typical user url
    elif path.startswith('/user/'):
        user = urlpathsplit(path)[1]

        return YoutubeUser(id=None, name=user)

    # Channel path?
    elif path.startswith('/c/'):
        name = urlpathsplit(path)[1]

        return YoutubeChannel(id=None, name=name)

    elif path.startswith('/channel/'):
        cid = urlpathsplit(path)[1]

        return YoutubeChannel(id=cid, name=None)

    else:
        # A single remaining path segment is reported as a channel name
        path = path.rstrip('/')

        if path.count('/') == 1:
            return YoutubeChannel(id=None, name=path.lstrip('/'))

    return None
def is_shortened_url(url):
    """
    Function returning whether the hostname of the given url is matched by
    TRIE (presumably the known url shortener domains; confirm against the
    place where TRIE is built).

    Args:
        url (str): Target url.

    Returns:
        bool: Whether `TRIE.longest` returns something truthy for the
            hostname labels given in reversed order. Always False when the
            url cannot be parsed or does not have any hostname.
    """
    try:
        hostname = urlsplit(ensure_protocol(url)).hostname
    except ValueError:
        return False

    # `hostname` is None when the url has no netloc (e.g. an empty string)
    if not hostname:
        return False

    return bool(TRIE.longest(reversed(hostname.split('.'))))
def normalize_url(url, unsplit=True, sort_query=True, strip_authentication=True,
                  strip_trailing_slash=True, strip_index=True, strip_protocol=True,
                  strip_irrelevant_subdomains=True, strip_lang_subdomains=False,
                  strip_lang_query_items=False, strip_fragment='except-routing',
                  normalize_amp=True, fix_common_mistakes=True, infer_redirection=True,
                  quoted=True):
    """
    Function normalizing the given url by stripping it of usually
    non-discriminant parts such as irrelevant query items or sub-domains etc.

    This is a very useful utility when attempting to match similar urls
    written slightly differently when shared on social media etc.

    Args:
        url (str|SplitResult): Target URL as a string, or an already split
            url.
        unsplit (bool, optional): Whether to return the result as a string.
            If `False`, a `SplitResult` is returned instead.
            Defaults to `True`.
        sort_query (bool, optional): Whether to sort query items or not.
            Defaults to `True`.
        strip_authentication (bool, optional): Whether to drop authentication.
            Defaults to `True`.
        strip_trailing_slash (bool, optional): Whether to drop trailing slash.
            Defaults to `True`.
        strip_index (bool, optional): Whether to drop trailing index at the end
            of the url. Defaults to `True`.
        strip_protocol (bool, optional): Whether to strip the url's protocol.
            Defaults to `True`.
        strip_irrelevant_subdomains (bool, optional): Whether to strip
            irrelevant subdomains such as www etc. Defaults to `True`.
        strip_lang_subdomains (bool, optional): Whether to drop language
            subdomains (ex: 'fr-FR.lemonde.fr' to only 'lemonde.fr' because
            'fr-FR' isn't a relevant subdomain, it indicates the language and
            the country). Defaults to `False`.
        strip_lang_query_items (bool, optional): Forwarded as is to
            `should_strip_query_item` when filtering query items.
            Defaults to `False`.
        strip_fragment (bool|str, optional): Whether to drop non-routing
            fragment from the url? If set to `except-routing` will only drop
            non-routing fragment (i.e. fragments that do not contain a "/").
            Defaults to `except-routing`.
        normalize_amp (bool, optional): Whether to attempt to normalize Google
            AMP urls. Defaults to `True`.
        fix_common_mistakes (bool, optional): Whether to attempt solving common
            mistakes. Defaults to `True`.
        infer_redirection (bool, optional): Whether to attempt resolving common
            redirects by leveraging well-known GET parameters.
            Defaults to `True`.
        quoted (bool, optional): Normalizing to quoted or unquoted.
            Defaults to `True`.

    Returns:
        str|SplitResult: The normalized url, as a `SplitResult` if `unsplit`
            is `False`. The `url` argument is returned untouched when it
            cannot be parsed.
    """
    original_url_arg = url

    if infer_redirection:
        url = resolve(url)

    if isinstance(url, SplitResult):
        # `splitted` must be bound before its scheme can be inspected
        splitted = url
        has_protocol = bool(splitted.scheme)
    else:
        has_protocol = PROTOCOL_RE.match(url)

        # Ensuring scheme so parsing works correctly
        if not has_protocol:
            url = 'http://' + url

        # Parsing
        try:
            splitted = urlsplit(url)
        except ValueError:
            return original_url_arg

    scheme, netloc, path, query, fragment = splitted

    # Fixing common mistakes
    if fix_common_mistakes:
        if query:
            query = re.sub(MISTAKES_RE, '&', query)

    # Handling punycode
    netloc = decode_punycode(netloc)

    # Dropping :80 & :443
    if netloc.endswith(':80'):
        netloc = netloc[:-3]
    elif netloc.endswith(':443'):
        netloc = netloc[:-4]

    # Normalizing the path
    if path:
        # `normpath` drops the trailing slash, so we remember it to be able
        # to restore it afterwards if required
        trailing_slash = path.endswith('/') and len(path) > 1
        path = normpath(path)

        if trailing_slash and not strip_trailing_slash:
            path = path + '/'

    # Handling Google AMP suffixes
    if normalize_amp:
        path = AMP_SUFFIXES_RE.sub('', path)

    # Dropping index:
    if strip_index:
        segments = path.rsplit('/', 1)
        filename, _ = splitext(segments[-1])

        if filename == 'index':
            segments.pop()
            path = '/'.join(segments)

    # Dropping irrelevant query items
    if query:
        domain_filter = None

        if splitted.hostname:
            domain_filter = next(
                (f for d, f in PER_DOMAIN_QUERY_FILTERS if splitted.hostname.endswith(d)),
                None
            )

        qsl = parse_qsl(query, keep_blank_values=True)
        qsl = [
            stringify_qs(item)
            for item in qsl
            if not should_strip_query_item(
                item,
                normalize_amp=normalize_amp,
                strip_lang_query_items=strip_lang_query_items,
                domain_filter=domain_filter
            )
        ]

        if sort_query:
            qsl = sorted(qsl)

        query = '&'.join(qsl)

    # Dropping fragment if it's not routing
    # NOTE(review): the fragment is dropped when `should_strip_fragment`
    # returns something falsy; confirm the helper's name matches its meaning.
    if fragment and strip_fragment:
        if strip_fragment is True or not should_strip_fragment(fragment):
            fragment = ''

    # Always dropping trailing slash with empty query & fragment
    if path == '/' and not fragment and not query:
        path = ''

    # Dropping irrelevant subdomains
    if strip_irrelevant_subdomains:
        netloc = re.sub(
            IRRELEVANT_SUBDOMAIN_AMP_RE if normalize_amp else IRRELEVANT_SUBDOMAIN_RE,
            '',
            netloc
        )

    # Dropping language as subdomains
    if strip_lang_subdomains:
        netloc = strip_lang_subdomains_from_netloc(netloc)

    # Dropping scheme
    if strip_protocol or not has_protocol:
        scheme = ''

    # Dropping authentication
    if strip_authentication:
        netloc = netloc.split('@', 1)[-1]

    # Normalizing AMP subdomains
    if normalize_amp and netloc.startswith('amp-'):
        netloc = netloc[4:]

    # Dropping trailing slash
    if strip_trailing_slash and path.endswith('/'):
        path = path.rstrip('/')

    # Quoting or not
    if quoted:
        path = quote(path)
        query = quote(query, RESERVED_CHARACTERS)
        fragment = quote(fragment, SAFE_CHARACTERS)
    else:
        path = unquote(path)
        query = unquote(query)
        fragment = unquote(fragment)

    # Result
    result = SplitResult(scheme, netloc.lower(), path, query, fragment)

    if not unsplit:
        return result

    # TODO: check if works with `unsplit=False`
    result = urlunsplit(result)

    # Only dropping the leading '//' when it is actually there, so that a url
    # without netloc does not lose the first two characters of its path
    if (strip_protocol or not has_protocol) and result.startswith('//'):
        result = result[2:]

    return result
def get_hostname(url):
    """
    Function returning the hostname of the given url.

    Args:
        url (str): Target url.

    Returns:
        str|None: The hostname, or None if the url has no hostname or if a
            ValueError is raised while parsing it.
    """
    try:
        parsed = urlsplit(ensure_protocol(url))
        hostname = parsed.hostname
    except ValueError:
        return None

    # Falsy hostnames (e.g. an empty string) are reported as None
    return hostname if hostname else None