def get_normalized_hostname(url, normalize_amp=True, strip_lang_subdomains=False,
                            infer_redirection=True):
    """
    Function returning the normalized hostname of the given url.

    Args:
        url (str or SplitResult): Target url, either raw or already split.
        normalize_amp (bool, optional): Whether to use the amp-aware
            subdomain pattern and drop a leading `amp-` prefix.
            Defaults to `True`.
        strip_lang_subdomains (bool, optional): Whether to pass the hostname
            through `strip_lang_subdomains_from_netloc`.
            Defaults to `False`.
        infer_redirection (bool, optional): Whether to pass the url through
            `resolve` before parsing. Defaults to `True`.

    Returns:
        str or None: The normalized hostname, or `None` when the url cannot
            be split or has no hostname.

    """
    if infer_redirection:
        url = resolve(url)

    # Only raw urls need splitting; an invalid one yields no hostname at all.
    if not isinstance(url, SplitResult):
        try:
            url = urlsplit(ensure_protocol(url))
        except ValueError:
            return None

    raw_hostname = url.hostname

    if not raw_hostname:
        return None

    normalized = raw_hostname.lower()

    if normalize_amp:
        normalized = IRRELEVANT_SUBDOMAIN_AMP_RE.sub('', normalized)

        # 'amp-' is 4 characters long
        if normalized.startswith('amp-'):
            normalized = normalized[4:]
    else:
        normalized = IRRELEVANT_SUBDOMAIN_RE.sub('', normalized)

    normalized = decode_punycode(normalized)

    if strip_lang_subdomains:
        normalized = strip_lang_subdomains_from_netloc(normalized)

    return normalized
def convert_facebook_url_to_mobile(url):
    """
    Function rewriting the given facebook url so that it targets the mobile
    website.

    Args:
        url (str): Facebook url, with or without a protocol.

    Returns:
        str: The mobile url. A protocol is only present in the result if the
            given url had one.

    Raises:
        Exception: If `facebook` is not found in the url's netloc.

    """
    url_with_protocol = ensure_protocol(url)
    parts = urlsplit(url_with_protocol)

    if 'facebook' not in parts.netloc:
        raise Exception(
            'ural.facebook.convert_facebook_url_to_mobile: %s is not a facebook url' % url)

    mobile_netloc = re.sub(MOBILE_REPLACE_RE, 'm.facebook.', parts.netloc)
    mobile_url = urlunsplit(
        (parts.scheme, mobile_netloc, parts.path, parts.query, parts.fragment)
    )

    # A protocol had to be added for parsing: drop it again from the output.
    if url_with_protocol != url:
        return mobile_url.split('://', 1)[-1]

    return mobile_url
def lru_stems(url, tld_aware=False):
    """
    Function returning the parts of the given url in the hierarchical order
    (lru).

    Args:
        url (str): Target URL as a string.
        tld_aware (bool, optional): Forwarded as-is to
            `lru_stems_from_parsed_url`. Defaults to `False`.

    Returns:
        list: The lru, with a prefix identifying the type of each part.

    """
    parsed_url = urlsplit(ensure_protocol(url))

    return lru_stems_from_parsed_url(parsed_url, tld_aware=tld_aware)
def lru_from_url(url, default_protocol='http'):
    """
    Function returning the parts of the given url in the hierarchical order
    (lru).

    Args:
        url (str): Target URL as a string.
        default_protocol (str, optional): Protocol to add if there is none.
            Defaults to `'http'`.

    Returns:
        list: The lru, with a prefix identifying the type of each part.

    """
    parsed_url = urlsplit(ensure_protocol(url, protocol=default_protocol))

    return parsed_url_to_lru(parsed_url)
def normalized_lru_from_url(url, default_protocol='http', **kwargs):
    """
    Function normalizing the given url by stripping it of usually
    non-discriminant parts such as irrelevant query items or sub-domains,
    and returning its parts in the hierarchical order (lru).

    Extra keyword arguments are forwarded to `normalize_url`.

    Args:
        url (str): Target URL as a string.
        default_protocol (str, optional): Protocol to add if there is none.
            Defaults to `'http'`.
        sort_query (bool, optional): Whether to sort query items or not.
            Defaults to `True`.
        strip_authentication (bool, optional): Whether to drop
            authentication. Defaults to `True`.
        strip_trailing_slash (bool, optional): Whether to drop trailing
            slash. Defaults to `False`.
        strip_index (bool, optional): Whether to drop trailing index at the
            end of the url. Defaults to `True`.

    Returns:
        list: The normalized lru, with a prefix identifying the type of each
            part.

    """
    url_with_protocol = ensure_protocol(url, protocol=default_protocol)
    normalized_parts = normalize_url(url_with_protocol, parsed=True, **kwargs)

    return parsed_url_to_lru(normalized_parts)
def parse_youtube_url(url, fix_common_mistakes=True):
    """
    Function parsing the given url and returning either a YoutubeUser,
    YoutubeChannel, YoutubeVideo or None if nothing of information could be
    found.

    Args:
        url (str or SplitResult): Url to parse. An already split url is used
            as-is for parsing; redirection inference is only applied to
            string urls.
        fix_common_mistakes (bool, optional): Whether to fix common mistakes
            in Youtube urls as you can find them on the web.
            Defaults to `True`.

    Returns:
        YoutubeVideo, YoutubeUser, YoutubeChannel or None: The parsed entity,
            or `None` if the url is not a youtube url or holds nothing
            relevant.

    """

    if isinstance(url, SplitResult):
        # A split url cannot be handed to the regex checks below (searching
        # a tuple raises TypeError), so keep it as the parsed form and
        # rebuild its string form for them.
        parsed = url
        url = parsed.geturl()
    else:
        parsed = None

        # Inferring redirection
        url = infer_redirection(url)

    # Continuation urls
    m = NEXT_V_RE.search(url) or NESTED_NEXT_V_RE.search(url)

    if m:
        return YoutubeVideo(id=m.group(1))

    # Parsing
    if parsed is None:
        parsed = urlsplit(ensure_protocol(url))

    if not is_youtube_url(parsed):
        return None

    _, _, path, query, fragment = parsed

    # youtu.be
    if parsed.hostname.endswith('youtu.be'):
        if path.count('/') > 0:
            segments = urlpathsplit(path)

            # NOTE(review): guards a path holding no segment (e.g. '/'),
            # in case urlpathsplit returns an empty sequence for it.
            if not segments:
                return None

            v = segments[0]

            if fix_common_mistakes:
                v = v[:11]

            if not is_youtube_video_id(v):
                return None

            return YoutubeVideo(id=v)

        return None

    # Hidden video in fragment
    if fragment:
        mv = FRAGMENT_V_RE.match(fragment)

        if mv:
            v = mv.group(1)

            if not is_youtube_video_id(v):
                return None

            return YoutubeVideo(id=v)

    # Typical video url
    if path == '/watch':
        mv = QUERY_V_RE.search(query)

        if mv:
            v = mv.group(1)

            if fix_common_mistakes:
                v = v[:11]

            if not is_youtube_video_id(v):
                return None

            return YoutubeVideo(id=v)

    # Video file
    elif (
        path.startswith('/v/') or
        path.startswith('/video/') or
        path.startswith('/embed/')
    ):
        v = urlpathsplit(path)[-1]

        if fix_common_mistakes:
            v = v[:11]

        if not is_youtube_video_id(v):
            return None

        return YoutubeVideo(id=v)

    # Typical user url
    elif path.startswith('/user/'):
        segments = urlpathsplit(path)

        # NOTE(review): a bare '/user/' may yield a single segment; return
        # None rather than risk an IndexError. Same for the channel paths.
        if len(segments) < 2:
            return None

        return YoutubeUser(id=None, name=segments[1])

    # Channel path?
    elif path.startswith('/c/'):
        segments = urlpathsplit(path)

        if len(segments) < 2:
            return None

        return YoutubeChannel(id=None, name=segments[1])

    elif path.startswith('/channel/'):
        segments = urlpathsplit(path)

        if len(segments) < 2:
            return None

        return YoutubeChannel(id=segments[1], name=None)

    else:
        # A single-segment path is treated as a channel name
        path = path.rstrip('/')

        if path.count('/') == 1:
            return YoutubeChannel(id=None, name=path.lstrip('/'))

    return None
def is_shortened_url(url):
    """
    Function returning whether the hostname of the given url is matched by
    the module's `TRIE` (presumably the known url shortener domains --
    confirm against its definition).

    Args:
        url (str): Target url, with or without a protocol.

    Returns:
        bool: `True` if `TRIE.longest` finds a match for the reversed
            hostname labels, `False` otherwise, including when the url
            cannot be split or has no hostname.

    """
    try:
        hostname = urlsplit(ensure_protocol(url)).hostname
    except ValueError:
        # Invalid urls cannot be shortened urls
        return False

    # `hostname` is None when the url has no netloc (e.g. an empty string)
    if not hostname:
        return False

    return bool(TRIE.longest(reversed(hostname.split('.'))))
def normalized_lru_stems(url, tld_aware=False, **kwargs):
    """
    Function normalizing the given url through `normalize_url` and returning
    the result of `lru_stems_from_parsed_url` on it.

    Args:
        url (str): Target URL as a string.
        tld_aware (bool, optional): Forwarded as-is to
            `lru_stems_from_parsed_url`. Defaults to `False`.
        **kwargs: Extra keyword arguments forwarded to `normalize_url`.

    Returns:
        The value returned by `lru_stems_from_parsed_url`.

    """
    # `unsplit=False` is passed so the normalized url can be given directly
    # to `lru_stems_from_parsed_url`.
    normalized_parts = normalize_url(ensure_protocol(url), unsplit=False, **kwargs)

    return lru_stems_from_parsed_url(normalized_parts, tld_aware=tld_aware)
def get_hostname(url):
    """
    Function returning the hostname of the given url.

    Args:
        url (str): Target url, with or without a protocol.

    Returns:
        str or None: The hostname, or `None` if the url has none or if a
            `ValueError` is raised while splitting it.

    """
    try:
        hostname = urlsplit(ensure_protocol(url)).hostname
    except ValueError:
        return None

    # Falsy hostnames (e.g. an empty string) are reported as None
    return hostname or None
from ural.normalize_url import normalize_url
from ural.ensure_protocol import ensure_protocol

# Print the normalized form, with a protocol ensured, of every url listed in
# the amp urls data file.
with open('./scripts/data/amp-urls.txt') as amp_file:
    for raw_line in amp_file:
        # Drop surrounding whitespace, then the first and last characters
        # (presumably enclosing quotes -- verify against the data file).
        bare_url = raw_line.strip()[1:-1]
        normalized = normalize_url(bare_url)

        print(ensure_protocol(normalized))