Beispiel #1
0
def get_normalized_hostname(url,
                            normalize_amp=True,
                            strip_lang_subdomains=False,
                            infer_redirection=True):
    """
    Function returning a normalized version of the hostname of the given url.

    Args:
        url (str or SplitResult): Target url. An already split url is used
            as is, anything else goes through `ensure_protocol` and
            `urlsplit`.
        normalize_amp (bool, optional): Whether to use the AMP-aware
            irrelevant subdomain pattern and to drop a leading `amp-` from
            the hostname. Defaults to `True`.
        strip_lang_subdomains (bool, optional): Whether to pass the hostname
            through `strip_lang_subdomains_from_netloc`. Defaults to `False`.
        infer_redirection (bool, optional): Whether to pass the url through
            `resolve` before anything else. Defaults to `True`.

    Returns:
        str or None: The normalized hostname, or None if the url could not
            be split (`ValueError`) or has no hostname.
    """
    target = resolve(url) if infer_redirection else url

    if isinstance(target, SplitResult):
        parts = target
    else:
        try:
            parts = urlsplit(ensure_protocol(target))
        except ValueError:
            return None

    raw_hostname = parts.hostname

    if not raw_hostname:
        return None

    lowered = raw_hostname.lower()

    if normalize_amp:
        host = IRRELEVANT_SUBDOMAIN_AMP_RE.sub('', lowered)

        # The `amp-` prefix is 4 characters long
        if host.startswith('amp-'):
            host = host[4:]
    else:
        host = IRRELEVANT_SUBDOMAIN_RE.sub('', lowered)

    host = decode_punycode(host)

    if not strip_lang_subdomains:
        return host

    return strip_lang_subdomains_from_netloc(host)
Beispiel #2
0
def convert_facebook_url_to_mobile(url):
    """
    Function rewriting the given facebook url so that it targets the mobile
    website.

    Args:
        url (str): Facebook url to convert.

    Returns:
        str: The converted url. If `ensure_protocol` had to alter the given
            url, everything up to the first `://` is dropped from the result.

    Raises:
        Exception: If the url's netloc does not contain `facebook`.
    """
    with_protocol = ensure_protocol(url)
    parts = urlsplit(with_protocol)

    if 'facebook' not in parts.netloc:
        raise Exception(
            'ural.facebook.convert_facebook_url_to_mobile: %s is not a facebook url'
            % url)

    mobile_netloc = re.sub(MOBILE_REPLACE_RE, 'm.facebook.', parts.netloc)

    mobile_url = urlunsplit(
        (parts.scheme, mobile_netloc, parts.path, parts.query, parts.fragment))

    # The url was left untouched by `ensure_protocol`: keep the protocol
    if with_protocol == url:
        return mobile_url

    return mobile_url.split('://', 1)[-1]
Beispiel #3
0
def lru_stems(url, tld_aware=False):
    """
    Function splitting the given url into its parts, in hierarchical order
    (lru).

    Args:
        url (str): Target URL as a string.
        tld_aware (bool, optional): Forwarded as is to
            `lru_stems_from_parsed_url`. Defaults to `False`.

    Returns:
        list: The lru, with a prefix identifying the type of each part.
    """
    parsed = urlsplit(ensure_protocol(url))

    return lru_stems_from_parsed_url(parsed, tld_aware=tld_aware)
Beispiel #4
0
def lru_from_url(url, default_protocol='http'):
    """
    Function splitting the given url into its parts, in hierarchical order
    (lru).

    Args:
        url (str): Target URL as a string.
        default_protocol (str, optional): Protocol handed to
            `ensure_protocol` for urls lacking one. Defaults to `'http'`.

    Returns:
        list: The lru, with a prefix identifying the type of each part.
    """
    parsed = urlsplit(ensure_protocol(url, protocol=default_protocol))

    return parsed_url_to_lru(parsed)
Beispiel #5
0
def normalized_lru_from_url(url, default_protocol='http', **kwargs):
    """
    Function normalizing the given url, i.e. stripping it of usually
    non-discriminant parts such as irrelevant query items or sub-domains,
    then returning its parts in hierarchical order (lru).

    Any extra keyword argument is forwarded to `normalize_url`.

    Args:
        url (str): Target URL as a string.
        default_protocol (str, optional): Protocol handed to
            `ensure_protocol` for urls lacking one. Defaults to `'http'`.
        sort_query (bool, optional): Whether to sort query items or not.
            Defaults to `True`.
        strip_authentication (bool, optional): Whether to drop authentication.
            Defaults to `True`.
        strip_trailing_slash (bool, optional): Whether to drop trailing slash.
            Defaults to `False`.
        strip_index (bool, optional): Whether to drop trailing index at the end
            of the url. Defaults to `True`.

    Returns:
        list: The normalized lru, with a prefix identifying the type of each part.
    """
    with_protocol = ensure_protocol(url, protocol=default_protocol)

    # `parsed=True` asks `normalize_url` for a parsed result rather than a string
    normalized = normalize_url(with_protocol, parsed=True, **kwargs)

    return parsed_url_to_lru(normalized)
Beispiel #6
0
def parse_youtube_url(url, fix_common_mistakes=True):
    """
    Function parsing the given url to find the Youtube entity it refers to.

    Args:
        url (str): Url to parse.
        fix_common_mistakes (bool, optional): Whether to truncate candidate
            video ids to their first 11 characters before validating them.
            Ids found in the url fragment are never truncated.
            Defaults to `True`.

    Returns:
        YoutubeVideo, YoutubeUser, YoutubeChannel or None: None if the url is
            not a Youtube url or if nothing of interest could be found.
    """

    def as_video(candidate, truncate):
        # Validate a candidate video id, optionally truncated beforehand
        if truncate:
            candidate = candidate[:11]

        if is_youtube_video_id(candidate):
            return YoutubeVideo(id=candidate)

        return None

    url = infer_redirection(url)

    # Continuation urls carry the video id in a nested parameter
    continuation = NEXT_V_RE.search(url) or NESTED_NEXT_V_RE.search(url)

    if continuation:
        return YoutubeVideo(id=continuation.group(1))

    if isinstance(url, SplitResult):
        parsed = url
    else:
        parsed = urlsplit(ensure_protocol(url))

    if not is_youtube_url(parsed):
        return None

    _, _, path, query, fragment = parsed

    # Short youtu.be urls: the id is the first path segment
    if parsed.hostname.endswith('youtu.be'):
        if '/' not in path:
            return None

        return as_video(urlpathsplit(path)[0], fix_common_mistakes)

    # Video hidden in the fragment: takes precedence over the path
    if fragment:
        in_fragment = FRAGMENT_V_RE.match(fragment)

        if in_fragment:
            return as_video(in_fragment.group(1), False)

    # Typical video url: a /watch path without a matching query yields nothing
    if path == '/watch':
        in_query = QUERY_V_RE.search(query)

        if not in_query:
            return None

        return as_video(in_query.group(1), fix_common_mistakes)

    # Video file: the id is the last path segment
    if path.startswith(('/v/', '/video/', '/embed/')):
        return as_video(urlpathsplit(path)[-1], fix_common_mistakes)

    # Typical user url
    if path.startswith('/user/'):
        return YoutubeUser(id=None, name=urlpathsplit(path)[1])

    # Channel referenced by name
    if path.startswith('/c/'):
        return YoutubeChannel(id=None, name=urlpathsplit(path)[1])

    # Channel referenced by id
    if path.startswith('/channel/'):
        return YoutubeChannel(id=urlpathsplit(path)[1], name=None)

    # Anything else: a single-segment path is taken as a channel name
    trimmed = path.rstrip('/')

    if trimmed.count('/') == 1:
        return YoutubeChannel(id=None, name=trimmed.lstrip('/'))

    return None
Beispiel #7
0
def is_shortened_url(url):
    """
    Function returning whether the hostname of the given url is found in
    `TRIE`, which is queried with the hostname's dot-separated labels in
    reversed order.

    Args:
        url (str): Target url.

    Returns:
        bool: Whether `TRIE.longest` returned something truthy. Always
            `False` if the url cannot be split or has no hostname.
    """
    try:
        hostname = urlsplit(ensure_protocol(url)).hostname
    except ValueError:
        # Unparseable url: same leniency as the hostname getters
        return False

    # `hostname` is None when the url has no netloc (e.g. an empty string),
    # in which case calling `.split` on it would raise an AttributeError
    if not hostname:
        return False

    return bool(TRIE.longest(reversed(hostname.split('.'))))
Beispiel #8
0
def normalized_lru_stems(url, tld_aware=False, **kwargs):
    """
    Function normalizing the given url with `normalize_url`, then handing the
    result to `lru_stems_from_parsed_url`.

    Args:
        url (str): Target URL as a string.
        tld_aware (bool, optional): Forwarded as is to
            `lru_stems_from_parsed_url`. Defaults to `False`.
        **kwargs: Forwarded as is to `normalize_url`.

    Returns:
        The return value of `lru_stems_from_parsed_url`.
    """
    with_protocol = ensure_protocol(url)

    # `unsplit=False` asks `normalize_url` not to join the url back together
    normalized = normalize_url(with_protocol, unsplit=False, **kwargs)

    return lru_stems_from_parsed_url(normalized, tld_aware=tld_aware)
Beispiel #9
0
def get_hostname(url):
    """
    Function returning the hostname of the given url.

    Args:
        url (str): Target url.

    Returns:
        str or None: The hostname, or None if the url has none or if
            parsing it raised a `ValueError`.
    """
    try:
        hostname = urlsplit(ensure_protocol(url)).hostname
    except ValueError:
        return None

    # Falsy hostnames (e.g. empty string) are reported as None
    return hostname if hostname else None
Beispiel #10
0
from ural.normalize_url import normalize_url
from ural.ensure_protocol import ensure_protocol

# Print the normalized version, protocol included, of every url listed in the
# amp urls data file
with open('./scripts/data/amp-urls.txt') as handle:
    for line in handle:
        # Once stripped, each line wraps the url in one leading and one
        # trailing character, both of which are dropped
        raw_url = line.strip()[1:-1]
        normalized = normalize_url(raw_url)
        print(ensure_protocol(normalized))