Example #1
0
def normalize(uristr):
    """
    Translate the given URI into a normalized form.

    The input is UTF-8 encoded for processing and the result is converted
    back with ``text_type(..., 'utf-8')``. A leading ``VIA_PREFIX`` is
    removed when it is directly followed by one of ``URL_SCHEMES`` and a
    colon. Inputs whose scheme is not in ``URL_SCHEMES``, or which have no
    hostname, are returned without further normalization. Otherwise the
    scheme, netloc, path and query are each passed through their
    ``_normalize_*`` helper and the fragment is dropped.
    """
    raw = uristr.encode('utf-8')

    # Proxied URLs carry VIA_PREFIX in front of the real URL; drop it once.
    is_proxied = any(raw.startswith(VIA_PREFIX + scheme + ':')
                     for scheme in URL_SCHEMES)
    if is_proxied:
        raw = raw[len(VIA_PREFIX):]

    parts = urlparse.urlsplit(raw)

    # Leave non-URLs, and URLs without a hostname, as they are.
    if parts.scheme.lower() not in URL_SCHEMES or parts.hostname is None:
        return text_type(raw, 'utf-8')

    # The fragment is deliberately passed as None so it is left out of the
    # reassembled URL.
    normalized = urlparse.SplitResult(
        _normalize_scheme(parts),
        _normalize_netloc(parts),
        _normalize_path(parts),
        _normalize_query(parts),
        None,
    )
    return text_type(normalized.geturl(), 'utf-8')
Example #2
0
File: uri.py Project: JJediny/h
def normalize(uristr):
    """
    Translate the given URI into a normalized form.

    The URI is UTF-8 encoded first. A leading ``VIA_PREFIX`` is stripped
    when it is directly followed by one of ``URL_SCHEMES`` and a colon.
    If the scheme is not in ``URL_SCHEMES`` or there is no hostname, the
    encoded URI is returned as is. Otherwise the netloc, path and query are
    passed through their ``_normalize_*`` helper, the fragment is dropped
    and the reassembled URL is returned.
    """
    encoded = uristr.encode('utf-8')

    # Remove the proxy prefix from proxied URLs (at most once).
    proxied = any(encoded.startswith(VIA_PREFIX + scheme + ':')
                  for scheme in URL_SCHEMES)
    if proxied:
        encoded = encoded[len(VIA_PREFIX):]

    split = urlparse.urlsplit(encoded)

    # Anything that is not a known-scheme URL with a hostname is returned
    # without normalization.
    is_url = split.scheme.lower() in URL_SCHEMES
    if not is_url or split.hostname is None:
        return encoded

    # Passing None as the fragment leaves it out of the rebuilt URL.
    rebuilt = urlparse.SplitResult(
        split.scheme,
        _normalize_netloc(split),
        _normalize_path(split),
        _normalize_query(split),
        None,
    )
    return rebuilt.geturl()
Example #3
0
def origin(url):
    """
    Return ``url`` reduced to its scheme and network location.

    The path, query string and fragment are replaced with empty strings
    before the URL is reassembled.

    ``url`` is assumed to be an HTTP(S) URL.
    """
    # The first two fields of the split result are scheme and netloc.
    scheme, netloc = urlparse.urlsplit(url)[:2]
    stripped = urlparse.SplitResult(scheme, netloc, "", "", "")
    return stripped.geturl()
Example #4
0
def normalize(uristr):
    """
    Translate the given URI into a normalized form.

    A leading ``VIA_PREFIX`` is stripped when it is directly followed by
    one of ``URL_SCHEMES`` and a colon. Inputs whose scheme is not in
    ``URL_SCHEMES``, or which have no hostname, are returned without
    further normalization. Otherwise each of scheme, netloc, path and query
    is passed through its ``_normalize_*`` helper and the fragment is
    dropped.

    :type uristr: unicode
    :rtype: unicode
    """

    # urllib wants byte strings in Python 2, while in Python 3 some of its
    # functions (eg. `unquote`) insist on unicode. So the working value is a
    # byte string on Py 2 and unicode on Py 3; the return value is unicode
    # on both.
    working = uristr.encode("utf-8") if PY2 else uristr

    def as_unicode(value):
        # Undo the Py 2 encoding step; on Py 3 the value is already unicode.
        return value.decode("utf-8") if PY2 else value

    # Proxied URLs carry VIA_PREFIX in front of the real URL; drop it once.
    if any(working.startswith(VIA_PREFIX + scheme + ":") for scheme in URL_SCHEMES):
        working = working[len(VIA_PREFIX):]

    parts = urlparse.urlsplit(working)

    # Non-URLs, and URLs with no hostname, are not normalized.
    if parts.scheme.lower() not in URL_SCHEMES or parts.hostname is None:
        return as_unicode(working)

    # Passing None as the fragment leaves it out of the rebuilt URL.
    normalized = urlparse.SplitResult(
        _normalize_scheme(parts),
        _normalize_netloc(parts),
        _normalize_path(parts),
        _normalize_query(parts),
        None,
    )
    return as_unicode(normalized.geturl())
Example #5
0
    def _parse_origin(self, uri):
        """
        Return the origin of a URI or None if empty or invalid.

        Per https://tools.ietf.org/html/rfc6454#section-7 :
        Return ``<scheme> + '://' + <host> + <port>``
        for a URI.

        :param uri: URI string
        """

        if uri is None:
            return None
        parsed = urlparse.urlsplit(uri)
        # netloc contains both host and port
        origin = urlparse.SplitResult(parsed.scheme, parsed.netloc, '', '', '')
        return origin.geturl() or None
Example #6
0
def parse_origin(url):
    """
    Return the origin of a URL or None if empty or invalid.

    Per https://tools.ietf.org/html/rfc6454#section-7 :
    Return ``<scheme> + '://' + <host> + <port>``
    for a URL.

    A URL that ``urlsplit`` rejects with ``ValueError`` is treated as
    invalid and yields ``None`` rather than propagating the exception.

    :param url: URL string
    :rtype: str or None
    """

    if url is None:
        return None

    try:
        parsed = urlparse.urlsplit(url)
    except ValueError:
        # urlsplit raises for some malformed input (e.g. an unbalanced IPv6
        # bracket in the netloc); report that as "no origin".
        return None

    # netloc contains both host and port
    origin = urlparse.SplitResult(parsed.scheme, parsed.netloc, "", "", "")
    # An empty reassembled URL means there was nothing to build an origin
    # from, so map it to None.
    return origin.geturl() or None
Example #7
0
File: uri.py Project: openbizgit/h
def normalize(uristr):
    """
    Translate the given URI into a normalized form.

    The URI is UTF-8 encoded first. If its scheme is not in
    ``URL_SCHEMES`` or it has no hostname, the encoded URI is returned as
    is. Otherwise the netloc, path and query are passed through their
    ``_normalize_*`` helper, the fragment is dropped and the reassembled
    URL is returned.
    """
    encoded = uristr.encode('utf-8')
    parts = urlparse.urlsplit(encoded)

    # Only known-scheme URLs are normalized.
    if parts.scheme.lower() not in URL_SCHEMES:
        return encoded

    # URLs without a hostname are passed through too.
    if parts.hostname is None:
        return encoded

    # The trailing None is the fragment, which is left out of the result.
    components = (
        parts.scheme,
        _normalize_netloc(parts),
        _normalize_path(parts),
        _normalize_query(parts),
        None,
    )
    return urlparse.SplitResult(*components).geturl()
Example #8
0
def _parse_path(url):
    """Return the path component of a URL string"""
    if url is None:
        return None
    parsed = urlparse.urlsplit(url)
    return parsed.path