def normalize(uristr):
    """
    Translate the given URI into a normalized form.

    The input is encoded to UTF-8 before any processing and the value handed
    back is always built with ``text_type(..., 'utf-8')``.

    A leading ``VIA_PREFIX`` is removed when it is immediately followed by
    one of ``URL_SCHEMES`` and a colon. Strings whose scheme is not in
    ``URL_SCHEMES``, or which have no hostname, are returned without further
    normalization. Otherwise the scheme, netloc, path and query are each
    passed through their ``_normalize_*`` helper and the fragment is dropped.

    :param uristr: the URI to normalize
    :returns: the normalized URI, or the (prefix-stripped) input when no
        normalization applies
    """
    raw = uristr.encode('utf-8')

    # Proxied URLs look like VIA_PREFIX + "<scheme>:..."; keep only the
    # wrapped URL. The prefix is removed at most once.
    if any(raw.startswith(VIA_PREFIX + scheme + ':') for scheme in URL_SCHEMES):
        raw = raw[len(VIA_PREFIX):]

    parts = urlparse.urlsplit(raw)

    # Leave non-URLs, and URLs without a hostname, untouched. The hostname is
    # only inspected once the scheme has been accepted.
    if parts.scheme.lower() not in URL_SCHEMES or parts.hostname is None:
        return text_type(raw, 'utf-8')

    rebuilt = urlparse.SplitResult(
        _normalize_scheme(parts),
        _normalize_netloc(parts),
        _normalize_path(parts),
        _normalize_query(parts),
        None,  # the fragment is deliberately discarded
    )
    return text_type(rebuilt.geturl(), 'utf-8')
def origin(url):
    """
    Return a copy of ``url`` with the path, query string and fragment removed.

    Only the scheme and network location of ``url`` are kept.

    ``url`` is assumed to be an HTTP(S) URL.
    """
    # The first two fields of the split result are the scheme and the netloc.
    scheme, netloc = urlparse.urlsplit(url)[:2]
    stripped = urlparse.SplitResult(
        scheme=scheme, netloc=netloc, path="", query="", fragment=""
    )
    return stripped.geturl()
def normalize(uristr):
    """
    Translate the given URI into a normalized form.

    A leading ``VIA_PREFIX`` is removed when it is immediately followed by
    one of ``URL_SCHEMES`` and a colon. Strings whose scheme is not in
    ``URL_SCHEMES``, or which have no hostname, are returned without further
    normalization. Otherwise the scheme, netloc, path and query are each
    passed through their ``_normalize_*`` helper and the fragment is dropped.

    :type uristr: unicode
    :rtype: unicode
    """
    # urllib wants byte strings in Python 2, while in Python 3 some of its
    # functions (eg. `unquote`) insist on unicode. So the working value is a
    # byte string under Py 2 and unicode under Py 3; callers always get
    # unicode back.
    working = uristr.encode("utf-8") if PY2 else uristr

    def as_text(value):
        # Undo the Py 2 encoding step; a no-op on Py 3.
        return value.decode("utf-8") if PY2 else value

    # Proxied URLs look like VIA_PREFIX + "<scheme>:..."; keep only the
    # wrapped URL. The prefix is removed at most once.
    if any(working.startswith(VIA_PREFIX + scheme + ":") for scheme in URL_SCHEMES):
        working = working[len(VIA_PREFIX):]

    parts = urlparse.urlsplit(working)

    # Leave non-URLs, and URLs without a hostname, untouched. The hostname is
    # only inspected once the scheme has been accepted.
    if parts.scheme.lower() not in URL_SCHEMES or parts.hostname is None:
        return as_text(working)

    rebuilt = urlparse.SplitResult(
        _normalize_scheme(parts),
        _normalize_netloc(parts),
        _normalize_path(parts),
        _normalize_query(parts),
        None,  # the fragment is deliberately discarded
    )
    return as_text(rebuilt.geturl())
def _parse_origin(self, uri):
    """
    Return the origin of a URI or None if empty or invalid.

    Per https://tools.ietf.org/html/rfc6454#section-7 :
    Return ``<scheme> + '://' + <host> + <port>`` for a URI.

    Input that ``urlsplit`` rejects with ``ValueError`` (for example a host
    with an unbalanced ``[``) is reported as ``None`` instead of raising.

    :param uri: URI string
    :returns: the origin string, or None when ``uri`` is None, cannot be
        parsed, or yields an empty origin
    """
    if uri is None:
        return None

    try:
        parsed = urlparse.urlsplit(uri)
    except ValueError:
        # Malformed input counts as "invalid", which the contract above maps
        # to None.
        return None

    # Keep only scheme and netloc. netloc is copied verbatim, so it carries
    # the port and also any ``user:password@`` prefix present in the input.
    origin = urlparse.SplitResult(parsed.scheme, parsed.netloc, '', '', '')

    # An empty serialisation means there was nothing usable; report None.
    return origin.geturl() or None
def parse_origin(url):
    """
    Return the origin of a URL or None if empty or invalid.

    Per https://tools.ietf.org/html/rfc6454#section-7 :
    Return ``<scheme> + '://' + <host> + <port>`` for a URL.

    Input that ``urlsplit`` rejects with ``ValueError`` (for example a host
    with an unbalanced ``[``) is reported as ``None`` instead of raising.

    :param url: URL string
    :rtype: str or None
    """
    if url is None:
        return None

    try:
        parsed = urlparse.urlsplit(url)
    except ValueError:
        # Malformed input counts as "invalid", which the contract above maps
        # to None.
        return None

    # Keep only scheme and netloc. netloc is copied verbatim, so it carries
    # the port and also any ``user:password@`` prefix present in the input.
    origin = urlparse.SplitResult(parsed.scheme, parsed.netloc, "", "", "")

    # An empty serialisation means there was nothing usable; report None.
    return origin.geturl() or None
def normalize(uristr):
    """
    Translate the given URI into a normalized form.

    The input is encoded to UTF-8 first. Strings whose scheme is not in
    ``URL_SCHEMES``, or which have no hostname, are returned in that encoded
    form without further changes. Otherwise the netloc, path and query are
    each passed through their ``_normalize_*`` helper, the scheme is kept as
    parsed, and the fragment is dropped.

    :param uristr: the URI to normalize
    :returns: the rebuilt URL, or the UTF-8 encoded input when no
        normalization applies
    """
    encoded = uristr.encode('utf-8')
    parts = urlparse.urlsplit(encoded)

    # Leave non-URLs, and URLs without a hostname, untouched. The hostname is
    # only inspected once the scheme has been accepted.
    if parts.scheme.lower() not in URL_SCHEMES or parts.hostname is None:
        return encoded

    rebuilt = urlparse.SplitResult(
        parts.scheme,
        _normalize_netloc(parts),
        _normalize_path(parts),
        _normalize_query(parts),
        None,  # the fragment is deliberately discarded
    )
    return rebuilt.geturl()