def normalize_with_domain(self, raw_url):
    """Normalize ``raw_url`` and fill in a missing scheme or network location.

    The URL is first passed through ``web_domain_to_scheme_url`` and ``norms``
    (helpers defined elsewhere) and then parsed. Only the scheme, network
    location and path survive; params, query string and fragment are blanked.

    * An empty scheme is replaced with ``HTTP``.
    * An empty network location is replaced with ``self._netloc``, provided
      the scheme was empty or the scheme is one of ``ACCEPTABLE_SCHEMES``.
    * A single trailing ``PATH_SEPARATOR`` is stripped from the path.

    :param raw_url: URL to normalize, or ``None``.
    :return: the rebuilt URL string, or ``None`` when ``raw_url`` is ``None``.
    """
    if raw_url is None:
        return None

    parsed = urlparse(norms(web_domain_to_scheme_url(raw_url)))
    scheme, netloc, path = parsed[0:3]

    scheme_missing = is_empty(parsed.scheme)
    if scheme_missing:
        scheme = HTTP

    netloc_missing = is_empty(parsed.netloc)
    if netloc_missing and (scheme_missing or scheme in ACCEPTABLE_SCHEMES):
        netloc = self._netloc

    if path.endswith(PATH_SEPARATOR):
        path = path[:-1]

    # Params, query string and fragment are intentionally dropped from the URL.
    return urlunparse((scheme, netloc, path, '', '', ''))
def extract_domain_port(reference_url):
    """Return the ``(domain, port)`` pair parsed out of ``reference_url``.

    The input is stripped and lower-cased, passed through
    ``web_domain_to_scheme_url`` (helper defined elsewhere) and validated.
    The domain and port are then read from the URL after it has additionally
    been run through ``norms``.

    * A port equal to ``HTTP_DEFAULT_PORT`` is reported as ``None``.
    * A leading ``WWW_PREFIX`` is removed from the domain.

    :param reference_url: URL to extract the domain and port from.
    :return: tuple ``(domain, port)``.
    :raises ValueError: if the input is empty, the scheme is not in
        ``ACCEPTABLE_SCHEMES``, the URL has no hostname, or the domain is
        rejected by ``_is_valid_domain``.
    """
    if is_empty(reference_url):
        raise ValueError("Input URL for domain extraction cannot be null")

    candidate_url = web_domain_to_scheme_url(reference_url.strip().lower())
    parsed = urlparse(candidate_url)

    scheme = parsed.scheme
    if scheme is not None and scheme.strip().lower() not in ACCEPTABLE_SCHEMES:
        raise ValueError("The URL scheme must be http or https")

    if is_empty(parsed.hostname):
        raise ValueError(
            "Null or empty domain. Expected domain to be specified in the URL tuple %s " % str(parsed)
        )

    # Domain and port are taken from the normalized form of the URL.
    normalized = urlsplit(norms(candidate_url))

    port = normalized.port
    port = None if port == HTTP_DEFAULT_PORT else port

    domain = normalized.hostname
    if domain.startswith(WWW_PREFIX):
        domain = domain[len(WWW_PREFIX):]

    if not _is_valid_domain(domain):
        raise ValueError("Invalid domain provided in the URL")

    return domain, port