Example #1
0
    def normalize_with_domain(self, raw_url):
        """Normalize ``raw_url`` and fill in a missing scheme or host.

        The input is passed through ``web_domain_to_scheme_url`` and then
        ``norms`` before being parsed. Only the scheme, netloc and path of
        the parsed result are kept; params, query string and fragment are
        replaced with empty strings.

        * An empty scheme is replaced by ``HTTP``.
        * An empty netloc is replaced by ``self._netloc`` when the scheme
          was empty or is one of ``ACCEPTABLE_SCHEMES``.
        * If the path ends with ``PATH_SEPARATOR`` its last character is
          dropped.

        Returns ``None`` when ``raw_url`` is ``None``; otherwise the URL
        reassembled by ``urlunparse``.
        """
        if raw_url is None:
            return None

        parsed = urlparse(norms(web_domain_to_scheme_url(raw_url)))
        # Only the first three components survive; the rest are blanked out
        # when the URL is rebuilt below.
        scheme, netloc, path = parsed[0:3]

        scheme_missing = is_empty(parsed.scheme)
        if scheme_missing:
            scheme = HTTP

        netloc_missing = is_empty(parsed.netloc)
        if netloc_missing and (scheme_missing or scheme in ACCEPTABLE_SCHEMES):
            # Relative reference: anchor it to this instance's host.
            netloc = self._netloc

        if path.endswith(PATH_SEPARATOR):
            path = path[:-1]

        return urlunparse((scheme, netloc, path, '', '', ''))
Example #2
0
def extract_domain_port(reference_url):
    """Return the ``(domain, port)`` pair for ``reference_url``.

    The URL is stripped, lower-cased and passed through
    ``web_domain_to_scheme_url``. That form is validated first (scheme and
    hostname), then run through ``norms`` and re-split to read the final
    hostname and port.

    Returns:
        A ``(domain, port)`` tuple. A leading ``WWW_PREFIX`` is removed from
        the domain, and ``port`` is ``None`` when it equals
        ``HTTP_DEFAULT_PORT``.

    Raises:
        ValueError: if ``is_empty(reference_url)`` is true, if the scheme is
            not in ``ACCEPTABLE_SCHEMES``, if the parsed hostname is empty,
            or if ``_is_valid_domain`` rejects the resulting domain.
    """
    if is_empty(reference_url):
        raise ValueError("Input URL for domain extraction cannot be null")

    candidate_url = web_domain_to_scheme_url(reference_url.strip().lower())

    # Validate the pre-normalization form first.
    raw_parts = urlparse(candidate_url)
    scheme = raw_parts.scheme
    if scheme is not None and scheme.strip().lower() not in ACCEPTABLE_SCHEMES:
        raise ValueError("The URL scheme must be http or https")
    if is_empty(raw_parts.hostname):
        raise ValueError("Null or empty domain. Expected domain to be specified in the URL tuple %s "%str(raw_parts))

    # The values actually returned come from the normalized URL.
    normalized_parts = urlsplit(norms(candidate_url))
    port = normalized_parts.port
    if port == HTTP_DEFAULT_PORT:
        port = None

    host = normalized_parts.hostname
    if host.startswith(WWW_PREFIX):
        host = host[len(WWW_PREFIX):]
    if not _is_valid_domain(host):
        raise ValueError("Invalid domain provided in the URL")

    return host, port