def url_host_domain(url):
    """
    Return a (host, domain) two-tuple computed from the `url` string, with
    both items lowercased. Return (None, None) when the URL has no host or
    when processing the URL fails for any reason.

    The `url` is expected to include a scheme.
    """
    nothing = None, None
    try:
        url_parts = urlpy.parse(url)
        hostname = url_parts.host
        if hostname:
            pld_value = url_parts.pld
            return hostname.lower(), pld_value.lower()
        return nothing
    except Exception as e:
        # Best effort: any failure is only traced and reported as "no result".
        if TRACE:
            logger_debug('url_host_domain: failed for:', url, 'with:', repr(e))
        return nothing
def canonical_url(uri):
    """
    Return the canonical representation of a given URI as a string, or None
    if the URI cannot be parsed or if any processing step fails.
    This assumes the `uri` has a scheme.

    * When a default port corresponding for the scheme is explicitly declared
      (such as port 80 for http), the port will be removed from the output.
    * Fragments '#' are not removed.
    * Params and query string arguments are not reordered.
    """
    try:
        parsed = urlpy.parse(uri)
        if not parsed:
            return None
        if TRACE:
            logger_debug('canonical_url: parsed:', parsed)

        # Each stage consumes the result of the previous one:
        # sanitize -> punycode -> remove default port.
        sanitized = parsed.sanitize()
        if TRACE:
            logger_debug('canonical_url: sanitized:', sanitized)

        punycoded = sanitized.punycode()
        if TRACE:
            logger_debug('canonical_url: punycoded:', punycoded)

        deport = punycoded.remove_default_port()
        if TRACE:
            logger_debug('canonical_url: deport:', deport)

        # Return the output of the LAST stage rather than the intermediate
        # `sanitized` value, so the result includes the punycode and default
        # port steps whether or not these methods modify their object in place.
        return str(deport)
    except Exception as e:
        # Best effort: a URI that cannot be canonicalized is traced and
        # reported as None instead of raising to the caller.
        if TRACE:
            logger_debug('canonical_url: failed for:', uri, 'with:', repr(e))
        return None