Beispiel #1
0
def url_host_domain(url):
    """
    Return a (host, domain) tuple of lowercased values for a `url`.
    Return (None, None) when the parsed URL has no host or when anything
    fails during processing. Assumes that the URL has a scheme.

    The domain is taken from the `pld` attribute of the parsed URL.
    """
    try:
        parsed_url = urlpy.parse(url)
        hostname = parsed_url.host
        if hostname:
            # read the domain before lowercasing anything
            registered = parsed_url.pld
            return hostname.lower(), registered.lower()
    except Exception as e:
        # best effort: report the failure when tracing, then fall through
        if TRACE:
            logger_debug('url_host_domain: failed for:', url, 'with:', repr(e))
    # either there is no host or processing failed
    return None, None
Beispiel #2
0
def canonical_url(uri):
    """
    Return the canonical representation of a given URI as a string.
    Return None if the URI parses to a falsy value or if any processing step
    fails. This assumes the `uri` has a scheme.

    * When a default port corresponding for the scheme is explicitly declared
      (such as port 80 for http), the port will be removed from the output.
    * Fragments '#' are not removed.
    * Params and query string arguments are not reordered.
    """
    try:
        parsed = urlpy.parse(uri)
        if not parsed:
            return None
        if TRACE:
            logger_debug('canonical_url: parsed:', parsed)

        sanitized = parsed.sanitize()

        if TRACE:
            logger_debug('canonical_url: sanitized:', sanitized)

        punycoded = sanitized.punycode()

        if TRACE:
            logger_debug('canonical_url: punycoded:', punycoded)

        deport = punycoded.remove_default_port()

        if TRACE:
            logger_debug('canonical_url: deport:', deport)

        # Return the last stage of the pipeline rather than an intermediate
        # one, so that the punycode and default port steps are reflected in
        # the result whether these methods update the URL object in place or
        # return a new object.
        return str(deport)
    except Exception as e:
        if TRACE:
            logger_debug('canonical_url: failed for:', uri, 'with:', repr(e))
        # best effort: a URI that cannot be processed has no canonical form
        return None