Example #1
0
def test_tld_extract():
    """ Check the (subdomain, domain, suffix) triples returned by tld_extract """

    # Each host string is paired with the exact tuple tld_extract must return
    # for it; the cases cover a plain subdomain plus leading/trailing dots.
    expectations = (
        ("sub.test.com", ("sub", "test", "com")),
        (".test.com", ("", "test", "com")),
        (".test.com.", ("", "test", "com")),
        (".www.test.com.", ("www", "test", "com")),
        (u".www.test.com.", ("www", "test", "com")),
    )
    for hostname, triple in expectations:
        assert tld_extract(hostname) == triple

    # A u"" literal as input must still produce plain `str` components.
    component_types = [type(part) for part in tld_extract(u".www.test.com.")]
    assert component_types == [str, str, str]
Example #2
0
def _fast_make_domain_id(domain):
    """ Experimental fast version bypassing cosrlib.URL

    Returns the numeric ID of `domain`, memoized in URL_DOMAIN_IDS_CACHE under
    the exact string that was passed in.

    The ID is the mmh3 hash of "<domain>.<suffix>". When there is a subdomain
    other than "www", its hash (after stripping leading "www." labels) is
    shifted left by 32 bits and added on top.
    """

    # Look up with the caller's string. The pieces from tld_extract() are
    # unpacked into separate locals below so the cache is written with the
    # same key it is read with. Rebinding `domain` to the short registered
    # name would make the cache never hit, and a bare name could pick up the
    # ID of an unrelated host.
    try:
        return URL_DOMAIN_IDS_CACHE[domain]
    except KeyError:
        pass

    subdomain, registered, suffix = tld_extract(domain)

    if subdomain == "www" or not subdomain:
        domain_id = mmh3.hash("%s.%s" % (registered, suffix))
    else:
        # Drop any number of leading "www." labels before hashing.
        while subdomain.startswith("www."):
            subdomain = subdomain[4:]

        domain_id = ((mmh3.hash(subdomain) << 32) +
                     mmh3.hash("%s.%s" % (registered, suffix)))

    URL_DOMAIN_IDS_CACHE[domain] = domain_id
    return domain_id
Example #3
0
def _fast_make_domain_id(domain):
    """ Experimental fast version bypassing cosrlib.URL

    Computes (and memoizes in URL_DOMAIN_IDS_CACHE, keyed by the input string)
    the numeric ID for `domain`.

    With no subdomain, or a subdomain of exactly "www", the ID is
    mmh3.hash("<domain>.<suffix>"). Otherwise leading "www." labels are removed
    from the subdomain and its hash, shifted left by 32 bits, is added to that
    value.
    """

    cached = URL_DOMAIN_IDS_CACHE.get(domain)
    if cached is not None:
        return cached

    # Unpack into `registered`, not `domain`. Overwriting the parameter would
    # make the store below use a different key than the lookup above, so
    # repeated calls would never hit the cache and a bare name could read
    # another host's ID.
    subdomain, registered, suffix = tld_extract(domain)
    registered_hash = mmh3.hash("%s.%s" % (registered, suffix))

    if subdomain == "www" or not subdomain:
        domain_id = registered_hash
    else:
        while subdomain.startswith("www."):
            subdomain = subdomain[4:]

        domain_id = (mmh3.hash(subdomain) << 32) + registered_hash

    URL_DOMAIN_IDS_CACHE[domain] = domain_id
    return domain_id