Example #1
def load_suffix_list(cache_dir="./cache"):

    cached_psl = cache_single("public-suffix-list.txt", cache_dir=cache_dir)

    if os.path.exists(cached_psl):
        logging.debug("Using cached Public Suffix List...")
        with codecs.open(cached_psl, encoding='utf-8') as psl_file:
            # Read the cached list first; constructing PublicSuffixList directly
            # from the open handle would exhaust it and leave content empty.
            content = psl_file.readlines()
            suffixes = publicsuffix.PublicSuffixList(content)
    else:
        # File does not exist, download current list and cache it at given location.
        logging.debug("Downloading the Public Suffix List...")
        try:
            cache_file = publicsuffix.fetch()
        except URLError as err:
            logging.warning("Unable to download the Public Suffix List...")
            logging.debug("{}".format(err))
            return None, None

        content = cache_file.readlines()
        suffixes = publicsuffix.PublicSuffixList(content)

        # Cache for later.
        write(''.join(content), cached_psl)

    return suffixes, content
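A minimal sketch of how load_suffix_list above might be called. The cache_dir value and the lookup domain are illustrative only; the error handling follows the (None, None) return used when the download fails.

suffixes, content = load_suffix_list(cache_dir="./cache")
if suffixes is None:
    # Download failed and no cached copy was available.
    logging.error("Public Suffix List unavailable.")
else:
    # With a full list loaded, get_public_suffix returns the registered domain.
    print(suffixes.get_public_suffix("www.example.co.uk"))  # e.g. "example.co.uk"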
Example #2
    def __init__(self, queue_name: str, routing_keys: Sequence[str]):
        """Initialize the consumer, including the public suffix list."""
        super().__init__(queue_name, routing_keys)

        # download the public suffix list (would be good to add caching here)
        psl_file = publicsuffix.fetch()
        self.public_suffix_list = publicsuffix.PublicSuffixList(psl_file)
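The comment above suggests adding caching. A rough sketch of what that could look like, patterned on the cached loaders elsewhere on this page; the helper name, cache path, and 24-hour window are assumptions rather than part of the original consumer.

import os
import time

import publicsuffix

def _load_psl_with_cache(cache_path=".psl_cache.dat", max_age_seconds=24 * 3600):
    """Hypothetical helper: return a PublicSuffixList, refreshing the on-disk copy when stale."""
    stale = (not os.path.exists(cache_path)
             or time.time() - os.path.getmtime(cache_path) > max_age_seconds)
    if stale:
        # publicsuffix.fetch() returns a file-like object with the current list.
        with open(cache_path, "w", encoding="utf-8") as cache_file:
            cache_file.write(publicsuffix.fetch().read())
    with open(cache_path, encoding="utf-8") as cache_file:
        return publicsuffix.PublicSuffixList(cache_file)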
Example #3
    def test_empty_list(self):
        psl = publicsuffix.PublicSuffixList([])

        assert 'com' == psl.get_public_suffix('com')
        assert 'com' == psl.get_public_suffix('COM')
        assert 'com' == psl.get_public_suffix('.com')
        assert 'com' == psl.get_public_suffix('a.example.com')
Example #4
    def test_get_public_suffix_from_list_with_unicode(self):
        psl = publicsuffix.PublicSuffixList([u('\u0440\u0444')])

        assert u('\u0440\u0444') == psl.get_public_suffix(u('\u0440\u0444'))
        assert u('example.\u0440\u0444') == psl.get_public_suffix(u('example.\u0440\u0444'))
        assert u('example.\u0440\u0444') == psl.get_public_suffix(u('a.example.\u0440\u0444'))
        assert u('example.\u0440\u0444') == psl.get_public_suffix(u('a.a.example.\u0440\u0444'))
Example #5
    def __init__(self, consumer_group: str, source_streams: Sequence[str]):
        """Initialize the consumer, including the public suffix list."""
        super().__init__(consumer_group, source_streams)

        # download the public suffix list (would be good to add caching here)
        psl_file = publicsuffix.fetch()
        self.public_suffix_list = publicsuffix.PublicSuffixList(psl_file)
Example #6
def get_psl():
    """
    Gets the Public Suffix List - either new, or cached in the CWD for 24 hours

    Returns
    -------
    PublicSuffixList: An instance of PublicSuffixList loaded with a cached or updated list
    """

    def download_psl():
        fresh_psl = publicsuffix.fetch()
        with open(PublicSuffixListFilename, 'w', encoding='utf-8') as fresh_psl_file:
            fresh_psl_file.write(fresh_psl.read())

    # Download the psl if necessary
    if not PublicSuffixListReadOnly:
        if not path.exists(PublicSuffixListFilename):
            download_psl()
        else:
            psl_age = datetime.now() - datetime.fromtimestamp(stat(PublicSuffixListFilename).st_mtime)
            if psl_age > timedelta(hours=24):
                download_psl()

    with open(PublicSuffixListFilename, encoding='utf-8') as psl_file:
        psl = publicsuffix.PublicSuffixList(psl_file)

    return psl
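A short usage sketch for get_psl, assuming PublicSuffixListFilename and PublicSuffixListReadOnly are defined at module level as in the original project:

psl = get_psl()
print(psl.get_public_suffix("mail.example.ac.uk"))  # e.g. "example.ac.uk"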
Example #7
    def test_basic(self):
        psl = publicsuffix.PublicSuffixList(['com'])

        assert 'example.com' == psl.get_public_suffix('a.example.com')
        assert 'example.com' == psl.get_public_suffix('a.a.example.com')
        assert 'example.com' == psl.get_public_suffix('a.a.a.example.com')
        assert 'example.com' == psl.get_public_suffix('A.example.com')
        assert 'example.com' == psl.get_public_suffix('.a.a.example.com')
Example #8
def trunc_tracker(rawtracker):
    urlres = urlparse(rawtracker)
    urlinfo = (urlres.netloc.split(":"))
    if urlinfo[0] == '':
        return 'NONE'
    else:
        domainret = publicsuffix.PublicSuffixList().get_public_suffix(urlinfo[0])
        log.debug('trunc_tracker returning domain name {} from string {}'.format(domainret,rawtracker))
        return domainret
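Illustrative calls to trunc_tracker; the tracker URLs are made up for the example:

trunc_tracker("udp://tracker.example.org:6969/announce")  # "example.org" (port stripped before lookup)
trunc_tracker("magnet:?xt=urn:btih:abcdef")               # "NONE" (no netloc in the URL)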
Example #9
    def test_get_public_suffix_from_list_with_exception_rule(self):
        psl = publicsuffix.PublicSuffixList(['*.example.com', '!b.example.com'])

        assert 'a.example.com' == psl.get_public_suffix('a.example.com')
        assert 'a.a.example.com' == psl.get_public_suffix('a.a.example.com')
        assert 'a.a.example.com' == psl.get_public_suffix('a.a.a.example.com')
        assert 'a.a.example.com' == psl.get_public_suffix('a.a.a.a.example.com')

        assert 'b.example.com' == psl.get_public_suffix('b.example.com')
        assert 'b.example.com' == psl.get_public_suffix('b.b.example.com')
        assert 'b.example.com' == psl.get_public_suffix('b.b.b.example.com')
        assert 'b.example.com' == psl.get_public_suffix('b.b.b.b.example.com')
Example #10
def get_base_domain(domain):
    """
    Gets the base domain name for the given domain

    .. note::
        Results are based on a list of public domain suffixes at
        https://publicsuffix.org/list/public_suffix_list.dat.

        This file is saved to the current working directory,
        where it is used as a cache file for 24 hours.

    Args:
        domain (str): A domain or subdomain

    Returns:
        str: The base domain of the given domain

    """
    psl_path = ".public_suffix_list.dat"

    def download_psl():
        url = "https://publicsuffix.org/list/public_suffix_list.dat"
        # Use a browser-like user agent string to bypass some proxy blocks
        headers = {"User-Agent": USER_AGENT}
        fresh_psl = requests.get(url, headers=headers).text
        with open(psl_path, "w", encoding="utf-8") as fresh_psl_file:
            fresh_psl_file.write(fresh_psl)

    if not os.path.exists(psl_path):
        download_psl()
    else:
        psl_age = datetime.now() - datetime.fromtimestamp(
            os.stat(psl_path).st_mtime)
        if psl_age > timedelta(hours=24):
            try:
                download_psl()
            except Exception as error:
                logger.warning(
                    "Failed to download an updated PSL {0}".format(error))
    with open(psl_path, encoding="utf-8") as psl_file:
        psl = publicsuffix.PublicSuffixList(psl_file)

    return psl.get_public_suffix(domain)
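A quick illustration of get_base_domain; the hostnames are examples and the results assume the downloaded list contains the usual co.uk rules:

get_base_domain("www.example.com")    # "example.com"
get_base_domain("a.b.example.co.uk")  # "example.co.uk"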
Example #11
def domain_from_host(host):
    '''Return the domain part of a host.

    @type  host: string
    @param host: the host to extract the domain from
    @rtype:      string
    @return:     the extracted domain
    '''

    if publicsuffix:
        global publicsuffixlist
        if publicsuffixlist is None:
            publicsuffixlist = publicsuffix.PublicSuffixList()
        domain = publicsuffixlist.get_public_suffix(host)
    else:
        d = host.split('.')
        if len(d) > 1:
            domain = '%s.%s' % (d[-2], d[-1])
        else:
            domain = host
    return domain
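An illustrative call to domain_from_host. Note that the fallback path (used when the publicsuffix import is unavailable) keeps only the last two labels, which differs from the PSL answer for multi-label suffixes such as co.uk:

domain_from_host("www.example.co.uk")  # "example.co.uk" with publicsuffix, "co.uk" with the fallback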
Example #12
    def __init__(self):
        self.psl = publicsuffix.PublicSuffixList()
Example #13
    def test_fetch_amd_get_public_suffix(self):
        f = publicsuffix.fetch()
        psl = publicsuffix.PublicSuffixList(f)
        assert 'example.com' == psl.get_public_suffix('www.example.com')
        assert u('www.\u9999\u6e2f') == psl.get_public_suffix(u('www.\u9999\u6e2f'))
Example #14
    def test_fqdn(self):
        psl = publicsuffix.PublicSuffixList(['com'])

        assert 'example.com' == psl.get_public_suffix('example.com.')
Example #15
import sys
import os
import sqlite3
import re
import urlparse
import publicsuffix

if (len(sys.argv) != 2 or not os.path.isfile(sys.argv[1])):
    print "Usage: python make_pages_public_suffixes.py FOURTHPARTY_DB"
    sys.exit()

dbFileName = sys.argv[1]
dbConnection = sqlite3.connect(dbFileName)
dbConnection.row_factory = sqlite3.Row
dbCursor = dbConnection.cursor()

ipRegex = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
psl = publicsuffix.PublicSuffixList()

dbCursor.execute("ALTER TABLE pages ADD public_suffix TEXT")
dbCursor.execute("SELECT id, location FROM pages")
pagesRows = dbCursor.fetchall()
for pagesRow in pagesRows:
    pageID = pagesRow['id']
    pageURL = pagesRow['location']
    pageHostName = urlparse.urlparse(pageURL).hostname
    if pageHostName:
        if ipRegex.match(pageHostName):
            pagePublicSuffix = pageHostName
        else:
            pagePublicSuffix = psl.get_public_suffix(pageHostName)
        dbCursor.execute("UPDATE pages SET public_suffix=? WHERE id=?",
                         (pagePublicSuffix, pageID))
Example #16
    def url_public_suffix(self):
        global _psl
        if _psl is None:
            _psl = publicsuffix.PublicSuffixList()
        return _psl.get_public_suffix(self.url_domain)
Example #17
    def test_with_full_publicsuffix_org_list(self):
        psl = publicsuffix.PublicSuffixList()

        # Mixed case.
        assert 'com' == psl.get_public_suffix('COM')
        assert 'example.com' == psl.get_public_suffix('example.COM')
        assert 'example.com' == psl.get_public_suffix('WwW.example.COM')

        # Leading dot.
        assert 'com' == psl.get_public_suffix('.com')
        assert 'example' == psl.get_public_suffix('.example')
        assert 'example.com' == psl.get_public_suffix('.example.com')
        assert 'example' == psl.get_public_suffix('.example.example')

        # Unlisted TLD.
        assert 'example' == psl.get_public_suffix('example')
        assert 'example' == psl.get_public_suffix('example.example')
        assert 'example' == psl.get_public_suffix('b.example.example')
        assert 'example' == psl.get_public_suffix('a.b.example.example')

        # Listed, but non-Internet, TLD.
        assert 'local' == psl.get_public_suffix('local')
        assert 'local' == psl.get_public_suffix('example.local')
        assert 'local' == psl.get_public_suffix('b.example.local')
        assert 'local' == psl.get_public_suffix('a.b.example.local')

        # TLD with only one rule.
        assert 'biz' == psl.get_public_suffix('biz')
        assert 'domain.biz' == psl.get_public_suffix('domain.biz')
        assert 'domain.biz' == psl.get_public_suffix('b.domain.biz')
        assert 'domain.biz' == psl.get_public_suffix('a.b.domain.biz')

        # TLD with some two-level rules.
        assert 'com' == psl.get_public_suffix('com')
        assert 'example.com' == psl.get_public_suffix('example.com')
        assert 'example.com' == psl.get_public_suffix('b.example.com')
        assert 'example.com' == psl.get_public_suffix('a.b.example.com')
        assert 'uk.com' == psl.get_public_suffix('uk.com')
        assert 'example.uk.com' == psl.get_public_suffix('example.uk.com')
        assert 'example.uk.com' == psl.get_public_suffix('b.example.uk.com')
        assert 'example.uk.com' == psl.get_public_suffix('a.b.example.uk.com')
        assert 'test.ac' == psl.get_public_suffix('test.ac')

        # TLD with only one wildcard rule.
        assert 'cy' == psl.get_public_suffix('cy')
        assert 'c.cy' == psl.get_public_suffix('c.cy')
        assert 'b.c.cy' == psl.get_public_suffix('b.c.cy')
        assert 'b.c.cy' == psl.get_public_suffix('a.b.c.cy')

        # More complex TLD.
        assert 'jp' == psl.get_public_suffix('jp')
        assert 'test.jp' == psl.get_public_suffix('test.jp')
        assert 'test.jp' == psl.get_public_suffix('www.test.jp')
        assert 'ac.jp' == psl.get_public_suffix('ac.jp')
        assert 'test.ac.jp' == psl.get_public_suffix('test.ac.jp')
        assert 'test.ac.jp' == psl.get_public_suffix('www.test.ac.jp')
        assert 'kobe.jp' == psl.get_public_suffix('kobe.jp')
        assert 'c.kobe.jp' == psl.get_public_suffix('c.kobe.jp')
        assert 'b.c.kobe.jp' == psl.get_public_suffix('b.c.kobe.jp')
        assert 'b.c.kobe.jp' == psl.get_public_suffix('a.b.c.kobe.jp')

        # Exception rule.
        assert 'city.kobe.jp' == psl.get_public_suffix('city.kobe.jp')
        assert 'city.kobe.jp' == psl.get_public_suffix('www.city.kobe.jp')

        # US K12.
        assert 'us' == psl.get_public_suffix('us')
        assert 'test.us' == psl.get_public_suffix('test.us')
        assert 'test.us' == psl.get_public_suffix('www.test.us')
        assert 'ak.us' == psl.get_public_suffix('ak.us')
        assert 'test.ak.us' == psl.get_public_suffix('test.ak.us')
        assert 'test.ak.us' == psl.get_public_suffix('www.test.ak.us')
        assert 'k12.ak.us' == psl.get_public_suffix('k12.ak.us')
        assert 'test.k12.ak.us' == psl.get_public_suffix('test.k12.ak.us')
        assert 'test.k12.ak.us' == psl.get_public_suffix('www.test.k12.ak.us')