def load_suffix_list(cache_dir="./cache"):
    cached_psl = cache_single("public-suffix-list.txt", cache_dir=cache_dir)

    if os.path.exists(cached_psl):
        logging.debug("Using cached Public Suffix List...")
        with codecs.open(cached_psl, encoding='utf-8') as psl_file:
            # Read the content first; constructing PublicSuffixList from the
            # file object would consume it and leave readlines() empty.
            content = psl_file.readlines()
            suffixes = publicsuffix.PublicSuffixList(content)
    else:
        # File does not exist, download current list and cache it at given location.
        logging.debug("Downloading the Public Suffix List...")
        try:
            cache_file = publicsuffix.fetch()
        except URLError as err:
            logging.warning("Unable to download the Public Suffix List...")
            logging.debug("{}".format(err))
            return None, None
        content = cache_file.readlines()
        suffixes = publicsuffix.PublicSuffixList(content)

        # Cache for later.
        write(''.join(content), cached_psl)

    return suffixes, content
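# load_suffix_list() above depends on two helpers, cache_single() and write(),
# that are not shown in the snippet. A minimal sketch of what they might look
# like, assuming cache_single() maps a filename into the cache directory and
# write() persists text to a path (both signatures are assumptions, not the
# original project's code):
import os

def cache_single(filename, cache_dir="./cache"):
    # Build the full path of a single cached file inside cache_dir.
    return os.path.join(cache_dir, filename)

def write(content, destination):
    # Persist text content to the destination path, creating parent
    # directories as needed.
    os.makedirs(os.path.dirname(destination) or '.', exist_ok=True)
    with open(destination, 'w', encoding='utf-8') as f:
        f.write(content)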
def __init__(self, queue_name: str, routing_keys: Sequence[str]):
    """Initialize the consumer, including the public suffix list."""
    super().__init__(queue_name, routing_keys)
    # download the public suffix list (would be good to add caching here)
    psl_file = publicsuffix.fetch()
    self.public_suffix_list = publicsuffix.PublicSuffixList(psl_file)
def test_empty_list(self):
    psl = publicsuffix.PublicSuffixList([])
    assert 'com' == psl.get_public_suffix('com')
    assert 'com' == psl.get_public_suffix('COM')
    assert 'com' == psl.get_public_suffix('.com')
    assert 'com' == psl.get_public_suffix('a.example.com')
def test_get_public_suffix_from_list_with_unicode(self):
    psl = publicsuffix.PublicSuffixList([u('\u0440\u0444')])
    assert u('\u0440\u0444') == psl.get_public_suffix(u('\u0440\u0444'))
    assert u('example.\u0440\u0444') == psl.get_public_suffix(u('example.\u0440\u0444'))
    assert u('example.\u0440\u0444') == psl.get_public_suffix(u('a.example.\u0440\u0444'))
    assert u('example.\u0440\u0444') == psl.get_public_suffix(u('a.a.example.\u0440\u0444'))
def __init__(self, consumer_group: str, source_streams: Sequence[str]):
    """Initialize the consumer, including the public suffix list."""
    super().__init__(consumer_group, source_streams)
    # download the public suffix list (would be good to add caching here)
    psl_file = publicsuffix.fetch()
    self.public_suffix_list = publicsuffix.PublicSuffixList(psl_file)
def get_psl():
    """
    Gets the Public Suffix List - either new, or cached in the CWD for 24 hours

    Returns
    -------
    PublicSuffixList: An instance of PublicSuffixList loaded with a
    cached or updated list
    """
    def download_psl():
        fresh_psl = publicsuffix.fetch()
        with open(PublicSuffixListFilename, 'w', encoding='utf-8') as fresh_psl_file:
            fresh_psl_file.write(fresh_psl.read())

    # Download the PSL if necessary.
    if not PublicSuffixListReadOnly:
        if not path.exists(PublicSuffixListFilename):
            download_psl()
        else:
            psl_age = datetime.now() - datetime.fromtimestamp(
                stat(PublicSuffixListFilename).st_mtime)
            if psl_age > timedelta(hours=24):
                download_psl()

    with open(PublicSuffixListFilename, encoding='utf-8') as psl_file:
        psl = publicsuffix.PublicSuffixList(psl_file)
    return psl
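# get_psl() references two module-level names that the snippet does not
# define: PublicSuffixListFilename and PublicSuffixListReadOnly. A minimal
# sketch of plausible definitions (the specific values are assumptions):
from os import path, stat
from datetime import datetime, timedelta
import publicsuffix

# Cache the list in the current working directory...
PublicSuffixListFilename = "public_suffix_list.dat"
# ...and allow callers to disable re-downloading entirely.
PublicSuffixListReadOnly = False

# Typical usage: fetch (or reuse) the cached list, then split a domain.
# psl = get_psl()
# print(psl.get_public_suffix("www.example.co.uk"))  # example.co.uk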
def test_basic(self):
    psl = publicsuffix.PublicSuffixList(['com'])
    assert 'example.com' == psl.get_public_suffix('a.example.com')
    assert 'example.com' == psl.get_public_suffix('a.a.example.com')
    assert 'example.com' == psl.get_public_suffix('a.a.a.example.com')
    assert 'example.com' == psl.get_public_suffix('A.example.com')
    assert 'example.com' == psl.get_public_suffix('.a.a.example.com')
def trunc_tracker(rawtracker):
    urlres = urlparse(rawtracker)
    urlinfo = urlres.netloc.split(":")
    if urlinfo[0] == '':
        return 'NONE'
    else:
        domainret = publicsuffix.PublicSuffixList().get_public_suffix(urlinfo[0])
        log.debug('trunc_tracker returning domain name {} from string {}'.format(
            domainret, rawtracker))
        return domainret
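# A quick usage sketch for trunc_tracker(), assuming the imports its body
# needs (urlparse, publicsuffix, and a logger named log):
import logging
from urllib.parse import urlparse
import publicsuffix

log = logging.getLogger(__name__)

# Tracker URLs carry the host in the netloc, often with a port; the function
# strips the port and reduces the host to its registered domain.
# trunc_tracker("udp://tracker.example.com:6969/announce")  -> 'example.com'
# trunc_tracker("not a url")                                -> 'NONE'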
def test_get_public_suffix_from_list_with_exception_rule(self):
    psl = publicsuffix.PublicSuffixList(['*.example.com', '!b.example.com'])
    assert 'a.example.com' == psl.get_public_suffix('a.example.com')
    assert 'a.a.example.com' == psl.get_public_suffix('a.a.example.com')
    assert 'a.a.example.com' == psl.get_public_suffix('a.a.a.example.com')
    assert 'a.a.example.com' == psl.get_public_suffix('a.a.a.a.example.com')
    assert 'b.example.com' == psl.get_public_suffix('b.example.com')
    assert 'b.example.com' == psl.get_public_suffix('b.b.example.com')
    assert 'b.example.com' == psl.get_public_suffix('b.b.b.example.com')
    assert 'b.example.com' == psl.get_public_suffix('b.b.b.b.example.com')
def get_base_domain(domain):
    """
    Gets the base domain name for the given domain

    .. note::
        Results are based on a list of public domain suffixes at
        https://publicsuffix.org/list/public_suffix_list.dat.

        This file is saved to the current working directory,
        where it is used as a cache file for 24 hours.

    Args:
        domain (str): A domain or subdomain

    Returns:
        str: The base domain of the given domain
    """
    psl_path = ".public_suffix_list.dat"

    def download_psl():
        url = "https://publicsuffix.org/list/public_suffix_list.dat"
        # Use a browser-like user agent string to bypass some proxy blocks
        headers = {"User-Agent": USER_AGENT}
        fresh_psl = requests.get(url, headers=headers).text
        with open(psl_path, "w", encoding="utf-8") as fresh_psl_file:
            fresh_psl_file.write(fresh_psl)

    if not os.path.exists(psl_path):
        download_psl()
    else:
        psl_age = datetime.now() - datetime.fromtimestamp(
            os.stat(psl_path).st_mtime)
        if psl_age > timedelta(hours=24):
            try:
                download_psl()
            except Exception as error:
                logger.warning(
                    "Failed to download an updated PSL {0}".format(error))

    with open(psl_path, encoding="utf-8") as psl_file:
        psl = publicsuffix.PublicSuffixList(psl_file)
    return psl.get_public_suffix(domain)
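# get_base_domain() relies on a USER_AGENT constant and a module-level logger
# that the snippet does not show; a minimal sketch of those assumptions plus
# a usage example:
import logging
import os
from datetime import datetime, timedelta

import requests
import publicsuffix

logger = logging.getLogger(__name__)
USER_AGENT = "Mozilla/5.0 (compatible; example-script)"  # assumed value

# print(get_base_domain("mail.example.co.uk"))  # example.co.uk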
def domain_from_host(host):
    '''Return the domain part of a host.

    @type host: string
    @param host: the host to extract the domain from
    @rtype: string
    @return: the extracted domain
    '''
    if publicsuffix:
        global publicsuffixlist
        if publicsuffixlist is None:
            publicsuffixlist = publicsuffix.PublicSuffixList()
        domain = publicsuffixlist.get_public_suffix(host)
    else:
        d = host.split('.')
        if len(d) > 1:
            domain = '%s.%s' % (d[-2], d[-1])
        else:
            domain = host
    return domain
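# domain_from_host() treats publicsuffix as an optional dependency and lazily
# builds a single shared PublicSuffixList. The module-level setup it implies
# would look something like this (a sketch, not the original project's code):
try:
    import publicsuffix
except ImportError:
    # Fall back to the naive "last two labels" heuristic in the else branch,
    # which is wrong for multi-label suffixes such as co.uk.
    publicsuffix = None

publicsuffixlist = None  # built on first use by domain_from_host()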
def __init__(self):
    self.psl = publicsuffix.PublicSuffixList()
def test_fetch_and_get_public_suffix(self):
    f = publicsuffix.fetch()
    psl = publicsuffix.PublicSuffixList(f)
    assert 'example.com' == psl.get_public_suffix('www.example.com')
    assert u('www.\u9999\u6e2f') == psl.get_public_suffix(u('www.\u9999\u6e2f'))
def test_fqdn(self):
    psl = publicsuffix.PublicSuffixList(['com'])
    assert 'example.com' == psl.get_public_suffix('example.com.')
import sys
import os
import sqlite3
import re
import urlparse

import publicsuffix

if len(sys.argv) != 2 or not os.path.isfile(sys.argv[1]):
    print "Usage: python make_pages_public_suffixes.py FOURTHPARTY_DB"
    sys.exit()

dbFileName = sys.argv[1]
dbConnection = sqlite3.connect(dbFileName)
dbConnection.row_factory = sqlite3.Row
dbCursor = dbConnection.cursor()

ipRegex = re.compile(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}")
psl = publicsuffix.PublicSuffixList()

dbCursor.execute("ALTER TABLE pages ADD public_suffix TEXT")
dbCursor.execute("SELECT id, location FROM pages")
pagesRows = dbCursor.fetchall()
for pagesRow in pagesRows:
    pageID = pagesRow['id']
    pageURL = pagesRow['location']
    pageHostName = urlparse.urlparse(pageURL).hostname
    if pageHostName:
        # Keep bare IP addresses as-is; everything else is reduced to its
        # public suffix.
        if ipRegex.match(pageHostName):
            pagePublicSuffix = pageHostName
        else:
            pagePublicSuffix = psl.get_public_suffix(pageHostName)
        dbCursor.execute("UPDATE pages SET public_suffix=? WHERE id=?",
                         (pagePublicSuffix, pageID))
def url_public_suffix(self):
    global _psl
    if _psl is None:
        _psl = publicsuffix.PublicSuffixList()
    return _psl.get_public_suffix(self.url_domain)
def test_with_full_publicsuffix_org_list(self):
    psl = publicsuffix.PublicSuffixList()

    # Mixed case.
    assert 'com' == psl.get_public_suffix('COM')
    assert 'example.com' == psl.get_public_suffix('example.COM')
    assert 'example.com' == psl.get_public_suffix('WwW.example.COM')

    # Leading dot.
    assert 'com' == psl.get_public_suffix('.com')
    assert 'example' == psl.get_public_suffix('.example')
    assert 'example.com' == psl.get_public_suffix('.example.com')
    assert 'example' == psl.get_public_suffix('.example.example')

    # Unlisted TLD.
    assert 'example' == psl.get_public_suffix('example')
    assert 'example' == psl.get_public_suffix('example.example')
    assert 'example' == psl.get_public_suffix('b.example.example')
    assert 'example' == psl.get_public_suffix('a.b.example.example')

    # Listed, but non-Internet, TLD.
    assert 'local' == psl.get_public_suffix('local')
    assert 'local' == psl.get_public_suffix('example.local')
    assert 'local' == psl.get_public_suffix('b.example.local')
    assert 'local' == psl.get_public_suffix('a.b.example.local')

    # TLD with only one rule.
    assert 'biz' == psl.get_public_suffix('biz')
    assert 'domain.biz' == psl.get_public_suffix('domain.biz')
    assert 'domain.biz' == psl.get_public_suffix('b.domain.biz')
    assert 'domain.biz' == psl.get_public_suffix('a.b.domain.biz')

    # TLD with some two-level rules.
    assert 'com' == psl.get_public_suffix('com')
    assert 'example.com' == psl.get_public_suffix('example.com')
    assert 'example.com' == psl.get_public_suffix('b.example.com')
    assert 'example.com' == psl.get_public_suffix('a.b.example.com')
    assert 'uk.com' == psl.get_public_suffix('uk.com')
    assert 'example.uk.com' == psl.get_public_suffix('example.uk.com')
    assert 'example.uk.com' == psl.get_public_suffix('b.example.uk.com')
    assert 'example.uk.com' == psl.get_public_suffix('a.b.example.uk.com')
    assert 'test.ac' == psl.get_public_suffix('test.ac')

    # TLD with only one wildcard rule.
    assert 'cy' == psl.get_public_suffix('cy')
    assert 'c.cy' == psl.get_public_suffix('c.cy')
    assert 'b.c.cy' == psl.get_public_suffix('b.c.cy')
    assert 'b.c.cy' == psl.get_public_suffix('a.b.c.cy')

    # More complex TLD.
    assert 'jp' == psl.get_public_suffix('jp')
    assert 'test.jp' == psl.get_public_suffix('test.jp')
    assert 'test.jp' == psl.get_public_suffix('www.test.jp')
    assert 'ac.jp' == psl.get_public_suffix('ac.jp')
    assert 'test.ac.jp' == psl.get_public_suffix('test.ac.jp')
    assert 'test.ac.jp' == psl.get_public_suffix('www.test.ac.jp')
    assert 'kobe.jp' == psl.get_public_suffix('kobe.jp')
    assert 'c.kobe.jp' == psl.get_public_suffix('c.kobe.jp')
    assert 'b.c.kobe.jp' == psl.get_public_suffix('b.c.kobe.jp')
    assert 'b.c.kobe.jp' == psl.get_public_suffix('a.b.c.kobe.jp')

    # Exception rule.
    assert 'city.kobe.jp' == psl.get_public_suffix('city.kobe.jp')
    assert 'city.kobe.jp' == psl.get_public_suffix('www.city.kobe.jp')

    # US K12.
    assert 'us' == psl.get_public_suffix('us')
    assert 'test.us' == psl.get_public_suffix('test.us')
    assert 'test.us' == psl.get_public_suffix('www.test.us')
    assert 'ak.us' == psl.get_public_suffix('ak.us')
    assert 'test.ak.us' == psl.get_public_suffix('test.ak.us')
    assert 'test.ak.us' == psl.get_public_suffix('www.test.ak.us')
    assert 'k12.ak.us' == psl.get_public_suffix('k12.ak.us')
    assert 'test.k12.ak.us' == psl.get_public_suffix('test.k12.ak.us')
    assert 'test.k12.ak.us' == psl.get_public_suffix('www.test.k12.ak.us')