def load_suffix_list(cache_dir="./cache"):
    """Load the Public Suffix List, preferring a locally cached copy.

    Downloads the list with ``publicsuffix.fetch()`` on a cache miss and
    writes it to ``cache_dir`` for later runs.

    Returns:
        (suffixes, content): a ``publicsuffix.PublicSuffixList`` and the raw
        list of lines, or ``(None, None)`` if the download failed.
    """
    cached_psl = cache_single("public-suffix-list.txt", cache_dir=cache_dir)

    if os.path.exists(cached_psl):
        logging.debug("Using cached Public Suffix List...")
        # BUG FIX: the original passed the open handle to PublicSuffixList
        # and then called readlines() on the exhausted handle, so `content`
        # was always empty on the cached path. Read the lines once and build
        # the suffix list from them instead.
        with codecs.open(cached_psl, encoding='utf-8') as psl_file:
            content = psl_file.readlines()
        suffixes = publicsuffix.PublicSuffixList(content)
    else:
        # File does not exist, download current list and cache it at given location.
        logging.debug("Downloading the Public Suffix List...")
        try:
            cache_file = publicsuffix.fetch()
        except URLError as err:
            logging.warning("Unable to download the Public Suffix List...")
            logging.debug("{}".format(err))
            return None, None
        content = cache_file.readlines()
        suffixes = publicsuffix.PublicSuffixList(content)
        # Cache for later.
        write(''.join(content), cached_psl)

    return suffixes, content
def __init__(self, queue_name: str, routing_keys: Sequence[str]):
    """Initialize the consumer, including the public suffix list."""
    super().__init__(queue_name, routing_keys)
    # Fetch the public suffix list over the network at startup; a cached
    # copy would be a nice future improvement.
    fetched_psl = publicsuffix.fetch()
    self.public_suffix_list = publicsuffix.PublicSuffixList(fetched_psl)
def __init__(self, consumer_group: str, source_streams: Sequence[str]):
    """Initialize the consumer, including the public suffix list."""
    super().__init__(consumer_group, source_streams)
    # Fetch the public suffix list over the network at startup; a cached
    # copy would be a nice future improvement.
    fetched_psl = publicsuffix.fetch()
    self.public_suffix_list = publicsuffix.PublicSuffixList(fetched_psl)
def get_psl(location=PSL_CACHE_LOC):
    """
    Grabs an updated public suffix list.

    On first use the list is downloaded and cached at `location`;
    subsequent calls read the cached copy.

    Returns a PublicSuffixList built from the cached file.
    """
    if not os.path.isfile(location):
        psl_file = fetch()
        with codecs.open(location, 'w', encoding='utf8') as f:
            f.write(psl_file.read())
    # FIX: read the cache inside a context manager — the original left the
    # handle open for the life of the process. PublicSuffixList consumes the
    # file during construction, so closing afterwards is safe.
    with codecs.open(location, encoding='utf8') as psl_cache:
        return PublicSuffixList(psl_cache)
def get_psl():
    """
    Grabs an updated public suffix list.

    Downloads and caches the list at PSL_CACHE_LOC on first use; later
    calls read the cached copy. Returns a PublicSuffixList.
    """
    if not os.path.isfile(PSL_CACHE_LOC):
        # Single-argument call form prints identically under Python 2's
        # print statement and Python 3's print function.
        print("%s does not exist, downloading a copy." % PSL_CACHE_LOC)
        psl_file = fetch()
        with codecs.open(PSL_CACHE_LOC, 'w', encoding='utf8') as f:
            f.write(psl_file.read())
    # FIX: close the cache handle when done — the original leaked it.
    # PublicSuffixList consumes the file during construction.
    with codecs.open(PSL_CACHE_LOC, encoding='utf8') as psl_cache:
        return PublicSuffixList(psl_cache)
def load_suffix_list():
    """Download the current Public Suffix List.

    Returns:
        (suffixes, content): a PublicSuffixList and the raw list of lines,
        or (None, None) if the download failed.

    FIX: the original returned a bare `[]` on failure while returning a
    2-tuple on success, so `suffixes, content = load_suffix_list()` raised
    ValueError exactly when the download failed. The failure value now
    unpacks the same way as the success value.
    """
    logging.debug("Downloading the Public Suffix List...")
    try:
        cache_file = fetch()
    except URLError as err:
        # logging.warn is a deprecated alias of logging.warning.
        logging.warning("Unable to download the Public Suffix List...")
        logging.debug("{}".format(err))
        return None, None
    content = cache_file.readlines()
    suffixes = PublicSuffixList(content)
    return suffixes, content
def load_suffix_list():
    """Load the Public Suffix List, using SUFFIX_CACHE when available.

    On a cache miss the list is downloaded and (when SUFFIX_CACHE is set)
    written back to disk for later runs. Returns a PublicSuffixList.
    """
    if SUFFIX_CACHE and os.path.exists(SUFFIX_CACHE):
        logging.debug("Using cached suffix list.")
        # FIX: close the cache handle — the original leaked it.
        # PublicSuffixList consumes the file during construction.
        with codecs.open(SUFFIX_CACHE, encoding='utf-8') as cache_file:
            suffixes = PublicSuffixList(cache_file)
    else:
        # File does not exist, download current list and cache it at given location.
        logging.debug("Downloading the Public Suffix List...")
        cache_file = fetch()
        content = cache_file.readlines()
        suffixes = PublicSuffixList(content)

        if SUFFIX_CACHE:
            logging.debug("Caching suffix list at %s" % SUFFIX_CACHE)
            utils.write(''.join(content), SUFFIX_CACHE)

    return suffixes
def __init__(self, host, port=0, auth=None, use_ssl=False, starttls=False,
             prefix="noreply"):
    """Store SMTP connection settings and load the public suffix list.

    `auth`, when given, is a mapping with optional 'user' and 'password'
    keys; missing keys leave the corresponding attribute as None.
    """
    credentials = auth or {}
    self._host = host
    self._port = port
    self._auth_user = credentials.get('user')
    self._auth_password = credentials.get('password')
    self._use_ssl = use_ssl
    self._starttls = starttls
    # Download the suffix list once per instance.
    self.psl_file = publicsuffix.fetch()
    self.psl = PublicSuffixList(self.psl_file)
    self.prefix = prefix
def initPublicSuffixList(self):
    """Lazily initialize and return self.psl (a PublicSuffixList).

    Uses the on-disk cache at self.pslCachePath when it is newer than
    self.timeSpentMax days; otherwise downloads a fresh list, rewrites the
    cache, and builds the list from the downloaded data.

    Returns the PublicSuffixList, or None if anything raised (the error is
    logged via logException).
    """
    # Already initialized — reuse the cached instance.
    if self.psl is not None:
        return self.psl
    try:
        if fileExists(self.pslCachePath) and getLastModifiedTimeSpent(self.pslCachePath, TIMESPENT_UNIT.DAYS) < self.timeSpentMax:
            # Cache is fresh enough: build the list from the cached file.
            pslFile = codecs.open(self.pslCachePath, encoding='utf8')
            self.psl = PublicSuffixList(pslFile)
            pslFile.close()
            return self.psl
        else:
            # Cache missing or stale: ensure the cache directory exists,
            # then download a fresh copy.
            (dir, filename, ext, filenameExt) = decomposePath(self.pslCachePath)
            mkdirIfNotExists(dir)
            # fetch() returns a file-like object; list() materializes its lines.
            pslData = list(publicsuffix.fetch())
            removeIfExists(self.pslCachePath)
            # NOTE(review): pslData is a list of lines here — presumably
            # strToFile joins/writes them; confirm it accepts a list.
            strToFile(pslData, self.pslCachePath)
            self.psl = PublicSuffixList(pslData)
            return self.psl
    except Exception as e:
        # Best-effort: log and signal failure with None rather than raising.
        logException(e, self, location="initPublicSuffixList")
        return None
def run(self):
    """
    Update the vendored public suffix list to the latest list from
    publicsuffix.org saved side-by-side this Python script. Also create an
    ABOUT file with download info including the download UTC date/time as the
    version (see http://aboutcode.org)
    """
    from contextlib import closing
    from datetime import datetime
    from publicsuffix import fetch, PSL_URL, PSL_FILE, BASE_DIR

    # The ABOUT metadata file is written next to the vendored .dat file.
    ABOUT_PSL_FILE = join(BASE_DIR, 'public_suffix_list.ABOUT')
    ABOUT_TEMPLATE = '''
about_resource: public_suffix_list.dat
name: Public Suffix List
version: %(version)s
download_url: %(PSL_URL)s
home_url: https://publicsuffix.org/
owner: Mozilla
copyright: Copyright (c) Mozilla and others
license: mpl-2.0
license_text_file: mpl-2.0.LICENSE
'''
    # current date and time as an ISO time stamp string
    # (partition('.') drops the fractional seconds).
    version = datetime.isoformat(datetime.utcnow()).partition('.')[0]

    # Merge locals and globals so %-interpolation below can see both
    # `version`/`PSL_URL` (local) and any module-level names.
    glocals = dict(locals())
    glocals.update(globals())

    print('Fetching latest list from: %(PSL_URL)s on: %(version)s' % glocals)
    # closing() guarantees the fetched file-like object is closed.
    with closing(fetch()) as fetched:
        with codecs.open(PSL_FILE, 'wb', encoding='utf-8') as pslout:
            pslout.write(fetched.read())
    # NOTE(review): ABOUT file is opened in 'wb' but written a str — this
    # looks Python-2 era; under Python 3 it would need 'w' or bytes. Confirm
    # the supported interpreter before changing.
    with open(ABOUT_PSL_FILE, 'wb') as about:
        about.write(ABOUT_TEMPLATE % glocals)
    print('Saved updated %(PSL_FILE)s and %(ABOUT_PSL_FILE)s' % glocals)
def load_suffix_list():
    """Load the Public Suffix List, using SUFFIX_CACHE when available.

    On a cache miss the list is downloaded and (when SUFFIX_CACHE is set)
    written back to disk. Returns a PublicSuffixList, or [] if the
    download failed (kept for backward compatibility with callers).
    """
    if SUFFIX_CACHE and os.path.exists(SUFFIX_CACHE):
        utils.debug("Using cached suffix list.", divider=True)
        # FIX: close the cache handle — the original leaked it.
        # PublicSuffixList consumes the file during construction.
        with codecs.open(SUFFIX_CACHE, encoding='utf-8') as cache_file:
            suffixes = PublicSuffixList(cache_file)
    else:
        # File does not exist, download current list and cache it at given location.
        utils.debug("Downloading the Public Suffix List...", divider=True)
        try:
            cache_file = fetch()
        except URLError as err:
            # logging.warn is a deprecated alias of logging.warning.
            logging.warning("Unable to download the Public Suffix List...")
            utils.debug("{}".format(err))
            return []
        content = cache_file.readlines()
        suffixes = PublicSuffixList(content)

        if SUFFIX_CACHE:
            utils.debug("Caching suffix list at %s" % SUFFIX_CACHE, divider=True)
            utils.write(''.join(content), SUFFIX_CACHE)

    return suffixes
# Command-line driver: compare second-level domain labels between two files.
parser = argparse.ArgumentParser()
parser.add_argument('file', type=argparse.FileType('r'), nargs='+')
parser.add_argument('-v', '--verbose', action='store_const', const=logging.INFO,
                    dest='loglevel', help='increase output verbosity.')
parser.add_argument('-d', '--debug', action='store_const', const=logging.DEBUG,
                    dest='loglevel', default=logging.WARNING,
                    help='show debug output (even more than -v).')
args = parser.parse_args()
logging.basicConfig(level=args.loglevel)

# Separate the two input files (the script requires exactly two).
f1 = args.file[0]
f2 = args.file[1]

# Initialize dictionary and fetch the public suffix list.
psl = PublicSuffixList(fetch())
dictionary = {}

# Record the first label of each domain's public-suffix form from file 1.
for line in f1.readlines():
    slvl = psl.get_public_suffix(line.strip())
    parts = slvl.split('.')
    dictionary[parts[0]] = 1

# Print each line of file 2 that contains a recorded label.
# FIX: dict.has_key() is Python-2-only; `in` behaves identically on both.
# FIX: print with parentheses — identical output, valid on Python 2 and 3.
for lines in f2.readlines():
    segments = lines.split('.')
    for name in segments:
        if name in dictionary:
            print(lines)
            break
class UrlUtil:
    """Utility helpers for URL parsing: scheme, domain, path and suffix."""

    # Loads https://publicsuffix.org/list/public_suffix_list.dat at import time.
    psl_file = fetch()
    psl = PublicSuffixList(psl_file)

    @classmethod
    def get_protocol(cls, url):
        """Extract the URL's scheme (protocol)."""
        # strip() defensively, just in case.
        return parse.urlparse(url=url)[0].strip()

    @classmethod
    def get_domain(cls, url):
        """Extract the URL's network location (host, possibly with port)."""
        # Some links carry trailing whitespace after the host and Chrome
        # still resolves them correctly — strip defensively.
        return parse.urlparse(url=url)[1].strip()

    @classmethod
    def get_top_domain(cls, url):
        """Extract the registrable (first-level) domain of the URL."""
        host = UrlUtil.get_domain(url).split(':')[0]  # drop the port
        if re.match(r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", host):
            # Bare IPv4 address: no public suffix to compute.
            return host
        return cls.psl.get_public_suffix(host)

    @classmethod
    def get_path(cls, url):
        """Return the URL's directory part (the URL minus its final segment)."""
        pieces = url.split('/')
        if len(pieces) == 3:
            # e.g. "http://host" — no path at all.
            return url
        if len(pieces) == 4 and pieces[-1] == "":
            # e.g. "http://host/" — drop the trailing slash.
            return url[:-1]
        return "/".join(pieces[:-1])

    @classmethod
    def is_gov_or_edu(cls, url):
        """Whether the URL's host ends in .gov.cn or .edu.cn."""
        host = UrlUtil.get_domain(url)
        if len(host) > 7 and host[-7:] in (".gov.cn", ".edu.cn"):
            return True
        return False

    @classmethod
    def top_domain_is_gov_or_edu(cls, top_domain):
        """Whether a registrable domain is exactly gov.cn or edu.cn."""
        if top_domain in ("gov.cn", "edu.cn"):
            return True
        return False

    @classmethod
    def get_url_suffix(cls, url):
        """Return the page's file extension (e.g. html, js, css), or ''."""
        path = urllib.parse.urlsplit(url)[2]
        last_segment = path.split('/')[-1]
        if '.' not in last_segment:
            return ""
        return path.split('.')[-1]
def download_psl():
    """Fetch the current Public Suffix List and write it to the local file."""
    latest = publicsuffix.fetch()
    with open(PublicSuffixListFilename, 'w', encoding='utf-8') as out:
        out.write(latest.read())
def get_suffix(self):
    """Download the Public Suffix List and return it as a PublicSuffixList."""
    return PublicSuffixList(fetch())
class TestPublicSuffixLatest(TestPublicSuffixCurrent):
    """Test using the latest list"""

    # File-like object for the freshly downloaded list; presumably the
    # parent class builds its suffix list from this attribute — TODO confirm.
    psl = publicsuffix.fetch()
def test_fetch_amd_get_public_suffix(self):
    """Fetch the live list and spot-check ASCII and IDN suffix extraction."""
    suffix_list = publicsuffix.PublicSuffixList(publicsuffix.fetch())
    assert 'example.com' == suffix_list.get_public_suffix('www.example.com')
    assert u('www.\u9999\u6e2f') == suffix_list.get_public_suffix(u('www.\u9999\u6e2f'))