Example #1
0
def load_suffix_list(cache_dir="./cache"):
    """Load the Public Suffix List, using an on-disk cached copy when present.

    The cache location is obtained from ``cache_single`` and is used as a
    file path. When that file exists it is read; otherwise the list is
    downloaded with ``publicsuffix.fetch()`` and handed to ``write`` so it
    can be reused later.

    Args:
        cache_dir: forwarded to ``cache_single`` as ``cache_dir``.

    Returns:
        A ``(suffixes, content)`` tuple where ``suffixes`` is a
        ``publicsuffix.PublicSuffixList`` and ``content`` is the list of
        lines it was built from, or ``(None, None)`` when the download
        raises ``URLError``.
    """
    from contextlib import closing

    cached_psl = cache_single("public-suffix-list.txt", cache_dir=cache_dir)

    if os.path.exists(cached_psl):
        logging.debug("Using cached Public Suffix List...")
        with codecs.open(cached_psl, encoding='utf-8') as psl_file:
            # Read the lines first and build the list from them. Passing
            # the open file to PublicSuffixList and calling readlines()
            # afterwards reads from an already exhausted file, which left
            # `content` empty on every cache hit.
            content = psl_file.readlines()
        suffixes = publicsuffix.PublicSuffixList(content)
    else:
        # File does not exist, download current list and cache it at given location.
        logging.debug("Downloading the Public Suffix List...")
        try:
            cache_file = publicsuffix.fetch()
        except URLError as err:
            logging.warning("Unable to download the Public Suffix List...")
            logging.debug("{}".format(err))
            return None, None

        # Close the downloaded stream as soon as its lines are in memory.
        with closing(cache_file):
            content = cache_file.readlines()
        suffixes = publicsuffix.PublicSuffixList(content)

        # Cache for later.
        write(''.join(content), cached_psl)

    return suffixes, content
    def __init__(self, queue_name: str, routing_keys: Sequence[str]):
        """Initialize the consumer, including the public suffix list.

        Args:
            queue_name: forwarded unchanged to the parent initializer.
            routing_keys: forwarded unchanged to the parent initializer.
        """
        from contextlib import closing

        super().__init__(queue_name, routing_keys)

        # download the public suffix list (would be good to add caching here)
        # The downloaded stream is only needed while the list object is
        # being built, so close it afterwards instead of leaving it open.
        with closing(publicsuffix.fetch()) as psl_file:
            self.public_suffix_list = publicsuffix.PublicSuffixList(psl_file)
Example #3
0
    def __init__(self, consumer_group: str, source_streams: Sequence[str]):
        """Initialize the consumer, including the public suffix list.

        Args:
            consumer_group: forwarded unchanged to the parent initializer.
            source_streams: forwarded unchanged to the parent initializer.
        """
        from contextlib import closing

        super().__init__(consumer_group, source_streams)

        # download the public suffix list (would be good to add caching here)
        # The downloaded stream is only needed while the list object is
        # being built, so close it afterwards instead of leaving it open.
        with closing(publicsuffix.fetch()) as psl_file:
            self.public_suffix_list = publicsuffix.PublicSuffixList(psl_file)
Example #4
0
def load_suffix_list(cache_dir="./cache"):
    """Load the Public Suffix List, using an on-disk cached copy when present.

    The cache location is obtained from ``cache_single`` and is used as a
    file path. When that file exists it is read; otherwise the list is
    downloaded with ``publicsuffix.fetch()`` and handed to ``write`` so it
    can be reused later.

    Args:
        cache_dir: forwarded to ``cache_single`` as ``cache_dir``.

    Returns:
        A ``(suffixes, content)`` tuple where ``suffixes`` is a
        ``publicsuffix.PublicSuffixList`` and ``content`` is the list of
        lines it was built from, or ``(None, None)`` when the download
        raises ``URLError``.
    """
    from contextlib import closing

    cached_psl = cache_single("public-suffix-list.txt", cache_dir=cache_dir)

    if os.path.exists(cached_psl):
        logging.debug("Using cached Public Suffix List...")
        with codecs.open(cached_psl, encoding='utf-8') as psl_file:
            # Read the lines first and build the list from them. Passing
            # the open file to PublicSuffixList and calling readlines()
            # afterwards reads from an already exhausted file, which left
            # `content` empty on every cache hit.
            content = psl_file.readlines()
        suffixes = publicsuffix.PublicSuffixList(content)
    else:
        # File does not exist, download current list and cache it at given location.
        logging.debug("Downloading the Public Suffix List...")
        try:
            cache_file = publicsuffix.fetch()
        except URLError as err:
            logging.warning("Unable to download the Public Suffix List...")
            logging.debug("{}".format(err))
            return None, None

        # Close the downloaded stream as soon as its lines are in memory.
        with closing(cache_file):
            content = cache_file.readlines()
        suffixes = publicsuffix.PublicSuffixList(content)

        # Cache for later.
        write(''.join(content), cached_psl)

    return suffixes, content
Example #5
0
def get_psl(location=PSL_CACHE_LOC):
    """
    Return a PublicSuffixList built from the list stored at ``location``.

    When no file exists at ``location`` the list is first obtained with
    ``fetch()`` and written there as UTF-8; an existing file is used as-is
    and is not refreshed.
    """
    from contextlib import closing

    if not os.path.isfile(location):
        # Read the whole download before creating the cache file, so a
        # failed read does not leave an empty or partial cache behind.
        with closing(fetch()) as psl_file:
            psl_text = psl_file.read()
        with codecs.open(location, 'w', encoding='utf8') as f:
            f.write(psl_text)
    # Close the cache file once the list object has been built from it.
    with codecs.open(location, encoding='utf8') as psl_cache:
        return PublicSuffixList(psl_cache)
Example #6
0
def get_psl():
    """
    Return a PublicSuffixList built from the list stored at PSL_CACHE_LOC.

    When no file exists at PSL_CACHE_LOC the list is first obtained with
    ``fetch()`` and written there as UTF-8; an existing file is used as-is
    and is not refreshed.
    """
    from contextlib import closing

    if not os.path.isfile(PSL_CACHE_LOC):
        # Single-argument call form: prints the same text whether `print`
        # is the Python 2 statement or the Python 3 function.
        print("%s does not exist, downloading a copy." % PSL_CACHE_LOC)
        # Read the whole download before creating the cache file, so a
        # failed read does not leave an empty or partial cache behind.
        with closing(fetch()) as psl_file:
            psl_text = psl_file.read()
        with codecs.open(PSL_CACHE_LOC, 'w', encoding='utf8') as f:
            f.write(psl_text)
    # Close the cache file once the list object has been built from it.
    with codecs.open(PSL_CACHE_LOC, encoding='utf8') as psl_cache:
        return PublicSuffixList(psl_cache)
Example #7
0
def get_psl():
    """
    Return a PublicSuffixList built from the list stored at PSL_CACHE_LOC.

    When no file exists at PSL_CACHE_LOC the list is first obtained with
    ``fetch()`` and written there as UTF-8; an existing file is used as-is
    and is not refreshed.
    """
    from contextlib import closing

    if not os.path.isfile(PSL_CACHE_LOC):
        # Single-argument call form: prints the same text whether `print`
        # is the Python 2 statement or the Python 3 function.
        print("%s does not exist, downloading a copy." % PSL_CACHE_LOC)
        # Read the whole download before creating the cache file, so a
        # failed read does not leave an empty or partial cache behind.
        with closing(fetch()) as psl_file:
            psl_text = psl_file.read()
        with codecs.open(PSL_CACHE_LOC, 'w', encoding='utf8') as f:
            f.write(psl_text)
    # Close the cache file once the list object has been built from it.
    with codecs.open(PSL_CACHE_LOC, encoding='utf8') as psl_cache:
        return PublicSuffixList(psl_cache)
Example #8
0
def load_suffix_list():
    """Download the Public Suffix List and parse it.

    Returns:
        A ``(suffixes, content)`` tuple on success, where ``content`` is the
        list of downloaded lines and ``suffixes`` is the ``PublicSuffixList``
        built from them. When the download raises ``URLError`` an empty
        list is returned instead.
    """
    from contextlib import closing

    # No cache is consulted here: the list is always downloaded.
    utils.debug("Downloading the Public Suffix List...", divider=True)
    try:
        cache_file = fetch()
    except URLError as err:
        # logging.warn is a deprecated alias; logging.warning is the
        # supported spelling and emits the same record.
        logging.warning("Unable to download the Public Suffix List...")
        utils.debug("{}".format(err))
        # NOTE(review): the success path returns a 2-tuple, so a caller that
        # unpacks the result will fail on this empty list. Kept as-is to
        # avoid changing the existing contract - confirm against callers.
        return []
    # Close the downloaded stream as soon as its lines are in memory.
    with closing(cache_file):
        content = cache_file.readlines()
    suffixes = PublicSuffixList(content)
    return suffixes, content
Example #9
0
def load_suffix_list():
    """Download the Public Suffix List and parse it.

    Returns:
        A ``(suffixes, content)`` tuple on success, where ``content`` is the
        list of downloaded lines and ``suffixes`` is the ``PublicSuffixList``
        built from them. When the download raises ``URLError`` an empty
        list is returned instead.
    """
    from contextlib import closing

    # No cache is consulted here: the list is always downloaded.
    utils.debug("Downloading the Public Suffix List...", divider=True)
    try:
        cache_file = fetch()
    except URLError as err:
        # logging.warn is a deprecated alias; logging.warning is the
        # supported spelling and emits the same record.
        logging.warning("Unable to download the Public Suffix List...")
        utils.debug("{}".format(err))
        # NOTE(review): the success path returns a 2-tuple, so a caller that
        # unpacks the result will fail on this empty list. Kept as-is to
        # avoid changing the existing contract - confirm against callers.
        return []
    # Close the downloaded stream as soon as its lines are in memory.
    with closing(cache_file):
        content = cache_file.readlines()
    suffixes = PublicSuffixList(content)
    return suffixes, content
Example #10
0
def load_suffix_list():
    """Return a PublicSuffixList, using the file at SUFFIX_CACHE when present.

    When SUFFIX_CACHE is set and that path exists, the list is parsed from
    it. Otherwise the list is downloaded with ``fetch()`` and, if
    SUFFIX_CACHE is set, the downloaded text is passed to ``utils.write``
    for that path.

    Returns:
        The ``PublicSuffixList`` instance.
    """
    from contextlib import closing

    if SUFFIX_CACHE and os.path.exists(SUFFIX_CACHE):
        logging.debug("Using cached suffix list.")
        # Close the cache file once the list object has been built from it.
        with codecs.open(SUFFIX_CACHE, encoding='utf-8') as cache_file:
            suffixes = PublicSuffixList(cache_file)
    else:
        # File does not exist, download current list and cache it at given location.
        logging.debug("Downloading the Public Suffix List...")
        # Close the downloaded stream as soon as its lines are in memory.
        with closing(fetch()) as cache_file:
            content = cache_file.readlines()
        suffixes = PublicSuffixList(content)

        if SUFFIX_CACHE:
            logging.debug("Caching suffix list at %s", SUFFIX_CACHE)
            utils.write(''.join(content), SUFFIX_CACHE)

    return suffixes
Example #11
0
 def __init__(self, host, port=0, auth=None, use_ssl=False,
              starttls=False, prefix="noreply"):
     """Store the connection settings and load the public suffix list.

     ``auth`` is an optional mapping; its ``'user'`` and ``'password'``
     entries (``None`` when missing) become the stored credentials. The
     object returned by ``publicsuffix.fetch()`` is kept on the instance
     as ``psl_file`` and the list built from it as ``psl``.
     """
     # Connection endpoint.
     self._host, self._port = host, port

     # A missing or empty `auth` behaves like an empty mapping.
     credentials = auth if auth else {}
     self._auth_user = credentials.get('user')
     self._auth_password = credentials.get('password')

     # Transport security flags.
     self._use_ssl, self._starttls = use_ssl, starttls

     # Public suffix data: the fetched object is retained on the instance.
     self.psl_file = publicsuffix.fetch()
     self.psl = PublicSuffixList(self.psl_file)

     self.prefix = prefix
Example #12
0
 def initPublicSuffixList(self):
     """Return the PublicSuffixList for this object, building it on first use.

     An already built ``self.psl`` is returned directly. Otherwise the list
     is parsed from ``self.pslCachePath`` when that file exists and the
     value from ``getLastModifiedTimeSpent`` is below ``self.timeSpentMax``
     (presumably the cache age in days - confirm against that helper). In
     every other case the list is fetched, written to the cache path and
     parsed from the fetched lines.

     Returns:
         The ``PublicSuffixList``, or ``None`` when any step raises; the
         exception is passed to ``logException`` instead of propagating.
     """
     from contextlib import closing

     if self.psl is not None:
         return self.psl
     try:
         if fileExists(self.pslCachePath) and getLastModifiedTimeSpent(self.pslCachePath, TIMESPENT_UNIT.DAYS) < self.timeSpentMax:
             # `with` closes the cache file even when parsing raises.
             with codecs.open(self.pslCachePath, encoding='utf8') as pslFile:
                 self.psl = PublicSuffixList(pslFile)
             return self.psl
         else:
             # Only the directory part is needed; `dir` is avoided as a name
             # because it shadows the builtin.
             (dirPath, _, _, _) = decomposePath(self.pslCachePath)
             mkdirIfNotExists(dirPath)
             # Materialise the download, then close the stream.
             with closing(publicsuffix.fetch()) as fetched:
                 pslData = list(fetched)
             removeIfExists(self.pslCachePath)
             strToFile(pslData, self.pslCachePath)
             self.psl = PublicSuffixList(pslData)
             return self.psl
     except Exception as e:
         # Deliberate best effort: log and report failure through None.
         logException(e, self, location="initPublicSuffixList")
         return None
Example #13
0
    def run(self):
        """
        Update the vendored public suffix list to the latest list from
        publicsuffix.org saved side-by-side this Python script. 
    
        Also create an ABOUT file with download info including the download UTC
        date/time as the version (see http://aboutcode.org)
        """

        from contextlib import closing
        from datetime import datetime
        from publicsuffix import fetch, PSL_URL, PSL_FILE, BASE_DIR

        ABOUT_PSL_FILE = join(BASE_DIR, 'public_suffix_list.ABOUT')

        ABOUT_TEMPLATE = '''
about_resource: public_suffix_list.dat
name: Public Suffix List
version: %(version)s
download_url: %(PSL_URL)s
home_url: https://publicsuffix.org/

owner: Mozilla
copyright: Copyright (c) Mozilla and others
license: mpl-2.0
license_text_file: mpl-2.0.LICENSE
'''

        # current date and time as an ISO time stamp string
        version = datetime.isoformat(datetime.utcnow()).partition('.')[0]
        glocals = dict(locals())
        glocals.update(globals())
        print('Fetching latest list from: %(PSL_URL)s on: %(version)s' %
              glocals)
        with closing(fetch()) as fetched:
            with codecs.open(PSL_FILE, 'wb', encoding='utf-8') as pslout:
                pslout.write(fetched.read())
        with open(ABOUT_PSL_FILE, 'wb') as about:
            about.write(ABOUT_TEMPLATE % glocals)
        print('Saved updated %(PSL_FILE)s and %(ABOUT_PSL_FILE)s' % glocals)
Example #14
0
    def run(self):
        """
        Update the vendored public suffix list to the latest list from
        publicsuffix.org saved side-by-side this Python script. 
    
        Also create an ABOUT file with download info including the download UTC
        date/time as the version (see http://aboutcode.org)
        """

        from contextlib import closing
        from datetime import datetime
        from publicsuffix import fetch, PSL_URL, PSL_FILE, BASE_DIR

        ABOUT_PSL_FILE = join(BASE_DIR, 'public_suffix_list.ABOUT')

        ABOUT_TEMPLATE = '''
about_resource: public_suffix_list.dat
name: Public Suffix List
version: %(version)s
download_url: %(PSL_URL)s
home_url: https://publicsuffix.org/

owner: Mozilla
copyright: Copyright (c) Mozilla and others
license: mpl-2.0
license_text_file: mpl-2.0.LICENSE
'''

        # current date and time as an ISO time stamp string
        version = datetime.isoformat(datetime.utcnow()).partition('.')[0]
        glocals = dict(locals())
        glocals.update(globals())
        print('Fetching latest list from: %(PSL_URL)s on: %(version)s' % glocals)
        with closing(fetch()) as fetched:
            with codecs.open(PSL_FILE, 'wb', encoding='utf-8') as pslout:
                pslout.write(fetched.read())
        with open(ABOUT_PSL_FILE, 'wb') as about:
            about.write(ABOUT_TEMPLATE % glocals)
        print('Saved updated %(PSL_FILE)s and %(ABOUT_PSL_FILE)s' % glocals)
Example #15
0
def load_suffix_list():
    """Return a PublicSuffixList, using the file at SUFFIX_CACHE when present.

    When SUFFIX_CACHE is set and that path exists, the list is parsed from
    it. Otherwise the list is downloaded with ``fetch()`` and, if
    SUFFIX_CACHE is set, the downloaded text is passed to ``utils.write``
    for that path.

    Returns:
        The ``PublicSuffixList`` instance, or an empty list when the
        download raises ``URLError``.
    """
    from contextlib import closing

    if SUFFIX_CACHE and os.path.exists(SUFFIX_CACHE):
        utils.debug("Using cached suffix list.", divider=True)
        # Close the cache file once the list object has been built from it.
        with codecs.open(SUFFIX_CACHE, encoding='utf-8') as cache_file:
            suffixes = PublicSuffixList(cache_file)
    else:
        # File does not exist, download current list and cache it at given location.
        utils.debug("Downloading the Public Suffix List...", divider=True)
        try:
            cache_file = fetch()
        except URLError as err:
            # logging.warn is a deprecated alias; logging.warning is the
            # supported spelling and emits the same record.
            logging.warning("Unable to download the Public Suffix List...")
            utils.debug("{}".format(err))
            return []
        # Close the downloaded stream as soon as its lines are in memory.
        with closing(cache_file):
            content = cache_file.readlines()
        suffixes = PublicSuffixList(content)

        if SUFFIX_CACHE:
            utils.debug("Caching suffix list at %s" % SUFFIX_CACHE,
                        divider=True)
            utils.write(''.join(content), SUFFIX_CACHE)

    return suffixes
    # Command line: two input files plus optional verbosity flags.
    parser = argparse.ArgumentParser()
    parser.add_argument('file', type=argparse.FileType('r'), nargs='+')
    parser.add_argument('-v', '--verbose', action='store_const', const=logging.INFO, dest='loglevel',
                        help='increase output verbosity.')
    parser.add_argument('-d', '--debug', action='store_const', const=logging.DEBUG, dest='loglevel',
                        default=logging.WARNING, help='show debug output (even more than -v).')

    args = parser.parse_args()

    logging.basicConfig(level=args.loglevel)
    # nargs='+' accepts a single file, but two are read below; report that
    # as a usage error instead of failing with an IndexError.
    if len(args.file) < 2:
        parser.error('two input files are required')
    # separate the two input files
    f1 = args.file[0]
    f2 = args.file[1]

    # initialize dictionary and fetch public suffix list
    psl = PublicSuffixList(fetch())
    dictionary = {}

    # For every line of the first file, record the first dot-separated label
    # of the value returned by psl.get_public_suffix().
    for line in f1.readlines():
        slvl = psl.get_public_suffix(line.strip())
        parts = slvl.split('.')
        dictionary[parts[0]] = 1

    # Print each line of the second file that has at least one dot-separated
    # segment recorded above (printed once per line).
    for lines in f2.readlines():
        segments = lines.split('.')
        for name in segments:
            # `in` replaces dict.has_key(), which no longer exists on Python 3.
            if name in dictionary:
                # Single-argument call form prints the same text under the
                # Python 2 print statement and the Python 3 function.
                print(lines)
                break
Example #17
0
class UrlUtil:
    """Collection of helper operations on URLs."""
    # Loads https://publicsuffix.org/list/public_suffix_list.dat
    psl_file = fetch()
    # Alternative: read a local copy instead, e.g.
    # codecs.open('./public_suffix_list.dat', encoding='utf8')
    psl = PublicSuffixList(psl_file)

    @classmethod
    def get_protocol(cls, url):
        """Return the scheme (protocol) part of ``url``."""
        parse_result = parse.urlparse(url=url)
        return parse_result[0].strip()  # strip just in case

    @classmethod
    def get_domain(cls, url):
        """Return the network-location part of ``url`` (port included, if any)."""
        parse_result = parse.urlparse(url=url)
        # Some links carry trailing whitespace after the domain; browsers
        # still resolve them, so strip it here.
        return parse_result[1].strip()

    @classmethod
    def get_top_domain(cls, url):
        """Return the value of ``psl.get_public_suffix`` for the host of ``url``.

        The port is removed first. A host that is a dotted-quad of 1-3 digit
        groups is returned unchanged instead of being looked up.
        """
        domain = cls.get_domain(url)
        domain = domain.split(':')[0]  # drop the port
        ip_pattern = r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}"
        # fullmatch: the whole host must be a dotted quad. A prefix-only
        # match treated hosts such as "1.2.3.4.example.com" as IP addresses.
        if re.fullmatch(ip_pattern, domain):
            return domain
        return cls.psl.get_public_suffix(domain)

    @classmethod
    def get_path(cls, url):
        """Return ``url`` without its last '/'-separated component.

        A URL with exactly three '/'-separated parts is returned unchanged,
        and one with four parts whose last part is empty only loses its
        trailing slash.
        """
        pieces = url.split('/')
        if len(pieces) == 3:
            return url
        elif len(pieces) == 4 and pieces[-1] == "":
            return url[:-1]
        return "/".join(pieces[:-1])

    @classmethod
    def is_gov_or_edu(cls, url):
        """Return True when the domain of ``url`` ends in ".gov.cn" or ".edu.cn"."""
        domain = cls.get_domain(url)
        # The length check requires at least one character before the suffix.
        return len(domain) > 7 and domain.endswith((".gov.cn", ".edu.cn"))

    @classmethod
    def top_domain_is_gov_or_edu(cls, top_domain):
        """Return True when ``top_domain`` is exactly "gov.cn" or "edu.cn"."""
        return top_domain in ("gov.cn", "edu.cn")

    @classmethod
    def get_url_suffix(cls, url):
        """Return the file extension of the page (e.g. html, js, css).

        An empty string is returned when the last path segment has no dot.
        """
        path = urllib.parse.urlsplit(url)[2]
        if '.' not in path.split('/')[-1]:
            return ""
        return path.split('.')[-1]
Example #18
0
 def download_psl():
     """Fetch the list with ``publicsuffix.fetch()`` and save it as UTF-8 text.

     The text is written to the path held in ``PublicSuffixListFilename``,
     replacing any existing content.
     """
     from contextlib import closing

     # Read the whole download and close the stream before opening the
     # target, so a failed read does not truncate an existing copy.
     with closing(publicsuffix.fetch()) as fresh_psl:
         fresh_text = fresh_psl.read()
     with open(PublicSuffixListFilename, 'w', encoding='utf-8') as fresh_psl_file:
         fresh_psl_file.write(fresh_text)
Example #19
0
 def get_suffix(self):
     """Return a PublicSuffixList built from a fresh ``fetch()`` result.

     Every call fetches again; nothing is cached here.
     """
     from contextlib import closing

     # The fetched stream is only needed while the list object is being
     # built, so close it afterwards instead of leaving it open.
     with closing(fetch()) as suffix_list:
         psl = PublicSuffixList(suffix_list)
     return psl
Example #20
0
 def get_suffix(self):
     """Return a PublicSuffixList built from a fresh ``fetch()`` result.

     Every call fetches again; nothing is cached here.
     """
     from contextlib import closing

     # The fetched stream is only needed while the list object is being
     # built, so close it afterwards instead of leaving it open.
     with closing(fetch()) as suffix_list:
         psl = PublicSuffixList(suffix_list)
     return psl
Example #21
0
class TestPublicSuffixLatest(TestPublicSuffixCurrent):
    """Test using the latest list.

    Subclasses TestPublicSuffixCurrent and replaces its ``psl`` class
    attribute with the result of ``publicsuffix.fetch()``.
    """
    # Evaluated once, when the class body is executed.
    # NOTE(review): this binds the raw fetch() result, not a
    # PublicSuffixList built from it - confirm that this is what the
    # parent class expects to find in `psl`.
    psl = publicsuffix.fetch()
Example #22
0
 def test_fetch_amd_get_public_suffix(self):
     """Build a list from fetch() and check one ASCII and one non-ASCII lookup."""
     suffix_list = publicsuffix.PublicSuffixList(publicsuffix.fetch())

     # ASCII host name: the leading "www" label is dropped.
     ascii_result = suffix_list.get_public_suffix('www.example.com')
     assert 'example.com' == ascii_result

     # Non-ASCII host name: the lookup is expected to return its input.
     idn_expected = u('www.\u9999\u6e2f')
     idn_result = suffix_list.get_public_suffix(u('www.\u9999\u6e2f'))
     assert idn_expected == idn_result