Code example #1
File: expert.py Project: motok/intelmq
 def init(self):
     if self.field not in ALLOWED_FIELDS:
         raise InvalidArgument('key',
                               got=self.field,
                               expected=ALLOWED_FIELDS)
     with codecs.open(self.suffix_file, encoding='UTF-8') as file_handle:
         self.psl = PublicSuffixList(source=file_handle, only_icann=True)
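For reference, a minimal sketch of what only_icann=True changes, assuming a local copy of public_suffix_list.dat: rules from the PRIVATE section of the list (such as blogspot.com) are ignored, so only ICANN-delegated suffixes are matched.

import codecs
from publicsuffixlist import PublicSuffixList

with codecs.open("public_suffix_list.dat", encoding="UTF-8") as file_handle:
    psl = PublicSuffixList(source=file_handle, only_icann=True)

# With the PRIVATE rules skipped, a blogspot host collapses to "com",
# and its registrable domain is "blogspot.com" rather than the blog name.
print(psl.publicsuffix("foo.blogspot.com"))   # "com"
print(psl.privatesuffix("foo.blogspot.com"))  # "blogspot.com"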
Code example #2
File: test.py Project: meschansky/psl
    def test_deny_unknown(self):
        source = """
known
"""
        psl = PublicSuffixList(source.splitlines(), accept_unknown=False)

        self.assertEqual(psl.suffix("www.example.unknowntld"), None)
Code example #3
File: login.py Project: Weasyl/weasyl
def is_email_blacklisted(address):
    """
    Determines if a supplied email address is present in the 'emailblacklist' table.
    Parameters:
        address: The email address to split out the domain from.
    Returns:
        Boolean True if present on the blacklist, or False otherwise.
    """
    _, domain = address.rsplit("@", 1)
    psl = PublicSuffixList()
    private_suffix = psl.privatesuffix(domain=domain)

    # Check the disposable email address list
    disposable_domains = _retrieve_disposable_email_domains()
    if private_suffix in disposable_domains:
        return True

    # Check the explicitly defined/blacklisted domains.
    blacklisted_domains = d.engine.execute("""
        SELECT domain_name
        FROM emailblacklist
    """).fetchall()
    for site in blacklisted_domains:
        if private_suffix == site['domain_name']:
            return True

    # If we get here, the domain (or subdomain) is not blacklisted
    return False
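A quick illustration of the privatesuffix() call this check relies on, assuming an up-to-date suffix list: it collapses any host to its registrable domain (eTLD+1), so subdomain variations of a blacklisted domain still match.

from publicsuffixlist import PublicSuffixList

psl = PublicSuffixList()
print(psl.privatesuffix("mail.example.co.uk"))  # "example.co.uk"
print(psl.privatesuffix("example.co.uk"))       # "example.co.uk"
print(psl.privatesuffix("co.uk"))               # None: a bare public suffix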
Code example #5
File: test.py Project: mzpqnxow/psl
    def test_deny_unknown(self):
        source = """
known
"""
        psl = PublicSuffixList(source.splitlines(), accept_unknown=False)

        self.assertEqual(psl.suffix("www.example.unknowntld"), None)
Code example #6
File: test.py Project: meschansky/psl
    def test_compatclass(self):

        from publicsuffixlist.compat import PublicSuffixList
        psl = PublicSuffixList()
        
        self.assertEqual(psl.get_public_suffix("test.example.com"), "example.com")
        self.assertEqual(psl.get_public_suffix("com"), "")
        self.assertEqual(psl.get_public_suffix(""), "")
Code example #7
File: test.py Project: mzpqnxow/psl
    def test_compatclass(self):

        from publicsuffixlist.compat import PublicSuffixList
        psl = PublicSuffixList()

        self.assertEqual(psl.get_public_suffix("test.example.com"),
                         "example.com")
        self.assertEqual(psl.get_public_suffix("com"), "")
        self.assertEqual(psl.get_public_suffix(""), "")
Code example #8
def reduce_domain(domain_in):
    if not PublicSuffixList().publicsuffix(domain_in, accept_unknown=False):
        return None
    domain = PublicSuffixList().privatesuffix(domain_in)
    if domain:
        domain = domain.lower()
    else:
        log.debug("No eTLD for {}".format(domain_in))
    log.debug("Trimmed domain from {0} to {1}".format(domain_in, domain))
    return domain
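Illustrative calls for the helper above, assuming the bundled public suffix data (hypothetical inputs):

reduce_domain("WWW.Example.CO.UK")  # -> "example.co.uk"
reduce_domain("localhost")          # -> None, no recognised public suffix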
Code example #9
File: spider.py Project: Catcherman/ark
 def _check_same_origin(self, current_url):
     '''
     Check whether two URLs share the same origin (registrable domain).
     '''
     current_url = to_unicode(current_url)
     url_part = urlparse.urlparse(current_url)
     #url_part_list=url_part.netloc.split('.')
     psl2 = PublicSuffixList()
     url_origin = psl2.privatesuffix(url_part.netloc)
     return url_origin == self.origin
Code example #10
File: WebEye.py Project: 10467106/WebEye
    def get_whois(self, name):
        try:
            domain = urlparse.urlparse(self.target).netloc

            # if the domain is an IP address, stop the whois lookup
            result1 = re.search("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", domain)
            if result1:
                return
            # remove port
            result2 = re.search("\:\d{1,5}$", domain)
            if result2:
                domain = domain.split(":")[0]

            # get domain's ip
            try:
                ip = socket.gethostbyname(domain)
                self.cms_list.add("IP:" + ip)
            except Exception, e:
                # print e
                pass


#            if re.match("^www\.",domain):
#                 domain = domain.strip("www.")
            psl = PublicSuffixList()
            domain = psl.suffix(domain)
            who = pythonwhois.get_whois(domain)

            # get whois
            if who["contacts"]["registrant"]["name"] is not None:
                self.cms_list.add(
                    "Domain_User:" +
                    who["contacts"]["registrant"]["name"].encode("utf8"))
            if who["contacts"]["registrant"]["email"] is not None:
                self.cms_list.add(
                    "Domain_Email:" +
                    who["contacts"]["registrant"]["email"].encode("utf8"))
            if who["contacts"]["registrant"]["phone"] is not None:
                self.cms_list.add(
                    "Domain_Phone:" +
                    who["contacts"]["registrant"]["phone"].encode("utf8"))
            if who["registrar"] is not None:
                self.cms_list.add("Domain_Registrar:" +
                                  who["registrar"][0].encode("utf8"))
            if who["nameservers"] is not None:
                name_servers = []
                for i in who["nameservers"]:
                    name_servers.append(i.encode('UTF8'))
                self.cms_list.add("Domain_name_servers:" +
                                  str(name_servers).encode("utf8"))
Code example #11
def static_num(file_path):
    psl = PublicSuffixList()
    result = [0, 0, 0]
    with open(file_path, "r") as f:
        for r in f:
            d = r.strip().split(",")[0]
            d_strip = d[:d.rindex(psl.publicsuffix(d)) - 1].split(".")
            if len(d_strip) == 1:
                result[0] += 1
            elif len(d_strip) == 2:
                result[1] += 1
            else:
                result[2] += 1
    print(result)
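The slicing idiom above, d[:d.rindex(psl.publicsuffix(d)) - 1], recurs in several later examples; it strips the public suffix and its leading dot. A minimal sketch with a hypothetical domain:

from publicsuffixlist import PublicSuffixList

psl = PublicSuffixList()
d = "news.bbc.co.uk"
pub = psl.publicsuffix(d)       # "co.uk"
prefix = d[:d.rindex(pub) - 1]  # "news.bbc"
print(prefix.split("."))        # ['news', 'bbc'] -> two labels
# Note: this slice misbehaves when d equals its public suffix, since
# rindex() then returns 0 and the slice becomes d[:-1].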
Code example #12
File: spider.py Project: Catcherman/ark
    def feed_url(self, url):
        '''
        Set the initial URL to crawl.
        '''
        if isinstance(url, basestring):
            url = to_unicode(url)
            url = UrlData(url)

        if self.same_origin:
            url_part = urlparse.urlparse(unicode(url))
            psl = PublicSuffixList()
            self.origin = psl.privatesuffix(url_part.netloc)

        self.fetcher_queue.put(url, block=True)
Code example #13
File: commons.py Project: wirenic/Hallo
 def get_domain_name(url: str) -> str:
     """
     Gets the domain name of a URL, removing the TLD
     :param url: URL to find domain of
     """
     # Sanitise the URL, removing protocol and directories
     url = url.split("://")[-1]
     url = url.split("/")[0]
     url = url.split(":")[0]
     # Get the public suffix
     public_suffix = PublicSuffixList()
     url_tld = public_suffix.publicsuffix(url)
     # Return the last label before the public suffix
     return url[:-len(url_tld) - 1].split(".")[-1]
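For example (hypothetical URL), the helper reduces a full URL to the bare domain label in front of the public suffix:

get_domain_name("https://www.sub.example.co.uk/path")  # -> "example"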
Code example #14
def check_for_third_level_domains(filename):
    with open("public_suffix_list.dat", "r") as latest:
        psl = PublicSuffixList(latest)

    invalid = {
        line
        for line in files[filename] if len(psl.privateparts(line.strip())) > 1
    }
    if invalid:
        print(
            "The following entries in {!r} contain a third- or lower-level "
            "domain:".format(filename))
        for line in sorted(invalid):
            print("* {}".format(line))
        sys.exit(1)
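The privateparts() call used here splits a name into its registrable domain plus each subdomain label, so more than one part indicates a third-or-lower-level name. A hedged sketch, assuming a local public_suffix_list.dat:

from publicsuffixlist import PublicSuffixList

with open("public_suffix_list.dat", "r") as latest:
    psl = PublicSuffixList(latest)

print(psl.privateparts("example.com"))          # ("example.com",)
print(psl.privateparts("aaa.www.example.com"))  # ("example.com", "www", "aaa")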
Code example #15
def filter2LDAleax():
    psl = PublicSuffixList()
    data = []
    with open("../data_sets/Aleax", "r") as f:
        for r in f:
            d = r.strip()
            d1 = d[:d.rindex(psl.publicsuffix(d)) - 1]
            if len(d1) == 0:
                continue
            d_split = d1.split(".")
            if len(d_split) == 1 and len(d_split[0]) != 0:
                data.append(d)
        print(len(data))
    with open("../data_sets/Aleax2LD", "w") as f:
        f.write("\n".join(data))
Code example #16
File: update.py Project: mzpqnxow/psl
def updatePSL(psl_file=PSLFILE):
    """ Updates a local copy of PSL file

    :param psl_file: path for the file to store the list. Default: PSLFILE
    """
    if requests is None:
        raise Exception("Please install python-requests http(s) library. $ sudo pip install requests")


    r = requests.get(PSLURL)
    if r.status_code != requests.codes.ok or len(r.content) == 0:
        raise Exception("Could not download PSL from " + PSLURL)

    lastmod = r.headers.get("last-modified", None)
    f = open(psl_file + ".swp", "wb")
    f.write(r.content)
    f.close()

    with open(psl_file + ".swp", "rb") as f:
        psl = PublicSuffixList(f)

    os.rename(psl_file + ".swp", psl_file)
    if lastmod:
        t = time.mktime(parsedate(lastmod))
        os.utime(psl_file, (t, t))

    print("PSL updated")
    if lastmod:
        print("last-modified: " + lastmod)
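The function downloads to a ".swp" sibling first and then os.rename()s it into place, so readers never observe a partially written list. Typical usage, assuming network access to PSLURL:

updatePSL()                # refresh the default PSLFILE
updatePSL("/tmp/psl.dat")  # or write to an explicit path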
Code example #17
 def getAllDomainLabels(self, domains):
     labels = []
     index = []
     psl = PublicSuffixList()
     for i in range(len(domains)):
         d = domains[i].strip()
         pub = psl.publicsuffix(d)
         d_split = d[:d.rindex(pub) - 1].split(".")
         if len(d_split) > 2:
             print("d:{} pub:{}".format(d, pub))
         for l in d_split:
             if len(l) == 0:
                 print("empty label")
             labels.append(l)
             index.append(i)
     return labels, index
Code example #18
def get_tld_esld(PSL, DOMAIN):
    # Outputs the pairs (TopLevelDomain, EffectiveSecondLevelDomain) for a
    # given domain (string) provided in input.
    if not isinstance(DOMAIN, string_types):
        tld, esld = None, None
    else:
        # remove '.' characters
        while DOMAIN.endswith("."):
            DOMAIN = DOMAIN[:-1]
        while DOMAIN.startswith("."):
            DOMAIN = DOMAIN[1:]
        if len(DOMAIN) == 0:
            tld, esld = None, None
        else:
            try:
                # information about TLDs
                tld = PSL.publicsuffix(DOMAIN)
            except Exception:
                tld = PublicSuffixList().publicsuffix(DOMAIN)
            if tld is None:
                esld = None
            else:
                if tld == DOMAIN:
                    esld = tld
                else:
                    # we obtain the ESLD by removing the TLD from 'DOMAIN'
                    udn = DOMAIN[:-len(tld) - 1]
                    # find the rightmost '.' and extract the ESLD
                    i = udn.rfind(".")
                    esld = udn[i + 1:] + '.' + tld
    return tld, esld
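Worked examples for the function above, assuming psl = PublicSuffixList() (hypothetical domains):

get_tld_esld(psl, "a.b.example.co.uk")  # -> ("co.uk", "example.co.uk")
get_tld_esld(psl, "co.uk")              # -> ("co.uk", "co.uk")
get_tld_esld(psl, "")                   # -> (None, None)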
Code example #19
def load_psl():
    global psl
    # Fetch PublicSuffix list and load it
    #    if not psl:
    print('Loading Public Suffix List')
    psl_file = pfetch()
    psl = PublicSuffixList(psl_file)
Code example #20
def main(arguments):
    suffix_detected = False
    psl = None
    download_suffixes()
    with open("public_suffix_list.dat", "r") as latest:
        psl = PublicSuffixList(latest)
    with io.open('disposable_email_blacklist.conf', 'r') as deb:
        for i, line in enumerate(deb):
            current_line = line.strip()
            public_suffix = psl.publicsuffix(current_line)
            if public_suffix == current_line:
                print(f'Line {i+1} contains just a public suffix: {current_line}')
                suffix_detected = True
    if suffix_detected:
        print('At least one valid public suffix was found in the blacklist; please remove it. See https://publicsuffix.org for details on why it shouldn\'t be blacklisted.')
        sys.exit(1)
Code example #21
class DomainSuffixExpertBot(Bot):
    suffixes = {}

    def init(self):
        self.field = self.parameters.field
        if self.field not in ALLOWED_FIELDS:
            raise InvalidArgument('key',
                                  got=self.field,
                                  expected=ALLOWED_FIELDS)
        with codecs.open(self.parameters.suffix_file,
                         encoding='UTF-8') as file_handle:
            self.psl = PublicSuffixList(source=file_handle, only_icann=True)

    def process(self):
        event = self.receive_message()
        for space in ('source', 'destination'):
            key = '.'.join((space, self.field))
            if key not in event:
                continue
            event['.'.join(
                (space,
                 'domain_suffix'))] = self.psl.publicsuffix(domain=event[key])

        self.send_message(event)
        self.acknowledge_message()
Code example #22
File: DomainFilter.py Project: yandingkui/Pontus
class Filter():
    def __init__(self):
        self.psl = PublicSuffixList(accept_unknown=False)
        self.sf = SingleFilter(100000, self.psl)

    def isValidDomain(self, domain: str):
        return self.sf.isValidDomain(domain) and not self.sf.inWhiteList(domain)

    def Two_Three_level_domain(self, domain: str):
        """
        Return True if the domain has one or two labels in front of its
        public suffix, i.e. it is a second- or third-level domain.
        :param domain: domain:str
        :return: bool
        """
        publicsuffix = self.psl.publicsuffix(domain)
        if publicsuffix is None or publicsuffix == domain:
            return False
        pre_domain = domain[:domain.rindex(publicsuffix) - 1]
        if len(pre_domain) == 0:
            return False
        return len(pre_domain.split(".")) in (1, 2)
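A standalone sketch of the same classification, assuming a current suffix list and accept_unknown=False:

from publicsuffixlist import PublicSuffixList

psl = PublicSuffixList(accept_unknown=False)

def labels_before_suffix(domain):
    pub = psl.publicsuffix(domain)
    if pub is None or pub == domain:
        return 0
    return len(domain[:domain.rindex(pub) - 1].split("."))

print(labels_before_suffix("example.com"))      # 1 -> accepted (second-level)
print(labels_before_suffix("www.example.com"))  # 2 -> accepted (third-level)
print(labels_before_suffix("a.b.example.com"))  # 3 -> rejected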
Code example #23
File: apply_domain_suffix.py Project: motok/intelmq
def eventdb_apply(host, port, database, username, password, table, dry_run,
                  where, filename):
    if password:
        password = input('Password for user %r on %r: ' % (username, host))
    where = 'AND ' + where if where else ''

    con1 = psycopg2.connect(user=username,
                            password=password,
                            database=database,
                            host=host,
                            port=port)
    cur1 = con1.cursor(cursor_factory=DictCursor)
    con2 = psycopg2.connect(user=username,
                            password=password,
                            database=database,
                            host=host,
                            port=port)
    con2.autocommit = True
    cur2 = con2.cursor(cursor_factory=DictCursor)
    cur1.execute('''
                 SELECT id, "source.fqdn", "destination.fqdn"
                 FROM {table}
                 WHERE
                 ("source.fqdn" IS NOT NULL OR "destination.fqdn" IS NOT NULL)
                 {where}
                 '''.format(table=table, where=where))

    psl = PublicSuffixList(only_icann=True)

    counter = 0
    for row in cur1:
        counter += 1
        if row['source.fqdn']:
            cur2.execute(
                'update events set "source.domain_suffix" = %s where id = %s',
                (psl.publicsuffix(
                    row['source.fqdn'].encode('idna').decode()), row['id']))

        if row['destination.fqdn']:
            cur2.execute(
                'update events set "destination.domain_suffix" = %s where id = %s',
                (psl.publicsuffix(
                    row['destination.fqdn'].encode('idna').decode()),
                 row['id']))
    con2.commit()
    print("Changed %d rows" % counter)
Code example #24
def get2subdomain(root_dir="/home/public/2019-01-07-dgarchive_full"):
    result = dict()
    psl = PublicSuffixList()
    for filename in os.listdir(root_dir):
        with open("{}/{}".format(root_dir, filename), "r") as f:
            for r in f:
                d = r.strip().split(",")[0]
                d_strip = d[:d.rindex(psl.publicsuffix(d)) - 1].split(".")
                if len(d_strip) == 2:
                    domains = result.get(filename)
                    if domains is None:
                        domains = set()
                        result[filename] = domains
                    domains.add(d)

    for k, v in result.items():
        print("{} : {}".format(k, len(v)))
        v_list = list(v)
        print(v_list[:10])
Code example #25
def decompose_filter(inputstring, psl=PublicSuffixList()):
    logging.debug(f'Parsing "{inputstring}"')
    try:
        match_list = []
        querystring = inputstring
        # Clean input
        querystring = re.sub(r'(?i)[^-a-z0-9.%_]', '',
                             querystring).strip('. ').lower()
        logging.debug(f'Cleaned input to "{querystring}"')
        if '_' in querystring:
            logging.error(
                f'Single character wildcards are not handled yet. "{querystring}"'
            )
        if querystring.count('%') == 0:
            ts_q1 = querystring
            ts_q2 = querystring
        else:
            # Check for usable strings at the start of the string
            leading_match = re.search(
                r'^(?P<q_lead>[-a-z0-9.]+)(?:[%_.]*[%_])', querystring)
            if leading_match:
                match_list.append(leading_match.group('q_lead') + ':*')
            # Check for usable strings in the middle of the string
            mid_match_list = re.findall(
                r'(?<=[%_]\.)(?P<q_mid>[-a-z0-9.]+)(?:[%_.]*[%_])',
                querystring)
            if mid_match_list:
                mid_match_list = [m + ':*' for m in mid_match_list]
                match_list.extend(mid_match_list)
            # Check for usable strings at the end of the string
            trailing_match = re.search(
                r'(?<=[%_]\.)(?P<q_trail>[-a-z0-9.]+[-a-z0-9])$', querystring)
            if trailing_match:
                if psl.is_private(trailing_match.group('q_trail')):
                    match_list.append(trailing_match.group('q_trail'))
            if match_list:
                match_list = list(set(match_list))
                match_list.sort(key=lambda x: len(x.lstrip('w').rstrip(':*')),
                                reverse=True)
                ts_long_list = match_list[:2]
                ts_q1 = ts_long_list[0]
                ts_q2 = ts_long_list[-1]
            else:
                logging.error(
                    f'Could not extract usable querystring on "{inputstring}"')
                return
    except Exception as e:
        logging.error(f'Error on "{inputstring}", "{e}"')
        return
    return_dict = {
        'querystring': querystring,
        'ts_q1': ts_q1,
        'ts_q2': ts_q2,
    }
    return return_dict
Code example #26
def check_for_public_suffixes(filename):
    lines = files[filename]
    suffix_detected = False
    psl = None
    with open("public_suffix_list.dat", "r") as latest:
        psl = PublicSuffixList(latest)
    for i, line in enumerate(lines):
        current_line = line.strip()
        public_suffix = psl.publicsuffix(current_line)
        if public_suffix == current_line:
            print(f"Line {i+1} contains just a public suffix: {current_line}")
            suffix_detected = True
    if suffix_detected:
        print(
            "At least one valid public suffix was found in {!r}; please "
            "remove it. See https://publicsuffix.org for details on why this "
            "shouldn't be blocklisted.".format(filename))
        sys.exit(1)
Code example #27
def lstm_getSingleFea(d: str):
    psl = PublicSuffixList()
    d = d[:d.rindex(psl.publicsuffix(d)) - 1].replace(".", "")
    vector = np.zeros(64)
    if (len(d) == 0):
        return vector
    cuter = CutWords()
    # wordlist = cuter.max_forward_cut(d)
    # wordlist = cuter.max_backward_cut(d)
    wordlist = cuter.max_biward_cut(d)

    vi = 63
    for i in range(len(wordlist) - 1, -1, -1):
        vector[vi] = CutWords.order[wordlist[i]]
        vi = vi - 1
        if (vi < 0):
            break
    # print(d)
    # print(vector)
    return vector
Code example #28
File: expert.py Project: motok/intelmq
 def check(parameters):
     if not os.path.exists(parameters.get('suffix_file', '')):
         return [[
             "error",
             "File given as parameter 'suffix_file' does not exist."
         ]]
     try:
         with codecs.open(parameters['suffix_file'],
                          encoding='UTF-8') as database:
             PublicSuffixList(source=database, only_icann=True)
     except Exception as exc:
         return [["error", "Error reading database: %r." % exc]]
Code example #29
File: login.py Project: guptaarth87/weasyl
def is_email_blacklisted(address):
    """
    Determines if a supplied email address is present in the 'emailblacklist' table.
    Parameters:
        address: The email address to split out the domain from.
    Returns:
        Boolean True if present on the blacklist, or False otherwise.
    """
    _, domain = address.rsplit("@", 1)
    psl = PublicSuffixList()
    private_suffix = psl.privatesuffix(domain=domain)

    # Check the disposable email address list
    if private_suffix in DISPOSABLE_DOMAINS:
        return True

    # Check the explicitly defined/blacklisted domains.
    return d.engine.scalar(
        "SELECT EXISTS (SELECT FROM emailblacklist WHERE domain_name = %(domain)s)",
        domain=private_suffix,
    )
Code example #30
    def init(self, config=None):

        if config is not None:
            self.config = load_config_obj(config)

        app = Flask('product_identifier')
        app.config.from_object(self.config)
        self.__flask = app
        self.__db = SQLAlchemy(self.__flask)
        Migrate(self.__flask, self.db)

        self.handler_pool = gevent.pool.Pool(
            self.config.MASTER_HANDLER_POOL_SIZE)

        with open(os.path.join(self.config.DATA_DIR, "ruleset.json"),
                  "r") as f:
            rule_set = json.load(f)
            self.product_patterns = []
            for name, pattern in rule_set["rules"].iteritems():
                self.product_patterns.append(re.compile(pattern))

        self.__psl = PublicSuffixList()
Code example #31
File: test.py Project: mzpqnxow/psl
    def test_custom_psl(self):
        source = """
invalid
*.invalid
!test.invalid
"""
        psl = PublicSuffixList(source.splitlines())

        self.assertEqual(psl.suffix("example.invalid"), None)
        self.assertEqual(psl.suffix("test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("some.test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("aaa.bbb.ccc.invalid"), "bbb.ccc.invalid")

        self.assertEqual(psl.publicsuffix("example.invalid"),
                         "example.invalid")
        self.assertEqual(psl.publicsuffix("test.invalid"), "invalid")
Code example #32
def createdataset(type="train",
                  AGD_file="../data_sets/split_AGDs",
                  BD_file="../data_sets/split_benign_nx.json",
                  datasetname="nx_train_data"):
    if type == "train":
        v_index = 0
    else:
        v_index = 1
    psl = PublicSuffixList()
    with open(AGD_file, "r") as f:
        AGD_dict = json.loads(f.read())
    with open(BD_file, "r") as f:
        bd_dict = json.loads(f.read())
    allAGDs = set()
    allBDs = set()
    for k, v in AGD_dict.items():
        for d in v[v_index]:
            pre_d = d[:d.rindex(psl.publicsuffix(d)) - 1]
            for l in pre_d.split("."):
                allAGDs.add(l)
    for d in bd_dict[type]:
        pre_d = d[:d.rindex(psl.publicsuffix(d)) - 1]
        for l in pre_d.split("."):
            allBDs.add(l)
    length = len(allAGDs)
    print(length)
    allBDs = list(allBDs)[:length]
    allAGDs = list(allAGDs)
    alldomains = allAGDs + allBDs
    alllabels = list(np.ones(length)) + list(np.zeros(length))
    allfeatures = extract_all_features(alldomains)
    np.save("../data_sets/{}_features.npy".format(datasetname), allfeatures)
    data = dict()
    data["domains"] = pd.Series(alldomains, dtype='str')
    data["labels"] = pd.Series(alllabels, dtype='int32')
    df = pd.DataFrame(data=data)
    df.to_csv("../data_sets/{}.csv".format(datasetname), index=False)
Code example #33
def static_1_2(root_dir="/home/public/2019-01-07-dgarchive_full"):
    psl = PublicSuffixList()
    result = dict()
    for filename in os.listdir(root_dir):
        df = pd.read_csv(os.path.join(root_dir, filename), header=None, error_bad_lines=False)
        domains = result.get(filename)
        if domains is None:
            domains = [set(), set()]
            result[filename] = domains
        for d in df.iloc[:, 0]:
            pub_d = psl.publicsuffix(d)
            if d != pub_d:
                d_split = d[:d.rindex(pub_d) - 1].split(".")
                if len(d_split) == 1:
                    result.get(filename)[0].add(d)
                elif len(d_split) == 2:
                    result.get(filename)[1].add(d)
                else:
                    print("Wow : {}".format(d))
        print("{} finish".format(filename))

    print("write")
    with open("../result_data/dga_data.json", "w") as f:
        f.write(json.dumps(result, cls=MyJsonEncoder))
Code example #34
def dga_static_num(file_path):
    psl = PublicSuffixList()
    result = [0, 0, 0]
    with open(file_path, "r") as f:
        map = json.loads(f.read())
    for k, v in map.items():
        for d in v[0]:

            d_strip = d[:d.rindex(psl.publicsuffix(d)) - 1].split(".")
            if len(d_strip) == 1:
                result[0] += 1
            elif len(d_strip) == 2:
                result[1] += 1
            else:
                result[2] += 1
        for d in v[1]:
            d_strip = d[:d.rindex(psl.publicsuffix(d)) - 1].split(".")
            if len(d_strip) == 1:
                result[0] += 1
            elif len(d_strip) == 2:
                result[1] += 1
            else:
                result[2] += 1
    print(result)
Code example #35
File: yd_benign.py Project: yandingkui/Pontus
def getBenign(filepath):
    psl = PublicSuffixList()
    filter = Filter()
    domains = []

    # out=dict()
    # with open(filepath,"r") as f:
    #     for r in f:
    #         r_split=r.strip().split(":")
    #         if filter.inWhiteList(r_split[0]):
    #             pri=psl.privatesuffix(r_split[0])
    #             lll=out.get(pri)
    #             if lll is None:
    #                 lll=[]
    #             lll.append(r_split[0])
    #             out[pri]=lll
    #             continue
    #         domains.append(r_split[0])
    #
    #
    # num=0
    # break_flag=False
    # for i in range(9):
    #     for k,v in out.items():
    #         if i>=len(v) or k in ["aliyunduncc.com","360wzb.cn","yundunwaf.com","bugtags.com","wscloudcdn.com","ourdvsss.com","aliyundunwaf.com","aligfwaf.com"]:
    #             continue
    #         domains.append(v[i])
    #         num+=1
    #         if num>=311:
    #             break_flag=True
    #             break
    #     if break_flag:
    #         break

    with open(filepath, "r") as f:
        for r in f:
            r_split = r.strip().split(":")
            domains.append(r_split[0])
    random.shuffle(domains)

    result = dict()
    result["train"] = domains[:23600]
    result["pred"] = domains[23600:29500]

    with open("../result_data/yd_nf_data.json", "w") as f:
        f.write(json.dumps(result))

    print(len(domains))
Code example #36
File: test.py Project: meschansky/psl
    def test_custom_psl(self):
        source = """
invalid
*.invalid
!test.invalid
"""
        psl = PublicSuffixList(source.splitlines())

        self.assertEqual(psl.suffix("example.invalid"), None)
        self.assertEqual(psl.suffix("test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("some.test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("aaa.bbb.ccc.invalid"), "bbb.ccc.invalid")

        self.assertEqual(psl.publicsuffix("example.invalid"), "example.invalid")
        self.assertEqual(psl.publicsuffix("test.invalid"), "invalid")
Code example #37
File: http_basic.py Project: rurbin3/yawast
def check_hsts_preload(url: str) -> List[dict]:
    hsts_service = "https://hstspreload.com/api/v1/status/"
    results: List[dict] = []

    domain = utils.get_domain(url)

    if not checkers.is_ip_address(domain):
        while domain.count(".") > 0:
            # get the HSTS preload status for the domain
            res, _ = network.http_json(f"{hsts_service}{domain}")
            results.append(res)

            domain = domain.split(".", 1)[-1]
            if PublicSuffixList().is_public(domain):
                break

    return results
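The loop walks the name up one label at a time and stops once only the public suffix remains. A self-contained sketch of the same walk, assuming publicsuffixlist is installed:

from publicsuffixlist import PublicSuffixList

psl = PublicSuffixList()
domain = "a.b.example.co.uk"
while domain.count(".") > 0:
    print(domain)  # a.b.example.co.uk, then b.example.co.uk, then example.co.uk
    domain = domain.split(".", 1)[-1]
    if psl.is_public(domain):  # "co.uk" is public, so the walk stops here
        break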
Code example #38
File: expert.py Project: CZ-NIC/intelmq
class DomainSuffixExpertBot(Bot):
    suffixes = {}

    def init(self):
        self.field = self.parameters.field
        if self.field not in ALLOWED_FIELDS:
            raise InvalidArgument('key', got=self.field, expected=ALLOWED_FIELDS)
        with codecs.open(self.parameters.suffix_file, encoding='UTF-8') as file_handle:
            self.psl = PublicSuffixList(source=file_handle, only_icann=True)

    def process(self):
        event = self.receive_message()
        for space in ('source', 'destination'):
            key = '.'.join((space, self.field))
            if key not in event:
                continue
            event['.'.join((space, 'domain_suffix'))] = self.psl.publicsuffix(domain=event[key])

        self.send_message(event)
        self.acknowledge_message()
Code example #39
File: master.py Project: mozilla/product_identifier
    def init(self, config=None):

        if config is not None:
            self.config = load_config_obj(config)

        app = Flask('product_identifier')
        app.config.from_object(self.config)
        self.__flask = app
        self.__db = SQLAlchemy(self.__flask)
        Migrate(self.__flask, self.db)

        self.handler_pool = gevent.pool.Pool(self.config.MASTER_HANDLER_POOL_SIZE)

        with open(os.path.join(self.config.DATA_DIR, "ruleset.json"), "r") as f:
            rule_set = json.load(f)
            self.product_patterns = []
            for name, pattern in rule_set["rules"].iteritems():
                self.product_patterns.append(re.compile(pattern))

        self.__psl = PublicSuffixList()
Code example #40
File: master.py Project: mozilla/product_identifier
class Master(BaseApplication):

    __flask = None

    @property
    def flask(self):
        if not self.__flask:
            raise ApplicationInitError("Cannot obtain server instance before init")
        return self.__flask

    @property
    def db(self):
        return self.__db

    def init(self, config=None):

        if config is not None:
            self.config = load_config_obj(config)

        app = Flask('product_identifier')
        app.config.from_object(self.config)
        self.__flask = app
        self.__db = SQLAlchemy(self.__flask)
        Migrate(self.__flask, self.db)

        self.handler_pool = gevent.pool.Pool(self.config.MASTER_HANDLER_POOL_SIZE)

        with open(os.path.join(self.config.DATA_DIR, "ruleset.json"), "r") as f:
            rule_set = json.load(f)
            self.product_patterns = []
            for name, pattern in rule_set["rules"].iteritems():
                self.product_patterns.append(re.compile(pattern))

        self.__psl = PublicSuffixList()

    def start(self):
        def handleURL():
            from product_identifier.models import URL
            db = Master.instance().db
            self.flask.logger.debug("JOB STARTED")
            while True:
                try:
                    # TODO: susceptible to concurrency problems
                    in_url = self.scripts.pop_zset(keys=[URLS_TO_PROCESS_SET])
                    if in_url:
                        if not self.redis.sismember(PROCESSED_URLS_SET, in_url):
                            self.flask.logger.debug("PROCESSING: {}".format(in_url))
                            uri = furl(in_url)
                            domain = self.__psl.suffix(uri.host)

                            try:
                                p = URL()
                                p.domain = domain
                                p.url = in_url
                                p.is_product = self.is_product_url(in_url)
                                db.session.add(p)
                                db.session.commit()
                            except:
                                db.session.rollback()
                                error = traceback.format_exc()
                                self.flask.logger.error("DB_ERROR: {}".format(error))
                                self.redis.sadd(DB_ERRORED_URL_SET, in_url)

                            self.redis.sadd(PROCESSED_URLS_SET, in_url)

                            self.redis.rpush(domain, in_url)
                            domain_added = self.redis.sadd(DOMAINS_SET, domain)
                            if domain_added:
                                self.flask.logger.info("ADDED DOMAIN: {}".format(domain))
                        else:
                            self.flask.logger.debug("SKIPPING: {}".format(in_url))
                    else:
                        # no results, sleep
                        gevent.sleep(1)
                except:
                    error = traceback.format_exc()
                    self.flask.logger.error("ERROR: {}".format(error))

        for i in range(self.config.MASTER_HANDLER_POOL_SIZE):
            self.handler_pool.spawn(handleURL)

    def is_product_url(self, url):
        print url
        return any([pat.match(url) for pat in self.product_patterns])
Code example #41
File: test.py Project: meschansky/psl
    def setUp(self):

        self.psl = PublicSuffixList()
Code example #42
File: test.py Project: meschansky/psl
class TestPSL(unittest.TestCase):

    def setUp(self):

        self.psl = PublicSuffixList()
        

    def test_typesafe(self):
        self.assertEqual(self.psl.suffix("www.example.co.jp").__class__, "example.co.jp".__class__)
        self.assertEqual(self.psl.suffix(u("www.example.co.jp")).__class__, u("example.co.jp").__class__)

        self.assertEqual(self.psl.publicsuffix("www.example.co.jp").__class__, "co.jp".__class__)
        self.assertEqual(self.psl.publicsuffix(u("www.example.co.jp")).__class__, u("co.jp").__class__)


    def test_uppercase(self):
        self.assertEqual(self.psl.suffix("wWw.eXaMpLe.cO.Jp"), "example.co.jp")
        self.assertEqual(self.psl.publicsuffix("wWw.eXaMpLe.cO.Jp"), "co.jp")


    def test_invaliddomain(self):
        self.assertEqual(self.psl.suffix("www..invalid"), None)
        self.assertEqual(self.psl.suffix(".example.com"), None)
        self.assertEqual(self.psl.suffix("example.com."), None)
        self.assertEqual(self.psl.suffix(""), None)

        self.assertEqual(self.psl.publicsuffix("www..invalid"), None)
        self.assertEqual(self.psl.publicsuffix(".example.com"), None)
        self.assertEqual(self.psl.publicsuffix("example.com."), None)
        self.assertEqual(self.psl.publicsuffix(""), None)


    def test_idn(self):
        tld = u("香港")
        self.assertEqual(self.psl.suffix(u("www.example.") + tld), u("example.") + tld)
        self.assertEqual(self.psl.publicsuffix(u("www.example.") + tld), tld)

    
    def test_punycoded(self):
        tld = encode_idn(u("香港"))
        self.assertEqual(self.psl.suffix(u("www.example.") + tld), u("example.") + tld)
        self.assertEqual(self.psl.publicsuffix(u("www.example.") + tld), tld)


    def test_suffix_deny_public(self):
        self.assertEqual(self.psl.suffix("com"), None)
        self.assertEqual(self.psl.suffix("co.jp"), None)
        self.assertEqual(self.psl.suffix("example.nagoya.jp"), None)


    def test_unknown(self):
        self.assertEqual(self.psl.suffix("www.example.unknowntld"), "example.unknowntld")
        self.assertEqual(self.psl.suffix("unknowntld"), None)

        self.assertEqual(self.psl.publicsuffix("www.example.unknowntld"), "unknowntld")
        self.assertEqual(self.psl.publicsuffix("unknowntld"), "unknowntld")


    def test_deny_unknown(self):
        source = """
known
"""
        psl = PublicSuffixList(source.splitlines(), accept_unknown=False)

        self.assertEqual(psl.suffix("www.example.unknowntld"), None)


    def test_custom_psl(self):
        source = """
invalid
*.invalid
!test.invalid
"""
        psl = PublicSuffixList(source.splitlines())

        self.assertEqual(psl.suffix("example.invalid"), None)
        self.assertEqual(psl.suffix("test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("some.test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("aaa.bbb.ccc.invalid"), "bbb.ccc.invalid")

        self.assertEqual(psl.publicsuffix("example.invalid"), "example.invalid")
        self.assertEqual(psl.publicsuffix("test.invalid"), "invalid")




    def test_publicsuffix(self):
        self.assertEqual(self.psl.publicsuffix("www.example.com"), "com")
        self.assertEqual(self.psl.publicsuffix("unknowntld"), "unknowntld")

    def test_wildcard(self):
        self.assertEqual(self.psl.suffix("test.example.nagoya.jp"), "test.example.nagoya.jp")
        self.assertEqual(self.psl.suffix("example.nagoya.jp"), None)
        self.assertEqual(self.psl.publicsuffix("example.nagoya.jp"), "example.nagoya.jp")
        self.assertEqual(self.psl.publicsuffix("test.example.nagoya.jp"), "example.nagoya.jp")



    def test_checkpublicsuffix_script(self):
        regex = re.compile(r"^checkPublicSuffix\(('[^']+'), (null|'[^']+')\);")
        with open(os.path.join(os.path.dirname(__file__), "test_psl.txt"), "rb") as f:
            ln = 0
        
            for line in f:
                ln += 1
                l = line.decode("utf-8")
                m = regex.match(l)
                if not m:
                    continue
    
                arg = m.group(1).strip("'")
                res = None if m.group(2) == "null" else m.group(2).strip("'")
                
                self.assertEqual(self.psl.suffix(arg), res, "in line {0}: {1}".format(ln, line.strip()))
            


    def test_typeerror(self):

        self.assertRaises(TypeError, lambda: self.psl.suffix(None))
        self.assertRaises(TypeError, lambda: self.psl.suffix(1))
        if b("") != "":
            # python3
            self.assertRaises(TypeError, lambda: self.psl.suffix(b("www.example.com")))
        

    def test_compatclass(self):

        from publicsuffixlist.compat import PublicSuffixList
        psl = PublicSuffixList()
        
        self.assertEqual(psl.get_public_suffix("test.example.com"), "example.com")
        self.assertEqual(psl.get_public_suffix("com"), "")
        self.assertEqual(psl.get_public_suffix(""), "")

    def test_unsafecompatclass(self):

        from publicsuffixlist.compat import UnsafePublicSuffixList
        psl = UnsafePublicSuffixList()
        
        self.assertEqual(psl.get_public_suffix("test.example.com"), "example.com")
        self.assertEqual(psl.get_public_suffix("com"), "com")
        self.assertEqual(psl.get_public_suffix(""), "")


    def test_toomanylabels(self):
        d = "a." * 1000000 + "example.com"

        self.assertEqual(self.psl.publicsuffix(d), "com")
        self.assertEqual(self.psl.privatesuffix(d), "example.com")


    def test_flatstring(self):
        psl = PublicSuffixList(u("com\nnet\n"))
        self.assertEqual(psl.publicsuffix("example.com"), "com")

    def test_flatbytestring(self):
        psl = PublicSuffixList(b("com\nnet\n"))
        self.assertEqual(psl.publicsuffix("example.com"), "com")
Code example #43
File: test.py Project: meschansky/psl
 def test_flatbytestring(self):
     psl = PublicSuffixList(b("com\nnet\n"))
     self.assertEqual(psl.publicsuffix("example.com"), "com")
Code example #44
File: expert.py Project: CZ-NIC/intelmq
 def init(self):
     self.field = self.parameters.field
     if self.field not in ALLOWED_FIELDS:
         raise InvalidArgument('key', got=self.field, expected=ALLOWED_FIELDS)
     with codecs.open(self.parameters.suffix_file, encoding='UTF-8') as file_handle:
         self.psl = PublicSuffixList(source=file_handle, only_icann=True)
Code example #45
def subresource_integrity(reqs: dict, expectation='sri-implemented-and-external-scripts-loaded-securely') -> dict:
    """
    :param reqs: dictionary containing all the request and response objects
    :param expectation: test expectation
        sri-implemented-and-all-scripts-loaded-securely: all same origin, and uses SRI
        sri-implemented-and-external-scripts-loaded-securely: integrity attribute exists on all external scripts,
          and scripts loaded [default for HTML]
        sri-implemented-but-external-scripts-not-loaded-securely: SRI implemented, but with scripts loaded over HTTP
        sri-not-implemented-but-external-scripts-loaded-securely: SRI isn't implemented,
          but all scripts are loaded over HTTPS
        sri-not-implemented-and-external-scripts-not-loaded-securely: SRI isn't implemented,
          and scripts are downloaded over HTTP
        sri-not-implemented-but-all-scripts-loaded-from-secure-origin: SRI isn't implemented,
          but all scripts come from secure origins (self)
        sri-not-implemented-but-no-scripts-loaded: SRI isn't implemented, because the page doesn't load any scripts
        sri-not-implemented-response-not-html: SRI isn't needed, because the page isn't HTML [default for non-HTML]
        request-did-not-return-status-code-200: Only look for SRI on pages that returned 200, not things like 404s
        html-not-parsable: Can't parse the page's content
    :return: dictionary with:
        data: all external scripts and their integrity / crossorigin attributes
        expectation: test expectation
        pass: whether the site's external scripts met expectations
        result: short string describing the result of the test
    """
    output = {
        'data': {},
        'expectation': expectation,
        'pass': False,
        'result': None,
    }
    response = reqs['responses']['auto']

    # The order of how "good" the results are
    goodness = ['sri-implemented-and-all-scripts-loaded-securely',
                'sri-implemented-and-external-scripts-loaded-securely',
                'sri-implemented-but-external-scripts-not-loaded-securely',
                'sri-not-implemented-but-external-scripts-loaded-securely',
                'sri-not-implemented-and-external-scripts-not-loaded-securely',
                'sri-not-implemented-response-not-html']

    # If the response to get / fails
    if response.status_code != 200:
        output['result'] = 'request-did-not-return-status-code-200'

    # If the content isn't HTML, there's no scripts to load; this is okay
    elif response.headers.get('Content-Type', '').split(';')[0] not in ('text/html', 'application/xhtml+xml'):
        output['result'] = 'sri-not-implemented-response-not-html'

    else:
        # Try to parse the HTML
        try:
            soup = bs(reqs['resources']['/'], 'html.parser')
        except:
            output['result'] = 'html-not-parsable'
            return output

        # Track to see if any scripts were on foreign TLDs
        scripts_on_foreign_origin = False

        # Get all the scripts
        scripts = soup.find_all('script')
        for script in scripts:
            if script.has_attr('src'):
                # Script tag parameters
                src = urlparse(script['src'])
                integrity = script.get('integrity')
                crossorigin = script.get('crossorigin')

                # Check to see if they're on the same second-level domain
                # TODO: update the PSL list on startup
                psl = PublicSuffixList()
                samesld = True if (psl.privatesuffix(urlparse(response.url).netloc) ==
                                   psl.privatesuffix(src.netloc)) else False

                # Check to see if it's the same origin or second-level domain
                if src.netloc == '' or samesld:
                    secureorigin = True
                elif src.netloc != '' and '.' not in src.netloc:  # like localhost
                    secureorigin = False
                    scripts_on_foreign_origin = True
                else:
                    secureorigin = False
                    scripts_on_foreign_origin = True

                # See if it's a secure scheme
                if src.scheme == 'https' or (src.scheme == '' and urlparse(response.url).scheme == 'https'):
                    securescheme = True
                else:
                    securescheme = False

                # Add it to the scripts data result, if it's not a relative URI
                if not secureorigin:
                    output['data'][script['src']] = {
                                                        'crossorigin': crossorigin,
                                                        'integrity': integrity
                                                    }

                    if integrity and not securescheme:
                        output['result'] = only_if_worse('sri-implemented-but-external-scripts-not-loaded-securely',
                                                         output['result'],
                                                         goodness)
                    elif not integrity and securescheme:
                        output['result'] = only_if_worse('sri-not-implemented-but-external-scripts-loaded-securely',
                                                         output['result'],
                                                         goodness)
                    elif not integrity and not securescheme:
                        output['result'] = only_if_worse('sri-not-implemented-and-external-scripts'
                                                         '-not-loaded-securely',
                                                         output['result'],
                                                         goodness)

                # Grant bonus even if they use SRI on the same origin
                else:
                    if integrity and securescheme and not output['result']:
                        output['result'] = 'sri-implemented-and-all-scripts-loaded-securely'

        # If the page doesn't load any scripts
        if not scripts:
            output['result'] = 'sri-not-implemented-but-no-scripts-loaded'

        # If all the scripts are loaded from a secure origin, not triggering a need for SRI
        elif scripts and not scripts_on_foreign_origin and not output['result']:
            output['result'] = 'sri-not-implemented-but-all-scripts-loaded-from-secure-origin'

        # If the page loaded from a foreign origin, but everything included SRI
        elif scripts and scripts_on_foreign_origin and not output['result']:
            output['result'] = only_if_worse('sri-implemented-and-external-scripts-loaded-securely',
                                             output['result'],
                                             goodness)

    # Code defensively on the size of the data
    output['data'] = output['data'] if len(str(output['data'])) < 32768 else {}

    # Check to see if the test passed or failed
    if output['result'] in ('sri-implemented-and-all-scripts-loaded-securely',
                            'sri-implemented-and-external-scripts-loaded-securely',
                            'sri-not-implemented-response-not-html',
                            'sri-not-implemented-but-all-scripts-loaded-from-secure-origin',
                            'sri-not-implemented-but-no-scripts-loaded',
                            expectation):
        output['pass'] = True

    return output