def reduce_domain(domain_in):
    """Reduce a domain name to its registrable (private-suffix) form.

    :param domain_in: the fully-qualified domain name to trim.
    :return: the lower-cased registrable domain, or None when the input
        has no known public suffix.
    """
    # Reuse a single PublicSuffixList instance; the original built the
    # (expensive) list twice per call.
    psl = PublicSuffixList()
    if not psl.publicsuffix(domain_in, accept_unknown=False):
        return None
    domain = psl.privatesuffix(domain_in)
    if domain:
        domain = domain.lower()
    else:
        # BUG FIX: the original logged `domain`, which is None/empty on this
        # branch; log the input that actually lacked an eTLD.
        log.debug("No eTLD for {}".format(domain_in))
    log.debug("Trimmed domain from {0} to {1}".format(domain_in, domain))
    return domain
def test_deny_unknown(self):
    """A TLD absent from the source must yield no suffix when accept_unknown is off."""
    rules = """
known
"""
    psl = PublicSuffixList(rules.splitlines(), accept_unknown=False)
    self.assertEqual(psl.suffix("www.example.unknowntld"), None)
def is_email_blacklisted(address):
    """
    Determines if a supplied email address is present in the 'emailblacklist' table.

    Parameters:
        address: The email address to split out the domain from.

    Returns:
        Boolean True if present on the blacklist, or False otherwise.
    """
    _, domain = address.rsplit("@", 1)
    private_suffix = PublicSuffixList().privatesuffix(domain=domain)

    # Check the disposable email address list.
    if private_suffix in _retrieve_disposable_email_domains():
        return True

    # Check the explicitly defined/blacklisted domains.
    # IMPROVEMENT: filter in SQL with a parameterized EXISTS query instead of
    # fetching the whole table and scanning it in Python (consistent with the
    # other variant of this function in this codebase).
    return bool(d.engine.scalar(
        "SELECT EXISTS (SELECT FROM emailblacklist WHERE domain_name = %(domain)s)",
        domain=private_suffix,
    ))
def init(self):
    """Validate the configured field and load the ICANN-only suffix list."""
    if self.field not in ALLOWED_FIELDS:
        raise InvalidArgument('key', got=self.field, expected=ALLOWED_FIELDS)
    with codecs.open(self.suffix_file, encoding='UTF-8') as source:
        self.psl = PublicSuffixList(source=source, only_icann=True)
def load_psl():
    """Download the Public Suffix List and store it in the module-global `psl`."""
    global psl
    print('Loading Public Suffix List')
    downloaded = pfetch()
    psl = PublicSuffixList(downloaded)
def updatePSL(psl_file=PSLFILE):
    """ Updates a local copy of PSL file

    Downloads the current list, sanity-checks that it parses, then
    atomically replaces the local file and preserves the upstream
    last-modified timestamp.

    :param psl_file: path for the file to store the list. Default: PSLFILE
    """
    if requests is None:
        raise Exception("Please install python-requests http(s) library. $ sudo pip install requests")
    r = requests.get(PSLURL)
    if r.status_code != requests.codes.ok or len(r.content) == 0:
        raise Exception("Could not download PSL from " + PSLURL)
    lastmod = r.headers.get("last-modified", None)

    # FIX: write via `with` so the handle is closed even on error; the
    # original used bare open()/close() with no exception safety.
    swp = psl_file + ".swp"
    with open(swp, "wb") as f:
        f.write(r.content)
    # Parse the freshly written file as a sanity check before installing it
    # (the original bound the result to an unused local).
    with open(swp, "rb") as f:
        PublicSuffixList(f)
    os.rename(swp, psl_file)

    if lastmod:
        t = time.mktime(parsedate(lastmod))
        os.utime(psl_file, (t, t))
    print("PSL updated")
    if lastmod:
        print("last-modified: " + lastmod)
def get_tld_esld(PSL, DOMAIN):
    """Return the (TLD, effective-second-level-domain) pair for DOMAIN.

    Both values are None when DOMAIN is not a string or is empty after
    stripping leading/trailing dots.
    """
    if not isinstance(DOMAIN, string_types):
        return None, None

    # Drop any leading/trailing '.' characters in one pass.
    DOMAIN = DOMAIN.strip(".")
    if not DOMAIN:
        return None, None

    try:
        # information about TLDs
        tld = PSL.publicsuffix(DOMAIN)
    except Exception:
        # Fall back to a freshly built default list if the supplied object fails.
        tld = PublicSuffixList().publicsuffix(DOMAIN)

    if tld is None:
        return tld, None
    if tld == DOMAIN:
        return tld, tld

    # ESLD = rightmost label in front of the TLD, rejoined with the TLD.
    remainder = DOMAIN[:-len(tld) - 1]
    esld = remainder[remainder.rfind(".") + 1:] + '.' + tld
    return tld, esld
def test_compatclass(self):
    """The compat wrapper mimics the legacy publicsuffix get_public_suffix API."""
    from publicsuffixlist.compat import PublicSuffixList
    psl = PublicSuffixList()
    for query, expected in (
        ("test.example.com", "example.com"),
        ("com", ""),
        ("", ""),
    ):
        self.assertEqual(psl.get_public_suffix(query), expected)
def decompose_filter(inputstring, psl=PublicSuffixList()):
    # NOTE(review): the PublicSuffixList() default is evaluated once at
    # definition time and shared across calls — presumably intentional
    # caching, but confirm; it also runs list parsing at import time.
    """Decompose a SQL-LIKE-style filter string into trigram-searchable parts.

    Returns a dict with the cleaned 'querystring' and two candidate
    substrings 'ts_q1'/'ts_q2' (longest usable literal runs), or None when
    nothing usable can be extracted or an error occurs.
    """
    logging.debug(f'Parsing "{inputstring}"')
    try:
        match_list = []
        querystring = inputstring
        # Clean input: keep only [-a-z0-9.%_], trim dots/spaces, lower-case.
        querystring = re.sub(r'(?i)[^-a-z0-9.%_]', '', querystring).strip('. ').lower()
        logging.debug(f'Cleaned input to "{querystring}"')
        if '_' in querystring:
            # '_' (single-char wildcard) is logged but otherwise passed through.
            logging.error(
                f'Single character wildcards are not handled yet. "{querystring}"'
            )
        if querystring.count('%') == 0:
            # No wildcard at all: the whole cleaned string is the query.
            ts_q1 = querystring
            ts_q2 = querystring
        else:
            # Check for usable strings at the start of the string
            leading_match = re.search(
                r'^(?P<q_lead>[-a-z0-9.]+)(?:[%_.]*[%_])', querystring)
            if leading_match:
                match_list.append(leading_match.group('q_lead') + ':*')
            # Check for usable strings in the middle of the string
            mid_match_list = re.findall(
                r'(?<=[%_]\.)(?P<q_mid>[-a-z0-9.]+)(?:[%_.]*[%_])', querystring)
            if mid_match_list:
                mid_match_list = [m + ':*' for m in mid_match_list]
                match_list.extend(mid_match_list)
            # Check for usable strings at the end of the string
            trailing_match = re.search(
                r'(?<=[%_]\.)(?P<q_trail>[-a-z0-9.]+[-a-z0-9])$', querystring)
            if trailing_match:
                # Only keep a trailing fragment that is a real private domain.
                if psl.is_private(trailing_match.group('q_trail')):
                    match_list.append(trailing_match.group('q_trail'))
            if match_list:
                # Deduplicate, then prefer the longest fragments (ignoring a
                # leading run of 'w' — presumably to de-weight 'www'; confirm)
                # and the ':*' prefix-search suffix when ranking.
                match_list = list(set(match_list))
                match_list.sort(key=lambda x: len(x.lstrip('w').rstrip(':*')),
                                reverse=True)
                ts_long_list = match_list[:2]
                ts_q1 = ts_long_list[0]
                ts_q2 = ts_long_list[-1]
            else:
                logging.error(
                    f'Could not extract usable querystring on "{inputstring}"')
                return
    except Exception as e:
        logging.error(f'Error on "{inputstring}", "{e}"')
        return
    return_dict = {
        'querystring': querystring,
        'ts_q1': ts_q1,
        'ts_q2': ts_q2,
    }
    return return_dict
def check(parameters):
    """Verify that parameters['suffix_file'] names a readable, parseable PSL file.

    Returns a list of [level, message] pairs describing the failure, or
    None (implicitly) when the file loads cleanly.
    """
    path = parameters.get('suffix_file', '')
    if not os.path.exists(path):
        return [[
            "error",
            "File given as parameter 'suffix_file' does not exist."
        ]]
    try:
        with codecs.open(path, encoding='UTF-8') as database:
            PublicSuffixList(source=database, only_icann=True)
    except Exception as exc:
        return [["error", "Error reading database: %r." % exc]]
def get_domain_name(url: str) -> str:
    """
    Gets the domain name of a URL, removing the TLD

    :param url: URL to find domain of
    :return: the label immediately before the public suffix, or "" when the
        host has no recognisable suffix or is itself a bare suffix.
    """
    # Sanitise the URL, removing protocol, path and port
    url = url.split("://")[-1]
    url = url.split("/")[0]
    url = url.split(":")[0]

    # Get the public suffix
    url_tld = PublicSuffixList().publicsuffix(url)

    # ROBUSTNESS FIX: the original raised TypeError when publicsuffix()
    # returned None, and sliced off a trailing character when the host WAS
    # the suffix; in both cases there is no registrable label to return.
    if not url_tld or url == url_tld:
        return ""

    # Return the last part before the TLD
    return url[:-len(url_tld) - 1].split(".")[-1]
def static_num(file_path):
    """Count domains in `file_path` by how many labels precede the public suffix.

    Prints [one-label count, two-label count, three-or-more count].
    """
    psl = PublicSuffixList()
    counts = [0, 0, 0]
    with open(file_path, "r") as fh:
        for line in fh:
            domain = line.strip().split(",")[0]
            labels = domain[:domain.rindex(psl.publicsuffix(domain)) - 1].split(".")
            # Bucket 0/1/2 for 1, 2, and >=3 labels respectively.
            counts[min(len(labels), 3) - 1] += 1
    print(counts)
def get_whois(self, name):
    # Collect WHOIS-derived facts (IP, registrant, registrar, nameservers)
    # for self.target's domain into self.cms_list.
    # NOTE(review): this is Python 2 code (`except Exception, e`), and the
    # "Domain_User:" line below was redacted/corrupted at the source
    # (`"******"`); the enclosing try also appears to be missing its except
    # clause — this block does not parse as-is and needs reconstruction.
    try:
        domain = urlparse.urlparse(self.target).netloc
        # if domain is ip,stop querying domain.
        result1 = re.search("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", domain)
        if result1:
            return
        # remove port
        result2 = re.search("\:\d{1,5}$", domain)
        if result2:
            domain = domain.split(":")[0]
        # get domain's ip
        try:
            ip = socket.gethostbyname(domain)
            self.cms_list.add("IP:" + ip)
        except Exception, e:
            # print e
            pass
        # if re.match("^www\.",domain):
        # domain = domain.strip("www.")
        psl = PublicSuffixList()
        # Reduce to the registrable suffix before querying WHOIS.
        domain = psl.suffix(domain)
        who = pythonwhois.get_whois(domain)
        # get whois
        if who["contacts"]["registrant"]["name"] is not None:
            self.cms_list.add(
                "Domain_User:"******"contacts"]["registrant"]["name"].encode("utf8"))
        if who["contacts"]["registrant"]["email"] is not None:
            self.cms_list.add(
                "Domain_Email:" +
                who["contacts"]["registrant"]["email"].encode("utf8"))
        if who["contacts"]["registrant"]["phone"] is not None:
            self.cms_list.add(
                "Domain_Phone:" +
                who["contacts"]["registrant"]["phone"].encode("utf8"))
        if who["registrar"] is not None:
            self.cms_list.add("Domain_Registrar:" +
                              who["registrar"][0].encode("utf8"))
        if who["nameservers"] is not None:
            name_servers = []
            for i in who["nameservers"]:
                name_servers.append(i.encode('UTF8'))
            # NOTE: "Domai_name_servers" typo is preserved from the original
            # (it is a runtime string other code may match on).
            self.cms_list.add("Domai_name_servers:" +
                              str(name_servers).encode("utf8"))
def check_for_third_level_domains(filename):
    """Exit(1) if any entry in files[filename] has more than one private label."""
    with open("public_suffix_list.dat", "r") as latest:
        psl = PublicSuffixList(latest)
    offending = {
        entry
        for entry in files[filename]
        if len(psl.privateparts(entry.strip())) > 1
    }
    if not offending:
        return
    print(
        "The following domains contain a third or lower level domain in {!r}:"
        .format(filename))
    for entry in sorted(offending):
        print("* {}".format(entry))
    sys.exit(1)
def getBenign(filepath):
    """Build benign train/pred domain splits from `filepath`.

    Reads one 'domain:<rest>' record per line, shuffles the domains, and
    writes a {"train": ..., "pred": ...} JSON split to
    ../result_data/yd_nf_data.json.
    """
    # CLEANUP: removed unused PublicSuffixList()/Filter() locals (the latter
    # shadowed the builtin `filter`) and a large block of commented-out
    # whitelist-sampling code from the original.
    domains = []
    with open(filepath, "r") as f:
        for line in f:
            # Keep only the domain part of each record.
            domains.append(line.strip().split(":")[0])

    random.shuffle(domains)

    # Fixed-size split: first 23600 for training, next 5900 for prediction.
    result = {
        "train": domains[:23600],
        "pred": domains[23600:29500],
    }
    with open("../result_data/yd_nf_data.json", "w") as f:
        f.write(json.dumps(result))
    print(len(domains))
def test_custom_psl(self):
    """Wildcard (*) and exception (!) rules in a custom source are honoured."""
    rules = """
invalid
*.invalid
!test.invalid
"""
    psl = PublicSuffixList(rules.splitlines())
    for domain, expected in (
        ("example.invalid", None),
        ("test.invalid", "test.invalid"),
        ("some.test.invalid", "test.invalid"),
        ("aaa.bbb.ccc.invalid", "bbb.ccc.invalid"),
    ):
        self.assertEqual(psl.suffix(domain), expected)
    self.assertEqual(psl.publicsuffix("example.invalid"), "example.invalid")
    self.assertEqual(psl.publicsuffix("test.invalid"), "invalid")
def filter2LDAleax():
    """Keep only Alexa domains with exactly one label before the public suffix."""
    psl = PublicSuffixList()
    kept = []
    with open("../data_sets/Aleax", "r") as f:
        for line in f:
            domain = line.strip()
            prefix = domain[:domain.rindex(psl.publicsuffix(domain)) - 1]
            if len(prefix) == 0:
                continue
            parts = prefix.split(".")
            if len(parts) == 1 and len(parts[0]) != 0:
                kept.append(domain)
    print(len(kept))
    with open("../data_sets/Aleax2LD", "w") as f:
        f.write("\n".join(kept))
def getAllDomainLabels(self, domains):
    """Split each domain's pre-suffix portion into individual labels.

    Returns (labels, index): every label found, plus the position in
    `domains` that each label came from.
    """
    psl = PublicSuffixList()
    labels, index = [], []
    for pos, raw in enumerate(domains):
        domain = raw.strip()
        pub = psl.publicsuffix(domain)
        parts = domain[:domain.rindex(pub) - 1].split(".")
        if len(parts) > 2:
            print("d:{} pub:{}".format(domain, pub))
        for label in parts:
            if len(label) == 0:
                print("kong kong")
            labels.append(label)
            index.append(pos)
    return labels, index
def check_hsts_preload(url: str) -> List[dict]:
    """Query hstspreload.com for the URL's domain and each parent domain.

    Walks up the label chain one label at a time, stopping once only the
    public suffix remains. IP-address hosts are skipped entirely.
    """
    hsts_service = "https://hstspreload.com/api/v1/status/"
    results: List[dict] = []

    domain = utils.get_domain(url)
    if checkers.is_ip_address(domain):
        return results

    while domain.count(".") > 0:
        # get the HSTS preload status for the domain
        res, _ = network.http_json(f"{hsts_service}{domain}")
        results.append(res)
        domain = domain.split(".", 1)[-1]
        if PublicSuffixList().is_public(domain):
            break
    return results
def eventdb_apply(host, port, database, username, password, table, dry_run,
                  where, filename):
    """Fill source/destination domain_suffix columns for existing events.

    Selects rows with a source.fqdn or destination.fqdn (optionally
    restricted by `where`), computes the ICANN public suffix of each
    IDNA-encoded FQDN, and writes it back through a second,
    autocommitting connection.
    """
    if password:
        # SECURITY FIX: read the password without echoing it to the terminal;
        # the original used input(), which displays what is typed.
        from getpass import getpass
        password = getpass('Password for user %r on %r: ' % (username, host))
    where = 'AND ' + where if where else ''
    con1 = psycopg2.connect(user=username, password=password,
                            database=database, host=host, port=port)
    cur1 = con1.cursor(cursor_factory=DictCursor)
    con2 = psycopg2.connect(user=username, password=password,
                            database=database, host=host, port=port)
    con2.autocommit = True
    cur2 = con2.cursor(cursor_factory=DictCursor)
    # NOTE(review): `table`/`where` are interpolated into the SQL text — only
    # safe because they come from trusted operator CLI arguments.
    cur1.execute('''
        SELECT id, "source.fqdn", "destination.fqdn"
        FROM {table}
        WHERE ("source.fqdn" IS NOT NULL OR "destination.fqdn" IS NOT NULL)
        {where}
    '''.format(table=table, where=where))
    psl = PublicSuffixList(only_icann=True)
    counter = 0
    for row in cur1:
        counter += 1
        if row['source.fqdn']:
            cur2.execute(
                'update events set "source.domain_suffix" = %s where id = %s',
                (psl.publicsuffix(row['source.fqdn'].encode('idna').decode()),
                 row['id']))
        if row['destination.fqdn']:
            cur2.execute(
                'update events set "destination.domain_suffix" = %s where id = %s',
                (psl.publicsuffix(
                    row['destination.fqdn'].encode('idna').decode()),
                 row['id']))
    con2.commit()
    print("Changed %d rows" % counter)
def local_malicious_check(AGD_root="/home/public/2019-01-07-dgarchive_full"):
    """Write every domain from all_domain_list.txt that appears in the DGA archive.

    Loads the first column of every CSV under `AGD_root` into a set, then
    filters ../result_data/all_domain_list.txt against it and writes the
    matches to ../result_data/all_FQDN_AGD_in_traffic.
    """
    # CLEANUP: removed an unused PublicSuffixList instance from the original.
    AGD_set = set()
    for filename in os.listdir(AGD_root):
        filepath = "{}/{}".format(AGD_root, filename)
        df = pd.read_csv(filepath, header=None)
        print(filepath)
        AGD_set.update(df.iloc[:, 0])
        print(df.iloc[:2, 0])

    domains = []
    with open("../result_data/all_domain_list.txt", "r") as f:
        for line in f:
            d = line.strip()
            if d in AGD_set:
                domains.append(d)

    with open("../result_data/all_FQDN_AGD_in_traffic", "w") as f:
        f.write("\n".join(domains))
def get2subdomain(root_dir="/home/public/2019-01-07-dgarchive_full"):
    """Collect, per archive file, domains with exactly two labels before the suffix."""
    psl = PublicSuffixList()
    result = dict()
    for filename in os.listdir(root_dir):
        with open("{}/{}".format(root_dir, filename), "r") as f:
            for line in f:
                d = line.strip().split(",")[0]
                prefix_labels = d[:d.rindex(psl.publicsuffix(d)) - 1].split(".")
                if len(prefix_labels) == 2:
                    result.setdefault(filename, set()).add(d)
    for name, domains in result.items():
        print("{} : {}".format(name, len(domains)))
        print(list(domains)[:10])
def check_for_public_suffixes(filename):
    """Exit(1) if any line in files[filename] is nothing but a public suffix."""
    with open("public_suffix_list.dat", "r") as latest:
        psl = PublicSuffixList(latest)
    found_any = False
    for lineno, raw in enumerate(files[filename], start=1):
        candidate = raw.strip()
        if psl.publicsuffix(candidate) == candidate:
            print(
                f"The line number {lineno} contains just a public suffix: {candidate}"
            )
            found_any = True
    if found_any:
        print(
            "At least one valid public suffix found in {!r}, please "
            "remove it. See https://publicsuffix.org for details on why this "
            "shouldn't be blocklisted.".format(filename))
        sys.exit(1)
def domains_map_features(self, day):
    """Compute and persist features for every domain in `day`'s ip_dict."""
    with open("../result_data/{}/{}_ip_dict.json".format(day, day), 'r') as f:
        ip_dict = json.loads(f.read())

    # Union both domain lists of every IP entry.
    unique_domains = set()
    for pair in ip_dict.values():
        unique_domains.update(pair[0])
        unique_domains.update(pair[1])
    domain_list = list(unique_domains)
    print('domains number:{}'.format(len(domain_list)))

    features = self.get_features(domain_list, PublicSuffixList())
    np.save("../result_data/{}/{}_all_domain_features.npy".format(day, day),
            features)
    with open("../result_data/{}/{}_all_domain_list.txt".format(day, day),
              "w") as f:
        f.write('\n'.join(domain_list))
def lstm_getSingleFea(d: str):
    """Encode the pre-suffix part of domain `d` as a fixed 64-slot word vector."""
    psl = PublicSuffixList()
    prefix = d[:d.rindex(psl.publicsuffix(d)) - 1].replace(".", "")
    vector = np.zeros(64)
    if not prefix:
        return vector

    # Segment with bidirectional maximum matching.
    words = CutWords().max_biward_cut(prefix)

    # Fill the vector right-to-left with word codes, truncating on the left
    # when there are more than 64 words.
    slot = 63
    for word in reversed(words):
        vector[slot] = CutWords.order[word]
        slot -= 1
        if slot < 0:
            break
    return vector
def is_email_blacklisted(address):
    """
    Determines if a supplied email address is present in the 'emailblacklist' table.

    Parameters:
        address: The email address to split out the domain from.

    Returns:
        Boolean True if present on the blacklist, or False otherwise.
    """
    domain = address.rsplit("@", 1)[1]
    private_suffix = PublicSuffixList().privatesuffix(domain=domain)

    # Check the disposable email address list first.
    if private_suffix in DISPOSABLE_DOMAINS:
        return True

    # Fall back to the explicitly blacklisted domains table.
    return d.engine.scalar(
        "SELECT EXISTS (SELECT FROM emailblacklist WHERE domain_name = %(domain)s)",
        domain=private_suffix,
    )
def init(self, config=None):
    # Initialise the Flask app, database bindings, migration hook, worker
    # pool, product-URL regex rules, and the shared PublicSuffixList.
    # NOTE(review): `iteritems()` means this module is Python 2 code.
    if config is not None:
        self.config = load_config_obj(config)
    app = Flask('product_identifier')
    app.config.from_object(self.config)
    self.__flask = app
    self.__db = SQLAlchemy(self.__flask)
    Migrate(self.__flask, self.db)
    # Bounded gevent pool sized from configuration.
    self.handler_pool = gevent.pool.Pool(
        self.config.MASTER_HANDLER_POOL_SIZE)
    # Compile the product-matching regexes declared in ruleset.json.
    with open(os.path.join(self.config.DATA_DIR, "ruleset.json"), "r") as f:
        rule_set = json.load(f)
    self.product_patterns = []
    for name, pattern in rule_set["rules"].iteritems():
        self.product_patterns.append(re.compile(pattern))
    self.__psl = PublicSuffixList()
def createdataset(type="train",
                  AGD_file="../data_sets/split_AGDs",
                  BD_file="../data_sets/split_benign_nx.json",
                  datasetname="nx_train_data"):
    """Build a labelled AGD/benign label-level dataset and save features + CSV.

    :param type: "train" selects index 0 of each AGD entry, anything else index 1.
    :param AGD_file: JSON of {family: [train_list, test_list]} AGD domains.
    :param BD_file: JSON with "train"/"pred" benign domain lists.
    :param datasetname: basename for the .npy features and .csv outputs.
    """
    v_index = 0 if type == "train" else 1
    psl = PublicSuffixList()
    with open(AGD_file, "r") as f:
        AGD_dict = json.loads(f.read())
    with open(BD_file, "r") as f:
        bd_dict = json.loads(f.read())

    # Collect the individual pre-suffix labels of each domain class.
    allAGDs = set()
    allBDs = set()
    for k, v in AGD_dict.items():
        for d in v[v_index]:
            pre_d = d[:d.rindex(psl.publicsuffix(d)) - 1]
            for l in pre_d.split("."):
                allAGDs.add(l)
    for d in bd_dict[type]:
        pre_d = d[:d.rindex(psl.publicsuffix(d)) - 1]
        for l in pre_d.split("."):
            allBDs.add(l)

    length = len(allAGDs)
    print(length)
    # Balance classes: cap benign labels at the AGD count.
    allBDs = list(allBDs)[:length]
    allAGDs = list(allAGDs)
    alldomains = allAGDs + allBDs
    # BUG FIX: the original emitted `length` zeros even when fewer benign
    # labels exist, misaligning labels with domains/features; size each label
    # run by its actual class size.
    alllabels = list(np.ones(len(allAGDs))) + list(np.zeros(len(allBDs)))

    allfeatures = extract_all_features(alldomains)
    np.save("../data_sets/{}_features.npy".format(datasetname), allfeatures)

    df = pd.DataFrame(data={
        "domains": pd.Series(alldomains, dtype='str'),
        "labels": pd.Series(alllabels, dtype='int32'),
    })
    df.to_csv("../data_sets/{}.csv".format(datasetname), index=False)
def dga_static_num(file_path):
    """Count DGA domains by the number of labels preceding the public suffix.

    `file_path` holds JSON of {family: [list0, list1]}; both lists of every
    family are tallied into [1 label, 2 labels, 3+ labels] and printed.
    """
    psl = PublicSuffixList()
    result = [0, 0, 0]
    with open(file_path, "r") as f:
        # CLEANUP: the original bound this to `map`, shadowing the builtin.
        family_map = json.loads(f.read())
    for v in family_map.values():
        # DEDUPLICATION: the original repeated this whole loop body for v[0]
        # and v[1]; count both lists in one pass.
        for d in v[0] + v[1]:
            d_strip = d[:d.rindex(psl.publicsuffix(d)) - 1].split(".")
            if len(d_strip) == 1:
                result[0] += 1
            elif len(d_strip) == 2:
                result[1] += 1
            else:
                result[2] += 1
    print(result)
def static_1_2(root_dir="/home/public/2019-01-07-dgarchive_full"):
    """Bucket archive domains per file into 1-label and 2-label subdomain sets.

    Writes the result as JSON to ../result_data/dga_data.json.
    """
    psl = PublicSuffixList()
    result = dict()
    for filename in os.listdir(root_dir):
        df = pd.read_csv(os.path.join(root_dir, filename), header=None,
                         error_bad_lines=False)
        buckets = result.setdefault(filename, [set(), set()])
        for d in df.iloc[:, 0]:
            pub_d = psl.publicsuffix(d)
            if d == pub_d:
                # Bare public suffix: nothing in front of it to classify.
                continue
            labels = d[:d.rindex(pub_d) - 1].split(".")
            if len(labels) == 1:
                buckets[0].add(d)
            elif len(labels) == 2:
                buckets[1].add(d)
            else:
                print("Wow : {}".format(d))
        print("{} finish".format(filename))
    print("write")
    with open("../result_data/dga_data.json", "w") as f:
        f.write(json.dumps(result, cls=MyJsonEncoder))