def test_custom_psl(self):
    """A user-supplied rule set (plain, wildcard and exception rules) is honored."""
    rules = """
invalid
*.invalid
!test.invalid
"""
    psl = PublicSuffixList(rules.splitlines())
    # "invalid" + "*.invalid" make two public levels; "!test.invalid" carves out an exception.
    self.assertEqual(psl.suffix("example.invalid"), None)
    self.assertEqual(psl.suffix("test.invalid"), "test.invalid")
    self.assertEqual(psl.suffix("some.test.invalid"), "test.invalid")
    self.assertEqual(psl.suffix("aaa.bbb.ccc.invalid"), "bbb.ccc.invalid")
    self.assertEqual(psl.publicsuffix("example.invalid"), "example.invalid")
    self.assertEqual(psl.publicsuffix("test.invalid"), "invalid")
class DomainSuffixExpertBot(Bot):
    """Annotates events with the public suffix of the configured FQDN/domain field.

    Reads ``parameters.field`` (validated against ALLOWED_FIELDS) and a local
    public-suffix file (``parameters.suffix_file``, ICANN section only).
    """

    suffixes = {}

    def init(self):
        self.field = self.parameters.field
        if self.field not in ALLOWED_FIELDS:
            raise InvalidArgument('key', got=self.field, expected=ALLOWED_FIELDS)
        with codecs.open(self.parameters.suffix_file, encoding='UTF-8') as file_handle:
            self.psl = PublicSuffixList(source=file_handle, only_icann=True)

    def process(self):
        event = self.receive_message()
        for space in ('source', 'destination'):
            lookup_key = '.'.join((space, self.field))
            if lookup_key in event:
                target_key = '.'.join((space, 'domain_suffix'))
                event[target_key] = self.psl.publicsuffix(domain=event[lookup_key])
        self.send_message(event)
        self.acknowledge_message()
class Filter():
    """Domain filter combining SingleFilter validity/whitelist checks with a
    2LD/3LD structural test backed by the public suffix list."""

    def __init__(self):
        # accept_unknown=False: names under unlisted TLDs get no suffix (None).
        self.psl = PublicSuffixList(accept_unknown=False)
        self.sf = SingleFilter(100000, self.psl)

    def isValidDomain(self, domain: str):
        """Return True when *domain* passes SingleFilter and is not whitelisted."""
        # Idiom fix: return the boolean expression directly instead of
        # if/else returning True/False.
        return self.sf.isValidDomain(domain) and not self.sf.inWhiteList(domain)

    def Two_Three_level_domain(self, domain: str):
        """
        identify a domain

        Return True when *domain* has exactly one or two labels in front of
        its public suffix (i.e. it is a 2nd- or 3rd-level domain).

        :param domain: domain:str
        :return: bool
        """
        publicsuffix = self.psl.publicsuffix(domain)
        if publicsuffix is None:  # idiom fix: "is None" instead of "== None"
            return False
        # Strip the suffix and its separating dot.
        pre_domain = domain[:domain.rindex(publicsuffix) - 1]
        if len(pre_domain) == 0:  # domain is exactly the public suffix
            return False
        return len(pre_domain.split(".")) in (1, 2)
def eventdb_apply(host, port, database, username, password, table, dry_run,
                  where, filename):
    """Backfill ``"source.domain_suffix"``/``"destination.domain_suffix"`` for
    every row of *table* that has a source or destination FQDN.

    NOTE(review): *dry_run* and *filename* are accepted but unused here; they
    are kept for interface compatibility with the caller.
    """
    if password:
        # BUG FIX: use getpass so the password is not echoed to the terminal
        # (input() displays what the user types).
        import getpass
        password = getpass.getpass('Password for user %r on %r: ' % (username, host))
    where = 'AND ' + where if where else ''
    # Two connections: cur1 streams the SELECT while cur2 issues UPDATEs.
    con1 = psycopg2.connect(user=username, password=password, database=database,
                            host=host, port=port)
    cur1 = con1.cursor(cursor_factory=DictCursor)
    con2 = psycopg2.connect(user=username, password=password, database=database,
                            host=host, port=port)
    con2.autocommit = True
    cur2 = con2.cursor(cursor_factory=DictCursor)
    cur1.execute('''
        SELECT id, "source.fqdn", "destination.fqdn"
        FROM {table}
        WHERE ("source.fqdn" IS NOT NULL OR "destination.fqdn" IS NOT NULL)
        {where}
    '''.format(table=table, where=where))
    psl = PublicSuffixList(only_icann=True)
    counter = 0
    for row in cur1:
        counter += 1
        if row['source.fqdn']:
            # BUG FIX: the UPDATE previously hardcoded the "events" table while
            # the SELECT used the *table* parameter; keep them consistent.
            cur2.execute(
                'UPDATE {table} SET "source.domain_suffix" = %s WHERE id = %s'.format(table=table),
                (psl.publicsuffix(row['source.fqdn'].encode('idna').decode()),
                 row['id']))
        if row['destination.fqdn']:
            cur2.execute(
                'UPDATE {table} SET "destination.domain_suffix" = %s WHERE id = %s'.format(table=table),
                (psl.publicsuffix(row['destination.fqdn'].encode('idna').decode()),
                 row['id']))
    con2.commit()
    print("Changed %d rows" % counter)
def static_num(file_path):
    """Count the domains in *file_path* (first CSV column per line) by how many
    labels sit left of the public suffix, printing [one, two, three-or-more]."""
    psl = PublicSuffixList()
    counts = [0, 0, 0]
    with open(file_path, "r") as fh:
        for line in fh:
            domain = line.strip().split(",")[0]
            prefix = domain[:domain.rindex(psl.publicsuffix(domain)) - 1]
            depth = len(prefix.split("."))
            # Bucket: depth 1 -> 0, depth 2 -> 1, deeper -> 2.
            counts[min(depth, 3) - 1] += 1
    print(counts)
def get_domain_name(url: str) -> str:
    """
    Gets the domain name of a URL, removing the TLD

    :param url: URL to find domain of
    """
    # Sanitise the URL: drop protocol, path and port to isolate the host.
    host = url.split("://")[-1].split("/")[0].split(":")[0]
    # Determine the public suffix of the host.
    url_tld = PublicSuffixList().publicsuffix(host)
    # Drop the suffix plus its dot, keep the right-most remaining label.
    # NOTE(review): assumes a suffix was found; a None suffix would raise here.
    return host[:-len(url_tld) - 1].split(".")[-1]
def createdataset(type="train", AGD_file="../data_sets/split_AGDs",
                  BD_file="../data_sets/split_benign_nx.json",
                  datasetname="nx_train_data"):
    """Build a labelled per-label dataset from AGD and benign-domain files and
    write features (.npy) plus a domains/labels CSV under ../data_sets/.

    :param type: "train" selects index 0 of each AGD entry, anything else index 1
    """
    v_index = 0 if type == "train" else 1
    psl = PublicSuffixList()
    with open(AGD_file, "r") as f:
        AGD_dict = json.loads(f.read())
    with open(BD_file, "r") as f:
        bd_dict = json.loads(f.read())
    allAGDs = set()
    allBDs = set()
    # Collect every label left of the public suffix, per class.
    for k, v in AGD_dict.items():
        for d in v[v_index]:
            pre_d = d[:d.rindex(psl.publicsuffix(d)) - 1]
            for l in pre_d.split("."):
                allAGDs.add(l)
    for d in bd_dict[type]:
        pre_d = d[:d.rindex(psl.publicsuffix(d)) - 1]
        for l in pre_d.split("."):
            allBDs.add(l)
    length = len(allAGDs)
    print(length)
    # Balance the benign side against the AGD side.
    allBDs = list(allBDs)[:length]
    allAGDs = list(allAGDs)
    alldomains = allAGDs + allBDs
    # BUG FIX: label counts must track the actual list sizes; the original
    # used `length` for both, producing a label vector longer than the data
    # whenever fewer than `length` benign labels exist.
    alllabels = list(np.ones(len(allAGDs))) + list(np.zeros(len(allBDs)))
    allfeatures = extract_all_features(alldomains)
    np.save("../data_sets/{}_features.npy".format(datasetname), allfeatures)
    data = dict()
    data["domains"] = pd.Series(alldomains, dtype='str')
    data["labels"] = pd.Series(alllabels, dtype='int32')
    df = pd.DataFrame(data=data)
    df.to_csv("../data_sets/{}.csv".format(datasetname), index=False)
def dga_static_num(file_path):
    """Count domains from a JSON map {family: [list0, list1]} by how many
    labels precede the public suffix, printing [one, two, three-or-more]."""
    psl = PublicSuffixList()
    counts = [0, 0, 0]

    def _tally(domain):
        # Labels left of the public suffix decide the bucket.
        labels = domain[:domain.rindex(psl.publicsuffix(domain)) - 1].split(".")
        counts[min(len(labels), 3) - 1] += 1

    with open(file_path, "r") as fh:
        mapping = json.loads(fh.read())
    for family, pair in mapping.items():
        for d in pair[0]:
            _tally(d)
        for d in pair[1]:
            _tally(d)
    print(counts)
def getAllDomainLabels(self, domains):
    """Split each domain into the labels left of its public suffix.

    Returns (labels, index): index[j] is the position in *domains* that
    labels[j] was taken from.
    """
    labels = []
    index = []
    psl = PublicSuffixList()
    for i, raw in enumerate(domains):
        d = raw.strip()
        pub = psl.publicsuffix(d)
        parts = d[:d.rindex(pub) - 1].split(".")
        if len(parts) > 2:
            # Diagnostic: unusually deep prefix.
            print("d:{} pub:{}".format(d, pub))
        for part in parts:
            if len(part) == 0:
                # Diagnostic: empty label encountered.
                print("kong kong")
            labels.append(part)
            index.append(i)
    return labels, index
def filter2LDAleax():
    """Keep the Alexa domains whose prefix (left of the public suffix) is a
    single non-empty label and write them to ../data_sets/Aleax2LD."""
    psl = PublicSuffixList()
    kept = []
    with open("../data_sets/Aleax", "r") as fh:
        for line in fh:
            domain = line.strip()
            prefix = domain[:domain.rindex(psl.publicsuffix(domain)) - 1]
            if not prefix:
                continue  # domain is exactly its public suffix
            parts = prefix.split(".")
            if len(parts) == 1 and len(parts[0]) != 0:
                kept.append(domain)
    print(len(kept))
    with open("../data_sets/Aleax2LD", "w") as fh:
        fh.write("\n".join(kept))
def main(arguments):
    """Exit with status 1 when any line of disposable_email_blacklist.conf is a
    bare public suffix. *arguments* is accepted but unused."""
    suffix_detected = False
    download_suffixes()
    with open("public_suffix_list.dat", "r") as latest:
        psl = PublicSuffixList(latest)
    with io.open('disposable_email_blacklist.conf', 'r') as deb:
        for i, line in enumerate(deb):
            current_line = line.strip()
            # A line that equals its own public suffix is a whole TLD/suffix.
            if psl.publicsuffix(current_line) == current_line:
                print(f'The line number {i+1} contains just a public suffix: {current_line}')
                suffix_detected = True
    if suffix_detected:
        print('At least one valid public suffix found in the blacklist, please remove it. See https://publicsuffix.org for details on why this shouldn\'t be blacklisted.')
        sys.exit(1)
def get2subdomain(root_dir="/home/public/2019-01-07-dgarchive_full"):
    """Collect, per dgarchive file, the domains whose prefix left of the public
    suffix has exactly two labels; print the count and a 10-item sample."""
    result = dict()
    psl = PublicSuffixList()
    for filename in os.listdir(root_dir):
        with open("{}/{}".format(root_dir, filename), "r") as fh:
            for line in fh:
                d = line.strip().split(",")[0]
                prefix_labels = d[:d.rindex(psl.publicsuffix(d)) - 1].split(".")
                if len(prefix_labels) == 2:
                    # setdefault replaces the original get/None/insert dance.
                    result.setdefault(filename, set()).add(d)
    for k, v in result.items():
        print("{} : {}".format(k, len(v)))
        print(list(v)[:10])
def lstm_getSingleFea(d: str):
    """Encode the dot-stripped prefix of *d* (left of its public suffix) into a
    64-slot vector of word codes, filled right-to-left; all zeros when empty."""
    psl = PublicSuffixList()
    prefix = d[:d.rindex(psl.publicsuffix(d)) - 1].replace(".", "")
    vector = np.zeros(64)
    if not prefix:
        return vector
    cuter = CutWords()
    # Bidirectional maximum-matching segmentation of the prefix.
    wordlist = cuter.max_biward_cut(prefix)
    slot = 63
    # Right-align the word codes; stop once all 64 slots are used.
    for word in reversed(wordlist):
        vector[slot] = CutWords.order[word]
        slot -= 1
        if slot < 0:
            break
    return vector
def check_for_public_suffixes(filename):
    """Exit with status 1 when any line of files[filename] is a bare public suffix."""
    lines = files[filename]
    suffix_detected = False
    with open("public_suffix_list.dat", "r") as latest:
        psl = PublicSuffixList(latest)
    for i, line in enumerate(lines):
        current_line = line.strip()
        # An entry equal to its own public suffix would block a whole TLD.
        if psl.publicsuffix(current_line) == current_line:
            print(
                f"The line number {i+1} contains just a public suffix: {current_line}"
            )
            suffix_detected = True
    if suffix_detected:
        print(
            "At least one valid public suffix found in {!r}, please "
            "remove it. See https://publicsuffix.org for details on why this "
            "shouldn't be blocklisted.".format(filename))
        sys.exit(1)
class DomainSuffixExpertBot(Bot):
    """Adds ``source.domain_suffix``/``destination.domain_suffix`` computed from
    the configured field using a local public-suffix file (ICANN section)."""

    suffixes = {}

    def init(self):
        self.field = self.parameters.field
        if self.field not in ALLOWED_FIELDS:
            raise InvalidArgument('key', got=self.field, expected=ALLOWED_FIELDS)
        with codecs.open(self.parameters.suffix_file, encoding='UTF-8') as file_handle:
            self.psl = PublicSuffixList(source=file_handle, only_icann=True)

    def process(self):
        event = self.receive_message()
        for space in ('source', 'destination'):
            source_key = '.'.join((space, self.field))
            if source_key not in event:
                continue
            suffix_key = '.'.join((space, 'domain_suffix'))
            event[suffix_key] = self.psl.publicsuffix(domain=event[source_key])
        self.send_message(event)
        self.acknowledge_message()
def static_1_2(root_dir="/home/public/2019-01-07-dgarchive_full"):
    """Partition dgarchive domains per file into 1-label and 2-label prefixes
    and dump the result to ../result_data/dga_data.json."""
    psl = PublicSuffixList()
    result = dict()
    for filename in os.listdir(root_dir):
        df = pd.read_csv(os.path.join(root_dir, filename), header=None,
                         error_bad_lines=False)
        if filename not in result:
            result[filename] = [set(), set()]
        one_label, two_label = result[filename]
        for d in df.iloc[:, 0]:
            pub_d = psl.publicsuffix(d)
            if d == pub_d:
                continue  # domain is exactly its public suffix
            d_split = d[:d.rindex(pub_d) - 1].split(".")
            if len(d_split) == 1:
                one_label.add(d)
            elif len(d_split) == 2:
                two_label.add(d)
            else:
                print("Wow : {}".format(d))
        print("{} finish".format(filename))
    print("write")
    with open("../result_data/dga_data.json", "w") as f:
        f.write(json.dumps(result, cls=MyJsonEncoder))
import subprocess
import requests
from publicsuffixlist import PublicSuffixList

# Tally how many entries of the antimalware filter list fall under each
# public suffix and print the per-suffix counts.
psl = PublicSuffixList()
abusetld = {}
req = requests.get("https://raw.githubusercontent.com/iam-py-test/my_filters_001/main/antimalware.txt")
lines = req.text.split("\n")
for line in lines:
    # Skip "||"-anchored entries, comments ("!") and blank lines.
    if line.startswith("||") or line.startswith("!") or line == "":
        continue
    # Drop any "$option" part of the filter rule.
    domain = line.split("$")[0]
    suffix = psl.publicsuffix(domain)
    # BUG FIX: dict.get replaces the bare try/except, which silently
    # swallowed *every* exception, not just the missing-key case.
    abusetld[suffix] = abusetld.get(suffix, 0) + 1
for tld in abusetld:
    print("Domains for {}: {}".format(tld, abusetld[tld]))
class DomainSuffixExpertBot(ExpertBot):
    """Extract the domain suffix from a domain and save it in the the domain_suffix field. Requires a local file with valid domain suffixes"""
    field: str = None
    suffix_file: str = None  # TODO: should be pathlib.Path

    def init(self):
        # Validate the configured field and load the suffix database once.
        if self.field not in ALLOWED_FIELDS:
            raise InvalidArgument('key', got=self.field, expected=ALLOWED_FIELDS)
        with codecs.open(self.suffix_file, encoding='UTF-8') as file_handle:
            self.psl = PublicSuffixList(source=file_handle, only_icann=True)

    def process(self):
        # Annotate the event with the public suffix of the source/destination field.
        event = self.receive_message()
        for space in ('source', 'destination'):
            key = '.'.join((space, self.field))
            if key not in event:
                continue
            event['.'.join(
                (space, 'domain_suffix'))] = self.psl.publicsuffix(domain=event[key])
        self.send_message(event)
        self.acknowledge_message()

    @staticmethod
    def check(parameters):
        # Static configuration check: the suffix_file must exist and parse as
        # a public suffix list. Returns [[level, message]] on failure; falls
        # through (None) on success.
        if not os.path.exists(parameters.get('suffix_file', '')):
            return [[
                "error",
                "File given as parameter 'suffix_file' does not exist."
            ]]
        try:
            with codecs.open(parameters['suffix_file'], encoding='UTF-8') as database:
                PublicSuffixList(source=database, only_icann=True)
        except Exception as exc:
            return [["error", "Error reading database: %r." % exc]]

    @classmethod
    def run(cls, parsed_args=None):
        # Intercept --update-database before normal bot startup.
        if not parsed_args:
            parsed_args = cls._create_argparser().parse_args()
        if parsed_args.update_database:
            cls.update_database(verbose=parsed_args.verbose)
        else:
            super().run(parsed_args=parsed_args)

    @classmethod
    def _create_argparser(cls):
        # Extend the base parser with the database-maintenance flags.
        argparser = super()._create_argparser()
        argparser.add_argument("--update-database", action='store_true', help='downloads latest database data')
        argparser.add_argument("--verbose", action='store_true', help='be verbose')
        return argparser

    @classmethod
    def update_database(cls, verbose=False):
        # Download the latest public suffix list once and install it for every
        # configured bot of this module, then reload those bots.
        bots = {}
        runtime_conf = get_bots_settings()
        try:
            for bot in runtime_conf:
                if runtime_conf[bot]["module"] == __name__:
                    bots[bot] = runtime_conf[bot]["parameters"]["suffix_file"]
        except KeyError as e:
            # NOTE(review): message reconstructed from a garbled source line —
            # confirm exact wording against upstream.
            sys.exit("Database update failed. Your configuration of {0} is missing key {1}."
                     .format(bot, e))
        if not bots:
            if verbose:
                print(
                    "Database update skipped. No bots of type {0} present in runtime.conf."
                    .format(__name__))
            sys.exit(0)
        # we only need to import now. If there are no asn_lookup bots, this dependency does not need to be installed
        try:
            session = create_request_session()
            url = "https://publicsuffix.org/list/public_suffix_list.dat"
            if verbose:
                print("Downloading the latest database update...")
            response = session.get(url)
            if not response.ok:
                sys.exit("Database update failed. Server responded: {0}.\n"
                         "URL: {1}".format(response.status_code, response.url))
        except requests.exceptions.RequestException as e:
            sys.exit("Database update failed. Connection Error: {0}".format(e))
        # One download, possibly many destination paths (deduplicated).
        for database_path in set(bots.values()):
            database_dir = pathlib.Path(database_path).parent
            database_dir.mkdir(parents=True, exist_ok=True)
            with open(database_path, "wb") as database:
                database.write(response.content)
        if verbose:
            print("Database updated. Reloading affected bots.")
        ctl = IntelMQController()
        for bot in bots.keys():
            ctl.bot_reload(bot)
class TestPSL(unittest.TestCase):
    """Behavioral tests for publicsuffixlist.PublicSuffixList."""

    def setUp(self):
        # Default instance backed by the bundled public suffix list.
        self.psl = PublicSuffixList()

    def test_typesafe(self):
        # Return type mirrors the argument type (str in -> str out).
        self.assertEqual(
            self.psl.suffix("www.example.co.jp").__class__,
            "example.co.jp".__class__)
        self.assertEqual(
            self.psl.suffix(u("www.example.co.jp")).__class__,
            u("example.co.jp").__class__)
        self.assertEqual(
            self.psl.publicsuffix("www.example.co.jp").__class__,
            "co.jp".__class__)
        self.assertEqual(
            self.psl.publicsuffix(u("www.example.co.jp")).__class__,
            u("co.jp").__class__)

    def test_uppercase(self):
        # Lookup is case-insensitive; results come back lowercased.
        self.assertEqual(self.psl.suffix("wWw.eXaMpLe.cO.Jp"), "example.co.jp")
        self.assertEqual(self.psl.publicsuffix("wWw.eXaMpLe.cO.Jp"), "co.jp")

    def test_invaliddomain(self):
        # Empty labels (leading/trailing/double dots) and "" are rejected.
        self.assertEqual(self.psl.suffix("www..invalid"), None)
        self.assertEqual(self.psl.suffix(".example.com"), None)
        self.assertEqual(self.psl.suffix("example.com."), None)
        self.assertEqual(self.psl.suffix(""), None)
        self.assertEqual(self.psl.publicsuffix("www..invalid"), None)
        self.assertEqual(self.psl.publicsuffix(".example.com"), None)
        self.assertEqual(self.psl.publicsuffix("example.com."), None)
        self.assertEqual(self.psl.publicsuffix(""), None)

    def test_idn(self):
        # Unicode IDN TLDs match directly.
        tld = u("香港")
        self.assertEqual(self.psl.suffix(u("www.example.") + tld), u("example.") + tld)
        self.assertEqual(self.psl.publicsuffix(u("www.example.") + tld), tld)

    def test_punycoded(self):
        # Punycode-encoded IDN TLDs match as well.
        tld = encode_idn(u("香港"))
        self.assertEqual(self.psl.suffix(u("www.example.") + tld), u("example.") + tld)
        self.assertEqual(self.psl.publicsuffix(u("www.example.") + tld), tld)

    def test_suffix_deny_public(self):
        # suffix() returns None for names that are entirely a public suffix.
        self.assertEqual(self.psl.suffix("com"), None)
        self.assertEqual(self.psl.suffix("co.jp"), None)
        self.assertEqual(self.psl.suffix("example.nagoya.jp"), None)

    def test_unknown(self):
        # Unknown TLDs are treated as public suffixes by default.
        self.assertEqual(self.psl.suffix("www.example.unknowntld"), "example.unknowntld")
        self.assertEqual(self.psl.suffix("unknowntld"), None)
        self.assertEqual(self.psl.publicsuffix("www.example.unknowntld"), "unknowntld")
        self.assertEqual(self.psl.publicsuffix("unknowntld"), "unknowntld")

    def test_deny_unknown(self):
        # accept_unknown=False disables the unknown-TLD fallback.
        source = """
known
"""
        psl = PublicSuffixList(source.splitlines(), accept_unknown=False)
        self.assertEqual(psl.suffix("www.example.unknowntld"), None)

    def test_custom_psl(self):
        # Plain, wildcard and exception rules from a custom source.
        source = """
invalid
*.invalid
!test.invalid
"""
        psl = PublicSuffixList(source.splitlines())
        self.assertEqual(psl.suffix("example.invalid"), None)
        self.assertEqual(psl.suffix("test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("some.test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("aaa.bbb.ccc.invalid"), "bbb.ccc.invalid")
        self.assertEqual(psl.publicsuffix("example.invalid"), "example.invalid")
        self.assertEqual(psl.publicsuffix("test.invalid"), "invalid")

    def test_publicsuffix(self):
        self.assertEqual(self.psl.publicsuffix("www.example.com"), "com")
        self.assertEqual(self.psl.publicsuffix("unknowntld"), "unknowntld")

    def test_wildcard(self):
        # "*.nagoya.jp"-style wildcard rules.
        self.assertEqual(self.psl.suffix("test.example.nagoya.jp"), "test.example.nagoya.jp")
        self.assertEqual(self.psl.suffix("example.nagoya.jp"), None)
        self.assertEqual(self.psl.publicsuffix("example.nagoya.jp"), "example.nagoya.jp")
        self.assertEqual(self.psl.publicsuffix("test.example.nagoya.jp"), "example.nagoya.jp")

    def test_checkpublicsuffix_script(self):
        # Run the official checkPublicSuffix test vectors from test_psl.txt.
        regex = re.compile(r"^checkPublicSuffix\(('[^']+'), (null|'[^']+')\);")
        with open(os.path.join(os.path.dirname(__file__), "test_psl.txt"), "rb") as f:
            ln = 0
            for line in f:
                ln += 1
                l = line.decode("utf-8")
                m = regex.match(l)
                if not m:
                    continue
                arg = m.group(1).strip("'")
                res = None if m.group(2) == "null" else m.group(2).strip("'")
                self.assertEqual(self.psl.suffix(arg), res, "in line {0}: {1}".format(ln, line.strip()))

    def test_typeerror(self):
        # Non-string inputs raise TypeError (bytes only on Python 3).
        self.assertRaises(TypeError, lambda: self.psl.suffix(None))
        self.assertRaises(TypeError, lambda: self.psl.suffix(1))
        if b("") != "":
            # python3
            self.assertRaises(TypeError, lambda: self.psl.suffix(b("www.example.com")))

    def test_compatclass(self):
        # publicsuffix-compatible wrapper returns "" instead of None.
        from publicsuffixlist.compat import PublicSuffixList
        psl = PublicSuffixList()
        self.assertEqual(psl.get_public_suffix("test.example.com"), "example.com")
        self.assertEqual(psl.get_public_suffix("com"), "")
        self.assertEqual(psl.get_public_suffix(""), "")

    def test_unsafecompatclass(self):
        # Unsafe variant echoes the input for bare suffixes.
        from publicsuffixlist.compat import UnsafePublicSuffixList
        psl = UnsafePublicSuffixList()
        self.assertEqual(psl.get_public_suffix("test.example.com"), "example.com")
        self.assertEqual(psl.get_public_suffix("com"), "com")
        self.assertEqual(psl.get_public_suffix(""), "")

    def test_toomanylabels(self):
        # A million labels must still resolve (no recursion blow-up).
        d = "a." * 1000000 + "example.com"
        self.assertEqual(self.psl.publicsuffix(d), "com")
        self.assertEqual(self.psl.privatesuffix(d), "example.com")

    def test_flatstring(self):
        # A newline-separated string is accepted as a rule source.
        psl = PublicSuffixList(u("com\nnet\n"))
        self.assertEqual(psl.publicsuffix("example.com"), "com")

    def test_flatbytestring(self):
        # A newline-separated bytes blob is accepted as a rule source.
        psl = PublicSuffixList(b("com\nnet\n"))
        self.assertEqual(psl.publicsuffix("example.com"), "com")

    def test_privateparts(self):
        psl = self.psl
        self.assertEqual(psl.privateparts("aaa.www.example.com"), ("aaa", "www", "example.com"))

    def test_noprivateparts(self):
        psl = self.psl
        self.assertEqual(psl.privateparts("com"), None)  # no private part

    def test_reconstructparts(self):
        # privateparts() round-trips back to the original domain.
        psl = self.psl
        self.assertEqual(".".join(psl.privateparts("aaa.www.example.com")), "aaa.www.example.com")

    def test_subdomain(self):
        # depth selects how many labels beyond the private suffix to keep.
        psl = self.psl
        self.assertEqual(psl.subdomain("aaa.www.example.com", depth=0), "example.com")
        self.assertEqual(psl.subdomain("aaa.www.example.com", depth=1), "www.example.com")
        self.assertEqual(psl.subdomain("aaa.www.example.com", depth=2), "aaa.www.example.com")
        self.assertEqual(psl.subdomain("aaa.www.example.com", depth=3), None)  # no sufficient depth
def test_flatbytestring(self):
    """A flat bytes blob of newline-separated rules works as a source."""
    rules = b("com\nnet\n")
    instance = PublicSuffixList(rules)
    self.assertEqual(instance.publicsuffix("example.com"), "com")
def test_icann(self):
    """only_icann=True ignores the private section of the list."""
    icann_only = PublicSuffixList(only_icann=True)
    # "priv.at" is a private-section rule, so only "at" stays public here.
    self.assertEqual(icann_only.publicsuffix("www.example.com"), 'com')
    self.assertEqual(icann_only.publicsuffix("example.priv.at"), 'at')
def MY_expirement_process(root_dir="/home/yandingkui/dga_detection/result_data/",
                          m_file="split_AGDs", benign_file="split_benign_ac.json",
                          n=815, m=10, c='entropy'):
    """Train a RandomForest on per-label character features of AGD vs benign
    domains and print the sorted feature importances.

    :param n: n_estimators for the forest
    :param m: max_features for the forest
    :param c: split criterion ('gini' or 'entropy')
    """
    psl = PublicSuffixList()
    with open(root_dir + m_file, "r") as f:
        malicious_data = json.loads(f.read())
    with open(root_dir + benign_file, "r") as f:
        benign_data = json.loads(f.read())
    train_domains = []
    train_labels = []
    pred_domains = []
    pred_labels = []
    for k, v in malicious_data.items():
        for d in v[0]:
            # BUG FIX: use rindex like the sibling helpers, so an earlier
            # occurrence of the suffix text cannot truncate the prefix wrongly.
            d_split = d[:d.rindex(psl.publicsuffix(d)) - 1].split(".")
            if len(d_split) == 1:
                train_domains.append(d_split[0])
            else:
                # BUG FIX: the original reused the parameter name `m` here,
                # clobbering max_features, and never advanced the running
                # maximum — so it picked the *last* label, not the longest.
                longest_len = 0
                longest = None
                for l in d_split:
                    if len(l) > longest_len:
                        longest_len = len(l)
                        longest = l
                train_domains.append(longest)
            train_labels.append(1)
        for d in v[1]:
            pred_domains.append(d)
            pred_labels.append(1)
    for d in benign_data.get("train"):
        pri_d = psl.privatesuffix(d)
        # Label left of the public suffix within the private suffix.
        lm = pri_d[:pri_d.rindex(psl.publicsuffix(pri_d)) - 1]
        train_domains.append(lm)
        train_labels.append(0)
    for d in benign_data.get("pred"):
        pred_domains.append(d)
        pred_labels.append(0)
    train_features = char_feature.extract_all_features(train_domains)
    # Shuffle features and labels with the same permutation.
    index = list(range(len(train_domains)))
    random.shuffle(index)
    real_train_features = []
    real_train_labels = []
    for i in index:
        real_train_features.append(train_features[i])
        real_train_labels.append(train_labels[i])
    clf = RandomForestClassifier(n_estimators=n, max_features=m, criterion=c,
                                 random_state=0)
    clf.fit(real_train_features, real_train_labels)
    print("Pontus:feature_importance_")
    im = clf.feature_importances_
    feature_items = []
    for i in range(len(im)):
        feature_items.append((i + 1, im[i]))
    feature_items.sort(key=takeSecond, reverse=True)
    print(feature_items)
class PSLFaup(object):
    """
    Fake Faup Python Library using PSL for Windows support
    """

    def __init__(self):
        # decoded: whether decode() has been called for the current URL.
        self.decoded = False
        self.psl = PublicSuffixList()
        self._url = None
        self._retval = {}
        # ip_as_host: False, or the compressed string form of an IP host.
        self.ip_as_host = False

    def _clear(self):
        # Reset all per-URL state before decoding a new URL.
        self.decoded = False
        self._url = None
        self._retval = {}
        self.ip_as_host = False

    def decode(self, url) -> None:
        """
        This function creates a dict of all the url fields.

        :param url: The URL to normalize
        """
        self._clear()
        # urlparse needs a "//" netloc marker to treat the input as a host.
        if isinstance(url, bytes) and b'//' not in url[:10]:
            url = b'//' + url
        elif '//' not in url[:10]:
            url = '//' + url
        self._url = urlparse(url)
        self.ip_as_host = False
        hostname = _ensure_str(self._url.hostname)
        # Detect an IPv4 host first, then IPv6 (allowing a "%zone" suffix).
        try:
            ipv4_bytes = socket.inet_aton(_ensure_str(hostname))
            ipv4 = ipaddress.IPv4Address(ipv4_bytes)
            self.ip_as_host = ipv4.compressed
        except (OSError, ValueError):
            try:
                addr, _, _ = hostname.partition('%')
                ipv6 = ipaddress.IPv6Address(addr)
                self.ip_as_host = ipv6.compressed
            except ValueError:
                pass
        self.decoded = True
        self._retval = {}

    @property
    def url(self):
        # Reassemble the full URL (as bytes) from the decoded components.
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        netloc = self.get_host() + ('' if self.get_port() is None else ':{}'.format(self.get_port()))
        return _ensure_bytes(
            urlunparse((
                self.get_scheme(),
                netloc,
                self.get_resource_path(),
                '',
                self.get_query_string(),
                self.get_fragment(),
            )))

    def get_scheme(self):
        """
        Get the scheme of the url given in the decode function

        :returns: The URL scheme
        """
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        return _ensure_str(self._url.scheme)

    def get_credential(self):
        # "user:password" when both are present, else just the username.
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        if self._url.password:
            return _ensure_str(self._url.username) + ':' + _ensure_str(
                self._url.password)
        if self._url.username:
            return _ensure_str(self._url.username)

    def get_subdomain(self):
        # Labels left of the registrable domain; None for IP hosts.
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        if self.get_host() is not None and not self.ip_as_host:
            if self.get_domain() in self.get_host():
                return self.get_host().rsplit(self.get_domain(), 1)[0].rstrip('.') or None

    def get_domain(self):
        # Registrable (private-suffix) domain; None for IP hosts.
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        if self.get_host() is not None and not self.ip_as_host:
            return self.psl.privatesuffix(self.get_host())

    def get_domain_without_tld(self):
        # Registrable domain with its public suffix stripped.
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        if self.get_tld() is not None and not self.ip_as_host:
            return self.get_domain().rsplit(self.get_tld(), 1)[0].rstrip('.')

    def get_host(self):
        # ASCII hostname; non-ASCII names are IDNA-encoded.
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        if self._url.hostname is None:
            return None
        elif self._url.hostname.isascii():
            return _ensure_str(self._url.hostname)
        else:
            return _ensure_str(idna.encode(self._url.hostname, uts46=True))

    def get_unicode_host(self):
        # IDNA-decoded (Unicode) form of the host; None for IP hosts.
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        if not self.ip_as_host:
            return idna.decode(self.get_host(), uts46=True)

    def get_tld(self):
        # Public suffix of the host; None for IP hosts.
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        if self.get_host() is not None and not self.ip_as_host:
            return self.psl.publicsuffix(self.get_host())

    def get_port(self):
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        return self._url.port

    def get_resource_path(self):
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        return _ensure_str(self._url.path)

    def get_query_string(self):
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        return _ensure_str(self._url.query)

    def get_fragment(self):
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        return _ensure_str(self._url.fragment)

    def get(self):
        # Collect every decoded field into a single dict.
        self._retval["scheme"] = self.get_scheme()
        self._retval["tld"] = self.get_tld()
        self._retval["domain"] = self.get_domain()
        self._retval["domain_without_tld"] = self.get_domain_without_tld()
        self._retval["subdomain"] = self.get_subdomain()
        self._retval["host"] = self.get_host()
        self._retval["port"] = self.get_port()
        self._retval["resource_path"] = self.get_resource_path()
        self._retval["query_string"] = self.get_query_string()
        self._retval["fragment"] = self.get_fragment()
        self._retval["url"] = self.url
        return self._retval
def get_suspicious(year, month, day):
    """Collect the day's observed domains, classify their labels with the
    pre-trained ac_model, persist the suspicious private suffixes and pass
    them to check_active_domains().

    :param year/month/day: date of the hourly capture files under ../result_data/
    """
    timestring = "{}{:0>2d}{:0>2d}".format(year, month, day)
    suspicious_domains_set = set()
    if os.path.exists("../result_data/{}domains.txt".format(timestring)):
        # Cached result from an earlier run: go straight to the activity check.
        with open("../result_data/{}domains.txt".format(timestring), "r") as f:
            for r in f:
                suspicious_domains_set.add(r.strip())
        check_active_domains(suspicious_domains_set, timestring)
    else:
        init_domain_set = set()
        # get all domains seen in any hourly file of the day
        for hour in range(24):
            file_path = "{}{:0>2d}{:0>2d}{:0>2d}".format(year, month, day, hour)
            if not os.path.exists("../result_data/{}".format(file_path)):
                continue
            with open("../result_data/{}".format(file_path), "r") as f:
                for r in f:
                    domain = r.strip().split(",")[1]
                    init_domain_set.add(domain)
        psl = PublicSuffixList()
        domain_labels = []
        labels_labels = []
        i = 0
        # split each domain's prefix into labels, remembering per label the
        # index of the domain it came from
        domains_list = list(init_domain_set)
        for d in domains_list:
            # BUG FIX: rindex (as the sibling helpers use), so an early
            # occurrence of the suffix text cannot truncate the prefix wrongly.
            s = d[:d.rindex(psl.publicsuffix(d)) - 1]
            for l in s.split("."):
                if len(l) > 0:
                    domain_labels.append(l)
                    labels_labels.append(i)
            i = i + 1
        features_path = "../result_data/{}_features.npy".format(timestring)
        if os.path.exists(features_path):
            features = np.load(features_path)
        else:
            features = extract_all_features(domain_labels)
            np.save(features_path, features)
        # classifier flags AGD-looking labels (prediction == 1)
        clf = joblib.load("../result_data/ac_model.m")
        pred_labels = clf.predict(features)
        domain_index = set()
        for i in range(len(labels_labels)):
            if pred_labels[i] == 1:
                domain_index.add(labels_labels[i])
        # map flagged labels back to their domains' private suffixes
        for index in domain_index:
            ps = psl.privatesuffix(domains_list[index])
            if ps is None:
                continue
            suspicious_domains_set.add(ps)
        print("{} domains".format(len(suspicious_domains_set)))
        with open("../result_data/{}domains.txt".format(timestring), "w") as f:
            f.write("\n".join(suspicious_domains_set))
        print("save finish")
        # dgarchive check
        check_active_domains(suspicious_domains_set, timestring)
def get_domain_private_suffix(domain):
    """
    returns 'www.google' for 'www.google.com'

    Strips the public suffix (and its separating dot) from *domain*.
    """
    psl = PublicSuffixList()
    tld = "." + psl.publicsuffix(domain)
    # BUG FIX: str.replace removed *every* occurrence of the suffix text
    # (e.g. 'com.google.com' -> 'google'); strip only the trailing suffix.
    if domain.endswith(tld):
        return domain[:-len(tld)]
    # Fallback keeps the previous behavior for inputs without a trailing
    # ".suffix" (e.g. the domain equal to its bare public suffix).
    return domain.replace(tld, '')
class TestPSL(unittest.TestCase):
    """Behavioral tests for publicsuffixlist.PublicSuffixList."""

    def setUp(self):
        # Default instance backed by the bundled public suffix list.
        self.psl = PublicSuffixList()

    def test_typesafe(self):
        # Return type mirrors the argument type (str in -> str out).
        self.assertEqual(self.psl.suffix("www.example.co.jp").__class__, "example.co.jp".__class__)
        self.assertEqual(self.psl.suffix(u("www.example.co.jp")).__class__, u("example.co.jp").__class__)
        self.assertEqual(self.psl.publicsuffix("www.example.co.jp").__class__, "co.jp".__class__)
        self.assertEqual(self.psl.publicsuffix(u("www.example.co.jp")).__class__, u("co.jp").__class__)

    def test_uppercase(self):
        # Lookup is case-insensitive; results come back lowercased.
        self.assertEqual(self.psl.suffix("wWw.eXaMpLe.cO.Jp"), "example.co.jp")
        self.assertEqual(self.psl.publicsuffix("wWw.eXaMpLe.cO.Jp"), "co.jp")

    def test_invaliddomain(self):
        # Empty labels (leading/trailing/double dots) and "" are rejected.
        self.assertEqual(self.psl.suffix("www..invalid"), None)
        self.assertEqual(self.psl.suffix(".example.com"), None)
        self.assertEqual(self.psl.suffix("example.com."), None)
        self.assertEqual(self.psl.suffix(""), None)
        self.assertEqual(self.psl.publicsuffix("www..invalid"), None)
        self.assertEqual(self.psl.publicsuffix(".example.com"), None)
        self.assertEqual(self.psl.publicsuffix("example.com."), None)
        self.assertEqual(self.psl.publicsuffix(""), None)

    def test_idn(self):
        # Unicode IDN TLDs match directly.
        tld = u("香港")
        self.assertEqual(self.psl.suffix(u("www.example.") + tld), u("example.") + tld)
        self.assertEqual(self.psl.publicsuffix(u("www.example.") + tld), tld)

    def test_punycoded(self):
        # Punycode-encoded IDN TLDs match as well.
        tld = encode_idn(u("香港"))
        self.assertEqual(self.psl.suffix(u("www.example.") + tld), u("example.") + tld)
        self.assertEqual(self.psl.publicsuffix(u("www.example.") + tld), tld)

    def test_suffix_deny_public(self):
        # suffix() returns None for names that are entirely a public suffix.
        self.assertEqual(self.psl.suffix("com"), None)
        self.assertEqual(self.psl.suffix("co.jp"), None)
        self.assertEqual(self.psl.suffix("example.nagoya.jp"), None)

    def test_unknown(self):
        # Unknown TLDs are treated as public suffixes by default.
        self.assertEqual(self.psl.suffix("www.example.unknowntld"), "example.unknowntld")
        self.assertEqual(self.psl.suffix("unknowntld"), None)
        self.assertEqual(self.psl.publicsuffix("www.example.unknowntld"), "unknowntld")
        self.assertEqual(self.psl.publicsuffix("unknowntld"), "unknowntld")

    def test_deny_unknown(self):
        # accept_unknown=False disables the unknown-TLD fallback.
        source = """
known
"""
        psl = PublicSuffixList(source.splitlines(), accept_unknown=False)
        self.assertEqual(psl.suffix("www.example.unknowntld"), None)

    def test_custom_psl(self):
        # Plain, wildcard and exception rules from a custom source.
        source = """
invalid
*.invalid
!test.invalid
"""
        psl = PublicSuffixList(source.splitlines())
        self.assertEqual(psl.suffix("example.invalid"), None)
        self.assertEqual(psl.suffix("test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("some.test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("aaa.bbb.ccc.invalid"), "bbb.ccc.invalid")
        self.assertEqual(psl.publicsuffix("example.invalid"), "example.invalid")
        self.assertEqual(psl.publicsuffix("test.invalid"), "invalid")

    def test_publicsuffix(self):
        self.assertEqual(self.psl.publicsuffix("www.example.com"), "com")
        self.assertEqual(self.psl.publicsuffix("unknowntld"), "unknowntld")

    def test_wildcard(self):
        # "*.nagoya.jp"-style wildcard rules.
        self.assertEqual(self.psl.suffix("test.example.nagoya.jp"), "test.example.nagoya.jp")
        self.assertEqual(self.psl.suffix("example.nagoya.jp"), None)
        self.assertEqual(self.psl.publicsuffix("example.nagoya.jp"), "example.nagoya.jp")
        self.assertEqual(self.psl.publicsuffix("test.example.nagoya.jp"), "example.nagoya.jp")

    def test_checkpublicsuffix_script(self):
        # Run the official checkPublicSuffix test vectors from test_psl.txt.
        regex = re.compile(r"^checkPublicSuffix\(('[^']+'), (null|'[^']+')\);")
        with open(os.path.join(os.path.dirname(__file__), "test_psl.txt"), "rb") as f:
            ln = 0
            for line in f:
                ln += 1
                l = line.decode("utf-8")
                m = regex.match(l)
                if not m:
                    continue
                arg = m.group(1).strip("'")
                res = None if m.group(2) == "null" else m.group(2).strip("'")
                self.assertEqual(self.psl.suffix(arg), res, "in line {0}: {1}".format(ln, line.strip()))

    def test_typeerror(self):
        # Non-string inputs raise TypeError (bytes only on Python 3).
        self.assertRaises(TypeError, lambda: self.psl.suffix(None))
        self.assertRaises(TypeError, lambda: self.psl.suffix(1))
        if b("") != "":
            # python3
            self.assertRaises(TypeError, lambda: self.psl.suffix(b("www.example.com")))

    def test_compatclass(self):
        # publicsuffix-compatible wrapper returns "" instead of None.
        from publicsuffixlist.compat import PublicSuffixList
        psl = PublicSuffixList()
        self.assertEqual(psl.get_public_suffix("test.example.com"), "example.com")
        self.assertEqual(psl.get_public_suffix("com"), "")
        self.assertEqual(psl.get_public_suffix(""), "")

    def test_unsafecompatclass(self):
        # Unsafe variant echoes the input for bare suffixes.
        from publicsuffixlist.compat import UnsafePublicSuffixList
        psl = UnsafePublicSuffixList()
        self.assertEqual(psl.get_public_suffix("test.example.com"), "example.com")
        self.assertEqual(psl.get_public_suffix("com"), "com")
        self.assertEqual(psl.get_public_suffix(""), "")

    def test_toomanylabels(self):
        # A million labels must still resolve (no recursion blow-up).
        d = "a." * 1000000 + "example.com"
        self.assertEqual(self.psl.publicsuffix(d), "com")
        self.assertEqual(self.psl.privatesuffix(d), "example.com")

    def test_flatstring(self):
        # A newline-separated string is accepted as a rule source.
        psl = PublicSuffixList(u("com\nnet\n"))
        self.assertEqual(psl.publicsuffix("example.com"), "com")

    def test_flatbytestring(self):
        # A newline-separated bytes blob is accepted as a rule source.
        psl = PublicSuffixList(b("com\nnet\n"))
        self.assertEqual(psl.publicsuffix("example.com"), "com")