def is_email_blacklisted(address):
    """
    Determines whether the domain of a supplied email address is present in
    the 'emailblacklist' table.

    Parameters:
        address: The email address from which to extract the domain.

    Returns:
        Boolean True if the domain is blacklisted, or False otherwise.
    """
    _, domain = address.rsplit("@", 1)
    psl = PublicSuffixList()
    private_suffix = psl.privatesuffix(domain=domain)

    # Check the disposable email address list
    disposable_domains = _retrieve_disposable_email_domains()
    if private_suffix in disposable_domains:
        return True

    # Check the explicitly defined/blacklisted domains.
    blacklisted_domains = d.engine.execute("""
        SELECT domain_name
        FROM emailblacklist
    """).fetchall()
    for site in blacklisted_domains:
        if private_suffix == site['domain_name']:
            return True

    # If we get here, the domain (and any subdomain of it) is not blacklisted
    return False

def _check_same_origin(self, current_url):
    """Check whether a URL shares the same origin as the crawl root."""
    current_url = to_unicode(current_url)
    url_part = urlparse.urlparse(current_url)
    psl = PublicSuffixList()
    url_origin = psl.privatesuffix(url_part.netloc)
    return url_origin == self.origin

def feed_url(self, url):
    """Set the initial URL to crawl."""
    if isinstance(url, basestring):
        url = to_unicode(url)
        url = UrlData(url)

    if self.same_origin:
        url_part = urlparse.urlparse(unicode(url))
        psl = PublicSuffixList()
        self.origin = psl.privatesuffix(url_part.netloc)

    self.fetcher_queue.put(url, block=True)

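The two crawler methods above define "same origin" as "same registrable domain." A minimal standalone sketch of that comparison (Python 3 here, with hypothetical URLs; only the publicsuffixlist package is assumed):

from urllib.parse import urlparse
from publicsuffixlist import PublicSuffixList

psl = PublicSuffixList()

def same_registrable_domain(url_a, url_b):
    # Two URLs count as "same origin" for the crawler if their
    # registrable (private) suffixes match, so blog.example.com and
    # shop.example.com are crawled together.
    a = psl.privatesuffix(urlparse(url_a).netloc)
    b = psl.privatesuffix(urlparse(url_b).netloc)
    return a is not None and a == b

assert same_registrable_domain("https://blog.example.com/x",
                               "https://shop.example.com/y")
assert not same_registrable_domain("https://example.com", "https://example.org")
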
def is_email_blacklisted(address):
    """
    Determines whether the domain of a supplied email address is present in
    the 'emailblacklist' table.

    Parameters:
        address: The email address from which to extract the domain.

    Returns:
        Boolean True if the domain is blacklisted, or False otherwise.
    """
    _, domain = address.rsplit("@", 1)
    psl = PublicSuffixList()
    private_suffix = psl.privatesuffix(domain=domain)

    # Check the disposable email address list
    if private_suffix in DISPOSABLE_DOMAINS:
        return True

    # Check the explicitly defined/blacklisted domains.
    return d.engine.scalar(
        "SELECT EXISTS (SELECT FROM emailblacklist WHERE domain_name = %(domain)s)",
        domain=private_suffix,
    )

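Both versions above compare the private (registrable) suffix of the address's domain rather than the raw domain string, so one blacklist entry covers every subdomain of that registrable domain. A minimal standalone sketch of the normalization, assuming only the publicsuffixlist package (the sample domains are hypothetical):

from publicsuffixlist import PublicSuffixList

psl = PublicSuffixList()

# Subdomain tricks collapse to the same registrable domain...
assert psl.privatesuffix("mailinator.com") == "mailinator.com"
assert psl.privatesuffix("mail.extra.mailinator.com") == "mailinator.com"

# ...while a bare public suffix has no private part, so callers should
# expect None for inputs such as "com".
assert psl.privatesuffix("com") is None
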
def MY_expirement_process(root_dir="/home/yandingkui/dga_detection/result_data/",
                          m_file="split_AGDs",
                          benign_file="split_benign_ac.json",
                          n=815, m=10, c='entropy'):
    psl = PublicSuffixList()
    with open(root_dir + m_file, "r") as f:
        malicious_data = json.loads(f.read())
    with open(root_dir + benign_file, "r") as f:
        benign_data = json.loads(f.read())

    train_domains = []
    train_labels = []
    pred_domains = []
    pred_labels = []

    for k, v in malicious_data.items():
        for d in v[0]:
            # Strip the public suffix, then keep the longest remaining label.
            d_split = d[:d.index(psl.publicsuffix(d)) - 1].split(".")
            if len(d_split) == 1:
                train_domains.append(d_split[0])
            else:
                # Track the longest label without clobbering the `m` parameter
                # (the original reused `m` here and never updated the running
                # maximum, which also broke max_features below).
                longest_len = 0
                longest = None
                for label in d_split:
                    if len(label) > longest_len:
                        longest_len = len(label)
                        longest = label
                train_domains.append(longest)
            train_labels.append(1)
        for d in v[1]:
            pred_domains.append(d)
            pred_labels.append(1)

    for d in benign_data.get("train"):
        pri_d = psl.privatesuffix(d)
        lm = pri_d[:pri_d.index(psl.publicsuffix(pri_d)) - 1]
        train_domains.append(lm)
        train_labels.append(0)
    for d in benign_data.get("pred"):
        pred_domains.append(d)
        pred_labels.append(0)

    train_features = char_feature.extract_all_features(train_domains)
    index = list(range(len(train_domains)))
    random.shuffle(index)
    real_train_features = []
    real_train_labels = []
    for i in index:
        real_train_features.append(train_features[i])
        real_train_labels.append(train_labels[i])

    # Best parameters found by an earlier grid search:
    # {'criterion': 'entropy', 'max_features': 14, 'n_estimators': 820, 'random_state': 0}
    clf = RandomForestClassifier(n_estimators=n, max_features=m, criterion=c,
                                 random_state=0)
    clf.fit(real_train_features, real_train_labels)

    print("Pontus:feature_importance_")
    im = clf.feature_importances_
    feature_items = []
    for i in range(len(im)):
        feature_items.append((i + 1, im[i]))
    feature_items.sort(key=takeSecond, reverse=True)
    print(feature_items)

# -*- coding:utf-8 -*-
__author__ = '*****@*****.**'

from publicsuffixlist import PublicSuffixList

cn = set()
alexa = set()
psl = PublicSuffixList()

with open("alexa-top-1m.csv") as f:
    for i in xrange(0, 1000000):  # Controls how many top Alexa domains are used
        domain = f.readline().strip().split(',')[1]
        domain_2ld = psl.privatesuffix(domain)
        if domain_2ld is None:
            alexa.add(domain)
        else:
            alexa.add(domain_2ld)

with open("result/chinaz_top_domains.txt") as f:
    for line in f:
        domain = line.strip()
        domain_2ld = psl.privatesuffix(domain)
        if domain_2ld is None:
            cn.add(domain)
        else:
            cn.add(domain_2ld)

with open("whitedomains.txt", 'w') as f:
    unionset = cn | alexa
    for domain in unionset:
        try:
            f.write(domain.strip() + '\n')
        except:
            print "An error occurred, continuing......"
            continue

def subresource_integrity(reqs: dict, expectation='sri-implemented-and-external-scripts-loaded-securely') -> dict:
    """
    :param reqs: dictionary containing all the request and response objects
    :param expectation: test expectation
        sri-implemented-and-all-scripts-loaded-securely: all same origin, and uses SRI
        sri-implemented-and-external-scripts-loaded-securely: integrity attribute exists on all external scripts,
          and scripts loaded [default for HTML]
        sri-implemented-but-external-scripts-not-loaded-securely: SRI implemented, but with scripts loaded over HTTP
        sri-not-implemented-but-external-scripts-loaded-securely: SRI isn't implemented,
          but all scripts are loaded over HTTPS
        sri-not-implemented-and-external-scripts-not-loaded-securely: SRI isn't implemented,
          and scripts are downloaded over HTTP
        sri-not-implemented-but-all-scripts-loaded-from-secure-origin: SRI isn't implemented,
          but all scripts come from secure origins (self)
        sri-not-implemented-but-no-scripts-loaded: SRI isn't implemented, because the page doesn't load any scripts
        sri-not-implemented-response-not-html: SRI isn't needed, because the page isn't HTML [default for non-HTML]
        request-did-not-return-status-code-200: Only look for SRI on pages that returned 200, not things like 404s
        html-not-parsable: Can't parse the page's content
    :return: dictionary with:
        data: all external scripts and their integrity / crossorigin attributes
        expectation: test expectation
        pass: whether the site's external scripts met expectations
        result: short string describing the result of the test
    """
    output = {
        'data': {},
        'expectation': expectation,
        'pass': False,
        'result': None,
    }
    response = reqs['responses']['auto']

    # The order of how "good" the results are
    goodness = ['sri-implemented-and-all-scripts-loaded-securely',
                'sri-implemented-and-external-scripts-loaded-securely',
                'sri-implemented-but-external-scripts-not-loaded-securely',
                'sri-not-implemented-but-external-scripts-loaded-securely',
                'sri-not-implemented-and-external-scripts-not-loaded-securely',
                'sri-not-implemented-response-not-html']

    # If the request to get / fails
    if response.status_code != 200:
        output['result'] = 'request-did-not-return-status-code-200'

    # If the content isn't HTML, there are no scripts to load; this is okay
    elif response.headers.get('Content-Type', '').split(';')[0] not in ('text/html', 'application/xhtml+xml'):
        output['result'] = 'sri-not-implemented-response-not-html'

    else:
        # Try to parse the HTML
        try:
            soup = bs(reqs['resources']['/'], 'html.parser')
        except:
            output['result'] = 'html-not-parsable'
            return output

        # Track whether any scripts were on foreign origins
        scripts_on_foreign_origin = False

        # Get all the scripts
        scripts = soup.find_all('script')
        for script in scripts:
            if script.has_attr('src'):
                # Script tag parameters
                src = urlparse(script['src'])
                integrity = script.get('integrity')
                crossorigin = script.get('crossorigin')

                # Check to see if they're on the same second-level domain
                # TODO: update the PSL list on startup
                psl = PublicSuffixList()
                samesld = psl.privatesuffix(urlparse(response.url).netloc) == psl.privatesuffix(src.netloc)

                # Check to see if it's the same origin or second-level domain
                if src.netloc == '' or samesld:
                    secureorigin = True
                elif src.netloc != '' and '.' not in src.netloc:  # like localhost
                    secureorigin = False
                    scripts_on_foreign_origin = True
                else:
                    secureorigin = False
                    scripts_on_foreign_origin = True

                # See if it's a secure scheme
                if src.scheme == 'https' or (src.scheme == '' and urlparse(response.url).scheme == 'https'):
                    securescheme = True
                else:
                    securescheme = False

                # Add it to the scripts data result, if it's not a relative URI
                if not secureorigin:
                    output['data'][script['src']] = {
                        'crossorigin': crossorigin,
                        'integrity': integrity
                    }

                    if integrity and not securescheme:
                        output['result'] = only_if_worse('sri-implemented-but-external-scripts-not-loaded-securely',
                                                         output['result'],
                                                         goodness)
                    elif not integrity and securescheme:
                        output['result'] = only_if_worse('sri-not-implemented-but-external-scripts-loaded-securely',
                                                         output['result'],
                                                         goodness)
                    elif not integrity and not securescheme:
                        output['result'] = only_if_worse('sri-not-implemented-and-external-scripts'
                                                         '-not-loaded-securely',
                                                         output['result'],
                                                         goodness)

                # Grant bonus even if they use SRI on the same origin
                else:
                    if integrity and securescheme and not output['result']:
                        output['result'] = 'sri-implemented-and-all-scripts-loaded-securely'

        # If the page doesn't load any scripts
        if not scripts:
            output['result'] = 'sri-not-implemented-but-no-scripts-loaded'

        # If all the scripts are loaded from a secure origin, not triggering a need for SRI
        elif scripts and not scripts_on_foreign_origin and not output['result']:
            output['result'] = 'sri-not-implemented-but-all-scripts-loaded-from-secure-origin'

        # If the page loaded from a foreign origin, but everything included SRI
        elif scripts and scripts_on_foreign_origin and not output['result']:
            output['result'] = only_if_worse('sri-implemented-and-external-scripts-loaded-securely',
                                             output['result'],
                                             goodness)

    # Code defensively on the size of the data
    output['data'] = output['data'] if len(str(output['data'])) < 32768 else {}

    # Check to see if the test passed or failed
    if output['result'] in ('sri-implemented-and-all-scripts-loaded-securely',
                            'sri-implemented-and-external-scripts-loaded-securely',
                            'sri-not-implemented-response-not-html',
                            'sri-not-implemented-but-all-scripts-loaded-from-secure-origin',
                            'sri-not-implemented-but-no-scripts-loaded',
                            expectation):
        output['pass'] = True

    return output

def subresource_integrity(reqs: dict, expectation='sri-implemented-and-external-scripts-loaded-securely') -> dict:
    """
    :param reqs: dictionary containing all the request and response objects
    :param expectation: test expectation
        sri-implemented-and-all-scripts-loaded-securely: all same origin, and uses SRI
        sri-implemented-and-external-scripts-loaded-securely: integrity attribute exists on all external scripts,
          and scripts loaded [default for HTML]
        sri-implemented-but-external-scripts-not-loaded-securely: SRI implemented, but with scripts loaded over HTTP
        sri-not-implemented-but-external-scripts-loaded-securely: SRI isn't implemented,
          but all scripts are loaded over HTTPS
        sri-not-implemented-and-external-scripts-not-loaded-securely: SRI isn't implemented,
          and scripts are downloaded over HTTP
        sri-not-implemented-but-all-scripts-loaded-from-secure-origin: SRI isn't implemented,
          but all scripts come from secure origins (self)
        sri-not-implemented-but-no-scripts-loaded: SRI isn't implemented, because the page doesn't load any scripts
        sri-not-implemented-response-not-html: SRI isn't needed, because the page isn't HTML [default for non-HTML]
        request-did-not-return-status-code-200: Only look for SRI on pages that returned 200, not things like 404s
        html-not-parsable: Can't parse the page's content
    :return: dictionary with:
        data: all external scripts and their integrity / crossorigin attributes
        expectation: test expectation
        pass: whether the site's external scripts met expectations
        result: short string describing the result of the test
    """
    output = {
        'data': {},
        'expectation': expectation,
        'pass': False,
        'result': None,
    }
    response = reqs['responses']['auto']

    # The order of how "good" the results are
    goodness = ['sri-implemented-and-all-scripts-loaded-securely',
                'sri-implemented-and-external-scripts-loaded-securely',
                'sri-implemented-but-external-scripts-not-loaded-securely',
                'sri-not-implemented-but-external-scripts-loaded-securely',
                'sri-not-implemented-and-external-scripts-not-loaded-securely',
                'sri-not-implemented-response-not-html']

    # If the content isn't HTML, there are no scripts to load; this is okay
    if response.headers.get('Content-Type', '').split(';')[0] not in HTML_TYPES:
        output['result'] = 'sri-not-implemented-response-not-html'

    else:
        # Try to parse the HTML
        try:
            soup = bs(reqs['resources']['__path__'], 'html.parser')
        except:
            output['result'] = 'html-not-parsable'
            return output

        # Track whether any scripts were on foreign origins
        scripts_on_foreign_origin = False

        # Get all the scripts
        scripts = soup.find_all('script')
        for script in scripts:
            if script.has_attr('src'):
                # Script tag parameters
                src = urlparse(script['src'])
                integrity = script.get('integrity')
                crossorigin = script.get('crossorigin')

                # Check to see if they're on the same second-level domain
                # TODO: update the PSL list on startup
                psl = PublicSuffixList()
                samesld = psl.privatesuffix(urlparse(response.url).netloc) == psl.privatesuffix(src.netloc)

                if src.scheme == '':
                    if src.netloc == '':
                        # Relative URL (src="/path")
                        relativeorigin = True
                        relativeprotocol = False
                    else:
                        # Protocol-relative URL (src="//host/path")
                        relativeorigin = False
                        relativeprotocol = True
                else:
                    relativeorigin = False
                    relativeprotocol = False

                # Check to see if it's the same origin or second-level domain
                if relativeorigin or (samesld and not relativeprotocol):
                    secureorigin = True
                else:
                    secureorigin = False
                    scripts_on_foreign_origin = True

                # See if it's a secure scheme
                if src.scheme == 'https' or (relativeorigin and urlparse(response.url).scheme == 'https'):
                    securescheme = True
                else:
                    securescheme = False

                # Add it to the scripts data result, if it's not a relative URI
                if not secureorigin:
                    output['data'][script['src']] = {
                        'crossorigin': crossorigin,
                        'integrity': integrity
                    }

                    if integrity and not securescheme:
                        output['result'] = only_if_worse('sri-implemented-but-external-scripts-not-loaded-securely',
                                                         output['result'],
                                                         goodness)
                    elif not integrity and securescheme:
                        output['result'] = only_if_worse('sri-not-implemented-but-external-scripts-loaded-securely',
                                                         output['result'],
                                                         goodness)
                    elif not integrity and not securescheme:
                        # (The original repeated this branch with an extra
                        # `and samesld` guard; both assigned the same result.)
                        output['result'] = only_if_worse('sri-not-implemented-and-external-scripts'
                                                         '-not-loaded-securely',
                                                         output['result'],
                                                         goodness)

                # Grant bonus even if they use SRI on the same origin
                else:
                    if integrity and securescheme and not output['result']:
                        output['result'] = 'sri-implemented-and-all-scripts-loaded-securely'

        # If the page doesn't load any scripts
        if not scripts:
            output['result'] = 'sri-not-implemented-but-no-scripts-loaded'

        # If all the scripts are loaded from a secure origin, not triggering a need for SRI
        elif scripts and not scripts_on_foreign_origin and not output['result']:
            output['result'] = 'sri-not-implemented-but-all-scripts-loaded-from-secure-origin'

        # If the page loaded from a foreign origin, but everything included SRI
        elif scripts and scripts_on_foreign_origin and not output['result']:
            output['result'] = only_if_worse('sri-implemented-and-external-scripts-loaded-securely',
                                             output['result'],
                                             goodness)

    # Code defensively on the size of the data
    output['data'] = output['data'] if len(str(output['data'])) < 32768 else {}

    # Check to see if the test passed or failed
    if output['result'] in ('sri-implemented-and-all-scripts-loaded-securely',
                            'sri-implemented-and-external-scripts-loaded-securely',
                            'sri-not-implemented-response-not-html',
                            'sri-not-implemented-but-all-scripts-loaded-from-secure-origin',
                            'sri-not-implemented-but-no-scripts-loaded',
                            expectation):
        output['pass'] = True

    return output

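Both versions lean on the only_if_worse() helper to keep the worst result seen so far, according to the goodness ordering. A plausible standalone sketch of that contract (an assumption for illustration, not the Observatory's actual implementation):

def only_if_worse(new_result: str, old_result: str, order: list) -> str:
    # Keep whichever result ranks later (worse) in `order`;
    # an unset old result is always replaced.
    if old_result is None:
        return new_result
    if order.index(new_result) > order.index(old_result):
        return new_result
    return old_result
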
def scan(session: Session):
    reporter.register_data("url", session.url)
    reporter.register_data("domain", session.domain)

    # Check to see if this is an IP; if so, bail out
    if utils.is_ip(session.domain):
        return

    output.empty()
    output.norm("DNS Information:")

    # Get the root domain by looking it up via the PSL
    psl = PublicSuffixList()
    root_domain = psl.privatesuffix(session.domain)
    reporter.register_data("root_domain", root_domain)

    # IP addresses for the domain we are scanning
    ips = basic.get_ips(session.domain)
    reporter.register_data("ip", ips)
    for ip in ips:
        output.norm("\t%s (%s)" % (ip, basic.get_host(str(ip))))

        addr = ipaddress.ip_address(str(ip))
        if not addr.is_private:
            ni = network_info.network_info(str(ip))
            output.norm("\t\t%s" % ni)

            if addr.version == 4:
                output.norm("\t\thttps://www.shodan.io/host/%s" % ip)
                output.norm("\t\thttps://censys.io/ipv4/%s" % ip)
            else:
                output.norm("\t\thttps://www.shodan.io/host/%s" % str(ip).lower())

    output.empty()

    # TXT records for the domain we are scanning
    try:
        txt = basic.get_text(session.domain)
        reporter.register_data("dns_txt", {session.domain: txt})
        for rec in txt:
            output.norm("\tTXT: %s" % rec)
    except Exception as err:
        output.error(f"Error getting TXT records: {str(err)}")

    # TXT records for the root domain
    try:
        if root_domain != session.domain:
            txt = basic.get_text(root_domain)
            reporter.register_data("dns_txt", {root_domain: txt})
            for rec in txt:
                output.norm("\tTXT (%s): %s" % (root_domain, rec))
    except Exception as err:
        output.error(f"Error getting TXT (root) records: {str(err)}")

    output.empty()

    # MX records for the domain we are scanning
    try:
        mx = basic.get_mx(session.domain)
        reporter.register_data("dns_mx", {session.domain: mx})
        for rec in mx:
            server_ip, ni = _get_ip_info(rec[0])
            info = "%s (%s) - %s (%s)" % (rec[0], rec[1], server_ip, ni)
            output.norm("\tMX: %s" % info)
    except Exception as err:
        output.error(f"Error getting MX records: {str(err)}")

    # MX records for the root domain
    try:
        if root_domain != session.domain:
            mx = basic.get_mx(root_domain)
            reporter.register_data("dns_mx", {root_domain: mx})
            for rec in mx:
                server_ip, ni = _get_ip_info(rec[0])
                info = "%s (%s) - %s (%s)" % (rec[0], rec[1], server_ip, ni)
                output.norm("\tMX (%s): %s" % (root_domain, info))
    except Exception as err:
        output.error(f"Error getting MX (root) records: {str(err)}")

    output.empty()

    # NS records for the root domain
    try:
        ns = basic.get_ns(root_domain)
        reporter.register_data("dns_ns", {root_domain: ns})
        for rec in ns:
            server_ip, ni = _get_ip_info(rec)
            info = "%s - %s (%s)" % (rec, server_ip, ni)
            output.norm("\tNS: %s" % info)
    except Exception as err:
        output.error(f"Error getting NS records: {str(err)}")

    output.empty()

    if session.args.srv:
        try:
            output.norm("Searching for SRV records, this will take a minute...")
            output.empty()

            with Spinner():
                srv_records = srv.find_srv_records(root_domain)
            reporter.register_data("dns_srv", srv_records)

            for rec in srv_records:
                server_ip, ni = _get_ip_info(rec[1])
                info = "%s: %s:%s - %s (%s)" % (rec[0], rec[1], rec[2], server_ip, ni)
                output.norm("\tSRV: %s" % info)

            output.empty()
        except Exception as err:
            output.error(f"Error getting SRV records: {str(err)}")

    if session.args.subdomains:
        try:
            output.norm("Searching for sub-domains, this will take a few minutes...")
            output.empty()

            with Spinner():
                sds = subdomains.find_subdomains(root_domain)
            reporter.register_data("dns_subdomains", sds)

            for rec in sds:
                info = ""

                if rec[0] == "CNAME":
                    server_ip, ni = _get_ip_info(rec[2])
                    info = "(CNAME) %s -> %s - %s (%s)" % (rec[1], rec[2], server_ip, ni)
                elif rec[0] == "A":
                    ni = network_info.network_info(rec[2])
                    info = "(A) %s: %s (%s)" % (rec[1], rec[2], ni)
                elif rec[0] == "AAAA":
                    ni = network_info.network_info(rec[2])
                    info = "(AAAA) %s: %s (%s)" % (rec[1], rec[2], ni)

                output.norm("\tSubdomain: %s" % info)
        except Exception as err:
            output.error(f"Error getting subdomain records: {str(err)}")

        output.empty()

    # CAA records
    try:
        caa_count = 0
        carec = caa.get_caa(session.domain)
        reporter.register_data("dns_caa", carec)
        for rec in carec:
            curr = rec[0]

            if rec[1] == "CNAME":
                output.norm("\tCAA (%s): CNAME Found: -> %s" % (curr, rec[2]))
            elif rec[1] == "CAA":
                if len(rec[2]) > 0:
                    for line in rec[2]:
                        output.norm('\tCAA (%s): "%s"' % (curr, line))
                        caa_count += 1
                else:
                    output.norm("\tCAA (%s): No Records Found" % curr)

        # Notify the user if there's an issue
        if caa_count == 0:
            reporter.display(
                "\tCAA: Domain does not have protection from CAA",
                issue.Issue(Vulnerabilities.DNS_CAA_MISSING, session.url, {"caa_records": carec}),
            )
    except Exception as err:
        output.error(f"Error getting CAA records: {str(err)}")

    output.empty()

    # DNSKEY records
    try:
        dk = dnssec.get_dnskey(session.domain)
        reporter.register_data("dns_dnskey", dk)
        if len(dk) > 0:
            for rec in dk:
                output.norm(
                    "\tDNSKEY: Algorithm: '%s' - Flags: '%s' - Key Length: %s"
                    % (rec[2], rec[0], len(rec[3]) * 8)
                )
        else:
            reporter.display(
                "\tDNSKEY: Domain does not use DNSSEC",
                issue.Issue(Vulnerabilities.DNS_DNSSEC_NOT_ENABLED, session.url, {}),
            )
    except Exception as err:
        output.error(f"Error getting DNSKEY records: {str(err)}")

    output.empty()

class PSLFaup(object):
    """
    Fake Faup Python Library using PSL for Windows support
    """

    def __init__(self):
        self.decoded = False
        self.psl = PublicSuffixList()
        self._url = None
        self._retval = {}
        self.ip_as_host = False

    def _clear(self):
        self.decoded = False
        self._url = None
        self._retval = {}
        self.ip_as_host = False

    def decode(self, url) -> None:
        """
        This function creates a dict of all the url fields.
        :param url: The URL to normalize
        """
        self._clear()
        if isinstance(url, bytes) and b'//' not in url[:10]:
            url = b'//' + url
        elif '//' not in url[:10]:
            url = '//' + url
        self._url = urlparse(url)

        self.ip_as_host = False
        hostname = _ensure_str(self._url.hostname)
        try:
            ipv4_bytes = socket.inet_aton(_ensure_str(hostname))
            ipv4 = ipaddress.IPv4Address(ipv4_bytes)
            self.ip_as_host = ipv4.compressed
        except (OSError, ValueError):
            try:
                addr, _, _ = hostname.partition('%')
                ipv6 = ipaddress.IPv6Address(addr)
                self.ip_as_host = ipv6.compressed
            except ValueError:
                pass

        self.decoded = True
        self._retval = {}

    @property
    def url(self):
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")

        netloc = self.get_host() + ('' if self.get_port() is None else ':{}'.format(self.get_port()))
        return _ensure_bytes(
            urlunparse((
                self.get_scheme(),
                netloc,
                self.get_resource_path(),
                '',
                self.get_query_string(),
                self.get_fragment(),
            ))
        )

    def get_scheme(self):
        """
        Get the scheme of the url given in the decode function
        :returns: The URL scheme
        """
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        return _ensure_str(self._url.scheme)

    def get_credential(self):
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        if self._url.password:
            return _ensure_str(self._url.username) + ':' + _ensure_str(self._url.password)
        if self._url.username:
            return _ensure_str(self._url.username)

    def get_subdomain(self):
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        if self.get_host() is not None and not self.ip_as_host:
            if self.get_domain() in self.get_host():
                return self.get_host().rsplit(self.get_domain(), 1)[0].rstrip('.') or None

    def get_domain(self):
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        if self.get_host() is not None and not self.ip_as_host:
            return self.psl.privatesuffix(self.get_host())

    def get_domain_without_tld(self):
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        if self.get_tld() is not None and not self.ip_as_host:
            return self.get_domain().rsplit(self.get_tld(), 1)[0].rstrip('.')

    def get_host(self):
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        if self._url.hostname is None:
            return None
        elif self._url.hostname.isascii():
            return _ensure_str(self._url.hostname)
        else:
            return _ensure_str(idna.encode(self._url.hostname, uts46=True))

    def get_unicode_host(self):
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        if not self.ip_as_host:
            return idna.decode(self.get_host(), uts46=True)

    def get_tld(self):
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        if self.get_host() is not None and not self.ip_as_host:
            return self.psl.publicsuffix(self.get_host())

    def get_port(self):
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        return self._url.port

    def get_resource_path(self):
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        return _ensure_str(self._url.path)

    def get_query_string(self):
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        return _ensure_str(self._url.query)

    def get_fragment(self):
        if not self.decoded:
            raise UrlNotDecoded("You must call faup.decode() first")
        return _ensure_str(self._url.fragment)

    def get(self):
        self._retval["scheme"] = self.get_scheme()
        self._retval["tld"] = self.get_tld()
        self._retval["domain"] = self.get_domain()
        self._retval["domain_without_tld"] = self.get_domain_without_tld()
        self._retval["subdomain"] = self.get_subdomain()
        self._retval["host"] = self.get_host()
        self._retval["port"] = self.get_port()
        self._retval["resource_path"] = self.get_resource_path()
        self._retval["query_string"] = self.get_query_string()
        self._retval["fragment"] = self.get_fragment()
        self._retval["url"] = self.url
        return self._retval

import os

import requests
import yaml
from publicsuffixlist import PublicSuffixList

import adutil

psl = PublicSuffixList()
filter_file = os.path.join(adutil.project_root, 'filters', 'mono.yml')

with open(filter_file) as f:
    saved_data = yaml.safe_load(f)
known_domains = set(saved_data['domains'])

vault_reply = requests.get(
    'https://otx.alienvault.com/otxapi/indicator/IPv4/passive_dns/134.209.136.68'
).json()
for entry in vault_reply['passive_dns']:
    hostname = psl.privatesuffix(entry['hostname'])
    known_domains.add(hostname)

known_domains = sorted(known_domains)
extra_hosts = []
for domain in known_domains:
    for i in range(10):
        extra_hosts.append('%d.%s' % (i, domain))

with open(filter_file, 'w') as f:
    f.write("# Don't bother manually updating this file.\n")
    f.write("# It is automatically updated with the tools/update-mono-list.py script.\n")
    yaml.dump({'domains': known_domains, 'extra_hosts': extra_hosts}, f)

class gdyd():
    def __init__(self):
        self.psl = PublicSuffixList(accept_unknown=False)
        self.filter = Filter()

    def statistic_single_hour(self, hour_dir, day, hour: int):
        counter = Counter()
        for minute_file in os.listdir(hour_dir):
            bzfile = os.path.join(hour_dir, minute_file)
            try:
                file_point = bz2.open(bzfile, 'r')
                for line in file_point:
                    try:
                        line = line.decode().strip()
                        linesplit = line.split(',')
                        querydomain = linesplit[3].strip().lower()
                        if self.filter.isValidDomain(querydomain):
                            prisuf = self.psl.privatesuffix(querydomain)
                            if prisuf is not None and prisuf not in self.filter.sf.AleaxTop and \
                                    prisuf not in self.filter.sf.CDNSet and \
                                    prisuf not in self.filter.sf.commonset:
                                counter[prisuf] += 1
                                if prisuf != querydomain:
                                    front = querydomain[:querydomain.rindex(prisuf) - 1]
                                    front_s = front.rsplit(".", 1)
                                    if len(front_s) != 0:
                                        ThreeLD = "{}.{}".format(front_s[len(front_s) - 1], prisuf)
                                        counter[ThreeLD] += 1
                    except:
                        pass
                file_point.close()
            except:
                print("error : {}".format(bzfile))
            print("{} finish".format(bzfile))
        print("{}{} write".format(day, hour))
        with open("../result_data/temp/{}{}.json".format(day, hour), "w") as f:
            f.write(json.dumps(counter))

    def all_day_counter(self,
                        rootpath="/home/public/DNS_Project/pdns_gddx_compressed/gdyd",
                        days=["20180502", "20180503", "20180504"]):
        s = time.time()
        number = 24
        pool = Pool(number)
        for day in days:
            daydir = os.path.join(rootpath, "dt={}".format(day))
            for h in range(24):
                hourdir = os.path.join(daydir, "hour={0:02d}".format(h))
                if os.path.exists(hourdir):
                    pool.apply_async(func=self.statistic_single_hour,
                                     args=(hourdir, day, h))
                else:
                    print("path error")
        pool.close()
        pool.join()
        e = time.time()
        print("spend time : {} minutes".format((e - s) / 60))

    def get_counter(self, days=["20180502", "20180503", "20180504"]):
        root_dir = "/home/yandingkui/Pontus/result_data/temp/"
        for day in days:
            counter = Counter()
            for i in range(24):
                path = os.path.join(root_dir, "{}{}.json".format(day, i))
                if os.path.exists(path):
                    with open(path, "r") as f:
                        counter1 = Counter(json.loads(f.read()))
                    counter.update(counter1)
            with open("{}{}.json".format(root_dir, day), "w") as f:
                f.write(json.dumps(counter))

    def remove_file(self,
                    days=["20180427", "20180428", "20180429", "20180430", "20180501"]):
        root_dir = "/home/yandingkui/Pontus/result_data/temp/"
        for day in days:
            for i in range(24):
                path = os.path.join(root_dir, "{}{}.json".format(day, i))
                if os.path.exists(path):
                    os.remove(path)

    def getBenignDomains(self, days=["20180502", "20180503"]):
        root_dir = "/home/yandingkui/Pontus/result_data/temp/"
        for day in days:
            with open(os.path.join(root_dir, "{}.json".format(day)), "r") as f:
                counter = Counter(json.loads(f.read()))
            data = []
            for item in counter.most_common(30000):
                data.append(item[0])
            with open("../data_sets/yd_{}".format(day), "w") as F:
                F.write("\n".join(data))

    def dxvsyd(self, days=["20180427", "20171031"]):
        yd = "/home/yandingkui/Pontus/result_data/temp/20180427.json"
        dx = "/home/yandingkui/Pontus/result_data/gddx/20171031.json"
        with open(yd, "r") as f:
            counter1 = Counter(json.loads(f.read()))
        with open(dx, "r") as f:
            counter2 = Counter(json.loads(f.read()))
        s1 = []
        s2 = []
        for item in counter1.most_common(30000):
            s1.append(item[0])
        for item in counter2.most_common(30000):
            s2.append(item[0])
        with open("../result_data/yd_20180427", "w") as f:
            f.write("\n".join(s1))
        with open("../result_data/dx_20171031", "w") as f:
            f.write("\n".join(s2))

def get_suspicious(year, month, day):
    timestring = "{}{:0>2d}{:0>2d}".format(year, month, day)
    suspicious_domains_set = set()

    if os.path.exists("../result_data/{}domains.txt".format(timestring)):
        with open("../result_data/{}domains.txt".format(timestring), "r") as f:
            for r in f:
                suspicious_domains_set.add(r.strip())
        check_active_domains(suspicious_domains_set, timestring)
    else:
        init_domain_set = set()
        # Get all domains
        for hour in range(24):
            file_path = "{}{:0>2d}{:0>2d}{:0>2d}".format(year, month, day, hour)
            if not os.path.exists("../result_data/{}".format(file_path)):
                continue
            with open("../result_data/{}".format(file_path), "r") as f:
                for r in f:
                    domain = r.strip().split(",")[1]
                    init_domain_set.add(domain)

        psl = PublicSuffixList()
        domain_labels = []
        labels_labels = []
        i = 0
        # Get labels
        domains_list = list(init_domain_set)
        for d in domains_list:
            s = d[:d.index(psl.publicsuffix(d)) - 1]
            for l in s.split("."):
                if len(l) > 0:
                    domain_labels.append(l)
                    labels_labels.append(i)
            i = i + 1

        features_path = "../result_data/{}_features.npy".format(timestring)
        if os.path.exists(features_path):
            features = np.load(features_path)
        else:
            features = extract_all_features(domain_labels)
            np.save(features_path, features)

        # The classifier identifies labels
        clf = joblib.load("../result_data/ac_model.m")
        pred_labels = clf.predict(features)
        domain_index = set()
        for i in range(len(labels_labels)):
            if pred_labels[i] == 1:
                domain_index.add(labels_labels[i])

        # Get suspicious domains
        for index in domain_index:
            ps = psl.privatesuffix(domains_list[index])
            if ps is None:
                continue
            suspicious_domains_set.add(ps)

        print("{} domains".format(len(suspicious_domains_set)))
        with open("../result_data/{}domains.txt".format(timestring), "w") as f:
            f.write("\n".join(suspicious_domains_set))
        print("save finish")

        # dgarchive check
        check_active_domains(suspicious_domains_set, timestring)

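One caveat on the `d[:d.index(psl.publicsuffix(d)) - 1]` slice used above (and in the training script earlier): str.index matches the first occurrence of the suffix text, which mis-slices names whose left-hand labels repeat the suffix (the gdyd class sidesteps this with rindex). A sketch of a length-based alternative; the helper name is hypothetical:

from publicsuffixlist import PublicSuffixList

psl = PublicSuffixList()

def labels_without_public_suffix(domain):
    # Slice from the end by suffix length rather than str.index(), which
    # would match the first occurrence and mis-slice names such as
    # "com.example.com".
    suffix = psl.publicsuffix(domain)
    if suffix is None or suffix == domain:
        return []
    return domain[:-(len(suffix) + 1)].split(".")

assert labels_without_public_suffix("com.example.com") == ["com", "example"]
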
class TestPSL(unittest.TestCase):

    def setUp(self):
        self.psl = PublicSuffixList()

    def test_typesafe(self):
        self.assertEqual(self.psl.suffix("www.example.co.jp").__class__, "example.co.jp".__class__)
        self.assertEqual(self.psl.suffix(u("www.example.co.jp")).__class__, u("example.co.jp").__class__)
        self.assertEqual(self.psl.publicsuffix("www.example.co.jp").__class__, "co.jp".__class__)
        self.assertEqual(self.psl.publicsuffix(u("www.example.co.jp")).__class__, u("co.jp").__class__)

    def test_uppercase(self):
        self.assertEqual(self.psl.suffix("wWw.eXaMpLe.cO.Jp"), "example.co.jp")
        self.assertEqual(self.psl.publicsuffix("wWw.eXaMpLe.cO.Jp"), "co.jp")

    def test_invaliddomain(self):
        self.assertEqual(self.psl.suffix("www..invalid"), None)
        self.assertEqual(self.psl.suffix(".example.com"), None)
        self.assertEqual(self.psl.suffix("example.com."), None)
        self.assertEqual(self.psl.suffix(""), None)
        self.assertEqual(self.psl.publicsuffix("www..invalid"), None)
        self.assertEqual(self.psl.publicsuffix(".example.com"), None)
        self.assertEqual(self.psl.publicsuffix("example.com."), None)
        self.assertEqual(self.psl.publicsuffix(""), None)

    def test_idn(self):
        tld = u("香港")
        self.assertEqual(self.psl.suffix(u("www.example.") + tld), u("example.") + tld)
        self.assertEqual(self.psl.publicsuffix(u("www.example.") + tld), tld)

    def test_punycoded(self):
        tld = encode_idn(u("香港"))
        self.assertEqual(self.psl.suffix(u("www.example.") + tld), u("example.") + tld)
        self.assertEqual(self.psl.publicsuffix(u("www.example.") + tld), tld)

    def test_suffix_deny_public(self):
        self.assertEqual(self.psl.suffix("com"), None)
        self.assertEqual(self.psl.suffix("co.jp"), None)
        self.assertEqual(self.psl.suffix("example.nagoya.jp"), None)

    def test_unknown(self):
        self.assertEqual(self.psl.suffix("www.example.unknowntld"), "example.unknowntld")
        self.assertEqual(self.psl.suffix("unknowntld"), None)
        self.assertEqual(self.psl.publicsuffix("www.example.unknowntld"), "unknowntld")
        self.assertEqual(self.psl.publicsuffix("unknowntld"), "unknowntld")

    def test_deny_unknown(self):
        source = """
known
"""
        psl = PublicSuffixList(source.splitlines(), accept_unknown=False)
        self.assertEqual(psl.suffix("www.example.unknowntld"), None)

    def test_custom_psl(self):
        source = """
invalid
*.invalid
!test.invalid
"""
        psl = PublicSuffixList(source.splitlines())
        self.assertEqual(psl.suffix("example.invalid"), None)
        self.assertEqual(psl.suffix("test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("some.test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("aaa.bbb.ccc.invalid"), "bbb.ccc.invalid")
        self.assertEqual(psl.publicsuffix("example.invalid"), "example.invalid")
        self.assertEqual(psl.publicsuffix("test.invalid"), "invalid")

    def test_publicsuffix(self):
        self.assertEqual(self.psl.publicsuffix("www.example.com"), "com")
        self.assertEqual(self.psl.publicsuffix("unknowntld"), "unknowntld")

    def test_wildcard(self):
        self.assertEqual(self.psl.suffix("test.example.nagoya.jp"), "test.example.nagoya.jp")
        self.assertEqual(self.psl.suffix("example.nagoya.jp"), None)
        self.assertEqual(self.psl.publicsuffix("example.nagoya.jp"), "example.nagoya.jp")
        self.assertEqual(self.psl.publicsuffix("test.example.nagoya.jp"), "example.nagoya.jp")

    def test_checkpublicsuffix_script(self):
        regex = re.compile(r"^checkPublicSuffix\(('[^']+'), (null|'[^']+')\);")
        with open(os.path.join(os.path.dirname(__file__), "test_psl.txt"), "rb") as f:
            ln = 0
            for line in f:
                ln += 1
                l = line.decode("utf-8")
                m = regex.match(l)
                if not m:
                    continue
                arg = m.group(1).strip("'")
                res = None if m.group(2) == "null" else m.group(2).strip("'")
                self.assertEqual(self.psl.suffix(arg), res, "in line {0}: {1}".format(ln, line.strip()))

    def test_typeerror(self):
        self.assertRaises(TypeError, lambda: self.psl.suffix(None))
        self.assertRaises(TypeError, lambda: self.psl.suffix(1))
        if b("") != "":
            # python3
            self.assertRaises(TypeError, lambda: self.psl.suffix(b("www.example.com")))

    def test_compatclass(self):
        from publicsuffixlist.compat import PublicSuffixList
        psl = PublicSuffixList()
        self.assertEqual(psl.get_public_suffix("test.example.com"), "example.com")
        self.assertEqual(psl.get_public_suffix("com"), "")
        self.assertEqual(psl.get_public_suffix(""), "")

    def test_unsafecompatclass(self):
        from publicsuffixlist.compat import UnsafePublicSuffixList
        psl = UnsafePublicSuffixList()
        self.assertEqual(psl.get_public_suffix("test.example.com"), "example.com")
        self.assertEqual(psl.get_public_suffix("com"), "com")
        self.assertEqual(psl.get_public_suffix(""), "")

    def test_toomanylabels(self):
        d = "a." * 1000000 + "example.com"
        self.assertEqual(self.psl.publicsuffix(d), "com")
        self.assertEqual(self.psl.privatesuffix(d), "example.com")

    def test_flatstring(self):
        psl = PublicSuffixList(u("com\nnet\n"))
        self.assertEqual(psl.publicsuffix("example.com"), "com")

    def test_flatbytestring(self):
        psl = PublicSuffixList(b("com\nnet\n"))
        self.assertEqual(psl.publicsuffix("example.com"), "com")

    def test_privateparts(self):
        psl = self.psl
        self.assertEqual(psl.privateparts("aaa.www.example.com"), ("aaa", "www", "example.com"))

    def test_noprivateparts(self):
        psl = self.psl
        self.assertEqual(psl.privateparts("com"), None)  # no private part

    def test_reconstructparts(self):
        psl = self.psl
        self.assertEqual(".".join(psl.privateparts("aaa.www.example.com")), "aaa.www.example.com")

    def test_subdomain(self):
        psl = self.psl
        self.assertEqual(psl.subdomain("aaa.www.example.com", depth=0), "example.com")
        self.assertEqual(psl.subdomain("aaa.www.example.com", depth=1), "www.example.com")
        self.assertEqual(psl.subdomain("aaa.www.example.com", depth=2), "aaa.www.example.com")
        self.assertEqual(psl.subdomain("aaa.www.example.com", depth=3), None)  # no sufficient depth

class TestPSL(unittest.TestCase):

    def setUp(self):
        self.psl = PublicSuffixList()

    def test_typesafe(self):
        self.assertEqual(self.psl.suffix("www.example.co.jp").__class__, "example.co.jp".__class__)
        self.assertEqual(self.psl.suffix(u("www.example.co.jp")).__class__, u("example.co.jp").__class__)
        self.assertEqual(self.psl.publicsuffix("www.example.co.jp").__class__, "co.jp".__class__)
        self.assertEqual(self.psl.publicsuffix(u("www.example.co.jp")).__class__, u("co.jp").__class__)

    def test_uppercase(self):
        self.assertEqual(self.psl.suffix("wWw.eXaMpLe.cO.Jp"), "example.co.jp")
        self.assertEqual(self.psl.publicsuffix("wWw.eXaMpLe.cO.Jp"), "co.jp")

    def test_invaliddomain(self):
        self.assertEqual(self.psl.suffix("www..invalid"), None)
        self.assertEqual(self.psl.suffix(".example.com"), None)
        self.assertEqual(self.psl.suffix("example.com."), None)
        self.assertEqual(self.psl.suffix(""), None)
        self.assertEqual(self.psl.publicsuffix("www..invalid"), None)
        self.assertEqual(self.psl.publicsuffix(".example.com"), None)
        self.assertEqual(self.psl.publicsuffix("example.com."), None)
        self.assertEqual(self.psl.publicsuffix(""), None)

    def test_idn(self):
        tld = u("香港")
        self.assertEqual(self.psl.suffix(u("www.example.") + tld), u("example.") + tld)
        self.assertEqual(self.psl.publicsuffix(u("www.example.") + tld), tld)

    def test_punycoded(self):
        tld = encode_idn(u("香港"))
        self.assertEqual(self.psl.suffix(u("www.example.") + tld), u("example.") + tld)
        self.assertEqual(self.psl.publicsuffix(u("www.example.") + tld), tld)

    def test_suffix_deny_public(self):
        self.assertEqual(self.psl.suffix("com"), None)
        self.assertEqual(self.psl.suffix("co.jp"), None)
        self.assertEqual(self.psl.suffix("example.nagoya.jp"), None)

    def test_unknown(self):
        self.assertEqual(self.psl.suffix("www.example.unknowntld"), "example.unknowntld")
        self.assertEqual(self.psl.suffix("unknowntld"), None)
        self.assertEqual(self.psl.publicsuffix("www.example.unknowntld"), "unknowntld")
        self.assertEqual(self.psl.publicsuffix("unknowntld"), "unknowntld")

    def test_deny_unknown(self):
        source = """
known
"""
        psl = PublicSuffixList(source.splitlines(), accept_unknown=False)
        self.assertEqual(psl.suffix("www.example.unknowntld"), None)

    def test_custom_psl(self):
        source = """
invalid
*.invalid
!test.invalid
"""
        psl = PublicSuffixList(source.splitlines())
        self.assertEqual(psl.suffix("example.invalid"), None)
        self.assertEqual(psl.suffix("test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("some.test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("aaa.bbb.ccc.invalid"), "bbb.ccc.invalid")
        self.assertEqual(psl.publicsuffix("example.invalid"), "example.invalid")
        self.assertEqual(psl.publicsuffix("test.invalid"), "invalid")

    def test_publicsuffix(self):
        self.assertEqual(self.psl.publicsuffix("www.example.com"), "com")
        self.assertEqual(self.psl.publicsuffix("unknowntld"), "unknowntld")

    def test_wildcard(self):
        self.assertEqual(self.psl.suffix("test.example.nagoya.jp"), "test.example.nagoya.jp")
        self.assertEqual(self.psl.suffix("example.nagoya.jp"), None)
        self.assertEqual(self.psl.publicsuffix("example.nagoya.jp"), "example.nagoya.jp")
        self.assertEqual(self.psl.publicsuffix("test.example.nagoya.jp"), "example.nagoya.jp")

    def test_checkpublicsuffix_script(self):
        regex = re.compile(r"^checkPublicSuffix\(('[^']+'), (null|'[^']+')\);")
        with open(os.path.join(os.path.dirname(__file__), "test_psl.txt"), "rb") as f:
            ln = 0
            for line in f:
                ln += 1
                l = line.decode("utf-8")
                m = regex.match(l)
                if not m:
                    continue
                arg = m.group(1).strip("'")
                res = None if m.group(2) == "null" else m.group(2).strip("'")
                self.assertEqual(self.psl.suffix(arg), res, "in line {0}: {1}".format(ln, line.strip()))

    def test_typeerror(self):
        self.assertRaises(TypeError, lambda: self.psl.suffix(None))
        self.assertRaises(TypeError, lambda: self.psl.suffix(1))
        if b("") != "":
            # python3
            self.assertRaises(TypeError, lambda: self.psl.suffix(b("www.example.com")))

    def test_compatclass(self):
        from publicsuffixlist.compat import PublicSuffixList
        psl = PublicSuffixList()
        self.assertEqual(psl.get_public_suffix("test.example.com"), "example.com")
        self.assertEqual(psl.get_public_suffix("com"), "")
        self.assertEqual(psl.get_public_suffix(""), "")

    def test_unsafecompatclass(self):
        from publicsuffixlist.compat import UnsafePublicSuffixList
        psl = UnsafePublicSuffixList()
        self.assertEqual(psl.get_public_suffix("test.example.com"), "example.com")
        self.assertEqual(psl.get_public_suffix("com"), "com")
        self.assertEqual(psl.get_public_suffix(""), "")

    def test_toomanylabels(self):
        d = "a." * 1000000 + "example.com"
        self.assertEqual(self.psl.publicsuffix(d), "com")
        self.assertEqual(self.psl.privatesuffix(d), "example.com")

    def test_flatstring(self):
        psl = PublicSuffixList(u("com\nnet\n"))
        self.assertEqual(psl.publicsuffix("example.com"), "com")

    def test_flatbytestring(self):
        psl = PublicSuffixList(b("com\nnet\n"))
        self.assertEqual(psl.publicsuffix("example.com"), "com")