def init(self):
    if self.field not in ALLOWED_FIELDS:
        raise InvalidArgument('key', got=self.field, expected=ALLOWED_FIELDS)

    with codecs.open(self.suffix_file, encoding='UTF-8') as file_handle:
        self.psl = PublicSuffixList(source=file_handle, only_icann=True)
def test_deny_unknown(self):
    source = """
known
"""
    psl = PublicSuffixList(source.splitlines(), accept_unknown=False)
    self.assertEqual(psl.suffix("www.example.unknowntld"), None)
def is_email_blacklisted(address):
    """
    Determines if a supplied email address is present in the 'emailblacklist' table.

    Parameters:
        address: The email address to split out the domain from.

    Returns:
        Boolean True if present on the blacklist, or False otherwise.
    """
    _, domain = address.rsplit("@", 1)
    psl = PublicSuffixList()
    private_suffix = psl.privatesuffix(domain=domain)

    # Check the disposable email address list
    disposable_domains = _retrieve_disposable_email_domains()
    if private_suffix in disposable_domains:
        return True

    # Check the explicitly defined/blacklisted domains.
    blacklisted_domains = d.engine.execute("""
        SELECT domain_name
        FROM emailblacklist
    """).fetchall()
    for site in blacklisted_domains:
        if private_suffix == site['domain_name']:
            return True

    # If we get here, the domain (or subdomain) is not blacklisted
    return False
def test_compatclass(self):
    from publicsuffixlist.compat import PublicSuffixList
    psl = PublicSuffixList()
    self.assertEqual(psl.get_public_suffix("test.example.com"), "example.com")
    self.assertEqual(psl.get_public_suffix("com"), "")
    self.assertEqual(psl.get_public_suffix(""), "")
def reduce_domain(domain_in):
    if not PublicSuffixList().publicsuffix(domain_in, accept_unknown=False):
        return None
    domain = PublicSuffixList().privatesuffix(domain_in)
    if domain:
        domain = domain.lower()
    else:
        log.debug("No eTLD for {}".format(domain_in))
    log.debug("Trimmed domain from {0} to {1}".format(domain_in, domain))
    return domain
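# Illustrative usage sketch for reduce_domain above (added example, not from the
# original source). It assumes the module-level `log` used inside the function is
# an ordinary logging.Logger; the sample domains are assumptions as well.
import logging
log = logging.getLogger(__name__)

print(reduce_domain("www.example.co.uk"))    # -> "example.co.uk" (eTLD+1, lowercased)
print(reduce_domain("host.nonexistenttld"))  # unknown suffix with accept_unknown=False -> None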
def _check_same_origin(self, current_url):
    '''Check whether the given URL shares the same origin as the crawl target.'''
    current_url = to_unicode(current_url)
    url_part = urlparse.urlparse(current_url)
    # url_part_list = url_part.netloc.split('.')
    psl2 = PublicSuffixList()
    url_origin = psl2.privatesuffix(url_part.netloc)
    return url_origin == self.origin
def get_whois(self, name):
    try:
        domain = urlparse.urlparse(self.target).netloc
        # if domain is an IP address, stop querying whois
        result1 = re.search("\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", domain)
        if result1:
            return
        # remove port
        result2 = re.search("\:\d{1,5}$", domain)
        if result2:
            domain = domain.split(":")[0]
        # get domain's ip
        try:
            ip = socket.gethostbyname(domain)
            self.cms_list.add("IP:" + ip)
        except Exception as e:
            # print e
            pass
        # if re.match("^www\.", domain):
        #     domain = domain.strip("www.")
        psl = PublicSuffixList()
        domain = psl.suffix(domain)
        who = pythonwhois.get_whois(domain)  # get whois
        if who["contacts"]["registrant"]["name"] is not None:
            self.cms_list.add("Domain_User:" + who["contacts"]["registrant"]["name"].encode("utf8"))
        if who["contacts"]["registrant"]["email"] is not None:
            self.cms_list.add("Domain_Email:" + who["contacts"]["registrant"]["email"].encode("utf8"))
        if who["contacts"]["registrant"]["phone"] is not None:
            self.cms_list.add("Domain_Phone:" + who["contacts"]["registrant"]["phone"].encode("utf8"))
        if who["registrar"] is not None:
            self.cms_list.add("Domain_Registrar:" + who["registrar"][0].encode("utf8"))
        if who["nameservers"] is not None:
            name_servers = []
            for i in who["nameservers"]:
                name_servers.append(i.encode('UTF8'))
            self.cms_list.add("Domain_name_servers:" + str(name_servers).encode("utf8"))
    except Exception:
        # the snippet is truncated here; the original handler is not shown, so
        # errors are swallowed the same way the inner handlers do
        pass
def static_num(file_path):
    psl = PublicSuffixList()
    result = [0, 0, 0]
    with open(file_path, "r") as f:
        for r in f:
            d = r.strip().split(",")[0]
            d_strip = d[:d.rindex(psl.publicsuffix(d)) - 1].split(".")
            if len(d_strip) == 1:
                result[0] += 1
            elif len(d_strip) == 2:
                result[1] += 1
            else:
                result[2] += 1
    print(result)
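# Illustrative usage sketch for static_num above (added example; the file name and
# contents are assumptions). Each CSV row's first field is a domain; the function
# buckets domains by how many labels sit left of the public suffix.
with open("/tmp/domains.csv", "w") as f:
    f.write("example.com,seen\n")        # 1 label before the suffix -> result[0]
    f.write("mail.example.com,seen\n")   # 2 labels -> result[1]
    f.write("a.b.example.com,seen\n")    # 3+ labels -> result[2]
static_num("/tmp/domains.csv")           # prints [1, 1, 1]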
def feed_url(self, url):
    '''Set the initial URL to crawl.'''
    if isinstance(url, basestring):
        url = to_unicode(url)
        url = UrlData(url)
    if self.same_origin:
        url_part = urlparse.urlparse(unicode(url))
        psl = PublicSuffixList()
        self.origin = psl.privatesuffix(url_part.netloc)
    self.fetcher_queue.put(url, block=True)
def get_domain_name(url: str) -> str:
    """
    Gets the domain name of a URL, removing the TLD

    :param url: URL to find domain of
    """
    # Sanitise the URL, removing protocol and directories
    url = url.split("://")[-1]
    url = url.split("/")[0]
    url = url.split(":")[0]

    # Get the public suffix
    public_suffix = PublicSuffixList()
    url_tld = public_suffix.publicsuffix(url)

    # Return the last label before the public suffix
    return url[:-len(url_tld) - 1].split(".")[-1]
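# Illustrative usage sketch for get_domain_name above (added example; the sample
# URL and expected output are assumptions, not from the original source).
print(get_domain_name("https://blog.example.co.uk:8443/post/1"))  # -> "example"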
def check_for_third_level_domains(filename):
    with open("public_suffix_list.dat", "r") as latest:
        psl = PublicSuffixList(latest)

    invalid = {
        line
        for line in files[filename]
        if len(psl.privateparts(line.strip())) > 1
    }

    if invalid:
        print(
            "The following domains contain a third or lower level domain in {!r}:"
            .format(filename))
        for line in sorted(invalid):
            print("* {}".format(line))
        sys.exit(1)
def filter2LDAleax():
    psl = PublicSuffixList()
    data = []
    with open("../data_sets/Aleax", "r") as f:
        for r in f:
            d = r.strip()
            d1 = d[:d.rindex(psl.publicsuffix(d)) - 1]
            if len(d1) == 0:
                continue
            d_split = d1.split(".")
            if len(d_split) == 1 and len(d_split[0]) != 0:
                data.append(d)
    print(len(data))
    with open("../data_sets/Aleax2LD", "w") as f:
        f.write("\n".join(data))
def updatePSL(psl_file=PSLFILE):
    """ Updates a local copy of PSL file

    :param psl_file: path for the file to store the list. Default: PSLFILE
    """
    if requests is None:
        raise Exception("Please install python-requests http(s) library. $ sudo pip install requests")

    r = requests.get(PSLURL)
    if r.status_code != requests.codes.ok or len(r.content) == 0:
        raise Exception("Could not download PSL from " + PSLURL)
    lastmod = r.headers.get("last-modified", None)

    f = open(psl_file + ".swp", "wb")
    f.write(r.content)
    f.close()

    # sanity check the downloaded data before replacing the live file
    with open(psl_file + ".swp", "rb") as f:
        psl = PublicSuffixList(f)

    os.rename(psl_file + ".swp", psl_file)
    if lastmod:
        t = time.mktime(parsedate(lastmod))
        os.utime(psl_file, (t, t))

    print("PSL updated")
    if lastmod:
        print("last-modified: " + lastmod)
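# Illustrative usage sketch for updatePSL above (added example). PSLFILE and PSLURL
# are the module-level constants the function already references; the wiring below
# is an assumption, not part of the original source.
updatePSL()                        # download the list and atomically replace PSLFILE
with open(PSLFILE, "rb") as f:
    psl = PublicSuffixList(f)      # reload the freshly written copy
print(psl.publicsuffix("www.example.com"))  # -> "com"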
def getAllDomainLabels(self, domains):
    labels = []
    index = []
    psl = PublicSuffixList()
    for i in range(len(domains)):
        d = domains[i].strip()
        pub = psl.publicsuffix(d)
        d_split = d[:d.rindex(pub) - 1].split(".")
        if len(d_split) > 2:
            print("d:{} pub:{}".format(d, pub))
        for l in d_split:
            if len(l) == 0:
                print("empty label")
            labels.append(l)
            index.append(i)
    return labels, index
def get_tld_esld(PSL, DOMAIN):
    # Outputs the pairs (TopLevelDomain, EffectiveSecondLevelDomain) for a
    # given domain (string) provided in input.
    if not isinstance(DOMAIN, string_types):
        tld, esld = None, None
    else:
        # remove leading/trailing '.' characters
        while DOMAIN.endswith("."):
            DOMAIN = DOMAIN[:-1]
        while DOMAIN.startswith("."):
            DOMAIN = DOMAIN[1:]
        if len(DOMAIN) == 0:
            tld, esld = None, None
        else:
            try:
                # information about TLDs
                tld = PSL.publicsuffix(DOMAIN)
            except Exception:
                tld = PublicSuffixList().publicsuffix(DOMAIN)
            if tld is None:
                esld = None
            else:
                if tld == DOMAIN:
                    esld = tld
                else:
                    # we obtain the ESLD by removing the TLD from 'DOMAIN'
                    udn = DOMAIN[:-len(tld) - 1]
                    # find the rightmost '.' and extract the ESLD
                    i = udn.rfind(".")
                    esld = udn[i + 1:] + '.' + tld
    return tld, esld
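# Illustrative usage sketch for get_tld_esld above (added example; sample inputs
# are assumptions, not from the original source).
psl = PublicSuffixList()
print(get_tld_esld(psl, "www.example.co.uk"))  # -> ("co.uk", "example.co.uk")
print(get_tld_esld(psl, None))                 # non-string input -> (None, None)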
def load_psl():
    global psl

    # Fetch PublicSuffix list and load it
    # if not psl:
    print('Loading Public Suffix List')
    psl_file = pfetch()
    psl = PublicSuffixList(psl_file)
def main(arguments):
    suffix_detected = False
    psl = None
    download_suffixes()
    with open("public_suffix_list.dat", "r") as latest:
        psl = PublicSuffixList(latest)
    with io.open('disposable_email_blacklist.conf', 'r') as deb:
        for i, line in enumerate(deb):
            current_line = line.strip()
            public_suffix = psl.publicsuffix(current_line)
            if public_suffix == current_line:
                print(f'The line number {i+1} contains just a public suffix: {current_line}')
                suffix_detected = True
    if suffix_detected:
        print('At least one valid public suffix found in the blacklist, please remove it. '
              'See https://publicsuffix.org for details on why this shouldn\'t be blacklisted.')
        sys.exit(1)
class DomainSuffixExpertBot(Bot):
    suffixes = {}

    def init(self):
        self.field = self.parameters.field
        if self.field not in ALLOWED_FIELDS:
            raise InvalidArgument('key', got=self.field, expected=ALLOWED_FIELDS)

        with codecs.open(self.parameters.suffix_file, encoding='UTF-8') as file_handle:
            self.psl = PublicSuffixList(source=file_handle, only_icann=True)

    def process(self):
        event = self.receive_message()

        for space in ('source', 'destination'):
            key = '.'.join((space, self.field))
            if key not in event:
                continue
            event['.'.join((space, 'domain_suffix'))] = self.psl.publicsuffix(domain=event[key])

        self.send_message(event)
        self.acknowledge_message()
class Filter():
    def __init__(self):
        self.psl = PublicSuffixList(accept_unknown=False)
        self.sf = SingleFilter(100000, self.psl)

    def isValidDomain(self, domain: str):
        if self.sf.isValidDomain(domain) and (not self.sf.inWhiteList(domain)):
            return True
        else:
            return False

    def Two_Three_level_domain(self, domain: str):
        """
        identify a domain
        :param domain: domain:str
        :return: bool
        """
        publicsuffix = self.psl.publicsuffix(domain)
        if publicsuffix is None:
            return False
        pre_domain = domain[:domain.rindex(publicsuffix) - 1]
        if len(pre_domain) == 0:
            return False
        pre_domain_array = pre_domain.split(".")
        length = len(pre_domain_array)
        if length == 2 or length == 1:
            return True
        else:
            return False
def eventdb_apply(host, port, database, username, password, table, dry_run, where, filename):
    if password:
        password = input('Password for user %r on %r: ' % (username, host))
    where = 'AND ' + where if where else ''

    con1 = psycopg2.connect(user=username, password=password, database=database,
                            host=host, port=port)
    cur1 = con1.cursor(cursor_factory=DictCursor)
    con2 = psycopg2.connect(user=username, password=password, database=database,
                            host=host, port=port)
    con2.autocommit = True
    cur2 = con2.cursor(cursor_factory=DictCursor)

    cur1.execute('''
        SELECT id, "source.fqdn", "destination.fqdn"
        FROM {table}
        WHERE ("source.fqdn" IS NOT NULL OR "destination.fqdn" IS NOT NULL)
        {where}
    '''.format(table=table, where=where))

    psl = PublicSuffixList(only_icann=True)
    counter = 0
    for row in cur1:
        counter += 1
        if row['source.fqdn']:
            cur2.execute('update events set "source.domain_suffix" = %s where id = %s',
                         (psl.publicsuffix(row['source.fqdn'].encode('idna').decode()),
                          row['id']))
        if row['destination.fqdn']:
            cur2.execute('update events set "destination.domain_suffix" = %s where id = %s',
                         (psl.publicsuffix(row['destination.fqdn'].encode('idna').decode()),
                          row['id']))
    con2.commit()
    print("Changed %d rows" % counter)
def get2subdomain(root_dir="/home/public/2019-01-07-dgarchive_full"):
    result = dict()
    psl = PublicSuffixList()
    for filename in os.listdir(root_dir):
        with open("{}/{}".format(root_dir, filename), "r") as f:
            for r in f:
                d = r.strip().split(",")[0]
                d_strip = d[:d.rindex(psl.publicsuffix(d)) - 1].split(".")
                if len(d_strip) == 2:
                    domains = result.get(filename)
                    if domains is None:
                        domains = set()
                        result[filename] = domains
                    domains.add(d)
    for k, v in result.items():
        print("{} : {}".format(k, len(v)))
        v_list = list(v)
        print(v_list[:10])
def decompose_filter(inputstring, psl=PublicSuffixList()):
    logging.debug(f'Parsing "{inputstring}"')
    try:
        match_list = []
        querystring = inputstring
        # Clean input
        querystring = re.sub(r'(?i)[^-a-z0-9.%_]', '', querystring).strip('. ').lower()
        logging.debug(f'Cleaned input to "{querystring}"')
        if '_' in querystring:
            logging.error(
                f'Single character wildcards are not handled yet. "{querystring}"')
        if querystring.count('%') == 0:
            ts_q1 = querystring
            ts_q2 = querystring
        else:
            # Check for usable strings at the start of the string
            leading_match = re.search(
                r'^(?P<q_lead>[-a-z0-9.]+)(?:[%_.]*[%_])', querystring)
            if leading_match:
                match_list.append(leading_match.group('q_lead') + ':*')
            # Check for usable strings in the middle of the string
            mid_match_list = re.findall(
                r'(?<=[%_]\.)(?P<q_mid>[-a-z0-9.]+)(?:[%_.]*[%_])', querystring)
            if mid_match_list:
                mid_match_list = [m + ':*' for m in mid_match_list]
                match_list.extend(mid_match_list)
            # Check for usable strings at the end of the string
            trailing_match = re.search(
                r'(?<=[%_]\.)(?P<q_trail>[-a-z0-9.]+[-a-z0-9])$', querystring)
            if trailing_match:
                if psl.is_private(trailing_match.group('q_trail')):
                    match_list.append(trailing_match.group('q_trail'))
            if match_list:
                match_list = list(set(match_list))
                match_list.sort(key=lambda x: len(x.lstrip('w').rstrip(':*')), reverse=True)
                ts_long_list = match_list[:2]
                ts_q1 = ts_long_list[0]
                ts_q2 = ts_long_list[-1]
            else:
                logging.error(
                    f'Could not extract usable querystring on "{inputstring}"')
                return
    except Exception as e:
        logging.error(f'Error on "{inputstring}", "{e}"')
        return

    return_dict = {
        'querystring': querystring,
        'ts_q1': ts_q1,
        'ts_q2': ts_q2,
    }
    return return_dict
def check_for_public_suffixes(filename):
    lines = files[filename]
    suffix_detected = False
    psl = None
    with open("public_suffix_list.dat", "r") as latest:
        psl = PublicSuffixList(latest)
    for i, line in enumerate(lines):
        current_line = line.strip()
        public_suffix = psl.publicsuffix(current_line)
        if public_suffix == current_line:
            print(
                f"The line number {i+1} contains just a public suffix: {current_line}"
            )
            suffix_detected = True
    if suffix_detected:
        print(
            "At least one valid public suffix found in {!r}, please "
            "remove it. See https://publicsuffix.org for details on why this "
            "shouldn't be blocklisted.".format(filename))
        sys.exit(1)
def lstm_getSingleFea(d: str):
    psl = PublicSuffixList()
    d = d[:d.rindex(psl.publicsuffix(d)) - 1].replace(".", "")
    vector = np.zeros(64)
    if len(d) == 0:
        return vector
    cuter = CutWords()
    # wordlist = cuter.max_forward_cut(d)
    # wordlist = cuter.max_backward_cut(d)
    wordlist = cuter.max_biward_cut(d)
    vi = 63
    for i in range(len(wordlist) - 1, -1, -1):
        vector[vi] = CutWords.order[wordlist[i]]
        vi = vi - 1
        if vi < 0:
            break
    # print(d)
    # print(vector)
    return vector
def check(parameters):
    if not os.path.exists(parameters.get('suffix_file', '')):
        return [["error", "File given as parameter 'suffix_file' does not exist."]]
    try:
        with codecs.open(parameters['suffix_file'], encoding='UTF-8') as database:
            PublicSuffixList(source=database, only_icann=True)
    except Exception as exc:
        return [["error", "Error reading database: %r." % exc]]
def is_email_blacklisted(address):
    """
    Determines if a supplied email address is present in the 'emailblacklist' table.

    Parameters:
        address: The email address to split out the domain from.

    Returns:
        Boolean True if present on the blacklist, or False otherwise.
    """
    _, domain = address.rsplit("@", 1)
    psl = PublicSuffixList()
    private_suffix = psl.privatesuffix(domain=domain)

    # Check the disposable email address list
    if private_suffix in DISPOSABLE_DOMAINS:
        return True

    # Check the explicitly defined/blacklisted domains.
    return d.engine.scalar(
        "SELECT EXISTS (SELECT FROM emailblacklist WHERE domain_name = %(domain)s)",
        domain=private_suffix,
    )
def init(self, config=None):
    if config is not None:
        self.config = load_config_obj(config)
    app = Flask('product_identifier')
    app.config.from_object(self.config)
    self.__flask = app
    self.__db = SQLAlchemy(self.__flask)
    Migrate(self.__flask, self.db)
    self.handler_pool = gevent.pool.Pool(self.config.MASTER_HANDLER_POOL_SIZE)
    with open(os.path.join(self.config.DATA_DIR, "ruleset.json"), "r") as f:
        rule_set = json.load(f)
    self.product_patterns = []
    for name, pattern in rule_set["rules"].iteritems():
        self.product_patterns.append(re.compile(pattern))
    self.__psl = PublicSuffixList()
def test_custom_psl(self):
    source = """
invalid
*.invalid
!test.invalid
"""
    psl = PublicSuffixList(source.splitlines())
    self.assertEqual(psl.suffix("example.invalid"), None)
    self.assertEqual(psl.suffix("test.invalid"), "test.invalid")
    self.assertEqual(psl.suffix("some.test.invalid"), "test.invalid")
    self.assertEqual(psl.suffix("aaa.bbb.ccc.invalid"), "bbb.ccc.invalid")

    self.assertEqual(psl.publicsuffix("example.invalid"), "example.invalid")
    self.assertEqual(psl.publicsuffix("test.invalid"), "invalid")
def createdataset(type="train", AGD_file="../data_sets/split_AGDs",
                  BD_file="../data_sets/split_benign_nx.json", datasetname="nx_train_data"):
    if type == "train":
        v_index = 0
    else:
        v_index = 1
    psl = PublicSuffixList()
    with open(AGD_file, "r") as f:
        AGD_dict = json.loads(f.read())
    with open(BD_file, "r") as f:
        bd_dict = json.loads(f.read())
    allAGDs = set()
    allBDs = set()
    for k, v in AGD_dict.items():
        for d in v[v_index]:
            pre_d = d[:d.rindex(psl.publicsuffix(d)) - 1]
            for l in pre_d.split("."):
                allAGDs.add(l)
    for d in bd_dict[type]:
        pre_d = d[:d.rindex(psl.publicsuffix(d)) - 1]
        for l in pre_d.split("."):
            allBDs.add(l)
    length = len(allAGDs)
    print(length)
    allBDs = list(allBDs)[:length]
    allAGDs = list(allAGDs)
    alldomains = allAGDs + allBDs
    alllabels = list(np.ones(length)) + list(np.zeros(length))
    allfeatures = extract_all_features(alldomains)
    np.save("../data_sets/{}_features.npy".format(datasetname), allfeatures)
    data = dict()
    data["domains"] = pd.Series(alldomains, dtype='str')
    data["labels"] = pd.Series(alllabels, dtype='int32')
    df = pd.DataFrame(data=data)
    df.to_csv("../data_sets/{}.csv".format(datasetname), index=False)
def static_1_2(root_dir="/home/public/2019-01-07-dgarchive_full"):
    psl = PublicSuffixList()
    result = dict()
    for filename in os.listdir(root_dir):
        df = pd.read_csv(os.path.join(root_dir, filename), header=None, error_bad_lines=False)
        domains = result.get(filename)
        if domains is None:
            domains = [set(), set()]
            result[filename] = domains
        for d in df.iloc[:, 0]:
            pub_d = psl.publicsuffix(d)
            if d != pub_d:
                d_split = d[:d.rindex(pub_d) - 1].split(".")
                if len(d_split) == 1:
                    result.get(filename)[0].add(d)
                elif len(d_split) == 2:
                    result.get(filename)[1].add(d)
                else:
                    print("Wow : {}".format(d))
        print("{} finish".format(filename))
    print("write")
    with open("../result_data/dga_data.json", "w") as f:
        f.write(json.dumps(result, cls=MyJsonEncoder))
def dga_static_num(file_path):
    psl = PublicSuffixList()
    result = [0, 0, 0]
    with open(file_path, "r") as f:
        map = json.loads(f.read())
        for k, v in map.items():
            for d in v[0]:
                d_strip = d[:d.rindex(psl.publicsuffix(d)) - 1].split(".")
                if len(d_strip) == 1:
                    result[0] += 1
                elif len(d_strip) == 2:
                    result[1] += 1
                else:
                    result[2] += 1
            for d in v[1]:
                d_strip = d[:d.rindex(psl.publicsuffix(d)) - 1].split(".")
                if len(d_strip) == 1:
                    result[0] += 1
                elif len(d_strip) == 2:
                    result[1] += 1
                else:
                    result[2] += 1
    print(result)
def getBenign(filepath):
    psl = PublicSuffixList()
    filter = Filter()
    domains = []
    # out = dict()
    # with open(filepath, "r") as f:
    #     for r in f:
    #         r_split = r.strip().split(":")
    #         if filter.inWhiteList(r_split[0]):
    #             pri = psl.privatesuffix(r_split[0])
    #             lll = out.get(pri)
    #             if lll is None:
    #                 lll = []
    #             lll.append(r_split[0])
    #             out[pri] = lll
    #             continue
    #         domains.append(r_split[0])
    #
    # num = 0
    # break_flag = False
    # for i in range(9):
    #     for k, v in out.items():
    #         if i >= len(v) or k in ["aliyunduncc.com", "360wzb.cn", "yundunwaf.com", "bugtags.com",
    #                                 "wscloudcdn.com", "ourdvsss.com", "aliyundunwaf.com", "aligfwaf.com"]:
    #             continue
    #         domains.append(v[i])
    #         num += 1
    #         if num >= 311:
    #             break_flag = True
    #             break
    #     if break_flag:
    #         break
    with open(filepath, "r") as f:
        for r in f:
            r_split = r.strip().split(":")
            domains.append(r_split[0])
    random.shuffle(domains)
    result = dict()
    result["train"] = domains[:23600]
    result["pred"] = domains[23600:29500]
    with open("../result_data/yd_nf_data.json", "w") as f:
        f.write(json.dumps(result))
    print(len(domains))
def check_hsts_preload(url: str) -> List[dict]:
    hsts_service = "https://hstspreload.com/api/v1/status/"
    results: List[dict] = []

    domain = utils.get_domain(url)
    if not checkers.is_ip_address(domain):
        while domain.count(".") > 0:
            # get the HSTS preload status for the domain
            res, _ = network.http_json(f"{hsts_service}{domain}")
            results.append(res)

            # walk up one label; stop once only the public suffix remains
            domain = domain.split(".", 1)[-1]
            if PublicSuffixList().is_public(domain):
                break
    return results
class DomainSuffixExpertBot(Bot):
    suffixes = {}

    def init(self):
        self.field = self.parameters.field
        if self.field not in ALLOWED_FIELDS:
            raise InvalidArgument('key', got=self.field, expected=ALLOWED_FIELDS)

        with codecs.open(self.parameters.suffix_file, encoding='UTF-8') as file_handle:
            self.psl = PublicSuffixList(source=file_handle, only_icann=True)

    def process(self):
        event = self.receive_message()

        for space in ('source', 'destination'):
            key = '.'.join((space, self.field))
            if key not in event:
                continue
            event['.'.join((space, 'domain_suffix'))] = self.psl.publicsuffix(domain=event[key])

        self.send_message(event)
        self.acknowledge_message()
def init(self, config=None):
    if config is not None:
        self.config = load_config_obj(config)
    app = Flask('product_identifier')
    app.config.from_object(self.config)
    self.__flask = app
    self.__db = SQLAlchemy(self.__flask)
    Migrate(self.__flask, self.db)
    self.handler_pool = gevent.pool.Pool(self.config.MASTER_HANDLER_POOL_SIZE)
    with open(os.path.join(self.config.DATA_DIR, "ruleset.json"), "r") as f:
        rule_set = json.load(f)
    self.product_patterns = []
    for name, pattern in rule_set["rules"].iteritems():
        self.product_patterns.append(re.compile(pattern))
    self.__psl = PublicSuffixList()
class Master(BaseApplication):
    __flask = None

    @property
    def flask(self):
        if not self.__flask:
            raise ApplicationInitError("Cannot obtain server instance before init")
        return self.__flask

    @property
    def db(self):
        return self.__db

    def init(self, config=None):
        if config is not None:
            self.config = load_config_obj(config)
        app = Flask('product_identifier')
        app.config.from_object(self.config)
        self.__flask = app
        self.__db = SQLAlchemy(self.__flask)
        Migrate(self.__flask, self.db)
        self.handler_pool = gevent.pool.Pool(self.config.MASTER_HANDLER_POOL_SIZE)
        with open(os.path.join(self.config.DATA_DIR, "ruleset.json"), "r") as f:
            rule_set = json.load(f)
        self.product_patterns = []
        for name, pattern in rule_set["rules"].iteritems():
            self.product_patterns.append(re.compile(pattern))
        self.__psl = PublicSuffixList()

    def start(self):
        def handleURL():
            from product_identifier.models import URL
            db = Master.instance().db
            self.flask.logger.debug("JOB STARTED")
            while True:
                try:
                    # TODO: susceptible to concurrency problems
                    in_url = self.scripts.pop_zset(keys=[URLS_TO_PROCESS_SET])
                    if in_url:
                        if not self.redis.sismember(PROCESSED_URLS_SET, in_url):
                            self.flask.logger.debug("PROCESSING: {}".format(in_url))
                            uri = furl(in_url)
                            domain = self.__psl.suffix(uri.host)
                            try:
                                p = URL()
                                p.domain = domain
                                p.url = in_url
                                p.is_product = self.is_product_url(in_url)
                                db.session.add(p)
                                db.session.commit()
                            except:
                                db.session.rollback()
                                error = traceback.format_exc()
                                self.flask.logger.error("DB_ERROR: {}".format(error))
                                self.redis.sadd(DB_ERRORED_URL_SET, in_url)
                            self.redis.sadd(PROCESSED_URLS_SET, in_url)
                            self.redis.rpush(domain, in_url)
                            domain_added = self.redis.sadd(DOMAINS_SET, domain)
                            if domain_added:
                                self.flask.logger.info("ADDED DOMAIN: {}".format(domain))
                        else:
                            self.flask.logger.debug("SKIPPING: {}".format(in_url))
                    else:
                        # no results, sleep
                        gevent.sleep(1)
                except:
                    error = traceback.format_exc()
                    self.flask.logger.error("ERROR: {}".format(error))

        for i in range(self.config.MASTER_HANDLER_POOL_SIZE):
            self.handler_pool.spawn(handleURL)

    def is_product_url(self, url):
        print url
        return any([pat.match(url) for pat in self.product_patterns])
def setUp(self):
    self.psl = PublicSuffixList()
class TestPSL(unittest.TestCase):

    def setUp(self):
        self.psl = PublicSuffixList()

    def test_typesafe(self):
        self.assertEqual(self.psl.suffix("www.example.co.jp").__class__, "example.co.jp".__class__)
        self.assertEqual(self.psl.suffix(u("www.example.co.jp")).__class__, u("example.co.jp").__class__)

        self.assertEqual(self.psl.publicsuffix("www.example.co.jp").__class__, "co.jp".__class__)
        self.assertEqual(self.psl.publicsuffix(u("www.example.co.jp")).__class__, u("co.jp").__class__)

    def test_uppercase(self):
        self.assertEqual(self.psl.suffix("wWw.eXaMpLe.cO.Jp"), "example.co.jp")
        self.assertEqual(self.psl.publicsuffix("wWw.eXaMpLe.cO.Jp"), "co.jp")

    def test_invaliddomain(self):
        self.assertEqual(self.psl.suffix("www..invalid"), None)
        self.assertEqual(self.psl.suffix(".example.com"), None)
        self.assertEqual(self.psl.suffix("example.com."), None)
        self.assertEqual(self.psl.suffix(""), None)

        self.assertEqual(self.psl.publicsuffix("www..invalid"), None)
        self.assertEqual(self.psl.publicsuffix(".example.com"), None)
        self.assertEqual(self.psl.publicsuffix("example.com."), None)
        self.assertEqual(self.psl.publicsuffix(""), None)

    def test_idn(self):
        tld = u("香港")
        self.assertEqual(self.psl.suffix(u("www.example.") + tld), u("example.") + tld)
        self.assertEqual(self.psl.publicsuffix(u("www.example.") + tld), tld)

    def test_punycoded(self):
        tld = encode_idn(u("香港"))
        self.assertEqual(self.psl.suffix(u("www.example.") + tld), u("example.") + tld)
        self.assertEqual(self.psl.publicsuffix(u("www.example.") + tld), tld)

    def test_suffix_deny_public(self):
        self.assertEqual(self.psl.suffix("com"), None)
        self.assertEqual(self.psl.suffix("co.jp"), None)
        self.assertEqual(self.psl.suffix("example.nagoya.jp"), None)

    def test_unknown(self):
        self.assertEqual(self.psl.suffix("www.example.unknowntld"), "example.unknowntld")
        self.assertEqual(self.psl.suffix("unknowntld"), None)

        self.assertEqual(self.psl.publicsuffix("www.example.unknowntld"), "unknowntld")
        self.assertEqual(self.psl.publicsuffix("unknowntld"), "unknowntld")

    def test_deny_unknown(self):
        source = """
known
"""
        psl = PublicSuffixList(source.splitlines(), accept_unknown=False)
        self.assertEqual(psl.suffix("www.example.unknowntld"), None)

    def test_custom_psl(self):
        source = """
invalid
*.invalid
!test.invalid
"""
        psl = PublicSuffixList(source.splitlines())
        self.assertEqual(psl.suffix("example.invalid"), None)
        self.assertEqual(psl.suffix("test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("some.test.invalid"), "test.invalid")
        self.assertEqual(psl.suffix("aaa.bbb.ccc.invalid"), "bbb.ccc.invalid")

        self.assertEqual(psl.publicsuffix("example.invalid"), "example.invalid")
        self.assertEqual(psl.publicsuffix("test.invalid"), "invalid")

    def test_publicsuffix(self):
        self.assertEqual(self.psl.publicsuffix("www.example.com"), "com")
        self.assertEqual(self.psl.publicsuffix("unknowntld"), "unknowntld")

    def test_wildcard(self):
        self.assertEqual(self.psl.suffix("test.example.nagoya.jp"), "test.example.nagoya.jp")
        self.assertEqual(self.psl.suffix("example.nagoya.jp"), None)
        self.assertEqual(self.psl.publicsuffix("example.nagoya.jp"), "example.nagoya.jp")
        self.assertEqual(self.psl.publicsuffix("test.example.nagoya.jp"), "example.nagoya.jp")

    def test_checkpublicsuffix_script(self):
        regex = re.compile(r"^checkPublicSuffix\(('[^']+'), (null|'[^']+')\);")
        with open(os.path.join(os.path.dirname(__file__), "test_psl.txt"), "rb") as f:
            ln = 0
            for line in f:
                ln += 1
                l = line.decode("utf-8")
                m = regex.match(l)
                if not m:
                    continue
                arg = m.group(1).strip("'")
                res = None if m.group(2) == "null" else m.group(2).strip("'")
                self.assertEqual(self.psl.suffix(arg), res, "in line {0}: {1}".format(ln, line.strip()))

    def test_typeerror(self):
        self.assertRaises(TypeError, lambda: self.psl.suffix(None))
        self.assertRaises(TypeError, lambda: self.psl.suffix(1))
        if b("") != "":
            # python3
            self.assertRaises(TypeError, lambda: self.psl.suffix(b("www.example.com")))

    def test_compatclass(self):
        from publicsuffixlist.compat import PublicSuffixList
        psl = PublicSuffixList()
        self.assertEqual(psl.get_public_suffix("test.example.com"), "example.com")
        self.assertEqual(psl.get_public_suffix("com"), "")
        self.assertEqual(psl.get_public_suffix(""), "")

    def test_unsafecompatclass(self):
        from publicsuffixlist.compat import UnsafePublicSuffixList
        psl = UnsafePublicSuffixList()
        self.assertEqual(psl.get_public_suffix("test.example.com"), "example.com")
        self.assertEqual(psl.get_public_suffix("com"), "com")
        self.assertEqual(psl.get_public_suffix(""), "")

    def test_toomanylabels(self):
        d = "a." * 1000000 + "example.com"
        self.assertEqual(self.psl.publicsuffix(d), "com")
        self.assertEqual(self.psl.privatesuffix(d), "example.com")

    def test_flatstring(self):
        psl = PublicSuffixList(u("com\nnet\n"))
        self.assertEqual(psl.publicsuffix("example.com"), "com")

    def test_flatbytestring(self):
        psl = PublicSuffixList(b("com\nnet\n"))
        self.assertEqual(psl.publicsuffix("example.com"), "com")
def test_flatbytestring(self):
    psl = PublicSuffixList(b("com\nnet\n"))
    self.assertEqual(psl.publicsuffix("example.com"), "com")
def init(self):
    self.field = self.parameters.field
    if self.field not in ALLOWED_FIELDS:
        raise InvalidArgument('key', got=self.field, expected=ALLOWED_FIELDS)

    with codecs.open(self.parameters.suffix_file, encoding='UTF-8') as file_handle:
        self.psl = PublicSuffixList(source=file_handle, only_icann=True)
def subresource_integrity(reqs: dict, expectation='sri-implemented-and-external-scripts-loaded-securely') -> dict:
    """
    :param reqs: dictionary containing all the request and response objects
    :param expectation: test expectation
        sri-implemented-and-all-scripts-loaded-securely: all same origin, and uses SRI
        sri-implemented-and-external-scripts-loaded-securely: integrity attribute exists on all external scripts,
          and scripts loaded [default for HTML]
        sri-implemented-but-external-scripts-not-loaded-securely: SRI implemented, but with scripts loaded over HTTP
        sri-not-implemented-but-external-scripts-loaded-securely: SRI isn't implemented,
          but all scripts are loaded over HTTPS
        sri-not-implemented-and-external-scripts-not-loaded-securely: SRI isn't implemented,
          and scripts are downloaded over HTTP
        sri-not-implemented-but-all-scripts-loaded-from-secure-origin: SRI isn't implemented,
          but all scripts come from secure origins (self)
        sri-not-implemented-but-no-scripts-loaded: SRI isn't implemented, because the page doesn't load any scripts
        sri-not-implemented-response-not-html: SRI isn't needed, because the page isn't HTML [default for non-HTML]
        request-did-not-return-status-code-200: Only look for SRI on pages that returned 200, not things like 404s
        html-not-parsable: Can't parse the page's content
    :return: dictionary with:
        data: all external scripts and their integrity / crossorigin attributes
        expectation: test expectation
        pass: whether the site's external scripts met expectations
        result: short string describing the result of the test
    """
    output = {
        'data': {},
        'expectation': expectation,
        'pass': False,
        'result': None,
    }
    response = reqs['responses']['auto']

    # The order of how "good" the results are
    goodness = ['sri-implemented-and-all-scripts-loaded-securely',
                'sri-implemented-and-external-scripts-loaded-securely',
                'sri-implemented-but-external-scripts-not-loaded-securely',
                'sri-not-implemented-but-external-scripts-loaded-securely',
                'sri-not-implemented-and-external-scripts-not-loaded-securely',
                'sri-not-implemented-response-not-html']

    # If the response to get / fails
    if response.status_code != 200:
        output['result'] = 'request-did-not-return-status-code-200'

    # If the content isn't HTML, there's no scripts to load; this is okay
    elif response.headers.get('Content-Type', '').split(';')[0] not in ('text/html', 'application/xhtml+xml'):
        output['result'] = 'sri-not-implemented-response-not-html'

    else:
        # Try to parse the HTML
        try:
            soup = bs(reqs['resources']['/'], 'html.parser')
        except:
            output['result'] = 'html-not-parsable'
            return output

        # Track to see if any scripts were on foreign TLDs
        scripts_on_foreign_origin = False

        # Get all the scripts
        scripts = soup.find_all('script')
        for script in scripts:
            if script.has_attr('src'):
                # Script tag parameters
                src = urlparse(script['src'])
                integrity = script.get('integrity')
                crossorigin = script.get('crossorigin')

                # Check to see if they're on the same second-level domain
                # TODO: update the PSL list on startup
                psl = PublicSuffixList()
                samesld = True if (psl.privatesuffix(urlparse(response.url).netloc) ==
                                   psl.privatesuffix(src.netloc)) else False

                # Check to see if it's the same origin or second-level domain
                if src.netloc == '' or samesld:
                    secureorigin = True
                elif src.netloc != '' and '.' not in src.netloc:  # like localhost
                    secureorigin = False
                    scripts_on_foreign_origin = True
                else:
                    secureorigin = False
                    scripts_on_foreign_origin = True

                # See if it's a secure scheme
                if src.scheme == 'https' or (src.scheme == '' and urlparse(response.url).scheme == 'https'):
                    securescheme = True
                else:
                    securescheme = False

                # Add it to the scripts data result, if it's not a relative URI
                if not secureorigin:
                    output['data'][script['src']] = {
                        'crossorigin': crossorigin,
                        'integrity': integrity
                    }

                    if integrity and not securescheme:
                        output['result'] = only_if_worse('sri-implemented-but-external-scripts-not-loaded-securely',
                                                         output['result'],
                                                         goodness)
                    elif not integrity and securescheme:
                        output['result'] = only_if_worse('sri-not-implemented-but-external-scripts-loaded-securely',
                                                         output['result'],
                                                         goodness)
                    elif not integrity and not securescheme:
                        output['result'] = only_if_worse('sri-not-implemented-and-external-scripts'
                                                         '-not-loaded-securely',
                                                         output['result'],
                                                         goodness)

                # Grant bonus even if they use SRI on the same origin
                else:
                    if integrity and securescheme and not output['result']:
                        output['result'] = 'sri-implemented-and-all-scripts-loaded-securely'

        # If the page doesn't load any scripts
        if not scripts:
            output['result'] = 'sri-not-implemented-but-no-scripts-loaded'

        # If all the scripts are loaded from a secure origin, not triggering a need for SRI
        elif scripts and not scripts_on_foreign_origin and not output['result']:
            output['result'] = 'sri-not-implemented-but-all-scripts-loaded-from-secure-origin'

        # If the page loaded from a foreign origin, but everything included SRI
        elif scripts and scripts_on_foreign_origin and not output['result']:
            output['result'] = only_if_worse('sri-implemented-and-external-scripts-loaded-securely',
                                             output['result'],
                                             goodness)

    # Code defensively on the size of the data
    output['data'] = output['data'] if len(str(output['data'])) < 32768 else {}

    # Check to see if the test passed or failed
    if output['result'] in ('sri-implemented-and-all-scripts-loaded-securely',
                            'sri-implemented-and-external-scripts-loaded-securely',
                            'sri-not-implemented-response-not-html',
                            'sri-not-implemented-but-all-scripts-loaded-from-secure-origin',
                            'sri-not-implemented-but-no-scripts-loaded',
                            expectation):
        output['pass'] = True

    return output