def score_domain(domain):
    """Score `domain` for how likely it is to be a phishing site.

    The higher the score, the more probable it is that `domain` is a
    phishing site.

    Args:
        domain (str): the domain to check.

    Returns:
        int: the score of `domain`.
    """
    score = 0

    # Every suspicious TLD suffix the raw domain ends with adds 20 points.
    for t in suspicious['tlds']:
        if domain.endswith(t):
            score += 20

    # Remove the initial '*.' found on wildcard certificates.
    if domain.startswith('*.'):
        domain = domain[2:]

    # Remove the TLD to catch an inner TLD in the subdomain
    # (ie. paypal.com.domain.com).
    try:
        res = get_tld(domain, as_object=True, fail_silently=True,
                      fix_protocol=True)
        # With fail_silently=True an unparsable domain is expected to yield
        # None instead of raising; keep the domain untouched in that case.
        if res is not None:
            domain = '.'.join([res.subdomain, res.domain])
    except Exception:
        # Best effort: if the TLD cannot be extracted, score the domain as-is.
        pass

    # Higher entropy is kind of suspicious.
    score += int(round(entropy(domain)*10))

    # Remove lookalike characters using the list from
    # http://www.unicode.org/reports/tr39
    domain = unconfuse(domain)

    # Raw string: "\W" in a plain string literal is an invalid escape sequence.
    words_in_domain = re.split(r"\W+", domain)

    # ie. detect fake .com (ie. *.com-account-management.info)
    if words_in_domain[0] in ['com', 'net', 'org']:
        score += 10

    # Testing keywords: every keyword contained in the domain adds its weight.
    for word in suspicious['keywords']:
        if word in domain:
            score += suspicious['keywords'][word]

    # Testing Levenshtein distance for strong keywords (>= 70 points)
    # (ie. paypol). Both lists are loop-invariant, so build them once.
    strong_keywords = [k for (k, s) in suspicious['keywords'].items()
                       if s >= 70]
    # Removing too generic keywords (ie. mail.domain.com).
    candidate_words = [w for w in words_in_domain
                       if w not in ['email', 'mail', 'cloud']]
    for key in strong_keywords:
        for word in candidate_words:
            if distance(str(word), str(key)) == 1:
                score += 70

    # Lots of '-' (ie. www.paypal-datacenter.com-acccount-alert.com).
    # Punycode names ('xn--') are excluded from this rule.
    if 'xn--' not in domain and domain.count('-') >= 4:
        score += domain.count('-') * 3

    # Deeply nested subdomains (ie. www.paypal.com.security.accountupdate.gq)
    if domain.count('.') >= 3:
        score += domain.count('.') * 3

    return score
def score_domain(config, domain, args):
    """Compute a suspiciousness score for ``domain``.

    The score is the sum of several independent heuristics: matching TLD
    suffixes, Shannon entropy, a leading ``com``/``net``/``org`` label,
    configured keywords, near-miss spellings of heavily weighted keywords,
    many hyphens and many dots.

    Args:
        config: mapping with a ``"tlds"`` iterable of suffixes and a
            ``"keywords"`` mapping of keyword -> weight.
        domain: the domain name to score.
        args: passed through to ``failed_message`` when TLD parsing raises.

    Returns:
        The accumulated score.
    """
    total = 0

    # +20 for every configured TLD suffix the domain ends with.
    for suffix in config["tlds"]:
        if domain.endswith(suffix):
            total += 20

    # Drop the outer TLD so the remaining heuristics see subdomain + domain.
    try:
        parsed = get_tld(domain, as_object=True, fail_silently=True,
                         fix_protocol=True)
        if parsed is not None:
            domain = '.'.join([parsed.subdomain, parsed.domain])
    except Exception as err:
        failed_message(args, err, domain)

    total += int(round(entropy.shannon_entropy(domain)*50))

    domain = unconfuse(domain)
    words_in_domain = re.split(r"\W+", domain)

    # A first label of com/net/org hints at a fake embedded TLD.
    first_label = words_in_domain[0]
    if first_label in ["com", "net", "org"]:
        total += 10

    # Every configured keyword found in the domain contributes its weight.
    for keyword in config["keywords"]:
        if keyword in domain:
            total += config["keywords"][keyword]

    # Keywords weighted >= 70 are also matched at edit distance 1, ignoring
    # a few generic words.
    heavy_keywords = [k for (k, s) in config["keywords"].items() if s >= 70]
    specific_words = [w for w in words_in_domain
                      if w not in ["email", "mail", "cloud"]]
    for key in heavy_keywords:
        for word in specific_words:
            if distance(str(word), str(key)) == 1:
                total += 70

    # Many hyphens (punycode names excluded).
    hyphen_count = domain.count("-")
    if "xn--" not in domain and hyphen_count >= 4:
        total += hyphen_count * 3

    # Deeply nested subdomains.
    dot_count = domain.count(".")
    if dot_count >= 3:
        total += dot_count * 3

    return total
def callback(message, context):
    """Callback handler for certstream events.

    Only ``certificate_update`` messages are inspected; heartbeats and any
    other message type are ignored. Every domain on the certificate is
    checked against ``keywords_ignore`` and ``keywords_alert`` (both in its
    lowercased and its unconfused form). Each alert keyword that matches is
    printed and appended as one CSV-style line to ``log_suspicious``.

    Args:
        message (dict): the certstream event; must contain
            ``'message_type'``.
        context: unused by this handler.

    Returns:
        None
    """
    # Heartbeats and unknown event types carry nothing to inspect.
    if message['message_type'] != "certificate_update":
        return

    leaf_cert = message['data']['leaf_cert']

    for domain in leaf_cert['all_domains']:
        pbar.update(1)
        current_domain = domain.lower()
        uc_current_domain = unconfuse(current_domain)

        # An ignore keyword stops processing of the whole certificate,
        # including the domains that have not been looked at yet.
        if any(ignore in current_domain or ignore in uc_current_domain
               for ignore in keywords_ignore):
            break

        for alert in keywords_alert:
            if alert not in current_domain and alert not in uc_current_domain:
                continue

            # The validity start must come from 'not_before'; it was
            # previously read from 'not_after', so both columns were equal.
            not_before = datetime.utcfromtimestamp(
                int(leaf_cert['not_before'])).strftime('%Y-%m-%d %H:%M:%S')
            not_after = datetime.utcfromtimestamp(
                int(leaf_cert['not_after'])).strftime('%Y-%m-%d %H:%M:%S')
            serial_number = leaf_cert['serial_number']
            fingerprint = leaf_cert['fingerprint']
            ca = message['data']['chain'][0]['subject']['aggregated']

            tqdm.tqdm.write("[+] Match found {}".format(
                colored(current_domain, 'red',
                        attrs=['underline', 'bold'])))
            tqdm.tqdm.write(" Matched on {}".format(alert))
            tqdm.tqdm.write(
                " [Details] Not before {}, Not after {}, Serial {}"
                .format(not_before, not_after, serial_number))
            tqdm.tqdm.write(
                " [Details] Fingerprint {}, CA {}".format(fingerprint, ca))

            with open(log_suspicious, 'a') as f:
                now = time.strftime("%Y-%m-%d %H:%M:%S")
                f.write("{},{},{},{},{},{},{},{}\n".format(
                    now, alert, current_domain, not_before, not_after,
                    serial_number, fingerprint, ca))
def score_domain(suspicious, domain, args):
    """Return a phishing-likelihood score for ``domain``.

    Points are accumulated from: suspicious TLD suffixes, Shannon entropy of
    the name without its outer TLD, a leading ``com``/``net``/``org`` label,
    configured keywords, edit-distance-1 matches against keywords weighted
    70 or more, a large number of hyphens and a large number of dots.

    Args:
        suspicious: mapping providing ``"tlds"`` (iterable of suffixes) and
            ``"keywords"`` (mapping of keyword -> weight).
        domain: the domain name to evaluate.
        args: forwarded to ``failed_message`` if TLD extraction raises.

    Returns:
        The accumulated score.
    """
    points = 0

    # Suspicious TLDs: 20 points per matching suffix.
    for tld_suffix in suspicious["tlds"]:
        if domain.endswith(tld_suffix):
            points += 20

    # Strip the outer TLD; on failure report it and keep the name unchanged.
    try:
        tld_info = get_tld(
            domain, as_object=True, fail_silently=True, fix_protocol=True
        )
        if tld_info is not None:
            domain = '.'.join([tld_info.subdomain, tld_info.domain])
    except Exception as err:
        failed_message(args, err, domain)

    points += int(round(entropy.shannon_entropy(domain) * 50))

    domain = unconfuse(domain)
    words_in_domain = re.split(r"\W+", domain)

    # Fake embedded TLD as the very first label.
    if words_in_domain[0] in ["com", "net", "org"]:
        points += 10

    # Plain keyword hits.
    for term in suspicious["keywords"]:
        if term in domain:
            points += suspicious["keywords"][term]

    # Near-miss spellings of the strongest keywords; generic words skipped.
    strong_terms = [k for (k, s) in suspicious["keywords"].items() if s >= 70]
    non_generic = [
        w for w in words_in_domain if w not in ["email", "mail", "cloud"]
    ]
    for strong in strong_terms:
        for label in non_generic:
            if distance(str(label), str(strong)) == 1:
                points += 70

    # Hyphen-heavy names, unless the name is punycode.
    n_hyphens = domain.count("-")
    if "xn--" not in domain and n_hyphens >= 4:
        points += n_hyphens * 3

    # Deep subdomain nesting.
    n_dots = domain.count(".")
    if n_dots >= 3:
        points += n_dots * 3

    return points
def score_domain(provided_ioc):
    """Return the score of the provided domain.

    The higher the score, the more suspicious the domain. The score sums
    several heuristics: suspicious TLD suffixes, Shannon entropy, a leading
    ``com``/``net``/``org`` label, suspicious keywords, edit-distance-1
    matches against keywords weighted 70 or more, many hyphens and many dots.

    Args:
        provided_ioc (str): the domain to score; may carry a leading ``*.``
            as found on wildcard certificates.

    Returns:
        The accumulated score.
    """
    score = 0

    # Every suspicious TLD suffix the raw value ends with adds 20 points.
    for suspicious_tld in suspicious["tlds"]:
        if provided_ioc.endswith(suspicious_tld):
            score += 20

    # Strip the wildcard prefix *before* any further analysis. Doing it after
    # the word split left an empty first word and let "*." skew the entropy.
    domain = provided_ioc[2:] if provided_ioc.startswith("*.") else provided_ioc

    # Remove the outer TLD so an inner TLD in the subdomain can be detected.
    try:
        res = tld.get_tld(domain, as_object=True, fail_silently=True,
                          fix_protocol=True)
        # With fail_silently=True an unparsable value is expected to yield
        # None instead of raising; keep the domain untouched in that case.
        if res is not None:
            domain = ".".join([res.subdomain, res.domain])
    except Exception:
        # Best effort: if the TLD cannot be extracted, score the domain as-is.
        pass

    score += int(round(entropy.shannon_entropy(domain) * 50))

    domain = confusables.unconfuse(domain)

    # Raw string: "\W" in a plain string literal is an invalid escape sequence.
    words_in_domain = re.split(r"\W+", domain)

    # A first label of com/net/org hints at a fake embedded TLD.
    if words_in_domain[0] in ["com", "net", "org"]:
        score += 10

    # Every suspicious keyword contained in the domain adds its weight.
    for word in suspicious["keywords"]:
        if word in domain:
            score += suspicious["keywords"][word]

    # Edit-distance-1 matches against strong keywords; both lists are
    # loop-invariant, so build them once.
    strong_keywords = [k for k, v in suspicious["keywords"].items() if v >= 70]
    candidate_words = [
        w for w in words_in_domain if w not in ["email", "mail", "cloud"]
    ]
    for key in strong_keywords:
        for word in candidate_words:
            if pylev.levenshtein(str(word), str(key)) == 1:
                score += 70

    # Many hyphens (punycode names excluded).
    if "xn--" not in domain and domain.count("-") >= 4:
        score += domain.count("-") * 3

    # Deeply nested subdomains.
    if domain.count(".") >= 3:
        score += domain.count(".") * 3

    return score
def enrich(self): # each of the internal methods that gets more data. there is a set sequence on these # self.__get_dns_data() # pprint.pprint((self.dns_records)) # self.__get_ip2cidr() # pprint.pprint((self.asn_records)) # self.__get_whois() this is too slow. we need to find a reason to run who_is # pprint.pprint(self.whois_records) # not working ??? # let's see which score high enough to look into score = 0 for t in TLDS: if self.domain.endswith(t): score += 20 # Removing TLD to catch inner TLD in subdomain (ie. passportoffice.gov.uk-otherdomain.com) try: res = get_tld(self.domain, as_object=True, fail_silently=True, fix_protocol=True) domain_without_outer_tld = '.'.join([res.subdomain, res.domain]) except Exception: domain_without_outer_tld = "" pass self.__get_entropy() # TODO this is a lazy calculation. Should we should base it on actual entropy? score += int(round(self.shannon_entropy * 10)) # Remove lookalike characters using list from http://www.unicode.org/reports/tr39 unconfused_domain_without_tld = unconfuse(domain_without_outer_tld) #print(f"domain:{self.domain},innertld:{domain_without_outer_tld},unconfuse:{unconfused_domain_without_tld}") # at this point we've stripped off the outer TLD and unconfused the domain. words_in_domain = re.split("\W+", unconfused_domain_without_tld) # look for fake, hidden .coms ie. govuk.com-account-management.info if words_in_domain[0] in ['com', 'net', 'org']: score += 10 # score the word based on triggerwords.yaml for word in SCORES: if word in unconfused_domain_without_tld: score += SCORES[word] #================ # TODO calculate score based on lev # do we even need this check, since we're not using the output below.... 
self.__get_levenshtein() # TODO- figure out # compare the domain against the strong match words # aka hmrc, taxrefund, dvla for key in [k for (k, s) in SCORES.items() if s >= 70]: # this isn't going to be effecticve for overly generic words like 'email' for word in [ w for w in words_in_domain if w not in ['email', 'mail', 'cloud'] ]: #LOG.info(f"word:{word}") #LOG.info(f"key:{key}") if Levenshtein.distance(str(word), str(key)) == 1: score += 70 # Lots of hyphens '-' if 'xn--' not in self.domain and self.domain.count('-') >= 4: score += self.domain.count('-') * 3 # Deeply nested subdomains (ie. www.paypal.com.security.accountupdate.gq) if self.domain.count('.') >= 3: score += self.domain.count('.') * 3 self.score = score #print( # f"domain:{self.domain},score:{score},matched on:{self.match},entropy:{self.shannon_entropy},levenshtein ratio:{self.levenshtein_ratio},Levenshtein distance:{self.levenshtein_distance}") # we need to decide how to filter this list. # many won't be malicious """ Lev when we want to analyse these... > 0.8 : Red? > 0.4 : Amber? <0.4 : green? """ """ shannon >4: red 3.5-4: amber """ # malicious TLD - add 20 # # Removing TLD to catch inner TLD in subdomain (ie. paypal.com.domain.com) """