def addToTable(text, order):
    global table, size
    tokens = word_tokenize(text)
    # Build the index table
    for i in range(len(tokens) - order):
        sub = d.detokenize(tokens[i:i + order])
        if gib_detect_train.avg_transition_prob(sub, model_mat) < threshold:
            continue
        if sub not in table:
            table[sub] = {}
            table[sub]["SIZE"] = 0
            size += 1
    # Count the following strings for each string
    for j in range(len(tokens) - order - order):
        index = d.detokenize(tokens[j:j + order])
        if gib_detect_train.avg_transition_prob(index, model_mat) < threshold:
            continue
        k = j + order
        following = d.detokenize(tokens[k:k + order])
        if gib_detect_train.avg_transition_prob(following, model_mat) < threshold:
            continue
        if following not in table[index] and len(following) > 0:
            table[index][following] = 1
            table[index]["SIZE"] += 1
        elif len(following) > 0:
            table[index][following] += 1
            table[index]["SIZE"] += 1
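# addToTable() above relies on globals it never defines. A minimal,
# hypothetical setup sketch inferred from how the function uses them (the
# tokenizer choice and model file name are assumptions, not confirmed by the
# original source):
import pickle

from nltk.tokenize import word_tokenize  # may need nltk.download('punkt') first
from nltk.tokenize.treebank import TreebankWordDetokenizer

import gib_detect_train

d = TreebankWordDetokenizer()        # used as d.detokenize(...) above
model_data = pickle.load(open('gib_model.pki', 'rb'))
model_mat = model_data['mat']        # character-transition matrix
threshold = model_data['thresh']     # gibberish decision threshold
table = {}                           # order-gram string -> follower counts
size = 0                             # number of distinct index strings

addToTable("the quick brown fox jumps over the lazy dog", order=2)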
def check_word_random(self, word):
    if gib_detect_train.avg_transition_prob(word, self.model_mat) < self.threshold:
        return 1
    return 0
def count_gib(labels):
    # Count gibberish labels based on a trained model
    count = 0
    for label in labels:
        if gib_detect_train.avg_transition_prob(label, model_mat) <= threshold:
            count += 1
    return count
def markov_score(temp):
    global model_data
    model_mat = model_data['mat']
    # threshold = model_data['thresh']
    score = gib_detect_train.avg_transition_prob(temp, model_mat)
    return score
def parse(self, words):
    keywords_in_words = []
    brands_in_words = []
    similar_to_brands = []
    similar_to_keywords = []
    dga_in_words = []
    len_gt_7 = []
    len_lt_7 = []
    result = {}  # initialized up front so the final return cannot raise UnboundLocalError
    try:
        for word in words:
            word = re.sub(r"\d+", "", word)
            if word in self.keywords:
                keywords_in_words.append(word)
            elif word in self.allbrand:
                brands_in_words.append(word)
            elif self.__is_similar_to_any_element(word, self.allbrand) != 0:
                target = self.__is_similar_to_any_element(word, self.allbrand)
                similar_to_brands.append(target)
            elif self.__is_similar_to_any_element(word, self.keywords) != 0:
                target = self.__is_similar_to_any_element(word, self.keywords)
                similar_to_keywords.append(target)
            elif len(word) > 3 and not word.isnumeric():
                if gib_detect_train.avg_transition_prob(word, self.model_mat) <= self.threshold:
                    dga_in_words.append(word)
                    # TODO: entries that merely resemble keywords should be filtered out
                elif len(word) < 7:
                    len_lt_7.append(word)
                else:
                    len_gt_7.append(word)
        result = {
            'keywords_in_words': keywords_in_words,
            'brands_in_words': brands_in_words,
            'dga_in_words': dga_in_words,
            'len_lt_7': len_lt_7,
            'len_gt_7': len_gt_7,
            'similar_to_brands': similar_to_brands,
            'similar_to_keywords': similar_to_keywords
        }
    except Exception:
        self.logger.debug("Error while processing " + str(words))
        self.logger.error("Error : {0}".format(format_exc()))
    return result
def check_gib(temp):
    global model_data
    model_mat = model_data['mat']
    threshold = model_data['thresh']  # loaded but unused here
    score = gib_detect_train.avg_transition_prob(temp, model_mat)
    return score
def test():
    csv = pd.read_csv('Data.csv')
    domain = csv['domain'].values
    res = []
    model_data = pickle.load(open('gib_model.pki', 'rb'))
    model_mat = model_data['mat']
    for i in range(len(domain)):
        dom = domain[i].split('.')[0]
        print(domain[i])
        print(gib_detect_train.avg_transition_prob(dom, model_mat))
def detect(text):
    model_data = pickle.load(open('helpers/GibberishDetector/gib_model.pki', 'rb'))
    model_mat = model_data['mat']
    threshold = model_data['thresh']
    return gib_detect_train.avg_transition_prob(text, model_mat) > threshold
def check_word(wordList):
    '''Return the count of gibberish words in the list of words.'''
    model_data = pickle.load(open('gib_model.pki', 'rb'))
    model_mat = model_data['mat']
    threshold = model_data['thresh']
    count = 0
    for word in wordList:
        if gib_detect_train.avg_transition_prob(word, model_mat) <= threshold:
            count += 1
    return count
def apply_name_filters(pdf):
    # name length
    pdf['name_len'] = pdf['name'].map(len)
    # name gibberish score
    name_model_data = pickle.load(open('../private/gib_model.pki', 'rb'))
    name_model_mat = name_model_data['mat']
    gib_score = lambda x: gib_detect_train.avg_transition_prob(x, name_model_mat)
    pdf['name_gibberish_score'] = pdf['name'].map(gib_score)
    # name vowel ratio
    pdf['name_vowel_ratio'] = pdf['name'].apply(Preprocessor.vowel_ratio)
    return pdf
def processDecades():
    prevPrevData = []
    prevData = []
    repeats = set()
    for decade in range(1900, 2020, 10):
        print("processing", str(decade))
        directory = os.fsencode("decades_modified/" + str(decade))
        for file in os.listdir(directory):
            filename = os.fsdecode(file)
            readf = codecs.open("decades_modified/" + str(decade) + "/" + filename,
                                'r', encoding="cp1252", errors="replace")
            filedata = readf.read()
            readf.close()
            processed = processText(filedata)
            lines = processed.split("\n")
            reprocessed = ""
            for s in lines:
                if (len(s) < 4
                        or gib_detect_train.avg_transition_prob(s, model_mat) < threshold
                        or "......." in s):
                    pass
                # elif s in prevData or s in prevPrevData:
                #     repeats.add(s)
                else:
                    reprocessed += s + "\n"
            output = reprocessed
            # Alternative: filter by sentence instead of by line
            # sents = nltk.sent_tokenize(reprocessed)
            # output = ""
            # for s in sents:
            #     if len(s) < 4 or gib_detect_train.avg_transition_prob(s, model_mat) < threshold:
            #         pass
            #     elif s in prevData or s in prevPrevData:
            #         repeats.add(s)
            #     else:
            #         output += s + " "
            writef = codecs.open("decades_modified/" + str(decade) + "/" + filename,
                                 'w+', encoding="utf-8", errors='ignore')
            writef.write(output)
            writef.close()
def random_callback_name(flow):
    """ This is the main function that does the work """
    name = None
    if flow.dns_info:
        name = str(flow.dns_info.domain_name)
    elif flow.ssl_info:
        name = str(flow.ssl_info.server_name)
    elif flow.http_info:
        name = str(flow.http_info.host_name)
    if name:  # guard: name stays None when the flow carries no usable info
        name = name[:-4]
        if name.startswith("www."):
            name = name[4:]
    if name:
        value = gib_detect_train.avg_transition_prob(name, model_mat) > threshold
        if not value:
            print("WARNING:%s:%s Unknown malware detected"
                  % (flow.l7_protocol_name, name))
    vowel_ratio = count_vowels(main_domain) / f_len
    digit_ratio = count_digits(main_domain) / f_len
    repeat_letter = count_repeat_letter(main_domain) / f_len
    consec_digit = consecutive_digits(main_domain) / f_len
    consec_consonant = consecutive_consonant(main_domain) / f_len

    # probability of staying in the markov transition matrix (trained by Alexa)
    hmm_prob_ = hmm_prob(hmm_main_domain)
    if hmm_prob_ < math.e**hmm_prob_threshold:
        # probability is too low to be non-DGA
        hmm_log_prob = -999.
    else:
        hmm_log_prob = math.log(hmm_prob_)

    # advanced linguistic feature: pronounceable domain
    gib_value = int(
        gib_detect_train.avg_transition_prob(main_domain.strip('$'), model_mat)
        > threshold)
    try:
        fw.write(
            '%s\t%s\t%.3f\t%.1f\t%.3f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\n'
            % (domain, tld, entropy, f_len, entropy / f_len, vowel_ratio,
               digit_ratio, repeat_letter, consec_digit, consec_consonant,
               gib_value, hmm_log_prob, ave(unigram_rank), ave(bigram_rank),
               ave(trigram_rank), std(unigram_rank), std(bigram_rank),
               std(trigram_rank), has_private_tld))
    except UnicodeEncodeError:
        continue
# after the enclosing loop:
fw.close()
fi.close()
import datetime
import pickle

import numpy as np
import pandas as pd

import gib_detect_train

df = pd.read_csv("orig-data.csv")
model_data = pickle.load(open('../private/gib_model.pki', 'rb'))
model_mat = model_data['mat']

gib_score = lambda x: gib_detect_train.avg_transition_prob(x, model_mat)

df['name_len'] = df['display_name'].map(len)
df['name_gibberish_score'] = df['display_name'].map(gib_score)

def vowel_ratio(x):
    count = 0
    for c in x.lower():
        if c in ('a', 'e', 'i', 'o', 'u'):
            count += 1
    return float(count) / float(len(x))

df['name_vowel_ratio'] = df['display_name'].apply(vowel_ratio)

def get_day_of_week(x):
    date_str = x[:10]
    datetime_obj = datetime.datetime.strptime(date_str, '%Y-%m-%d')
    return datetime_obj.weekday()
def HMM(domain):
    model_data = pickle.load(open('gib_model.pki', 'rb'))
    model_mat = model_data['mat']
    label = domain.split('.')[0]  # was named `str`, which shadows the builtin
    hmm = gib_detect_train.avg_transition_prob(label, model_mat)
    return hmm
def score(self, input_str):
    # avg_transition_prob() takes only the string and the matrix; the
    # threshold was wrongly passed as a third argument.
    score = gib_detect_train.avg_transition_prob(input_str, self.model_mat)
    isgib = score > self.threshold
    return score, isgib
# 2. N-Gram Feature
unigram_rank = np.array(
    [gram_rank_dict[i] if i in gram_rank_dict else 0 for i in host[1:-1]])
bigram_rank = np.array([
    gram_rank_dict[''.join(i)] if ''.join(i) in gram_rank_dict else 0
    for i in bigrams(host)
])  # extract the bigram
trigram_rank = np.array([
    gram_rank_dict[''.join(i)] if ''.join(i) in gram_rank_dict else 0
    for i in trigrams(host)
])  # extract the trigram

# 3. Gib_value Feature
gib_value = int(
    gib_detect_train.avg_transition_prob(host, model_mat) > threshold)
try:
    fw.write(
        '%s\t%s\t%s\t%.3f\t%.1f\t%.3f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n'
        % (
            url, is_malicious, cctld_num, entropy1, f_len1, entropy1 / f_len1,
            # vowel_ratio, digit_ratio, repeat_letter, consec_digit, consec_consonant,
            ave(unigram_rank), ave(bigram_rank), ave(trigram_rank),
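# The bigrams()/trigrams() helpers called above are not shown in these
# snippets. A plausible sketch of what they do (an assumption -- the originals
# may instead be nltk.util.bigrams/trigrams, which yield character tuples):
def bigrams(s):
    # all overlapping 2-character windows of s
    return (s[i:i + 2] for i in range(len(s) - 1))

def trigrams(s):
    # all overlapping 3-character windows of s
    return (s[i:i + 3] for i in range(len(s) - 2))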
#!/usr/bin/python

import os
import pickle
import sys

import gib_detect_train

gib_model_file = os.path.join(os.path.dirname(__file__), 'gib_model.pki')
if not os.path.isfile(gib_model_file):
    gib_detect_train.train()
model_data = pickle.load(open(gib_model_file, 'rb'))
model_mat = model_data['mat']
threshold = model_data['thresh']

def is_gibberish(string):
    # avg_transition_prob() takes only the string and the matrix
    return gib_detect_train.avg_transition_prob(string, model_mat) <= threshold

if __name__ == "__main__":
    if len(sys.argv) > 1:
        sys.exit(not is_gibberish(sys.argv[1]))
    print('threshold: ' + str(threshold))
    while True:
        line = input()  # raw_input() in the original; input() on Python 3
        prob = gib_detect_train.avg_transition_prob(line, model_mat)
        print('prob: ' + str(prob))
        print('gibberish? ' + str(is_gibberish(line)))
__FILENAME__ = gib_detect
#!/usr/bin/python

import pickle

import gib_detect_train

model_data = pickle.load(open('gib_model.pki', 'rb'))
model_mat = model_data['mat']
threshold = model_data['thresh']

while True:
    l = raw_input()  # input() on Python 3
    print(gib_detect_train.avg_transition_prob(l, model_mat) > threshold)

########NEW FILE########
__FILENAME__ = gib_detect_train
#!/usr/bin/python

import math
import pickle

accepted_chars = 'abcdefghijklmnopqrstuvwxyz '

pos = dict([(char, idx) for idx, char in enumerate(accepted_chars)])

def normalize(line):
    """ Return only the subset of chars from accepted_chars.
    This helps keep the model relatively small by ignoring punctuation,
    infrequent symbols, etc. """
    return [c.lower() for c in line if c.lower() in accepted_chars]
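# The gib_detect_train file above is cut off after normalize(). In the
# upstream Gibberish-Detector project, the scoring function that every snippet
# in this collection calls looks roughly like the sketch below (an
# approximation of the original, not a verbatim copy):
def ngram(n, l):
    """ Yield all n-grams from l after normalizing. """
    filtered = normalize(l)
    for start in range(0, len(filtered) - n + 1):
        yield ''.join(filtered[start:start + n])

def avg_transition_prob(l, log_prob_mat):
    """ Return the average transition probability of l through log_prob_mat. """
    log_prob = 0.0
    transition_ct = 0
    for a, b in ngram(2, l):
        log_prob += log_prob_mat[pos[a]][pos[b]]
        transition_ct += 1
    # The exponentiation translates from log probs back to probs.
    return math.exp(log_prob / (transition_ct or 1))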
def is_word_gibberish(word):
    """Return the result from the training."""
    return gib_detect_train.avg_transition_prob(word, model_mat) <= threshold
def get_candidate_host_domains():
    days = ["20171104"]
    hours = ['%02d' % hour for hour in range(24)]
    mins = ["00", "05", "10", "15", "20", "25", "30", "35", "40", "45", "50", "55"]
    files = []
    for day in days:
        for hour in hours:
            for minute in mins:
                files.append("/mnt/Work/DNS/data/nxdomains_per_5_mins/"
                             + day + "/" + day + str(hour) + minute + ".txt")
    for file in files:
        print(file)
        day = file.split('/')[-2]
        if not os.path.exists("/mnt/Work/DNS1/results/" + day):
            os.system("mkdir -p /mnt/Work/DNS1/results/" + day)
        file_out = "/mnt/Work/DNS1/results/" + day + "/" + file.split('/')[-1]
        # print(file_out)
        results = defaultdict()
        sip_domains = defaultdict(lambda: set())
        with open(file) as f:
            for row in f:
                items = row.strip().split(',')
                sip = items[0]
                domain = clean_string(items[3])
                if have_other_characters(domain):
                    continue
                if F.in_tld_whitelist(domain):
                    continue
                # if F.in_CDN(domain):
                #     continue
                if F.in_disposable(domain):
                    continue
                if F.in_DHCP(domain):
                    continue
                if not A.in_alexa_top(domain):
                    sip_domains[sip].add(domain)
        count = 0
        for sip, domains in sorted(sip_domains.items(),
                                   key=lambda x: len(x[1]), reverse=True):
            # main_domains = [tldextract.extract(domain_1).domain for domain_1 in domains]
            # main_domains = ['.'.join(tldextract.extract(domain_1)[:2]).replace('.', '') for domain_1 in domains]
            # value 0 = gibberish second-level domain longer than 9 chars
            domain_value = {
                l: int(gib_detect_train.avg_transition_prob(
                           tldextract.extract(l).domain, model_mat) > threshold)
                   if len(tldextract.extract(l).domain) > 9 else 1
                for l in domains
            }
            if len([k_v[0] for k_v in domain_value.items() if k_v[1] == 0]) > 2:
                # if Counter([int(gib_detect_train.avg_transition_prob(l, model_mat) > threshold) if len(l) > 9 else 1 for l in main_domains])[0] > 2:
                # print("{}: {}".format(sip, domains))
                results[sip] = [k_v[0] for k_v in domain_value.items() if k_v[1] == 0]
                count += 1
        print(count)
        for key in results:
            results[key] = sorted(results[key], key=lambda x: len(x))
        with open(file_out, "w") as f_out:
            json.dump(results, f_out, sort_keys=True, indent=4)
list_topics = list_topics.split(",")
degree = []
pure_degree = []
for i in range(len(list_topics)):
    if "h" in list_topics[i]:
        res = re.sub(r"\d+", '', list_topics[i])
        pattern = re.compile(r"[^\w ]")
        vare = pattern.sub('', res)
        j = vare.replace("h", '', 1)
        if len(j) > 2:
            model_mat = model_data['mat']
            threshold = model_data['thresh']
            boz = gib_detect_train.avg_transition_prob(j, model_mat) > threshold
            if boz:
                j = " ".join(j.split())
                degree.append(j)

d = dog("bolt://localhost:7687", "neo4j", "mathers22")
degree = list(dict.fromkeys(degree))
print(degree)
subject_er = 'Electromagnetics'
subject_e = wikipedia.search(subject_er)[0]
print(subject_e)
# d.add_person(subject_e, degree)
while "" in degree:
    degree.remove("")
for s in lines:
    # take the part of the line before the first '.'
    j = 0
    for j in range(0, len(s)):
        if s[j] == '.':
            break
    temp = s[0:j]
    model_mat = model_data['mat']
    threshold = model_data['thresh']
    score = gib_detect_train.avg_transition_prob(temp, model_mat)
    outF.write(str(score))
    outF.write("\n")
    count = count + 1
    print(count)
    if count == 1000:
        break
outF.close()
# linguistic features: % of vowels, % of digits, % of repeated letters,
# % of consecutive digits and % of consecutive non-'aeiou' consonants
vowel_ratio = count_vowels(main_domain) / f_len
digit_ratio = count_digits(main_domain) / f_len
repeat_letter = count_repeat_letter(main_domain) / f_len
consec_digit = consecutive_digits(main_domain) / f_len
consec_consonant = consecutive_consonant(main_domain) / f_len

# probability of staying in the markov transition matrix (trained by Alexa)
hmm_prob_ = hmm_prob(hmm_main_domain)
if hmm_prob_ < math.e**hmm_prob_threshold:  # probability is too low to be non-DGA
    hmm_log_prob = -999.
else:
    hmm_log_prob = math.log(hmm_prob_)

# advanced linguistic feature: pronounceable domain
gib_value = int(gib_detect_train.avg_transition_prob(main_domain.strip('$'), model_mat) > threshold)

try:
    fw.write('%s\t%s\t%s\t%.3f\t%.1f\t%.3f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%d\n'
             % (domain, cla, tld, entropy, f_len, entropy / f_len, vowel_ratio,
                digit_ratio, repeat_letter, consec_digit, consec_consonant,
                gib_value, hmm_log_prob,
                ave(unigram_rank), ave(bigram_rank), ave(trigram_rank),
                std(unigram_rank), std(bigram_rank), std(trigram_rank),
                has_private_tld))
except UnicodeEncodeError:
    continue
# fw.write('%s\t%s\t%s\t%.3f\t%.1f\t%.3f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\t%.2f\n'
#          % (domain, cla, tld, entropy, f_len, entropy/f_len,
#             unigram_rank.mean(), bigram_rank.mean(), trigram_rank.mean(),
#             unigram_rank.std(), bigram_rank.std(), trigram_rank.std()))
# fw.write('%s\t%s\t%s\t%.3f\t%.1f\t%.3f\t%s\t%s\n'
#          % (domain, cla, tld, entropy, f_len, entropy/f_len,
#             ','.join(bigram), ','.join(trigram)))
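# The feature helpers used in the blocks above (count_vowels, count_digits,
# count_repeat_letter, consecutive_digits, consecutive_consonant, ave, std)
# are defined elsewhere in their repositories. Minimal sketches under the
# obvious reading of their names (assumptions, not the original code):
from collections import Counter

def count_vowels(s):
    return sum(c in 'aeiou' for c in s.lower())

def count_digits(s):
    return sum(c.isdigit() for c in s)

def count_repeat_letter(s):
    # total occurrences of letters that appear more than once
    return sum(n for c, n in Counter(s.lower()).items() if c.isalpha() and n > 1)

def consecutive_digits(s):
    # adjacent digit pairs
    return sum(a.isdigit() and b.isdigit() for a, b in zip(s, s[1:]))

def consecutive_consonant(s):
    # adjacent non-vowel letter pairs
    cons = set('bcdfghjklmnpqrstvwxyz')
    t = s.lower()
    return sum(a in cons and b in cons for a, b in zip(t, t[1:]))

def ave(arr):
    return arr.mean() if len(arr) else 0.0

def std(arr):
    return arr.std() if len(arr) else 0.0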
#!/usr/bin/python

import pickle

import gib_detect_train

model_data = pickle.load(open('gib_model.pki', 'rb'))
# while True:
#     l = raw_input()
model_mat = model_data['mat']
threshold = model_data['thresh']
print(threshold)
print(gib_detect_train.avg_transition_prob("000000000027", model_mat))
def getRating(self, string):
    return gib_detect_train.avg_transition_prob(string, self.model_mat) > self.threshold
def is_gibberish(string):
    # avg_transition_prob() takes only the string and the matrix
    return gib_detect_train.avg_transition_prob(string, model_mat) <= threshold
def getScore(self, string):
    return round(gib_detect_train.avg_transition_prob(string, self.model_mat), 2)
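# The methods shown at several points above (check_word_random, score,
# getRating, getScore) all read self.model_mat and self.threshold, but their
# classes' initialization is never shown. A hypothetical sketch of the setup
# they assume (the class name and parameter are invented for illustration):
import pickle

import gib_detect_train

class GibberishScorer:
    def __init__(self, model_path='gib_model.pki'):
        model_data = pickle.load(open(model_path, 'rb'))
        self.model_mat = model_data['mat']      # transition matrix
        self.threshold = model_data['thresh']   # decision threshold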
        if str(line)[i].isdigit():
            if str(line)[i + 1].isdigit():
                consdig += 1
        if str(line)[i].isalpha() and str(line)[i] not in 'aeiouAEIOU':
            if str(line)[i + 1].isalpha() and str(line)[i + 1] not in 'aeiouAEIOU':
                conscon += 1

    consonantratio.append(conscon / float(leng))
    consletterratio.append(consletter / float(leng))
    consdigitratio.append(consdig / float(leng))
    model_mat = model_data['mat']
    threshold = model_data['thresh']
    if gib_detect_train.avg_transition_prob(str(line)[2:leng + 2], model_mat) > threshold:
        pronounciation.append('1')
    else:
        pronounciation.append('0')

csvfile2 = open('result_train_' + str(para) + '.csv', 'w', newline='')  # file(..., 'wb') in the original (Python 2)
if str(para) == 'black':
    x = 0
else:
    x = 1
writer = csv.writer(csvfile2)
# writer.writerow(['domain', 'length', 'entropy', 'Pronunciation', 'Vowel ratio',
#                  'Digit ratio', 'Repeat letter', 'Consecutive digit ratio',
#                  'Consecutive consonant ratio', 'N-gram score w', 'N-gram score d', 'Q'])
i = 0
while i < len(name):
    writer.writerow([name[i], length[i], entropy[i], pronounciation[i],
                     vowelratio[i], digitratio[i], consletterratio[i],
                     consdigitratio[i], consonantratio[i],
                     float(ngramwhite[i] / 10000), float(ngramdict[i] / 10000), x])
    i = i + 1
def get_feature(domain):
    arr = domain.split('\n')
    domain = arr[0]
    lens = len(domain)
    separator = 0.0
    bt_alpha = 0.0
    max_alpha = 0.0
    digit = 0.0
    bt_digit = 0.0
    max_digit = 0.0
    special = 0.0
    trans = 0.0
    bt_separator = 0.0
    bt = 0.0
    flag = 0
    upper = 0.0
    hasip = 0.0
    for i in range(lens):
        try:
            x = domain[i]
            bt = bt + 1
            if bt_alpha > max_alpha:
                max_alpha = bt_alpha
            if bt_digit > max_digit:
                max_digit = bt_digit
            if x == '-':
                bt_alpha = 0.0
                bt_digit = 0.0
                separator = separator + 1
                if bt - 1 > bt_separator and flag == 1:
                    bt_separator = bt - 1
                bt = 0.0
                flag = 1
            elif x.isalpha():
                bt_alpha = bt_alpha + 1
                bt_digit = 0
            elif x.isdigit():
                digit = digit + 1
                bt_digit = bt_digit + 1
                bt_alpha = 0.0
                j = i + 1
                # bounds check: j < lens (the original's j <= lens indexed one
                # past the end of the string)
                while j < lens and (domain[j].isdigit() or domain[j] == '.'):
                    j = j + 1
                if checkip(domain[i:j]):
                    hasip = 1.0
            elif not x == '.':
                bt_alpha = 0.0
                bt_digit = 0.0
                special = special + 1
            else:
                bt_alpha = 0.0
                bt_digit = 0.0
            if x.isupper():
                upper = upper + 1
            if i >= 1 and not x == '.':
                j = i - 1
                while domain[j] == '.':
                    j = j - 1
                if (x.isalpha() and domain[j].isdigit()) or (x.isdigit() and domain[j].isalpha()):
                    trans = trans + 1
        except Exception:
            print('URLError:' + domain)
    f_len = float(len(domain))
    count = Counter(i for i in domain).most_common()
    entropy = -sum(j / f_len * math.log(j / f_len) for i, j in count)
    model_data = pickle.load(open('gib_model.pki', 'rb'))
    model_mat = model_data['mat']
    threshold = model_data['thresh']
    gib_value = int(gib_detect_train.avg_transition_prob(domain, model_mat) > threshold)
    if not lens == 0:
        rates = float(digit) / float(lens)
        trans_rates = float(trans) / float(lens)
    else:
        rates = 0.0
        trans_rates = 0.0
    return (float(lens), hasip, entropy, separator, special, digit, rates,
            trans_rates, upper, bt_separator, max_digit, max_alpha, float(gib_value))
#!/usr/bin/python

import pickle

import gib_detect_train

model_data = pickle.load(open('gib_model.pki', 'rb'))
model_mat = model_data['mat']
threshold = model_data['thresh']
print("threshold:", threshold)
while True:
    l = raw_input()  # input() on Python 3
    result = gib_detect_train.avg_transition_prob(l, model_mat)
    print(result)
    print(result > threshold)
def main(self):
    # initialize the lists
    self.src_ip_list = []
    self.resultList_dga = []
    self._list_ = []
    self.resultList = []

    # query the data and extract the source IPs
    src_ip_aggs = self.es.search(index=self.index, body=self.body_aggs,
                                 request_timeout=3600)
    src_ip_list_buckets = src_ip_aggs['aggregations']['group_by_src_ip']['buckets']
    for _ip in src_ip_list_buckets:
        self.src_ip_list.append(_ip.get('key_as_string'))
    print("srcip_num = ", len(self.src_ip_list))

    # load the n-gram rank data
    n_gram_file = open('n_gram_rank_freq.txt', 'r')
    gram_rank_dict = dict()
    for i in n_gram_file:
        cat, gram, freq, rank = i.strip().split(',')
        gram_rank_dict[gram] = int(rank)
    n_gram_file.close()

    cc = 0
    # extract the DNS query events for each source IP
    for src_ip in self.src_ip_list:
        print(src_ip)
        DNS_query = self.es_search(src_ip)['aggregations']['group_by_domain_name']['buckets']
        init_DNS_query = []
        domain_doc_count = []
        tup_len_format_list = []
        src_domain_list = []
        for _dns in DNS_query:
            init_DNS_query.append(_dns.get('key'))
            domain_doc_count.append(_dns.get('doc_count'))
        init_DNS_query_count = len(init_DNS_query)
        cc = cc + init_DNS_query_count
        for _key in range(init_DNS_query_count):
            domain_name = init_DNS_query[_key].lower()
            # compute the n-gram ranks
            bigram = [''.join(i) for i in self.bigrams(domain_name)]
            trigram = [''.join(i) for i in self.trigrams(domain_name)]
            # extract the bigram ranks
            bigram_rank = np.array([gram_rank_dict[''.join(i)] if ''.join(i) in gram_rank_dict else 0
                                    for i in self.bigrams(domain_name)])
            # extract the trigram ranks
            trigram_rank = np.array([gram_rank_dict[''.join(i)] if ''.join(i) in gram_rank_dict else 0
                                     for i in self.trigrams(domain_name)])
            # Shannon entropy (randomness)
            f_len = float(len(domain_name))
            shannon_count = Counter(i for i in domain_name).most_common()
            entropy = -sum(j / f_len * (math.log(j / f_len)) for i, j in shannon_count)
            # pronounceability via the gibberish model
            model_mat = self.model_data['mat']
            threshold = self.model_data["thresh"]
            git_bool = gib_detect_train.avg_transition_prob(domain_name, model_mat) > threshold
            # judge whether the domain looks trustworthy
            if entropy > 2.2 and git_bool == False and (self.ave(bigram_rank) > 250 or self.ave(trigram_rank) > 3000):
                # keep domains queried more than 100 times
                if domain_doc_count[_key] > 100:
                    # domain length
                    _len = len(domain_name)
                    # domain format
                    _format = self.format(domain_name)
                    # (length, format) tuple
                    _tup = (_len, _format)
                    tup_len_format_list.append(_tup)
                    tup_src_domain = (src_ip, domain_name)
                    src_domain_list.append(tup_src_domain)
        test_dga = list(set(src_domain_list))
        len_tup_len_format_list = len(tup_len_format_list)
        # collect domains whose length and format are identical
        for x in tup_len_format_list:
            c = 0
            initList_dga = []
            y = 0
            while y < len_tup_len_format_list:
                if x == tup_len_format_list[y]:
                    c = c + 1
                    initList_dga.append(src_domain_list[y])
                    if c == 6:
                        self.resultList_dga = self.resultList_dga + initList_dga
                        break
                y = y + 1
    # deduplicate the list
    self.resultList_dga = list(set(self.resultList_dga))
    # Update (Aug 3): drop domains composed of pinyin or pinyin initials
    for r_line in open(self.path + 'sougou_db.txt', 'r'):
        r_line = r_line.rstrip("\n")
        self._list_.append(r_line)
    for _string in self.resultList_dga:
        self.pinyin_or_word(_string)
    print('all=', cc)
    # print the results
    if len(self.resultList):
        print("#######################")
        print('DGA detected!')
        print("#######################")
        print(self.resultList)
        print("domain len is:", len(self.resultList))
        print("#######################")
    else:
        print("no DGA detected")
    # Update (Aug 2): WHOIS lookup
    non_whois_count = 0
    whois_count = 0
    error_count = 0
    for _init_whois in self.resultList:
        result = ""
        _whois_domain = _init_whois[1]
        try:
            result = whois.whois(_whois_domain)["domain_name"]
        except whois.parser.PywhoisError:
            print("PywhoisError")
            error_count = error_count + 1
        except KeyError:
            print("KeyError")
            error_count = error_count + 1
        except socket.timeout:
            print("Timeout")
            error_count = error_count + 1
        except ConnectionResetError:
            print("ConnectionResetError")
            error_count = error_count + 1
        except socket.gaierror:
            print("socket.gaierror")
            error_count = error_count + 1
        if result == None or result == "":
            non_whois_count = non_whois_count + 1
            # self.resultList.append(_init_whois)
        else:
            whois_count = whois_count + 1
    print("non-exist whois count is:", non_whois_count)
    print("error count is:", error_count)
    print("exist whois count is:", whois_count)
    print("exist whois percent is:", whois_count / len(self.resultList))