# Requires the project-internal NsLog logging wrapper (import not shown in the source).
from traceback import format_exc


class json2arff:
    def __init__(self):
        self.logger = NsLog("log")

    def convert_for_train(self, features, param):
        # ARFF header
        try:
            ArffStr = '@relation weka-test\n\n'
            features_keys_url = list(features[0]['url_features'].keys())
            features_keys_active = []
            if param == '-a':
                features_keys_active = list(features[0]['active_features'].keys())

            for line in features_keys_url:
                ArffStr = ArffStr + '@attribute ' + line + " numeric\n"
            if param == '-a':
                for line in features_keys_active:
                    ArffStr = ArffStr + '@attribute ' + line + " numeric\n"
            ArffStr = ArffStr + '@attribute class {phish, legitimate}' + "\n\n@data\n"
        except Exception:
            self.logger.debug("Error - number of samples passed to json2arff: " + str(len(features)) +
                              "\nurl_feature_keys: " + str(features_keys_url) +
                              "\nactive_features_key: " + str(features_keys_active))
            self.logger.error("Error Arff Header : {0}".format(format_exc()))
        # end of header

        # data rows: one comma-separated line per sample, class label last
        for each_domain in features:
            try:
                tmp = ""
                for key in features_keys_url:
                    tmp = tmp + str(each_domain['url_features'][key]) + ","
                if param == '-a':
                    for key_a in features_keys_active:
                        tmp = tmp + str(each_domain['active_features'][key_a]) + ","
                tmp = tmp + each_domain['info']['class'] + "\n"
                ArffStr = ArffStr + tmp
            except Exception:
                self.logger.debug("Error while converting sample to ARFF:\n" + str(each_domain))
                self.logger.error("Error Arff Body : {0}".format(format_exc()))

        return ArffStr

    def convert_for_test(self, features, param):
        # TODO: update according to the active rules
        # ARFF header; test data carries no class attribute
        ArffStr = '@relation weka-test\n\n'
        features_keys_url = features[0]['url_features'].keys()
        if param == '-dns':
            features_keys_dns = features[0]['dns_features'].keys()

        for line in features_keys_url:
            ArffStr = ArffStr + '@attribute ' + line + " numeric\n"
        if param == '-dns':
            for key in features_keys_dns:
                ArffStr = ArffStr + '@attribute ' + key + " numeric\n"
        ArffStr = ArffStr + "\n@data\n"
        # end of header

        for each_domain in features:
            tmp = ""
            for key in features_keys_url:
                tmp = tmp + str(each_domain['url_features'][key]) + ","
            if param == '-dns':
                for key_dns in features_keys_dns:
                    tmp = tmp + str(each_domain['dns_features'][key_dns]) + ","
            tmp = tmp[:-1] + "\n"  # drop the trailing comma
            ArffStr = ArffStr + tmp

        return ArffStr

    def convert_for_NLP_without_features(self, features):
        # ARFF with a single string attribute holding the NLP word list
        try:
            ArffStr = '@relation weka-test\n\n'
            ArffStr += '@attribute words string\n'
            ArffStr += '@attribute class {phish, legitimate}' + "\n\n@data\n"

            for sample in features:
                ArffStr += "'"
                for word in sample['info']['nlp_info']['words_nlp']:
                    ArffStr += word + " "
                ArffStr = ArffStr.strip() + "',{0}\n".format(sample['info']['class'])
        except Exception:
            self.logger.error("Error Arff Header : {0}".format(format_exc()))

        return ArffStr

    def convert_for_NLP_with_features(self, features):
        # ARFF with the NLP word list plus the numeric URL features
        try:
            features_keys_url = list(features[0]['url_features'].keys())

            ArffStr = '@relation weka-test\n\n'
            ArffStr += '@attribute words string\n'
            for line in features_keys_url:
                ArffStr = ArffStr + '@attribute ' + line + " numeric\n"
            ArffStr += '@attribute class {phish, legitimate}' + "\n\n@data\n"

            for sample in features:
                ArffStr += '"'
                for word in sample['info']['nlp_info']['words_nlp']:
                    ArffStr += word + " "
                ArffStr = ArffStr.strip() + '",'
                for key in features_keys_url:
                    ArffStr += str(sample['url_features'][key]) + ","
                ArffStr = ArffStr.strip() + '{0}\n'.format(sample['info']['class'])
        except Exception:
            self.logger.error("Error Arff Header : {0}".format(format_exc()))

        return ArffStr
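# --------------------------------------------------------------------------
# Usage sketch (illustrative only). It shows the dictionary shape that
# convert_for_train() expects: one dict per sample with 'url_features',
# optional 'active_features' (used when param == '-a'), and 'info'['class'].
# The feature names and values below are made up for the example; running it
# assumes the project-internal NsLog logger used by json2arff is importable.
if __name__ == "__main__":
    sample_features = [
        {
            'url_features': {'domain_length': 12, 'subdomain_digit_count': 0},
            'active_features': {'google_safe_browsing': 0},
            'info': {'class': 'legitimate'},
        },
        {
            'url_features': {'domain_length': 34, 'subdomain_digit_count': 3},
            'active_features': {'google_safe_browsing': 1},
            'info': {'class': 'phish'},
        },
    ]

    converter = json2arff()
    # '-a' appends the active_features columns after the url_features columns
    print(converter.convert_for_train(sample_features, '-a'))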
# Requires the project-internal NsLog, nlp_class and WordSplitterClass helpers
# (imports not shown in the source).
import pprint
from traceback import format_exc


class url_rules:
    def __init__(self):
        print("initializing")
        self.logger = NsLog("log")
        self.path_data = "data/"
        self.name_brand_file = "allbrand.txt"
        self.path_alexa_files = "../data/alexa-tld/"
        self.nlp_manager = nlp_class()
        self.pp = pprint.PrettyPrinter(indent=4)
        self.word_splitter = WordSplitterClass()

        allbrand_txt = open(self.path_data + self.name_brand_file, "r")
        self.allbrand = self.__txt_to_list(allbrand_txt)

        # Alexa tries (disabled)
        # self.trie_alexa_tld = pygtrie.CharTrie(json.loads(open("constant/alexa_tld_json.txt", "r").read()))
        # self.trie_alexa_tldsiz = pygtrie.CharTrie(json.loads(open("constant/alexa_tldsiz_dct.json", "r").read()))

    def __txt_to_list(self, txt_object):
        lines = []
        for line in txt_object:
            lines.append(line.strip())
        txt_object.close()
        return lines

    def rules_main(self, domain, tld, subdomain, path, words_raw):
        features = {}
        info_nlp = {}
        try:
            features.update(self.digit_count(domain, subdomain, path))    # digit counts
            features.update(self.length(domain, subdomain, path))         # lengths
            features.update(self.tld_check(tld))                          # known-TLD check
            features.update(self.check_rule_5(words_raw))                 # www / com in words
            features.update(self.punny_code(domain))                      # punycode
            features.update(self.random_domain(domain))                   # random-looking domain
            features.update(self.subdomain_count(subdomain))              # subdomain count
            features.update(self.char_repeat(words_raw))                  # repeated characters
            # features.update(self.alexa_check(domain, tld))              # Alexa 1M check
            # features.update(self.alexa_trie(domain, tld))               # Alexa 1M check (trie)
            features.update(self.special_chars(domain, subdomain, path))  # - . / @ ? & = _
            features.update(self.check_domain_in_list(domain))            # brand-list membership

            result_nlp = self.nlp_features(words_raw)
            features.update(result_nlp['features'])                       # word-based features
            info_nlp = result_nlp['info']
        except Exception:
            self.logger.error("url_rules.main() Error : {0}".format(format_exc()))

        return info_nlp, features

    def digit_count(self, domain, subdomain, path):
        result = {'domain_digit_count': 0, 'subdomain_digit_count': 0, 'path_digit_count': 0}
        for letter in domain:
            if letter.isdigit():
                result['domain_digit_count'] += 1
        for letter in subdomain:
            if letter.isdigit():
                result['subdomain_digit_count'] += 1
        for letter in path:
            if letter.isdigit():
                result['path_digit_count'] += 1
        return result

    def length(self, domain, subdomain, path):
        result = {}
        result['domain_length'] = len(domain)
        result['subdomain_length'] = len(subdomain)
        result['path_length'] = len(path)
        return result

    def tld_check(self, tld):
        common_tld = ["com", "org", "net", "de", "edu", "gov"]
        result = {}
        if tld in common_tld:
            result["isKnownTld"] = 1
        else:
            result["isKnownTld"] = 0
        return result

    def check_rule_5(self, words_raw):
        result = {'www': 0, "com": 0}
        for word in words_raw:
            if 'www' in word:
                result['www'] += 1
            if 'com' in word:
                result['com'] += 1
        return result

    def punny_code(self, line):
        result = {}
        if line.startswith("xn--"):
            result['punnyCode'] = 1
        else:
            result['punnyCode'] = 0
        return result

    def random_domain(self, domain):
        return {'random_domain': self.nlp_manager.check_word_random(domain)}

    def subdomain_count(self, line):
        sub = line.split(".")
        result = {}
        result['subDomainCount'] = len(sub)
        return result

    def __all_same(self, items):
        return all(x == items[0] for x in items)

    def char_repeat(self, words_raw):
        """Sliding window: for each window size, take that many consecutive
        characters and check whether they are all the same; if so, increment
        the counter for that window size."""
        result = {'char_repeat': 0}
        repeat = {'2': 0, '3': 0, '4': 0, '5': 0}
        part = [2, 3, 4, 5]

        for word in words_raw:
            for char_repeat_count in part:
                for i in range(len(word) - char_repeat_count + 1):
                    sub_word = word[i:i + char_repeat_count]
                    if self.__all_same(sub_word):
                        repeat[str(char_repeat_count)] += 1

        result['char_repeat'] = sum(repeat.values())
        return result

    def alexa_check(self, domain, tld):
        is_find_tld = 0
        is_find = 0
        line = domain + "." + tld
        letter = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
                  "n", "o", "p", "r", "s", "t", "u", "v", "y", "z", "w", "x", "q",
                  "1", "2", "3", "4", "5", "6", "7", "8", "9", "0"]
        try:
            if line[0] in letter:
                alexa_txt = open("{0}{1}.txt".format(self.path_alexa_files, line[0]), "r")
                alexaList_tld = []  # entries with TLD
                alexa_list = []     # entries without TLD
                for alexa_line in alexa_txt:
                    alexaList_tld.append(alexa_line.strip())
                    alexa_list.append(alexa_line.strip().split(".")[0])
                alexa_txt.close()

                for alexa_line in alexaList_tld:
                    if line.strip() == alexa_line.strip():
                        is_find_tld = 1
                        break
                for alexa_line in alexa_list:
                    line_domain = line.split(".")[0]
                    if line_domain.strip() == alexa_line.strip():
                        is_find = 1
                        break
        except Exception:
            self.logger.debug("Error while processing " + line + " (length issue)")
            self.logger.error("url_rules.check_rule_11()-Alexa / Error : {0}".format(format_exc()))

        result = {}
        result['alexa1m_tld'] = 1 if is_find_tld == 1 else 0
        result['alexa1m'] = 1 if is_find == 1 else 0
        return result

    def alexa_trie(self, domain, tld):
        line = domain + "." + tld
        result = {}
        try:
            # if self.alexa1mm[line[0].lower()].has_key(line):
            if self.trie_alexa_tld.has_key(line):
                result['alexa1m_tld_trie'] = 1
            else:
                result['alexa1m_tld_trie'] = 0

            if self.trie_alexa_tldsiz.has_key(domain):
                result['alexa1m_tldsiz_trie'] = 1
            else:
                result['alexa1m_tldsiz_trie'] = 0
        except Exception:
            self.logger.debug("Error during Alexa lookup for " + line)
            self.logger.error("url_rules.check_rule_11()-Alexa / Error : {0}".format(format_exc()))
        return result

    def special_chars(self, domain, subdomain, path):
        special_char = {'-': 0, ".": 0, "/": 0, '@': 0, '?': 0, '&': 0, '=': 0, "_": 0}
        special_char_letter = special_char.keys()
        for l in domain:
            if l in special_char_letter:
                special_char[l] += 1
        for l in subdomain:
            if l in special_char_letter:
                special_char[l] += 1
        for l in path:
            if l in special_char_letter:
                special_char[l] += 1
        return special_char

    def check_domain_in_list(self, domain):
        result = {}
        if domain in self.allbrand:
            result['domain_in_brand_list'] = 1
        else:
            result['domain_in_brand_list'] = 0
        return result

    def nlp_features(self, words_raw):
        """
        grouped_words: keywords_in_words, brands_in_words, dga_in_words, len_lt_7, len_gt_7
        fraud_analyze_result: found_keywords, found_brands, similar_to_keyword,
        similar_to_brand, other_words, target_words
        """
        grouped_words = self.nlp_manager.parse(words_raw)
        splitted_words = self.word_splitter._splitl(grouped_words['len_gt_7'])

        fraud_analyze_result = self.nlp_manager.fraud_analysis(grouped_words, splitted_words)
        result = self.nlp_manager.evaluate(grouped_words, fraud_analyze_result, splitted_words)

        split = {'raw': grouped_words['len_gt_7'], 'splitted': splitted_words}
        result['info']['compound_words'] = split
        return result
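# --------------------------------------------------------------------------
# Illustration of the sliding-window logic behind url_rules.char_repeat():
# for every window size in {2, 3, 4, 5}, slide over each word and count the
# windows whose characters are all identical. This standalone sketch mirrors
# the method so the counting rule can be checked in isolation; it is not part
# of the class.
if __name__ == "__main__":
    def count_char_repeats(words, window_sizes=(2, 3, 4, 5)):
        total = 0
        for word in words:
            for size in window_sizes:
                for i in range(len(word) - size + 1):
                    window = word[i:i + size]
                    if all(ch == window[0] for ch in window):
                        total += 1
        return total

    # "paypaal" contains one identical 2-character window ("aa");
    # "ooo" contains two 2-windows and one 3-window, so the total is 4.
    print(count_char_repeats(["paypaal", "ooo"]))  # -> 4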
# Requires the project-internal NsLog, json2arff, domain_parser and
# rule_extraction helpers (imports not shown in the source).
import datetime
import json
from io import StringIO
from traceback import format_exc

import numpy as np
from scipy.io import arff
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.naive_bayes import GaussianNB


class machine_learning_algorithm:
    def __init__(self, algorithm, train_data_name="gsb.arff"):
        self.logger = NsLog("log")
        self.path_output_arff = "../output/arff/"
        self.path_test_output = ""
        self.json2arff_object = json2arff()
        self.parser_object = domain_parser()
        self.train_data_name = train_data_name
        self.rule_calculation = rule_extraction()
        self.train, self.test, self.train_label, self.test_label = self.split_test_and_train_data()
        self.time_now = str(datetime.datetime.now())[0:19].replace(" ", "_")

        if algorithm == 'NB':
            self.model = self.create_model_NB()
            self.active_model = self.active_create_model_NB()
        elif algorithm == 'RF':
            self.model = self.create_model_RF()
            self.active_model = self.active_create_model_RF()
        elif algorithm == 'GB':
            self.model = self.create_model_GB()
            self.active_model = self.active_create_model_GB()

    def txt_to_list(self, txt_object):
        lst = []
        for line in txt_object:
            lst.append(line.strip())
        txt_object.close()
        return lst

    def split_test_and_train_data(self):
        try:
            data, label = self.preparing_train_data()
            train, test, train_label, test_label = train_test_split(data, label,
                                                                    test_size=0.25,
                                                                    random_state=42)
        except Exception:
            self.logger.debug("Error while splitting " + self.train_data_name + " into train and test sets")
            self.logger.error("Error : {0}".format(format_exc()))
        return train, test, train_label, test_label

    def preparing_train_data(self, file_name="gsb.arff"):
        train = []
        target = []
        try:
            train_dataset, train_meta = arff.loadarff(
                open("{0}{1}".format(self.path_output_arff, file_name), "r"))
            train = train_dataset[train_meta.names()[:-1]]  # everything but the last column
            target = train_dataset[train_meta.names()[len(train_meta.names()) - 1]]  # last column
            train = np.asarray(train.tolist(), dtype=np.float32)  # structured array -> float matrix
        except Exception:
            self.logger.debug("Error while preparing training data from " + file_name)
            self.logger.error("Error : {0}".format(format_exc()))
        return train, target

    def preparing_test_data(self, test_dataset_list):
        try:
            feat_json = open("../output/test-output/json-" + self.time_now + ".txt", "w")
            feat_arff = open("../output/test-output/arff-" + self.time_now + ".arff", "w")

            # parse the unlabeled test domains into the JSON structure (no class information)
            self.test_parsed_domains = self.parser_object.parse_nonlabeled_samples(test_dataset_list)
            # calculate the rule-based features for the test samples (JSON output, no class)
            test_features = self.rule_calculation.extraction(self.test_parsed_domains)
            # convert the JSON built for the test samples to ARFF; there is no class column
            arff_test_str = self.json2arff_object.convert_for_test(test_features, '')

            feat_json.write(json.dumps(test_features))
            feat_arff.write(arff_test_str)
            feat_arff.close()
            feat_json.close()

            arff_raw = StringIO(arff_test_str)
            test_dataset, test_meta = arff.loadarff(arff_raw)
            test = test_dataset[test_meta.names()]
            test = np.asarray(test.tolist(), dtype=np.float32)
        except Exception:
            self.logger.error("Error while preparing test data : {0}".format(format_exc()))
        return test, self.test_parsed_domains

    def create_model_NB(self):
        gnb = GaussianNB()
        model = gnb.fit(self.train, self.train_label)
        return model

    def create_model_RF(self):
        # note: despite the _RF name, the original code builds a GradientBoostingClassifier here
        clf = GradientBoostingClassifier(n_estimators=10, max_depth=7, random_state=0, verbose=0)
        model = clf.fit(self.train, self.train_label)
        return model

    def create_model_GB(self):
        clf = GradientBoostingClassifier(n_estimators=10, max_depth=7, random_state=0, verbose=0)
        # clf = GradientBoostingClassifier(n_estimators=10, max_depth=7, random_state=0, verbose=0,
        #                                  max_features=50, min_samples_split=50, min_samples_leaf=50)
        model = clf.fit(self.train, self.train_label)
        return model

    def active_create_model_NB(self):
        # trained on the full dataset, not on the train split
        train, target = self.preparing_train_data()
        gnb = GaussianNB()
        model = gnb.fit(train, target)
        return model

    def active_create_model_RF(self):
        # note: despite the _RF name, the original code builds a GradientBoostingClassifier here
        train, target = self.preparing_train_data()
        clf = GradientBoostingClassifier(n_estimators=10, max_depth=7, random_state=0, verbose=0)
        model = clf.fit(train, target)
        return model

    def active_create_model_GB(self):
        clf = GradientBoostingClassifier(n_estimators=10, max_depth=7, random_state=0, verbose=0)
        model = clf.fit(self.train, self.train_label)
        return model

    def model_run(self, test):
        model = self.model
        model_pre = model.predict(test)
        model_probability = model.predict_proba(test)

        model_pre_list = []
        for p in model_pre:
            model_pre_list.append(str(p).replace("b'", "").replace("'", ""))
        model_probability = model_probability.tolist()
        return model_pre_list, model_probability

    def active_model_run(self, test):
        model = self.active_model
        model_pre = model.predict(test)
        model_probability = model.predict_proba(test)

        model_pre_list = []
        for p in model_pre:
            model_pre_list.append(str(p).replace("b'", "").replace("'", ""))
        model_probability = model_probability.tolist()
        return model_pre_list, model_probability

    def output(self, test_data):
        test = self.test
        model_pre, model_probability = self.model_run(test)

        result_list = []
        for i in range(len(self.test)):
            result = {}
            result['id'] = i
            result['class'] = model_pre[i]
            result_list.append(result)

        test_result = open("../output/test-output/result-" + self.time_now + ".txt", "w")
        test_result.write(json.dumps(result_list))
        test_result.close()
        return result_list

    def active_output(self, test_data):
        test, test_parsed_domains = self.preparing_test_data(test_data)
        model_pre, model_probability = self.active_model_run(test)
        test_parsed_domain = self.test_parsed_domains

        result_list = []
        for test_domain in test_parsed_domain:
            result = {}
            result['domain'] = test_domain['url']
            result['id'] = test_domain['id']
            result['predicted_class'] = model_pre[test_domain['id']]
            result['probability_phish'] = (model_probability[test_domain['id']][1] /
                                           sum(model_probability[test_domain['id']])) * 100
            result['probability_legitimate'] = (model_probability[test_domain['id']][0] /
                                                sum(model_probability[test_domain['id']])) * 100
            result_list.append(result)

        test_result = open("../output/test-output/result-" + self.time_now + ".txt", "w")
        test_result.write(json.dumps(result_list))
        test_result.close()
        return result_list

    def accuracy(self):
        model = self.model
        test_data, test_label = self.preparing_train_data()
        scores = cross_val_score(model, test_data, test_label, cv=10)
        return scores

    def test_accuracy(self):
        test_file = self.txt_to_list(open("../output/features/dp29114.txt", 'r'))
        self.result_list = self.output(test_file)
        count = 0
        print("Starting...\n")
        for i in range(len(self.test)):
            if self.test_label[i].decode('utf-8') == self.result_list[i]['class']:
                count = count + 1
        acc = (count / len(self.test)) * 100
        return acc, count

    def confusion_matrix(self, name):
        """
        The model held by this class is trained on the gsb.arff training set. The
        dataset we want a confusion matrix for is read in ARFF format with
        preparing_train_data() and split into data and labels; the data is run
        through the model and the predicted labels are collected in model_pre.
        test_label arrives as a bytes array and is decoded to unicode before the
        confusion matrix is computed. A second matrix is computed on the training
        split for comparison.
        :param name: ARFF file name under ../output/arff/
        :return: (confusion matrix on the training split, confusion matrix on `name`)
        """
        test, test_label = self.preparing_train_data(file_name=name)
        model_pre, model_pro = self.model_run(test)
        test_label_unicode = []
        for t in test_label:
            test_label_unicode.append(str(t, 'utf-8'))
        active = confusion_matrix(test_label_unicode, model_pre, labels=['phish', 'legitimate'])

        test = self.train
        test_label = self.train_label
        model_pre, model_pro = self.model_run(test)
        test_label_unicode = []
        for t in test_label:
            test_label_unicode.append(str(t, 'utf-8'))

        return confusion_matrix(test_label_unicode, model_pre, labels=['phish', 'legitimate']), active
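# --------------------------------------------------------------------------
# Sketch of the ARFF handling used by preparing_train_data(): load an ARFF
# file with scipy.io.arff.loadarff, take every column except the last as the
# numeric feature matrix and the last column as the label vector, then cast
# the features to a float32 ndarray. The tiny in-memory ARFF below stands in
# for ../output/arff/gsb.arff, which is not reproduced here.
if __name__ == "__main__":
    from io import StringIO

    import numpy as np
    from scipy.io import arff

    demo_arff = StringIO(
        "@relation weka-test\n\n"
        "@attribute domain_length numeric\n"
        "@attribute subdomain_digit_count numeric\n"
        "@attribute class {phish,legitimate}\n\n"
        "@data\n"
        "12,0,legitimate\n"
        "34,3,phish\n"
    )

    dataset, meta = arff.loadarff(demo_arff)
    feature_names = meta.names()[:-1]       # everything but the class column
    train = np.asarray(dataset[feature_names].tolist(), dtype=np.float32)
    target = dataset[meta.names()[-1]]      # nominal labels load as byte strings

    print(train.shape, [t.decode("utf-8") for t in target])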
# Requires the project-internal NsLog wrapper and the gib_detect_train module of the
# gibberish detector (imports not shown in the source).
import pickle
import re
from traceback import format_exc

import editdistance
import numpy as np


class nlp_class:
    def __init__(self):
        self.logger = NsLog("log")
        self.path_data = "../data/"
        self.name_keywords = "keywords.txt"
        self.name_brand_file = "allbrand.txt"
        self.name_random_model = "gib_model.pki"

        model_data = pickle.load(open(self.name_random_model, 'rb'))
        self.model_mat = model_data['mat']
        self.threshold = model_data['thresh']

        self.allbrand = self.__txt_to_list(
            open("{0}{1}".format(self.path_data, self.name_brand_file), "r"))
        self.keywords = self.__txt_to_list(
            open("{0}{1}".format(self.path_data, self.name_keywords), "r"))

    def __txt_to_list(self, txt_object):
        lines = []
        for line in txt_object:
            lines.append(line.strip())
        txt_object.close()
        return lines

    def __is_similar_to_any_element(self, word, candidates):
        # return the word itself if its edit distance to any candidate is at most 1, else 0
        target = ''
        for candidate in candidates:
            if editdistance.eval(word, candidate) < 2:
                target = candidate
        if len(target) > 0:
            return word
        else:
            return 0

    def parse(self, words):
        keywords_in_words = []
        brands_in_words = []
        similar_to_brands = []
        similar_to_keywords = []
        dga_in_words = []
        len_gt_7 = []
        len_lt_7 = []
        try:
            for word in words:
                word = re.sub(r"\d+", "", word)
                if word in self.keywords:
                    keywords_in_words.append(word)
                elif word in self.allbrand:
                    brands_in_words.append(word)
                elif self.__is_similar_to_any_element(word, self.allbrand) != 0:
                    target = self.__is_similar_to_any_element(word, self.allbrand)
                    similar_to_brands.append(target)
                elif self.__is_similar_to_any_element(word, self.keywords) != 0:
                    target = self.__is_similar_to_any_element(word, self.keywords)
                    similar_to_keywords.append(target)
                elif len(word) > 3 and not word.isnumeric():
                    if gib_detect_train.avg_transition_prob(word, self.model_mat) <= self.threshold:
                        dga_in_words.append(word)
                        # TODO: words that merely resemble keywords should be filtered out
                    elif len(word) < 7:
                        len_lt_7.append(word)
                    else:
                        len_gt_7.append(word)

            result = {
                'keywords_in_words': keywords_in_words,
                'brands_in_words': brands_in_words,
                'dga_in_words': dga_in_words,
                'len_lt_7': len_lt_7,
                'len_gt_7': len_gt_7,
                'similar_to_brands': similar_to_brands,
                'similar_to_keywords': similar_to_keywords
            }
        except Exception:
            self.logger.debug("Error while processing " + str(words))
            self.logger.error("Error : {0}".format(format_exc()))
        return result

    def fraud_analysis(self, grouped_words, splitted_words):
        word_list = grouped_words['len_lt_7'] + grouped_words['similar_to_brands'] + \
            grouped_words['similar_to_keywords'] + splitted_words
        word_list_nlp = grouped_words['len_lt_7'] + grouped_words['similar_to_brands'] + \
            grouped_words['similar_to_keywords'] + grouped_words['brands_in_words'] + \
            grouped_words['keywords_in_words'] + grouped_words['dga_in_words'] + splitted_words

        found_keywords = []
        found_brands = []
        similar_to_keyword = []
        similar_to_brand = []
        other_words = []
        target_words = {'brand': [], 'keyword': []}
        try:
            for word in word_list:
                word = re.sub(r"\d+", "", word)
                if word in self.keywords:
                    found_keywords.append(word)
                elif word in self.allbrand:
                    found_brands.append(word)
                else:
                    for brand in self.allbrand:
                        if editdistance.eval(word, brand) < 2:
                            target_words['brand'].append(brand)
                            similar_to_brand.append(word)
                    for keyword in self.keywords:
                        if editdistance.eval(word, keyword) < 2:
                            target_words['keyword'].append(keyword)
                            similar_to_keyword.append(word)
                    if word not in found_keywords + found_brands + similar_to_keyword + similar_to_brand:
                        other_words.append(word)

            result = {
                'found_keywords': found_keywords,
                'found_brands': found_brands,
                'similar_to_keywords': similar_to_keyword,
                'similar_to_brands': similar_to_brand,
                'other_words': other_words,
                'target_words': target_words,
                'words_nlp': word_list_nlp
            }
        except Exception:
            self.logger.debug("Error while processing " + str(word_list))
            self.logger.error("Error : {0}".format(format_exc()))
        return result

    def evaluate(self, grouped_words, fraud_analyze_result, splitted_words):
        """
        grouped_words: keywords_in_words, brands_in_words, dga_in_words, len_lt_7, len_gt_7
        fraud_analyze_result: found_keywords, found_brands, similar_to_keyword,
        similar_to_brand, other_words, target_words
        """
        try:
            words_raw = grouped_words['keywords_in_words'] + grouped_words['brands_in_words'] + \
                grouped_words['similar_to_brands'] + grouped_words['similar_to_keywords'] + \
                grouped_words['dga_in_words'] + grouped_words['len_lt_7'] + grouped_words['len_gt_7']

            words_len = []
            compound_word_len = []
            for word in words_raw:
                words_len.append(len(word))
            for word in grouped_words['len_gt_7']:
                compound_word_len.append(len(word))

            all_keywords = grouped_words['keywords_in_words'] + fraud_analyze_result['found_keywords']
            all_brands = grouped_words['brands_in_words'] + fraud_analyze_result['found_brands']
            similar_brands = fraud_analyze_result['similar_to_brands']
            similar_keywords = fraud_analyze_result['similar_to_keywords']

            if len(compound_word_len) == 0:
                av_com = 0
            else:
                av_com = float(np.average(compound_word_len))

            if len(words_len) == 0:
                min_len = 0
                max_len = 0
                av_w = 0
                std = 0
            else:
                min_len = int(np.min(words_len))
                max_len = int(np.max(words_len))
                av_w = float(np.average(words_len))
                std = float(np.std(words_len))

            result = {
                'info': {
                    'keywords': all_keywords,
                    'brands': all_brands,
                    'dga_in_words': grouped_words['dga_in_words'],
                    'similar_to_keywords': similar_keywords,
                    'similar_to_brands': similar_brands,
                    'negligible_words': fraud_analyze_result['other_words'],
                    'target_words': fraud_analyze_result['target_words'],
                    'words_nlp': fraud_analyze_result['words_nlp']
                },
                'features': {
                    'raw_word_count': len(words_len),
                    'splitted_word_count': len(splitted_words),
                    'average_word_length': av_w,
                    'longest_word_length': max_len,
                    'shortest_word_length': min_len,
                    'std_word_length': std,
                    'compound_word_count': len(grouped_words['len_gt_7']),
                    'keyword_count': len(all_keywords),
                    'brand_name_count': len(all_brands),
                    'negligible_word_count': len(fraud_analyze_result['other_words']),
                    'target_brand_count': len(fraud_analyze_result['target_words']['brand']),
                    'target_keyword_count': len(fraud_analyze_result['target_words']['keyword']),
                    'similar_keyword_count': len(similar_keywords),
                    'similar_brand_count': len(similar_brands),
                    'average_compound_words': av_com,
                    'random_words': len(grouped_words['dga_in_words'])
                }
            }
        except Exception:
            self.logger.error("Error : {0}".format(format_exc()))
        return result

    def check_word_random(self, word):
        if gib_detect_train.avg_transition_prob(word, self.model_mat) < self.threshold:
            return 1
        else:
            return 0
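# --------------------------------------------------------------------------
# Illustration of the Levenshtein-distance test behind
# __is_similar_to_any_element() and fraud_analysis(): a word counts as
# "similar" to a brand or keyword when its edit distance to that target is at
# most 1 (the code checks `< 2`). Uses the same editdistance package as the
# class; the word list here is made up for the example.
if __name__ == "__main__":
    import editdistance

    brands = ["paypal", "google", "amazon"]

    def similar_brand(word, brands, max_distance=1):
        """Return the first brand within max_distance edits of word, else None."""
        for brand in brands:
            if editdistance.eval(word, brand) <= max_distance:
                return brand
        return None

    print(similar_brand("paypa1", brands))  # 'paypal'  (one substitution away)
    print(similar_brand("g00gle", brands))  # None      (two substitutions away)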
# Requires the project-internal NsLog, json2arff, domain_parser and
# rule_extraction helpers (imports not shown in the source).
import datetime
import json
from traceback import format_exc


class Train:
    def __init__(self):
        self.logger = NsLog("log")
        self.json2arff_object = json2arff()
        self.parser_object = domain_parser()
        self.rule_calculation = rule_extraction()

        self.path_input = "../input/"
        self.path_arff = "../output/arff/"
        self.path_features = "../output/features/"
        self.path_parsed_domain = "../output/domain_parser/"

    def txt_to_list(self, txt_object):
        lst = []
        for line in txt_object:
            lst.append(line.strip())
        txt_object.close()
        return lst

    def domain_parser(self, param):
        # param is an argv-style list: (input file, class label) pairs starting at index 1
        parsed_domains = []
        for i in range(1, len(param), 2):
            try:
                if param[i + 1] == 'phish' or param[i + 1] == 'legitimate':
                    # dataset = self.txt_to_list(open("{0}{1}".format(self.path_input, param[i]), "r"))  # txt read
                    dataset = json.loads(
                        open("{0}{1}".format(self.path_input, param[i]), "r").read())  # json read
                    parsed_domains = parsed_domains + self.parser_object.parse(
                        dataset, param[i + 1], len(parsed_domains))
                else:
                    self.logger.debug("class labels must be entered as one of (phish, legitimate)")
            except Exception:
                self.logger.error("an error occurred : {0}".format(format_exc()))
                self.logger.debug("an error occurred while the | {0} | file was being processed".format(param[i]))

        self.logger.info("Domain Parse process is done, {0} unique URLs were parsed".format(len(parsed_domains)))
        return parsed_domains

    def json_to_file(self, name, path, data):
        time_now = str(datetime.datetime.now())[0:19].replace(" ", "_")
        file_name = name + "_" + time_now + ".txt"
        file = open(path + file_name, "w")
        file.write(json.dumps(data))
        file.close()
        self.logger.info("{0} was written to file.".format(name))

    def arff_to_file(self, name, path, data):
        time_now = str(datetime.datetime.now())[0:19].replace(" ", "_")
        file_name = name + "_" + time_now + ".txt"
        file = open(path + file_name, "w")
        file.write(data)
        file.close()
        self.logger.info("{0} was written to file.".format(name))
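# --------------------------------------------------------------------------
# Sketch of the inputs Train.domain_parser() expects. `param` is a flat,
# argv-style list: element 0 is ignored, and the remaining elements come in
# (input file, class label) pairs, where each file under ../input/ holds a
# JSON list of URL samples and the label is either 'phish' or 'legitimate'.
# The file names below are hypothetical. The snippet also shows the
# timestamped naming convention used by json_to_file()/arff_to_file().
if __name__ == "__main__":
    import datetime

    # hypothetical call:  Train().domain_parser(param)
    param = ["train", "phish_urls.json", "phish", "legit_urls.json", "legitimate"]
    pairs = [(param[i], param[i + 1]) for i in range(1, len(param), 2)]
    print(pairs)  # [('phish_urls.json', 'phish'), ('legit_urls.json', 'legitimate')]

    # output files are suffixed with "YYYY-MM-DD_HH:MM:SS" derived like this:
    time_now = str(datetime.datetime.now())[0:19].replace(" ", "_")
    print("features_" + time_now + ".txt")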
# Requires the project-internal NsLog logging wrapper (import not shown in the source).
import datetime
import json
import pprint
from traceback import format_exc

import requests


class active_rules:
    def __init__(self):
        self.pp = pprint.PrettyPrinter(indent=2)
        self.logger = NsLog("log")

    def goog_safe_browsing(self, domain_features):
        try:
            url_list = []
            updated_domain_features = domain_features
            for sample in domain_features:
                url_list.append(sample['info']['url'])

            sep_list = self.__seperate(url_list, 500)
            # phishing_url_list = self.get_urls(self.google_sb_query(list, sep_list.index(list), len(domain_features)))  # live query
            phishing_url_list = self.get_urls(json.loads(open("constant/gb_phish.json", "r").read()))  # from file

            updated_domain_features = []
            for each in domain_features:
                element = each
                if each['info']['url'] in phishing_url_list:
                    element.update({'active_features': {'google_safe_browsing': 1}})
                else:
                    element.update({'active_features': {'google_safe_browsing': 0}})
                updated_domain_features.append(element)
        except Exception:
            self.logger.error("Error : {0}".format(format_exc()))
        return updated_domain_features

    def __seperate(self, url_list, size):
        # split url_list into consecutive chunks of at most `size` elements
        sep_urls = []
        k = int((len(url_list) / size) + 1)
        for i in range(1, k + 1):
            if (i * size) > len(url_list):
                sep_urls.append(url_list[(i - 1) * size: len(url_list)])
            else:
                sep_urls.append(url_list[(i - 1) * size: i * size])
        return sep_urls

    def google_sb_query(self, url_list, count, overall_count):
        query_url_list = self.sb_format(url_list)
        sep_list = self.__seperate(query_url_list, 500)

        phish_url_list = []
        for chunk in sep_list:
            api_key = 'AIzaSyCGmGpCMt-PNQTrWAsp3LqcM_UvCF6NJ1I'
            url = "https://safebrowsing.googleapis.com/v4/threatMatches:find"
            payload = {'client': {'clientId': "mycompany", 'clientVersion': "0.1"},
                       'threatInfo': {'threatTypes': ["SOCIAL_ENGINEERING", "MALWARE"],
                                      'platformTypes': ["ANY_PLATFORM"],
                                      'threatEntryTypes': ["URL"],
                                      'threatEntries': chunk}}
            params = {'key': api_key}
            r = requests.post(url, params=params, json=payload).json()

            if 'matches' in r.keys():
                for each in r['matches']:
                    phish_url_list.append(each['threat']['url'])
            elif 'error' in r.keys():
                self.logger.debug("Error in Google-SB query - total samples processed: " +
                                  str(overall_count) + "\nprocessed chunk (of 500): " + str(count))
        return phish_url_list

    def sb_format(self, url_list):
        sb_query = []
        for url in url_list:
            sb_query.append({'url': url})
        return sb_query

    def get_urls(self, ph_db_json):
        urls = []
        for obj in ph_db_json:
            urls.append(obj['url'])
        return urls

    def txt_to_list(self, txt_object):
        lines = []
        for line in txt_object:
            lines.append(line.strip())
        txt_object.close()
        return lines
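# --------------------------------------------------------------------------
# Sketch of the batching and payload layout used by goog_safe_browsing() and
# google_sb_query(): URLs are wrapped as {'url': ...} entries, split into
# batches of at most 500, and each batch becomes the threatEntries field of a
# Safe Browsing v4 threatMatches:find request. No request is sent here; the
# URLs are made up and the payload is only printed.
if __name__ == "__main__":
    import json

    urls = ["http://example-{0}.test/login".format(i) for i in range(1, 1202)]
    entries = [{'url': u} for u in urls]

    batch_size = 500
    batches = [entries[i:i + batch_size] for i in range(0, len(entries), batch_size)]
    print([len(b) for b in batches])  # [500, 500, 201]

    payload = {
        'client': {'clientId': "mycompany", 'clientVersion': "0.1"},
        'threatInfo': {'threatTypes': ["SOCIAL_ENGINEERING", "MALWARE"],
                       'platformTypes': ["ANY_PLATFORM"],
                       'threatEntryTypes': ["URL"],
                       'threatEntries': batches[0]},
    }
    print(json.dumps(payload)[:120] + "...")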