def get_non_dga_domains():
    """Export the malicious domains whose description does not mention DGA.

    The same ``must_not`` query (``info.Desc`` must not match ``"DGA"``) is
    used twice:

    * the result of ``set_mal_domain_index_params`` is written to
      ``FULL_DOM_DIR + "es_non_dga.txt"``;
    * the mapping returned by ``set_mal_domain_index_params1`` (called with
      ``get_domains_with_type``) is printed entry by entry and written to
      ``PRE_DIR + "es_non_dga_with_type.txt"``.

    :return: None
    """
    dga_clause = {
        "query_string": {"default_field": "info.Desc", "query": "DGA"}
    }
    query_body = {"query": {"bool": {"must_not": [dga_clause]}}}

    bad_domains = set_mal_domain_index_params(query_body)
    print("len of bad_domains: %s" % len(bad_domains))
    plain_path = FULL_DOM_DIR + "es_non_dga.txt"
    write2file(plain_path, bad_domains)

    # Write the non-DGA domains together with their source and type.
    typed_path = PRE_DIR + "es_non_dga_with_type.txt"
    domain_dict = set_mal_domain_index_params1(get_domains_with_type, query_body)
    for name, details in domain_dict.items():
        # Each value is indexed as (source, malicious type).
        source, mal_type = details[0], details[1]
        print("domain:%s, source: %s, mal_type: %s" % (name, source, mal_type))
    write_domain_with_type_2file(typed_path, domain_dict)
def test_mal_domains(db, domain_bad, recs):
    """Re-scan stored domains and keep only the sub-domains still flagged.

    For every record in ``recs`` the second-level domain is checked with
    ``scan_url``. When it is flagged, each sub-domain that has not been
    verified yet is scanned as well and the ones that pass are saved through
    ``save_mal_domains2mongodb``. When it is not flagged, the domain is only
    collected into a list that is finally written to ``NOT_MAL_DOM_FILE``.

    :param db: database handle forwarded to ``save_mal_domains2mongodb``
    :param domain_bad: key into ``mongo_index_dict`` selecting the collection
    :param recs: iterable of records exposing ``DOMAIN_2ND_FIELD``,
        ``SUBDOMAINS_FIELD`` and optionally ``VER_SUBDOMAINS_FIELD``
    :return: None
    """
    mongo_index = mongo_index_dict[domain_bad]
    not_mal_domains = []
    for idx, domain_dict in enumerate(recs):
        domain_2nd = domain_dict[DOMAIN_2ND_FIELD]
        sub_domains = domain_dict[SUBDOMAINS_FIELD]
        ver_sub_domains = domain_dict.get(VER_SUBDOMAINS_FIELD, [])
        print("handlering %s domain %s" % (idx, domain_2nd))

        if not scan_url(domain_2nd):
            # A domain reported as clean must not be deleted right away:
            # the scanner sometimes reports a malicious domain as normal.
            not_mal_domains.append(domain_2nd)
            continue

        # Only sub-domains that were not verified before need scanning.
        candidates = set(sub_domains) - set(ver_sub_domains)
        # Build a new list instead of removing from the list being iterated;
        # removing in place skipped the element following each removal.
        confirmed = []
        for sub_domain in candidates:
            # A sub-domain equal to the second-level domain was already
            # scanned above, so it is accepted without another scan.
            if sub_domain == domain_2nd or scan_url(sub_domain):
                confirmed.append(sub_domain)
            else:
                print("domain_2nd: %s, sub_domain: %s" % (domain_2nd, sub_domain))
        save_mal_domains2mongodb(db, mongo_index, domain_2nd, confirmed)

    if not_mal_domains:
        print("notmal_count: %s" % (len(not_mal_domains), ))
        # Write the non-malicious domains to a file so they can be deleted later.
        write2file(NOT_MAL_DOM_FILE, not_mal_domains)
def check_domains(domains, domain_bad, batch_num=50):
    """Compute name-based features for each domain and write them to disk.

    One feature row is built per domain and all rows are written with
    ``write2csv`` to ``str(domain_bad) + "_" + DOMAIN_NAME_FEATURE_FILE``.
    The longest meaningful substrings seen are written to
    ``str(domain_bad) + "_" + LONGEST_SUBSTRING_FILE`` after that file has
    been removed with ``remove_file``.

    :param domains: sized collection of domain names
    :param domain_bad: value used as the prefix of both output file names
    :param batch_num: a progress line is printed every ``batch_num`` domains
        and for the last domain
    :return: None
    """
    feature_rows = []
    seen_substrings = set()
    for position, domain in enumerate(domains, start=1):
        domain_2nd = keep_2nd_dom_name(domain)
        name_len = len(domain_2nd)
        n_digits, digit_segs, word_segs = word_segment(domain_2nd)
        # NOTE(review): the digit count comes from domain_2nd but is divided
        # by the length of the full domain -- confirm this is intended.
        digit_ratio = n_digits / len(domain)
        # Number of digit groups found in the second-level domain.
        digit_group_count = len(digit_segs)
        # Number of word segments, e.g. "w3cschool" -> "w", "c", "school".
        word_group_count = len(word_segs)
        # Length of the longest meaningful substring, and the substring itself.
        longest_len, longest_substring = get_longest_meaningful_substring_v0(
            word_segs)
        entropy = cal_domain_name_entropy(domain)
        seen_substrings.add(longest_substring)

        print('==============================================================')
        print('domain: {0}, domain_2nd: {1}, digit_segs: {2}, word_segs:{3}'.format(
            domain, domain_2nd, digit_segs, word_segs))
        print('domain_2nd: {0}, n_digits: {1}, n_groups_digits: {2}, n_group_word_segs: {3}'.format(
            domain_2nd, n_digits, digit_group_count, word_group_count))
        print('domain_2nd: {0}, longest_len:{1},longest_substring: {2}'.format(
            domain_2nd, longest_len, longest_substring))

        row = {}
        row[DOMAIN_2ND_FIELD] = domain
        row[DOMAIN_LEN] = name_len
        row[DOMAIN_NAME_ENTROPY] = entropy
        row[N_DIGITS] = n_digits
        row[DIGIT_NUMBER_RATIO] = digit_ratio
        row[N_GROUPS_OF_DIGITS] = digit_group_count
        row[WORD_SEG_GROUP] = word_group_count
        # Share of the whole name covered by the longest meaningful substring.
        row[LONGEST_SUBSTRING_RATIO] = longest_len / name_len
        feature_rows.append(row)

        if position % batch_num == 0 or position == len(domains):
            print('第{0}个域名正在统计'.format(position))

    print("==========domain_info==============")
    columns_fields = [
        DOMAIN_2ND_FIELD,
        DOMAIN_LEN,
        DOMAIN_NAME_ENTROPY,
        N_DIGITS,
        DIGIT_NUMBER_RATIO,
        N_GROUPS_OF_DIGITS,
        WORD_SEG_GROUP,
        LONGEST_SUBSTRING_RATIO,
    ]
    prefix = str(domain_bad) + "_"
    write2csv(feature_rows, columns_fields, prefix + DOMAIN_NAME_FEATURE_FILE,
              DOMAIN_2ND_FIELD)

    longest_substring_file = prefix + LONGEST_SUBSTRING_FILE
    remove_file(longest_substring_file)
    write2file(longest_substring_file, seen_substrings)
def remove_duplicate_from_file(file):
    """Rewrite ``file`` so that every distinct line is kept only once.

    The lines are read, stripped of newline characters and collected in a
    set; the file is then removed with ``remove_file`` and the set is handed
    to ``write2file``. The number of distinct entries is printed.

    :param file: path of the file to de-duplicate in place
    :return: None
    """
    with open(file) as handle:
        unique_domains = {raw.strip("\n") for raw in handle.readlines()}
    remove_file(file)
    write2file(file, unique_domains)
    print("%s unique bad domains" % (len(unique_domains)))
def test_domains(file, dst_file, choice=2):
    """Verify suspected malicious domains and record the confirmed ones.

    Every line of ``file`` is passed to ``scan_url``; flagged domains are
    written to ``dst_file`` in batches through ``write2file``. When
    ``dst_file`` already exists, the line returned by
    ``find_last_checked_lines`` is looked up in ``file`` and scanning resumes
    on the line after it. If that line is not present in ``file`` the lookup
    raises ``ValueError``, which is reported by the generic handler below and
    no domain is scanned.

    :param file: source file holding the domains waiting to be verified
    :param dst_file: file the confirmed malicious domains are written to
    :param choice: 2 for second-level domains, 3 for third-level domains
        (not used by the body)
    :return: None
    """
    print("file: %s, dst_file: %s" % (file, dst_file))
    bad_domains = []
    total_bad = 0  # confirmed domains over the whole run, across batches
    long_pause = True  # alternates between the two sleep ranges
    batch_num = 5  # number of domains buffered before writing to dst_file
    try:
        with open(file, "r") as f_in:
            lines = f_in.readlines()
        if os.path.exists(dst_file):
            v_last_line = find_last_checked_lines(dst_file)
            # list.index() either returns a valid position or raises, so no
            # extra bounds check is needed before slicing.
            lines = lines[lines.index(v_last_line) + 1:]
        print("there is %s left to be handled" % (len(lines), ))
        for line in lines:
            print("==============================================")
            start_time = time.time()
            if len(bad_domains) >= batch_num:
                print("bad_domains write to file")
                write2file(dst_file, bad_domains)
                bad_domains = []
            domain = line.strip("\n")
            bad_flag = scan_url(domain)
            if bad_flag:
                print("add bad_domain: %s" % domain)
                bad_domains.append(domain)
                total_bad += 1
            # Alternate between a longer and a shorter random pause
            # between consecutive scans.
            if long_pause:
                random_num = random.randint(10, 20)
            else:
                random_num = random.randint(5, 15)
            long_pause = not long_pause
            time.sleep(random_num)
            cost_time = time.time() - start_time
            print("handle: %s,bad_flag: %s, cost_time: %s" %
                  (domain, bad_flag, cost_time))
    except Exception as e:
        # Best effort: report the problem and still flush what was collected.
        print("error: %s" % e)
    finally:
        # Report the running total; the buffer alone only holds the domains
        # found since the last batch write.
        print("totally %s domains are bad!" % total_bad)
        if bad_domains:
            write2file(dst_file, bad_domains)
def read_niclog_url_files(file_list, mal_domain_set):
    """Match the domains of each niclog file against known malicious domains.

    Each file is read with ``read_niclog_url_file``; the read time and the
    size reported by ``sys.getsizeof`` for the returned set are printed, and
    the domains also present in ``mal_domain_set`` are written to
    ``BAD_URL_DOMAINS_FILE``.

    :param file_list: iterable of niclog file paths
    :param mal_domain_set: set of known malicious domains
    :return: None
    """
    separator = "=================================================================="
    for file in file_list:
        started_at = time.time()
        unknown_domain_set = read_niclog_url_file(file)
        cost_time = time.time() - started_at

        print(separator)
        domain_total = len(unknown_domain_set)
        size_kbytes = sys.getsizeof(unknown_domain_set) / 1024
        print("%s domains, size: %s Kbytes" % (domain_total, size_kbytes))
        print("cost_time: %s 秒" % (cost_time))

        matched_domains = unknown_domain_set & mal_domain_set
        print("%s bad domains found in file %s" % (len(matched_domains), file))
        write2file(BAD_URL_DOMAINS_FILE, matched_domains)
def get_good_niclog_domain():
    """
    Extract the normal domains that were visited in niclog.

    Every record of ``db_basic[DOMAIN_BASIC_COL]`` contributes its
    ``DOMAIN_2ND_FIELD`` value; the distinct values are written to
    ``good_niclog_url.txt``.

    :return: None
    """
    collection = db_basic[DOMAIN_BASIC_COL]
    domain_set = {rec[DOMAIN_2ND_FIELD] for rec in collection.find()}
    print("len of domain_set: %s" % (len(domain_set)))
    write2file("good_niclog_url.txt", domain_set)
def read_file_list(dir, choice):
    """
    Convert every domain file found in ``dir`` to 2- or 3-level domains.

    Each file is read with ``read_file`` and the resulting domains are
    written to ``UVER_DOM_DIR + <prefix>_<choice>.txt``, where ``<prefix>``
    is the part of the file name before its first dot. An existing output
    file is removed first.

    :param dir: directory holding files of fully qualified malicious domains
    :param choice: 2 keeps second-level domains, 3 keeps third-level domains
    :return: None
    """
    count = 0
    for file in os.listdir(dir):
        # Read from the directory that was listed; the path used to be built
        # from FULL_DOM_DIR, which ignored the ``dir`` argument.
        src_path = os.path.join(dir, file)
        domains_set = set(read_file(src_path, choice))
        count += len(domains_set)

        file_prefix = file.split(".")[0]
        dst_path = UVER_DOM_DIR + file_prefix + "_" + str(choice) + ".txt"
        remove_file(dst_path)
        write2file(dst_path, domains_set)
        print("write to file:%s" % dst_path)
    print("totally %s domains converted to %s level domain" % (count, choice))
def get_non_dga_domains():
    """Write the malicious domains not described as DGA to a text file.

    The query excludes documents whose ``info.Desc`` field matches ``"DGA"``.
    The result of ``set_mal_domain_index_params`` is counted, printed and
    written to ``FULL_DOM_DIR + "es_non_dga.txt"``.

    :return: None
    """
    exclude_dga = {
        "query_string": {
            "default_field": "info.Desc",
            "query": "DGA",
        }
    }
    query_body = {"query": {"bool": {"must_not": [exclude_dga]}}}

    bad_domains = set_mal_domain_index_params(query_body)
    print("len of bad_domains: %s" % len(bad_domains))
    target_path = FULL_DOM_DIR + "es_non_dga.txt"
    write2file(target_path, bad_domains)
from common.mongodb_op import mongo_url
from common.mongodb_op import MAL_DOMS_MONGO_DB, MAL_DOMAINS_MONGO_INDEX
from common.mongo_common import DOMAIN_2ND_FIELD, MAL_TYPE, SOURCE_SIET
from common.domains_op import write2file
from get_visited_bad_domains_info.get_mal_domains_from_niclog import OLD_141_BAD_DOMAINS_FILE

# NOTE(review): MongoClient and get_visited_domains are used below but are not
# imported in the visible part of this file -- confirm they are in scope.
client = MongoClient(mongo_url)


def show_visited_bad_domains(domains):
    """
    Show the "141" malicious domains matched in niclog, with their source
    and malicious type.

    Each domain is looked up by ``DOMAIN_2ND_FIELD`` in the
    ``MAL_DOMAINS_MONGO_INDEX`` collection of ``MAL_DOMS_MONGO_DB``. A
    missing field, or a domain with no stored document at all, is reported
    as ``"unknown"``.

    :param domains: iterable of second-level domain names
    :return: None
    """
    collection = client[MAL_DOMS_MONGO_DB][MAL_DOMAINS_MONGO_INDEX]
    for domain_2nd in domains:
        # One find_one() call replaces indexing a find() cursor twice, which
        # ran the query twice and raised IndexError when nothing matched.
        rec = collection.find_one({DOMAIN_2ND_FIELD: domain_2nd}) or {}
        mal_type = rec.get(MAL_TYPE, "unknown")
        source = rec.get(SOURCE_SIET, "unknown")
        print("domain %s captured, source: %s, type: %s" % (domain_2nd, source, mal_type))


if __name__ == '__main__':
    domain_bad = 1
    domains = get_visited_domains(domain_bad)
    show_visited_bad_domains(domains)
    write2file(OLD_141_BAD_DOMAINS_FILE, domains)