Esempio n. 1
0
def get_non_dga_domains():
    """Export the malicious domains whose description does not mention DGA.

    The same query body is used twice: once to write the plain domain list to
    ``FULL_DOM_DIR + "es_non_dga.txt"``, and once to write each domain with
    its source and malware type to ``PRE_DIR + "es_non_dga_with_type.txt"``.

    :return: None
    """
    # Exclude every record whose ``info.Desc`` field matches "DGA".
    dga_clause = {
        "query_string": {
            "default_field": "info.Desc",
            "query": "DGA"
        }
    }
    query_body = {"query": {"bool": {"must_not": [dga_clause]}}}

    bad_domains = set_mal_domain_index_params(query_body)
    print("len of bad_domains: %s" % len(bad_domains))
    plain_path = FULL_DOM_DIR + "es_non_dga.txt"
    write2file(plain_path, bad_domains)

    # Write the non-DGA domains together with their source and type.
    typed_path = PRE_DIR + "es_non_dga_with_type.txt"
    domain_dict = set_mal_domain_index_params1(get_domains_with_type,
                                               query_body)
    for domain in domain_dict:
        info = domain_dict[domain]
        print("domain:%s, source: %s, mal_type: %s" %
              (domain, info[0], info[1]))
    write_domain_with_type_2file(typed_path, domain_dict)
def test_mal_domains(db, domain_bad, recs):
    """Re-check stored malicious domains and their sub-domains with ``scan_url``.

    For each record whose second-level domain is still flagged by
    ``scan_url``, the sub-domains that are not yet listed as verified are
    scanned as well; those still flagged (plus any sub-domain equal to the
    second-level domain itself) are passed to ``save_mal_domains2mongodb``.
    Second-level domains that are not flagged are written to
    ``NOT_MAL_DOM_FILE`` for later removal.

    :param db: database handle forwarded to ``save_mal_domains2mongodb``
    :param domain_bad: key into ``mongo_index_dict`` selecting the index
    :param recs: iterable of records providing ``DOMAIN_2ND_FIELD`` and
        ``SUBDOMAINS_FIELD``; ``VER_SUBDOMAINS_FIELD`` is optional
    :return: None
    """
    mongo_index = mongo_index_dict[domain_bad]
    not_mal_domains = []
    for idx, domain_dict in enumerate(recs):
        domain_2nd = domain_dict[DOMAIN_2ND_FIELD]
        sub_domains = domain_dict[SUBDOMAINS_FIELD]
        ver_sub_domains = domain_dict.get(VER_SUBDOMAINS_FIELD, [])
        print("handlering %s domain %s" % (idx, domain_2nd))

        if not scan_url(domain_2nd):
            # A domain judged benign is only recorded, never deleted here:
            # the scanner sometimes reports a malicious domain as normal.
            not_mal_domains.append(domain_2nd)
            continue

        unverified = list(set(sub_domains) - set(ver_sub_domains))
        # Build a new list instead of removing from the one being iterated;
        # removing in place made the loop skip the element that followed
        # each removed entry, so it was saved without ever being scanned.
        still_mal = []
        for sub_domain in unverified:
            # A sub-domain identical to the second-level domain was already
            # scanned above, so it is kept without a second scan.
            if sub_domain == domain_2nd or scan_url(sub_domain):
                still_mal.append(sub_domain)
            else:
                print("domain_2nd: %s, sub_domain: %s" %
                      (domain_2nd, sub_domain))
        save_mal_domains2mongodb(db, mongo_index, domain_2nd, still_mal)

    notmal_count = len(not_mal_domains)
    if notmal_count:
        print("notmal_count: %s" % (notmal_count, ))
    # Write the non-malicious domains to a file so they can be removed later.
    write2file(NOT_MAL_DOM_FILE, not_mal_domains)
Esempio n. 3
0
def check_domains(domains, domain_bad, batch_num=50):
    """Compute name-based features for each domain and write them to disk.

    One feature row per domain is written through ``write2csv``; the set of
    longest meaningful substrings is written through ``write2file``. Both
    output file names are prefixed with ``str(domain_bad) + "_"``.

    :param domains: sized collection of domain names (``len`` is taken of it)
    :param domain_bad: value used as the prefix of both output file names
    :param batch_num: a progress message is printed every ``batch_num``
        domains and after the last one
    :return: None
    """
    feature_rows = []
    longest_substrings = set()
    for seq, domain in enumerate(domains, 1):
        domain_2nd = keep_2nd_dom_name(domain)
        domain_len = len(domain_2nd)
        n_digits, digit_segs, word_segs = word_segment(domain_2nd)
        # NOTE(review): the digit count comes from domain_2nd but is divided
        # by the length of the full domain -- confirm this is intended.
        digit_number_ratio = n_digits / len(domain)
        # Number of digit groups found in the second-level domain string.
        n_groups_of_digits = len(digit_segs)
        # Number of word segments, e.g. "w3cschool" ends up as three groups:
        # w, c, school.
        n_group_of_word_segs = len(word_segs)
        # Length of the longest meaningful substring, and the substring.
        longest_len, longest_substring = get_longest_meaningful_substring_v0(
            word_segs)
        domain_name_entropy = cal_domain_name_entropy(domain)
        longest_substrings.add(longest_substring)

        print('==============================================================')
        print('domain: {0}, domain_2nd: {1}, digit_segs: {2}, word_segs:{3}'.
              format(domain, domain_2nd, digit_segs, word_segs))
        print(
            'domain_2nd: {0}, n_digits: {1}, n_groups_digits: {2}, n_group_word_segs: {3}'
            .format(domain_2nd, n_digits, n_groups_of_digits,
                    n_group_of_word_segs))
        print('domain_2nd: {0}, longest_len:{1},longest_substring: {2}'.format(
            domain_2nd, longest_len, longest_substring))

        # Share of the second-level domain covered by the longest meaningful
        # substring.
        longest_ratio = longest_len / domain_len
        feature_rows.append({
            DOMAIN_2ND_FIELD: domain,
            DOMAIN_LEN: domain_len,
            DOMAIN_NAME_ENTROPY: domain_name_entropy,
            N_DIGITS: n_digits,
            DIGIT_NUMBER_RATIO: digit_number_ratio,
            N_GROUPS_OF_DIGITS: n_groups_of_digits,
            WORD_SEG_GROUP: n_group_of_word_segs,
            LONGEST_SUBSTRING_RATIO: longest_ratio
        })
        if seq % batch_num == 0 or seq == len(domains):
            print('第{0}个域名正在统计'.format(seq))
            print("==========domain_info==============")

    columns_fields = [
        DOMAIN_2ND_FIELD,
        DOMAIN_LEN,
        DOMAIN_NAME_ENTROPY,
        N_DIGITS,
        DIGIT_NUMBER_RATIO,
        N_GROUPS_OF_DIGITS,
        WORD_SEG_GROUP,
        LONGEST_SUBSTRING_RATIO,
    ]
    domain_name_file = str(domain_bad) + "_" + DOMAIN_NAME_FEATURE_FILE
    write2csv(feature_rows, columns_fields, domain_name_file,
              DOMAIN_2ND_FIELD)

    # Replace any previous substring file before writing the new one.
    longest_substring_file = str(domain_bad) + "_" + LONGEST_SUBSTRING_FILE
    remove_file(longest_substring_file)
    write2file(longest_substring_file, longest_substrings)
def remove_duplicate_from_file(file):
    """Rewrite ``file`` so that every distinct line is kept only once.

    Lines are collected into a set (so their original order is not
    preserved), the file is removed, and the unique entries are written back
    with ``write2file``.

    :param file: path of the file to de-duplicate
    :return: None
    """
    with open(file) as src:
        unique_domains = {row.strip("\n") for row in src.readlines()}
    remove_file(file)
    write2file(file, unique_domains)
    print("%s unique bad domains" % (len(unique_domains)))
Esempio n. 5
0
def test_domains(file, dst_file, choice=2):
    """Check each domain in ``file`` with ``scan_url`` and save the bad ones.

    Domains flagged by ``scan_url`` are appended to ``dst_file`` in small
    batches. When ``dst_file`` already exists, processing resumes after the
    line returned by ``find_last_checked_lines(dst_file)``.

    :param file: source file holding the domains waiting to be verified,
        one per line
    :param dst_file: file the malicious domains are written to
    :param choice: 2 means second-level domains, 3 means third-level domains
        (not used by this function)
    :return: None
    """
    print("file: %s, dst_file: %s" % (file, dst_file))

    bad_domains = []  # flagged domains not yet written to dst_file
    total_bad = 0  # flagged domains found in this run, including flushed ones
    long_pause = True  # alternates the sleep range between iterations
    batch_num = 5  # number of domains written to dst_file per batch

    try:
        # Read everything up front so the source file is not held open during
        # the long scan loop.
        with open(file, "r") as src:
            lines = src.readlines()
        if os.path.exists(dst_file):
            v_last_line = find_last_checked_lines(dst_file)
            # index() raises ValueError when the line is not present; that is
            # reported by the handler below, as before.
            lines = lines[lines.index(v_last_line) + 1:]
        print("there is %s left to be handled" % (len(lines), ))

        for line in lines:
            print("==============================================")
            start_time = time.time()
            if len(bad_domains) >= batch_num:
                print("bad_domains write to file")
                write2file(dst_file, bad_domains)
                bad_domains = []

            domain = line.strip("\n")
            bad_flag = scan_url(domain)
            if bad_flag:
                print("add bad_domain: %s" % domain)
                bad_domains.append(domain)
                total_bad += 1
            # Sleep a random time, alternating between two ranges.
            if long_pause:
                random_num = random.randint(10, 20)
            else:
                random_num = random.randint(5, 15)
            long_pause = not long_pause
            time.sleep(random_num)
            cost_time = time.time() - start_time
            print("handle: %s,bad_flag: %s, cost_time: %s" %
                  (domain, bad_flag, cost_time))
    except Exception as e:
        # Best effort: report the failure and still flush what was collected.
        print("error: %s" % e)
    finally:
        # Report the real total; the pending list is emptied after every
        # batch flush, so its length alone under-counts.
        print("totally %s domains are bad!" % total_bad)
        if bad_domains:
            write2file(dst_file, bad_domains)
def read_niclog_url_files(file_list, mal_domain_set):
    """Find the known malicious domains present in each given file.

    Every file is parsed with ``read_niclog_url_file``; the domains it shares
    with ``mal_domain_set`` are written to ``BAD_URL_DOMAINS_FILE``.

    :param file_list: iterable of file paths to read
    :param mal_domain_set: set of known malicious domains
    :return: None
    """
    for path in file_list:
        started = time.time()
        unknown_domain_set = read_niclog_url_file(path)
        cost_time = time.time() - started

        size_kbytes = sys.getsizeof(unknown_domain_set) / 1024
        print("==================================================================")
        print("%s domains, size: %s Kbytes" % (len(unknown_domain_set), size_kbytes))
        print("cost_time: %s 秒" % (cost_time))

        # Domains that occur both in this file and in the malicious set.
        matched = unknown_domain_set & mal_domain_set
        print("%s bad domains found in file %s" % (len(matched), path))
        write2file(BAD_URL_DOMAINS_FILE, matched)
Esempio n. 7
0
def get_good_niclog_domain():
    """Extract the normal domains that were visited in niclog.

    Collects the ``DOMAIN_2ND_FIELD`` value of every record in
    ``db_basic[DOMAIN_BASIC_COL]`` and writes the distinct values to
    ``good_niclog_url.txt``.

    :return: None
    """
    good_domains = {
        rec[DOMAIN_2ND_FIELD]
        for rec in db_basic[DOMAIN_BASIC_COL].find()
    }
    print("len of domain_set: %s" % (len(good_domains)))
    write2file("good_niclog_url.txt", good_domains)
def read_file_list(dir, choice):
    """Convert the domain files found in ``dir`` to a fixed domain level.

    Each file in ``dir`` holds malicious fully-qualified domain names of one
    kind. It is read with ``read_file(path, choice)`` and the result is
    written to ``UVER_DOM_DIR + "<prefix>_<choice>.txt"``, where ``<prefix>``
    is the part of the file name before its first dot.

    :param dir: directory whose files are converted
    :param choice: 2 keeps second-level domains, 3 keeps third-level domains
    :return: None
    """
    count = 0
    for file in os.listdir(dir):
        # Read from the directory that was actually listed; the path was
        # previously built from FULL_DOM_DIR regardless of ``dir``.
        src_path = os.path.join(dir, file)
        domains_set = set(read_file(src_path, choice))
        count += len(domains_set)

        file_prefix = file.split(".")[0]
        dst_path = UVER_DOM_DIR + file_prefix + "_" + str(choice) + ".txt"
        # Drop any previous output before writing the new one.
        remove_file(dst_path)
        write2file(dst_path, domains_set)
        print("write to file:%s" % dst_path)

    print("totally %s domains converted to %s level domain" % (count, choice))
Esempio n. 9
0
def get_non_dga_domains():
    """Write the malicious domains whose description does not mention DGA.

    The result of ``set_mal_domain_index_params`` for the query is written to
    ``FULL_DOM_DIR + "es_non_dga.txt"``.

    :return: None
    """
    # Exclude every record whose ``info.Desc`` field matches "DGA".
    query_body = {
        "query": {
            "bool": {
                "must_not": [{
                    "query_string": {
                        "default_field": "info.Desc",
                        "query": "DGA"
                    }
                }]
            }
        }
    }
    bad_domains = set_mal_domain_index_params(query_body)
    print("len of bad_domains: %s" % len(bad_domains))
    out_path = FULL_DOM_DIR + "es_non_dga.txt"
    write2file(out_path, bad_domains)
Esempio n. 10
0
from common.mongodb_op import mongo_url
from common.mongodb_op import MAL_DOMS_MONGO_DB, MAL_DOMAINS_MONGO_INDEX
from common.mongo_common import DOMAIN_2ND_FIELD, MAL_TYPE, SOURCE_SIET
from common.domains_op import write2file
from get_visited_bad_domains_info.get_mal_domains_from_niclog import OLD_141_BAD_DOMAINS_FILE

# Module-level client created at import time from ``mongo_url``.
# NOTE(review): no import of ``MongoClient`` is visible in this excerpt --
# presumably it comes from pymongo; confirm it is imported.
client = MongoClient(mongo_url)


def show_visited_bad_domains(domains):
    """Print the source and malware type stored for each given domain.

    Shows the malicious domains matched from niclog together with their
    source and malicious type. Each domain is looked up by
    ``DOMAIN_2ND_FIELD`` in the ``MAL_DOMAINS_MONGO_INDEX`` collection of the
    ``MAL_DOMS_MONGO_DB`` database. A domain without a stored record is
    reported and skipped instead of raising ``IndexError``.

    :param domains: iterable of second-level domain names
    :return: None
    """
    collection = client[MAL_DOMS_MONGO_DB][MAL_DOMAINS_MONGO_INDEX]
    for domain_2nd in domains:
        # One find_one call replaces indexing the cursor twice, and it
        # returns None rather than raising when nothing matches.
        rec = collection.find_one({DOMAIN_2ND_FIELD: domain_2nd})
        if rec is None:
            print("domain %s not found in %s" %
                  (domain_2nd, MAL_DOMAINS_MONGO_INDEX))
            continue
        mal_type = rec.get(MAL_TYPE, "unknown")
        source = rec.get(SOURCE_SIET, "unknown")
        print("domain %s captured, source: %s, type: %s" %
              (domain_2nd, source, mal_type))


if __name__ == '__main__':
    # Script entry point: fetch the visited domains for the selected
    # ``domain_bad`` value, print their stored details, then save the list.
    domain_bad = 1
    visited_domains = get_visited_domains(domain_bad)
    show_visited_bad_domains(visited_domains)
    write2file(OLD_141_BAD_DOMAINS_FILE, visited_domains)