def check_domains(domains, batch_num=50):
    """
    Compute name statistics for each domain and save them in batches.

    For every entry of ``domains`` the second-level name is obtained with
    keep_2nd_dom_name() and segmented with word_segment(). The figures are
    printed and collected as a tuple; the collected tuples are passed to
    save2database() every ``batch_num`` domains and when the last domain is
    reached.

    :param domains: sized collection of domain names (len() is called on it)
    :param batch_num: number of domains gathered before each save
    :return: None
    """
    pending_rows = []
    for count, domain in enumerate(domains, start=1):
        second_level = keep_2nd_dom_name(domain)
        digit_total, digit_parts, word_parts = word_segment(second_level)
        # Number of digit groups found in the second-level name.
        digit_group_count = len(digit_parts)
        # Number of word segments the name was split into, e.g. "w3cschool"
        # ends up as three segments: w, c, school.
        word_group_count = len(word_parts)
        # Length of the longest meaningful substring, and the substring.
        best_len, best_substring = get_longest_meaningful_substring_v0(
            word_parts)

        print('==============================================================')
        segments_msg = 'domain: {0}, domain_2nd: {1}, digit_segs: {2}, word_segs:{3}'
        print(segments_msg.format(domain, second_level, digit_parts,
                                  word_parts))
        counts_msg = 'domain_2nd: {0}, n_digits: {1}, n_groups_digits: {2}, n_group_word_segs: {3}'
        print(counts_msg.format(second_level, digit_total, digit_group_count,
                                word_group_count))
        longest_msg = 'domain_2nd: {0}, longest_len:{1},longest_substring: {2}'
        print(longest_msg.format(second_level, best_len, best_substring))

        pending_rows.append((domain, second_level, digit_total,
                             digit_group_count, word_group_count, best_len,
                             best_substring))

        # Flush on every full batch and on the final domain.
        is_flush_point = count % batch_num == 0 or count == len(domains)
        if not is_flush_point:
            continue
        save2database(pending_rows)
        pending_rows = []
        print('第{0}个域名正在统计'.format(count))
def tackle_line(line):
    """
    Parse one JSON log line and return the second-level domain of its host.

    :param line: a JSON document as text; surrounding newlines are stripped
    :return: keep_2nd_dom_name() of the HOST_NAME field, or "" when the field
        is missing/empty or is_domain_ip() reports it as an IP address
    """
    record = json.loads(line.strip("\n"))
    host = record.get(HOST_NAME, "")
    if not host or is_domain_ip(host):
        return ""
    return keep_2nd_dom_name(host)
def csv2csv(file, url_col, time_col, type_col=-1, pattern="2019"):
    """
    Extract malicious hosts from a CSV file located under PRE_DIR.

    A row is used when its time column starts with ``pattern`` and its URL
    contains "http://". The URL is reduced to a bare host (scheme, path and
    port removed); hosts that is_domain_ip() reports as IP addresses are
    dropped and every remaining host is kept once, in first-seen order.
    Rows that are too short to contain the requested columns (for example
    blank lines) are skipped.

    Two outputs are produced, where <prefix> is the part of ``file`` before
    its first ".":

        PRE_DIR + <prefix>_simple_<pattern>.csv
            second-level domains and their types, passed to write2csv()
        PRE_DIR + <prefix>_simple_<pattern>.txt
            the de-duplicated hosts, passed to write2file()

    Nothing is written when the source file does not exist.

    NOTE(review): URLs using "https://" do not contain "http://" and are
    therefore ignored, exactly as before -- confirm this is intended.

    :param file: CSV file name relative to PRE_DIR, e.g. "hosts_phishtank.csv"
    :param url_col: index of the column holding the URL
    :param time_col: index of the column holding the time of the record
    :param type_col: index of the column holding the malicious type; -1 means
        the file has no such column and every record is typed "phishing"
    :param pattern: prefix the time column must start with (e.g. a year)
    :return: None
    """
    file_prefix = file.split(".")[0]
    src_file = PRE_DIR + file
    dst_file = PRE_DIR + file_prefix + "_simple_" + pattern + ".csv"
    dst_file1 = PRE_DIR + file_prefix + "_simple_" + pattern + ".txt"
    http_phrase = "http://"
    port_phrase = ":"
    domains_list, domains_type = [], []
    domain_2nd_list = []
    # Set used only for the duplicate check; domains_list keeps the order.
    seen_domains = set()
    if not os.path.exists(src_file):
        return
    print("src_file: %s" % (src_file,))
    # newline="" lets the csv module interpret line endings itself, which
    # matters for quoted fields that contain line breaks.
    with open(src_file, "r", newline="") as f_src:
        for row in csv.reader(f_src):
            try:
                url = row[url_col]
                find_time = row[time_col].strip(" ")
                # No type column (type_col == -1) means the file is the
                # phishtank list, whose records are all phishing sites.
                domain_type = "phishing" if type_col == -1 else row[type_col]
            except IndexError:
                # Blank or truncated row: nothing usable in it.
                continue

            # Keep only records whose time starts with the wanted pattern.
            if not find_time.startswith(pattern):
                continue

            scheme_pos = url.find(http_phrase)
            if scheme_pos < 0:
                continue

            # Reduce the URL to its host: drop the scheme, then the path,
            # then the port. The port is removed before the IP check so a
            # value such as "1.2.3.4:8080" is judged on "1.2.3.4".
            host = url[scheme_pos + len(http_phrase):]
            host = host.split("/", 1)[0]
            host = host.split(port_phrase, 1)[0]

            if is_domain_ip(host) or host in seen_domains:
                continue
            seen_domains.add(host)
            domains_list.append(host)
            domains_type.append(domain_type)
            # Kept in step with domains_type so each second-level domain can
            # be paired with its malicious type.
            domain_2nd_list.append(keep_2nd_dom_name(host))

    print("len of urls : %s, len of domains_info: %s" % (len(domains_list), len(domains_type)))
    write2csv(dst_file, domain_2nd_list, domains_type)
    write2file(dst_file1, domains_list)
# Example no. 4
# 0
def check_domains(domains, domain_bad, batch_num=50):
    """
    Compute name features for each domain and write them to files.

    For every entry of ``domains`` the second-level name is obtained with
    keep_2nd_dom_name() and segmented with word_segment(); length, entropy,
    digit and word-segment figures are printed and collected as one dict per
    domain. After the loop the dicts are passed to write2csv() and the set of
    longest meaningful substrings is passed to write2file(), after any
    previous substring file has been removed with remove_file().

    :param domains: sized collection of domain names (len() is called on it)
    :param domain_bad: value whose str() prefixes both output file names
    :param batch_num: a progress message is printed every ``batch_num``
        domains and on the last one
    :return: None
    """
    columns_fields = [
        DOMAIN_2ND_FIELD, DOMAIN_LEN, DOMAIN_NAME_ENTROPY, N_DIGITS,
        DIGIT_NUMBER_RATIO, N_GROUPS_OF_DIGITS, WORD_SEG_GROUP,
        LONGEST_SUBSTRING_RATIO
    ]
    feature_rows = []
    longest_substrings = set()
    for count, domain in enumerate(domains, start=1):
        second_level = keep_2nd_dom_name(domain)
        second_level_len = len(second_level)
        digit_total, digit_parts, word_parts = word_segment(second_level)
        digit_ratio = digit_total / len(domain)
        # Number of digit groups found in the second-level name.
        digit_group_count = len(digit_parts)
        # Number of word segments the name was split into, e.g. "w3cschool"
        # ends up as three segments: w, c, school.
        word_group_count = len(word_parts)
        # Length of the longest meaningful substring, and the substring.
        best_len, best_substring = get_longest_meaningful_substring_v0(
            word_parts)
        entropy = cal_domain_name_entropy(domain)
        longest_substrings.add(best_substring)

        print('==============================================================')
        segments_msg = 'domain: {0}, domain_2nd: {1}, digit_segs: {2}, word_segs:{3}'
        print(segments_msg.format(domain, second_level, digit_parts,
                                  word_parts))
        counts_msg = 'domain_2nd: {0}, n_digits: {1}, n_groups_digits: {2}, n_group_word_segs: {3}'
        print(counts_msg.format(second_level, digit_total, digit_group_count,
                                word_group_count))
        longest_msg = 'domain_2nd: {0}, longest_len:{1},longest_substring: {2}'
        print(longest_msg.format(second_level, best_len, best_substring))

        # Values are listed in the same order as columns_fields.
        feature_values = (
            domain,
            second_level_len,
            entropy,
            digit_total,
            digit_ratio,
            digit_group_count,
            word_group_count,
            # Share of the name covered by the longest meaningful substring.
            best_len / second_level_len,
        )
        feature_rows.append(dict(zip(columns_fields, feature_values)))

        if count % batch_num != 0 and count != len(domains):
            continue
        print('第{0}个域名正在统计'.format(count))
        print("==========domain_info==============")

    prefix = str(domain_bad) + "_"
    domain_name_file = prefix + DOMAIN_NAME_FEATURE_FILE
    write2csv(feature_rows, columns_fields, domain_name_file,
              DOMAIN_2ND_FIELD)

    longest_substring_file = prefix + LONGEST_SUBSTRING_FILE
    remove_file(longest_substring_file)
    write2file(longest_substring_file, longest_substrings)
# Example no. 5
# 0
def read_domain_txt(txt_file, dst_file):
    """
    Extract domains and their malicious types from txt_file and append them
    to dst_file, one "domain,type" line per record.

    Each input line is split on tabs; empty items and "#" items are
    discarded. The first three remaining items are read as domain, type and
    source. Lines with fewer than three items (blank lines, comment-only
    lines, truncated records) produce no output and are skipped instead of
    raising IndexError.

    The source is blanked unless it appears in allowed_sources; it is
    collected with the record but not written to dst_file.

    NOTE(review): lines holding only a domain and a type (no source item)
    have never been emitted -- confirm this is intended.

    :param txt_file: path of the tab-separated source file
    :param dst_file: path of the output file, opened in append mode
    :return: None
    """
    records = []
    not_allowed_phrases = ("#",)
    with open(txt_file) as f_src:
        for raw_line in f_src:
            fields = [
                item for item in raw_line.strip("\n").split("\t")
                if item and item not in not_allowed_phrases
            ]
            if len(fields) < 3:
                continue

            domain_type = fields[1]
            domain_2nd = keep_2nd_dom_name(fields[0])
            source = fields[2]
            records.append(
                (domain_2nd, domain_type,
                 source if source in allowed_sources else ""))

            # Lines made of several 4-item groups carry extra records.
            if len(fields) % 4 == 0 and len(fields) > 4:
                # NOTE(review): this range stops one group early, so the
                # last 4-item group of a line is never read (an 8-item line
                # yields no extra record). Looks like an off-by-one; kept
                # unchanged until the input format is confirmed.
                for i in range(1, (len(fields) // 4) - 1):
                    domain_2nd = keep_2nd_dom_name(fields[4 * i])
                    # NOTE(review): the "." is unescaped, so this skips any
                    # name containing a digit followed by one more
                    # character; presumably meant to filter IP-like values.
                    if re.search(r"\d+.", domain_2nd):
                        continue
                    domain_type = fields[4 * i + 1]
                    source = fields[4 * i + 2]
                    records.append(
                        (domain_2nd, domain_type,
                         source if source in allowed_sources else ""))

    with open(dst_file, "a+") as f_dst:
        f_dst.writelines(
            domain_2nd + "," + domain_type + "\n"
            for domain_2nd, domain_type, _source in records)
def delete_not_visited_domains_v1(db, domain_bad, old_domain_set):
    """
    Count the known second-level domains that appear in the visiting records.

    The collection named by visiting_mongo_index_dict[domain_bad] (per the
    original note: bad_full_domains_visiting_records or
    good_full_domains_visiting_records) is read in full, every record's
    FULL_DOMAIN is reduced with keep_2nd_dom_name(), and the result is
    intersected with ``old_domain_set``. Only the size of the intersection
    is printed; nothing is returned.

    :param db: database handle indexed by collection name
    :param domain_bad: key into visiting_mongo_index_dict
    :param old_domain_set: set of second-level domains matched from Niclog
        (per the original note, stored in bad_domain_subdomain)
    :return: None
    """
    collection_name = visiting_mongo_index_dict[domain_bad]
    visited = {
        keep_2nd_dom_name(record[FULL_DOMAIN])
        for record in db[collection_name].find()
    }
    visited_known = visited & old_domain_set
    print("len of visited domain_2nd: %s" % (len(visited_known)))
def format_domain_name(domain_name):
    """
    Lower-case a domain name and reduce it with keep_2nd_dom_name().

    :param domain_name: domain name as text
    :return: the value returned by keep_2nd_dom_name() for the lower-cased
        name
    """
    return keep_2nd_dom_name(domain_name.lower())