Example #1
0
def read_niclog_url_file(file, trie, domain_bad):
    """
    Count visits per domain and hour from a NIC URL log file.

    Every line is parsed with ``split_url_log_line``. The first field of the
    resulting record is treated as the second-level domain and the last field
    as the timestamp. When ``trie.search`` reports the domain as known and the
    timestamp is non-empty, the counter named ``<domain>_<day>_<index>`` is
    incremented, where ``index`` is the last two characters of the string
    returned by ``timestamp_str2ymdh`` and ``day`` is everything before them
    (presumably "HH" and "YYYYMMDD" -- TODO confirm against the helper).

    Processing is best effort: a line that cannot be read or processed is
    skipped, and the number of skipped lines is reported at the end.

    :param file: path of the log file to read
    :param trie: object whose ``search(domain)`` result decides whether the
        domain is counted
    :param domain_bad: truthy selects the ``r1`` counter store, falsy ``r0``
        (presumably Redis clients -- verify where they are created)
    :return: None; a summary line is printed
    """
    r = r1 if domain_bad else r0

    file_total_line = 0
    skipped_lines = 0
    # The context manager closes the handle even when reading fails part-way.
    with open(file) as log_file:
        lines = iter(log_file)
        while True:
            # Reading is kept separate from processing so that the line
            # counter is only advanced for lines that were actually read
            # (the end-of-file probe used to be counted as a line).
            try:
                line = next(lines)
            except StopIteration:
                break
            except UnicodeDecodeError:
                # Undecodable input: keep going like the original did
                # instead of aborting the whole file.
                skipped_lines += 1
                continue

            file_total_line += 1
            try:
                record = split_url_log_line(line)
                domain_2nd = record[0]
                timestamp = record[-1]
                exists = trie.search(domain_2nd)

                if exists and timestamp:
                    dt_str = timestamp_str2ymdh(timestamp)
                    index, dt_str_day = dt_str[-2:], dt_str[:-2]
                    count_key = domain_2nd + "_" + dt_str_day + "_" + index
                    r.incr(count_key)
            except Exception:
                # Deliberate best effort: one malformed line (or one failed
                # counter update) must not stop the rest of the file.
                skipped_lines += 1

    print("file %s totally has %s lines" % (file, file_total_line))
    if skipped_lines:
        print("file %s skipped %s unreadable or malformed lines"
              % (file, skipped_lines))
Example #2
0
def set_vis_bad_domain_index_params(index_name_suffix, domain_2nd,
                                    ver_sub_domains):
    """
    Accumulate hourly visit counters for a second-level domain in MongoDB.

    Scans the Elasticsearch index ``VIS_DOMAIN_INDEX_NAME_PREFIX +
    index_name_suffix`` for ``dnsquery3`` documents whose ``content`` matches
    ``domain_2nd`` (optionally preceded by one label). For every hit whose
    third-level name (``keep_3th_dom_name``) contains ``domain_2nd`` and is
    listed in ``ver_sub_domains``, two upserts are issued: one keyed by
    (second-level domain, day) and one keyed by (third-level domain, day,
    second-level domain). Each increments the field named after the integer
    parsed from the last two characters of ``timestamp_str2ymdh(...)``
    (presumably the hour of day -- TODO confirm against the helper).

    Nothing happens when the index does not exist.

    NOTE(review): ``mongo_index_2nd`` and ``mongo_index_3th`` are not defined
    in this function; they have to exist at module level -- confirm.

    :param index_name_suffix: suffix appended to the index name prefix
    :param domain_2nd: second-level domain to look for
    :param ver_sub_domains: container of verified third-level domains
    :return: None
    """
    index_name = VIS_DOMAIN_INDEX_NAME_PREFIX + index_name_suffix
    print('domain_2nd: {0}, index_name: {1}'.format(domain_2nd, index_name))
    doc_type = VIS_DOM_DOC_TYPE
    es = Elasticsearch(hosts=HOST)

    # Raw string: "\." in a normal literal is an invalid escape sequence.
    # NOTE(review): domain_2nd is interpolated unescaped, so its dots act as
    # regex wildcards; the containment check in the loop filters the hits.
    pattern = r"([A-Za-z0-9-]?[A-Za-z0-9]+\.)?" + domain_2nd
    query_body = {
        "query": {
            "bool": {
                "must": [{
                    "regexp": {
                        "content": pattern
                    }
                }, {
                    "term": {
                        "operation": "dnsquery3"
                    }
                }]
            }
        }
    }

    if not es.indices.exists(index_name):
        return

    hits = helpers.scan(es,
                        index=index_name,
                        doc_type=doc_type,
                        query=query_body)
    for hit in hits:
        source = hit['_source']
        timestamp = source['time-stamp']
        dt_str = timestamp_str2ymdh(timestamp)
        # int() normalises the two trailing characters, e.g. "07" -> "7".
        counter_field = str(int(dt_str[-2:]))
        visit_day = dt_str[:-2]
        full_domain = source['content']
        domain_3th = keep_3th_dom_name(full_domain)

        if domain_2nd not in domain_3th or domain_3th not in ver_sub_domains:
            continue

        # Third positional argument True == upsert: the per-day document is
        # created on the first visit and incremented afterwards.
        db_nic_bad_visiting[mongo_index_2nd].update(
            {
                DOMAIN_2ND_FIELD: domain_2nd,
                DATE_FIELD: visit_day
            },
            {"$inc": {counter_field: 1}},
            True)
        db_nic_bad_visiting[mongo_index_3th].update(
            {
                DOMAIN_3TH_FIELD: domain_3th,
                DATE_FIELD: visit_day,
                DOMAIN_2ND_FIELD: domain_2nd
            },
            {"$inc": {counter_field: 1}},
            True)
Example #3
0
def set_vis_bad_domain_index_params_loose(es, doc_type, index_name, domain_2nd,
                                          ver_sub_domains, domain_bad):
    """
    Accumulate hourly visit counters for a domain, without requiring the
    third-level domain to be in the verified list.

    Scans ``index_name`` for ``dnsquery3`` documents whose ``content`` matches
    ``domain_2nd`` (optionally preceded by one label). A hit is counted when
    its lower-cased third-level name (``keep_3th_dom_name``) either starts
    with ``domain_2nd``, equals ``WWW_PHREASE + domain_2nd``, or contains
    ``domain_2nd`` directly after a dot. Membership in ``ver_sub_domains`` is
    only logged, it does not gate the update.

    For every counted hit two upserts are issued against ``db_nic_visiting``
    using the collection names at ``domain_index_dict[domain_bad][2]`` and
    ``[3]``. Each increments the field named after the integer parsed from the
    last two characters of ``timestamp_str2ymdh(...)`` (presumably the hour of
    day -- TODO confirm against the helper).

    :param es: Elasticsearch client
    :param doc_type: document type passed to ``helpers.scan``
    :param index_name: Elasticsearch index to scan
    :param domain_2nd: second-level domain to look for
    :param ver_sub_domains: container of verified third-level domains
    :param domain_bad: key into ``domain_index_dict``
    :return: True when at least one document was counted, else False
        (also False when the index does not exist)
    """
    # Raw string: "\." in a normal literal is an invalid escape sequence.
    pattern = r"([A-Za-z0-9-]?[A-Za-z0-9]+\.)?" + domain_2nd
    query_body = {
        "query": {
            "bool": {
                "must": [{
                    "regexp": {
                        "content": pattern
                    }
                }, {
                    "term": {
                        "operation": "dnsquery3"
                    }
                }]
            }
        }
    }

    # Whether this domain matched at least one document.
    matched = False
    if not es.indices.exists(index_name):
        return matched

    hits = helpers.scan(es,
                        index=index_name,
                        doc_type=doc_type,
                        query=query_body,
                        scroll='30m')
    for hit in hits:
        source = hit['_source']
        timestamp = source['time-stamp']
        dt_str = timestamp_str2ymdh(timestamp)
        index, dt_srt_day = int(dt_str[-2:]), dt_str[:-2]
        full_domain = source['content'].lower()
        domain_3th = keep_3th_dom_name(full_domain)

        # The third-level name must contain the second-level domain: either
        # it starts with it (pos == 0) or it appears after a label separator
        # (pos > 0 and preceded by '.').
        pos = domain_3th.find(domain_2nd)
        cond1 = pos > 0 and domain_3th[pos - 1] == '.'
        cond2 = pos == 0 or domain_3th == WWW_PHREASE + domain_2nd

        # Debug aid for the open question: why do so many malicious domains
        # match while only a few show up in the time series?
        cond4 = domain_3th in ver_sub_domains
        print(
            "full_domain: %s, domain_2nd: %s, con3: %s, con4:%s, con5: %s"
            % (full_domain, domain_2nd, cond1, cond4, cond2))

        if not (cond1 or cond2):
            continue

        if not cond4:
            # Satisfies the second-level domain rule but is not among the
            # verified third-level domains; it is still counted below.
            print(
                "domain: %s is valid doamin_3th but not in ver_subdomains"
                % (domain_3th))

        print("domain_2nd: %s, domain_3th: %s, visit_day: %s" %
              (domain_2nd, domain_3th, dt_srt_day))

        counter_field = str(index)
        # Third positional argument True == upsert.
        mongo_index_2nd = domain_index_dict[domain_bad][2]
        db_nic_visiting[mongo_index_2nd].update(
            {
                DOMAIN_2ND_FIELD: domain_2nd,
                DATE_FIELD: dt_srt_day
            },
            {"$inc": {counter_field: 1}},
            True)
        mongo_index_3th = domain_index_dict[domain_bad][3]
        db_nic_visiting[mongo_index_3th].update(
            {
                DOMAIN_3TH_FIELD: domain_3th,
                DATE_FIELD: dt_srt_day,
                DOMAIN_2ND_FIELD: domain_2nd
            },
            {"$inc": {counter_field: 1}},
            True)

        matched = True
    return matched
def search_domain_in_es(es, index_name, doc_type, domain_2nd):
    """
    Collect sub-domains and visit records of a domain from the NIC log index
    and store them in MongoDB.

    Scans ``index_name`` for ``dnsquery3`` documents whose ``content`` matches
    ``domain_2nd`` (optionally preceded by one label). For every hit the
    visitor (``source-mac``) and the converted timestamp are recorded per full
    domain. When the full domain starts with ``domain_2nd`` or contains it
    directly after a dot (case-insensitively), its third-level name
    (``keep_3th_dom_name``) is added to the sub-domain list.

    The results are written with ``save_domain_subdomains2mongodb`` and
    ``save_full_domains_visiting_records2mongodb``, into the "bad" or "good"
    collections depending on ``domain_bad``. For good domains the writes are
    split into batches of 400 entries.

    NOTE(review): ``domain_bad`` is neither a parameter nor a local of this
    function; it has to exist at module level -- confirm.

    :param es: Elasticsearch client
    :param index_name: Elasticsearch index to scan
    :param doc_type: document type passed to ``helpers.scan``
    :param domain_2nd: second-level domain to look for
    :return: True when at least one sub-domain was found, else False
        (also False when the index does not exist)
    """
    matched = False  # whether the domain was found in the NIC log
    # Raw string: "\." in a normal literal is an invalid escape sequence.
    pattern = r"([A-Za-z0-9-]?[A-Za-z0-9]+\.)?" + domain_2nd
    query_body = {
        "query": {
            "bool": {
                "must": [{
                    "regexp": {
                        "content": pattern
                    }
                }, {
                    "term": {
                        "operation": "dnsquery3"
                    }
                }]
            }
        }
    }
    if not es.indices.exists(index_name):
        return matched

    domain_key = domain_2nd.lower()
    hits = helpers.scan(es,
                        index=index_name,
                        doc_type=doc_type,
                        query=query_body,
                        scroll='30m')
    full_domains_visit_dict = {}
    sub_domains = set()  # third-level domains
    for hit in hits:
        source = hit['_source']
        full_domain = source['content']
        visitor = source["source-mac"]  # the visiting user, identified by MAC
        visit_date = source['time-stamp']
        dt_str = timestamp_str2ymdh(visit_date)
        if full_domain not in full_domains_visit_dict:
            full_domains_visit_dict[full_domain] = {
                VISITORS: [],
                DT_STRS: []
            }
        record = full_domains_visit_dict[full_domain]
        record[VISITORS].append(visitor)
        record[DT_STRS].append(dt_str)

        # Search and boundary check both use the lowered copy, so the
        # position always refers to the string it was found in.
        lowered = full_domain.lower()
        pos = lowered.find(domain_key)

        # The full domain is the second-level domain itself or contains it
        # after a label separator: remember its third-level name.
        if pos == 0 or (pos > 0 and lowered[pos - 1] == '.'):
            sub_domains.add(keep_3th_dom_name(full_domain))

    # Persist what was matched.
    sub_domains = list(sub_domains)
    if domain_bad:
        if sub_domains:
            save_domain_subdomains2mongodb(
                domain_key, sub_domains, db_nic_log,
                NIC_LOG_BAD_DOMAIN_SUBDOMAINS_MONGO_INDEX)
        for full_domain, record in full_domains_visit_dict.items():
            # Read with the same constants the records were stored under.
            save_full_domains_visiting_records2mongodb(
                full_domain, db_nic_log,
                NIC_LOG_BAD_FULL_NAME_VISITING_MONGO_INDEX, record[DT_STRS],
                record[VISITORS])
    else:
        # Large sub-domain or visitor lists cannot be written to MongoDB in
        # one go, so they are split into batches.
        batch_num = 400

        # max(..., 1) keeps the original behaviour of issuing exactly one
        # save call even when the list is empty.
        for start in range(0, max(len(sub_domains), 1), batch_num):
            save_domain_subdomains2mongodb(
                domain_key, sub_domains[start:start + batch_num],
                db_nic_log, NIC_LOG_GOOD_DOMAIN_SUBDOMAINS_MONGO_INDEX)

        for full_domain, record in full_domains_visit_dict.items():
            visitors = record[VISITORS]
            dt_strs = record[DT_STRS]
            if len(visitors) > batch_num:
                print("len of visitors: %s" % (len(visitors)))
            # visitors and dt_strs grow in lockstep, so the same slice
            # bounds keep each visitor paired with its timestamp.
            for start in range(0, max(len(visitors), 1), batch_num):
                stop = start + batch_num
                save_full_domains_visiting_records2mongodb(
                    full_domain, db_nic_log,
                    NIC_LOG_GOOD_FULL_NAME_VISITING_MONGO_INDEX,
                    dt_strs[start:stop], visitors[start:stop])

    if sub_domains:
        matched = True
        print(
            "==============domain: %s matched, len(sub_domains): %s================"
            % (domain_2nd, len(sub_domains)))
    return matched