def read_niclog_url_file(file, trie, domain_bad):
    """Count visits to known domains found in a NIC URL log file.

    Every line is parsed with ``split_url_log_line``. The first field of the
    parsed record is treated as the second-level domain and the last field as
    the timestamp. When ``trie.search(domain)`` is truthy and the timestamp is
    non-empty, the counter named ``<domain>_<day>_<index>`` is incremented,
    where ``<index>`` is the last two characters of the string returned by
    ``timestamp_str2ymdh`` (presumably the hour of day) and ``<day>`` is the
    rest of that string.

    :param file: path of the log file to read
    :param trie: object exposing ``search(domain)``; a truthy result means the
        domain is one we want to count
    :param domain_bad: selects the counter store: ``r1`` when truthy,
        ``r0`` otherwise
    :return: None; the number of lines read is printed when the file is done
    """
    counters = r1 if domain_bad else r0
    file_total_line = 0
    # The context manager guarantees the handle is closed, including when an
    # unexpected error escapes the loop.
    with open(file) as log_file:
        lines = iter(log_file)
        while True:
            try:
                # next() deliberately stays inside the try block so that a
                # line which cannot be read is skipped like any other
                # malformed record instead of aborting the whole file.
                line = next(lines)
                # Count only after a line was really obtained, so the final
                # attempt that hits end-of-file is not reported as a line.
                file_total_line += 1
                record = split_url_log_line(line)
                domain_2nd = record[0]
                timestamp = record[-1]
                if trie.search(domain_2nd) and timestamp:
                    dt_str = timestamp_str2ymdh(timestamp)
                    index, dt_str_day = dt_str[-2:], dt_str[:-2]
                    counters.incr(domain_2nd + "_" + dt_str_day + "_" + index)
            except StopIteration:
                break
            except Exception:
                # Best-effort processing: a malformed line must not stop the
                # import, so it is skipped silently.
                continue
    print("file %s totally has %s lines" % (file, file_total_line))
def set_vis_bad_domain_index_params(index_name_suffix, domain_2nd,
                                    ver_sub_domains):
    """Accumulate visit counters for verified sub-domains of one domain.

    Scans the Elasticsearch index ``VIS_DOMAIN_INDEX_NAME_PREFIX +
    index_name_suffix`` for ``dnsquery3`` records whose ``content`` matches
    ``domain_2nd`` (optionally preceded by one label). For every hit whose
    third-level domain contains ``domain_2nd`` and is listed in
    ``ver_sub_domains``, two documents in ``db_nic_bad_visiting`` get the
    field named after the hit's time index incremented by one: one keyed by
    (second-level domain, day) and one keyed by
    (third-level domain, day, second-level domain).

    :param index_name_suffix: suffix appended to the index name prefix
    :param domain_2nd: second-level domain to look for
    :param ver_sub_domains: container of verified third-level domains
    :return: None
    """
    index_name = VIS_DOMAIN_INDEX_NAME_PREFIX + index_name_suffix
    print('domain_2nd: {0}, index_name: {1}'.format(domain_2nd, index_name))
    doc_type = VIS_DOM_DOC_TYPE
    es = Elasticsearch(hosts=HOST)
    # Raw string: "\." is not a valid escape sequence in a normal literal.
    # NOTE(review): domain_2nd is concatenated unescaped, so the dots inside
    # it act as regex wildcards - confirm this looseness is acceptable.
    pattern = r"([A-Za-z0-9-]?[A-Za-z0-9]+\.)?" + domain_2nd
    query_body = {
        "query": {
            "bool": {
                "must": [
                    {"regexp": {"content": pattern}},
                    {"term": {"operation": "dnsquery3"}},
                ]
            }
        }
    }
    if not es.indices.exists(index_name):
        return

    hits = helpers.scan(es, index=index_name, doc_type=doc_type,
                        query=query_body)
    for hit in hits:
        source = hit['_source']
        dt_str = timestamp_str2ymdh(source['time-stamp'])
        # The last two characters select the counter field (presumably the
        # hour); int() strips a leading zero so "07" becomes field "7".
        counter_field = str(int(dt_str[-2:]))
        dt_srt_day = dt_str[:-2]
        domain_3th = keep_3th_dom_name(source['content'])
        if domain_2nd not in domain_3th or domain_3th not in ver_sub_domains:
            continue

        increment = {"$inc": {counter_field: 1}}
        # NOTE(review): mongo_index_2nd / mongo_index_3th are not defined in
        # this function; they have to exist at module level - confirm.
        # The third positional argument is presumably the upsert flag.
        db_nic_bad_visiting[mongo_index_2nd].update(
            {DOMAIN_2ND_FIELD: domain_2nd, DATE_FIELD: dt_srt_day},
            increment, True)
        db_nic_bad_visiting[mongo_index_3th].update(
            {
                DOMAIN_3TH_FIELD: domain_3th,
                DATE_FIELD: dt_srt_day,
                DOMAIN_2ND_FIELD: domain_2nd
            },
            increment, True)
def set_vis_bad_domain_index_params_loose(es, doc_type, index_name,
                                          domain_2nd, ver_sub_domains,
                                          domain_bad):
    """Accumulate visit counters for any sub-domain of ``domain_2nd``.

    "Loose" variant: a hit is counted when its third-level domain is a
    label-boundary match of ``domain_2nd``; membership in ``ver_sub_domains``
    is only reported on stdout and does not filter anything.

    For every counted hit two documents in ``db_nic_visiting`` get the field
    named after the hit's time index incremented by one. The collections are
    taken from ``domain_index_dict[domain_bad][2]`` (second-level counters)
    and ``domain_index_dict[domain_bad][3]`` (third-level counters).

    :param es: Elasticsearch client
    :param doc_type: document type passed to ``helpers.scan``
    :param index_name: index to scan; nothing happens if it does not exist
    :param domain_2nd: second-level domain to look for
    :param ver_sub_domains: container of verified third-level domains
        (used for diagnostics only)
    :param domain_bad: key into ``domain_index_dict``
    :return: True if at least one hit was counted, otherwise False
    """
    # Raw string: "\." is not a valid escape sequence in a normal literal.
    pattern = r"([A-Za-z0-9-]?[A-Za-z0-9]+\.)?" + domain_2nd
    query_body = {
        "query": {
            "bool": {
                "must": [
                    {"regexp": {"content": pattern}},
                    {"term": {"operation": "dnsquery3"}},
                ]
            }
        }
    }
    # Whether this domain matched at least one record.
    matched = False
    if not es.indices.exists(index_name):
        return matched

    hits = helpers.scan(es, index=index_name, doc_type=doc_type,
                        query=query_body, scroll='30m')
    for hit in hits:
        source = hit['_source']
        dt_str = timestamp_str2ymdh(source['time-stamp'])
        index, dt_srt_day = int(dt_str[-2:]), dt_str[:-2]
        full_domain = source['content'].lower()
        domain_3th = keep_3th_dom_name(full_domain)

        # The full domain has to contain the second-level domain. Either the
        # third-level domain starts with it (pos == 0) or it appears later,
        # directly after a dot (pos > 0).
        # NOTE(review): full_domain is lowercased but domain_2nd is not, so a
        # mixed-case domain_2nd can never match - confirm callers pass
        # lowercase.
        pos = domain_3th.find(domain_2nd)
        cond1 = pos > 0 and domain_3th[pos - 1] == '.'
        cond2 = pos == 0 or domain_3th == WWW_PHREASE + domain_2nd
        # Open question from the original author: why do so many malicious
        # domains match while only a few show up in the time series?
        cond4 = domain_3th in ver_sub_domains
        print(
            "full_domain: %s, domain_2nd: %s, con3: %s, con4:%s, con5: %s"
            % (full_domain, domain_2nd, cond1, cond4, cond2))
        if not (cond1 or cond2):
            continue

        if not cond4:
            # Valid sub-domain of domain_2nd, but not a verified one.
            print(
                "domain: %s is valid doamin_3th but not in ver_subdomains"
                % (domain_3th))
        print("domain_2nd: %s, domain_3th: %s, visit_day: %s" %
              (domain_2nd, domain_3th, dt_srt_day))

        increment = {"$inc": {str(index): 1}}
        # The third positional argument is presumably the upsert flag.
        mongo_index_2nd = domain_index_dict[domain_bad][2]
        db_nic_visiting[mongo_index_2nd].update(
            {DOMAIN_2ND_FIELD: domain_2nd, DATE_FIELD: dt_srt_day},
            increment, True)
        mongo_index_3th = domain_index_dict[domain_bad][3]
        db_nic_visiting[mongo_index_3th].update(
            {
                DOMAIN_3TH_FIELD: domain_3th,
                DATE_FIELD: dt_srt_day,
                DOMAIN_2ND_FIELD: domain_2nd
            },
            increment, True)
        matched = True
    return matched
def _batch_starts(length, batch_size):
    """Return the start offsets that cut ``length`` items into batches.

    Offset 0 is always included, so an empty sequence still yields exactly
    one (empty) batch, like a single unbatched call would.
    """
    return range(0, max(length, 1), batch_size)


def search_domain_in_es(es, index_name, doc_type, domain_2nd):
    """Collect the sub-domains and visit records of ``domain_2nd`` from ES.

    Scans ``index_name`` for ``dnsquery3`` records whose ``content`` matches
    ``domain_2nd`` (optionally preceded by one label). For every hit the
    visitor (``source-mac``) and the converted timestamp are grouped by full
    domain name. Full domains that contain ``domain_2nd`` at a label boundary
    contribute their third-level domain to the sub-domain list. Both results
    are then written through ``save_domain_subdomains2mongodb`` and
    ``save_full_domains_visiting_records2mongodb`` into the "bad" or the
    "good" collections, depending on ``domain_bad``.

    :param es: Elasticsearch client
    :param index_name: index to scan; nothing happens if it does not exist
    :param doc_type: document type passed to ``helpers.scan``
    :param domain_2nd: second-level domain to look for (compared
        case-insensitively, stored lowercased)
    :return: True if at least one sub-domain was found, otherwise False
    """
    matched = False  # whether this domain was found in the NIC log
    # Raw string: "\." is not a valid escape sequence in a normal literal.
    pattern = r"([A-Za-z0-9-]?[A-Za-z0-9]+\.)?" + domain_2nd
    query_body = {
        "query": {
            "bool": {
                "must": [
                    {"regexp": {"content": pattern}},
                    {"term": {"operation": "dnsquery3"}},
                ]
            }
        }
    }
    if not es.indices.exists(index_name):
        return matched

    domain_2nd_lower = domain_2nd.lower()
    hits = helpers.scan(es, index=index_name, doc_type=doc_type,
                        query=query_body, scroll='30m')
    full_domains_visit_dict = {}
    sub_domains = set()  # third-level domains
    for hit in hits:
        source = hit['_source']
        full_domain = source['content']
        visitor = source["source-mac"]  # visitors are identified by MAC
        dt_str = timestamp_str2ymdh(source['time-stamp'])
        if full_domain not in full_domains_visit_dict:
            full_domains_visit_dict[full_domain] = {
                VISITORS: [],
                DT_STRS: []
            }
        record = full_domains_visit_dict[full_domain]
        record[VISITORS].append(visitor)
        record[DT_STRS].append(dt_str)

        # The full domain either is the second-level domain itself or
        # contains it directly after a dot; only then its third-level domain
        # is added to the sub-domain list. Position and character lookup use
        # the same lowercased string so the index always lines up.
        full_domain_lower = full_domain.lower()
        pos = full_domain_lower.find(domain_2nd_lower)
        if pos == 0 or pos > 0 and full_domain_lower[pos - 1] == '.':
            sub_domains.add(keep_3th_dom_name(full_domain))

    # Store what was matched in MongoDB.
    sub_domains = list(sub_domains)
    # NOTE(review): domain_bad is not a parameter of this function; it is
    # resolved as a module-level name - confirm it is defined there.
    if domain_bad:
        if len(sub_domains):
            save_domain_subdomains2mongodb(
                domain_2nd_lower, sub_domains, db_nic_log,
                NIC_LOG_BAD_DOMAIN_SUBDOMAINS_MONGO_INDEX)
        # NOTE(review): the original layout was lost; this loop is assumed to
        # run regardless of the check above, mirroring the "good" branch.
        for full_domain, record in full_domains_visit_dict.items():
            # Read with the same constants used when the record was built.
            save_full_domains_visiting_records2mongodb(
                full_domain, db_nic_log,
                NIC_LOG_BAD_FULL_NAME_VISITING_MONGO_INDEX,
                record[DT_STRS], record[VISITORS])
    else:
        # Very large lists cannot be inserted into MongoDB in one go, so they
        # are written in batches.
        batch_num = 400
        for start in _batch_starts(len(sub_domains), batch_num):
            save_domain_subdomains2mongodb(
                domain_2nd_lower, sub_domains[start:start + batch_num],
                db_nic_log, NIC_LOG_GOOD_DOMAIN_SUBDOMAINS_MONGO_INDEX)
        for full_domain, record in full_domains_visit_dict.items():
            visitors = record[VISITORS]
            dt_strs = record[DT_STRS]
            if len(visitors) > batch_num:
                print("len of visitors: %s" % (len(visitors)))
            # visitors and dt_strs are parallel lists, so both are cut at the
            # same offsets.
            for start in _batch_starts(len(visitors), batch_num):
                stop = start + batch_num
                save_full_domains_visiting_records2mongodb(
                    full_domain, db_nic_log,
                    NIC_LOG_GOOD_FULL_NAME_VISITING_MONGO_INDEX,
                    dt_strs[start:stop], visitors[start:stop])

    if len(sub_domains):
        matched = True
        print(
            "==============domain: %s matched, len(sub_domains): %s================"
            % (domain_2nd, len(sub_domains)))
    return matched