def run(self):
    """Worker loop: repeatedly take a batch of bz2 DNS-log paths from the
    shared queue, and feed every line whose query domain passes the filter
    and ``self.match.judge`` into ``self.match.saveFile``.

    Reads ``self.lock``, ``self.file_path_queue``, ``self.match`` and
    ``self.process_index`` set up by the caller; interface unchanged.
    """
    # Renamed from `filter` to avoid shadowing the builtin.
    domain_filter = Filter()
    while True:
        # Hold the lock around the empty()/get() pair so two workers
        # cannot race on the same queue item.
        self.lock.acquire()
        if not self.file_path_queue.empty():
            queue_item = self.file_path_queue.get()
            self.lock.release()
            ff = str(queue_item[0])
            # Derive a human-readable batch name from the first path.
            if "-" in ff:
                result_file_name = ff[ff.rindex("-") + 1:ff.index(".txt.bz2") - 2]
            else:
                result_file_name = ff[:ff.index(".txt.bz2") - 2]
            print(result_file_name)
            for file_path in queue_item:
                # `with` guarantees the bz2 handle is closed even when a
                # line raises (the original leaked it on error).
                with bz2.open(file_path, 'r') as file_point:
                    for line in file_point:
                        try:
                            line = line.decode().strip()
                            linesplit = line.split(',')
                            # Field 3 is the queried domain name — TODO confirm
                            # against the log schema.
                            querydomain = linesplit[3].strip().lower()
                            if domain_filter.isValidDomain(querydomain) and self.match.judge(querydomain):
                                self.match.saveFile(line)
                                print(querydomain)
                        except Exception:
                            # Bug fix: traceback.print_exc() returns None, so the
                            # original message always showed "error info:None".
                            # format_exc() yields the actual traceback text.
                            print("error info:{}\n file:{}".format(traceback.format_exc(), file_path))
            # NOTE(review): the original computed an unused `day_string`
            # here; removed as dead code.
        else:
            self.lock.release()
            break
    print('Processing ' + str(self.process_index) + ' is finished')
def run(self):
    """Worker loop: take batches of bz2 DNS-log paths from the shared queue,
    keep one ``source_ip,domain,rcode,answer`` row per line whose domain
    passes the filters, and write the rows to
    ``../result_data/<day>/<result_file_name>``.

    Reads ``self.lock``, ``self.file_path_queue`` and ``self.process_index``
    set up by the caller; interface unchanged.
    """
    # Renamed from `filter` to avoid shadowing the builtin.
    domain_filter = Filter()
    while True:
        # Hold the lock around the empty()/get() pair so two workers
        # cannot race on the same queue item.
        self.lock.acquire()
        if not self.file_path_queue.empty():
            queue_item = self.file_path_queue.get()
            self.lock.release()
            ff = str(queue_item[0])
            # Derive the output file name from the first path in the batch.
            if "-" in ff:
                result_file_name = ff[ff.rindex("-") + 1:ff.index(".txt.bz2") - 2]
            else:
                result_file_name = ff[:ff.index(".txt.bz2") - 2]
            print(result_file_name)
            hour_result = []
            # Per-domain cache of the filter verdict: True -> keep every
            # occurrence, False -> drop every occurrence. Avoids re-running
            # the (expensive) filters for repeated domains.
            hmap = dict()
            for file_path in queue_item:
                # `with` guarantees the bz2 handle is closed even when a
                # line raises (the original leaked it on error).
                with bz2.open(file_path, 'r') as file_point:
                    for line in file_point:
                        try:
                            line = line.decode().strip()
                            linesplit = line.split(',')
                            # Field layout assumed: 0=source IP, 3=query domain,
                            # 16=rcode, 19=answer — TODO confirm against schema.
                            source_ip = linesplit[0].strip().lower()
                            querydomain = linesplit[3].strip().lower()
                            rcode = linesplit[16].strip()
                            answer = linesplit[19].strip().lower()
                            flag = hmap.get(querydomain)
                            if flag is None:
                                # First sighting: evaluate the filters once, cache.
                                if domain_filter.Two_Three_level_domain(querydomain) and \
                                        domain_filter.isValidDomain(querydomain):
                                    hour_result.append(",".join((source_ip, querydomain, rcode, answer)))
                                    hmap[querydomain] = True
                                else:
                                    hmap[querydomain] = False
                            elif flag:
                                hour_result.append(",".join((source_ip, querydomain, rcode, answer)))
                        except Exception:
                            # Bug fix: traceback.print_exc() returns None; the
                            # original message always showed "error info:None".
                            print("error info:{}\n file:{}".format(traceback.format_exc(), file_path))
            # Day directory = file name minus the trailing 2-char hour suffix.
            day_string = result_file_name[:len(result_file_name) - 2]
            # exist_ok avoids the exists()/mkdir() race between workers.
            os.makedirs("../result_data/" + day_string, exist_ok=True)
            with open("../result_data/{}/{}".format(day_string, result_file_name),
                      mode="w", encoding="utf8") as f:
                f.write("\n".join(hour_result))
        else:
            self.lock.release()
            break
    print('Processing ' + str(self.process_index) + ' is finished')
def getBenign(filepath):
    """Build a benign-domain data set from *filepath*.

    Each input line looks like ``domain:<rest>``; the part before the first
    ``:`` is collected, the list is shuffled, and the first 23600 domains
    become the "train" split and the next 5900 the "pred" split, written as
    JSON to ``../result_data/yd_nf_data.json``.

    Side effects: writes the JSON file and prints the total domain count.
    """
    # NOTE(review): the original instantiated PublicSuffixList() and Filter()
    # and carried a large commented-out whitelist/grouping experiment; both
    # were dead code and have been removed.
    domains = []
    with open(filepath, "r") as f:
        for r in f:
            # Keep only the domain, i.e. everything before the first ':'.
            domains.append(r.strip().split(":")[0])
    random.shuffle(domains)
    # Split sizes are fixed by the experiment design (23600 train / 5900 pred).
    result = dict()
    result["train"] = domains[:23600]
    result["pred"] = domains[23600:29500]
    with open("../result_data/yd_nf_data.json", "w") as f:
        f.write(json.dumps(result))
    print(len(domains))
def run(hourPath):
    """Scan one hour directory of 5-minute bz2 DNS-log files and print, per
    5-minute timestamp, the total line count, the number of distinct domains,
    the number of "active" domains (with at least one TTL observed) and the
    number of client IPs that produced NX-style responses.

    :param hourPath: directory containing ``*-<time>.txt.bz2`` files.
    """
    # Renamed from `filter` to avoid shadowing the builtin.
    domain_filter = Filter()
    # Group file names by the timestamp embedded between the last '-' and
    # the ".txt.bz" suffix.
    pathMap = dict()
    for l in os.listdir(hourPath):
        timeStr = l[l.rindex("-") + 1:l.rindex(".txt.bz")]
        pathMap.setdefault(timeStr, []).append(l)
    print("get all file names")
    for k, v in pathMap.items():
        total = 0
        ttl_map = dict()    # domain -> set of observed TTL strings
        visit_map = dict()  # domain -> set of client IPs that queried it
        nx_map = dict()     # client IP -> set of domains that resolved to "MULL"
        for bzf in v:
            # `with` guarantees the handle is closed (the original never
            # closed it — a real file-descriptor leak).
            with bz2.open("{}/{}".format(hourPath, bzf), 'r') as file_point:
                for line in file_point:
                    total = total + 1
                    try:
                        line = line.decode().strip()
                        linesplit = line.split(',')
                        querydomain = linesplit[3].strip().lower()
                        if (domain_filter.isValidDomain(querydomain)
                                and domain_filter.Two_Three_level_domain(querydomain)):
                            visitIP = linesplit[0].strip()
                            ttl = linesplit[10].strip()
                            # NOTE(review): "MULL" is used consistently across
                            # this file as the null-answer marker — presumably a
                            # quirk of the log format; verify against the data.
                            isMULL = linesplit[15].strip()
                            # Record which client queried this domain.
                            visit_map.setdefault(querydomain, set()).add(visitIP)
                            if isMULL == "MULL":
                                # Null answer: remember the domain per client IP.
                                nx_map.setdefault(visitIP, set()).add(querydomain)
                            else:
                                # Answered query: collect the TTL.
                                ttl_map.setdefault(querydomain, set()).add(ttl)
                    except Exception:
                        # Best-effort parsing: skip malformed lines.
                        continue
            print("{} read finish".format(bzf))
        print("total:{}".format(total))
        print("all domains:{}".format(len(visit_map)))
        print("all active :{}".format(len(ttl_map)))
        print("all NX:{}".format(len(nx_map)))
def __init__(self):
    """Set up the public-suffix resolver and the project's domain filter."""
    # accept_unknown=False: suffixes not present in the PSL data are rejected
    # rather than treated as valid TLDs.
    self.psl = PublicSuffixList(accept_unknown=False)
    # Project-level domain filter (validity / whitelist checks) — see Filter.
    self.filter = Filter()
class gdyd():
    """Pipeline for counting popular private-suffix domains in the gdyd
    (Guangdong Yidong) passive-DNS logs: per-hour counting fanned out over a
    process pool, per-day merging, and export of the most common domains.

    Code is unchanged from the original; only documentation was added.
    """

    def __init__(self):
        """Set up the public-suffix resolver and the project's domain filter."""
        # accept_unknown=False: unknown suffixes are rejected, not guessed.
        self.psl = PublicSuffixList(accept_unknown=False)
        self.filter = Filter()

    def statistic_single_hour(self, hour_dir, day, hour: int):
        """Count private suffixes (and one extra 3rd-level label) seen in every
        bz2 minute-file under *hour_dir*; dump the Counter as JSON to
        ../result_data/temp/<day><hour>.json.

        Runs inside a worker process (see all_day_counter).
        """
        counter = Counter()
        for minute_file in os.listdir(hour_dir):
            bzfile = os.path.join(hour_dir, minute_file)
            try:
                file_point = bz2.open(bzfile, 'r')
                for line in file_point:
                    try:
                        line = line.decode().strip()
                        linesplit = line.split(',')
                        # Field 3 is the queried domain — TODO confirm schema.
                        querydomain = linesplit[3].strip().lower()
                        if self.filter.isValidDomain(querydomain):
                            prisuf = self.psl.privatesuffix(querydomain)
                            # Skip domains on the Alexa/CDN/common whitelists.
                            if prisuf is not None and prisuf not in self.filter.sf.AleaxTop and \
                                    prisuf not in self.filter.sf.CDNSet and \
                                    prisuf not in self.filter.sf.commonset:
                                counter[prisuf] += 1
                                if prisuf != querydomain:
                                    # Also count "<last-front-label>.<prisuf>",
                                    # i.e. one level below the private suffix.
                                    front = querydomain[:querydomain.rindex(prisuf) - 1]
                                    front_s = front.rsplit(".", 1)
                                    if len(front_s) != 0:
                                        ThreeLD = "{}.{}".format(front_s[len(front_s) - 1], prisuf)
                                        counter[ThreeLD] += 1
                    except:
                        # Best-effort: malformed lines are silently skipped.
                        pass
                file_point.close()
            except:
                # NOTE(review): bare except also hides open() failures; the
                # file handle may leak if the loop itself raises.
                print("error : {}".format(bzfile))
            print("{} finish".format(bzfile))
        print("{}{} write".format(day, hour))
        with open("../result_data/temp/{}{}.json".format(day, hour), "w") as f:
            f.write(json.dumps(counter))

    def all_day_counter(
            self,
            rootpath="/home/public/DNS_Project/pdns_gddx_compressed/gdyd",
            days=["20180502", "20180503", "20180504"]):
        """Fan statistic_single_hour out over a 24-worker process pool for
        every (day, hour) directory under *rootpath*; print elapsed minutes.

        NOTE(review): mutable default `days` — safe only because it is never
        mutated here.
        """
        s = time.time()
        number = 24
        pool = Pool(number)
        # result = []
        for day in days:
            daydir = os.path.join(rootpath, "dt={}".format(day))
            for h in range(24):
                # Hour dirs are zero-padded: hour=00 .. hour=23.
                hourdir = os.path.join(daydir, "hour={0:02d}".format(h))
                if os.path.exists(hourdir):
                    pool.apply_async(func=self.statistic_single_hour, args=(
                        hourdir,
                        day,
                        h,
                    ))
                else:
                    print("path error")
        # result.append(r)
        pool.close()
        pool.join()
        # whole_counter = Counter()
        # for r in result:
        #     whole_counter.update(r.get())
        # for r in whole_counter.most_common(30000):
        #     print("{},{}".format(r[0], r[1]))
        e = time.time()
        print("spend time :{} minutes".format((e - s) / 60))

    def get_counter(self, days=["20180502", "20180503", "20180504"]):
        """Merge the 24 per-hour JSON Counters of each day in *days* into a
        single per-day Counter written to <root_dir><day>.json."""
        root_dir = "/home/yandingkui/Pontus/result_data/temp/"
        for day in days:
            counter = Counter()
            for i in range(24):
                # NOTE(review): hour files are named without zero padding here
                # ("{day}{i}.json"), unlike the "hour={0:02d}" dirs above.
                path = os.path.join(root_dir, "{}{}.json".format(day, i))
                if os.path.exists(path):
                    with open(path, "r") as f:
                        counter1 = Counter(json.loads(f.read()))
                    counter.update(counter1)
            with open("{}{}.json".format(root_dir, day), "w") as f:
                f.write(json.dumps(counter))

    def remove_file(
            self,
            days=["20180427", "20180428", "20180429", "20180430", "20180501"]):
        """Delete the per-hour temp JSON files of each day in *days*."""
        root_dir = "/home/yandingkui/Pontus/result_data/temp/"
        for day in days:
            for i in range(24):
                path = os.path.join(root_dir, "{}{}.json".format(day, i))
                if os.path.exists(path):
                    os.remove(path)

    def getBenignDomains(self, days=["20180502", "20180503"]):
        """Write the 30000 most common domains of each day's merged Counter
        to ../data_sets/yd_<day>, one domain per line."""
        root_dir = "/home/yandingkui/Pontus/result_data/temp/"
        for day in days:
            with open(os.path.join(root_dir, "{}.json".format(day)), "r") as f:
                counter = Counter(json.loads(f.read()))
            data = []
            for item in counter.most_common(30000):
                data.append(item[0])
            with open("../data_sets/yd_{}".format(day), "w") as F:
                F.write("\n".join(data))

    def dxvsyd(self, days=["20180427", "20171031"]):
        """Export the top-30000 domain lists of one yd day and one dx day to
        flat files for comparison (paths are hard-coded; *days* is unused)."""
        yd = "/home/yandingkui/Pontus/result_data/temp/20180427.json"
        dx = "/home/yandingkui/Pontus/result_data/gddx/20171031.json"
        with open(yd, "r") as f:
            counter1 = Counter(json.loads(f.read()))
        with open(dx, "r") as f:
            counter2 = Counter(json.loads(f.read()))
        s1 = []
        s2 = []
        for item in counter1.most_common(30000):
            s1.append(item[0])
        for item in counter2.most_common(30000):
            s2.append(item[0])
        with open("../result_data/yd_20180427", "w") as f:
            f.write("\n".join(s1))
        with open("../result_data/dx_20171031", "w") as f:
            f.write("\n".join(s2))
def run(self):
    """Worker loop: drain batches of bz2 DNS-log paths from the shared queue
    and load A-record results into Redis: db1 maps domain -> set of IPv4
    answers, db2 maps IP -> {domain: timestamp}, db3 maps domain -> set of
    CNAME answers.

    Reads ``self.lock``, ``self.file_path_queue`` and ``self.process_index``
    set up by the caller; interface unchanged.
    """
    # Hoisted out of the per-line loop and compiled once (the original
    # re-evaluated the pattern string for every log line).
    ipv4_pattern = re.compile(
        r"(([0-9]|[1-9][0-9]|1[0-9]{2}|2[0-4][0-9]|25[0-5])\.){3}(25[0-5]|2[0-4][0-9]|1[0-9]{2}|[1-9][0-9]|[0-9])")
    # NOTE(review): the original built an unused Filter() here; removed.
    while True:
        # Hold the lock around the empty()/get() pair so two workers
        # cannot race on the same queue item.
        self.lock.acquire()
        if not self.file_path_queue.empty():
            queue_item = self.file_path_queue.get()
            self.lock.release()
            # One set of connections per batch. The original opened three new
            # connections per FILE but closed only the last file's — a leak.
            redis_domain = redis.Redis(host='127.0.0.1', port=6379, db=1)
            redis_ip = redis.Redis(host='127.0.0.1', port=6379, db=2)
            redis_CNAME = redis.Redis(host='127.0.0.1', port=6379, db=3)
            try:
                for file_path in queue_item:
                    # Per-file de-dup of (domain, answer) pairs, as in the original.
                    answerset = set()
                    with bz2.open(file_path, 'r') as file_point:
                        for line in file_point:
                            try:
                                line = line.decode().strip()
                                linesplit = line.split(',')
                                # Field layout assumed: 3=domain, 4=qtype,
                                # 11=timestamp, 15=null-marker, 19=answer —
                                # TODO confirm against the log schema.
                                querydomain = linesplit[3].strip().lower()
                                # Renamed from `type`/`time`: those shadowed the
                                # builtin and the time module.
                                qtype = linesplit[4].strip()
                                seen_time = linesplit[11].strip()
                                isMULL = linesplit[15].strip()
                                answer = linesplit[19].strip().lower()
                                keys = ",".join((querydomain, answer))
                                if qtype == 'A' and isMULL != 'MULL' and len(answer) > 0:
                                    if keys in answerset:
                                        continue
                                    answerset.add(keys)
                                    for ip in answer.split(";"):
                                        if ipv4_pattern.match(ip):
                                            redis_domain.sadd(querydomain, ip)
                                            redis_ip.hset(ip, querydomain, seen_time)
                                        else:
                                            # Non-IPv4 answer token: treat as CNAME.
                                            redis_CNAME.sadd(querydomain, ip)
                            except Exception:
                                # Bug fix: traceback.print_exc() returns None; the
                                # original message always showed "error info:None".
                                print("error info:{}\n file:{}".format(traceback.format_exc(), file_path))
            finally:
                # Always release the connections, even on an unexpected error.
                redis_domain.close()
                redis_ip.close()
                redis_CNAME.close()
        else:
            self.lock.release()
            break
    print('Processing ' + str(self.process_index) + ' is finished')