def do_string_distance_cluster(self, domain_list, job_index):
    """Refine one coarse-grained (k-means) cluster via string-similarity grouping.

    Greedy single pass: each still-unassigned domain seeds a new cluster and
    absorbs every later unassigned domain that passes the distance check.

    :param domain_list: list of domains to cluster
    :param job_index: sub-process index, used only for progress logging
    :return: list of clusters (each cluster is a list of domains)
    """
    if not isinstance(domain_list, list):
        raise ValueError(
            "do_string_distance_cluster: domain_list should be list not %s"
            % str(type(domain_list)))
    result_clusters = list()
    assigned = set()
    for pos, seed in enumerate(domain_list):
        if seed in assigned:
            continue
        assigned.add(seed)
        members = [seed]
        # Only compare against domains after the seed; earlier ones are
        # either already assigned or were seeds themselves.
        for candidate in domain_list[pos + 1:]:
            if candidate in assigned:
                continue
            if self._core_distance_check(seed, candidate):
                members.append(candidate)
                assigned.add(candidate)
        result_clusters.append(members)
        # Periodic progress report (only for indices that seeded a cluster).
        if pos % CLUSTER_REPORT_FQ == 0:
            logger.info("batch:%d %d/%d" % (job_index, pos, len(domain_list)))
    return result_clusters
def do_publish_domain_regex(self, input_path):
    """Publish/refresh the regex records in the database.

    For each regex loaded from *input_path*: insert a new record (keyed by
    the regex's MD5) if it is absent, otherwise only bump its update_time.

    :param input_path: path of the regex file
    :return: None (exits the process on DB connection failure)
    """
    regex_list = load_regex_list(input_path)
    try:
        self.connect_db()
    except Exception as err:
        logger.error("%s: publish_domain_regex: do_publish_domain_regex: %s"
                     % (module_name, err))
        sys.exit(1)
    for regex in regex_list:
        data_dict = dict()
        # _id is the MD5 of the regex text, so re-publishing is idempotent.
        data_dict["_id"] = self.std_md5(regex)
        if not self.table.get(data_dict):
            # New regex: store it with creation metadata.
            data_dict["regex"] = regex
            data_dict["add_time"] = str(datetime.datetime.now())
            data_dict["update_time"] = data_dict["add_time"]
            data_dict["source_type"] = malicious_type_malware
            response = self.table.add_data(data_dict)
        else:
            # Existing regex: only refresh its update_time.
            update_dict = dict()
            update_dict["update_time"] = str(datetime.datetime.now())
            response = self.table.update_data(data_dict, update_dict, False)
        # NOTE(review): write failures are logged but not retried — presumably
        # best-effort by design; confirm with the data pipeline owner.
        if not response:
            logger.error("%s: publish_domain_regex: do_publish_domain_regex: \
                mongodb write error when try to write data %s"
                         % (module_name, str(data_dict)))
    logger.info("domain regex data has been update")
def make_kmeans_cluster(self, domain_list, n_cluster):
    """Partition *domain_list* into *n_cluster* groups with k-means.

    :param domain_list: list of domain strings
    :param n_cluster: number of clusters to produce
    :return: list of clusters (each a list of domains)
    """
    # normalize input
    if not isinstance(domain_list, list):
        raise ValueError(
            "make_kmeans_cluster: input should be list not %s"
            % (str(type(domain_list))))
    if n_cluster <= 1:
        # Nothing to split: everything stays in one cluster.
        return [domain_list]
    # Vectorize the domains; rows are indexed by domain name.
    df_vector = DomainVectorize().do_vectorize(domain_list)
    # Fit k-means on the feature matrix and attach the assigned labels.
    model = KMeans(n_clusters=n_cluster, verbose=0, random_state=0,
                   n_jobs=N_JOBS_CLUSTERING)
    model.fit(df_vector.values)
    df_vector["labels"] = model.labels_
    # Collect the member domains of every cluster label.
    res = [list(df_vector.index[df_vector["labels"] == label])
           for label in range(n_cluster)]
    logger.info("k-means cluster done!")
    return res
def domain_regex_extract():
    """Pipeline entry point: load malicious links, preprocess them into
    domains, cluster the domains, then extract domain regexes.
    """
    # Work-file locations. train_urls_file is currently unused because the
    # fetch step below is commented out.
    train_urls_file = os.path.join(WORK_PATH, "train_urls.csv")
    cluster_file = os.path.join(WORK_PATH, "cluster_distance.json")
    domain_regex_file = os.path.join(WORK_PATH, "domain_regex.txt")
    if not os.path.isdir(WORK_PATH):
        os.makedirs(WORK_PATH)
        logger.info("mkdir %s" % WORK_PATH)
    # Fetch malicious links.
    # NOTE(review): the real fetch pipeline is commented out and replaced by
    # a hard-coded sample file — presumably a debugging leftover. Restore the
    # GetMaliceLink path (and drop the literal path) before production use.
    # get_data_obj = GetMaliceLink()
    # get_data_obj.do_get_malice_link(train_urls_file)
    # malice_link_list = load_urls(train_urls_file)
    # logger.info("%s: main: malice_link get" % (module_name))
    malice_link_list = load_urls("../std_data/mal.csv")
    # Preprocess URLs down to a deduplicated domain list.
    process_obj = UrlPreprocess()
    malice_domain_list = process_obj.do_url_preprocess(malice_link_list)
    logger.info("%s: main: preprocess complete" % (module_name))
    # Cluster the domains; results are written to cluster_file.
    cluster_obj = DomainCluster()
    cluster_obj.do_make_domain_clustering(malice_domain_list , cluster_file )
    logger.info("%s: main: clustering complete" % (module_name))
    # Extract regexes from the clusters into domain_regex_file.
    extract_obj = DomainRegexExtract()
    extract_obj.do_domain_regex_extract(cluster_file, domain_regex_file)
    logger.info("%s: main: regex extract complete" % (module_name))
def load_urls(file_path):
    """Load URL data from a csv file (expects a ``url`` column).

    :param file_path: path to read from
    :return: list of URLs (exits the process on read failure)
    """
    try:
        frame = pd.read_csv(file_path)
        url_list = list(frame.url)
        logger.info("%s: urls has been load\t%s" % (module_name, file_path))
        return url_list
    except Exception as err:
        # Any failure (missing file, bad csv, missing column) is fatal.
        logger.error("%s: urls load error %s %s"
                     % (module_name, file_path, str(err)))
        sys.exit(SYSTME_ERROR_CODE)
def build_domain_regex(self, token_tree):
    """Assemble a token tree into a single anchored domain regex.

    Levels are emitted from the last level down to level 0, joined by literal
    dots; a level with several tokens becomes a non-capturing alternation.

    Bug fix: the group prefix was ``"(:?"`` — a *capturing* group that starts
    with an optional literal colon — instead of the intended non-capturing
    group ``"(?:"``. Also use a raw string for the dot separator.

    :param token_tree: list of levels; each level is a list of token strings
    :return: regex string of the form ``^level_k\\.…\\.level_0$``
    """
    token_regex_list = []
    for level in range(len(token_tree) - 1, -1, -1):
        token_regex = "|".join(token_tree[level])
        if len(token_tree[level]) == 1:
            # Single token: no grouping needed.
            token_regex_list.append(token_regex)
        else:
            # Wrap multi-token alternations in a non-capturing group.
            token_regex_list.append("".join(["(?:", token_regex, ")"]))
    domain_regex = r"\.".join(token_regex_list)
    domain_regex = "".join(["^", domain_regex, "$"])
    logger.info("%s: raw regex %s" % (module_name, domain_regex))
    return domain_regex
def dump_urls(urls, file_path):
    """Save URL data as csv (a single ``url`` column).

    :param urls: URL list to persist
    :param file_path: destination path (an existing file is removed first)
    :return: None (exits the process on write failure)
    """
    if os.path.isfile(file_path):
        os.remove(file_path)
    try:
        pd.DataFrame({"url": urls}).to_csv(file_path, index=False)
        logger.info("%s: urls has been dump\t%s" % (module_name, file_path))
    except Exception as err:
        # Any write failure is fatal for the pipeline.
        logger.error("%s: urls dump error %s %s"
                     % (module_name, file_path, str(err)))
        sys.exit(SYSTME_ERROR_CODE)
def load_regex_list(file_path):
    """Load a regex list, one pattern per line.

    :param file_path: path to read from
    :return: list of regex strings (exits the process on read failure)
    """
    regex = []
    try:
        with open(file_path, 'r') as fd:
            regex = [line.strip() for line in fd]
        logger.info("%s: regex data has been load\t%s"
                    % (module_name, file_path))
    except Exception as err:
        logger.error("%s: regex load error %s %s"
                     % (module_name, file_path, str(err)))
        sys.exit(SYSTME_ERROR_CODE)
    return regex
def load_cluster_data(file_path):
    """Load clustering results, one JSON object per line.

    Each line is ``{"<index>": [domains...]}``; only the cluster list is kept.

    Bug fix: ``record.keys()[0]`` works only on Python 2 — on Python 3
    ``dict_keys`` is not subscriptable and raises TypeError. ``next(iter(record))``
    is equivalent for these single-key records on both versions.

    :param file_path: path to read from
    :return: list of clusters (each a list of domains); exits on read failure
    """
    cluster = []
    try:
        with open(file_path, 'r') as fd:
            for line in fd:
                cluster.append(json.loads(line.strip()))
        logger.info("%s: cluster data has been load\t%s"
                    % (module_name, file_path))
    except Exception as err:
        logger.error("%s: cluster data load error %s %s"
                     % (module_name, file_path, str(err)))
        sys.exit(SYSTME_ERROR_CODE)
    # Unwrap {index: cluster} records; each record has exactly one key.
    cluster = [record[next(iter(record))] for record in cluster]
    return cluster
def dump_cluster_data(file_path, cluster_list):
    """Save clustering results as JSON lines: one ``{index: cluster}`` per line.

    :param file_path: destination path (an existing file is removed first)
    :param cluster_list: list of clusters
    :return: None (exits the process on write failure)
    """
    if os.path.isfile(file_path):
        os.remove(file_path)
    try:
        with open(file_path, "w") as fd:
            fd.writelines(json.dumps({pos: members}) + '\n'
                          for pos, members in enumerate(cluster_list))
        logger.info("%s: cluster data has been dump\t%s"
                    % (module_name, file_path))
    except Exception as err:
        logger.error("%s: cluster data dump error %s %s"
                     % (module_name, file_path, str(err)))
        sys.exit(SYSTME_ERROR_CODE)
def dump_regex_list(regex_list, file_path):
    """Save a regex list, one pattern per line.

    :param regex_list: regex strings to persist
    :param file_path: destination path (an existing file is removed first)
    :return: None (exits the process on write failure)
    """
    if os.path.isfile(file_path):
        os.remove(file_path)
    try:
        with open(file_path, 'w') as fd:
            fd.writelines(regex + '\n' for regex in regex_list)
        logger.info("%s: regex data has been dump\t%s"
                    % (module_name, file_path))
    except Exception as err:
        logger.error("%s: regex dump error %s %s"
                     % (module_name, file_path, str(err)))
        sys.exit(SYSTME_ERROR_CODE)
def do_vectorize(self, domain_list):
    """Vectorize every domain and return a feature DataFrame indexed by domain.

    Idiom fix: the manual append loop used ``enumerate`` but never used the
    index; replaced with a list comprehension.

    :param domain_list: list of domain strings
    :return: pandas DataFrame with one feature row per domain, index = domain
    :raises ValueError: if *domain_list* is not a list
    """
    if not isinstance(domain_list, list):
        raise ValueError("input must list not %s" % str(type(domain_list)))
    # Each _core_vectorize call must contribute exactly one row so that the
    # domain index below lines up with the stacked matrix.
    res = np.concatenate(
        [self._core_vectorize(domain) for domain in domain_list], axis=0)
    df = pd.DataFrame(res)
    df['domain'] = domain_list
    df = df.set_index('domain')
    logger.info("%s: vectorize: vectorization complete! data shape:\t%s"
                % (module_name, str(df.values.shape)))
    return df
def do_url_preprocess(self, url_list):
    """Training-set preprocessing: map URLs to domains, drop empties, dedupe.

    :param url_list: raw URL strings
    :return: deduplicated list of non-empty domains (order not preserved)
    """
    started = time.time()
    domains = list()
    for url in url_list:
        try:
            domains.append(self._core_preprocess(url))
        except Exception as err:
            # A single bad URL is logged and skipped, never fatal.
            logger.error("%s: preprocess: preprocess url %s error %s"
                         % (module_name, url, str(err)))
    # Discard falsy results and deduplicate via a set.
    domains = list(set(domain for domain in domains if domain))
    finished = time.time()
    logger.info(
        "%s: [statistic] url preprocess time cost:%f\tdomain count:%d"
        % (module_name, (finished - started), len(domains)))
    return domains
def do_make_domain_clustering(self, domain_list, output_path):
    """Cluster domains: preliminary k-means split, then string-distance refinement.

    Bug fix: the preliminary cluster count used true division (``/``), which
    on Python 3 yields a float that ``KMeans(n_clusters=...)`` rejects; floor
    division (``//``) gives the identical result on Python 2 ints.

    :param domain_list: list of domain strings
    :param output_path: path the refined cluster results are written to
    :return: None
    :raises ValueError: if *domain_list* is not a list
    """
    st_time = time.time()
    if not isinstance(domain_list, list):
        raise ValueError(
            "do_make_domain_clustering: should be list not %s"
            % (str(type(domain_list))))
    # One cluster per KMEANS_SIZE_LIMIT domains, at least one.
    preliminary_n_cluster = len(domain_list) // KMEANS_SIZE_LIMIT + 1
    cluster_list = self.make_kmeans_cluster(
        domain_list, n_cluster=preliminary_n_cluster)
    logger.info("Preliminary K-means clustering complete")
    # NOTE(review): this loop terminates only once _core_check_cluster_size
    # returns True — presumably it rebalances cluster_list in place; confirm
    # it cannot spin forever on oversized clusters.
    while True:
        if self._core_check_cluster_size(cluster_list):
            break
    self.make_string_distance_cluster(cluster_list, output_path)
    end_time = time.time()
    logger.info("%s: [statistic] domain clustering time cost:%f"
                % (module_name, (end_time - st_time)))
def show(self):
    """Log every instance attribute as a ``name : value`` pair."""
    for key, value in self.__dict__.items():
        logger.info('%s : %s', key, str(value))
def do_domain_regex_extract(self, input_path, output_path):
    """Main routine for domain regex extraction.

    Loads the clustering result, logs cluster-size statistics, discards
    clusters below the size threshold, builds a token tree per cluster,
    assembles and deduplicates the regexes, and writes them out.

    :param input_path: clustering-result file (JSON lines)
    :param output_path: destination file for the regex list
    :return: None
    """
    st_time = time.time()
    # Load clusters and log their size distribution.
    cluster_list = load_cluster_data(input_path)
    size_list = [len(cluster) for cluster in cluster_list]
    singles = sum(1 for size in size_list if size == 1)
    smalls = sum(1 for size in size_list
                 if SMALL_CLUSTER_SIZE <= size < BIG_CLUSTER_SIZE)
    bigs = sum(1 for size in size_list if size >= BIG_CLUSTER_SIZE)
    logger.info("%s: total cluster num:\t%d" % (module_name, len(cluster_list)))
    logger.info("%s: single one:\t%d" % (module_name, singles))
    logger.info("%s: small cluster:\t%d" % (module_name, smalls))
    logger.info("%s: big cluster:\t%d" % (module_name, bigs))
    # Keep only clusters large enough to generalize from.
    cluster_list = [cluster for cluster in cluster_list
                    if len(cluster) >= DOMAIN_CLUSTER_SIZE_THRESH]
    # Token tree per cluster -> regex per tree -> set-dedupe -> final dedupe.
    token_trees = [self.build_domain_token_tree(cluster)
                   for cluster in cluster_list]
    domain_regex_list = list(set([self.build_domain_regex(tree)
                                  for tree in token_trees]))
    domain_regex_list = self.domain_regex_deduplicate(domain_regex_list)
    end_time = time.time()
    # Final statistics.
    logger.info("%s: extract regex count:\t%d"
                % (module_name, len(domain_regex_list)))
    logger.info("%s: [statistic] domain regex extract time cost:%f"
                % (module_name, (end_time - st_time)))
    dump_regex_list(domain_regex_list, output_path)