def do_publish_domain_regex(self, input_path):
    """
    Update the regular expressions stored in the database
    :param input_path: path to the regex file
    :return:
    """
    regex_list = load_regex_list(input_path)
    try:
        self.connect_db()
    except Exception as err:
        logger.error("%s: publish_domain_regex: do_publish_domain_regex: %s"
                     % (module_name, err))
        sys.exit(1)
    for regex in regex_list:
        data_dict = dict()
        data_dict["_id"] = self.std_md5(regex)
        if not self.table.get(data_dict):
            # New regex: insert a full document
            data_dict["regex"] = regex
            data_dict["add_time"] = str(datetime.datetime.now())
            data_dict["update_time"] = data_dict["add_time"]
            data_dict["source_type"] = malicious_type_malware
            response = self.table.add_data(data_dict)
        else:
            # Known regex: only refresh its update_time
            update_dict = dict()
            update_dict["update_time"] = str(datetime.datetime.now())
            response = self.table.update_data(data_dict, update_dict, False)
        if not response:
            logger.error("%s: publish_domain_regex: do_publish_domain_regex: "
                         "mongodb write error when try to write data %s"
                         % (module_name, str(data_dict)))
    logger.info("domain regex data has been updated")
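# Document shape sketch for the regex collection written above (field names are
# taken from the code; the concrete example values are hypothetical):
#   {"_id": "<md5 of regex>", "regex": "<regex>",
#    "add_time": "2020-01-01 00:00:00.000000",
#    "update_time": "2020-01-01 00:00:00.000000",
#    "source_type": malicious_type_malware}
# New regexes are inserted in full; known ones (matched by _id) only get a
# fresh update_time.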
def _core_vectorize(self, domain):
    res = np.zeros((1, ASCII_SIZE), dtype=np.int32)
    if not isinstance(domain, str):
        logger.error("%s: vectorize: input url is not string" % (module_name))
        return res
    # Build a character-frequency histogram over the ASCII table
    for char in domain:
        if ord(char) < ASCII_SIZE:
            res[0][ord(char)] += 1
    return res
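# Usage sketch for _core_vectorize (illustrative; clustering_obj is a
# hypothetical instance, and ASCII_SIZE is assumed to cover the ASCII table,
# e.g. 128): the result is a 1 x ASCII_SIZE character histogram, so for
# "google.com" the count at index ord("o") is 3.
#   vec = clustering_obj._core_vectorize("google.com")
#   vec.shape         # (1, ASCII_SIZE)
#   vec[0][ord("o")]  # 3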
def string_cluster_subpro(domain_list, job_index):
    """
    String-clustering worker subprocess
    :param domain_list: list of domains to cluster
    :param job_index: index of the worker subprocess
    :return: list of clusters
    """
    str_cluster_obj = StringClustering()
    cluster_list = list()
    try:
        cluster_list = str_cluster_obj.do_string_distance_cluster(
            domain_list, job_index)
    except Exception as err:
        logger.error("%s: string_cluster_subpro: %s" % (module_name, str(err)))
    return cluster_list
def load_urls(file_path):
    """
    Load URL data from a CSV file
    :param file_path: input path
    :return: list of URLs
    """
    try:
        df = pd.read_csv(file_path)
        urls = list(df.url)
        logger.info("%s: urls have been loaded\t%s" % (module_name, file_path))
        return urls
    except Exception as err:
        logger.error("%s: urls load error %s %s"
                     % (module_name, file_path, str(err)))
        sys.exit(SYSTME_ERROR_CODE)
def _core_preprocess(self, url):
    res = ""
    if not isinstance(url, str):
        logger.error("%s: preprocess: input url is not string" % (module_name))
        return res
    url_obj = UrlNormalize(url)
    hostname = url_obj.get_hostname()
    # Keep only real domain names (not IPs) that carry a meaningful subdomain
    if UrlPreprocess.check_domain(
            hostname) and not UrlPreprocess.check_ip(hostname):
        tld_obj = tldextract.extract(hostname)
        primary_domain = tld_obj.domain
        sub_domain = tld_obj.subdomain
        if primary_domain and sub_domain and sub_domain != "www":
            res = hostname
    return res
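# Behavior sketch for _core_preprocess: tldextract splits a hostname into
# (subdomain, domain, suffix), and only hostnames with a non-"www" subdomain
# survive the filter. For example:
#   tldextract.extract("mail.example.co.uk")  # subdomain="mail", domain="example"
# so "mail.example.co.uk" is kept, while "www.example.com" and the bare
# "example.com" both yield "".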
def dump_urls(urls, file_path):
    """
    Save URL data to a CSV file
    :param urls: list of URLs to save
    :param file_path: output path
    :return:
    """
    if os.path.isfile(file_path):
        os.remove(file_path)
    try:
        df = pd.DataFrame({"url": urls})
        df.to_csv(file_path, index=False)
        logger.info("%s: urls have been dumped\t%s" % (module_name, file_path))
    except Exception as err:
        logger.error("%s: urls dump error %s %s"
                     % (module_name, file_path, str(err)))
        sys.exit(SYSTME_ERROR_CODE)
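# CSV layout handled by load_urls/dump_urls: a single "url" column
# (hypothetical example rows):
#   url
#   http://a.example.com/path
#   http://b.example.net/path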
def load_regex_list(file_path):
    """
    Load a list of regular expressions, one per line
    :param file_path: input path
    :return: list of regular expressions
    """
    regex = []
    try:
        with open(file_path, 'r') as fd:
            for line in fd:
                regex.append(line.strip())
        logger.info("%s: regex data has been loaded\t%s"
                    % (module_name, file_path))
    except Exception as err:
        logger.error("%s: regex load error %s %s"
                     % (module_name, file_path, str(err)))
        sys.exit(SYSTME_ERROR_CODE)
    return regex
def do_get_malice_link(self, output_path):
    try:
        self.connect_db()
        self.get_malice_link()
    except Exception as err:
        logger.error(
            "%s: publish_domain_regex: do_get_malice_link: %s"
            % (module_name, err))
        sys.exit(1)
    malice_link_list = list()
    for data_dict in self.data[0]:
        try:
            malice_link_list.append(data_dict["url"])
        except Exception as err:
            logger.error(
                "%s: publish_domain_regex: do_get_malice_link: data format error %s %s"
                % (module_name, str(data_dict), err))
    dump_urls(malice_link_list, output_path)
def load_cluster_data(file_path):
    """
    Load clustering results from a JSON-lines file
    :param file_path: input path
    :return: clustering results as a list of clusters
    """
    cluster = []
    try:
        with open(file_path, 'r') as fd:
            for line in fd:
                cluster.append(json.loads(line.strip()))
        logger.info("%s: cluster data has been loaded\t%s"
                    % (module_name, file_path))
    except Exception as err:
        logger.error("%s: cluster data load error %s %s"
                     % (module_name, file_path, str(err)))
        sys.exit(SYSTME_ERROR_CODE)
    # Each line is a single-key dict {index: cluster}; keep only the cluster.
    # list() is required because dict.keys() is not indexable in Python 3.
    cluster = [_[list(_.keys())[0]] for _ in cluster]
    return cluster
def dump_cluster_data(file_path, cluster_list):
    """
    Save clustering results to a JSON-lines file
    :param file_path: output path
    :param cluster_list: clustering results as a list of clusters
    :return:
    """
    if os.path.isfile(file_path):
        os.remove(file_path)
    try:
        with open(file_path, "w") as fd:
            for index, cluster in enumerate(cluster_list):
                fd.write(json.dumps({index: cluster}) + '\n')
        logger.info("%s: cluster data has been dumped\t%s"
                    % (module_name, file_path))
    except Exception as err:
        logger.error("%s: cluster data dump error %s %s"
                     % (module_name, file_path, str(err)))
        sys.exit(SYSTME_ERROR_CODE)
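# File format sketch for load_cluster_data/dump_cluster_data: one JSON object
# per line, keyed by the cluster index (json serializes the int keys as
# strings), e.g. with hypothetical domains:
#   {"0": ["a1.evil.example", "a2.evil.example"]}
#   {"1": ["b1.bad.example"]}
# load_cluster_data strips the index keys and returns just the domain lists.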
def dump_regex_list(regex_list, file_path):
    """
    Save a list of regular expressions, one per line
    :param regex_list: list of regular expressions
    :param file_path: output path
    :return:
    """
    if os.path.isfile(file_path):
        os.remove(file_path)
    try:
        with open(file_path, 'w') as fd:
            for regex in regex_list:
                fd.write(regex + '\n')
        logger.info("%s: regex data has been dumped\t%s"
                    % (module_name, file_path))
    except Exception as err:
        logger.error("%s: regex dump error %s %s"
                     % (module_name, file_path, str(err)))
        sys.exit(SYSTME_ERROR_CODE)
def do_url_preprocess(self, url_list):
    """
    URL preprocessing for training
    :param url_list: list of raw URLs
    :return: deduplicated list of extracted domains
    """
    domain_list = list()
    st_time = time.time()
    for url in url_list:
        try:
            domain_list.append(self._core_preprocess(url))
        except Exception as err:
            logger.error("%s: preprocess: preprocess url %s error %s"
                         % (module_name, url, str(err)))
    # Drop empty results and deduplicate
    domain_list = list(set([domain for domain in domain_list if domain]))
    end_time = time.time()
    logger.info(
        "%s: [statistic] url preprocess time cost:%f\tdomain count:%d"
        % (module_name, (end_time - st_time), len(domain_list)))
    return domain_list
def _core_distance_check(self, domain_0, domain_1):
    """
    Decide whether two domain strings are similar
    :param domain_0: base domain string to compare
    :param domain_1: domain string to compare against the base
    :return: True or False, whether domain_1 is similar to domain_0
    """
    if not isinstance(domain_0, str) or not isinstance(domain_1, str):
        logger.error(
            "%s: _core_distance_check: domain_0/domain_1 should be str, not %s %s"
            % (module_name, str(type(domain_0)), str(type(domain_1))))
        return False
    distance_thresh = 0
    url_length = len(domain_0)
    # Domains shorter than SHORT_URL_THRESH are too short to compare reliably
    if url_length < SHORT_URL_THRESH:
        return False
    if SHORT_URL_THRESH <= url_length < LONG_URL_THRESH:
        distance_thresh = int(url_length * EDIT_DISTANCE_THRESH_SHORT)
    if LONG_URL_THRESH <= url_length:
        distance_thresh = int(url_length * EDIT_DISTANCE_THRESH_LONG)
    if ls.distance(domain_0, domain_1) <= distance_thresh:
        return True
    return False
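# Worked example of the thresholding above, assuming hypothetical values
# SHORT_URL_THRESH = 10, LONG_URL_THRESH = 30, EDIT_DISTANCE_THRESH_SHORT = 0.2:
# a 15-char domain_0 yields distance_thresh = int(15 * 0.2) = 3, so domain_1
# is considered similar iff its Levenshtein distance to domain_0 is at most 3.
# Anything shorter than SHORT_URL_THRESH is rejected outright.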
def build_domain_token_tree(self, domain_list):
    """
    Build the domain token tree
    :param domain_list: list of domains
    :return: token tree
    """
    # Filter the domain list
    domain_list = self.__filter_domain_list(domain_list)
    # Run token frequency analysis
    token_dict, token_tree = self.domain_token_analyze(domain_list)
    for level in token_dict:
        # Extract a regex for token lists from which no high-frequency
        # token could be drawn
        if level not in token_tree.keys():
            token_list = token_dict[level]
            score_list, regex_list = list(), list()
            for sample_round in range(DOMAIN_TOKEN_SAMPLE_ROUND):
                sample_num = int(len(token_list) * DOMAIN_TOKEN_SAMPLE_RATIO)
                if sample_num > DOMAIN_TOKEN_SAMPLE_UPBOUND:
                    sample_num = DOMAIN_TOKEN_SAMPLE_UPBOUND
                if sample_num <= DOMAIN_TOKEN_SAMPLE_LOWBOUND:
                    sample_num = len(token_list)
                token_sample = random.sample(token_list, sample_num)
                try:
                    regex = self.string_regex_extract(token_sample)
                except Exception as err:
                    # Fall back to a length-bounded wildcard over the sample
                    regex = r"[^\.]{%d,%d}" \
                        % (min([len(token) for token in token_sample]),
                           max([len(token) for token in token_sample]))
                    logger.error("%s: regex: build_domain_token_tree: %s"
                                 % (module_name, str(err)))
                regex_list.append(regex)
                # Score the candidate by how many tokens it matches
                score_list.append(sum([self.__domain_token_regex_match(regex, token)
                                       for token in token_list]))
            # Keep the best-scoring candidate regex for this level
            max_score_index = score_list.index(max(score_list))
            regex = regex_list[max_score_index]
            token_tree[level].append(regex)
    return token_tree
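# Fallback regex sketch for the sampling loop above: when string_regex_extract
# raises, the sampled tokens are covered by a length-bounded wildcard. For a
# hypothetical sample ["abc", "abcdef"] the fallback is r"[^\.]{3,6}", which
# matches any dot-free token of 3 to 6 characters; each round's candidate is
# then scored by how many tokens in the full token_list it matches, and the
# best-scoring round wins.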