def do_string_distance_cluster(self, domain_list, job_index):
        """
        Refine each coarse-grained (k-means) cluster with string-similarity
        clustering.

        :param domain_list: list of domains to cluster
        :param job_index: worker (sub-process) index, used in progress logs
        :return: list of clusters, each a list of mutually similar domains
        """
        if not isinstance(domain_list, list):
            raise ValueError(
                "do_string_distance_cluster: domain_list should be list not %s"
                % str(type(domain_list)))

        clusters = list()
        assigned = set()
        total = len(domain_list)
        # Greedy single pass: each still-unassigned domain seeds a new
        # cluster and absorbs every later unassigned domain that passes
        # the distance check against the seed.
        for pos, seed in enumerate(domain_list):
            if seed in assigned:
                continue
            assigned.add(seed)
            members = [seed]
            for candidate in domain_list[pos + 1:]:
                if candidate in assigned:
                    continue
                if self._core_distance_check(seed, candidate):
                    assigned.add(candidate)
                    members.append(candidate)
            clusters.append(members)
            # Periodic progress report for this worker.
            if pos % CLUSTER_REPORT_FQ == 0:
                logger.info("batch:%d %d/%d" % (job_index, pos, total))

        return clusters
    def do_publish_domain_regex(self, input_path):
        """
        Publish regex rules from a file into the database: insert regexes
        that are new and refresh the update timestamp of existing ones.

        :param input_path: path of the regex file to load
        :return: None; exits the process if the DB connection fails
        """
        regex_list = load_regex_list(input_path)
        try:
            self.connect_db()
        except Exception as err:
            logger.error("%s: publish_domain_regex: do_publish_domain_regex: %s" % (module_name, err))
            sys.exit(1)

        for regex in regex_list:
            record = dict()
            # Documents are keyed by the md5 of the regex text.
            record["_id"] = self.std_md5(regex)
            if self.table.get(record):
                # Existing regex: only bump its update timestamp.
                response = self.table.update_data(
                    record, {"update_time": str(datetime.datetime.now())}, False)
            else:
                # New regex: insert a full document.
                timestamp = str(datetime.datetime.now())
                record["regex"] = regex
                record["add_time"] = timestamp
                record["update_time"] = timestamp
                record["source_type"] = malicious_type_malware
                response = self.table.add_data(record)

            if not response:
                logger.error("%s: publish_domain_regex: do_publish_domain_regex: \
                mongodb write error when try to write data %s" % (module_name, str(record)))

        logger.info("domain regex data has been update")
    def make_kmeans_cluster(self, domain_list, n_cluster):
        """
        Vectorize the domains and partition them into n_cluster groups
        using k-means.

        :param domain_list: list of domain strings
        :param n_cluster: number of clusters to produce
        :return: list of clusters, each a list of domain strings
        :raises ValueError: if domain_list is not a list
        """
        # normalize input
        if not isinstance(domain_list, list):
            raise ValueError(
                "make_kmeans_cluster: input should be list not %s" %
                (str(type(domain_list))))

        # One cluster (or fewer) requires no clustering at all.
        if n_cluster <= 1:
            return [domain_list]

        vectorizer = DomainVectorize()
        feature_df = vectorizer.do_vectorize(domain_list)

        # Fit k-means on the feature matrix (fixed seed for reproducibility).
        model = KMeans(n_clusters=n_cluster,
                       verbose=0,
                       random_state=0,
                       n_jobs=N_JOBS_CLUSTERING)
        model.fit(feature_df.values)
        feature_df["labels"] = model.labels_

        # Group the domains (the DataFrame index) by their cluster label.
        clusters = [list(feature_df.loc[feature_df["labels"] == label].index)
                    for label in range(n_cluster)]
        logger.info("k-means cluster done!")
        return clusters
# Example #4
def domain_regex_extract():
    """
    End-to-end pipeline: load malicious links, preprocess them into
    domains, cluster the domains, then extract domain regexes.
    """
    # Resolve the paths inside the work directory.
    train_urls_file = os.path.join(WORK_PATH, "train_urls.csv")
    cluster_file = os.path.join(WORK_PATH, "cluster_distance.json")
    domain_regex_file = os.path.join(WORK_PATH, "domain_regex.txt")

    if not os.path.isdir(WORK_PATH):
        os.makedirs(WORK_PATH)
        logger.info("mkdir %s" % WORK_PATH)

    # Fetch malicious links.
    # NOTE(review): the live data-fetching step is disabled and a local
    # sample file is read instead — confirm before production use.
#    get_data_obj = GetMaliceLink()
#    get_data_obj.do_get_malice_link(train_urls_file)
#    malice_link_list = load_urls(train_urls_file)
#    logger.info("%s: main: malice_link get" % (module_name))
    malice_link_list = load_urls("../std_data/mal.csv")

    # Preprocess URLs into deduplicated domains.
    preprocessor = UrlPreprocess()
    malice_domain_list = preprocessor.do_url_preprocess(malice_link_list)
    logger.info("%s: main: preprocess complete" % (module_name))

    # Cluster the domains.
    clusterer = DomainCluster()
    clusterer.do_make_domain_clustering(malice_domain_list, cluster_file)
    logger.info("%s: main: clustering complete" % (module_name))

    # Extract regexes from the cluster file.
    extractor = DomainRegexExtract()
    extractor.do_domain_regex_extract(cluster_file, domain_regex_file)
    logger.info("%s: main: regex extract complete" % (module_name))
# Example #5
def load_urls(file_path):
    """
    Load URLs from a csv file (expects a "url" column).

    :param file_path: csv file to read
    :return: list of URLs; exits the process on any read/parse failure
    """
    try:
        frame = pd.read_csv(file_path)
        url_list = list(frame["url"])
        logger.info("%s: urls has been load\t%s" % (module_name, file_path))
        return url_list
    except Exception as err:
        logger.error("%s: urls load error %s %s" %
                     (module_name, file_path, str(err)))
        sys.exit(SYSTME_ERROR_CODE)
# Example #6
 def build_domain_regex(self, token_tree):
     """
     Assemble a token tree into an anchored domain regular expression.

     Levels are concatenated from the last tree level down to the first,
     joined by an escaped dot, and wrapped with ^...$ anchors.

     :param token_tree: list of levels, each a collection of token regexes
     :return: regex string matching the clustered domains
     """
     token_regex_list = []
     # Walk the levels in reverse so the joined regex reads left-to-right.
     for level in range(len(token_tree) - 1, -1, -1):
         token_regex = "|".join(token_tree[level])
         if len(token_tree[level]) == 1:
             # A single token needs no grouping.
             token_regex_list.append(token_regex)
         else:
             # Bug fix: "(:?" was a capturing group beginning with an
             # optional literal colon; "(?:" is the intended
             # non-capturing group around the alternation.
             token_regex_list.append("".join(["(?:", token_regex, ")"]))
     domain_regex = r"\.".join(token_regex_list)
     domain_regex = "".join(["^", domain_regex, "$"])
     logger.info("%s: raw regex %s" % (module_name, domain_regex))
     return domain_regex
# Example #7
def dump_urls(urls, file_path):
    """
    Persist a list of URLs as a single-column csv file.

    :param urls: URL list to save
    :param file_path: destination path (an existing file is replaced)
    :return: None; exits the process on write failure
    """
    # Remove any stale output first.
    if os.path.isfile(file_path):
        os.remove(file_path)
    try:
        pd.DataFrame({"url": urls}).to_csv(file_path, index=False)
        logger.info("%s: urls has been dump\t%s" % (module_name, file_path))
    except Exception as err:
        logger.error("%s: urls dump error %s %s" %
                     (module_name, file_path, str(err)))
        sys.exit(SYSTME_ERROR_CODE)
# Example #8
def load_regex_list(file_path):
    """
    Load a regex list from a text file, one expression per line.

    :param file_path: file to read
    :return: list of regex strings; exits the process on read failure
    """
    regex = []
    try:
        with open(file_path, 'r') as handle:
            regex = [line.strip() for line in handle]
        logger.info("%s: regex data has been load\t%s" %
                    (module_name, file_path))
    except Exception as err:
        logger.error("%s: regex load error %s %s" %
                     (module_name, file_path, str(err)))
        sys.exit(SYSTME_ERROR_CODE)
    return regex
# Example #9
def load_cluster_data(file_path):
    """
    Load clustering results from a json-lines file.

    Each line holds a single-entry dict {cluster_index: [domains...]} as
    written by dump_cluster_data; only the values (the clusters) are
    returned.

    :param file_path: file to read
    :return: list of clusters, each a list of domains; exits on failure
    """
    cluster = []
    try:
        with open(file_path, 'r') as fd:
            for line in fd:
                cluster.append(json.loads(line.strip()))
        logger.info("%s: cluster data has been load\t%s" %
                    (module_name, file_path))
    except Exception as err:
        logger.error("%s: cluster data load error %s %s" %
                     (module_name, file_path, str(err)))
        sys.exit(SYSTME_ERROR_CODE)
    # Bug fix: dict.keys() is a view on Python 3 and cannot be indexed;
    # take the single entry's value directly (each line has exactly one key).
    cluster = [next(iter(record.values())) for record in cluster]
    return cluster
# Example #10
def dump_cluster_data(file_path, cluster_list):
    """
    Persist clustering results as a json-lines file, one cluster per
    line in the form {cluster_index: [domains...]}.

    :param file_path: destination path (an existing file is replaced)
    :param cluster_list: list of clusters to save
    :return: None; exits the process on write failure
    """
    if os.path.isfile(file_path):
        os.remove(file_path)
    try:
        with open(file_path, "w") as handle:
            handle.writelines(
                json.dumps({pos: members}) + '\n'
                for pos, members in enumerate(cluster_list))
        logger.info("%s: cluster data has been dump\t%s" %
                    (module_name, file_path))
    except Exception as err:
        logger.error("%s: cluster data dump error %s %s" %
                     (module_name, file_path, str(err)))
        sys.exit(SYSTME_ERROR_CODE)
# Example #11
def dump_regex_list(regex_list, file_path):
    """
    Persist a regex list to a text file, one expression per line.

    :param regex_list: regex strings to save
    :param file_path: destination path (an existing file is replaced)
    :return: None; exits the process on write failure
    """
    if os.path.isfile(file_path):
        os.remove(file_path)
    try:
        with open(file_path, 'w') as handle:
            for pattern in regex_list:
                handle.write(pattern + '\n')
        logger.info("%s: regex data has been dump\t%s" %
                    (module_name, file_path))
    except Exception as err:
        logger.error("%s: regex dump error %s %s" %
                     (module_name, file_path, str(err)))
        sys.exit(SYSTME_ERROR_CODE)
    def do_vectorize(self, domain_list):
        """
        Turn a list of domains into a feature DataFrame indexed by domain.

        :param domain_list: list of domain strings
        :return: pandas DataFrame with one feature row per domain
        :raises ValueError: if domain_list is not a list
        """
        if not isinstance(domain_list, list):
            raise ValueError("input must list not %s" % str(type(domain_list)))

        # Vectorize each domain and stack the rows into one matrix.
        vectors = [self._core_vectorize(domain) for domain in domain_list]
        matrix = np.concatenate(vectors, axis=0)

        frame = pd.DataFrame(matrix)
        frame['domain'] = domain_list
        frame = frame.set_index('domain')

        logger.info("%s: vectorize: vectorization complete! data shape:\t%s" %
                    (module_name, str(frame.values.shape)))
        return frame
 def do_url_preprocess(self, url_list):
     """
     Preprocess training URLs into a deduplicated list of domains.

     :param url_list: list of raw URLs
     :return: list of unique, non-empty preprocessed domains
     """
     started = time.time()
     domains = list()
     for url in url_list:
         try:
             domains.append(self._core_preprocess(url))
         except Exception as err:
             # A single malformed URL must not abort the whole batch.
             logger.error("%s: preprocess: preprocess url %s error %s" %
                          (module_name, url, str(err)))
     # Drop empty results and duplicates.
     unique_domains = list(set(domain for domain in domains if domain))
     elapsed = time.time() - started
     logger.info(
         "%s: [statistic] url preprocess time cost:%f\tdomain count:%d" %
         (module_name, elapsed, len(unique_domains)))
     return unique_domains
    def do_make_domain_clustering(self, domain_list, output_path):
        """
        Cluster domains: a coarse k-means pass followed by string-distance
        clustering, writing the result to output_path.

        :param domain_list: list of domain strings to cluster
        :param output_path: file path for the clustering result
        :return: None
        :raises ValueError: if domain_list is not a list
        """
        st_time = time.time()
        if not isinstance(domain_list, list):
            raise ValueError(
                "do_make_domain_clustering: should be list not %s" %
                (str(type(domain_list))))

        # Bug fix: use floor division so the cluster count stays an int on
        # Python 3 ("/" would yield a float, which KMeans n_clusters rejects).
        preliminary_n_cluster = len(domain_list) // KMEANS_SIZE_LIMIT + 1
        cluster_list = self.make_kmeans_cluster(
            domain_list, n_cluster=preliminary_n_cluster)
        logger.info("Preliminary K-means clustering complete")
        # NOTE(review): if _core_check_cluster_size ever returns False this
        # loop spins forever without re-clustering — confirm intended behavior.
        while True:
            if self._core_check_cluster_size(cluster_list):
                break
        self.make_string_distance_cluster(cluster_list, output_path)
        end_time = time.time()
        logger.info("%s: [statistic] domain clustering time cost:%f" %
                    (module_name, (end_time - st_time)))
# Example #15
 def show(self):
     """Log every instance attribute of this object as "name : value"."""
     for attr_name, attr_value in self.__dict__.items():
         logger.info('%s : %s', attr_name, str(attr_value))
# Example #16
    def do_domain_regex_extract(self, input_path, output_path):
        """
        Main entry of domain regex extraction.

        Loads clustering results, logs cluster-size statistics, drops
        clusters below the size threshold, extracts and deduplicates
        regexes, and writes them to output_path.

        :param input_path: path of the clustering-result file
        :param output_path: path for the extracted regex list
        :return: None
        """
        st_time = time.time()
        # Load clusters and log size statistics.
        cluster_list = load_cluster_data(input_path)
        size_list = [len(cluster) for cluster in cluster_list]
        singles = sum(1 for size in size_list if size == 1)
        small = sum(1 for size in size_list
                    if SMALL_CLUSTER_SIZE <= size < BIG_CLUSTER_SIZE)
        big = sum(1 for size in size_list if size >= BIG_CLUSTER_SIZE)
        logger.info("%s: total cluster num:\t%d" % (module_name, len(cluster_list)))
        logger.info("%s: single one:\t%d" % (module_name, singles))
        logger.info("%s: small cluster:\t%d" % (module_name, small))
        logger.info("%s: big cluster:\t%d" % (module_name, big))

        # Drop clusters too small to generalize a regex from.
        cluster_list = [cluster for cluster in cluster_list
                        if len(cluster) >= DOMAIN_CLUSTER_SIZE_THRESH]

        # Build token trees and extract deduplicated regexes.
        token_trees = [self.build_domain_token_tree(cluster)
                       for cluster in cluster_list]
        regex_list = list(set(self.build_domain_regex(tree)
                              for tree in token_trees))
        regex_list = self.domain_regex_deduplicate(regex_list)
        end_time = time.time()

        # Report and persist the result.
        logger.info("%s: extract regex count:\t%d" % (module_name, len(regex_list)))
        logger.info("%s: [statistic] domain regex extract time cost:%f" % (module_name, (end_time - st_time)))
        dump_regex_list(regex_list, output_path)