def do_publish_domain_regex(self, input_path):
        """
        Publish the regexes stored in a file to the database table.

        Every regex is keyed by ``self.std_md5(regex)``. When ``self.table.get``
        finds no record for that key, a new record is added with the regex,
        its add/update time and the malware source type; otherwise only the
        ``update_time`` of the existing record is refreshed. A falsy write
        response is logged and processing continues with the next regex.

        :param input_path: path of the regex file, read with ``load_regex_list``
        :return: None; the process exits with status 1 if ``self.connect_db()``
            raises
        """
        regex_list = load_regex_list(input_path)
        try:
            self.connect_db()
        except Exception as err:
            logger.error("%s: publish_domain_regex: do_publish_domain_regex: %s" % (module_name, err))
            sys.exit(1)

        for regex in regex_list:
            data_dict = {"_id": self.std_md5(regex)}
            if not self.table.get(data_dict):
                # New regex: add and update times start out identical.
                now = str(datetime.datetime.now())
                data_dict["regex"] = regex
                data_dict["add_time"] = now
                data_dict["update_time"] = now
                data_dict["source_type"] = malicious_type_malware
                response = self.table.add_data(data_dict)
            else:
                # Known regex: only refresh its update time.
                update_dict = {"update_time": str(datetime.datetime.now())}
                response = self.table.update_data(data_dict, update_dict, False)

            if not response:
                # Adjacent string literals keep the message on one logical
                # line without embedding source indentation in the log text.
                logger.error(
                    "%s: publish_domain_regex: do_publish_domain_regex: "
                    "mongodb write error when try to write data %s"
                    % (module_name, str(data_dict)))

        logger.info("domain regex data has been update")
 def _core_vectorize(self, domain):
     """
     Count the characters of a domain into a (1, ASCII_SIZE) int32 vector.

     Each character whose code point is below ASCII_SIZE increments the
     matching column; other characters are ignored. A non-str input is
     logged and produces the all-zero vector.

     :param domain: domain string to vectorize
     :return: numpy array of shape (1, ASCII_SIZE) holding character counts
     """
     counts = np.zeros((1, ASCII_SIZE), dtype=np.int32)
     if isinstance(domain, str):
         for code in map(ord, domain):
             if code < ASCII_SIZE:
                 counts[0, code] += 1
     else:
         logger.error("%s: vectorize: input url is not string" % module_name)
     return counts
def string_cluster_subpro(domain_list, job_index):
    """
    Worker entry point that runs string clustering on one domain cluster.

    :param domain_list: domain cluster to process
    :param job_index: index of the worker sub-process
    :return: result of ``StringClustering.do_string_distance_cluster``, or an
        empty list when that call raises (the error is logged)
    """
    clusterer = StringClustering()
    try:
        return clusterer.do_string_distance_cluster(domain_list, job_index)
    except Exception as err:
        logger.error("%s: string_cluster_subpro: %s" % (module_name, str(err)))
        return list()
Example #4
0
def load_urls(file_path):
    """
    Load URL data from a CSV file.

    :param file_path: path of the CSV file to read
    :return: list built from the ``url`` column; on any error the failure is
        logged and the process exits with SYSTME_ERROR_CODE
    """
    try:
        url_column = pd.read_csv(file_path).url
        loaded = list(url_column)
        logger.info("%s: urls has been load\t%s" % (module_name, file_path))
    except Exception as err:
        failure = (module_name, file_path, str(err))
        logger.error("%s: urls load error %s %s" % failure)
        sys.exit(SYSTME_ERROR_CODE)
    return loaded
 def _core_preprocess(self, url):
     """
     Reduce a URL to its hostname when the hostname is a usable sample.

     The hostname is kept only if it passes ``UrlPreprocess.check_domain``,
     is not an IP according to ``UrlPreprocess.check_ip``, and tldextract
     reports both a primary domain and a sub-domain other than "www".

     :param url: URL string to preprocess
     :return: the hostname, or "" when the URL is rejected or is not a str
     """
     res = ""
     if not isinstance(url, str):
         # Logged under the "preprocess" tag, matching do_url_preprocess,
         # rather than the "vectorize" tag that belongs to _core_vectorize.
         logger.error("%s: preprocess: input url is not string" % module_name)
         return res
     hostname = UrlNormalize(url).get_hostname()
     if UrlPreprocess.check_domain(
             hostname) and not UrlPreprocess.check_ip(hostname):
         tld_obj = tldextract.extract(hostname)
         primary_domain = tld_obj.domain
         sub_domain = tld_obj.subdomain
         # Bare domains and plain "www." hosts are dropped.
         if primary_domain and sub_domain and sub_domain != "www":
             res = hostname
     return res
Example #6
0
def dump_urls(urls, file_path):
    """
    Save URL data to a CSV file with a single ``url`` column.

    :param urls: list of URLs to save
    :param file_path: path of the CSV file to write
    :return: None; on a write error the failure is logged and the process
        exits with SYSTME_ERROR_CODE
    """
    # Drop any previous output file before writing the new one.
    if os.path.isfile(file_path):
        os.remove(file_path)
    try:
        pd.DataFrame({"url": urls}).to_csv(file_path, index=False)
        logger.info("%s: urls has been dump\t%s" % (module_name, file_path))
    except Exception as err:
        failure = (module_name, file_path, str(err))
        logger.error("%s: urls dump error %s %s" % failure)
        sys.exit(SYSTME_ERROR_CODE)
Example #7
0
def load_regex_list(file_path):
    """
    Load a list of regular expressions, one per line.

    Surrounding whitespace is stripped from every line and blank lines are
    skipped.

    :param file_path: path of the file to read
    :return: list of regex strings; on any error the failure is logged and
        the process exits with SYSTME_ERROR_CODE
    """
    regex = []
    try:
        with open(file_path, 'r') as fd:
            for line in fd:
                pattern = line.strip()
                # A blank line would yield an empty regex, which matches
                # every string, so it must not reach the callers.
                if pattern:
                    regex.append(pattern)
        logger.info("%s: regex data has been load\t%s" %
                    (module_name, file_path))
    except Exception as err:
        logger.error("%s: regex load error %s %s" %
                     (module_name, file_path, str(err)))
        sys.exit(SYSTME_ERROR_CODE)
    return regex
Example #8
0
 def do_get_malice_link(self, output_path):
     """
     Fetch malicious links from the database and dump their URLs to a CSV file.

     Calls ``self.connect_db()`` and ``self.get_malice_link()``, collects the
     "url" field of every record in ``self.data[0]`` and writes the resulting
     list with ``dump_urls``. Records whose "url" cannot be read are logged
     and skipped.

     :param output_path: path of the CSV file to write
     :return: None; the process exits with status 1 if the database step raises
     """
     try:
         self.connect_db()
         self.get_malice_link()
     except Exception as err:
         # Log under this method's own name so failures are attributed here.
         logger.error("%s: do_get_malice_link: %s" % (module_name, err))
         sys.exit(1)
     malice_link_list = list()
     # NOTE(review): presumably get_malice_link() fills self.data and its
     # first element is the record list - confirm against that method.
     for data_dict in self.data[0]:
         try:
             malice_link_list.append(data_dict[u"url"])
         except Exception as err:
             logger.error(
                 "%s: do_get_malice_link: data format error %s %s"
                 % (module_name, str(data_dict), err))
     dump_urls(malice_link_list, output_path)
Example #9
0
def load_cluster_data(file_path):
    """
    Load clustering results stored as one JSON object per line.

    Every line is parsed as a JSON object and reduced to the value of its
    first key.

    :param file_path: path of the file to read
    :return: list with the first value of each line's JSON object; on a
        read or parse error the failure is logged and the process exits
        with SYSTME_ERROR_CODE
    """
    cluster = []
    try:
        with open(file_path, 'r') as fd:
            for line in fd:
                cluster.append(json.loads(line.strip()))
        logger.info("%s: cluster data has been load\t%s" %
                    (module_name, file_path))
    except Exception as err:
        logger.error("%s: cluster data load error %s %s" %
                     (module_name, file_path, str(err)))
        sys.exit(SYSTME_ERROR_CODE)
    # dict views are not subscriptable on Python 3, so materialize the values
    # before taking the first one; this also works on Python 2.
    cluster = [list(entry.values())[0] for entry in cluster]
    return cluster
Example #10
0
def dump_cluster_data(file_path, cluster_list):
    """
    Save clustering results as one JSON object per line.

    Each cluster is written as ``{index: cluster}``, where index is its
    position in ``cluster_list``.

    :param file_path: path of the file to write
    :param cluster_list: list of clusters to save
    :return: None; on a write error the failure is logged and the process
        exits with SYSTME_ERROR_CODE
    """
    # Drop any previous output file before writing the new one.
    if os.path.isfile(file_path):
        os.remove(file_path)
    try:
        with open(file_path, "w") as out:
            out.writelines(
                json.dumps({position: members}) + '\n'
                for position, members in enumerate(cluster_list))
        logger.info("%s: cluster data has been dump\t%s" %
                    (module_name, file_path))
    except Exception as err:
        failure = (module_name, file_path, str(err))
        logger.error("%s: cluster data dump error %s %s" % failure)
        sys.exit(SYSTME_ERROR_CODE)
Example #11
0
def dump_regex_list(regex_list, file_path):
    """
    Save a list of regular expressions, one per line.

    :param regex_list: list of regex strings to save
    :param file_path: path of the file to write
    :return: None; on a write error the failure is logged and the process
        exits with SYSTME_ERROR_CODE
    """
    # Drop any previous output file before writing the new one.
    if os.path.isfile(file_path):
        os.remove(file_path)
    try:
        with open(file_path, 'w') as out:
            out.writelines(pattern + '\n' for pattern in regex_list)
        logger.info("%s: regex data has been dump\t%s" %
                    (module_name, file_path))
    except Exception as err:
        failure = (module_name, file_path, str(err))
        logger.error("%s: regex dump error %s %s" % failure)
        sys.exit(SYSTME_ERROR_CODE)
 def do_url_preprocess(self, url_list):
     """
     Preprocess training URLs into a de-duplicated list of domains.

     Every URL goes through ``self._core_preprocess``; a URL that raises is
     logged and skipped. Empty results are dropped and duplicates removed,
     then the elapsed time and domain count are logged.

     :param url_list: iterable of URLs to preprocess
     :return: list of unique, non-empty domains (order not defined)
     """
     started = time.time()
     hostnames = list()
     for url in url_list:
         try:
             hostnames.append(self._core_preprocess(url))
         except Exception as err:
             logger.error("%s: preprocess: preprocess url %s error %s" %
                          (module_name, url, str(err)))
     unique_domains = list(set(filter(None, hostnames)))
     elapsed = time.time() - started
     logger.info(
         "%s: [statistic] url preprocess time cost:%f\tdomain count:%d" %
         (module_name, elapsed, len(unique_domains)))
     return unique_domains
    def _core_distance_check(self, domain_0, domain_1):
        """
        Decide whether two domain strings are similar by edit distance.

        The allowed distance scales with the length of ``domain_0``: lengths
        below SHORT_URL_THRESH are never similar, lengths below
        LONG_URL_THRESH use EDIT_DISTANCE_THRESH_SHORT as the ratio, and
        longer ones use EDIT_DISTANCE_THRESH_LONG.

        :param domain_0: base domain string to compare
        :param domain_1: second domain string, compared against the base
        :return: True if ``ls.distance(domain_0, domain_1)`` is within the
            allowed distance, otherwise False (also for non-str input)
        """
        if not isinstance(domain_0, str) or not isinstance(domain_1, str):
            logger.error(
                "%s:_core_distance_check: domain_0/domain_1 should be str str not %s %s"
                % (module_name, str(type(domain_0)), str(type(domain_1))))
            # Non-string input cannot be measured; report "not similar"
            # instead of failing later in len() or ls.distance().
            return False

        url_length = len(domain_0)
        if url_length < SHORT_URL_THRESH:
            return False
        if url_length < LONG_URL_THRESH:
            ratio = EDIT_DISTANCE_THRESH_SHORT
        else:
            ratio = EDIT_DISTANCE_THRESH_LONG
        distance_thresh = int(url_length * ratio)
        # NOTE(review): ls is presumably the Levenshtein module - confirm
        # against the file's imports.
        return ls.distance(domain_0, domain_1) <= distance_thresh
Example #14
0
    def build_domain_token_tree(self, domain_list):
        """
        Build the domain token tree for a list of domains.

        The domains are filtered and analysed with ``domain_token_analyze``.
        For every level of the token dict that has no entry in the token
        tree, a regex is extracted from DOMAIN_TOKEN_SAMPLE_ROUND random
        samples of that level's tokens, and the regex whose summed
        ``__domain_token_regex_match`` score over all tokens of the level is
        highest is appended to the tree.

        :param domain_list: list of domains to analyse
        :return: the token tree returned by ``domain_token_analyze``, extended
            with the extracted regexes
        """
        # Filter the domain list.
        domain_list = self.__filter_domain_list(domain_list)
        # Run the token statistics.
        token_dict, token_tree = self.domain_token_analyze(domain_list)
        for level in token_dict:
            # Only levels without an entry in the tree (no high-frequency
            # token could be extracted) need a regex.
            if level in token_tree:
                continue
            token_list = token_dict[level]

            # The sample size is the same for every round, so compute it once.
            sample_num = int(len(token_list) * DOMAIN_TOKEN_SAMPLE_RATIO)
            if sample_num > DOMAIN_TOKEN_SAMPLE_UPBOUND:
                sample_num = DOMAIN_TOKEN_SAMPLE_UPBOUND
            if sample_num <= DOMAIN_TOKEN_SAMPLE_LOWBOUND:
                # Too few tokens to sub-sample: use all of them.
                sample_num = len(token_list)

            score_list, regex_list = list(), list()
            for _ in range(DOMAIN_TOKEN_SAMPLE_ROUND):
                token_sample = random.sample(token_list, sample_num)

                try:
                    regex = self.string_regex_extract(token_sample)
                except Exception as err:
                    # Fall back to a length-bounded "anything but a dot"
                    # pattern. The raw string keeps the same pattern text
                    # while avoiding the invalid "\." escape sequence.
                    lengths = [len(token) for token in token_sample]
                    regex = r"[^\.]{%d,%d}" % (min(lengths), max(lengths))
                    logger.error("%s: regex: build_domain_token_tree: %s" % (module_name, str(err)))

                regex_list.append(regex)
                score_list.append(sum(
                    self.__domain_token_regex_match(regex, token)
                    for token in token_list))
            max_score_index = score_list.index(max(score_list))
            # NOTE(review): level is absent from token_tree here, so this
            # relies on token_tree creating missing entries on lookup
            # (e.g. a defaultdict(list)) - confirm in domain_token_analyze.
            token_tree[level].append(regex_list[max_score_index])
        return token_tree