Example 1
    def init(self, system, subclass):
        conn = self.data_processor.connect_db(self.conf.db_host,
                                              self.conf.db_database,
                                              self.conf.db_user,
                                              self.conf.db_pass)
        # Load the dictionary and the TF-IDF model
        t = time.time()

        logger.debug("正在初始化[%s-%s]的模型加载", system, subclass)

        dic_name = "dictionary_" + system + "_" + subclass + ".dic"
        dictionary = Dictionary.load(self.model_dir + "/" + dic_name)
        logger.debug("加载了字典:%s", dic_name)
        logger.debug("词袋一共%d个词", len(dictionary.keys()))

        model_name = "tfidf_" + system + "_" + subclass + ".model"
        model = TfidfModel.load(self.model_dir + "/" + model_name)
        logger.debug("加载了TFIDF模型:%s", model_name)

        # Parameterized query to avoid SQL injection (the exact paramstyle depends on the DB driver)
        df_train = pd.read_sql(
            "select * from monitor_cluster_dbscan "
            "where business_system_code=%(system)s and rule_type_code=%(subclass)s",
            conn, params={"system": system, "subclass": subclass})

        # Fit a KNN model on the clustered training data, to be used for prediction
        knn = self.get_KNN_model(df_train, dictionary, model)
        duration(t, "Trained a KNN model on TF-IDF vectors from the dictionary and this class's data")

        if knn is not None:
            key = system + "-" + subclass
            value = {'model': model, 'dictionary': dictionary, 'knn': knn}
            self.models[key] = value
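
get_KNN_model is not shown in these examples. A minimal sketch of what fitting a KNN classifier on the TF-IDF vectors of the clustered rows might look like, assuming the html_cut and classes columns written by Example 2 below; the function name and the n_neighbors default are placeholders:

    from sklearn.neighbors import KNeighborsClassifier
    from gensim.matutils import corpus2dense

    def get_knn_model_sketch(df_train, dictionary, tfidf_model, n_neighbors=5):
        """Hypothetical sketch: fit a KNN classifier on TF-IDF vectors of clustered rows."""
        if df_train.empty:
            return None
        # The cut text is space-separated (see clean_process); turn it into bag-of-words vectors
        bows = [dictionary.doc2bow(text.split()) for text in df_train['html_cut']]
        # Dense TF-IDF matrix: one row per document, one column per dictionary token
        x = corpus2dense(tfidf_model[bows], num_terms=len(dictionary)).T
        knn = KNeighborsClassifier(n_neighbors=n_neighbors)
        knn.fit(x, df_train['classes'])
        return knn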
Example 2
    def process(self, con, df):
        big_classes = self.data_processor.get_big_class(con)

        # logger.debug(df.columns)
        for _, each_big_class in big_classes.iterrows():
            system = each_big_class['business_system_code']
            big_class = each_big_class['rule_type_code']

            t = time.time()

            # logger.debug("system:%s,big_class:%s",system,big_class)
            # Copy the slice so the 'classes' assignment below does not trigger SettingWithCopyWarning
            sub_df = df[(df.business_system_code == system)
                        & (df.rule_type_code == big_class)].copy()

            corpus_vector, dictionary, model = self.data_processor.caculate_tfidf(
                sub_df)

            dic_name = "dictionary_" + system + "_" + big_class + ".dic"
            dictionary.save(self.model_dir + "/" + dic_name)
            logger.debug("保存词表:%s", dic_name)

            model_name = "tfidf_" + system + "_" + big_class + ".model"
            model.save(self.model_dir + "/" + model_name)
            logger.debug("保存TFIDF模型:%s", model_name)

            t = duration(t, "完成转化语料为tfidf向量!")

            if corpus_vector.shape[0] == 0:
                logger.error("这里一类(%s-%s)没有语料,忽略", system, big_class)
                continue

            classes, dbscan, _ = self.DBSCAN_analysis(corpus_vector)
            logger.debug("%s-%s完成DBSCAN聚类", system, big_class)
            t = duration(t, "DBSCAN聚类")

            sub_df['classes'] = classes
            # logger.debug("DBSCAN聚类 lables:%r",dbscan.labels_)
            # logger.debug("DBSCAN聚类 core_sample_indices_:%r",dbscan.core_sample_indices_)
            # logger.debug("DBSCAN聚类 get_params:%r",dbscan.get_params())
            sub_df.to_sql("monitor_cluster_dbscan",
                          con,
                          if_exists='append',
                          chunksize=100)
            # sub_df.sort_values(['classes'])

            # Save the clusters discovered for this big class ==> monitor_classes
            self.generate_class_title(sub_df, system, big_class, con)
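
DBSCAN_analysis is not shown either. A minimal sketch of how it might cluster the TF-IDF vectors with scikit-learn, assuming it returns the labels, the fitted estimator, and a third value; eps, min_samples, and the cosine metric are placeholder choices:

    from sklearn.cluster import DBSCAN

    def dbscan_analysis_sketch(corpus_vector, eps=0.5, min_samples=5):
        """Hypothetical sketch: density-based clustering of TF-IDF row vectors."""
        dbscan = DBSCAN(eps=eps, min_samples=min_samples, metric='cosine')
        classes = dbscan.fit_predict(corpus_vector)  # label -1 marks noise points
        return classes, dbscan, corpus_vector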
Example 3
    def predict(self, test_data, system, subclass):

        key = system + "-" + subclass
        logger.debug("预测开始,系统-大类为:%s", key)
        _tfidf_dictionary = self.models.get(key, None)

        if _tfidf_dictionary is None:
            logger.error("无法找到对应的tfidf和字典,KNN")
            return None

        model = _tfidf_dictionary.get("model")
        dictionary = _tfidf_dictionary.get("dictionary")
        knn = _tfidf_dictionary.get("knn")

        if model is None:
            logger.error("TF-IDF model not found")
            return None
        if dictionary is None:
            logger.error("Dictionary not found")
            return None
        if knn is None:
            logger.error("KNN model not found")
            return None

        t = time.time()
        # Produce the cleaned corpus: HTML tags stripped, digits/English and stop words removed, space-separated
        test_data = self.data_processor.process_line(test_data)
        logger.debug("对邮件进行完了处理:%s", test_data)

        x_test = self.data_processor.get_tfidf_vector([test_data], dictionary,
                                                      model)
        t = duration(t, "加载测试数据")

        # Run the actual prediction
        pred = knn.predict(x_test)
        t = duration(t, "预测结果")
        logger.debug("预测结果:")
        logger.debug(pred)

        return pred[0]
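
Taken together with init from Example 1, a hedged usage sketch; the classifier instance, the key values, and raw_email_html are placeholders, and init must have cached the models for this key first:

    classifier.init("sysA", "ruleX")              # caches dictionary, TF-IDF model and KNN
    label = classifier.predict(raw_email_html, "sysA", "ruleX")
    if label is not None:
        print("Predicted cluster label:", label)  # a DBSCAN class id assigned at training time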
Example 4
    def clean_process(self, df):
        t = time.time()

        pb = progressbar.ProgressBar(len(df))
        pb.start()

        MAX_LENGTH = 10000
        for index, row in df.iterrows():

            cut_text = self.process_line(row['email_html'])

            if len(cut_text) > MAX_LENGTH:
                cut_text = cut_text[:MAX_LENGTH]
            df.loc[index, 'html_cut'] = cut_text

            # Walk every column of the row and truncate values that exceed MAX_LENGTH

            for k, v in row.items():
                # Skip non-string values; len() would fail on None/NaN
                if not isinstance(v, str):
                    continue
                if len(v) > MAX_LENGTH:
                    logger.warning("Column [%r] value length [%d] exceeds [%d], truncating",
                                   k, len(v), MAX_LENGTH)
                    # row[k] = v[0:MAX_LENGTH]
                    df.loc[index, k] = v[0:MAX_LENGTH]

            # Progress bar
            try:
                pb.update(index)
            except ValueError:
                pass
        pb.finish()
        duration(t, "清洗数据:去除HTML tag,去除无用字符")

        return df
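
process_line is not shown in these examples. A minimal sketch of the cleaning it presumably performs, per the comment in Example 3 (strip HTML tags, drop digits and English text, filter stop words, join tokens with spaces); the jieba tokenizer and the stop-word set are assumptions:

    import re
    import jieba

    STOP_WORDS = {"的", "了", "是"}  # placeholder stop-word set

    def process_line_sketch(html):
        """Hypothetical sketch: raw email HTML -> space-separated cleaned tokens."""
        text = re.sub(r"<[^>]+>", " ", html or "")   # strip HTML tags
        text = re.sub(r"[A-Za-z0-9]+", " ", text)    # drop English words and digits
        tokens = [w for w in jieba.cut(text) if w.strip() and w not in STOP_WORDS]
        return " ".join(tokens)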
Example 5
    def train(self, db_ip, db_name, user, passwd):

        t = time.time()

        # Load the data
        conn = self.data_processor.connect_db(db_ip, db_name, user, passwd)
        df = self.data_processor.load_data(conn)
        t = duration(t, "从数据库中加载数据")

        # Clean the data: strip HTML tags and stop words from the email bodies, then tokenize
        df = self.data_processor.clean_process(df)

        # Run DBSCAN separately for each big class of each system, saving the results to db_monitor_classes
        self.dbscan.process(conn, df)
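
train is the offline entry point for the whole pipeline. A hedged sketch of invoking it from a script; the class name MailClusterTrainer, the host, database name, and credentials are all placeholders:

    if __name__ == "__main__":
        trainer = MailClusterTrainer()  # hypothetical class owning the train() method above
        trainer.train("127.0.0.1", "monitor_db", "monitor_user", "secret")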