def init(self, system, subclass):
    conn = self.data_processor.connect_db(self.conf.db_host,
                                          self.conf.db_database,
                                          self.conf.db_user,
                                          self.conf.db_pass)
    # Load the vocabulary and the TF-IDF model for this system/subclass
    t = time.time()
    logger.debug("Initializing the models for [%s-%s]", system, subclass)
    dic_name = "dictionary_" + system + "_" + subclass + ".dic"
    dictionary = Dictionary.load(self.model_dir + "/" + dic_name)
    logger.debug("Loaded dictionary: %s", dic_name)
    logger.debug("The bag of words holds %d tokens", len(dictionary))
    model_name = "tfidf_" + system + "_" + subclass + ".model"
    model = TfidfModel.load(self.model_dir + "/" + model_name)
    logger.debug("Loaded TF-IDF model: %s", model_name)
    # Parameterized query rather than string formatting, to avoid SQL injection
    # (assumes a DB-API driver with format/pyformat paramstyle, e.g. pymysql)
    df_train = pd.read_sql(
        "select * from monitor_cluster_dbscan "
        "where business_system_code=%s and rule_type_code=%s",
        conn, params=(system, subclass))
    # Train a KNN classifier on the DBSCAN-labelled rows; it backs predict() below
    knn = self.get_KNN_model(df_train, dictionary, model)
    duration(t, "Trained a KNN model on the tfidf vectors of this class's data")
    if knn is not None:
        key = system + "-" + subclass
        self.models[key] = {'model': model, 'dictionary': dictionary, 'knn': knn}
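# Hedged sketch: get_KNN_model is implemented elsewhere in this class. The method
# below (_sketch_get_knn_model is a hypothetical name chosen to avoid shadowing the
# real one) shows one plausible shape, assuming sklearn's KNeighborsClassifier and
# that df_train carries the 'html_cut' (cleaned, space-separated text) and 'classes'
# (DBSCAN label) columns written by process() below; n_neighbors is an assumption.
def _sketch_get_knn_model(self, df_train, dictionary, model):
    from gensim.matutils import corpus2dense
    from sklearn.neighbors import KNeighborsClassifier
    if df_train.shape[0] == 0:
        return None
    # Map each cleaned document to a bag-of-words, weight it with the tfidf model,
    # then densify over the dictionary's vocabulary (rows: documents, cols: terms)
    bows = [dictionary.doc2bow(text.split()) for text in df_train['html_cut']]
    x_train = corpus2dense(model[bows], num_terms=len(dictionary)).T
    knn = KNeighborsClassifier(n_neighbors=3)
    knn.fit(x_train, df_train['classes'])
    return knn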
def process(self, con, df):
    # One row per (business_system_code, rule_type_code) pair to cluster
    big_classes = self.data_processor.get_big_class(con)
    for _, each_big_class in big_classes.iterrows():
        system = each_big_class['business_system_code']
        big_class = each_big_class['rule_type_code']
        t = time.time()
        # .copy() avoids pandas' SettingWithCopyWarning when 'classes' is added below
        sub_df = df[(df.business_system_code == system) &
                    (df.rule_type_code == big_class)].copy()
        corpus_vector, dictionary, model = self.data_processor.caculate_tfidf(sub_df)
        dic_name = "dictionary_" + system + "_" + big_class + ".dic"
        dictionary.save(self.model_dir + "/" + dic_name)
        logger.debug("Saved dictionary: %s", dic_name)
        model_name = "tfidf_" + system + "_" + big_class + ".model"
        model.save(self.model_dir + "/" + model_name)
        logger.debug("Saved TF-IDF model: %s", model_name)
        t = duration(t, "Converted the corpus into tfidf vectors")
        if corpus_vector.shape[0] == 0:
            logger.error("Class (%s-%s) has no corpus, skipping", system, big_class)
            continue
        classes, dbscan, _ = self.DBSCAN_analysis(corpus_vector)
        logger.debug("%s-%s DBSCAN clustering done", system, big_class)
        t = duration(t, "DBSCAN clustering")
        sub_df['classes'] = classes
        sub_df.to_sql("monitor_cluster_dbscan", con,
                      if_exists='append', chunksize=100)
        # Persist the summary of the clusters found in this big class ==> monitor_classes
        self.generate_class_title(sub_df, system, big_class, con)
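# Hedged sketch: DBSCAN_analysis is implemented elsewhere. A minimal version that
# matches the call site above (labels, the fitted estimator, and a third value,
# here the cluster count); _sketch_dbscan_analysis is a hypothetical name, and
# eps/min_samples/metric are assumptions, not the project's tuned values.
def _sketch_dbscan_analysis(self, corpus_vector):
    from sklearn.cluster import DBSCAN
    dbscan = DBSCAN(eps=0.5, min_samples=5, metric='cosine')
    classes = dbscan.fit_predict(corpus_vector)
    # Label -1 marks noise points; every other label is a cluster id
    n_clusters = len(set(classes)) - (1 if -1 in classes else 0)
    return classes, dbscan, n_clusters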
def predict(self, test_data, system, subclass):
    key = system + "-" + subclass
    logger.debug("Prediction started for system-class: %s", key)
    entry = self.models.get(key, None)
    if entry is None:
        logger.error("No matching tfidf model, dictionary and KNN were found")
        return None
    model = entry.get("model")
    dictionary = entry.get("dictionary")
    knn = entry.get("knn")
    if model is None:
        logger.error("TF-IDF model not found")
        return None
    if dictionary is None:
        logger.error("Dictionary not found")
        return None
    if knn is None:
        logger.error("KNN model not found")
        return None
    t = time.time()
    # Clean the HTML tags, drop digits/English and stop words, and return the
    # remaining tokens as a space-separated string
    test_data = self.data_processor.process_line(test_data)
    logger.debug("Finished preprocessing the email: %s", test_data)
    x_test = self.data_processor.get_tfidf_vector([test_data], dictionary, model)
    t = duration(t, "Loaded the test data")
    # Run the actual prediction
    pred = knn.predict(x_test)
    t = duration(t, "Computed the prediction")
    logger.debug("Prediction result: %r", pred)
    return pred[0]
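# Hedged sketch: get_tfidf_vector belongs to the data processor. One plausible
# implementation, assuming gensim's corpus2dense; _sketch_get_tfidf_vector is a
# hypothetical name, and texts is a list of cleaned, space-separated documents
# as produced by process_line().
def _sketch_get_tfidf_vector(texts, dictionary, model):
    from gensim.matutils import corpus2dense
    bows = [dictionary.doc2bow(text.split()) for text in texts]
    # Rows are documents, columns the dictionary's terms, matching the KNN input
    return corpus2dense(model[bows], num_terms=len(dictionary)).T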
def clean_process(self, df):
    t = time.time()
    pb = progressbar.ProgressBar(len(df))
    pb.start()
    MAX_LENGTH = 10000
    for index, row in df.iterrows():
        cut_text = self.process_line(row['email_html'])
        if len(cut_text) > MAX_LENGTH:
            cut_text = cut_text[0:MAX_LENGTH]
        df.loc[index, 'html_cut'] = cut_text
        # Walk every column and truncate any string value longer than MAX_LENGTH
        for k, v in row.items():
            if v is None or not isinstance(v, str):
                continue
            if len(v) > MAX_LENGTH:
                logger.warning("Column [%r] has a value of length [%d] > [%d], truncating",
                               k, len(v), MAX_LENGTH)
                df.loc[index, k] = v[0:MAX_LENGTH]
        # Progress bar; update() can raise ValueError if index exceeds the maximum
        try:
            pb.update(index)
        except ValueError:
            pass
    pb.finish()
    duration(t, "Cleaned the data: removed HTML tags and useless characters")
    return df
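# Hedged sketch: process_line does the per-document cleaning described above
# (strip HTML tags, drop digits/English, remove stop words, segment into
# space-separated tokens). The version below is an illustration only:
# _sketch_process_line is a hypothetical name, and BeautifulSoup, jieba and
# self.stop_words are assumptions about the actual implementation.
def _sketch_process_line(self, raw_html):
    import re
    import jieba
    from bs4 import BeautifulSoup
    text = BeautifulSoup(raw_html, "html.parser").get_text()
    text = re.sub(r"[0-9a-zA-Z]+", " ", text)  # drop digits and English letters
    tokens = [w for w in jieba.cut(text) if w.strip() and w not in self.stop_words]
    return " ".join(tokens)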
def train(self, db_ip, db_name, user, passwd):
    t = time.time()
    # Load the data
    conn = self.data_processor.connect_db(db_ip, db_name, user, passwd)
    df = self.data_processor.load_data(conn)
    t = duration(t, "Loaded the data from the database")
    # Clean the data: strip HTML tags and stop words from the email bodies,
    # then segment the text
    df = self.data_processor.clean_process(df)
    # Run DBSCAN separately for each big class of each system and persist the
    # results into db_monitor_classes
    self.dbscan.process(conn, df)
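# Hedged sketch: duration() is the timing helper used throughout this module.
# Judging only by its call sites (it takes a start timestamp plus a message, and
# its return value re-seeds t), it plausibly logs the elapsed time and returns a
# fresh timestamp; _sketch_duration is a hypothetical name, and the module-level
# time import and logger are assumed to exist as used above.
def _sketch_duration(start, message):
    now = time.time()
    logger.debug("%s (took %.2fs)", message, now - start)
    return now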