def __get_report(self, train_path, dev_path, test_path, model_list=None, train=True):
    """Train (optionally) and evaluate each requested model, returning a report table.

    Args:
        train_path: path of the training split, loaded via ``load_df``.
        dev_path: path of the dev split.
        test_path: path of the test split.
        model_list: model names to run; ``None``/empty selects the default
            set chosen by ``__check_model_list``. (Was a mutable default
            ``[]``; ``None`` sentinel preserves the old behavior.)
        train: when True each model is trained before evaluation.

    Returns:
        For binary classification (``self.num_labels == 2``) a pivot table
        of metrics indexed by (data_set, model_name); otherwise the
        concatenation of the per-model reports.

    Side effects:
        Stores the raw per-model reports on ``self.all_report`` and prints
        progress/failure messages. A model that fails is logged with its
        traceback and skipped rather than aborting the whole run.
    """
    model_list = self.__check_model_list([] if model_list is None else model_list)
    # load the three data splits
    df_train = load_df(train_path)
    df_dev = load_df(dev_path)
    df_test = load_df(test_path)
    # train or evaluate every model
    self.all_report = []
    for model_name in tqdm(model_list):
        try:
            # build (and optionally train) the model
            model = self.__get_one_model(model_name, df_train, df_dev, df_test, train=train)
            # evaluate on dev and test splits
            dev_report = self.__evaluate_one_model(model, df_dev, model_name, "dev")
            test_report = self.__evaluate_one_model(model, df_test, model_name, "test")
            self.all_report.append(dev_report)
            self.all_report.append(test_report)
            # free the model's resources before moving on
            model.release()
            print(
                "model_name:{} eval finish!,dev_report:{},test_report:{}".
                format(model_name, dev_report, test_report))
        except Exception:  # narrowed from a bare except: keep Ctrl-C/SystemExit working
            print("model_name:{},fail,detail is {}".format(
                model_name, traceback.format_exc()))
    if self.num_labels == 2:
        df_report = pd.DataFrame(self.all_report)
        cols = [
            "Accuracy", "Precision", "Recall", "F_meansure", "AUC_Value",
            "avg_time_s"
        ]
        # pivot to (data_set, model_name) rows; trailing [cols] fixes column order
        df_report_table = df_report.pivot_table(
            index=["data_set", "model_name"], values=cols)[cols]
    else:
        df_report_table = pd.concat(self.all_report)
    return df_report_table
def _get_torch_data_loader(self, s_data_excel_path):
    """Build a shuffled torch ``DataLoader`` (plus the raw DataFrame) from a data file.

    Args:
        s_data_excel_path: path of a data file understood by ``load_df``;
            must contain ``label`` and ``text`` columns.

    Returns:
        (data_loader, df): a DataLoader over (label, token-id) pairs batched
        by ``self.batch_size`` with shuffling enabled, and the loaded DataFrame.
    """
    def _log(msg):
        # same optional-logger / optional-stdout pattern used elsewhere in the file
        self._run_logger and self._run_logger.info(msg)
        self._b_print and print(msg)

    df = load_df(s_data_excel_path)
    labels = df["label"]
    # coerce any non-string cells (NaN, numbers) to str before tokenizing
    texts = [t if isinstance(t, str) else str(t) for t in df["text"]]
    _log("text len %d" % len(texts))
    # text -> tokens
    tokens = self._text_2_tokens(texts)
    # tokens -> id sequences (per-text embedding ids, [seq_len, embed_dim])
    id_seqs = self._token_2_id(tokens)
    samples = list(zip(labels, id_seqs))
    loader = DataLoader(dataset=TorchDataSet(samples),
                        batch_size=self.batch_size,
                        shuffle=True)
    _log("load text num %d" % len(samples))
    return loader, df
def evaluate(self, df_path, model_list):
    """Evaluate each requested (pre-trained) model on a single dataset.

    Args:
        df_path: path of the evaluation data file, loaded via ``load_df``.
        model_list: model names to run; validated by ``__check_model_list``.

    Returns:
        For binary classification a DataFrame with one metrics row per
        model (fixed column order); otherwise the concatenated per-model
        reports.

    Notes:
        Models are loaded with ``train=False``. A model that fails is
        logged with its traceback and skipped rather than aborting the run.
    """
    model_list = self.__check_model_list(model_list)
    train = False
    df = load_df(df_path)
    all_report = []
    for model_name in tqdm(model_list):
        try:
            # load the already-trained model (no training data supplied)
            model = self.__get_one_model(model_name,
                                         df_train=None,
                                         df_dev=None,
                                         df_test=None,
                                         train=train)
            model_report = self.__evaluate_one_model(model, df, model_name, "")
            all_report.append(model_report)
            # free the model's resources before the next one
            model.release()
            print("model_name:{} eval finish!,model_report:{}".format(
                model_name, model_report))
        except Exception:  # narrowed from a bare except: keep Ctrl-C/SystemExit working
            print("model_name:{},fail,detail is {}".format(
                model_name, traceback.format_exc()))
    if self.num_labels == 2:
        cols = [
            "model_name", "Accuracy", "Precision", "Recall", "F_meansure",
            "AUC_Value", "avg_time_s"
        ]
        df_report = pd.DataFrame(all_report)[cols]
    else:
        df_report = pd.concat(all_report)
    return df_report
def evaluate(self, df):
    """Score this model on a labelled dataset and return a metrics report.

    Args:
        df: a path or object accepted by ``load_df``; must provide
            ``text`` and ``label`` columns.

    Returns:
        The binary metrics report when ``self.num_labels == 2``, else the
        multi-class report.
    """
    data = load_df(df)
    predictions = np.array(self.demo_text_list(data['text'].tolist()))
    truth = data['label']
    if self.num_labels == 2:
        return get_model_metrics(truth, predictions)
    return get_multi_class_report(truth, predictions)
def __evaluate_one_model(self, model, df, model_name, data_set):
    """Run ``model`` over ``df``, timing inference, and return an annotated report.

    Args:
        model: object exposing ``demo_text_list`` over a list of texts.
        df: a path or object accepted by ``load_df`` with ``text``/``label`` columns.
        model_name: identifier recorded in the report.
        data_set: split tag (e.g. "dev"/"test") recorded in the report.

    Returns:
        The metrics report dict/frame extended with ``model_name``,
        ``data_set`` and ``avg_time_s`` (mean inference seconds per row).
    """
    data = load_df(df)
    # time the whole batch, then average per row
    start = time.time()
    preds = np.array(model.demo_text_list(data['text'].tolist()))
    per_row_seconds = (time.time() - start) / data.shape[0]
    truth = data['label']
    if self.num_labels == 2:
        report = get_model_metrics(truth, preds)
    else:
        report = get_multi_class_report(truth, preds)
    report['model_name'] = model_name
    report['data_set'] = data_set
    report['avg_time_s'] = per_row_seconds
    return report
def get_list_result(self, df_list, model_list):
    """Run every model over every dataset and attach its predictions as columns.

    Args:
        df_list: list of paths/objects accepted by ``load_df``; each loaded
            DataFrame must have a ``text`` column.
        model_list: model names; validated by ``__check_model_list``.

    Returns:
        The list of loaded DataFrames, each augmented with one column per
        successfully-run model holding that model's ``predict_list`` output.
        A model that fails is logged with its traceback and skipped.
    """
    model_list = self.__check_model_list(model_list)
    train = False
    df_list = [load_df(x) for x in df_list]
    for model_name in tqdm(model_list):
        try:
            # load the already-trained model (no training data supplied)
            model = self.__get_one_model(model_name,
                                         df_train=None,
                                         df_dev=None,
                                         df_test=None,
                                         train=train)
            for df in df_list:
                df[model_name] = model.predict_list(df['text'].tolist())
            # free the model's resources before the next one
            model.release()
        except Exception:  # narrowed from a bare except: keep Ctrl-C/SystemExit working
            print("model_name:{},fail,detail is {}".format(
                model_name, traceback.format_exc()))
    return df_list
def load_data_contrastive(self, path):
    """Load (text, negative_text, label) triples for contrastive training.

    Args:
        path: data file accepted by ``load_df`` with ``text``, ``text_neg``
            and ``label`` columns.

    Returns:
        List of ``(str, str, int)`` tuples, one per row.
    """
    frame = load_df(path)
    return [(str(anchor), str(negative), int(tag))
            for anchor, negative, tag in zip(frame['text'],
                                             frame['text_neg'],
                                             frame['label'])]
def load_data(self, path):
    """Load (text, label) pairs from a data file.

    Args:
        path: data file accepted by ``load_df`` with ``text`` and ``label`` columns.

    Returns:
        List of ``(str, int)`` tuples, one per row.
    """
    frame = load_df(path)
    return [(str(sentence), int(tag))
            for sentence, tag in zip(frame['text'], frame['label'])]
def process_one_data(self, path):
    """Load a dataset and return its mean sentence embeddings with labels.

    Args:
        path: data file accepted by ``load_df`` with ``text`` and ``label`` columns.

    Returns:
        (x, y): mean sentence embeddings from ``self.emb`` (truncated to
        ``self.max_len``) and the corresponding label column.
    """
    frame = load_df(path)
    features = self.emb.get_sentence_list_emb_mean(frame['text'].tolist(),
                                                   max_len=self.max_len)
    return features, frame['label']