Ejemplo n.º 1
0
 def __get_report(self,
                  train_path,
                  dev_path,
                  test_path,
                  model_list=[],
                  train=True):
     """Build every requested model and tabulate its dev/test reports.

     The three paths are read with ``load_df``. Each model name returned by
     ``__check_model_list`` is built through ``__get_one_model``, evaluated
     on the dev and test frames, and released. A failure for one model is
     printed with its traceback and does not stop the remaining models.

     Args:
         train_path: Location of the training data, passed to ``load_df``.
         dev_path: Location of the dev data, passed to ``load_df``.
         test_path: Location of the test data, passed to ``load_df``.
         model_list: Model names to run; validated/normalised by
             ``__check_model_list``.
         train: Forwarded to ``__get_one_model`` as ``train=``.

     Returns:
         For ``self.num_labels == 2`` a pivot table indexed by
         ``(data_set, model_name)`` holding the metric columns; otherwise
         the concatenation of all collected reports.
     """
     # NOTE(review): the ``[]`` default is kept for interface compatibility.
     # It is rebound here rather than mutated, so it is only safe as long as
     # ``__check_model_list`` does not modify its argument in place.
     model_list = self.__check_model_list(model_list)
     # load data
     df_train = load_df(train_path)
     df_dev = load_df(dev_path)
     df_test = load_df(test_path)
     # Reports are kept on the instance and reset on every call.
     self.all_report = []
     for model_name in tqdm(model_list):
         try:
             model = self.__get_one_model(model_name,
                                          df_train,
                                          df_dev,
                                          df_test,
                                          train=train)
             try:
                 dev_report = self.__evaluate_one_model(
                     model, df_dev, model_name, "dev")
                 test_report = self.__evaluate_one_model(
                     model, df_test, model_name, "test")
                 self.all_report.append(dev_report)
                 self.all_report.append(test_report)
             finally:
                 # Release the model even when evaluation raised, so a
                 # failing model does not stay allocated for the rest of
                 # the loop.
                 model.release()
             print(
                 "model_name:{} eval finish!,dev_report:{},test_report:{}".
                 format(model_name, dev_report, test_report))
         except Exception:
             # Best effort per model: report the failure and continue.
             # ``Exception`` (not a bare except) lets KeyboardInterrupt and
             # SystemExit abort the whole run.
             print("model_name:{},fail,detail is {}".format(
                 model_name, traceback.format_exc()))
     # NOTE(review): if every model failed, ``self.all_report`` is empty and
     # both branches below will raise (``pd.concat`` rejects an empty list);
     # confirm whether callers expect that.
     if self.num_labels == 2:
         df_report = pd.DataFrame(self.all_report)
         cols = [
             "Accuracy", "Precision", "Recall", "F_meansure", "AUC_Value",
             "avg_time_s"
         ]
         # Re-index with ``[cols]`` to force the column order listed above.
         df_report_table = df_report.pivot_table(
             index=["data_set", "model_name"], values=cols)[cols]
     else:
         df_report_table = pd.concat(self.all_report)
     return df_report_table
Ejemplo n.º 2
0
    def _get_torch_data_loader(self, s_data_excel_path):
        """Turn the table at ``s_data_excel_path`` into a shuffled DataLoader.

        The table is read with ``load_df``; its ``label`` column is paired
        with the id sequences derived from its ``text`` column. Progress
        messages go to ``self._run_logger`` and/or stdout, depending on
        which of ``self._run_logger`` / ``self._b_print`` is truthy.

        Returns:
            A ``(data_loader, df)`` tuple: the DataLoader over
            ``(label, ids)`` pairs and the loaded frame.
        """

        def _report(message):
            # Emit to the logger and/or stdout, whichever is enabled.
            if self._run_logger:
                self._run_logger.info(message)
            if self._b_print:
                print(message)

        df = load_df(s_data_excel_path)
        labels = df["label"]
        # Non-string cells are coerced to text.
        texts = [cell if isinstance(cell, str) else str(cell)
                 for cell in df["text"]]

        _report("text len %d" % len(texts))

        # text -> tokens
        tokens_per_text = self._text_2_tokens(texts)

        # tokens -> ids (the original note mentions a [seq_len, embed_dim]
        # embedding matrix; presumably that is produced downstream - confirm)
        ids_per_text = self._token_2_id(tokens_per_text)

        label_id_pairs = list(zip(labels, ids_per_text))

        loader = DataLoader(
            dataset=TorchDataSet(label_id_pairs),
            batch_size=self.batch_size,
            shuffle=True,
        )

        _report("load text num %d" % len(label_id_pairs))

        return loader, df
Ejemplo n.º 3
0
 def evaluate(self, df_path, model_list):
     """Evaluate already-built models on one dataset and tabulate the result.

     Each model name returned by ``__check_model_list`` is obtained through
     ``__get_one_model`` with ``train=False`` and no data frames, evaluated
     on the frame read from ``df_path``, and released. A failure for one
     model is printed with its traceback and does not stop the others.

     Args:
         df_path: Location of the evaluation data, passed to ``load_df``.
         model_list: Model names to evaluate; validated/normalised by
             ``__check_model_list``.

     Returns:
         For ``self.num_labels == 2`` a DataFrame restricted to the metric
         columns (one row per successful model); otherwise the
         concatenation of all collected reports.
     """
     model_list = self.__check_model_list(model_list)
     df = load_df(df_path)
     all_report = []
     for model_name in tqdm(model_list):
         try:
             model = self.__get_one_model(model_name,
                                          df_train=None,
                                          df_dev=None,
                                          df_test=None,
                                          train=False)
             try:
                 # The data-set tag is left empty for ad-hoc evaluation.
                 model_report = self.__evaluate_one_model(
                     model, df, model_name, "")
                 all_report.append(model_report)
             finally:
                 # Release the model even when evaluation raised, so a
                 # failing model does not stay allocated for the rest of
                 # the loop.
                 model.release()
             print("model_name:{} eval finish!,model_report:{}".format(
                 model_name, model_report))
         except Exception:
             # Best effort per model: report the failure and continue.
             # ``Exception`` (not a bare except) lets KeyboardInterrupt and
             # SystemExit abort the whole run.
             print("model_name:{},fail,detail is {}".format(
                 model_name, traceback.format_exc()))
     # NOTE(review): if every model failed, ``all_report`` is empty and both
     # branches below will raise; confirm whether callers expect that.
     if self.num_labels == 2:
         cols = [
             "model_name", "Accuracy", "Precision", "Recall", "F_meansure",
             "AUC_Value", "avg_time_s"
         ]
         df_report = pd.DataFrame(all_report)[cols]
     else:
         df_report = pd.concat(all_report)
     return df_report
 def evaluate(self, df):
     """Predict on the ``text`` column of ``df`` and score against ``label``.

     ``df`` is passed through ``load_df`` first. Binary setups
     (``self.num_labels == 2``) are scored with ``get_model_metrics``,
     everything else with ``get_multi_class_report``.
     """
     frame = load_df(df)
     raw_predictions = self.demo_text_list(frame['text'].tolist())
     predictions = np.array(raw_predictions)
     labels = frame['label']
     return (get_model_metrics(labels, predictions)
             if self.num_labels == 2 else
             get_multi_class_report(labels, predictions))
Ejemplo n.º 5
0
 def __evaluate_one_model(self, model, df, model_name, data_set):
     """Score one model on one dataset and attach timing/identity fields.

     Args:
         model: Object exposing ``demo_text_list(list_of_texts)``.
         df: Evaluation data; passed through ``load_df`` and expected to
             provide ``text`` and ``label`` columns.
         model_name: Stored in the report under ``'model_name'``.
         data_set: Stored in the report under ``'data_set'``.

     Returns:
         The report from ``get_model_metrics`` (binary) or
         ``get_multi_class_report`` (otherwise), extended with
         ``'model_name'``, ``'data_set'`` and ``'avg_time_s'``.
     """
     df = load_df(df)
     # Time only the prediction call. perf_counter is monotonic and
     # high-resolution, unlike the wall clock, which can jump when the
     # system time is adjusted.
     tic = time.perf_counter()
     y_pred = model.demo_text_list(df['text'].tolist())
     y_pred = np.array(y_pred)
     toc = time.perf_counter()
     # Average prediction time per row, in seconds.
     # NOTE(review): an empty frame raises ZeroDivisionError here, which the
     # visible callers catch and report as a model failure.
     avg_time_s = (toc - tic) / df.shape[0]
     # get report
     y_true = df['label']
     if self.num_labels == 2:
         report = get_model_metrics(y_true, y_pred)
     else:
         report = get_multi_class_report(y_true, y_pred)
     report['model_name'] = model_name
     report['data_set'] = data_set
     report['avg_time_s'] = avg_time_s
     return report
Ejemplo n.º 6
0
 def get_list_result(self, df_list, model_list):
     """Collect the predictions of every model on every dataset.

     Each entry of ``df_list`` is read with ``load_df``. For each model name
     returned by ``__check_model_list`` the model is obtained through
     ``__get_one_model`` with ``train=False`` and its ``predict_list``
     output on the ``text`` column is stored in a new column named after
     the model. A failure for one model is printed with its traceback and
     does not stop the others.

     Args:
         df_list: Datasets to predict on; each is passed to ``load_df``.
         model_list: Model names to run; validated/normalised by
             ``__check_model_list``.

     Returns:
         The list of loaded frames, each with one extra column per model
         that predicted successfully on it.
     """
     model_list = self.__check_model_list(model_list)
     df_list = [load_df(x) for x in df_list]
     for model_name in tqdm(model_list):
         try:
             model = self.__get_one_model(model_name,
                                          df_train=None,
                                          df_dev=None,
                                          df_test=None,
                                          train=False)
             try:
                 for df in df_list:
                     df[model_name] = model.predict_list(
                         df['text'].tolist())
             finally:
                 # Release the model even when prediction raised, so a
                 # failing model does not stay allocated for the rest of
                 # the loop.
                 model.release()
         except Exception:
             # Best effort per model: report the failure and continue.
             # ``Exception`` (not a bare except) lets KeyboardInterrupt and
             # SystemExit abort the whole run.
             print("model_name:{},fail,detail is {}".format(
                 model_name, traceback.format_exc()))
     return df_list
 def load_data_contrastive(self, path):
     """Read ``path`` with ``load_df`` and return contrastive triples.

     Returns:
         A list of ``(text, text_neg, label)`` tuples built row by row from
         the columns of the same names, with both texts converted to
         ``str`` and the label to ``int``.
     """
     frame = load_df(path)
     rows = zip(frame['text'], frame['text_neg'], frame['label'])
     return [(str(anchor), str(negative), int(tag))
             for anchor, negative, tag in rows]
 def load_data(self, path):
     """Read ``path`` with ``load_df`` and return ``(text, label)`` pairs.

     Returns:
         A list of tuples built row by row from the ``text`` and ``label``
         columns, with the text converted to ``str`` and the label to
         ``int``.
     """
     frame = load_df(path)
     return [(str(sentence), int(tag))
             for sentence, tag in zip(frame['text'], frame['label'])]
 def process_one_data(self, path):
     """Read ``path`` with ``load_df`` and return features plus labels.

     Returns:
         A ``(x, y)`` tuple: ``x`` is whatever
         ``self.emb.get_sentence_list_emb_mean`` yields for the ``text``
         column (called with ``max_len=self.max_len``), ``y`` is the
         ``label`` column.
     """
     frame = load_df(path)
     embed_mean = self.emb.get_sentence_list_emb_mean
     features = embed_mean(frame['text'].tolist(), max_len=self.max_len)
     labels = frame['label']
     return features, labels