def __init__(self, configs: dict):
    """Build the predictor from a configuration dictionary.

    Resolves the checkpoint path for the selected model, records the
    evaluation file paths, builds the tokenizer and the label tokenizer,
    and loads the model with ``torch.load``.

    Args:
        configs: Configuration mapping. Keys read here: ``model_select``,
            ``model_path`` (indexed by model name), ``eval_data_file``,
            ``eval_label_transfer_file``, ``eval_token_transfer_file`` and,
            for bert/electra/xlnet, ``pretrained_model_path`` (indexed by
            model name).

    Raises:
        ValueError: If ``configs["model_select"]`` is not a supported model.
        KeyError: If a required configuration key is missing.
    """
    # Models whose tokenizer is built from the token -> index JSON file.
    custom_tokenizer_models = ("fasttext", "lstm_base", "lstm_pack", "textcnn")
    # Models whose tokenizer is loaded with ``from_pretrained``.
    pretrained_tokenizer_models = ("bert", "electra", "xlnet")
    supported_models = custom_tokenizer_models + pretrained_tokenizer_models

    # Keep the raw configuration dictionary.
    self.configs = configs

    # Model name; validated first so an unknown value fails here with a
    # clear message instead of leaving ``model_path``/``tokenizer`` unset.
    self.model_name = self.configs["model_select"]
    if self.model_name not in supported_models:
        raise ValueError(
            f"Unsupported model_select {self.model_name!r}; "
            f"expected one of: {', '.join(supported_models)}"
        )

    # Checkpoint path of the selected model.
    self.model_path = self.configs["model_path"][self.model_name]

    # Data file used for evaluation / inference.
    self.eval_data_file = self.configs["eval_data_file"]

    # Label mapping file, used to turn predicted labels into real labels.
    self.label2index_json_path = self.configs["eval_label_transfer_file"]

    # Token mapping vocabulary, used by the tokenizer of the custom models.
    self.token2index_json_path = self.configs["eval_token_transfer_file"]

    # Tokenizer.
    if self.model_name in custom_tokenizer_models:
        self.tokenizer = SequenceTokenizer(load_json(self.token2index_json_path))
    else:
        pretrained_path = self.configs["pretrained_model_path"][self.model_name]
        if self.model_name == "bert":
            tokenizer_cls = BertTokenizer
        elif self.model_name == "electra":
            tokenizer_cls = ElectraTokenizer
        else:
            tokenizer_cls = XLNetTokenizer
        self.tokenizer = tokenizer_cls.from_pretrained(pretrained_path)

    # Label converter.
    self.label_tokenizer = ClassificationLabelTokenizer(
        load_json(self.label2index_json_path)
    )

    # Load the model.
    # NOTE(review): torch.load unpickles the file, so the checkpoint must
    # come from a trusted source. Recent PyTorch releases may default to
    # weights_only=True, which can reject a fully pickled model -- confirm
    # against the installed version.
    self.model = torch.load(self.model_path)
def predict(self, output_path: str = "final_.xlsx"):
    """Predict every usable evaluation row and save the results to Excel.

    Loads the evaluation sheet from ``self.eval_data_file``, keeps only
    the clean rows, calls ``self.predict_one(text, intent)`` on each kept
    row and writes a table with the columns ``text``, ``intent`` and
    ``predict``.

    A row is kept when its ``intent`` value is a string, is not ``"[]"``,
    contains no comma and is a key of the label mapping loaded from
    ``self.label2index_json_path``.

    Args:
        output_path: Destination of the Excel file. Defaults to the
            previously hard-coded ``"final_.xlsx"``.
    """
    eval_df = load_xlsx(self.eval_data_file)
    data_x = list(eval_df["text"])
    data_y = list(eval_df["intent"])
    label2index = load_json(self.label2index_json_path)

    # Load the clean data set.
    data_x_clear, data_y_clear = [], []
    for text, intent in zip(data_x, data_y):
        # Non-string cells (e.g. blank cells read as floats) cannot match
        # a label key and would make the comma test raise TypeError.
        if not isinstance(intent, str):
            continue
        if intent != "[]" and "," not in intent and intent in label2index:
            data_x_clear.append(text)
            data_y_clear.append(intent)

    # ``total`` keeps the progress bar sized even though zip has no length.
    predict_result = [
        self.predict_one(text, intent)
        for text, intent in tqdm(
            zip(data_x_clear, data_y_clear), total=len(data_x_clear)
        )
    ]

    save_df = pd.DataFrame({
        "text": data_x_clear,
        "intent": data_y_clear,
        "predict": predict_result,
    })
    save_df.to_excel(output_path)
def __init__(self, label2index: dict):
    """Store the label -> index mapping together with its inverse.

    Args:
        label2index: The mapping itself when it is a ``dict``; any other
            value is passed to ``load_json`` and the result is used as the
            mapping.
    """
    mapping = label2index if isinstance(label2index, dict) else load_json(label2index)
    self.label2index = mapping

    # Invert the mapping; if two labels share an index the later one wins.
    inverted = {}
    for label, index in mapping.items():
        inverted[index] = label
    self.index2label = inverted
def __init__(self, configs):
    """Load the predicted data set and prepare the confusion matrix.

    Args:
        configs: Configuration mapping. Keys read here:
            ``predicted_dataset_path`` and ``label2index_json_path``.
    """
    self.configs = configs
    self.predicted_dataset_path = configs["predicted_dataset_path"]
    self.label2index_json_path = configs["label2index_json_path"]

    # Evaluation data, as returned by load_xlsx (a dataframe per the
    # original author's note).
    self.eval_data = load_xlsx(self.predicted_dataset_path)

    # Label mapping and its inverse; the inverse is used to turn indices
    # back into labels for visualisation.
    self.label2index = load_json(self.label2index_json_path)
    self.index2label = dict(
        (index, label) for label, index in self.label2index.items()
    )

    # Square confusion matrix with one row and one column per label.
    num_labels = len(self.label2index)
    self.confusion_matrix = torch.zeros((num_labels, num_labels))

    # Initialise the confusion matrix.
    self.confusion_matrix_init()

    # Classes that do not occur in the test set (a list, per the original
    # author's note).
    self.not_in_test = self.confusion_matrix_clear()
def get_eval_data(self):
    """Return the usable texts and labels of the evaluation sheet.

    Loads the sheet from ``self.eval_path`` and keeps the rows whose
    ``intent`` value is a string, is not ``"[]"``, contains no comma
    (comma-separated values are treated as multi-label and dropped) and
    is a key of the label mapping loaded from
    ``self.label2index_json_path``.

    Returns:
        A ``(texts, labels)`` pair of lists of equal length, in the
        original row order.
    """
    data_all_dataframe = load_xlsx(self.eval_path)
    data_x = list(data_all_dataframe["text"])
    data_y = list(data_all_dataframe["intent"])
    label2index = load_json(self.label2index_json_path)

    # Non-string cells (e.g. blank cells read as floats) cannot match a
    # label key and would make the comma test raise TypeError, so they are
    # filtered out first.
    kept = [
        (text, label)
        for text, label in zip(data_x, data_y)
        if isinstance(label, str)
        and label != "[]"
        and "," not in label
        and label in label2index
    ]
    data_x_clear = [text for text, _ in kept]
    data_y_clear = [label for _, label in kept]
    return data_x_clear, data_y_clear
def __init__(self, train_record_json):
    """Load the data recorded during training.

    Args:
        train_record_json: Value passed to ``load_json``; the loaded
            result is stored as ``self.train_params``.
    """
    recorded = load_json(train_record_json)
    self.train_params = recorded