コード例 #1
0
    def __init__(self, configs: dict):
        """Prepare model, tokenizers and data paths for evaluation/inference.

        Reads the selected model, its checkpoint path and the evaluation
        resource paths from ``configs``, builds the tokenizer that matches the
        model and the label tokenizer, and finally loads the serialized model.

        Args:
            configs: Configuration dictionary. The keys read here are
                ``model_select``, ``model_path``, ``eval_data_file``,
                ``eval_label_transfer_file``, ``eval_token_transfer_file``
                and, for bert/electra/xlnet, ``pretrained_model_path``.

        Raises:
            ValueError: If ``configs["model_select"]`` is not one of the
                supported model names.
            KeyError: If a configuration key read here is missing.
        """
        # Models that use the vocabulary-based SequenceTokenizer.
        custom_models = ("fasttext", "lstm_base", "lstm_pack", "textcnn")
        # Models that come with their own pretrained tokenizer.
        pretrained_models = ("bert", "electra", "xlnet")
        supported_models = custom_models + pretrained_models

        # Keep the whole parameter dictionary around.
        self.configs = configs

        # Model name; fail early on an unknown value instead of leaving
        # model_path/tokenizer unset and crashing later with AttributeError.
        self.model_name = self.configs["model_select"]
        if self.model_name not in supported_models:
            raise ValueError(
                f"Unsupported model_select {self.model_name!r}; "
                f"expected one of: {', '.join(supported_models)}"
            )

        # Checkpoint path of the selected model.
        self.model_path = self.configs["model_path"][self.model_name]

        # Data file used for evaluation/inference.
        self.eval_data_file = self.configs["eval_data_file"]

        # Label mapping file: predicted indices are converted back to labels.
        self.label2index_json_path = self.configs["eval_label_transfer_file"]

        # Token vocabulary file, used by the tokenizer of the custom models.
        self.token2index_json_path = self.configs["eval_token_transfer_file"]

        # Tokenizer matching the selected model.
        if self.model_name in custom_models:
            self.tokenizer = SequenceTokenizer(load_json(self.token2index_json_path))
        else:
            pretrained_path = self.configs["pretrained_model_path"][self.model_name]
            if self.model_name == "bert":
                self.tokenizer = BertTokenizer.from_pretrained(pretrained_path)
            elif self.model_name == "electra":
                self.tokenizer = ElectraTokenizer.from_pretrained(pretrained_path)
            else:  # "xlnet" is the only remaining supported name
                self.tokenizer = XLNetTokenizer.from_pretrained(pretrained_path)

        # Label converter.
        self.label_tokenizer = ClassificationLabelTokenizer(load_json(self.label2index_json_path))

        # Load the model.
        # NOTE(review): torch.load unpickles the file, so model_path must point
        # to a trusted checkpoint.
        self.model = torch.load(self.model_path)
コード例 #2
0
    def predict(self, output_path: str = "final_.xlsx"):
        """Predict every usable evaluation row and save the results to Excel.

        A row of the evaluation file is used only when its ``intent`` cell is
        not ``"[]"``, contains no comma and is a key of the label-to-index
        mapping. Each remaining row is passed to ``predict_one`` and the
        outcome is written as a sheet with the columns ``text``, ``intent``
        and ``predict``.

        Args:
            output_path: Where the Excel report is written. Defaults to
                ``"final_.xlsx"``, the previously hard-coded location.
        """
        eval_df = load_xlsx(self.eval_data_file)
        texts = list(eval_df["text"])
        intents = list(eval_df["intent"])

        label2index = load_json(self.label2index_json_path)

        # Keep only the clean rows.
        # NOTE(review): assumes every "intent" cell is a string and that a
        # comma marks a multi-label row — TODO confirm against the data.
        clean_rows = [
            (text, intent)
            for text, intent in zip(texts, intents)
            if intent != "[]" and "," not in intent and intent in label2index
        ]
        clean_texts = [text for text, _ in clean_rows]
        clean_intents = [intent for _, intent in clean_rows]

        # clean_rows is a list, so tqdm still knows the total for its bar.
        predictions = [
            self.predict_one(text, intent) for text, intent in tqdm(clean_rows)
        ]

        save_df = pd.DataFrame({
            "text": clean_texts,
            "intent": clean_intents,
            "predict": predictions,
        })

        save_df.to_excel(output_path)
コード例 #3
0
 def __init__(self, label2index: dict):
     """Build the label/index lookup tables in both directions.

     Args:
         label2index: Mapping from label to index. A value that is not a
             ``dict`` is handed to ``load_json`` and the loaded result is
             used as the mapping instead.
     """
     mapping = label2index if isinstance(label2index, dict) else load_json(label2index)
     self.label2index = mapping
     # Reverse lookup; if two labels share an index, the later label wins.
     self.index2label = dict(
         (index, label) for label, index in self.label2index.items()
     )
コード例 #4
0
    def __init__(self, configs):
        """Load the predicted dataset and set up the confusion matrix.

        Args:
            configs: Configuration mapping; the keys
                ``predicted_dataset_path`` and ``label2index_json_path`` are
                read from it.
        """
        self.configs = configs
        self.predicted_dataset_path = configs["predicted_dataset_path"]
        self.label2index_json_path = configs["label2index_json_path"]

        # Predicted dataset (a dataframe).
        self.eval_data = load_xlsx(self.predicted_dataset_path)

        # Label -> index mapping and its inverse; the inverse turns indices
        # back into labels for visualization.
        self.label2index = load_json(self.label2index_json_path)
        self.index2label = dict(
            (index, label) for label, index in self.label2index.items()
        )

        # Square matrix of zeros with one row and one column per label.
        num_labels = len(self.label2index)
        self.confusion_matrix = torch.zeros(num_labels, num_labels)

        # Initialize the confusion matrix.
        self.confusion_matrix_init()

        # Classes that do not occur in the test set (list).
        self.not_in_test = self.confusion_matrix_clear()
コード例 #5
0
    def get_eval_data(self):
        """Load the evaluation spreadsheet and return only the usable rows.

        A row is kept when its ``intent`` cell is not ``"[]"``, contains no
        comma and is a key of the label-to-index mapping.

        Returns:
            A ``(texts, intents)`` pair of lists holding the kept rows in
            their original order.
        """
        frame = load_xlsx(self.eval_path)
        texts = list(frame["text"])
        intents = list(frame["intent"])

        # A disabled earlier variant rewrote multi-label rows (those with a
        # comma) to their first label; such rows are filtered out instead.

        label2index = load_json(self.label2index_json_path)

        kept_texts, kept_intents = [], []
        for text, intent in zip(texts, intents):
            # Skip empty labels, comma-separated labels and unknown labels.
            if intent == "[]" or "," in intent or intent not in label2index:
                continue
            kept_texts.append(text)
            kept_intents.append(intent)

        return kept_texts, kept_intents
コード例 #6
0
    def __init__(self, train_record_json):
        """Load the data recorded during training.

        Args:
            train_record_json: Passed to ``load_json``; presumably the path
                of the JSON training record — TODO confirm.
        """

        # Load the data produced during the training process
        self.train_params = load_json(train_record_json)