def data_process(self, sep='\t', split_ratio=0.8):
    """Load and preprocess train/valid/test data.

    Converts any non-CSV input file to CSV first, builds the label/index
    maps from the training data, and populates ``self.train_data``,
    ``self.valid_data`` and (optionally) ``self.test_data``.

    :param sep: column separator used when converting raw files to CSV.
    :param split_ratio: fraction of training data kept for training when
        no explicit validation file is given (default 0.8, matching the
        previous hard-coded behavior).
    :return: None
    """
    if '.csv' not in self.train_data_path:
        self.train_data_path = data2csv(self.train_data_path, sep)
    self.i2l, self.l2i, self.train_data = data_preprocess(
        self.train_data_path)
    print(self.l2i)
    self.categories = len(self.i2l)
    if self.valid_data_path:
        if '.csv' not in self.valid_data_path:
            self.valid_data_path = data2csv(self.valid_data_path, sep)
        _, _, self.valid_data = data_preprocess(self.valid_data_path)
    else:
        # No validation file: carve a random split out of the training set.
        data_len = len(self.train_data)
        indexs = list(range(data_len))
        random.shuffle(indexs)
        # NOTE: previously this index was named `sep`, shadowing the
        # separator parameter; renamed to avoid confusion.
        cut = int(data_len * split_ratio)
        train_part = [self.train_data[i] for i in indexs[:cut]]
        valid_part = [self.train_data[i] for i in indexs[cut:]]
        self.train_data, self.valid_data = train_part, valid_part
    if self.test_data_path:
        # BUG FIX: previously read self.valid_data_path here, so the test
        # set silently duplicated the validation set. Also convert the
        # test file to CSV, consistent with the other branches.
        if '.csv' not in self.test_data_path:
            self.test_data_path = data2csv(self.test_data_path, sep)
        _, _, self.test_data = data_preprocess(self.test_data_path)
def data_process(self, sep='\t'):
    """Build train/valid/test data generators.

    Any non-CSV input is first converted via ``data2csv``; label maps and
    class count come from the training file. When no validation file is
    supplied, the training data is split with ``split``; when no test file
    is supplied, an empty test generator is built.

    :param sep: column separator for raw-to-CSV conversion.
    :return: None
    """
    if '.csv' not in self.train_data_path:
        self.train_data_path = data2csv(self.train_data_path, sep)
    (self.index2label, self.label2index,
     self.labels, train_data) = data_preprocess(self.train_data_path)
    self.num_classes = len(self.index2label)

    if self.valid_data_path:
        if '.csv' not in self.valid_data_path:
            self.valid_data_path = data2csv(self.valid_data_path, sep)
        _, _, _, valid_data = data_preprocess(self.valid_data_path)
    else:
        # No explicit validation file: split the training data instead.
        train_data, valid_data = split(train_data, self.split)

    if self.test_data_path:
        if '.csv' not in self.test_data_path:
            self.test_data_path = data2csv(self.test_data_path, sep)
        _, _, _, test_data = data_preprocess(self.test_data_path)
    else:
        test_data = []

    # All three generators share the same configuration.
    shared = (self.label2index, self.tokenizer, self.batch_size, self.max_len)
    self.train_generator = Data_Generator(train_data, *shared)
    self.valid_generator = Data_Generator(valid_data, *shared)
    self.test_generator = Data_Generator(test_data, *shared)
def data_score(self, text_path):
    """Evaluate the model on a labelled file and print a classification report.

    Converts the file to CSV if needed, predicts every example one at a
    time with ``self.model``, then prints a per-class report and the
    elapsed wall-clock time.

    :param text_path: path to the labelled evaluation data.
    :return: None (results are printed).
    """
    time_start = time.time()
    if '.csv' not in text_path:
        text_path = data2csv(text_path, sep='\t')
    _, _, _, test_data = data_preprocess(text_path)

    y_true, y_pred = [], []
    for label, text in test_data:
        y_true.append(self.index2label[str(label)])
        # NOTE: newer library versions use `maxlen` instead of `max_length`.
        token_ids, segment_ids = self.tokenizer.encode(text,
                                                       max_length=self.max_len)
        token_ids = sequence_padding([token_ids], length=self.max_len)
        segment_ids = sequence_padding([segment_ids], length=self.max_len)
        probs = self.model.predict([token_ids, segment_ids])
        y_pred.append(self.index2label[str(np.argmax(probs[0]))])
    print("data pred ok!")

    target_names = [str(label) for label in self.labels]
    print(classification_report(y_true, y_pred,
                                target_names=target_names, digits=9))
    print("耗时:" + str(time.time() - time_start))