Ejemplo n.º 1
0
 def data_process(self, sep='\t'):
     """
     Prepare train/valid/test data.

     Converts any non-CSV data path with ``data2csv``, then loads it via
     ``data_preprocess``. When no validation path is configured, the training
     data is shuffled and split 80/20 into train/valid.

     :param sep: column separator passed to ``data2csv`` for raw input files.
     :return: None (results stored on ``self``).
     """
     if '.csv' not in self.train_data_path:
         self.train_data_path = data2csv(self.train_data_path, sep)
     self.i2l, self.l2i, self.train_data = data_preprocess(
         self.train_data_path)
     print(self.l2i)
     self.categories = len(self.i2l)
     if self.valid_data_path:
         if '.csv' not in self.valid_data_path:
             self.valid_data_path = data2csv(self.valid_data_path, sep)
         _, _, self.valid_data = data_preprocess(self.valid_data_path)
     else:
         data_len = len(self.train_data)
         indexs = list(range(data_len))
         random.shuffle(indexs)
         # NOTE: use a dedicated name for the 80/20 cut point instead of
         # shadowing the `sep` parameter (the original reused `sep` here,
         # which would corrupt the separator for the test-path conversion).
         cut = int(data_len * 0.8)
         self.train_data, self.valid_data = [
             self.train_data[i] for i in indexs[:cut]
         ], [self.train_data[i] for i in indexs[cut:]]
     if self.test_data_path:
         # BUG FIX: the original loaded the *valid* path here (copy-paste
         # error) and never converted a non-CSV test path.
         if '.csv' not in self.test_data_path:
             self.test_data_path = data2csv(self.test_data_path, sep)
         _, _, self.test_data = data_preprocess(self.test_data_path)
Ejemplo n.º 2
0
 def data_process(self, sep='\t'):
     """
     Build train/valid/test ``Data_Generator`` instances from the configured
     data paths.

     Non-CSV inputs are first converted with ``data2csv``. When no validation
     path is set, the training data is divided via ``split``; when no test
     path is set, the test generator receives an empty list.

     :param sep: column separator handed to ``data2csv`` for raw input files.
     :return: None (generators stored on ``self``).
     """
     def _as_csv(path):
         # Convert a raw file to CSV once; already-CSV paths pass through.
         return path if '.csv' in path else data2csv(path, sep)

     self.train_data_path = _as_csv(self.train_data_path)
     (self.index2label, self.label2index,
      self.labels, train_data) = data_preprocess(self.train_data_path)
     self.num_classes = len(self.index2label)

     if self.valid_data_path:
         self.valid_data_path = _as_csv(self.valid_data_path)
         valid_data = data_preprocess(self.valid_data_path)[3]
     else:
         train_data, valid_data = split(train_data, self.split)

     if self.test_data_path:
         self.test_data_path = _as_csv(self.test_data_path)
         test_data = data_preprocess(self.test_data_path)[3]
     else:
         test_data = []

     # All three generators share the same construction arguments.
     for attr, subset in (('train_generator', train_data),
                          ('valid_generator', valid_data),
                          ('test_generator', test_data)):
         setattr(self, attr,
                 Data_Generator(subset, self.label2index, self.tokenizer,
                                self.batch_size, self.max_len))
Ejemplo n.º 3
0
    def data_score(self, text_path):
        """
        Score the model on a labelled dataset and print a classification report.

        Converts a non-CSV input with ``data2csv``, predicts every sample one
        at a time, then prints the sklearn report and the elapsed time.

        :param text_path: path to the labelled evaluation file.
        :return: None (report is printed).
        """
        started = time.time()
        # Accuracy on the test set.
        if '.csv' not in text_path:
            text_path = data2csv(text_path, sep='\t')
        samples = data_preprocess(text_path)[3]

        true_labels, pred_labels = [], []
        for gold, sentence in samples:
            true_labels.append(self.index2label[str(gold)])
            token_ids, segment_ids = self.tokenizer.encode(
                sentence, max_length=self.max_len)  # maxlen in newer versions
            token_ids = sequence_padding([token_ids], length=self.max_len)
            segment_ids = sequence_padding([segment_ids], length=self.max_len)
            scores = self.model.predict([token_ids, segment_ids])
            pred_labels.append(self.index2label[str(np.argmax(scores[0]))])

        print("data pred ok!")
        # Evaluation summary over all known labels.
        target_names = [str(label) for label in self.labels]
        print(classification_report(true_labels, pred_labels,
                                    target_names=target_names, digits=9))
        print("耗时:" + str(time.time() - started))