Example #1
    def generate_tf_dataset(self, data_path, batch_size):
        """
        生成TFDataSet,用于训练模型前的准备
        :param data_path: 数据保存路径
        :param batch_size: 训练阶段 batch_size大小
        :return:
        """
        # process raw data
        train_examples = self.data_processor.get_train_examples(data_path)
        valid_examples = self.data_processor.get_dev_examples(data_path)

        # calculate steps
        train_steps = calculate_steps(len(train_examples), batch_size)
        valid_steps = calculate_steps(len(valid_examples), batch_size)

        # convert examples to tf.data datasets
        train_dataset = convert_examples_to_features(
            train_examples,
            self.tokenizer,
            max_length=self.max_length,
            task=self.task,
            return_tensors='tf')
        valid_dataset = convert_examples_to_features(
            valid_examples,
            self.tokenizer,
            max_length=self.max_length,
            task=self.task,
            return_tensors='tf')

        # preprocess tf_dataset
        train_dataset = train_dataset.shuffle(10000).batch(batch_size)
        valid_dataset = valid_dataset.batch(batch_size)

        return (train_dataset, train_steps), (valid_dataset, valid_steps)
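calculate_steps is not defined anywhere on this page. A minimal sketch, assuming it simply computes the number of batches per epoch, rounding up so a final partial batch is not dropped (hypothetical implementation):

import math

def calculate_steps(num_examples, batch_size):
    # Hypothetical helper: one step per batch, rounding up so the
    # final partial batch still counts as a step.
    return math.ceil(num_examples / batch_size)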
Example #2
 def evaluate_op(self, test_data_path, batch_size=64):
     """
     模型evaluation step
     :param test_data_path: 测试数据保存路径
     :param batch_size: evaluation阶段的batch_size
     :return:
     """
     # load the trained model
     trained_model = self.get_trained_model()
     trained_model = self.get_compiled_model(trained_model)
     # convert raw data to a tf.data dataset
     test_examples = self.data_processor.get_test_examples(test_data_path)
     test_steps = calculate_steps(len(test_examples), batch_size)
     test_dataset = convert_examples_to_features(test_examples,
                                                 self.tokenizer,
                                                 max_length=self.max_length,
                                                 task=self.task,
                                                 return_tensors='tf')
     test_dataset = test_dataset.batch(batch_size)
     # model evaluation step
     evaluation_start_time = time.time()
     evaluation_loss, evaluation_acc = trained_model.evaluate(
         test_dataset, steps=test_steps)
     cost_time = time.time() - evaluation_start_time
     print(f"Evaluate step loss: {evaluation_loss}")
     print(f"Evaluate step accuracy: {evaluation_acc}")
     with open(os.path.join(self.saved_model_path, 'evaluate.log'),
               'w') as f:
         f.write(
             f'Evaluation cost time: {cost_time}\nEvaluate loss: {evaluation_loss}\n'
             f'Evaluate accuracy: {evaluation_acc}')
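get_compiled_model is likewise not shown. For trained_model.evaluate(...) to unpack into a (loss, accuracy) pair as above, the model must have been compiled with exactly one accuracy metric; a minimal sketch under that assumption (the optimizer and learning rate are placeholders):

 def get_compiled_model(self, model):
     # Hypothetical compile step: a single 'accuracy' metric makes
     # model.evaluate(...) return (loss, accuracy) as unpacked above.
     model.compile(
         optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5),
         loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
         metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])
     return model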
Example #3
 def evaluate_op(self, test_data_path, batch_size=64):
     """
     模型 evaluation step
     :param test_data_path: 测试集保存路径
     :param batch_size: evaluation阶段batch_size
     :return:
     """
     # load the trained model
     trained_model = self.get_trained_model()
     trained_model = self.get_compiled_model(trained_model)
     # convert raw data to a tf.data dataset
     test_examples = self.data_processor.get_test_examples(test_data_path)
     test_label_list = self.data_processor.get_labels(test_data_path)
     print(test_label_list)
     test_steps = calculate_steps(len(test_examples), batch_size)
     test_dataset = convert_examples_to_features(test_examples,
                                                 self.tokenizer,
                                                 max_length=self.max_length,
                                                 label_list=test_label_list,
                                                 task=self.task,
                                                 return_tensors='tf')
     test_dataset = test_dataset.batch(batch_size)
     # model evaluation step
     evaluation_loss, evaluation_acc = trained_model.evaluate(
         test_dataset, steps=test_steps)
     print(f"Evaluate step loss: {evaluation_loss}")
     print(f"Evaluate step accuracy: {evaluation_acc}")
Example #4
    def generate_tf_dataset(self, data_path, batch_size):
        """
        生成TFDataSet,用于训练模型前的准备
        :param data_path: 训练数据保存路径
        :param batch_size: batch_size
        :return:
        """
        # process raw data
        train_examples = self.data_processor.get_train_examples(data_path)
        train_label_list = self.data_processor.get_labels()
        valid_examples = self.data_processor.get_dev_examples(data_path)

        # calculate steps
        train_steps = calculate_steps(len(train_examples), batch_size)
        valid_steps = calculate_steps(len(valid_examples), batch_size)

        # convert examples to tf.data datasets
        train_dataset = convert_examples_to_features(
            train_examples,
            self.tokenizer,
            max_length=self.max_length,
            label_list=train_label_list,
            task=self.task,
            return_tensors='tf',
            save_id2label_path=os.path.join(self.saved_model_path,
                                            'id2label.pkl'))
        valid_dataset = convert_examples_to_features(
            valid_examples,
            self.tokenizer,
            max_length=self.max_length,
            label_list=train_label_list,
            task=self.task,
            return_tensors='tf')

        # preprocess tf_dataset
        train_dataset = train_dataset.batch(batch_size)
        valid_dataset = valid_dataset.batch(batch_size)

        return (train_dataset, train_steps), (valid_dataset, valid_steps)
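A hedged usage sketch of how the returned (dataset, steps) pairs would typically feed tf.keras training; trainer and model are hypothetical names, and .repeat() is used because steps_per_epoch is passed explicitly:

# Hypothetical call site for the method above.
(train_dataset, train_steps), (valid_dataset, valid_steps) = \
    trainer.generate_tf_dataset('data/clf', batch_size=32)
model.fit(train_dataset.repeat(),
          epochs=3,
          steps_per_epoch=train_steps,
          validation_data=valid_dataset.repeat(),
          validation_steps=valid_steps)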
Example #5
 def evaluate_op(self, test_data_path, batch_size=64):
     """
     模型evaluation step
     :param test_data_path: 测试数据保存路径
     :param batch_size: evaluation阶段的batch_size
     :return:
     """
     # load the trained model
     trained_model = self.get_trained_model()
     if self.loss_type == 'crf':
         evaluate_metric = tf.keras.metrics.Accuracy(name='accuracy')
     else:
         evaluate_metric = tf.keras.metrics.SparseCategoricalAccuracy(
             'accuracy')
     # Per-token losses are reduced manually below, so disable Keras's
     # automatic averaging (the * (1.0 / batch_size) scaling assumes it).
     loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
         from_logits=True, reduction=tf.keras.losses.Reduction.NONE)
     # convert raw data to a tf.data dataset
     test_examples = self.data_processor.get_test_examples(test_data_path)
     test_steps = calculate_steps(len(test_examples), batch_size)
     test_dataset = convert_examples_to_features_labeling(
         test_examples,
         self.labels,
         self.max_length,
         self.tokenizer,
         return_tensors='tf')
     test_dataset = test_dataset.batch(batch_size)
     # model evaluation step
     evaluation_start_time = time.time()
     test_bar = tf.keras.utils.Progbar(
         test_steps, stateful_metrics=['test_loss', 'test_acc'])
     # Run the evaluation loop over the test set.
     for test_step, (x_batch_test, y_batch_test) in enumerate(test_dataset):
         sequence_lengths_test = tf.math.reduce_sum(
             tf.cast(tf.math.not_equal(x_batch_test['input_ids'], 0),
                     dtype=tf.int32),
             axis=-1)
         if self.loss_type == 'crf':
             test_logits, test_logits_ = trained_model(x_batch_test,
                                                       training=False)
             masks = tf.sequence_mask(sequence_lengths_test,
                                      maxlen=tf.shape(test_logits_)[1],
                                      dtype=tf.bool)
             log_likelihood, _ = tfa.text.crf_log_likelihood(
                 test_logits_, y_batch_test, sequence_lengths_test,
                 trained_model.crf.trans_params)
             loss = -tf.reduce_mean(log_likelihood)
             active_logits_test = tf.boolean_mask(test_logits, masks)
             active_labels_test = tf.boolean_mask(y_batch_test, masks)
         else:
             test_logits = trained_model(x_batch_test, training=False)[0]
             masks = tf.sequence_mask(sequence_lengths_test,
                                      maxlen=tf.shape(test_logits)[1],
                                      dtype=tf.bool)
             active_logits_test = tf.boolean_mask(test_logits, masks)
             active_labels_test = tf.boolean_mask(y_batch_test, masks)
             cross_entropy = loss_fn(active_labels_test, active_logits_test)
             loss = tf.reduce_sum(cross_entropy) * (1.0 / batch_size)
         # Update val metrics
         evaluate_metric(active_labels_test, active_logits_test)
         tmp_accuracy = evaluate_metric.result()
         # Logging
         test_bar.update(test_step + 1,
                         values=[('test_loss', float(loss)),
                                 ('test_acc', float(tmp_accuracy))])
     cost_time = time.time() - evaluation_start_time
     with open(os.path.join(self.saved_model_path, 'evaluate.log'),
               'w') as f:
         f.write(
             f'Evaluation cost time: {cost_time}\nEvaluate loss: {loss}\n'
             f'Evaluate accuracy: {tmp_accuracy}')
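The padding-mask pattern above (sequence lengths from non-zero input_ids, tf.sequence_mask, tf.boolean_mask) is the core of token-level evaluation. A self-contained toy sketch of the same pattern, with made-up shapes unrelated to this project's data:

import tensorflow as tf

# Toy batch: two sequences padded to length 4 with id 0.
input_ids = tf.constant([[5, 8, 2, 0],
                         [7, 0, 0, 0]])
labels = tf.constant([[1, 2, 1, 0],
                      [3, 0, 0, 0]])
logits = tf.random.normal([2, 4, 5])  # [batch, seq_len, num_tags]

# Count non-pad tokens, build a boolean mask, and keep only the
# active (non-padding) positions, exactly as in the loop above.
seq_lens = tf.math.reduce_sum(
    tf.cast(tf.math.not_equal(input_ids, 0), tf.int32), axis=-1)
mask = tf.sequence_mask(seq_lens, maxlen=tf.shape(logits)[1])
active_logits = tf.boolean_mask(logits, mask)  # [num_active_tokens, 5]
active_labels = tf.boolean_mask(labels, mask)  # [num_active_tokens]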