# Shared imports assumed by the snippets below; the module paths follow the
# TensorFlow Model Garden layout.
import os
import sys

import tensorflow as tf

from official.nlp.data import sentence_prediction_dataloader
from official.nlp.tasks import sentence_prediction

# The dataloader tests below refer to the same module via a shorter alias.
loader = sentence_prediction_dataloader


def write_xtreme_classification(task,
                                model,
                                input_file,
                                output_file,
                                predict_batch_size,
                                seq_length,
                                class_names,
                                translated_input_file=None,
                                test_time_aug_wgt=0.3):
  """Makes classification predictions for xtreme and writes to output file."""
  data_config = sentence_prediction_dataloader.SentencePredictionDataConfig(
      input_path=input_file,
      seq_length=seq_length,
      is_training=False,
      label_type='int',
      global_batch_size=predict_batch_size,
      drop_remainder=False,
      include_example_id=True)
  if translated_input_file is not None:
    data_config_aug = (
        sentence_prediction_dataloader.SentencePredictionDataConfig(
            input_path=translated_input_file,
            seq_length=seq_length,
            is_training=False,
            label_type='int',
            global_batch_size=predict_batch_size,
            drop_remainder=False,
            include_example_id=True))
  else:
    data_config_aug = None
  predictions = sentence_prediction.predict(task, data_config, model,
                                            data_config_aug, test_time_aug_wgt)
  with tf.io.gfile.GFile(output_file, 'w') as writer:
    for prediction in predictions:
      writer.write('%s\n' % class_names[prediction])
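
# A minimal usage sketch (not from the source): the paths and class names are
# illustrative assumptions, and `task`/`model` are assumed to come from a
# SentencePredictionTask and task.build_model(), as in the tests below.
write_xtreme_classification(
    task=task,
    model=model,
    input_file='/tmp/xnli_test.tf_record',
    output_file='/tmp/xnli_predictions.txt',
    predict_batch_size=32,
    seq_length=128,
    class_names=['entailment', 'neutral', 'contradiction'],
    translated_input_file='/tmp/xnli_test_translated.tf_record',
    test_time_aug_wgt=0.3)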
Example #2
  def test_prediction(self, num_classes):
    task_config = sentence_prediction.SentencePredictionConfig(
        model=self.get_model_config(num_classes=num_classes),
        train_data=self._train_data_config)
    task = sentence_prediction.SentencePredictionTask(task_config)
    model = task.build_model()

    test_data_path = os.path.join(self.get_temp_dir(), "test.tf_record")
    seq_length = 16
    num_examples = 100
    _create_fake_dataset(
        test_data_path,
        seq_length=seq_length,
        num_classes=num_classes,
        num_examples=num_examples)

    test_data_config = (
        sentence_prediction_dataloader.SentencePredictionDataConfig(
            input_path=test_data_path,
            seq_length=seq_length,
            is_training=False,
            label_type="int" if num_classes > 1 else "float",
            global_batch_size=16,
            drop_remainder=False,
            include_example_id=True))

    predictions = sentence_prediction.predict(task, test_data_config, model)
    self.assertLen(predictions, num_examples)
    for prediction in predictions:
      self.assertEqual(prediction.dtype,
                       tf.int64 if num_classes > 1 else tf.float32)
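
# The tests in this listing call a `_create_fake_dataset` helper that is not
# shown. A hypothetical sketch of such a helper, assuming the dataloader reads
# `input_ids`/`input_mask`/`segment_ids` keys from the TFRecord (the loader
# tests below show them re-emitted as input_word_ids/input_mask/input_type_ids):
import numpy as np
import tensorflow as tf


def _int_feature(values):
  return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))


def _float_feature(values):
  return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))


def _create_fake_dataset(output_path, seq_length, num_classes, num_examples):
  """Hypothetical test helper: writes random examples to a TFRecord file."""
  with tf.io.TFRecordWriter(output_path) as writer:
    for example_id in range(num_examples):
      input_ids = np.random.randint(100, size=(seq_length,))
      features = {
          'input_ids': _int_feature(input_ids),
          'input_mask': _int_feature(np.ones_like(input_ids)),
          'segment_ids': _int_feature(np.zeros_like(input_ids)),
          'example_id': _int_feature([example_id]),
      }
      if num_classes > 1:
        # Classification: a random integer class label.
        features['label_ids'] = _int_feature([np.random.randint(num_classes)])
      else:
        # num_classes == 1 is treated as regression with a float label.
        features['label_ids'] = _float_feature([np.random.uniform()])
      example = tf.train.Example(features=tf.train.Features(feature=features))
      writer.write(example.SerializeToString())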
Example #3
  def test_np_metrics_cola_partial_batch(self):
    train_data_path = os.path.join(self.get_temp_dir(), "train.tf_record")
    num_examples = 5
    global_batch_size = 8
    seq_length = 16
    _create_fake_dataset(
        train_data_path,
        seq_length=seq_length,
        num_classes=2,
        num_examples=num_examples)

    train_data_config = (
        sentence_prediction_dataloader.SentencePredictionDataConfig(
            input_path=train_data_path,
            seq_length=seq_length,
            is_training=True,
            label_type="int",
            global_batch_size=global_batch_size,
            drop_remainder=False,
            include_example_id=True))

    config = sentence_prediction.SentencePredictionConfig(
        metric_type="matthews_corrcoef",
        model=self.get_model_config(2),
        train_data=train_data_config)
    outputs = self._run_task(config)
    self.assertEqual(outputs["sentence_prediction"].shape.as_list(), [8, 1])
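
# matthews_corrcoef is a numpy-side metric, which is why the partial final
# batch matters: the padding examples added to fill the last batch must be
# dropped before scoring. A rough sketch of that computation with scikit-learn
# (function and argument names here are illustrative, not the task's code):
import numpy as np
from sklearn.metrics import matthews_corrcoef


def masked_mcc(labels, logits, num_real_examples):
  """Illustrative: MCC over argmax predictions, ignoring padding examples."""
  preds = np.argmax(np.asarray(logits), axis=-1).reshape(-1)
  labels = np.asarray(labels).reshape(-1)
  return matthews_corrcoef(labels[:num_real_examples],
                           preds[:num_real_examples])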
Example #4
  def test_load_dataset_with_label_mapping(self):
    input_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
    batch_size = 10
    seq_length = 128
    _create_fake_preprocessed_dataset(input_path, seq_length, 'int')
    data_config = loader.SentencePredictionDataConfig(
        input_path=input_path,
        seq_length=seq_length,
        global_batch_size=batch_size,
        label_type='int',
        label_name=('label_ids', 'next_sentence_labels'))
    dataset = loader.SentencePredictionDataLoader(data_config).load()
    features = next(iter(dataset))
    self.assertCountEqual([
        'input_word_ids', 'input_mask', 'input_type_ids',
        'next_sentence_labels', 'label_ids'
    ], features.keys())
    self.assertEqual(features['input_word_ids'].shape,
                     (batch_size, seq_length))
    self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
    self.assertEqual(features['input_type_ids'].shape,
                     (batch_size, seq_length))
    self.assertEqual(features['label_ids'].shape, (batch_size,))
    self.assertEqual(features['label_ids'].dtype, tf.int32)
    self.assertEqual(features['next_sentence_labels'].shape, (batch_size,))
    self.assertEqual(features['next_sentence_labels'].dtype, tf.int32)
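
# Both loader tests rely on a `_create_fake_preprocessed_dataset` helper that
# this listing omits. A hypothetical sketch, reusing the _int_feature and
# _float_feature helpers from the earlier sketch; the extra
# next_sentence_labels feature is what the label-mapping test above reads:
def _create_fake_preprocessed_dataset(output_path, seq_length, label_type):
  """Hypothetical test helper: writes already-tokenized fake examples."""
  with tf.io.TFRecordWriter(output_path) as writer:
    for _ in range(100):
      input_ids = np.random.randint(100, size=(seq_length,))
      features = {
          'input_ids': _int_feature(input_ids),
          'input_mask': _int_feature(np.ones_like(input_ids)),
          'segment_ids': _int_feature(np.zeros_like(input_ids)),
      }
      if label_type == 'int':
        features['label_ids'] = _int_feature([np.random.randint(2)])
        features['next_sentence_labels'] = _int_feature(
            [np.random.randint(2)])
      else:
        features['label_ids'] = _float_feature([np.random.uniform()])
      example = tf.train.Example(features=tf.train.Features(feature=features))
      writer.write(example.SerializeToString())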
Example #5
def write_superglue_classification(task,
                                   model,
                                   input_file,
                                   output_file,
                                   predict_batch_size,
                                   seq_length,
                                   class_names,
                                   label_type='int'):
  """Makes classification predictions for superglue and writes to output file.

  Args:
    task: `Task` instance.
    model: `keras.Model` instance.
    input_file: Input test data file path.
    output_file: Output test data file path.
    predict_batch_size: Batch size for prediction.
    seq_length: Input sequence length.
    class_names: List of string class names.
    label_type: String denoting label type; only 'int' (classification) is
      supported. Defaults to 'int'.
  """
  if label_type != 'int':
    raise ValueError('Unsupported `label_type`. Given: %s, expected `int`.' %
                     label_type)

  data_config = sentence_prediction_dataloader.SentencePredictionDataConfig(
      input_path=input_file,
      global_batch_size=predict_batch_size,
      is_training=False,
      seq_length=seq_length,
      label_type=label_type,
      drop_remainder=False,
      include_example_id=True)
  predictions = sentence_prediction.predict(task, data_config, model)

  with tf.io.gfile.GFile(output_file, 'w') as writer:
    for index, prediction in enumerate(predictions):
      # Classification: class_names[prediction] is substituted verbatim, so
      # each entry is expected to already be a valid JSON token.
      writer.write('{"idx": %d, "label": %s}\n' %
                   (index, class_names[prediction]))
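
# A minimal usage sketch for a binary SuperGLUE task such as BoolQ (paths and
# class names are illustrative assumptions, not values from the source); this
# produces JSON lines like {"idx": 0, "label": true}.
write_superglue_classification(
    task=task,
    model=model,
    input_file='/tmp/boolq_test.tf_record',
    output_file='/tmp/BoolQ.jsonl',
    predict_batch_size=32,
    seq_length=128,
    class_names=['false', 'true'])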
Example #6
  def test_load_dataset(self, label_type, expected_label_type):
    input_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
    batch_size = 10
    seq_length = 128
    _create_fake_preprocessed_dataset(input_path, seq_length, label_type)
    data_config = loader.SentencePredictionDataConfig(
        input_path=input_path,
        seq_length=seq_length,
        global_batch_size=batch_size,
        label_type=label_type)
    dataset = loader.SentencePredictionDataLoader(data_config).load()
    features, labels = next(iter(dataset))
    self.assertCountEqual(['input_word_ids', 'input_mask', 'input_type_ids'],
                          features.keys())
    self.assertEqual(features['input_word_ids'].shape,
                     (batch_size, seq_length))
    self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
    self.assertEqual(features['input_type_ids'].shape,
                     (batch_size, seq_length))
    self.assertEqual(labels.shape, (batch_size,))
    self.assertEqual(labels.dtype, expected_label_type)
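
# The (label_type, expected_label_type) arguments suggest test_load_dataset is
# parameterized; the decorator is not shown in this listing, but it presumably
# looks something like the following (the case names are assumptions):
from absl.testing import parameterized

@parameterized.named_parameters(('int_labels', 'int', tf.int32),
                                ('float_labels', 'float', tf.float32))
def test_load_dataset(self, label_type, expected_label_type):
  ...  # body as above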
Example #7
  def setUp(self):
    super(SentencePredictionTaskTest, self).setUp()
    self._train_data_config = (
        sentence_prediction_dataloader.SentencePredictionDataConfig(
            input_path="dummy", seq_length=128, global_batch_size=1))

def write_glue_classification(task,
                              model,
                              input_file,
                              output_file,
                              predict_batch_size,
                              seq_length,
                              class_names,
                              label_type='int',
                              min_float_value=None,
                              max_float_value=None):
  """Makes classification predictions for glue and writes to output file.

  Args:
    task: `Task` instance.
    model: `keras.Model` instance.
    input_file: Input test data file path.
    output_file: Output test data file path.
    predict_batch_size: Batch size for prediction.
    seq_length: Input sequence length.
    class_names: List of string class names.
    label_type: String denoting label type ('int', 'float'), defaults to 'int'.
    min_float_value: If set, predictions will be min-clipped to this value (only
      for regression when `label_type` is set to 'float'). Defaults to `None`
      (no clipping).
    max_float_value: If set, predictions will be max-clipped to this value (only
      for regression when `label_type` is set to 'float'). Defaults to `None`
      (no clipping).
  """
  if label_type not in ('int', 'float'):
    raise ValueError('Unsupported `label_type`. Given: %s, expected `int` or '
                     '`float`.' % label_type)

  data_config = sentence_prediction_dataloader.SentencePredictionDataConfig(
      input_path=input_file,
      global_batch_size=predict_batch_size,
      is_training=False,
      seq_length=seq_length,
      label_type=label_type,
      drop_remainder=False,
      include_example_id=True)
  predictions = sentence_prediction.predict(task, data_config, model)

  if label_type == 'float':
    min_float_value = (-sys.float_info.max
                       if min_float_value is None else min_float_value)
    max_float_value = (
        sys.float_info.max if max_float_value is None else max_float_value)

    # Clip predictions to range [min_float_value, max_float_value].
    predictions = [
        min(max(prediction, min_float_value), max_float_value)
        for prediction in predictions
    ]

  with tf.io.gfile.GFile(output_file, 'w') as writer:
    writer.write('index\tprediction\n')
    for index, prediction in enumerate(predictions):
      if label_type == 'float':
        # Regression.
        writer.write('%d\t%.3f\n' % (index, prediction))
      else:
        # Classification.
        writer.write('%d\t%s\n' % (index, class_names[prediction]))
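
# A minimal usage sketch for the regression path (an STS-B-style task whose
# gold scores lie in [0, 5]); the paths here are illustrative assumptions.
write_glue_classification(
    task=task,
    model=model,
    input_file='/tmp/stsb_test.tf_record',
    output_file='/tmp/STS-B.tsv',
    predict_batch_size=32,
    seq_length=128,
    class_names=None,  # unused when label_type='float'
    label_type='float',
    min_float_value=0.0,
    max_float_value=5.0)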