def write_xtreme_classification(task,
                                model,
                                input_file,
                                output_file,
                                predict_batch_size,
                                seq_length,
                                class_names,
                                translated_input_file=None,
                                test_time_aug_wgt=0.3):
  """Makes classification predictions for xtreme and writes to output file.

  Args:
    task: `Task` instance.
    model: `keras.Model` instance.
    input_file: Input test data file path.
    output_file: Output test data file path.
    predict_batch_size: Batch size for prediction.
    seq_length: Input sequence length.
    class_names: List of string class names.
    translated_input_file: Optional file path for the translated copy of the
      test data; if provided, predictions use test-time augmentation.
    test_time_aug_wgt: Test-time augmentation weight. The prediction score
      uses (1. - test_time_aug_wgt) * original prediction plus
      test_time_aug_wgt * augmented prediction.
  """
  data_config = sentence_prediction_dataloader.SentencePredictionDataConfig(
      input_path=input_file,
      seq_length=seq_length,
      is_training=False,
      label_type='int',
      global_batch_size=predict_batch_size,
      drop_remainder=False,
      include_example_id=True)
  if translated_input_file is not None:
    data_config_aug = (
        sentence_prediction_dataloader.SentencePredictionDataConfig(
            input_path=translated_input_file,
            seq_length=seq_length,
            is_training=False,
            label_type='int',
            global_batch_size=predict_batch_size,
            drop_remainder=False,
            include_example_id=True))
  else:
    data_config_aug = None
  predictions = sentence_prediction.predict(task, data_config, model,
                                            data_config_aug, test_time_aug_wgt)
  with tf.io.gfile.GFile(output_file, 'w') as writer:
    for prediction in predictions:
      writer.write('%s\n' % class_names[prediction])
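# Example call for `write_xtreme_classification` (a hypothetical sketch: the
# `task` and `model` objects and all file paths below are placeholders, not
# values defined in this module). When `translated_input_file` is supplied,
# the prediction scores blend the original and translated inputs according to
# `test_time_aug_wgt`.
#
# write_xtreme_classification(
#     task=task,
#     model=model,
#     input_file='/tmp/xnli_test.tf_record',
#     output_file='/tmp/xnli_predictions.txt',
#     predict_batch_size=32,
#     seq_length=128,
#     class_names=['entailment', 'neutral', 'contradiction'],
#     translated_input_file='/tmp/xnli_test_translated.tf_record',
#     test_time_aug_wgt=0.3)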
def test_prediction(self, num_classes):
  task_config = sentence_prediction.SentencePredictionConfig(
      model=self.get_model_config(num_classes=num_classes),
      train_data=self._train_data_config)
  task = sentence_prediction.SentencePredictionTask(task_config)
  model = task.build_model()

  test_data_path = os.path.join(self.get_temp_dir(), "test.tf_record")
  seq_length = 16
  num_examples = 100
  _create_fake_dataset(
      test_data_path,
      seq_length=seq_length,
      num_classes=num_classes,
      num_examples=num_examples)

  test_data_config = (
      sentence_prediction_dataloader.SentencePredictionDataConfig(
          input_path=test_data_path,
          seq_length=seq_length,
          is_training=False,
          label_type="int" if num_classes > 1 else "float",
          global_batch_size=16,
          drop_remainder=False,
          include_example_id=True))

  predictions = sentence_prediction.predict(task, test_data_config, model)
  self.assertLen(predictions, num_examples)
  for prediction in predictions:
    self.assertEqual(prediction.dtype,
                     tf.int64 if num_classes > 1 else tf.float32)
def test_np_metrics_cola_partial_batch(self):
  train_data_path = os.path.join(self.get_temp_dir(), "train.tf_record")
  num_examples = 5
  global_batch_size = 8
  seq_length = 16
  _create_fake_dataset(
      train_data_path,
      seq_length=seq_length,
      num_classes=2,
      num_examples=num_examples)

  train_data_config = (
      sentence_prediction_dataloader.SentencePredictionDataConfig(
          input_path=train_data_path,
          seq_length=seq_length,
          is_training=True,
          label_type="int",
          global_batch_size=global_batch_size,
          drop_remainder=False,
          include_example_id=True))

  config = sentence_prediction.SentencePredictionConfig(
      metric_type="matthews_corrcoef",
      model=self.get_model_config(2),
      train_data=train_data_config)
  outputs = self._run_task(config)
  self.assertEqual(outputs["sentence_prediction"].shape.as_list(), [8, 1])
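# The two tests above rely on a `_create_fake_dataset` helper that is not part
# of this excerpt. The sketch below is a hypothetical reconstruction: it
# assumes the serialized feature keys the data loader decodes ('input_ids',
# 'input_mask', 'segment_ids', 'label_ids', 'example_id') and assumes
# `import numpy as np` alongside the existing TensorFlow import.
def _create_fake_dataset(output_path, seq_length, num_classes, num_examples):
  """Writes `num_examples` random serialized examples to `output_path`."""

  def create_int_feature(values):
    return tf.train.Feature(int64_list=tf.train.Int64List(value=list(values)))

  def create_float_feature(values):
    return tf.train.Feature(float_list=tf.train.FloatList(value=list(values)))

  with tf.io.TFRecordWriter(output_path) as writer:
    for i in range(num_examples):
      input_ids = np.random.randint(100, size=(seq_length,))
      features = {
          'input_ids': create_int_feature(input_ids),
          'input_mask': create_int_feature(np.ones_like(input_ids)),
          'segment_ids': create_int_feature(np.zeros_like(input_ids)),
          'example_id': create_int_feature([i]),
      }
      if num_classes == 1:
        # Regression labels are floats.
        features['label_ids'] = create_float_feature([np.random.random()])
      else:
        # Classification labels are integers in [0, num_classes).
        features['label_ids'] = create_int_feature(
            [np.random.randint(num_classes)])
      tf_example = tf.train.Example(
          features=tf.train.Features(feature=features))
      writer.write(tf_example.SerializeToString())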
def test_load_dataset_with_label_mapping(self):
  input_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
  batch_size = 10
  seq_length = 128
  _create_fake_preprocessed_dataset(input_path, seq_length, 'int')
  data_config = loader.SentencePredictionDataConfig(
      input_path=input_path,
      seq_length=seq_length,
      global_batch_size=batch_size,
      label_type='int',
      label_name=('label_ids', 'next_sentence_labels'))
  dataset = loader.SentencePredictionDataLoader(data_config).load()
  features = next(iter(dataset))
  self.assertCountEqual([
      'input_word_ids', 'input_mask', 'input_type_ids',
      'next_sentence_labels', 'label_ids'
  ], features.keys())
  self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
  self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
  self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
  self.assertEqual(features['label_ids'].shape, (batch_size,))
  self.assertEqual(features['label_ids'].dtype, tf.int32)
  self.assertEqual(features['next_sentence_labels'].shape, (batch_size,))
  self.assertEqual(features['next_sentence_labels'].dtype, tf.int32)
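# `_create_fake_preprocessed_dataset` is likewise not part of this excerpt.
# A compact hypothetical version follows; it assumes the loader only needs
# 'input_ids', 'input_mask', 'segment_ids', and a 'label_ids' feature of the
# requested type in the file, with the ('label_ids', 'next_sentence_labels')
# mapping in the test above presumably applied by the loader at parse time
# rather than stored in the file.
def _create_fake_preprocessed_dataset(output_path, seq_length, label_type):
  """Writes random preprocessed examples with a 'label_ids' feature."""
  with tf.io.TFRecordWriter(output_path) as writer:
    for _ in range(100):
      feature = {
          'input_ids': tf.train.Feature(int64_list=tf.train.Int64List(
              value=np.random.randint(100, size=(seq_length,)).tolist())),
          'input_mask': tf.train.Feature(int64_list=tf.train.Int64List(
              value=[1] * seq_length)),
          'segment_ids': tf.train.Feature(int64_list=tf.train.Int64List(
              value=[0] * seq_length)),
      }
      if label_type == 'int':
        feature['label_ids'] = tf.train.Feature(
            int64_list=tf.train.Int64List(value=[1]))
      else:  # 'float'
        feature['label_ids'] = tf.train.Feature(
            float_list=tf.train.FloatList(value=[0.5]))
      example = tf.train.Example(features=tf.train.Features(feature=feature))
      writer.write(example.SerializeToString())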
def write_superglue_classification(task,
                                   model,
                                   input_file,
                                   output_file,
                                   predict_batch_size,
                                   seq_length,
                                   class_names,
                                   label_type='int'):
  """Makes classification predictions for superglue and writes to output file.

  Args:
    task: `Task` instance.
    model: `keras.Model` instance.
    input_file: Input test data file path.
    output_file: Output test data file path.
    predict_batch_size: Batch size for prediction.
    seq_length: Input sequence length.
    class_names: List of string class names.
    label_type: String denoting label type; only 'int' (classification) is
      supported, and it is the default.
  """
  if label_type != 'int':
    raise ValueError('Unsupported `label_type`. Given: %s, expected `int`.' %
                     label_type)

  data_config = sentence_prediction_dataloader.SentencePredictionDataConfig(
      input_path=input_file,
      global_batch_size=predict_batch_size,
      is_training=False,
      seq_length=seq_length,
      label_type=label_type,
      drop_remainder=False,
      include_example_id=True)
  predictions = sentence_prediction.predict(task, data_config, model)

  with tf.io.gfile.GFile(output_file, 'w') as writer:
    for index, prediction in enumerate(predictions):
      # Classification.
      writer.write('{"idx": %d, "label": %s}\n' %
                   (index, class_names[prediction]))
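# Example call for `write_superglue_classification` (a hypothetical sketch:
# `task`, `model`, and the paths are placeholders). Note that the label is
# written with an unquoted %s, so for valid JSON lines the class names should
# themselves be JSON literals, e.g. 'false'/'true' for a BoolQ-style task, or
# pre-quoted strings such as '"entailment"'.
#
# write_superglue_classification(
#     task=task,
#     model=model,
#     input_file='/tmp/boolq_test.tf_record',
#     output_file='/tmp/BoolQ.jsonl',
#     predict_batch_size=32,
#     seq_length=128,
#     class_names=['false', 'true'])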
def test_load_dataset(self, label_type, expected_label_type):
  input_path = os.path.join(self.get_temp_dir(), 'train.tf_record')
  batch_size = 10
  seq_length = 128
  _create_fake_preprocessed_dataset(input_path, seq_length, label_type)
  data_config = loader.SentencePredictionDataConfig(
      input_path=input_path,
      seq_length=seq_length,
      global_batch_size=batch_size,
      label_type=label_type)
  dataset = loader.SentencePredictionDataLoader(data_config).load()
  features, labels = next(iter(dataset))
  self.assertCountEqual(['input_word_ids', 'input_mask', 'input_type_ids'],
                        features.keys())
  self.assertEqual(features['input_word_ids'].shape, (batch_size, seq_length))
  self.assertEqual(features['input_mask'].shape, (batch_size, seq_length))
  self.assertEqual(features['input_type_ids'].shape, (batch_size, seq_length))
  self.assertEqual(labels.shape, (batch_size,))
  self.assertEqual(labels.dtype, expected_label_type)
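# `test_load_dataset` receives (label_type, expected_label_type) pairs, so in
# the full test file it is presumably driven by a parameterized decorator
# along these lines (a hypothetical sketch, assuming
# `from absl.testing import parameterized`):
#
# @parameterized.parameters(('int', tf.int32), ('float', tf.float32))
# def test_load_dataset(self, label_type, expected_label_type):
#   ...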
def setUp(self):
  super(SentencePredictionTaskTest, self).setUp()
  self._train_data_config = (
      sentence_prediction_dataloader.SentencePredictionDataConfig(
          input_path="dummy", seq_length=128, global_batch_size=1))
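# The task tests also reference a `get_model_config(num_classes)` helper that
# is not shown in this excerpt. A minimal sketch, assuming a small BERT
# encoder configured through `encoders.EncoderConfig` (the `encoders` import
# and the sizes below are assumptions, not taken from this module):
def get_model_config(self, num_classes):
  return sentence_prediction.ModelConfig(
      encoder=encoders.EncoderConfig(
          bert=encoders.BertEncoderConfig(vocab_size=30522, num_layers=1)),
      num_classes=num_classes)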
def write_glue_classification(task,
                              model,
                              input_file,
                              output_file,
                              predict_batch_size,
                              seq_length,
                              class_names,
                              label_type='int',
                              min_float_value=None,
                              max_float_value=None):
  """Makes classification predictions for glue and writes to output file.

  Args:
    task: `Task` instance.
    model: `keras.Model` instance.
    input_file: Input test data file path.
    output_file: Output test data file path.
    predict_batch_size: Batch size for prediction.
    seq_length: Input sequence length.
    class_names: List of string class names.
    label_type: String denoting label type ('int', 'float'), defaults to
      'int'.
    min_float_value: If set, predictions will be min-clipped to this value
      (only for regression when `label_type` is set to 'float'). Defaults to
      `None` (no clipping).
    max_float_value: If set, predictions will be max-clipped to this value
      (only for regression when `label_type` is set to 'float'). Defaults to
      `None` (no clipping).
  """
  if label_type not in ('int', 'float'):
    raise ValueError('Unsupported `label_type`. Given: %s, expected `int` or '
                     '`float`.' % label_type)

  data_config = sentence_prediction_dataloader.SentencePredictionDataConfig(
      input_path=input_file,
      global_batch_size=predict_batch_size,
      is_training=False,
      seq_length=seq_length,
      label_type=label_type,
      drop_remainder=False,
      include_example_id=True)
  predictions = sentence_prediction.predict(task, data_config, model)

  if label_type == 'float':
    min_float_value = (-sys.float_info.max
                       if min_float_value is None else min_float_value)
    max_float_value = (
        sys.float_info.max if max_float_value is None else max_float_value)

    # Clip predictions to the range [min_float_value, max_float_value].
    predictions = [
        min(max(prediction, min_float_value), max_float_value)
        for prediction in predictions
    ]

  with tf.io.gfile.GFile(output_file, 'w') as writer:
    writer.write('index\tprediction\n')
    for index, prediction in enumerate(predictions):
      if label_type == 'float':
        # Regression.
        writer.write('%d\t%.3f\n' % (index, prediction))
      else:
        # Classification.
        writer.write('%d\t%s\n' % (index, class_names[prediction]))
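# Example call for `write_glue_classification` on an STS-B-style regression
# task (a hypothetical sketch: `task`, `model`, and the paths are
# placeholders). STS-B similarity scores lie in [0, 5], so the clip bounds
# keep out-of-range predictions in the valid range; `class_names` is unused
# for regression:
#
# write_glue_classification(
#     task=task,
#     model=model,
#     input_file='/tmp/stsb_test.tf_record',
#     output_file='/tmp/STS-B.tsv',
#     predict_batch_size=32,
#     seq_length=128,
#     class_names=None,
#     label_type='float',
#     min_float_value=0.0,
#     max_float_value=5.0)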