def build_inputs(self, params: cfg.DataConfig, input_context=None):
  """Builds a tf.data.Dataset for the sentence_prediction task.

  When `params.input_path` is the sentinel string 'dummy', an infinite
  synthetic dataset is returned: all-zero input features and random
  per-token labels in [-1, num_classes). Label value -1 is intentionally
  included so that loss/metrics code which ignores -1 gets exercised.
  Otherwise the registered data loader for `params` is used.

  Args:
    params: Data config for this task.
    input_context: Optional tf.distribute.InputContext for sharding.

  Returns:
    A tf.data.Dataset of (features, labels).
  """
  if params.input_path != 'dummy':
    return data_loader_factory.get_data_loader(params).load(input_context)

  def _synthetic_example(_):
    zeros = tf.zeros((1, params.seq_length), dtype=tf.int32)
    features = dict(
        input_word_ids=zeros, input_mask=zeros, input_type_ids=zeros)
    # Include some label_id as -1, which will be ignored in loss/metrics.
    labels = tf.random.uniform(
        shape=(1, params.seq_length),
        minval=-1,
        maxval=len(self.task_config.class_names),
        dtype=tf.dtypes.int32)
    return (features, labels)

  ds = tf.data.Dataset.range(1).repeat()
  return ds.map(
      _synthetic_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
def build_inputs(self, params, input_context=None):
  """Builds a tf.data.Dataset for pretraining.

  NOTE: the dummy-data branch is copied from masked_lm.py for testing.
  When `params.input_path` is the sentinel string 'dummy', an infinite
  stream of all-zero masked-LM features is produced; otherwise the
  registered data loader for `params` is used.

  Args:
    params: Data config for this task.
    input_context: Optional tf.distribute.InputContext for sharding.

  Returns:
    A tf.data.Dataset of feature dicts.
  """
  if params.input_path != 'dummy':
    return data_loader_factory.get_data_loader(params).load(input_context)

  def _synthetic_example(_):
    zeros_seq = tf.zeros((1, params.seq_length), dtype=tf.int32)
    zeros_lm = tf.zeros((1, params.max_predictions_per_seq), dtype=tf.int32)
    return dict(
        input_word_ids=zeros_seq,
        input_mask=zeros_seq,
        input_type_ids=zeros_seq,
        masked_lm_positions=zeros_lm,
        masked_lm_ids=zeros_lm,
        masked_lm_weights=tf.cast(zeros_lm, dtype=tf.float32),
        next_sentence_labels=tf.zeros((1, 1), dtype=tf.int32))

  ds = tf.data.Dataset.range(1).repeat()
  return ds.map(
      _synthetic_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
def build_inputs(self, params, input_context=None):
  """Returns tf.data.Dataset producing span-prediction labels.

  Fixed docstring: the original claimed "sentence_prediction task", but
  the labels emitted here are `start_positions`/`end_positions` spans
  (question-answering style), not sentence-level classes.

  When `params.input_path` is the sentinel string 'dummy', an infinite
  synthetic dataset is returned for unit tests. Otherwise, training reads
  `params` directly, while evaluation is redirected to the preprocessed
  tf_record file recorded in `self._tf_record_input_path`.

  Args:
    params: Data config for this task.
    input_context: Optional tf.distribute.InputContext for sharding.

  Returns:
    A tf.data.Dataset of (features, labels).
  """
  if params.input_path == 'dummy':
    # Dummy training data for unit test.
    def dummy_data(_):
      dummy_ids = tf.zeros((1, params.seq_length), dtype=tf.int32)
      x = dict(
          input_word_ids=dummy_ids,
          input_mask=dummy_ids,
          input_type_ids=dummy_ids)
      y = dict(
          start_positions=tf.constant(0, dtype=tf.int32),
          end_positions=tf.constant(1, dtype=tf.int32))
      return (x, y)

    dataset = tf.data.Dataset.range(1)
    dataset = dataset.repeat()
    dataset = dataset.map(
        dummy_data, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    return dataset

  if params.is_training:
    dataloader_params = params
  else:
    # Evaluation reads from the padded tf_record written earlier.
    input_path = self._tf_record_input_path
    dataloader_params = params.replace(input_path=input_path)
  return data_loader_factory.get_data_loader(dataloader_params).load(
      input_context)
def build_dataset(self):
  """Creates the training and evaluation datasets.

  Fixed comment typos from the original ("None distributed" ->
  "Non-distributed", "Ddistributed" -> "Distributed").

  Sets `self.train_dataset` and `self.eval_dataset` to distributed
  datasets built via `self.strategy`. When the training input_path is the
  sentinel 'dummy', both are set to None instead.
  """
  # Returns None when the input_path is 'dummy'.
  if self.train_dataset_config.input_path == 'dummy':
    self.train_dataset = None
    self.eval_dataset = None
    return
  # Non-distributed datasets.
  train_dataset = data_loader_factory.get_data_loader(
      self.train_dataset_config).load()
  eval_dataset = data_loader_factory.get_data_loader(
      self.eval_dataset_config).load()
  # Distributed datasets, one per-replica pipeline under the strategy.
  self.train_dataset = orbit.utils.make_distributed_dataset(
      self.strategy, train_dataset)
  self.eval_dataset = orbit.utils.make_distributed_dataset(
      self.strategy, eval_dataset)
def build_inputs(self, params: cfg.DataConfig,
                 input_context: Optional[tf.distribute.InputContext] = None):
  """Returns a dataset.

  Training reads from `params` as given. Evaluation is redirected to the
  padded tf_record file (clearing the TFDS source and enabling unique
  ids). In both cases the sentencepiece model path is injected from
  `self._sentencepiece_model_path`.
  """
  if params.is_training:
    dataloader_params = params
  else:
    # Read from padded tf records instead.
    dataloader_params = params.replace(
        input_path=self._tf_record_input_path,
        tfds_name="",
        tfds_split="",
        has_unique_id=True)
  dataloader_params = dataloader_params.replace(
      sentencepiece_model_path=self._sentencepiece_model_path)
  loader = data_loader_factory.get_data_loader(dataloader_params)
  return loader.load(input_context)
def build_inputs(self, params, input_context=None):
  """Builds a tf.data.Dataset for the sentence_prediction task.

  The sentinel input_path 'dummy' yields an infinite synthetic stream
  generated by `self._dummy_data`. Otherwise training reads `params`
  directly, while evaluation is redirected to the tf_record file recorded
  in `self._tf_record_input_path`.
  """
  if params.input_path == 'dummy':
    make_dummy = functools.partial(self._dummy_data, params)
    return (
        tf.data.Dataset.range(1)
        .repeat()
        .map(make_dummy, num_parallel_calls=tf.data.experimental.AUTOTUNE))

  if params.is_training:
    loader_params = params
  else:
    # Evaluation reads from the preprocessed tf_record instead.
    loader_params = params.replace(input_path=self._tf_record_input_path)
  return data_loader_factory.get_data_loader(loader_params).load(
      input_context)
def build_inputs(self, params, input_context=None):
  """Builds a tf.data.Dataset for the sentence_prediction task.

  When `params.input_path` is the sentinel string 'dummy', an infinite
  synthetic dataset of all-zero features and all-zero integer labels is
  returned; otherwise the registered data loader for `params` is used.
  """
  if params.input_path != 'dummy':
    return data_loader_factory.get_data_loader(params).load(input_context)

  def _synthetic_example(_):
    zeros = tf.zeros((1, params.seq_length), dtype=tf.int32)
    features = dict(
        input_word_ids=zeros, input_mask=zeros, input_type_ids=zeros)
    label = tf.zeros((1, 1), dtype=tf.int32)
    return (features, label)

  ds = tf.data.Dataset.range(1).repeat()
  return ds.map(
      _synthetic_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
def write_test_record(params, model_dir):
  """Writes the test input to a tfrecord file.

  Loads the raw (untransformed, unbatched) dataset, writes one Example per
  sample with a `unique_id` plus source/target language byte features, and
  then pads the file with empty dummy examples so the total count is an
  exact multiple of `params.global_batch_size`.

  Fixes over the original:
    * Padding used `batch_size - total % batch_size`, which appended a
      whole extra batch of dummies when the count was already aligned;
      `-total % batch_size` yields 0 in that case.
    * The writer is now a context manager, so the file is closed (and
      flushed) even if iteration raises.

  Args:
    params: Data config; must provide `src_lang`, `tgt_lang`, and
      `global_batch_size`, and support `.replace(transform_and_batch=...)`.
    model_dir: Directory in which "eval.tf_record" is written.

  Returns:
    A (references, output_file) tuple: the decoded target strings for the
    real samples, and the path of the written tfrecord.
  """
  # Get raw data from tfds.
  params = params.replace(transform_and_batch=False)
  dataset = data_loader_factory.get_data_loader(params).load()
  references = []
  total_samples = 0
  output_file = os.path.join(model_dir, "eval.tf_record")

  def _int64_feature(value):
    # Single-int64 Feature helper.
    return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))

  def _bytes_feature(value):
    # Single-bytes Feature helper.
    return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))

  with tf.io.TFRecordWriter(output_file) as writer:
    for d in dataset:
      references.append(d[params.tgt_lang].numpy().decode())
      example = tf.train.Example(
          features=tf.train.Features(
              feature={
                  "unique_id": _int64_feature(total_samples),
                  params.src_lang: _bytes_feature(d[params.src_lang].numpy()),
                  params.tgt_lang: _bytes_feature(d[params.tgt_lang].numpy()),
              }))
      writer.write(example.SerializeToString())
      total_samples += 1

    # Pad to a full batch with empty dummy examples (0 when aligned).
    batch_size = params.global_batch_size
    num_dummy_example = -total_samples % batch_size
    for i in range(num_dummy_example):
      example = tf.train.Example(
          features=tf.train.Features(
              feature={
                  "unique_id": _int64_feature(total_samples + i),
                  params.src_lang: _bytes_feature(b""),
                  params.tgt_lang: _bytes_feature(b""),
              }))
      writer.write(example.SerializeToString())
  return references, output_file
def build_inputs(self, params, input_context=None):
  """Builds a tf.data.Dataset for the sentence_prediction task.

  When `params.input_path` is the sentinel string 'dummy', an infinite
  synthetic dataset is returned: all-zero features with a zero label
  stored under `self.label_field` (float for a single-output regression
  head, int otherwise). Otherwise the registered data loader for `params`
  is used.
  """
  if params.input_path != 'dummy':
    return data_loader_factory.get_data_loader(params).load(input_context)

  def _synthetic_example(_):
    zeros = tf.zeros((1, params.seq_length), dtype=tf.int32)
    features = dict(
        input_word_ids=zeros, input_mask=zeros, input_type_ids=zeros)
    # num_classes == 1 means a regression head, which takes float labels.
    if self.task_config.model.num_classes == 1:
      label = tf.zeros((1,), dtype=tf.float32)
    else:
      label = tf.zeros((1, 1), dtype=tf.int32)
    features[self.label_field] = label
    return features

  ds = tf.data.Dataset.range(1).repeat()
  return ds.map(
      _synthetic_example, num_parallel_calls=tf.data.experimental.AUTOTUNE)
def build_inputs(data_params, input_context=None):
  """Returns tf.data.Dataset for sentence_prediction task."""
  loader = data_loader_factory.get_data_loader(data_params)
  return loader.load(input_context)
def test_register_and_load(self):
  """A registered config resolves to a loader carrying its params."""
  config = MyDataConfig()
  loader = data_loader_factory.get_data_loader(config)
  self.assertTrue(loader.params.is_training)