def test_optional_features(self):
  def _dummy_preprocessor(output):
    return lambda _: tf.data.Dataset.from_tensors(output)

  default_vocab = test_utils.sentencepiece_vocab()
  features = {
      "inputs": utils.Feature(vocabulary=default_vocab, required=False),
      "targets": utils.Feature(vocabulary=default_vocab, required=True),
  }

  test_utils.add_task(
      "text_missing_optional_feature",
      test_utils.get_fake_dataset,
      output_features=features,
      text_preprocessor=_dummy_preprocessor({"targets": "a"}))
  TaskRegistry.get_dataset(
      "text_missing_optional_feature", {"targets": 13},
      "train", use_cached=False)

  test_utils.add_task(
      "text_missing_required_feature",
      test_utils.get_fake_dataset,
      output_features=features,
      text_preprocessor=_dummy_preprocessor({"inputs": "a"}))
  with self.assertRaisesRegex(
      ValueError,
      "Task dataset is missing expected output feature after text "
      "preprocessing: targets"):
    TaskRegistry.get_dataset(
        "text_missing_required_feature", {"inputs": 13},
        "train", use_cached=False)
def test_no_eos(self):
  features = {
      "inputs": utils.Feature(add_eos=True),
      "targets": utils.Feature(add_eos=False),
  }
  test_utils.add_task(
      "task_no_eos", test_utils.get_fake_dataset, output_features=features)
  fn_task = TaskRegistry.get("task_no_eos")
  test_utils.verify_task_matches_fake_datasets(fn_task, use_cached=False)
def test_no_eos(self):
  default_vocab = test_utils.sentencepiece_vocab()
  features = {
      "inputs": utils.Feature(add_eos=True, vocabulary=default_vocab),
      "targets": utils.Feature(add_eos=False, vocabulary=default_vocab),
  }
  test_utils.add_task(
      "task_no_eos", test_utils.get_fake_dataset, output_features=features)
  fn_task = TaskRegistry.get("task_no_eos")
  test_utils.verify_task_matches_fake_datasets(fn_task, use_cached=False)
def c4_bare_preprocess_fn(dataset,
                          training=True,
                          spm_path=None,
                          copy_plaintext=True,
                          sequence_length=None):
  """Returns a dataset that contains 'inputs' and 'targets' from C4."""
  # Set target key to be equal to the text content.
  dataset = t5_processors.rekey(
      dataset, key_map={'targets': 'text', 'inputs': None})

  # Vocabulary for tokenization.
  vocab = t5_spc_vocab.SentencePieceVocabulary(
      sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
  feature = t5_utils.Feature(vocab)
  output_features = {'targets': feature, 'inputs': feature}

  # Tokenize the targets.
  dataset = t5_utils.encode_string_features(
      dataset, output_features, keys=output_features,
      copy_plaintext=copy_plaintext)

  # Preprocess the tokens - the exact preprocessors are set via gin.
  dataset = t5_processors.unsupervised(
      dataset, sequence_length=sequence_length,
      output_features=output_features)

  # Add EOS.
  dataset = add_eos_to_output_features(dataset, training)

  return dataset
def c4_bare_preprocess_fn(dataset,
                          training=True,
                          spm_path=None,
                          copy_plaintext=True,
                          sequence_length=None):
  """Returns a dataset that contains 'inputs' and 'targets' from C4."""
  # Set target key to be equal to the text content.
  dataset = t5_processors.rekey(
      dataset, key_map={'targets': 'text', 'inputs': None})

  # Vocabulary for tokenization.
  vocab = t5_spc_vocab.SentencePieceVocabulary(
      sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
  feature = t5_utils.Feature(vocab)
  output_features = {'targets': feature, 'inputs': feature}

  # Tokenize the targets.
  keys = output_features

  def encode_string_features_fn(features):
    """Encodes all specified features that are strings; returns a dictionary.

    Args:
      features: a dictionary of feature name to tensor.

    Returns:
      a dictionary with string features encoded as int64 token ids.
    """
    ret = {}
    for k, v in features.items():
      if k in keys and v.dtype == tf.string:
        if copy_plaintext:
          ret['%s_plaintext' % k] = v
        v = tf.cast(output_features[k].vocabulary.encode_tf(v), tf.int64)
      ret[k] = v
    return ret

  dataset = dataset.map(encode_string_features_fn,
                        num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Preprocess the tokens - the exact preprocessors are set via gin.
  dataset = t5_processors.unsupervised(
      dataset, sequence_length=sequence_length,
      output_features=output_features)

  # Add EOS.
  dataset = add_eos_to_output_features(dataset, training)

  # Truncate and then pad the examples -- all examples have the same shape.
  dataset = truncate_dataset_on_len(dataset, training, sequence_length, True)
  dataset = pad_dataset_to_length(dataset, training, sequence_length)

  return dataset
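# Hedged illustration (not part of the original code): the inlined
# `encode_string_features_fn` above follows a common tf.data pattern --
# map a per-example dict, encode the string fields to int64 token ids, and
# optionally keep the plaintext under a "<key>_plaintext" field. The toy
# below shows the same pattern with a stand-in "tokenizer"
# (tf.strings.unicode_decode) so it runs without a SentencePiece model;
# the real code uses output_features[k].vocabulary.encode_tf instead.
import tensorflow as tf

def _toy_encode_fn(features, keys=('targets',), copy_plaintext=True):
  ret = {}
  for k, v in features.items():
    if k in keys and v.dtype == tf.string:
      if copy_plaintext:
        ret['%s_plaintext' % k] = v
      # Stand-in tokenizer: unicode code points instead of SentencePiece ids.
      v = tf.cast(tf.strings.unicode_decode(v, 'UTF-8'), tf.int64)
    ret[k] = v
  return ret

toy_ds = tf.data.Dataset.from_tensor_slices({'targets': ['abc', 'de']})
toy_ds = toy_ds.map(_toy_encode_fn,
                    num_parallel_calls=tf.data.experimental.AUTOTUNE)
# Each element now has an int64 'targets' plus the original 'targets_plaintext'.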
def add_tfds_task(name,
                  tfds_name="fake:0.0.0",
                  text_preprocessor=test_text_preprocessor,
                  token_preprocessor=None,
                  splits=None):
  TaskRegistry.add(
      name,
      dataset_utils.TfdsTask,
      tfds_name=tfds_name,
      text_preprocessor=text_preprocessor,
      token_preprocessor=token_preprocessor,
      output_features=dataset_utils.Feature(sentencepiece_vocab()),
      metric_fns=[],
      splits=splits)
def add_task(name,
             dataset_fn,
             text_preprocessor=test_text_preprocessor,
             token_preprocessor=None,
             splits=("train", "validation"),
             **kwargs):
  if "output_features" not in kwargs:
    kwargs["output_features"] = dataset_utils.Feature(sentencepiece_vocab())
  TaskRegistry.add(
      name,
      dataset_fn=dataset_fn,
      splits=splits,
      text_preprocessor=text_preprocessor,
      token_preprocessor=token_preprocessor,
      metric_fns=[],
      **kwargs)
def test_denoise(self):
  tf.set_random_seed(55)
  vocab = test_utils.sentencepiece_vocab()
  target_tokens = vocab.encode('The quick brown fox.')

  # This is what it encodes to.
  self.assertEqual(
      target_tokens,
      [3, 2, 20, 4, 3, 2, 8, 13, 2, 3, 2, 23, 7, 19, 22, 3, 2, 7, 2])

  og_dataset = tf.data.Dataset.from_tensor_slices({
      'targets': [target_tokens],
  })
  output_features = {
      'targets': utils.Feature(vocab),
  }

  # These are the parameters of denoise in the operative config of 'base'.
  # Except noise_density, bumped up from 0.15 to 0.3 in order to demonstrate
  # multiple corrupted spans.
  denoised_dataset = prep.denoise(
      og_dataset,
      output_features,
      noise_density=0.3,
      noise_mask_fn=prep.random_spans_noise_mask,
      inputs_fn=prep.noise_span_to_unique_sentinel,
      targets_fn=prep.nonnoise_span_to_unique_sentinel)

  # Two spans corrupted, [2] and [22, 3, 2, 7, 2], replaced by unique
  # sentinels 25 and 24 respectively.
  assert_dataset(denoised_dataset, [
      {
          'inputs': [3, 25, 20, 4, 3, 2, 8, 13, 2, 3, 2, 23, 7, 19, 24],
          'targets': [25, 2, 24, 22, 3, 2, 7, 2],
      },
  ])
def generic_text_dataset_preprocess_fn(dataset,
                                       text_preprocess_fn=None,
                                       spm_path=None,
                                       copy_plaintext=False):
  """Applies a text preprocess fn and tokenizes the dataset."""

  # The assumption is that `text_preprocess_fn` finally gives us a dataset
  # which has `inputs` and `targets`.
  if text_preprocess_fn is not None:
    dataset = text_preprocess_fn(dataset)

  # Vocabulary for tokenization.
  vocab = t5_spc_vocab.SentencePieceVocabulary(
      sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
  feature = t5_utils.Feature(vocab)
  output_features = {'targets': feature, 'inputs': feature}

  # Tokenize the inputs and targets.
  dataset = t5_utils.encode_string_features(
      dataset, output_features, keys=output_features,
      copy_plaintext=copy_plaintext)

  return dataset
def generic_text_dataset_preprocess_fn(dataset,
                                       training=True,
                                       text_preprocess_fns=None,
                                       token_preprocess_fns=None,
                                       spm_path=None,
                                       copy_plaintext=False,
                                       debug_print_examples=False,
                                       debug_print_examples_rate=0.01):
  """Pre-processes, tokenizes and post-processes a `tf.data.Dataset`.

  Args:
    dataset: `tf.data.Dataset` to process.
    training: boolean, set to True if training, False otherwise.
    text_preprocess_fns: None or list of callables: (`tf.data.Dataset`, bool)
      -> `tf.data.Dataset`; these operate before tokenization. Typically used
      to select which fields we want to learn over, or to change something
      into "text to text" form.
    token_preprocess_fns: None or list of callables: (`tf.data.Dataset`, bool)
      -> `tf.data.Dataset`; these operate after tokenization. Since they can
      view the tokenized fields, they can be used to filter on length, etc.
    spm_path: None or str, path to a sentencepiece model to use for
      tokenization; by default uses the 32k vocabulary from T5.
    copy_plaintext: bool, if True retains the original fields after
      tokenization.
    debug_print_examples: bool, if True this prints examples to the logging
      stream for inspection, both before and after tokenization.
    debug_print_examples_rate: float in [0, 1.0]; on average this fraction of
      dataset examples will be printed out in each phase, i.e. pre- and
      post-tokenization.

  Returns:
    a `tf.data.Dataset` with all the preprocessing and tokenization performed.
  """

  # The assumption is that `text_preprocess_fns` finally gives us a dataset
  # which has `inputs` and `targets`.
  if text_preprocess_fns is not None:
    for text_preprocess_fn in text_preprocess_fns:
      dataset = text_preprocess_fn(dataset, training)

  # Print debugging examples if needed before tokenization.
  if debug_print_examples:
    def print_examples(x):
      if np.random.uniform() < debug_print_examples_rate:
        tf.print(x, output_stream=logging.info)
      return x
    dataset = dataset.map(print_examples)

  # Vocabulary for tokenization.
  vocab = t5_spc_vocab.SentencePieceVocabulary(
      sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
  feature = t5_utils.Feature(vocab)
  output_features = {'targets': feature, 'inputs': feature}

  # Tokenize the inputs and targets.
  dataset = t5_utils.encode_string_features(
      dataset, output_features, keys=output_features,
      copy_plaintext=copy_plaintext)

  # Apply the token-preprocessors.
  if token_preprocess_fns is not None:
    for token_preprocess_fn in token_preprocess_fns:
      dataset = token_preprocess_fn(dataset, training)

  if debug_print_examples:
    def print_examples_and_shapes(x):
      if np.random.uniform() < debug_print_examples_rate:
        tf.print(
            {
                'inputs_shape': tf.size(x['inputs']),
                'targets_shape': tf.size(x['targets']),
                'inputs': x['inputs'],
                'targets': x['targets'],
            },
            output_stream=logging.info)
      return x
    dataset = dataset.map(print_examples_and_shapes)

  return dataset
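# Hedged usage sketch (added for illustration; not part of the original
# module). It only demonstrates the callable signatures that the docstring
# above describes: both text_preprocess_fns and token_preprocess_fns are
# lists of (tf.data.Dataset, bool) -> tf.data.Dataset callables, applied in
# order before and after tokenization respectively. Running the real
# generic_text_dataset_preprocess_fn additionally requires a SentencePiece
# model (spm_path or t5_utils.DEFAULT_SPM_PATH), which is assumed here.
import tensorflow as tf

def rekey_to_inputs_targets(dataset, training):
  """Text-level preprocessor: runs before tokenization."""
  del training  # Unused in this toy example.
  # 'question' / 'answer' are hypothetical raw-field names.
  return dataset.map(
      lambda x: {'inputs': x['question'], 'targets': x['answer']})

def filter_short_targets(dataset, training):
  """Token-level preprocessor: runs after tokenization, sees token ids."""
  del training
  return dataset.filter(lambda x: tf.size(x['targets']) > 1)

# With a SentencePiece model file available, the call would look like:
# dataset = generic_text_dataset_preprocess_fn(
#     dataset,
#     training=True,
#     text_preprocess_fns=[rekey_to_inputs_targets],
#     token_preprocess_fns=[filter_short_targets],
#     spm_path=spm_path,  # assumed path to a sentencepiece model
#     debug_print_examples=True)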
def setUp(self):
  super().setUp()
  self.maxDiff = None  # pylint:disable=invalid-name

  # Mock TFDS
  # Note we don't use mock.Mock since they fail to pickle.
  fake_tfds_paths = {
      "train": [
          {  # pylint:disable=g-complex-comprehension
              "filename": "train.tfrecord-%05d-of-00002" % i,
              "skip": 0,
              "take": -1
          }
          for i in range(2)],
      "validation": [
          {
              "filename": "validation.tfrecord-00000-of-00001",
              "skip": 0,
              "take": -1
          }],
  }

  def _load_shard(shard_instruction):
    fname = shard_instruction["filename"]
    if "train" in fname:
      if fname.endswith("00000-of-00002"):
        return get_fake_dataset("train").take(2)
      else:
        return get_fake_dataset("train").skip(2)
    else:
      return get_fake_dataset("validation")

  fake_tfds = FakeLazyTfds(
      name="fake:0.0.0",
      load=get_fake_dataset,
      load_shard=_load_shard,
      info=FakeTfdsInfo(splits={"train": None, "validation": None}),
      files=fake_tfds_paths.get,
      size=lambda x: 30 if x == "train" else 10)
  self._tfds_patcher = mock.patch(
      "t5.data.utils.LazyTfdsLoader", new=mock.Mock(return_value=fake_tfds))
  self._tfds_patcher.start()

  # Set up data directory.
  self.test_tmpdir = self.get_tempdir()
  self.test_data_dir = os.path.join(self.test_tmpdir, "test_data")
  shutil.copytree(TEST_DATA_DIR, self.test_data_dir)
  for root, dirs, _ in os.walk(self.test_data_dir):
    for d in dirs + [""]:
      os.chmod(os.path.join(root, d), 0o777)

  # Register a cached test Task.
  dataset_utils.set_global_cache_dirs([self.test_data_dir])
  clear_tasks()
  add_tfds_task("cached_task")

  # Prepare cached task.
  self.cached_task = TaskRegistry.get("cached_task")
  cached_task_dir = os.path.join(self.test_data_dir, "cached_task")
  _dump_fake_dataset(
      os.path.join(cached_task_dir, "train.tfrecord"),
      _FAKE_TOKENIZED_DATASET["train"], [2, 1], _dump_examples_to_tfrecord)
  _dump_fake_dataset(
      os.path.join(cached_task_dir, "validation.tfrecord"),
      _FAKE_TOKENIZED_DATASET["validation"], [2], _dump_examples_to_tfrecord)

  # Prepare uncached TfdsTask.
  add_tfds_task("uncached_task")
  self.uncached_task = TaskRegistry.get("uncached_task")

  # Prepare uncached TextLineTask.
  _dump_fake_dataset(
      os.path.join(self.test_data_dir, "train.tsv"),
      _FAKE_DATASET["train"], [2, 1], _dump_examples_to_tsv)
  TaskRegistry.add(
      "text_line_task",
      dataset_utils.TextLineTask,
      split_to_filepattern={
          "train": os.path.join(self.test_data_dir, "train.tsv*"),
      },
      skip_header_lines=1,
      text_preprocessor=[_split_tsv_preprocessor, test_text_preprocessor],
      output_features=dataset_utils.Feature(sentencepiece_vocab()),
      metric_fns=[])
  self.text_line_task = TaskRegistry.get("text_line_task")