def test_rekey(self):
  """Checks prep.rekey over full, partial, identity, and None key maps."""
  source = tf.data.Dataset.from_tensors({
      'text': 'That is good.',
      'other': 'That is bad.'
  })

  # Full remap: both output keys drawn from existing input keys.
  swapped = prep.rekey(source, {'inputs': 'other', 'targets': 'text'})
  assert_dataset(swapped, {
      'inputs': 'That is bad.',
      'targets': 'That is good.'
  })

  # Partial maps keep only the listed key.
  targets_only = prep.rekey(source, {'targets': 'text'})
  assert_dataset(targets_only, {'targets': 'That is good.'})

  inputs_only = prep.rekey(source, {'inputs': 'text'})
  assert_dataset(inputs_only, {'inputs': 'That is good.'})

  # No key map: the dataset passes through unchanged.
  passthrough = prep.rekey(source)
  assert_dataset(passthrough, {
      'text': 'That is good.',
      'other': 'That is bad.'
  })

  # A None source maps the output key to an empty string.
  with_empty = prep.rekey(source, {'inputs': 'text', 'targets': None})
  assert_dataset(with_empty, {'inputs': 'That is good.', 'targets': ''})
def c4_bare_preprocess_fn(dataset,
                          training=True,
                          spm_path=None,
                          copy_plaintext=True,
                          sequence_length=None):
  """Returns a dataset that contains 'inputs' and 'targets' from C4."""
  # Route the raw text into 'targets'; 'inputs' gets no source key.
  dataset = t5_processors.rekey(
      dataset, key_map={'targets': 'text', 'inputs': None})

  # Build the SentencePiece vocabulary used for tokenization.
  vocab = t5_spc_vocab.SentencePieceVocabulary(
      sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
  feature = t5_utils.Feature(vocab)
  output_features = {'targets': feature, 'inputs': feature}

  # Tokenize the string features into token ids.
  dataset = t5_utils.encode_string_features(
      dataset,
      output_features,
      keys=output_features,
      copy_plaintext=copy_plaintext)

  # Apply the unsupervised preprocessors; the exact set is configured via gin.
  dataset = t5_processors.unsupervised(
      dataset,
      sequence_length=sequence_length,
      output_features=output_features)

  # Append EOS tokens to the output features.
  return add_eos_to_output_features(dataset, training)
def c4_bare_preprocess_fn(dataset,
                          training=True,
                          spm_path=None,
                          copy_plaintext=True,
                          sequence_length=None):
  """Returns a dataset that contains 'inputs' and 'targets' from C4."""
  # Route the raw text into 'targets'; 'inputs' gets no source key.
  dataset = t5_processors.rekey(
      dataset, key_map={'targets': 'text', 'inputs': None})

  # Build the SentencePiece vocabulary used for tokenization.
  vocab = t5_spc_vocab.SentencePieceVocabulary(
      sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
  feature = t5_data.Feature(vocab)
  output_features = {'targets': feature, 'inputs': feature}

  keys = output_features

  def encode_strings(features):
    """Encodes the selected string features to int64 token ids.

    Keeps a '<key>_plaintext' copy of the raw string when copy_plaintext
    is set; non-string and unselected features pass through untouched.
    """
    encoded = {}
    for name, value in features.items():
      if name in keys and value.dtype == tf.string:
        if copy_plaintext:
          # Preserve the original text alongside the token ids.
          encoded['%s_plaintext' % name] = value
        value = tf.cast(
            output_features[name].vocabulary.encode_tf(value), tf.int64)
      encoded[name] = value
    return encoded

  # Tokenize the targets (and any other selected string features).
  dataset = dataset.map(
      encode_strings, num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Apply the unsupervised preprocessors; the exact set is configured via gin.
  dataset = t5_processors.unsupervised(
      dataset,
      sequence_length=sequence_length,
      output_features=output_features)

  # Append EOS tokens to the output features.
  dataset = add_eos_to_output_features(dataset, training)

  # Truncate then pad so every example ends up with the same shape.
  dataset = truncate_dataset_on_len(dataset, training, sequence_length, True)
  return pad_dataset_to_length(dataset, training, sequence_length)