Example #1
    def test_rekey(self):
        """rekey should rename features according to the given key map."""
        og_dataset = tf.data.Dataset.from_tensors({
            'text': 'That is good.',
            'other': 'That is bad.'
        })
        dataset = prep.rekey(og_dataset, {
            'inputs': 'other',
            'targets': 'text'
        })
        assert_dataset(dataset, {
            'inputs': 'That is bad.',
            'targets': 'That is good.'
        })

        # Keys absent from the key map are dropped from the output.
        dataset = prep.rekey(og_dataset, {'targets': 'text'})
        assert_dataset(dataset, {'targets': 'That is good.'})

        dataset = prep.rekey(og_dataset, {'inputs': 'text'})
        assert_dataset(dataset, {'inputs': 'That is good.'})

        # With no key map, the dataset passes through unchanged.
        dataset = prep.rekey(og_dataset)
        assert_dataset(dataset, {
            'text': 'That is good.',
            'other': 'That is bad.'
        })

        # Mapping a new key to None yields an empty-string placeholder.
        dataset = prep.rekey(og_dataset, {'inputs': 'text', 'targets': None})
        assert_dataset(dataset, {'inputs': 'That is good.', 'targets': ''})
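
The assertions above pin down rekey's contract: each key_map entry maps a new feature name to an existing one, features not named in the map are dropped, passing no key map is a no-op, and mapping a key to None yields an empty string. A minimal reimplementation consistent with that contract might look like the following (a sketch written against the test, not necessarily the library's actual code):

import tensorflow as tf

def rekey(dataset, key_map=None):
    """Renames features of `dataset` according to `key_map` (sketch).

    Each entry maps a new key to an existing key; a value of None maps
    the new key to an empty string. Without a key map, the dataset is
    returned unchanged.
    """
    if not key_map:
        return dataset

    def _rekey(example):
        return {
            new_key: example[old_key] if old_key else ''
            for new_key, old_key in key_map.items()
        }

    return dataset.map(_rekey,
                       num_parallel_calls=tf.data.experimental.AUTOTUNE)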
Example #2
def c4_bare_preprocess_fn(dataset,
                          training=True,
                          spm_path=None,
                          copy_plaintext=True,
                          sequence_length=None):
    """Returns a dataset that contains 'inputs' and 'targets' from C4."""
    # Set target key to be equal to the text content.
    dataset = t5_processors.rekey(dataset,
                                  key_map={
                                      'targets': 'text',
                                      'inputs': None
                                  })

    # Vocabulary for tokenization.
    vocab = t5_spc_vocab.SentencePieceVocabulary(
        sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
    feature = t5_utils.Feature(vocab)
    output_features = {'targets': feature, 'inputs': feature}

    # Tokenize the targets.
    dataset = t5_utils.encode_string_features(dataset,
                                              output_features,
                                              keys=output_features,
                                              copy_plaintext=copy_plaintext)

    # Preprocess the tokens - the exact preprocessors are set via gin.
    dataset = t5_processors.unsupervised(dataset,
                                         sequence_length=sequence_length,
                                         output_features=output_features)

    # Add EOS.
    dataset = add_eos_to_output_features(dataset, training)

    return dataset
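
For context, this preprocess function is meant to be applied to a raw C4 tf.data.Dataset whose examples carry a 'text' feature. A hypothetical invocation could look like the following (the TFDS name, split, and sequence lengths are illustrative placeholders; sequence_length follows the T5 convention of a per-feature length dict):

import tensorflow_datasets as tfds

# Illustrative only: load raw C4 text and run the bare preprocessor on it.
raw_dataset = tfds.load('c4/en', split='train', shuffle_files=True)
preprocessed = c4_bare_preprocess_fn(
    raw_dataset,
    training=True,
    spm_path=None,  # falls back to t5_utils.DEFAULT_SPM_PATH
    sequence_length={'inputs': 512, 'targets': 512})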
Example #3
def c4_bare_preprocess_fn(dataset,
                          training=True,
                          spm_path=None,
                          copy_plaintext=True,
                          sequence_length=None):
    """Returns a dataset that contains 'inputs' and 'targets' from C4."""
    # Set target key to be equal to the text content.
    dataset = t5_processors.rekey(dataset,
                                  key_map={
                                      'targets': 'text',
                                      'inputs': None
                                  })

    # Vocabulary for tokenization.
    vocab = t5_spc_vocab.SentencePieceVocabulary(
        sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
    feature = t5_data.Feature(vocab)
    output_features = {'targets': feature, 'inputs': feature}

    # Tokenize the targets.
    keys = output_features  # iterating this dict yields the feature names

    def encode_string_features_fn(features):
        """Encodes all specified features that are strings.

        Args:
          features: a dict of feature tensors.

        Returns:
          a dict in which string features listed in `keys` are replaced by
          int64 token ids (keeping the original text under '<key>_plaintext'
          when `copy_plaintext` is set).
        """
        ret = {}
        for k, v in features.items():
            if k in keys and v.dtype == tf.string:
                if copy_plaintext:
                    ret['%s_plaintext' % k] = v
                v = tf.cast(output_features[k].vocabulary.encode_tf(v),
                            tf.int64)
            ret[k] = v
        return ret

    dataset = dataset.map(encode_string_features_fn,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Preprocess the tokens - the exact preprocessors are set via gin.
    dataset = t5_processors.unsupervised(dataset,
                                         sequence_length=sequence_length,
                                         output_features=output_features)

    # Add EOS.
    dataset = add_eos_to_output_features(dataset, training)

    # Truncate and then pad the examples -- all examples have the same shape.
    dataset = truncate_dataset_on_len(dataset, training, sequence_length, True)
    dataset = pad_dataset_to_length(dataset, training, sequence_length)

    return dataset
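
Example #3 is the same pipeline as Example #2 with the tokenization step inlined rather than delegated to t5_utils.encode_string_features, plus a final truncate-and-pad pass so every example comes out with a fixed shape. A padding step with that effect could be sketched roughly as follows (a hypothetical stand-in for truncate_dataset_on_len and pad_dataset_to_length, not the library's code):

import tensorflow as tf

def truncate_and_pad_sketch(dataset, sequence_length):
    """Truncates and zero-pads each configured 1-D feature (sketch)."""

    def _fix_shape(example):
        fixed = {}
        for key, value in example.items():
            if key in sequence_length:
                length = sequence_length[key]
                value = value[:length]                    # truncate to length
                pad_amount = length - tf.shape(value)[0]
                value = tf.pad(value, [[0, pad_amount]])  # right-pad with zeros
            fixed[key] = value
        return fixed

    return dataset.map(_fix_shape,
                       num_parallel_calls=tf.data.experimental.AUTOTUNE)

Features not listed in sequence_length (such as the '<key>_plaintext' strings kept by copy_plaintext) pass through unchanged.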