Example #1
    def test_rekey(self):
        """rekey should rename features according to the given key map."""
        og_dataset = tf.data.Dataset.from_tensors({
            'text': 'That is good.',
            'other': 'That is bad.'
        })
        dataset = prep.rekey(og_dataset, {
            'inputs': 'other',
            'targets': 'text'
        })
        assert_dataset(dataset, {
            'inputs': 'That is bad.',
            'targets': 'That is good.'
        })

        # Keys absent from the key map are dropped from the output.
        dataset = prep.rekey(og_dataset, {'targets': 'text'})
        assert_dataset(dataset, {'targets': 'That is good.'})

        dataset = prep.rekey(og_dataset, {'inputs': 'text'})
        assert_dataset(dataset, {'inputs': 'That is good.'})

        # With no key map, the dataset passes through unchanged.
        dataset = prep.rekey(og_dataset)
        assert_dataset(dataset, {
            'text': 'That is good.',
            'other': 'That is bad.'
        })

        # Mapping a new key to None yields an empty-string placeholder.
        dataset = prep.rekey(og_dataset, {'inputs': 'text', 'targets': None})
        assert_dataset(dataset, {'inputs': 'That is good.', 'targets': ''})
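
The assertions above pin down rekey's contract: each key_map entry maps a new feature name to an existing one, features not named in the map are dropped, passing no key map is a no-op, and mapping a key to None yields an empty string. A minimal reimplementation consistent with that contract might look like the following (a sketch written against the test, not necessarily the library's actual code):

import tensorflow as tf

def rekey(dataset, key_map=None):
    """Renames features of `dataset` according to `key_map` (sketch).

    Each entry maps a new key to an existing key; a value of None maps
    the new key to an empty string. Without a key map, the dataset is
    returned unchanged.
    """
    if not key_map:
        return dataset

    def _rekey(example):
        return {
            new_key: example[old_key] if old_key else ''
            for new_key, old_key in key_map.items()
        }

    return dataset.map(_rekey,
                       num_parallel_calls=tf.data.experimental.AUTOTUNE)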
Example #2
def c4_bare_preprocess_fn(dataset,
                          training=True,
                          spm_path=None,
                          copy_plaintext=True,
                          sequence_length=None):
    """Returns a dataset that contains 'inputs' and 'targets' from C4."""
    # Set target key to be equal to the text content.
    dataset = t5_processors.rekey(dataset,
                                  key_map={
                                      'targets': 'text',
                                      'inputs': None
                                  })

    # Vocabulary for tokenization.
    vocab = t5_spc_vocab.SentencePieceVocabulary(
        sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
    feature = t5_utils.Feature(vocab)
    output_features = {'targets': feature, 'inputs': feature}

    # Tokenize the targets.
    dataset = t5_utils.encode_string_features(dataset,
                                              output_features,
                                              keys=output_features,
                                              copy_plaintext=copy_plaintext)

    # Preprocess the tokens - the exact preprocessors are set via gin.
    dataset = t5_processors.unsupervised(dataset,
                                         sequence_length=sequence_length,
                                         output_features=output_features)

    # Add EOS.
    dataset = add_eos_to_output_features(dataset, training)

    return dataset
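
For context, this preprocess function is meant to be applied to a raw C4 tf.data.Dataset whose examples carry a 'text' feature. A hypothetical invocation could look like the following (the TFDS name, split, and sequence lengths are illustrative placeholders; sequence_length follows the T5 convention of a per-feature length dict):

import tensorflow_datasets as tfds

# Illustrative only: load raw C4 text and run the bare preprocessor on it.
raw_dataset = tfds.load('c4/en', split='train', shuffle_files=True)
preprocessed = c4_bare_preprocess_fn(
    raw_dataset,
    training=True,
    spm_path=None,  # falls back to t5_utils.DEFAULT_SPM_PATH
    sequence_length={'inputs': 512, 'targets': 512})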
Example #3
def c4_bare_preprocess_fn(dataset,
                          training=True,
                          spm_path=None,
                          copy_plaintext=True,
                          sequence_length=None):
    """Returns a dataset that contains 'inputs' and 'targets' from C4."""
    # Set target key to be equal to the text content.
    dataset = t5_processors.rekey(dataset,
                                  key_map={
                                      'targets': 'text',
                                      'inputs': None
                                  })

    # Vocabulary for tokenization.
    vocab = t5_spc_vocab.SentencePieceVocabulary(
        sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
    feature = t5_data.Feature(vocab)
    output_features = {'targets': feature, 'inputs': feature}

    # Tokenize the targets.
    keys = output_features  # iterating this dict yields the feature names

    def encode_string_features_fn(features):
        """Encodes all specified features that are strings.

        Args:
          features: a dict of feature tensors.

        Returns:
          a dict in which string features listed in `keys` are replaced by
          int64 token ids (keeping the original text under '<key>_plaintext'
          when `copy_plaintext` is set).
        """
        ret = {}
        for k, v in features.items():
            if k in keys and v.dtype == tf.string:
                if copy_plaintext:
                    ret['%s_plaintext' % k] = v
                v = tf.cast(output_features[k].vocabulary.encode_tf(v),
                            tf.int64)
            ret[k] = v
        return ret

    dataset = dataset.map(encode_string_features_fn,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Preprocess the tokens - the exact preprocessors are set via gin.
    dataset = t5_processors.unsupervised(dataset,
                                         sequence_length=sequence_length,
                                         output_features=output_features)

    # Add EOS.
    dataset = add_eos_to_output_features(dataset, training)

    # Truncate and then pad the examples -- all examples have the same shape.
    dataset = truncate_dataset_on_len(dataset, training, sequence_length, True)
    dataset = pad_dataset_to_length(dataset, training, sequence_length)

    return dataset
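
Example #3 is the same pipeline as Example #2 with the tokenization step inlined rather than delegated to t5_utils.encode_string_features, plus a final truncate-and-pad pass so every example comes out with a fixed shape. A padding step with that effect could be sketched roughly as follows (a hypothetical stand-in for truncate_dataset_on_len and pad_dataset_to_length, not the library's code):

import tensorflow as tf

def truncate_and_pad_sketch(dataset, sequence_length):
    """Truncates and zero-pads each configured 1-D feature (sketch)."""

    def _fix_shape(example):
        fixed = {}
        for key, value in example.items():
            if key in sequence_length:
                length = sequence_length[key]
                value = value[:length]                    # truncate to length
                pad_amount = length - tf.shape(value)[0]
                value = tf.pad(value, [[0, pad_amount]])  # right-pad with zeros
            fixed[key] = value
        return fixed

    return dataset.map(_fix_shape,
                       num_parallel_calls=tf.data.experimental.AUTOTUNE)

Features not listed in sequence_length (such as the '<key>_plaintext' strings kept by copy_plaintext) pass through unchanged.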