def c4_bare_preprocess_fn(dataset,
                          training=True,
                          spm_path=None,
                          copy_plaintext=True,
                          sequence_length=None):
  """Returns a dataset that contains 'inputs' and 'targets' from C4."""
  # Set target key to be equal to the text content.
  dataset = t5_processors.rekey(
      dataset, key_map={'targets': 'text', 'inputs': None})
  # Vocabulary for tokenization.
  vocab = t5_spc_vocab.SentencePieceVocabulary(
      sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
  feature = t5_utils.Feature(vocab)
  output_features = {'targets': feature, 'inputs': feature}
  # Tokenize the targets.
  dataset = t5_utils.encode_string_features(
      dataset,
      output_features,
      keys=output_features,
      copy_plaintext=copy_plaintext)
  # Preprocess the tokens - the exact preprocessors are set via gin.
  dataset = t5_processors.unsupervised(
      dataset,
      sequence_length=sequence_length,
      output_features=output_features)
  # Add EOS.
  dataset = add_eos_to_output_features(dataset, training)
  return dataset
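
# Illustrative only (the document text is made up): a minimal sketch of what
# the rekey step above produces for a single raw C4 example. Mapping a key to
# None in key_map yields an empty string, so 'inputs' comes out empty and only
# 'targets' carries the text.
import tensorflow as tf
from t5.data import preprocessors as t5_processors

ds = tf.data.Dataset.from_tensor_slices({'text': ['a made-up c4 document']})
ds = t5_processors.rekey(ds, key_map={'targets': 'text', 'inputs': None})
print(next(iter(ds)))
# -> {'targets': b'a made-up c4 document', 'inputs': b''} (as string tensors)
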
def c4_bare_preprocess_fn(dataset,
                          training=True,
                          spm_path=None,
                          copy_plaintext=True,
                          sequence_length=None):
  """Returns a dataset that contains 'inputs' and 'targets' from C4."""
  # Set target key to be equal to the text content.
  dataset = t5_processors.rekey(
      dataset, key_map={'targets': 'text', 'inputs': None})
  # Vocabulary for tokenization.
  vocab = t5_spc_vocab.SentencePieceVocabulary(
      sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
  feature = t5_data.Feature(vocab)
  output_features = {'targets': feature, 'inputs': feature}

  # Tokenize the features ('inputs' and 'targets').
  keys = output_features

  def encode_string_features_fn(features):
    """Encodes all specified features that are strings.

    Args:
      features: a dictionary of features.

    Returns:
      a dictionary with string features encoded as int64 token ids; if
      copy_plaintext is set, the original text is kept under '<key>_plaintext'.
    """
    ret = {}
    for k, v in features.items():
      if k in keys and v.dtype == tf.string:
        if copy_plaintext:
          ret['%s_plaintext' % k] = v
        v = tf.cast(output_features[k].vocabulary.encode_tf(v), tf.int64)
      ret[k] = v
    return ret

  dataset = dataset.map(
      encode_string_features_fn,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Preprocess the tokens - the exact preprocessors are set via gin.
  dataset = t5_processors.unsupervised(
      dataset,
      sequence_length=sequence_length,
      output_features=output_features)

  # Add EOS.
  dataset = add_eos_to_output_features(dataset, training)

  # Truncate and then pad the examples -- all examples have the same shape.
  dataset = truncate_dataset_on_len(dataset, training, sequence_length, True)
  dataset = pad_dataset_to_length(dataset, training, sequence_length)

  return dataset
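
# A minimal usage sketch, not part of the library: the unsupervised step only
# applies whatever preprocessors gin has been configured with (see the bindings
# in the test below), and the SentencePiece model path and sequence lengths
# here are placeholder values.
import tensorflow as tf

raw_ds = tf.data.Dataset.from_tensor_slices({'text': ['a made-up c4 document']})
processed_ds = c4_bare_preprocess_fn(
    raw_ds,
    training=True,
    spm_path='/path/to/my_spm.model',  # placeholder SentencePiece model file
    sequence_length={'inputs': 64, 'targets': 64})
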
def test_denoise_nested_decorators(self):
  """Test whether gin and utils.map_over_dataset decorators are compatible."""
  bindings = """
    preprocessors.unsupervised.preprocessors = [@preprocessors.denoise]
    preprocessors.denoise.noise_density = 0.15
    preprocessors.denoise.noise_mask_fn = @preprocessors.iid_noise_mask
    preprocessors.denoise.inputs_fn = @noise_token_to_sentinel
  """
  gin.parse_config(bindings)
  og_dataset = tf.data.Dataset.from_tensor_slices({'targets': [1, 2, 3]})
  output_features = {
      'targets': Feature(test_utils.sentencepiece_vocab())
  }
  # Test denoise function when it is used as a gin-configurable of another
  # gin-configurable, prep.unsupervised.
  dataset = prep.unsupervised(og_dataset, output_features=output_features)
  self.assertIsInstance(dataset, tf.data.Dataset)
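
# For context, a hypothetical preprocessor with the same decorator nesting the
# test exercises: gin.configurable stacked on utils.map_over_dataset, so the
# function is written against a single example but can be listed in
# preprocessors.unsupervised.preprocessors via gin. This is a sketch, not how
# denoise itself is implemented.
import gin
import tensorflow as tf
from t5.data import utils


@gin.configurable
@utils.map_over_dataset
def add_offset(example, offset=100):
  """Adds a gin-configurable offset to the integer 'targets' feature."""
  example['targets'] = example['targets'] + offset
  return example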