def token_noise(dataset, output_features, **unused_kwargs):
  """Replaces ~15% of tokens i.i.d. with a sentinel; targets keep the original sequence."""
  return preprocessors.denoise(
      dataset,
      output_features,
      noise_density=0.15,
      noise_mask_fn=preprocessors.iid_noise_mask,
      inputs_fn=preprocessors.noise_token_to_sentinel,
      targets_fn=None)
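# A minimal usage sketch (not part of the original source): assuming the same
# helpers used in the test below (`test_utils.sentencepiece_vocab`,
# `utils.Feature`, and a TF1-compat `tf`) are importable here, the
# preprocessor could be exercised like this:
#
#   vocab = test_utils.sentencepiece_vocab()
#   ds = tf.data.Dataset.from_tensor_slices(
#       {'targets': [vocab.encode('The quick brown fox.')]})
#   noised = token_noise(ds, {'targets': utils.Feature(vocab)})
#   # Each example gains an 'inputs' field with roughly 15% of its tokens
#   # replaced by a sentinel id, while 'targets' keeps the original tokens.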
def test_denoise(self):
  tf.set_random_seed(55)
  vocab = test_utils.sentencepiece_vocab()
  target_tokens = vocab.encode('The quick brown fox.')
  # This is what it encodes to.
  self.assertEqual(
      target_tokens,
      [3, 2, 20, 4, 3, 2, 8, 13, 2, 3, 2, 23, 7, 19, 22, 3, 2, 7, 2])
  og_dataset = tf.data.Dataset.from_tensor_slices({
      'targets': [target_tokens],
  })
  output_features = {
      'targets': utils.Feature(vocab),
  }
  # These are the parameters of denoise in the operative config of 'base'.
  # Except noise_density, bumped up from 0.15 to 0.3 in order to demonstrate
  # multiple corrupted spans.
  denoised_dataset = prep.denoise(
      og_dataset,
      output_features,
      noise_density=0.3,
      noise_mask_fn=prep.random_spans_noise_mask,
      inputs_fn=prep.noise_span_to_unique_sentinel,
      targets_fn=prep.nonnoise_span_to_unique_sentinel)
  # Two spans corrupted, [2] and [22, 3, 2, 7, 2], replaced by unique
  # sentinels 25 and 24 respectively.
  assert_dataset(denoised_dataset, [
      {
          'inputs': [3, 25, 20, 4, 3, 2, 8, 13, 2, 3, 2, 23, 7, 19, 24],
          'targets': [25, 2, 24, 22, 3, 2, 7, 2],
      },
  ])
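# For reference (illustrative, not part of the test): span corruption maps a
# token sequence to an (inputs, targets) pair by swapping each noised span for
# a fresh sentinel id (allocated downward from the top of the vocabulary, hence
# 25 and 24 above) and listing, in targets, each sentinel followed by the
# tokens it replaced:
#
#   original:  x1 x2 x3 x4 x5 x6
#   inputs:    x1 <S1> x4 <S2>        (spans [x2, x3] and [x5, x6] noised)
#   targets:   <S1> x2 x3 <S2> x5 x6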