Example #1
# Assumed import for this snippet (not shown in the original):
# from t5.data import preprocessors

def token_noise(dataset, output_features, **unused_kwargs):
    # Corrupt tokens i.i.d. at a 15% rate, replacing each noised token in the
    # inputs with a sentinel; targets_fn=None keeps the original tokens as the
    # targets.
    return preprocessors.denoise(
        dataset,
        output_features,
        noise_density=0.15,
        noise_mask_fn=preprocessors.iid_noise_mask,
        inputs_fn=preprocessors.noise_token_to_sentinel,
        targets_fn=None)
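
For orientation, here is a rough, self-contained sketch of the i.i.d. corruption that token_noise configures: each position is independently marked as noise with probability noise_density (0.15 here), and noised positions are replaced by a single sentinel id in the inputs while the targets keep the original tokens. This is an illustrative re-implementation in plain Python (the helper name and the sentinel id 25 are made up for the example), not the T5 code path.

import random

def iid_token_noise_sketch(tokens, noise_density=0.15, sentinel_id=25, seed=0):
    # Illustrative only: mark each token as noise independently, then swap
    # noised tokens for a sentinel in the inputs; targets stay unchanged.
    rng = random.Random(seed)
    noise_mask = [rng.random() < noise_density for _ in tokens]
    inputs = [sentinel_id if masked else tok
              for tok, masked in zip(tokens, noise_mask)]
    return inputs, list(tokens)

inputs, targets = iid_token_noise_sketch([3, 2, 20, 4, 3, 2, 8, 13, 2])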
Example #2

  # Assumed surrounding test-file context, roughly (not part of the snippet):
  #   import tensorflow.compat.v1 as tf
  #   from t5.data import preprocessors as prep
  #   from t5.data import test_utils, utils
  #   from t5.data.test_utils import assert_dataset
  # The method below is written as a TestCase method (note self.assertEqual).
  def test_denoise(self):
    # Fix the graph-level seed so the sampled noise mask is deterministic.
    tf.set_random_seed(55)

    vocab = test_utils.sentencepiece_vocab()
    target_tokens = vocab.encode('The quick brown fox.')

    # This is what it encodes to.
    self.assertEqual(
        target_tokens,
        [3, 2, 20, 4, 3, 2, 8, 13, 2, 3, 2, 23, 7, 19, 22, 3, 2, 7, 2])

    og_dataset = tf.data.Dataset.from_tensor_slices({
        'targets': [target_tokens],
    })

    output_features = {
        'targets': utils.Feature(vocab),
    }

    # These are the denoise parameters from the operative config of 'base',
    # except that noise_density is bumped from 0.15 to 0.3 in order to
    # demonstrate multiple corrupted spans.
    denoised_dataset = prep.denoise(
        og_dataset,
        output_features,
        noise_density=0.3,
        noise_mask_fn=prep.random_spans_noise_mask,
        inputs_fn=prep.noise_span_to_unique_sentinel,
        targets_fn=prep.nonnoise_span_to_unique_sentinel)

    # Two spans are corrupted, [2] and [22, 3, 2, 7, 2], and they are replaced
    # in the inputs by the unique sentinels 25 and 24 respectively; the targets
    # list each sentinel followed by the span it replaced (see the illustrative
    # sketch after this example).
    assert_dataset(denoised_dataset, [
        {
            'inputs': [
                3, 25, 20, 4, 3, 2, 8, 13, 2, 3, 2, 23, 7, 19, 24
            ],
            'targets': [
                25, 2, 24, 22, 3, 2, 7, 2
            ],
        },
    ])
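
To make the asserted values concrete, here is a small illustrative sketch (plain Python, not the T5 implementation; the helper name and the hard-coded mask are made up for this example) of how a fixed span mask yields the inputs/targets above: each contiguous run of masked tokens collapses to a fresh sentinel counting down from 25, the inputs mask the noise spans, and the targets mask the non-noise spans.

def spans_to_sentinels_sketch(tokens, mask, first_sentinel=25):
    # Illustrative only: collapse every contiguous run of masked tokens to a
    # single sentinel id, counting down; unmasked tokens pass through.
    out, sentinel, prev_masked = [], first_sentinel, False
    for tok, masked in zip(tokens, mask):
        if masked:
            if not prev_masked:      # first token of a new masked span
                out.append(sentinel)
                sentinel -= 1
        else:
            out.append(tok)
        prev_masked = masked
    return out

tokens = [3, 2, 20, 4, 3, 2, 8, 13, 2, 3, 2, 23, 7, 19, 22, 3, 2, 7, 2]
noise = [False, True] + [False] * 12 + [True] * 5   # spans [2] and [22, 3, 2, 7, 2]
inputs = spans_to_sentinels_sketch(tokens, noise)                    # noise spans -> sentinels
targets = spans_to_sentinels_sketch(tokens, [not m for m in noise])  # non-noise spans -> sentinels
# inputs  == [3, 25, 20, 4, 3, 2, 8, 13, 2, 3, 2, 23, 7, 19, 24]
# targets == [25, 2, 24, 22, 3, 2, 7, 2]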