Example #1
    def test_optional_features(self):
        def _dummy_preprocessor(output):
            return lambda _: tf.data.Dataset.from_tensors(output)

        default_vocab = test_utils.sentencepiece_vocab()
        features = {
            "inputs": utils.Feature(vocabulary=default_vocab, required=False),
            "targets": utils.Feature(vocabulary=default_vocab, required=True),
        }

        test_utils.add_task("text_missing_optional_feature",
                            test_utils.get_fake_dataset,
                            output_features=features,
                            text_preprocessor=_dummy_preprocessor(
                                {"targets": "a"}))
        TaskRegistry.get_dataset("text_missing_optional_feature",
                                 {"targets": 13},
                                 "train",
                                 use_cached=False)

        test_utils.add_task("text_missing_required_feature",
                            test_utils.get_fake_dataset,
                            output_features=features,
                            text_preprocessor=_dummy_preprocessor(
                                {"inputs": "a"}))
        with self.assertRaisesRegex(
                ValueError,
                "Task dataset is missing expected output feature after text "
                "preprocessing: targets"):
            TaskRegistry.get_dataset("text_missing_required_feature",
                                     {"inputs": 13},
                                     "train",
                                     use_cached=False)
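The check exercised above can be pictured with a short sketch. The following is a minimal illustration only, not the library's actual implementation, and validate_output_features is a hypothetical helper; it shows how a task could compare the features left after text preprocessing against its output_features, raising only for features marked required=True.

def validate_output_features(dataset, output_features):
    """Hypothetical check: required output features must survive preprocessing."""
    present = set(dataset.element_spec.keys())
    for name, feature in output_features.items():
        if feature.required and name not in present:
            raise ValueError(
                "Task dataset is missing expected output feature after text "
                "preprocessing: %s" % name)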
Example #2
 def test_no_eos(self):
     features = {
         "inputs": utils.Feature(add_eos=True),
         "targets": utils.Feature(add_eos=False),
     }
     test_utils.add_task("task_no_eos",
                         test_utils.get_fake_dataset,
                         output_features=features)
     fn_task = TaskRegistry.get("task_no_eos")
     test_utils.verify_task_matches_fake_datasets(fn_task, use_cached=False)
Example #3
 def test_no_eos(self):
     default_vocab = test_utils.sentencepiece_vocab()
     features = {
         "inputs": utils.Feature(add_eos=True, vocabulary=default_vocab),
         "targets": utils.Feature(add_eos=False, vocabulary=default_vocab),
     }
     test_utils.add_task("task_no_eos",
                         test_utils.get_fake_dataset,
                         output_features=features)
     fn_task = TaskRegistry.get("task_no_eos")
     test_utils.verify_task_matches_fake_datasets(fn_task, use_cached=False)
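A short sketch can make the per-feature add_eos flag concrete. The snippet below is illustrative only: append_eos is a hypothetical helper, and it assumes the SentencePiece vocabulary reserves id 1 for EOS, as the default T5 vocabulary does. Features whose spec sets add_eos=True get a trailing EOS id; the others pass through unchanged.

import functools

import tensorflow as tf

EOS_ID = 1  # assumption: EOS token id in the default T5 SentencePiece vocab

def append_eos(features, output_features):
    """Hypothetical map fn: add EOS only where the Feature spec asks for it."""
    ret = dict(features)
    for name, feat in output_features.items():
        if feat.add_eos:
            eos = tf.constant([EOS_ID], dtype=features[name].dtype)
            ret[name] = tf.concat([features[name], eos], axis=-1)
    return ret

# e.g. dataset.map(functools.partial(append_eos, output_features=features))
# turns {"inputs": [3, 8, 2], "targets": [4, 9]} into
# {"inputs": [3, 8, 2, 1], "targets": [4, 9]} for the specs above.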
Example #4
def c4_bare_preprocess_fn(dataset,
                          training=True,
                          spm_path=None,
                          copy_plaintext=True,
                          sequence_length=None):
    """Returns a dataset that contains 'inputs' and 'targets' from C4."""
    # Set target key to be equal to the text content.
    dataset = t5_processors.rekey(dataset,
                                  key_map={
                                      'targets': 'text',
                                      'inputs': None
                                  })

    # Vocabulary for tokenization.
    vocab = t5_spc_vocab.SentencePieceVocabulary(
        sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
    feature = t5_utils.Feature(vocab)
    output_features = {'targets': feature, 'inputs': feature}

    # Tokenize the targets.
    dataset = t5_utils.encode_string_features(dataset,
                                              output_features,
                                              keys=output_features,
                                              copy_plaintext=copy_plaintext)

    # Preprocess the tokens - the exact preprocessors are set via gin.
    dataset = t5_processors.unsupervised(dataset,
                                         sequence_length=sequence_length,
                                         output_features=output_features)

    # Add EOS.
    dataset = add_eos_to_output_features(dataset, training)

    return dataset
Example #5
def c4_bare_preprocess_fn(dataset,
                          training=True,
                          spm_path=None,
                          copy_plaintext=True,
                          sequence_length=None):
    """Returns a dataset that contains 'inputs' and 'targets' from C4."""
    # Set target key to be equal to the text content.
    dataset = t5_processors.rekey(dataset,
                                  key_map={
                                      'targets': 'text',
                                      'inputs': None
                                  })

    # Vocabulary for tokenization.
    vocab = t5_spc_vocab.SentencePieceVocabulary(
        sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
    feature = t5_utils.Feature(vocab)
    output_features = {'targets': feature, 'inputs': feature}

    # Tokenize the targets.
    keys = output_features

    def encode_string_features_fn(features):
        """Encodes all specified features that are strings and returns a dict.

        Args:
          features: a dictionary of features.

        Returns:
          a dictionary with the string features encoded as token ids.
        """
        ret = {}
        for k, v in features.items():
            if k in keys and v.dtype == tf.string:
                if copy_plaintext:
                    ret['%s_plaintext' % k] = v
                v = tf.cast(output_features[k].vocabulary.encode_tf(v),
                            tf.int64)
            ret[k] = v
        return ret

    dataset = dataset.map(encode_string_features_fn,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Preprocess the tokens - the exact preprocessors are set via gin.
    dataset = t5_processors.unsupervised(dataset,
                                         sequence_length=sequence_length,
                                         output_features=output_features)

    # Add EOS.
    dataset = add_eos_to_output_features(dataset, training)

    # Truncate and then pad the examples -- all examples have the same shape.
    dataset = truncate_dataset_on_len(dataset, training, sequence_length, True)
    dataset = pad_dataset_to_length(dataset, training, sequence_length)

    return dataset
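For orientation, here is a hedged sketch of how c4_bare_preprocess_fn might be invoked on the TFDS C4 dataset. The split name, the sequence lengths, and the single-example iteration are assumptions for illustration, and the exact unsupervised preprocessors still have to be bound via gin.

import tensorflow_datasets as tfds

raw = tfds.load('c4/en', split='train', shuffle_files=True)
processed = c4_bare_preprocess_fn(
    raw,
    training=True,
    spm_path=None,  # falls back to t5_utils.DEFAULT_SPM_PATH
    copy_plaintext=True,
    sequence_length={'inputs': 512, 'targets': 512})

# Each element now carries tokenized, EOS-terminated 'inputs' and 'targets'
# (and possibly '*_plaintext' copies when copy_plaintext=True).
for example in processed.take(1):
    print(example['inputs'].shape, example['targets'].shape)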
Example #6
def add_tfds_task(name,
                  tfds_name="fake:0.0.0",
                  text_preprocessor=test_text_preprocessor,
                  token_preprocessor=None,
                  splits=None):
    TaskRegistry.add(name,
                     dataset_utils.TfdsTask,
                     tfds_name=tfds_name,
                     text_preprocessor=text_preprocessor,
                     token_preprocessor=token_preprocessor,
                     output_features=dataset_utils.Feature(
                         sentencepiece_vocab()),
                     metric_fns=[],
                     splits=splits)
Example #7
def add_task(name,
             dataset_fn,
             text_preprocessor=test_text_preprocessor,
             token_preprocessor=None,
             splits=("train", "validation"),
             **kwargs):
    if "output_features" not in kwargs:
        kwargs["output_features"] = dataset_utils.Feature(
            sentencepiece_vocab())
    TaskRegistry.add(name,
                     dataset_fn=dataset_fn,
                     splits=splits,
                     text_preprocessor=text_preprocessor,
                     token_preprocessor=token_preprocessor,
                     metric_fns=[],
                     **kwargs)
Example #8
  def test_denoise(self):
    tf.set_random_seed(55)

    vocab = test_utils.sentencepiece_vocab()
    target_tokens = vocab.encode('The quick brown fox.')

    # This is what it encodes to.
    self.assertEqual(
        target_tokens,
        [3, 2, 20, 4, 3, 2, 8, 13, 2, 3, 2, 23, 7, 19, 22, 3, 2, 7, 2])

    og_dataset = tf.data.Dataset.from_tensor_slices({
        'targets': [target_tokens],
    })

    output_features = {
        'targets': utils.Feature(vocab),
    }

    # These are the parameters of denoise in the operative config of 'base'.
    # Except noise_density, bumped up from 0.15 to 0.3 in order to demonstrate
    # multiple corrupted spans.
    denoised_dataset = prep.denoise(
        og_dataset,
        output_features,
        noise_density=0.3,
        noise_mask_fn=prep.random_spans_noise_mask,
        inputs_fn=prep.noise_span_to_unique_sentinel,
        targets_fn=prep.nonnoise_span_to_unique_sentinel)

    # Two spans corrupted, [2] and [22, 3, 2, 7, 2], replaced by unique
    # sentinels 25 and 24 respectively.
    assert_dataset(denoised_dataset, [
        {
            'inputs': [
                3, 25, 20, 4, 3, 2, 8, 13, 2, 3, 2, 23, 7, 19, 24
            ],
            'targets': [
                25, 2, 24, 22, 3, 2, 7, 2
            ],
        },
    ])
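The sentinel bookkeeping in the expected output above can be reproduced with a small pure-Python sketch. This is not the library implementation (noise_span_to_unique_sentinel and nonnoise_span_to_unique_sentinel operate on tensors and derive sentinel ids from the vocabulary size); the starting sentinel id of 25 is an assumption taken from the expected values in the test.

def apply_span_corruption(tokens, noise_mask, first_sentinel=25):
    """Sketch: replace each corrupted span with one descending sentinel id."""
    inputs, targets = [], []
    sentinel = first_sentinel
    prev_noise = False
    for tok, is_noise in zip(tokens, noise_mask):
        if is_noise:
            if not prev_noise:       # first token of a new corrupted span
                inputs.append(sentinel)
                targets.append(sentinel)
                sentinel -= 1
            targets.append(tok)      # corrupted tokens move to the targets
        else:
            inputs.append(tok)       # clean tokens stay in the inputs
        prev_noise = is_noise
    return inputs, targets

# Reproduces the expectation above: spans [2] and [22, 3, 2, 7, 2] are corrupted.
tokens = [3, 2, 20, 4, 3, 2, 8, 13, 2, 3, 2, 23, 7, 19, 22, 3, 2, 7, 2]
mask = [i == 1 or i >= 14 for i in range(len(tokens))]
inputs, targets = apply_span_corruption(tokens, mask)
# inputs  == [3, 25, 20, 4, 3, 2, 8, 13, 2, 3, 2, 23, 7, 19, 24]
# targets == [25, 2, 24, 22, 3, 2, 7, 2]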
Example #9
def generic_text_dataset_preprocess_fn(dataset,
                                       text_preprocess_fn=None,
                                       spm_path=None,
                                       copy_plaintext=False):
    """Applies a text preprocess fn and tokenizes the dataset."""

    # The assumption is that `text_preprocess_fn` finally gives us a dataset
    # which has `inputs` and `targets`.
    if text_preprocess_fn is not None:
        dataset = text_preprocess_fn(dataset)

    # Vocabulary for tokenization.
    vocab = t5_spc_vocab.SentencePieceVocabulary(
        sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
    feature = t5_utils.Feature(vocab)
    output_features = {'targets': feature, 'inputs': feature}

    # Tokenize the inputs and targets.
    dataset = t5_utils.encode_string_features(dataset,
                                              output_features,
                                              keys=output_features,
                                              copy_plaintext=copy_plaintext)

    return dataset
Example #10
def generic_text_dataset_preprocess_fn(dataset,
                                       training=True,
                                       text_preprocess_fns=None,
                                       token_preprocess_fns=None,
                                       spm_path=None,
                                       copy_plaintext=False,
                                       debug_print_examples=False,
                                       debug_print_examples_rate=0.01):
    """Pre-processes, tokenizes and post-processes a `tf.data.Dataset`.

  Args:
    dataset: `tf.data.Dataset` to process.
    training: boolean, set to True if training, False otherwise.
    text_preprocess_fns: None or list of callables: `tf.data.Dataset`, bool ->
      `tf.data.Dataset` this operates before tokenization. Typically used to
      select which fields we want to learn over or change something into
      "text to text" form.
    token_preprocess_fns: None or list of callables: `tf.data.Dataset`, bool ->
      `tf.data.Dataset`, this operates after tokenization. Since this can view
      the tokenized fields, this can be used to filter on length etc.
    spm_path: None or str, path to a SentencePiece model to use for
      tokenization; by default uses the 32k vocabulary from T5.
    copy_plaintext: bool, if True retains the original fields after
      tokenization.
    debug_print_examples: bool, if True this prints examples to the logging
      stream for inspection, both before and after tokenization.
    debug_print_examples_rate: float, [0, 1.0], on average this fraction of
      dataset examples will be printed out in each phase, i.e. pre- and
      post-tokenization.

  Returns:
    a `tf.data.Dataset` with all the preprocessing and tokenization performed.
  """

    # The assumption is that `text_preprocess_fns` finally gives us a dataset
    # which has `inputs` and `targets`.
    if text_preprocess_fns is not None:
        for text_preprocess_fn in text_preprocess_fns:
            dataset = text_preprocess_fn(dataset, training)

    # Print debugging examples if needed before tokenization.
    if debug_print_examples:

        def print_examples(x):
            if np.random.uniform() < debug_print_examples_rate:
                tf.print(x, output_stream=logging.info)
            return x

        dataset = dataset.map(print_examples)

    # Vocabulary for tokenization.
    vocab = t5_spc_vocab.SentencePieceVocabulary(
        sentencepiece_model_file=spm_path or t5_utils.DEFAULT_SPM_PATH)
    feature = t5_utils.Feature(vocab)
    output_features = {'targets': feature, 'inputs': feature}

    # Tokenize the inputs and targets.
    dataset = t5_utils.encode_string_features(dataset,
                                              output_features,
                                              keys=output_features,
                                              copy_plaintext=copy_plaintext)

    # Apply the token-preprocessors.
    if token_preprocess_fns is not None:
        for token_preprocess_fn in token_preprocess_fns:
            dataset = token_preprocess_fn(dataset, training)

    if debug_print_examples:

        def print_examples_and_shapes(x):
            if np.random.uniform() < debug_print_examples_rate:
                tf.print(
                    {
                        'inputs_shape': tf.size(x['inputs']),
                        'targets_shape': tf.size(x['targets']),
                        'inputs': x['inputs'],
                        'targets': x['targets'],
                    },
                    output_stream=logging.info)
            return x

        dataset = dataset.map(print_examples_and_shapes)

    return dataset
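A hedged usage sketch may help place the two hook lists. Everything below is illustrative: raw_dataset and the two preprocess fns are hypothetical stand-ins, chosen only to show that text_preprocess_fns runs on strings before tokenization while token_preprocess_fns sees token ids afterwards.

import tensorflow as tf

raw_dataset = tf.data.Dataset.from_tensor_slices(
    {'question': ['what is C4?'], 'answer': ['a web-text corpus']})

def to_text2text(dataset, training):
    """Hypothetical text preprocessor: pick the fields to learn over."""
    del training
    return dataset.map(
        lambda x: {'inputs': x['question'], 'targets': x['answer']})

def drop_long_targets(dataset, training, max_len=512):
    """Hypothetical token preprocessor: filter on tokenized target length."""
    del training
    return dataset.filter(lambda x: tf.size(x['targets']) <= max_len)

processed = generic_text_dataset_preprocess_fn(
    raw_dataset,
    training=True,
    text_preprocess_fns=[to_text2text],
    token_preprocess_fns=[drop_long_targets],
    copy_plaintext=True,
    debug_print_examples=True)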
Example #11
  def setUp(self):
    super().setUp()
    self.maxDiff = None  # pylint:disable=invalid-name

    # Mock TFDS
    # Note we don't use mock.Mock since they fail to pickle.
    fake_tfds_paths = {
        "train": [
            {  # pylint:disable=g-complex-comprehension
                "filename": "train.tfrecord-%05d-of-00002" % i,
                "skip": 0,
                "take": -1
            }
            for i in range(2)],
        "validation": [
            {
                "filename": "validation.tfrecord-00000-of-00001",
                "skip": 0,
                "take": -1
            }],
    }
    def _load_shard(shard_instruction):
      fname = shard_instruction["filename"]
      if "train" in fname:
        if fname.endswith("00000-of-00002"):
          return get_fake_dataset("train").take(2)
        else:
          return get_fake_dataset("train").skip(2)
      else:
        return get_fake_dataset("validation")

    fake_tfds = FakeLazyTfds(
        name="fake:0.0.0",
        load=get_fake_dataset,
        load_shard=_load_shard,
        info=FakeTfdsInfo(splits={"train": None, "validation": None}),
        files=fake_tfds_paths.get,
        size=lambda x: 30 if x == "train" else 10)
    self._tfds_patcher = mock.patch(
        "t5.data.utils.LazyTfdsLoader", new=mock.Mock(return_value=fake_tfds))
    self._tfds_patcher.start()

    # Set up data directory.
    self.test_tmpdir = self.get_tempdir()
    self.test_data_dir = os.path.join(self.test_tmpdir, "test_data")
    shutil.copytree(TEST_DATA_DIR, self.test_data_dir)
    for root, dirs, _ in os.walk(self.test_data_dir):
      for d in dirs + [""]:
        os.chmod(os.path.join(root, d), 0o777)

    # Register a cached test Task.
    dataset_utils.set_global_cache_dirs([self.test_data_dir])
    clear_tasks()
    add_tfds_task("cached_task")

    # Prepare cached task.
    self.cached_task = TaskRegistry.get("cached_task")
    cached_task_dir = os.path.join(self.test_data_dir, "cached_task")
    _dump_fake_dataset(
        os.path.join(cached_task_dir, "train.tfrecord"),
        _FAKE_TOKENIZED_DATASET["train"], [2, 1], _dump_examples_to_tfrecord)
    _dump_fake_dataset(
        os.path.join(cached_task_dir, "validation.tfrecord"),
        _FAKE_TOKENIZED_DATASET["validation"], [2], _dump_examples_to_tfrecord)

    # Prepare uncached TfdsTask.
    add_tfds_task("uncached_task")
    self.uncached_task = TaskRegistry.get("uncached_task")

    # Prepare uncached TextLineTask.
    _dump_fake_dataset(
        os.path.join(self.test_data_dir, "train.tsv"),
        _FAKE_DATASET["train"], [2, 1], _dump_examples_to_tsv)
    TaskRegistry.add(
        "text_line_task",
        dataset_utils.TextLineTask,
        split_to_filepattern={
            "train": os.path.join(self.test_data_dir, "train.tsv*"),
        },
        skip_header_lines=1,
        text_preprocessor=[_split_tsv_preprocessor, test_text_preprocessor],
        output_features=dataset_utils.Feature(sentencepiece_vocab()),
        metric_fns=[])
    self.text_line_task = TaskRegistry.get("text_line_task")
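The patcher started in this setUp would normally be undone in a matching tearDown; a minimal sketch, assuming the same attribute name as above, is:

  def tearDown(self):
    self._tfds_patcher.stop()
    super().tearDown()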