Example #1
def tokens_to_batches(dataset, sequence_length, batch_size, output_features):
    """Convert a dataset of token sequences to batches of padded/masked examples.

  Args:
    dataset: tf.data.Dataset containing examples with token sequences.
    sequence_length: dict of int, a dict mapping feature name to length.
    batch_size: int, the number of padded sequences in each batch.
    output_features: list of str, features to include in the dataset.

  Returns:
    A generator that produces batches of numpy examples.
  """
    dataset = transformer_dataset.pack_or_pad(
        dataset,
        sequence_length,
        pack=False,
        feature_keys=output_features,
        ensure_eos=True,
    )

    def _map_fn(ex):
        for key in output_features:
            tensor = ex[key]
            mask = tf.cast(tf.greater(tensor, 0), tensor.dtype)
            ex[key + "_mask"] = mask
        return ex

    dataset = dataset.map(
        _map_fn, num_parallel_calls=tf.data.experimental.AUTOTUNE)

    dataset = dataset.batch(batch_size, drop_remainder=False)
    return tfds.as_numpy(dataset)
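A hedged usage sketch for the function above (the task name is borrowed from Example #13; the split, batch size, and lengths are illustrative, and the t5/tensorflow imports are assumed to be in scope):
task = t5.data.get_mixture_or_task("glue_cola_v002")
lengths = {"inputs": 64, "targets": 8}
eval_ds = task.get_dataset(lengths, split="validation")

# Each batch is a dict of numpy arrays shaped [batch, length], plus a
# companion "<key>_mask" array marking the non-padding positions.
for batch in tokens_to_batches(eval_ds, lengths, batch_size=8,
                               output_features=["inputs", "targets"]):
    print(batch["inputs"].shape, batch["inputs_mask"].shape)
    break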
Example #2
    def _get_dataset_for_single_task(task, sequence_length):
        """Get a tensorflow.data.Dataset for the provided task."""
        if shuffle_eval_examples and seed is None:
            logging.warning("shuffle_eval_examples is True but no seed was "
                            "provided. Using a random seed.")

        ds = task.get_dataset(
            sequence_length,
            split=dataset_split,
            use_cached=use_cached,
            shuffle=shuffle_eval_examples,
            seed=seed,
        )
        eos_keys = set(k for k, f in mixture_or_task.output_features.items()
                       if f.add_eos)
        if sequence_length is None:
            logging.info(
                "Skipping packing/padding for '%s' since sequence length is None.",
                task.name)
        else:
            logging.info("%sing '%s' with sequence lengths: %s",
                         "Pack" if pack else "Padd", task.name,
                         sequence_length)
            ds = transformer_dataset.pack_or_pad(ds,
                                                 sequence_length,
                                                 pack=pack,
                                                 feature_keys=tuple(
                                                     task.output_features),
                                                 ensure_eos=eos_keys)

        if num_eval_examples is not None and num_eval_examples >= 0:
            ds = ds.take(num_eval_examples)

        return ds
Example #3
def _get_dataset_for_single_task(task, sequence_length):
    """Get a tensorflow.data.Dataset for the provided task."""
    ds = task.get_dataset(sequence_length,
                          split=dataset_split,
                          use_cached=use_cached,
                          shuffle=False)
    eos_keys = set(k for k, f in mixture_or_task.output_features.items()
                   if f.add_eos)
    if sequence_length is None:
        tf.logging.info(
            "Skipping packing/padding for '%s' since sequence length is None.",
            task.name)
    else:
        tf.logging.info("%sing '%s' with sequence lengths: %s",
                        "Pack" if pack else "Padd", task.name,
                        sequence_length)
        ds = transformer_dataset.pack_or_pad(ds,
                                             sequence_length,
                                             pack=pack,
                                             feature_keys=tuple(
                                                 task.output_features),
                                             ensure_eos=eos_keys)
    ds = maybe_shuffle_and_subsample_dataset(ds, num_eval_examples,
                                             shuffle_eval_examples,
                                             shuffle_buffer_size)
    return ds
Example #4
def mesh_train_dataset_fn(
    mixture_or_task_name,
    sequence_length,
    vocabulary,
    dataset_split=tfds.Split.TRAIN,
    use_cached=False):
  """Returns the tf.data.Dataset for training on a given mixture.

  This uses the format required for utils.run's `train_dataset_fn` argument in
  the Mesh TF transformer standalone.

  Args:
    mixture_or_task_name: string, an identifier for a Mixture or Task in the
      appropriate registry. Must be specified via gin.
    sequence_length: dict mapping feature key to the max int sequence length
      for that feature.
    vocabulary: a SentencePieceVocabulary.
    dataset_split: string, which split of the dataset to load. In most cases
      this should be "train".
    use_cached: bool, whether to load the cached version of this dataset.

  Returns:
    A tf.data.Dataset of preprocessed, tokenized, and batched examples.
  """
  if not isinstance(vocabulary, data.SentencePieceVocabulary):
    raise ValueError("vocabulary must be a SentencePieceVocabulary")

  mixture_or_task = data.get_mixture_or_task(mixture_or_task_name)

  ds = mixture_or_task.get_dataset(
      sequence_length, split=dataset_split, use_cached=use_cached, shuffle=True)
  ds = transformer_dataset.pack_or_pad(
      ds, sequence_length, pack=True,
      feature_keys=tuple(mixture_or_task.output_features), ensure_eos=True)
  return ds
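For context, a sketch of how this variant might be invoked; the SentencePiece model path is hypothetical and the sequence lengths are illustrative:
# Hypothetical model path; any SentencePieceVocabulary satisfies the check.
vocab = data.SentencePieceVocabulary("/path/to/spm.model")
train_ds = mesh_train_dataset_fn(
    "glue_cola_v002",
    sequence_length={"inputs": 64, "targets": 8},
    vocabulary=vocab)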
Example #5
def mesh_train_dataset_fn(mixture_or_task_name,
                          sequence_length,
                          vocabulary=None,
                          dataset_split=tfds.Split.TRAIN,
                          seed=None,
                          use_cached=False,
                          pack=True):
    """Returns the tf.data.Dataset for training on a given mixture.

  This uses the format required for utils.run's `train_dataset_fn` argument in
  the Mesh TF transformer standalone.

  Args:
    mixture_or_task_name: string, an identifier for a Mixture or Task in the
      appropriate registry. Must be specified via gin.
    sequence_length: dict mapping feature key to the int length for that feature
      the max sequence length.
    vocabulary: unused argument, maintains compatibility with other dataset_fns.
    dataset_split: string, which split of the dataset to load. In most cases
      this should be "train".
    seed: tf.int64 scalar tf.Tensor (or None). Used for both the global seed and
      shuffle seed for tf.data
    use_cached: bool, whether to load the cached version of this dataset.
    pack: bool, whether to pack the dataset.

  Returns:
    A tf.data.Dataset of preprocessed, tokenized, and batched examples.
  """
    del vocabulary
    mixture_or_task = t5.data.get_mixture_or_task(mixture_or_task_name)

    ds = mixture_or_task.get_dataset(sequence_length,
                                     split=dataset_split,
                                     use_cached=use_cached,
                                     shuffle=True,
                                     seed=seed)

    # Select just the output features which are present in the dataset.
    feature_keys = tuple(k for k in mixture_or_task.output_features
                         if k in tf.data.get_output_shapes(ds))

    # Feature-key filtering is done inside the pack_or_pad function. However,
    # when packing is turned off, input features aren't filtered, and strings
    # showing up in the input examples break training. Filtering here ensures
    # that we don't rely on pack_or_pad to filter features for training.
    def _filter_features(ex):
        return {k: ex[k] for k in feature_keys}

    ds = ds.map(_filter_features,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

    eos_keys = set(k for k, f in mixture_or_task.output_features.items()
                   if f.add_eos)
    ds = transformer_dataset.pack_or_pad(ds,
                                         sequence_length,
                                         pack=pack,
                                         feature_keys=feature_keys,
                                         ensure_eos=eos_keys)
    return ds
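To see why the explicit filtering matters when pack=False, here is a self-contained toy sketch (the string feature name is made up): unfiltered task datasets can carry string side-features, and the map above drops them before pack_or_pad runs.
import tensorflow as tf

raw = tf.data.Dataset.from_tensors({
    "inputs": tf.constant([5, 6, 1], tf.int64),
    "targets": tf.constant([7, 1], tf.int64),
    "inputs_plaintext": tf.constant("hello"),  # string side-feature
})
feature_keys = ("inputs", "targets")
filtered = raw.map(lambda ex: {k: ex[k] for k in feature_keys})
# Only the integer token features remain; no strings reach training.
print(filtered.element_spec)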
Example #6
def _get_dataset_for_single_task(task):
  """Get a tensorflow.data.Dataset for the provided task."""
  ds = task.get_dataset(
      sequence_length, split=dataset_split,
      use_cached=use_cached, shuffle=False
  )
  ds = transformer_dataset.pack_or_pad(
      ds, sequence_length, pack=False, feature_keys=task.output_features,
      ensure_eos=True)
  if num_eval_examples is not None:
    ds = ds.take(num_eval_examples)
  return ds
Example #7
def mesh_train_dataset_fn(mixture_or_task_name,
                          sequence_length,
                          vocabulary,
                          dataset_split=tfds.Split.TRAIN,
                          seed=None,
                          use_cached=False):
    """Returns the tf.data.Dataset for training on a given mixture.

  This uses the format required for utils.run's `train_dataset_fn` argument in
  the Mesh TF transformer standalone.

  Args:
    mixture_or_task_name: string, an identifier for a Mixture or Task in the
      appropriate registry. Must be specified via gin.
    sequence_length: dict mapping feature key to the int length for that feature
      the max sequence length.
    vocabulary: a t5.data.vocabularies.Vocabulary.
    dataset_split: string, which split of the dataset to load. In most cases
      this should be "train".
    seed: tf.int64 scalar tf.Tensor (or None). Used for both the global seed and
      shuffle seed for tf.data
    use_cached: bool, whether to load the cached version of this dataset.

  Returns:
    A tf.data.Dataset of preprocessed, tokenized, and batched examples.
  """
    valid_vocabulary(vocabulary)

    mixture_or_task = t5.data.get_mixture_or_task(mixture_or_task_name)

    ds = mixture_or_task.get_dataset(sequence_length,
                                     split=dataset_split,
                                     use_cached=use_cached,
                                     shuffle=True,
                                     seed=seed)

    # Select just the output features which are present in the dataset.
    feature_keys = tuple(k for k in mixture_or_task.output_features
                         if k in tf.data.get_output_shapes(ds))
    eos_keys = set(k for k, f in mixture_or_task.output_features.items()
                   if f.add_eos)
    ds = transformer_dataset.pack_or_pad(ds,
                                         sequence_length,
                                         pack=True,
                                         feature_keys=feature_keys,
                                         ensure_eos=eos_keys)
    return ds
Example #8
def tokens_to_batches(dataset,
                      sequence_length,
                      batch_size,
                      output_features,
                      mixture_or_task=None):
  """Convert a dataset of token sequences to batches of padded/masked examples.

  Args:
    dataset: tf.data.Dataset containing examples with token sequences.
    sequence_length: dict of int, a dict mapping feature name to length.
    batch_size: int, the number of padded sequences in each batch.
    output_features: list of str, features to include in the dataset.
    mixture_or_task: a Task or Mixture object, used to correctly specify eos if
      provided. If None, eos is always added at the end of the sequence.

  Returns:
    A generator that produces batches of numpy examples.
  """

  if mixture_or_task:
    eos_keys = set(
        k for k, f in mixture_or_task.output_features.items() if f.add_eos)
  else:
    eos_keys = True

  dataset = transformer_dataset.pack_or_pad(
      dataset,
      sequence_length,
      pack=False,
      feature_keys=output_features,
      ensure_eos=eos_keys,
  )

  def _map_fn(ex):
    for key in output_features:
      tensor = ex[key]
      mask = tf.cast(tf.greater(tensor, 0), tensor.dtype)
      ex[key + "_mask"] = mask
    return ex

  dataset = dataset.map(
      _map_fn,
      num_parallel_calls=tf.data.experimental.AUTOTUNE,
  )

  dataset = dataset.batch(batch_size, drop_remainder=False)
  return tfds.as_numpy(dataset)
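A brief sketch of the mixture_or_task argument in action (task name borrowed from Example #13; split and batch size are illustrative): passing the task limits ensure_eos to features whose add_eos flag is set, rather than forcing EOS on every feature.
task = t5.data.get_mixture_or_task("glue_cola_v002")
lengths = {"inputs": 64, "targets": 8}
batches = tokens_to_batches(
    task.get_dataset(lengths, split="validation"),
    sequence_length=lengths,
    batch_size=8,
    output_features=list(task.output_features),
    mixture_or_task=task)  # EOS ensured only where add_eos=True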
Example #9
def _get_dataset_for_single_task(task):
  """Get a tensorflow.data.Dataset for the provided task."""
  ds = task.get_dataset(
      sequence_length, split=dataset_split,
      use_cached=use_cached, shuffle=False
  )
  eos_keys = set(
      k for k, f in mixture_or_task.output_features.items() if f.add_eos)
  ds = transformer_dataset.pack_or_pad(
      ds,
      sequence_length,
      pack=pack,
      feature_keys=tuple(task.output_features),
      ensure_eos=eos_keys)
  ds = maybe_shuffle_and_subsample_dataset(
      ds, num_eval_examples, shuffle_eval_examples, shuffle_buffer_size)
  return ds
Example #10
def _get_dataset_for_single_task(task):
    """Get a tensorflow.data.Dataset for the provided task."""
    ds = task.get_dataset(sequence_length,
                          split=dataset_split,
                          use_cached=use_cached,
                          shuffle=False)
    if any(not f.add_eos for f in task.output_features.values()):
        warnings.warn(
            "pack_or_pad is being called with ensure_eos=True, but EOS is not "
            "being added to all features.")
    ds = transformer_dataset.pack_or_pad(ds,
                                         sequence_length,
                                         pack=False,
                                         feature_keys=tuple(
                                             task.output_features),
                                         ensure_eos=True)
    if num_eval_examples is not None:
        ds = ds.take(num_eval_examples)
    return ds
Example #11
def mesh_train_dataset_fn(mixture_or_task_name,
                          sequence_length,
                          vocabulary,
                          dataset_split=tfds.Split.TRAIN,
                          use_cached=False):
    """Returns the tf.data.Dataset for training on a given mixture.

  This uses the format required for utils.run's `train_dataset_fn` argument in
  the Mesh TF transformer standalone.

  Args:
    mixture_or_task_name: string, an identifier for a Mixture or Task in the
      appropriate registry. Must be specified via gin.
    sequence_length: dict mapping feature key to the int length for that feature
      the max sequence length.
    vocabulary: a t5.data.vocabularies.Vocabulary.
    dataset_split: string, which split of the dataset to load. In most cases
      this should be "train".
    use_cached: bool, whether to load the cached version of this dataset.

  Returns:
    A tf.data.Dataset of preprocessed, tokenized, and batched examples.
  """
    valid_vocabulary(vocabulary)

    mixture_or_task = t5.data.get_mixture_or_task(mixture_or_task_name)

    ds = mixture_or_task.get_dataset(sequence_length,
                                     split=dataset_split,
                                     use_cached=use_cached,
                                     shuffle=True)
    if any(not f.add_eos for f in mixture_or_task.output_features.values()):
        warnings.warn(
            "pack_or_pad is being called with ensure_eos=True, but EOS is not "
            "being added to all features.")
    ds = transformer_dataset.pack_or_pad(ds,
                                         sequence_length,
                                         pack=True,
                                         feature_keys=tuple(
                                             mixture_or_task.output_features),
                                         ensure_eos=True)
    return ds
Example #12
    def _get_dataset_for_single_task(task, sequence_length):
        """Get a tensorflow.data.Dataset for the provided task."""

        ds = task.get_dataset(sequence_length,
                              split=dataset_split,
                              use_cached=use_cached,
                              shuffle=shuffle,
                              seed=seed)
        if "inputs" not in ds.element_spec:
            if not priming_sequence_length or priming_sequence_length <= 0:
                logging.warning(
                    "Priming sequence length not specified so priming "
                    "with the empty string.")
                ds = ds.map(_prepare_for_unprimed_inference)
            else:
                logging.info(
                    "Using the first %d tokens of each target as input.",
                    priming_sequence_length)
                ds = ds.map(_split_targets_for_primed_inference)
        elif priming_sequence_length is not None:
            raise ValueError(
                "Setting a priming sequence length only makes sense for decoder-only "
                "Tasks, which have `targets` but no `inputs`.")

        eos_keys = set(k for k, f in mixture_or_task.output_features.items()
                       if f.add_eos)

        logging.info("Padding '%s' with sequence lengths: %s", task.name,
                     sequence_length)
        ds = transformer_dataset.pack_or_pad(ds,
                                             sequence_length,
                                             pack=False,
                                             feature_keys=tuple(
                                                 task.output_features),
                                             ensure_eos=eos_keys)

        if num_inference_examples is not None and num_inference_examples >= 0:
            ds = ds.take(num_inference_examples)

        return ds
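The two priming helpers referenced above are not included in this snippet. As a rough sketch only, here is what _prepare_for_unprimed_inference could plausibly do, judging from the log message (the body is an assumption, not the original implementation):
def _prepare_for_unprimed_inference(ex):
    # Assumed behavior: prime decoding with an empty "inputs" sequence so
    # the model generates targets unconditionally.
    ex["inputs"] = tf.zeros([0], dtype=ex["targets"].dtype)
    return ex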
Example #13
#!/usr/bin/env python3
import t5
import tensorflow as tf
import mesh_tensorflow.transformer.dataset as transformer_dataset
import itertools

tf.enable_eager_execution()
task = t5.data.get_mixture_or_task("glue_cola_v002")
ds = task.get_dataset({"inputs": 64, "targets": 8}, "train")

ds = transformer_dataset.pack_or_pad(
    ds,
    {
        "inputs": 64,
        "targets": 8
    },
    pack=False,
    feature_keys=tuple(task.output_features),
    ensure_eos=True,
)


def add_attention_masks(ds, feature_keys):
    def _map_fn(ex):
        for key in feature_keys:
            tensor = ex[key]
            mask = tf.cast(tf.greater(tensor, 0), tensor.dtype)
            ex[key + "_mask"] = mask
        return ex

    return ds.map(_map_fn,
                  num_parallel_calls=tf.data.experimental.AUTOTUNE)
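The script is cut off at this point; a minimal continuation sketch, assuming the helper is applied to the padded dataset and one masked example is inspected eagerly:
ds = add_attention_masks(ds, tuple(task.output_features))

# Eager execution is enabled above, so the dataset is directly iterable.
for ex in itertools.islice(iter(ds), 1):
    print(ex["inputs"].numpy())
    print(ex["inputs_mask"].numpy())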