Code Example #1
File: datasets.py  Project: yynst2/deepmind-research
def _repeat_batch(batch_sizes: Sequence[int],
                  ds: tf.data.Dataset,
                  repeat: int = 1) -> tf.data.Dataset:
    """Tiles the inner most batch dimension."""
    if repeat <= 1:
        return ds
    if batch_sizes[-1] % repeat != 0:
        raise ValueError(
            f'The last element of `batch_sizes` ({batch_sizes}) must '
            f'be divisible by `repeat` ({repeat}).')
    # Perform regular batching with reduced number of elements.
    for i, batch_size in enumerate(reversed(batch_sizes)):
        ds = ds.batch(batch_size // repeat if i == 0 else batch_size,
                      drop_remainder=True)
    # Repeat batch.
    fn = lambda x: tf.repeat(x, repeats=repeat, axis=len(batch_sizes) - 1)

    def repeat_inner_batch(example):
        return jax.tree_map(fn, example)

    ds = ds.map(repeat_inner_batch, num_parallel_calls=tf.data.AUTOTUNE)
    # Unbatch.
    for _ in batch_sizes:
        ds = ds.unbatch()
    return ds
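A minimal usage sketch (not part of the original file), assuming `import tensorflow as tf`, `import jax`, and the `_repeat_batch` definition above:

# Hypothetical usage: tile the innermost batch dimension by a factor of 2.
ds = tf.data.Dataset.range(4)                        # 0, 1, 2, 3
ds = _repeat_batch(batch_sizes=[4], ds=ds, repeat=2)
print(list(ds.as_numpy_iterator()))                  # [0, 0, 1, 1, 2, 2, 3, 3]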
Code Example #2
def _pack_with_custom_ops(
        dataset: tf.data.Dataset,
        feature_lengths: Mapping[str, int]) -> tf.data.Dataset:
    """Helper-function for packing a dataset which has already been batched.

  See trim_and_pack_dataset()

  Relies on custom ops which require a custom compiled binary.
  Faster than _pack_with_tf_ops(), and denser packing.

  Args:
    dataset: a dataset containing padded batches of examples.
    feature_lengths: mapping from feature key to packed length.

  Returns:
    a dataset.
  """
    # TODO(adarob): Move ops into this library and fix int64 issue.
    from tensor2tensor.data_generators.ops import pack_sequences_ops  # pylint: disable=g-import-not-at-top
    keys = list(feature_lengths)
    if len(keys) == 1:
        k1, = keys
        k2 = k1
    elif len(keys) == 2:
        k1, k2 = keys
    else:
        raise ValueError(f"Packing op requires 1 or 2 keys. Got {len(keys)}")

    def custom_pack_batch(x):
        """Map-function."""
        (k1_packed, k1_segment_ids, k1_positions, k2_packed, k2_segment_ids,
         k2_positions) = (
             pack_sequences_ops.pack_sequences2(
                 # cast to int64 for compatibility with custom ops
                 tf.cast(x[k1], tf.int64),
                 tf.cast(x[k2], tf.int64),
                 feature_lengths[k1],
                 feature_lengths[k2]))
        packed = {
            k1: k1_packed,
            k1 + "_segment_ids": k1_segment_ids,
            k1 + "_positions": k1_positions,
        }
        if len(keys) == 2:
            packed.update({
                k2: k2_packed,
                k2 + "_segment_ids": k2_segment_ids,
                k2 + "_positions": k2_positions,
            })

        # cast back to int32
        for k, v in packed.items():
            packed[k] = tf.cast(v, tf.int32)

        return packed

    dataset = dataset.map(custom_pack_batch,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    dataset = dataset.unbatch()
    return dataset
Code Example #3
    def _convert_features(
            self, ds: tf.data.Dataset,
            task_feature_lengths: Mapping[str, int]) -> tf.data.Dataset:
        """Convert the input dataset to an output dataset to be fed to the model.

    The "inputs" and "targets" are concatenated to form the new targets. In
    addition, the binary mask to distinguish "inputs" and "targets" token are
    concatenated as well.

    We define inputs_width to be the width (i.e., the number of tokens) of
    "inputs" in the concatenated sequence. This method computes the width both
    with and without an additional position. Both of these are needed by
    `_convert_example`.

    Args:
      ds: an input tf.data.Dataset to be converted.
      task_feature_lengths: a mapping from task feature name to its length.

    Returns:
      ds: the converted dataset.
    """
        def concat_and_add_masks(features):
            inputs = features["inputs"]
            targets = features["targets"]

            # Width of the "inputs" portion in the concatenated sequence.
            width = tf.size(inputs)
            inputs_width = tf.fill([tf.size(inputs) + tf.size(targets)], width)

            # Width with an extra position to the right in the inputs mask. See
            # docstring for details.
            inputs_width_add_pos = tf.fill(
                [tf.size(inputs) + tf.size(targets)], width + 1)

            return {
                "targets": tf.concat([inputs, targets], axis=-1),
                "inputs_width": inputs_width,
                "inputs_width_add_pos": inputs_width_add_pos
            }

        ds = ds.map(concat_and_add_masks,
                    num_parallel_calls=tf.data.experimental.AUTOTUNE)

        concat_length = sum(task_feature_lengths.values())
        concat_task_feature_lengths = {
            "targets": concat_length,
            "inputs_width": concat_length,
            "inputs_width_add_pos": concat_length
        }

        ds = self._pack_or_pad(ds, concat_task_feature_lengths)
        return ds.map(self._convert_example,
                      num_parallel_calls=tf.data.experimental.AUTOTUNE)
Code Example #4
 def maybe_cache(self, task: Task,
                 data: tf.data.Dataset) -> tf.data.Dataset:
     if self.cache_dir == 'MEMORY':
         return data.cache()
     elif self.cache_dir:
         cache_file = os.path.join(
             self.cache_dir, self.config.model_type,
             '%s.%s.cache' % (task.dataset, task.split))
         # The model_type component may introduce intermediate directories that need to be created.
         os.makedirs(os.path.join(cache_file, os.pardir), exist_ok=True)
         logging.debug('Caching tokenized data for %s to %s', task,
                       cache_file)
         return data.cache(cache_file)
     else:
         return data
Code Example #5
def append_eos(
    dataset: tf.data.Dataset,
    output_features: OutputFeaturesType,
) -> tf.data.Dataset:
    """Appends EOS to output feature token sequences with `add_eos` set to True.

  Respects the `add_eos` field of the seqio.Features in `output_features`.

  Args:
    dataset: a tf.data.Dataset of tokenized examples to preprocess.
    output_features: a mapping of output feature names to Feature objects.

  Returns:
    a tf.data.Dataset of tokenized examples with EOS added to specified output
    features.
  """
    def _maybe_add_eos(key: str, value: tf.Tensor) -> tf.Tensor:
        if key not in output_features or not output_features[key].add_eos:
            return value
        else:
            eos_id = output_features[key].vocabulary.eos_id
            return tf.concat([value, [eos_id]], axis=0)

    return dataset.map(
        lambda ex: {k: _maybe_add_eos(k, v)
                    for k, v in ex.items()},
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
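A minimal usage sketch (not part of the original source): `_StubVocab` and `_StubFeature` are hypothetical stand-ins for seqio-style Feature objects that only expose `add_eos` and `vocabulary.eos_id`.

# Hypothetical stand-ins; a real seqio Feature/Vocabulary would be used instead.
class _StubVocab:
    eos_id = 1

class _StubFeature:
    def __init__(self, add_eos):
        self.add_eos = add_eos
        self.vocabulary = _StubVocab()

ds = tf.data.Dataset.from_tensors({"targets": tf.constant([4, 5, 6])})
ds = append_eos(ds, {"targets": _StubFeature(add_eos=True)})
print(next(iter(ds.as_numpy_iterator()))["targets"])  # [4 5 6 1]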
Code Example #6
  def _validate_dataset(self,
                        dataset: tf.data.Dataset,
                        expected_output_type: tf.DType,
                        expected_output_rank: int,
                        error_label: str,
                        ensure_no_eos: bool = False) -> tf.data.Dataset:
    """Validates properties of a tf.data.Dataset, raising Exceptions if needed.

    Args:
      dataset: a tf.data.Dataset to validate.
      expected_output_type: a tf.Dtype, the expected type of the model features.
      expected_output_rank: an int, the expected rank of the model features.
      error_label: a string, an identifier for the previous processing step to
        report in raised ValueErrors.
      ensure_no_eos: a bool, whether or not to verify that the model features
        contain no EOS tokens.

    Returns:
      a validated tf.data.Dataset.
    """
    element_spec = dataset.element_spec
    for feat in self.output_features:
      if feat not in element_spec:
        if self.output_features[feat].required:
          raise ValueError(
              "Task dataset is missing expected output feature after {label}: "
              "{feat}".format(label=error_label, feat=feat))
        else:
          # It's ok that this feature does not exist.
          continue
      if expected_output_type != element_spec[feat].dtype:
        raise ValueError(
            "Task dataset has incorrect type for feature '{feat}' after "
            "{label}: Got {actual}, expected {expected}".format(
                feat=feat, label=error_label,
                actual=element_spec[feat].dtype.name,
                expected=expected_output_type.name))
      if expected_output_rank != len(element_spec[feat].shape):
        raise ValueError(
            "Task dataset has incorrect rank for feature '{feat}' after "
            "{label}: Got {actual}, expected {expected}".format(
                feat=feat, label=error_label,
                actual=len(element_spec[feat].shape),
                expected=expected_output_rank))

    def _ensure_no_eos(feat, v):
      if feat not in self.output_features:
        return v
      with tf.control_dependencies([
          tf.debugging.assert_none_equal(
              v, tf.constant(1, tf.int64),
              message="Feature '{feat}' unexpectedly contains EOS=1 token "
              "after {label}.".format(feat=feat, label=error_label))
      ]):
        return v
    if ensure_no_eos:
      dataset = dataset.map(
          lambda ex: {k: _ensure_no_eos(k, v) for k, v in ex.items()},
          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    return dataset
Code Example #7
 def _batch(self, split: Split,
            dataset: tf.data.Dataset,
            drop_remainder: bool = True) -> tf.data.Dataset:
   """Get the batched version of `dataset`."""
   # `uneven_datasets` is a list of datasets with a number of validation and/or
   # test examples that is not evenly divisible by commonly used batch sizes.
   uneven_datasets = ['criteo', 'svhn']
   if self._is_training(split):
     batch_size = self.batch_size
   elif split == Split.VAL:
     batch_size = self.eval_batch_size
     if (self._num_validation_examples % batch_size != 0 and
         self.name not in uneven_datasets):
       logging.warn(
           'Batch size does not evenly divide the number of validation '
           'examples, cannot ensure static shapes on TPU. Batch size: %d, '
           'validation examples: %d',
           batch_size,
           self._num_validation_examples)
   else:
     batch_size = self.eval_batch_size
     if (self._num_test_examples % batch_size != 0 and
         self.name not in uneven_datasets):
       logging.warn(
           'Batch size does not evenly divide the number of test examples, '
           'cannot ensure static shapes on TPU. Batch size: %d, test '
           'examples: %d', batch_size, self._num_test_examples)
   # Note that we always drop the last batch when the batch size does not
   # evenly divide the number of examples.
   return dataset.batch(batch_size, drop_remainder=drop_remainder)
Code Example #8
def trim_and_pad_dataset(
        dataset: tf.data.Dataset,
        feature_lengths: Mapping[str, int]) -> tf.data.Dataset:
    """Trim and pad first dimension of features to `feature_lengths`.

  Args:
    dataset: tf.data.Dataset, the dataset to trim/pad examples in.
    feature_lengths: map from feature key to final length. Other features will
      be returned unchanged.
  Returns:
    Trimmed/padded tf.data.Dataset.
  """
    def _trim_and_pad(k: str, t: tf.Tensor) -> tf.Tensor:
        """Trim/pad to the first axis of `t` to be of size `length`."""
        if k not in feature_lengths:
            return t
        length_k = feature_lengths[k]
        t = t[:length_k]
        pad_amt = length_k - tf.shape(t)[0]
        padded_t = tf.pad(t, [(0, pad_amt)] + [(0, 0)] * (len(t.shape) - 1))
        padded_t.set_shape([length_k] + t.shape.as_list()[1:])
        return padded_t

    return dataset.map(
        lambda x: {k: _trim_and_pad(k, t)
                   for k, t in x.items()},
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
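A minimal usage sketch (not part of the original source), assuming `import tensorflow as tf` and the `trim_and_pad_dataset` definition above:

# Hypothetical usage: "inputs" is padded up to 5, "targets" is trimmed to 2;
# features missing from `feature_lengths` would pass through unchanged.
ds = tf.data.Dataset.from_tensors({"inputs": tf.constant([7, 8, 5]),
                                   "targets": tf.constant([3, 9, 1, 2])})
ds = trim_and_pad_dataset(ds, {"inputs": 5, "targets": 2})
ex = next(iter(ds.as_numpy_iterator()))
print(ex["inputs"], ex["targets"])  # [7 8 5 0 0] [3 9]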
Code Example #9
 def _convert_features(
         self, ds: tf.data.Dataset,
         task_feature_lengths: Mapping[str, int]) -> tf.data.Dataset:
     """Convert the dataset to be fed to a language model."""
     ds = self._pack_or_pad(ds, task_feature_lengths)
     return ds.map(self._convert_example,
                   num_parallel_calls=tf.data.experimental.AUTOTUNE)
Code Example #10
def tokenize(dataset: tf.data.Dataset,
             output_features: OutputFeaturesType,
             copy_pretokenized: bool = True,
             with_eos: bool = False) -> tf.data.Dataset:
    """Encode output features with specified vocbularies.

  Passes through other features unchanged. Optionally passes through copy
  of original features with "_pretokenized" suffix added to the key.

  Args:
    dataset: a tf.data.Dataset of examples to tokenize.
    output_features: a dict of Feature objects; their vocabulary attribute will
      be used to tokenize the specified features.
    copy_pretokenized: bool, whether to pass through copies of original features
      with "_pretokenized" suffix added to the key.
    with_eos: bool, whether to append EOS to the end of the sequence.

  Returns:
    a tf.data.Dataset
  """
    def _tokenize(features):
        ret = {}
        for k, v in features.items():
            if k in output_features:
                if copy_pretokenized:
                    ret[f'{k}_pretokenized'] = v
                vocab = output_features[k].vocabulary
                v = vocab.encode_tf(v)
                if with_eos and output_features[k].add_eos:
                    v = tf.concat([v, [vocab.eos_id]], axis=-1)
            ret[k] = v
        return ret

    return dataset.map(_tokenize,
                       num_parallel_calls=tf.data.experimental.AUTOTUNE)
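A minimal usage sketch (not part of the original source): `_ByteVocab` and `_StubFeature` are hypothetical stand-ins for a seqio vocabulary/Feature; the only method `tokenize` relies on here is `encode_tf`.

# Hypothetical toy vocabulary: one token per UTF-8 byte.
class _ByteVocab:
    eos_id = 1

    def encode_tf(self, s):
        return tf.cast(tf.io.decode_raw(s, tf.uint8), tf.int32)

class _StubFeature:
    def __init__(self, vocabulary, add_eos=True):
        self.vocabulary = vocabulary
        self.add_eos = add_eos

ds = tf.data.Dataset.from_tensors({"inputs": tf.constant("hi")})
ds = tokenize(ds, {"inputs": _StubFeature(_ByteVocab())})
print(next(iter(ds.as_numpy_iterator())))
# {'inputs_pretokenized': b'hi', 'inputs': array([104, 105], dtype=int32)}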
Code Example #11
def _check_lengths(ds: tf.data.Dataset, expected_lengths: Mapping[str, int],
                   strict: bool, error_label: str) -> tf.data.Dataset:
  """Check the length of each feature in `ds` against `expected_lengths`.

  There are two checking criteria controlled by `strict` arg.

  If strict = True,
  for each feature in ds, check len(feature) == expected_lengths[feature].

  If strict = False,
  for each feature in ds, check len(feature) <= expected_lengths[feature].

  Features of the input dataset may have [None] shape. The assertion is run at
  the graph execution time when the length is determined.

  Args:
    ds: a tf.data.Dataset to be checked.
    expected_lengths: a mapping from a feature name to an expected length.
    strict: if true, the length of each feature should exactly match the
      expected length whereas false condition allows the length to be less
      than or equal to the expected length.
    error_label: a label used to indicate the validation stage.

  Returns:
    ds: the same dataset as the input but with the assertion ops attached.
  """

  def _check_length(feat, v):
    if feat not in expected_lengths:
      return v

    if strict:
      error_message = (
          f"Feature '{feat}' has length not equal to the expected length of "
          f"{expected_lengths[feat]} during {error_label} validation")
      assertion_op = functools.partial(
          tf.debugging.assert_equal, message=error_message)
    else:
      error_message = (
          f"Feature '{feat}' has length not less than or equal to the expected "
          f"length of {expected_lengths[feat]} during {error_label} validation")
      assertion_op = functools.partial(
          tf.debugging.assert_less_equal, message=error_message)

    expected_length = tf.constant(expected_lengths[feat], dtype=tf.int64)
    # Assumes that v has rank of 1.
    actual_length = tf.size(v, out_type=tf.int64)
    assertion_op(actual_length, expected_length)
    return v

  ds = ds.map(
      lambda ex: {k: _check_length(k, v) for k, v in ex.items()},
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

  return ds
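A minimal usage sketch (not part of the original source), assuming `import functools`, `import tensorflow as tf`, and the `_check_lengths` definition above:

# Hypothetical usage: with strict=False the check passes because 2 <= 3.
# A feature longer than its expected length would instead raise
# tf.errors.InvalidArgumentError when the dataset is iterated.
ds = tf.data.Dataset.from_tensors({"targets": tf.constant([3, 9])})
ds = _check_lengths(ds, {"targets": 3}, strict=False, error_label="packing")
print(list(ds.as_numpy_iterator()))  # [{'targets': array([3, 9], dtype=int32)}]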
Code Example #12
File: input_generator.py  Project: tensorflow/lingvo
 def _pad_to_even_length(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
   p = self.params
   n = p.num_infeed_hosts
   if n <= 1:
     return dataset
   # Pad with all-padding batches so that the total number of batches in
   # `dataset` is evenly divisible by n.
   if p.eval_data_size < 1:
     # dataset.cardinality() returns unknown, so we first materialize all
     # data.
     total_batches = len(list(dataset.as_numpy_iterator()))
   else:
     total_batches = (p.eval_data_size + p.batch_size - 1) // p.batch_size
   if total_batches % n == 0:
     return dataset
   per_host_batches = (total_batches + n - 1) // n
   num_pad_batches = per_host_batches * n - total_batches
   pad_batches = tf.data.Dataset.from_tensors(
       self._all_paddings_batch()).repeat(num_pad_batches)
   return dataset.concatenate(pad_batches)
Code Example #13
def _rename_plaintext_to_pretokenized(
    dataset: tf.data.Dataset) -> tf.data.Dataset:
  """Rename cached _plaintext features to new _pretokenized standard."""
  def _rename(inputs):
    outputs = {}
    for k, v in inputs.items():
      if k.endswith("_plaintext"):
        k = k[:-len("plaintext")] + "pretokenized"
      outputs[k] = v
    return outputs
  return dataset.map(
      _rename, num_parallel_calls=tf.data.experimental.AUTOTUNE)
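A minimal usage sketch (not part of the original source), assuming `import tensorflow as tf` and the `_rename_plaintext_to_pretokenized` definition above:

# Hypothetical usage: the cached "_plaintext" key is renamed; other keys pass
# through untouched.
ds = tf.data.Dataset.from_tensors({"inputs_plaintext": tf.constant("hello"),
                                   "inputs": tf.constant([7, 8])})
ds = _rename_plaintext_to_pretokenized(ds)
print(list(ds.element_spec))  # ['inputs_pretokenized', 'inputs']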
Code Example #14
def get_batches(dataset: tf.data.Dataset,
                batch_size: int = 64) -> tf.data.Dataset:
    """Returns a Dataset that consists of padded batches when iterated over."""
    return dataset.padded_batch(batch_size,
                                padded_shapes={
                                    'idx': [],
                                    'sentence': [-1],
                                    'label': [1],
                                    'length': []
                                },
                                drop_remainder=False).prefetch(
                                    tf.data.experimental.AUTOTUNE)
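A minimal usage sketch (not part of the original source): the toy generator below is a hypothetical stand-in for the SST-2-style examples this helper expects ('idx', 'sentence', 'label', 'length').

# Hypothetical variable-length examples fed through a generator.
def _examples():
    yield {'idx': 0, 'sentence': [3, 4], 'label': [1], 'length': 2}
    yield {'idx': 1, 'sentence': [5, 6, 7], 'label': [0], 'length': 3}

ds = tf.data.Dataset.from_generator(
    _examples,
    output_signature={
        'idx': tf.TensorSpec([], tf.int32),
        'sentence': tf.TensorSpec([None], tf.int32),
        'label': tf.TensorSpec([1], tf.int32),
        'length': tf.TensorSpec([], tf.int32),
    })
batch = next(iter(get_batches(ds, batch_size=2).as_numpy_iterator()))
print(batch['sentence'])  # [[3 4 0]
                          #  [5 6 7]]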
Code Example #15
def show_examples(
    ds: tf.data.Dataset,
    ds_info: dataset_info.DatasetInfo,
    **options_kwargs: Any
):
  """Visualize images (and labels) from an image classification dataset.

  This function is for interactive use (Colab, Jupyter). It displays and returns
  a plot of (rows*columns) images from a tf.data.Dataset.

  Usage:
  ```python
  ds, ds_info = tfds.load('cifar10', split='train', with_info=True)
  fig = tfds.show_examples(ds, ds_info)
  ```

  Args:
    ds: `tf.data.Dataset`. The tf.data.Dataset object to visualize. Examples
      should not be batched. Examples will be consumed in order until
      (rows * cols) are read or the dataset is consumed.
    ds_info: The dataset info object from which to extract the label and features
      info. Available either through `tfds.load('mnist', with_info=True)` or
      `tfds.builder('mnist').info`
    **options_kwargs: Additional display options, specific to the dataset type
      to visualize. Are forwarded to `tfds.visualization.Visualizer.show`.
      See the `tfds.visualization` for a list of available visualizers.

  Returns:
    fig: The `matplotlib.Figure` object
  """
  if not isinstance(ds_info, dataset_info.DatasetInfo):  # Arguments inverted
    # `absl.logging` does not appear on Colab by default, so print is used instead.
    print('WARNING: For consistency with `tfds.load`, the `tfds.show_examples` '
          'signature has been modified from (info, ds) to (ds, info).\n'
          'The old signature is deprecated and will be removed. '
          'Please change your call to `tfds.show_examples(ds, info)`')
    ds, ds_info = ds_info, ds

  # Pack `as_supervised=True` datasets
  if (
      ds_info.supervised_keys
      and isinstance(ds.element_spec, tuple)
      and len(ds.element_spec) == 2
  ):
    x_key, y_key = ds_info.supervised_keys
    ds = ds.map(lambda x, y: {x_key: x, y_key: y})

  for visualizer in _ALL_VISUALIZERS:
    if visualizer.match(ds_info):
      return visualizer.show(ds, ds_info, **options_kwargs)
  raise ValueError(
      'Visualisation not supported for dataset `{}`'.format(ds_info.name)
  )
Code Example #16
File: dataset_info.py  Project: yamine15/datasets
def pack_as_supervised_ds(
    ds: tf.data.Dataset,
    ds_info: DatasetInfo,
) -> tf.data.Dataset:
    """Pack `(input, label)` dataset as `{'key0': input, 'key1': label}`."""
    if (ds_info.supervised_keys and isinstance(ds.element_spec, tuple)
            and len(ds.element_spec) == 2):
        x_key, y_key = ds_info.supervised_keys
        ds = ds.map(lambda x, y: {x_key: x, y_key: y})
        return ds
    else:  # If dataset isn't a supervised tuple (input, label), return as-is
        return ds
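A minimal usage sketch (not part of the original source): `_fake_info` is a hypothetical stand-in for a tfds `DatasetInfo`; only `supervised_keys` is consulted.

import types

# Hypothetical stand-in for DatasetInfo.
_fake_info = types.SimpleNamespace(supervised_keys=("image", "label"))

ds = tf.data.Dataset.from_tensors((tf.zeros([2, 2]), tf.constant(3)))
ds = pack_as_supervised_ds(ds, _fake_info)
print(ds.element_spec)
# {'image': TensorSpec(shape=(2, 2), dtype=tf.float32, name=None),
#  'label': TensorSpec(shape=(), dtype=tf.int32, name=None)}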
Code Example #17
    def _convert_features(
            self, ds: tf.data.Dataset,
            task_feature_lengths: Mapping[str, int]) -> tf.data.Dataset:
        """Convert the dataset to be fed to the encoder-decoder model.

    The conversion process involves two steps:

    1. Each feature in the `task_feature_lengths` is trimmed/padded and
       optionally packed depending on the value of self.pack.
    2. "inputs" fields are mapped to the encoder input and "targets" are mapped
       to decoder input (after being shifted) and target.

    All the keys in the `task_feature_lengths` should be present in the input
    dataset, which may contain some extra features that are not in the
    `task_feature_lengths`. They will not be included in the output dataset.
    One common scenario is the "inputs_pretokenized" and "targets_pretokenized"
    fields.

    Args:
      ds: an input tf.data.Dataset to be converted.
      task_feature_lengths: a mapping from feature to its length.

    Returns:
      ds: the converted dataset.
    """
        def convert_example(
                features: Mapping[str, tf.Tensor]) -> Mapping[str, tf.Tensor]:
            # targets_segment_id is present only for a packed dataset.
            decoder_input_tokens = autoregressive_inputs(
                features["targets"],
                sequence_id=features.get("targets_segment_ids", None))

            d = {
                "encoder_input_tokens": features["inputs"],
                "decoder_target_tokens": features["targets"],
                "decoder_input_tokens": decoder_input_tokens,
                # Loss is computed for all but the padding positions.
                "decoder_loss_weights":
                non_padding_position(features["targets"])
            }

            if self.pack:
                d["encoder_segment_ids"] = features["inputs_segment_ids"]
                d["decoder_segment_ids"] = features["targets_segment_ids"]
                d["encoder_positions"] = features["inputs_positions"]
                d["decoder_positions"] = features["targets_positions"]

            return d

        ds = self._pack_or_pad(ds, task_feature_lengths)
        return ds.map(convert_example,
                      num_parallel_calls=tf.data.experimental.AUTOTUNE)
Code Example #18
def get_shuffled_batches(dataset: tf.data.Dataset,
                         seed: int = 0,
                         batch_size: int = 64) -> tf.data.Dataset:
    """Returns a Dataset that consists of padded batches when iterated over.

  This shuffles the examples randomly each epoch. The random order is
  deterministic and controlled by the seed.

  Batches are padded because sentences have different lengths.
  Sentences that are shorter in a batch will get 0s added at the end, until
  all sentences in the batch have the same length.

  Args:
    dataset: A TF Dataset with examples to be shuffled and batched.
    seed: The seed that determines the shuffling order, with a different order
      each epoch.
    batch_size: The size of each batch. The remainder is dropped.

  Returns:
    A TF Dataset containing padded batches.
  """
    # For shuffling we need to know how many training examples we have.
    num_examples = dataset.reduce(np.int64(0), lambda x, _: x + 1).numpy()

    # `padded_shapes` says what kind of shapes to expect: [] means a scalar, [-1]
    # means a vector of variable length, and [1] means a vector of size 1.
    return dataset.shuffle(num_examples,
                           seed=seed,
                           reshuffle_each_iteration=True).padded_batch(
                               batch_size,
                               padded_shapes={
                                   'idx': [],
                                   'sentence': [-1],
                                   'label': [1],
                                   'length': []
                               },
                               drop_remainder=True).prefetch(
                                   tf.data.experimental.AUTOTUNE)
Code Example #19
  def _trim_output_features(
      self,
      dataset: tf.data.Dataset,
      sequence_length: Optional[Mapping[str, int]]
    ) -> tf.data.Dataset:
    """Trim output features to sequence length."""
    def _trim(k: str, v: tf.Tensor) -> tf.Tensor:
      if k not in self.output_features or not sequence_length:
        return v
      return v[:sequence_length[k]]

    return dataset.map(
        lambda ex: {k: _trim(k, v) for k, v in ex.items()},
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
Code Example #20
def write_predictions(model: LoudnessPredictor, data: tf.data.Dataset,
                      batch_size: int, save_directory: str, save_file: str):
    optimizer = tf.keras.optimizers.Adam(learning_rate=0.1)
    model.compile(optimizer=optimizer,
                  loss=multi_output_loss,
                  metrics=[MSEMetric(), AccuracyMetric()])
    logging.info('Compiled model')
    predictions = []
    total_error = 0
    total_baseline_error = 0
    num_predictions = 0
    with open(os.path.join(save_directory, save_file), "w") as infile:
        infile.write(
            "frequencies;spls;probefrequency;probelevel;probeloudness;targetmasking;predictedmasking\n"
        )
        for i, example in data.enumerate(start=0):
            input_example, target_phons, rest = example
            predicted_phons = model.predict(input_example)
            error = ((target_phons / 10) - (predicted_phons / 10))**2
            baseline_error = ((target_phons / 10) - (0 / 10))**2
            total_error += error[0][0]
            total_baseline_error += baseline_error[0][0]
            num_predictions += 1
            logging.info(
                'Prediction %d, actual masking %.4f, predicted masking %.4f',
                i,
                target_phons.numpy()[0][0], predicted_phons[0][0])
            infile.write(",".join(
                [str(f) for f in rest["frequencies"].numpy()[0]]))
            infile.write(";")
            infile.write(",".join([str(l) for l in rest["spls"].numpy()[0]]))
            infile.write(";")
            infile.write(str(rest["probefrequency"].numpy()[0][0]))
            infile.write(";")
            infile.write(str(rest["probelevel"].numpy()[0][0]))
            infile.write(";")
            infile.write(str(rest["probeloudness"].numpy()[0][0]))
            infile.write(";")
            infile.write(str(target_phons.numpy()[0][0]))
            infile.write(";")
            infile.write(str(predicted_phons[0][0]))
            infile.write("\n")
        infile.write("\n")
        infile.write("Baseline MSE: " +
                     str(total_baseline_error / num_predictions))
        infile.write("\n")
        infile.write("MSE: " + str(total_error / num_predictions))
    return
Code Example #21
  def _trim_and_ensure_eos(
      self,
      dataset: tf.data.Dataset,
      sequence_length: Mapping[str, int]
    ) -> tf.data.Dataset:
    """Trim and append EOS=1 token to model features."""
    def _trim_and_append_eos(feat, v):
      if feat not in self.output_features:
        return v
      if sequence_length and self.output_features[feat].add_eos:
        v = tf.concat([v[:sequence_length[feat]-1], [1]], axis=0)
      elif sequence_length:
        v = v[:sequence_length[feat]]
      elif self.output_features[feat].add_eos:
        v = tf.concat([v, [1]], axis=0)
      return v

    return dataset.map(
        lambda ex: {k: _trim_and_append_eos(k, v) for k, v in ex.items()},
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
Code Example #22
def append_eos_after_trim(
    dataset: tf.data.Dataset,
    sequence_length: Optional[SequenceLengthType],
    output_features: OutputFeaturesType,
) -> tf.data.Dataset:
    """Trims output feature token sequences and then appends EOS.

  Respects the `add_eos` field of the seqio.Features in `output_features`.
  Truncates features before adding the EOS to ensure they fit in the max length
  specified by `sequence_length` once the EOS is added. If `sequence_length` is
  None, no trimming is performed.

  Note that sequences are automatically trimmed at the end of the Task pipeline,
  so unless you want the features to always end in EOS, use `append_eos`
  instead.

  Args:
    dataset: a tf.data.Dataset of tokenized examples to preprocess.
    sequence_length: a mapping from output feature names to max lengths.
      If provided, output feature sequences will be trimmed to ensure they are
      not longer than this length once EOS is added.
    output_features: a mapping of output feature names to Feature objects.

  Returns:
    a tf.data.Dataset of tokenized examples with EOS added to specified output
    features.
  """
    def _maybe_add_eos_and_trim(key: str, value: tf.Tensor) -> tf.Tensor:
        if key not in output_features or not output_features[key].add_eos:
            return value
        eos_id = output_features[key].vocabulary.eos_id
        if sequence_length is not None:
            max_length = sequence_length[key]
            return tf.concat([value[:max_length - 1], [eos_id]], axis=0)
        else:
            return tf.concat([value, [eos_id]], axis=0)

    return dataset.map(
        lambda ex: {k: _maybe_add_eos_and_trim(k, v)
                    for k, v in ex.items()},
        num_parallel_calls=tf.data.experimental.AUTOTUNE)
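A minimal usage sketch (not part of the original source), again using hypothetical Feature/Vocabulary stand-ins; it shows the trim-then-append behaviour.

# Hypothetical stand-ins for a seqio Feature/Vocabulary.
class _StubVocab:
    eos_id = 1

class _StubFeature:
    add_eos = True
    vocabulary = _StubVocab()

ds = tf.data.Dataset.from_tensors({"targets": tf.constant([4, 5, 6, 7])})
ds = append_eos_after_trim(ds, {"targets": 3}, {"targets": _StubFeature()})
# Trimmed to 2 tokens so that EOS fits within the length-3 budget.
print(next(iter(ds.as_numpy_iterator()))["targets"])  # [4 5 1]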
Code Example #23
def _pack_with_tf_ops(dataset: tf.data.Dataset,
                      feature_lengths: Mapping[str, int]) -> tf.data.Dataset:
    """Helper-function for packing a dataset which has already been batched.

  See trim_and_pack_dataset()

  Uses tf.while_loop. Slow.

  Args:
    dataset: a dataset containing padded batches of examples.
    feature_lengths: mapping from feature key to packed length.

  Returns:
    a dataset.
  """
    empty_example = {}
    for k in feature_lengths:
        for suff in ("", "_positions"):
            empty_example[k + suff] = tf.zeros([0], dtype=tf.int32)
            empty_example[k + suff].set_shape([None])
    keys_etc = empty_example.keys()

    def _write_packed_example(partial, outputs):
        new_partial = empty_example.copy()
        new_outputs = {}
        for k in keys_etc:
            new_outputs[k] = outputs[k].write(
                outputs[k].size(),
                tf.pad(partial[k], [[
                    0, feature_lengths[_strip_packed_feature_key(k)] -
                    tf.size(partial[k])
                ]]))
        return new_partial, new_outputs

    def pack_batch(x: Mapping[str, tf.Tensor]) -> Mapping[str, tf.Tensor]:
        """Internal function to map over.

    Consumes a batch of input examples and produces a variable number of output
    examples.

    Args:
      x: a single example
    Returns:
      a tf.data.Dataset
    """
        keys = list(feature_lengths)
        partial = empty_example.copy()
        first_key, *_ = keys
        dynamic_batch_size = tf.shape(x[first_key])[0]
        outputs = {}
        for k in keys:
            outputs[k] = tf.TensorArray(tf.int32,
                                        size=0,
                                        dynamic_size=True,
                                        element_shape=[feature_lengths[k]])
            outputs[k + "_positions"] = tf.TensorArray(
                tf.int32,
                size=0,
                dynamic_size=True,
                element_shape=[feature_lengths[k]])

        for i in tf.range(0, dynamic_batch_size):
            tf.autograph.experimental.set_loop_options(shape_invariants=[(
                partial, {k: tf.TensorShape([None])
                          for k in keys_etc}
            ), (outputs, {k: tf.TensorShape(None)
                          for k in keys_etc})])

            can_append = True
            one_example = {}
            for k in keys:
                val = tf.cast(x[k][i], tf.int32)
                val = val[:tf.
                          reduce_sum(tf.cast(tf.not_equal(val, 0), tf.int32))]
                one_example[k] = val
            for k in keys:
                can_append = tf.logical_and(
                    can_append,
                    tf.less_equal(
                        tf.size(partial[k]) + tf.size(one_example[k]),
                        feature_lengths[k]))

            if not can_append:
                partial, outputs = _write_packed_example(partial, outputs)

            new_partial = {}
            for k in keys:
                new_seq = one_example[k][:feature_lengths[k]]
                new_seq_len = tf.size(new_seq)
                new_partial[k] = tf.concat([partial[k], new_seq], 0)
                new_partial[k + "_positions"] = tf.concat([
                    partial[k + "_positions"],
                    tf.range(new_seq_len, dtype=tf.int32)
                ], 0)
            partial = new_partial

        partial, outputs = _write_packed_example(partial, outputs)
        packed = {k: outputs[k].stack() for k in keys_etc}
        for k in keys:
            packed[k + "_segment_ids"] = (tf.cumsum(
                tf.cast(tf.equal(packed[k + "_positions"], 0), tf.int32),
                axis=1) * tf.cast(tf.not_equal(packed[k], 0), tf.int32))
        return packed

    dataset = dataset.map(pack_batch,
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)
    return dataset.unbatch()
Code Example #24
def trim_and_pack_dataset(dataset: tf.data.Dataset,
                          feature_lengths: Mapping[str, int],
                          use_custom_ops: bool = False) -> tf.data.Dataset:
    """Creates a 'packed' version of a dataset on-the-fly.

  Modified from the tensor2tensor library.

  This is meant to replace the irritation of having to create a separate
  "packed" version of a dataset to train efficiently on TPU.

  Each example in the output dataset represents several examples in the
  input dataset.

  For each key in the input dataset that also exists in `feature_lengths`, two
  additional keys are created:
    <key>_segment_ids: an int32 tensor identifying the parts
       representing the original example.
    <key>_positions: an int32 tensor identifying the position within the
       original example.

  Features that are not in `feature_lengths` will be removed.

  Example:
    Two input examples get combined to form an output example.
    The input examples are:
    {"inputs": [8, 7, 1, 0], "targets":[4, 1, 0], "idx": 0}
    {"inputs": [2, 3, 4, 1], "targets":[5, 6, 1], "idx": 1}
    The output example is:
    {
                   "inputs": [8, 7, 1, 2, 3, 4, 1, 0, 0, 0]
       "inputs_segment_ids": [1, 1, 1, 2, 2, 2, 2, 0, 0, 0]
         "inputs_positions": [0, 1, 2, 0, 1, 2, 3, 0, 0, 0]
                  "targets": [4, 1, 5, 6, 1, 0, 0, 0, 0, 0]
      "targets_segment_ids": [1, 1, 2, 2, 2, 0, 0, 0, 0, 0]
        "targets_positions": [0, 1, 0, 1, 2, 0, 0, 0, 0, 0]
    }

    0 represents padding in both the inputs and the outputs.

    Sequences in the incoming examples are truncated to the lengths in
    `feature_lengths`, and the sequences in the output examples all have this
    fixed (padded) length. Features not in `feature_lengths` (i.e., "idx") are
    removed.

  Args:
    dataset: a tf.data.Dataset
    feature_lengths: map from feature key to final length. Other features will
      be discarded.
    use_custom_ops: a boolean - custom ops are faster but require a custom-built
      binary, which is not currently possible on cloud-tpu.

  Returns:
    a tf.data.Dataset
  """
    element_spec = dataset.element_spec
    # Make sure that the dataset contains all keys in `feature_lengths`.
    for k in feature_lengths:
        if k not in element_spec:
            raise ValueError(
                f"Feature '{k}' not found in dataset. Available keys are "
                f"{list(element_spec.keys())}")
        if not element_spec[k].shape.is_compatible_with(tf.TensorShape([None
                                                                        ])):
            raise ValueError(
                f"Features to be packed must be one-dimensional. '{k}' is not.'"
            )
    # Warn if there are any additional keys that will be removed.
    additional_keys = set(element_spec) - set(feature_lengths)
    if additional_keys:
        logging.warn(
            "Features not in `features_length` will be removed during packing: %s",
            additional_keys)

    ds = dataset.map(
        lambda x: {k: x[k][:l]
                   for k, l in feature_lengths.items()},
        num_parallel_calls=tf.data.experimental.AUTOTUNE)

    # Setting batch_size=length ensures that the concatenated sequences (if they
    # have length >=1) are sufficient to fill at least one packed example.
    batch_size = max(feature_lengths.values())
    ds = ds.padded_batch(batch_size,
                         padded_shapes={k: [-1]
                                        for k in feature_lengths})

    if use_custom_ops and len(feature_lengths) <= 2:
        ds = _pack_with_custom_ops(ds, feature_lengths)
    else:
        ds = _pack_with_tf_ops(ds, feature_lengths)

    # Set the Tensor shapes correctly since they get lost in the process.
    def _set_shape(x):
        for k, v in x.items():
            v.set_shape([feature_lengths[_strip_packed_feature_key(k)]])
        return x

    return ds.map(_set_shape, num_parallel_calls=tf.data.experimental.AUTOTUNE)
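A minimal usage sketch (not part of the original source), assuming `import tensorflow as tf` plus the helpers shown in this section (`trim_and_pack_dataset`, `_pack_with_tf_ops`, `_strip_packed_feature_key`); it reproduces the worked example from the docstring above.

# Hypothetical usage: two short examples are packed into one length-10 example.
ds = tf.data.Dataset.from_tensor_slices({
    "inputs": [[8, 7, 1, 0], [2, 3, 4, 1]],
    "targets": [[4, 1, 0], [5, 6, 1]],
})
packed = trim_and_pack_dataset(ds, {"inputs": 10, "targets": 10})
ex = next(iter(packed.as_numpy_iterator()))
print(ex["inputs"])   # [8 7 1 2 3 4 1 0 0 0]
print(ex["targets"])  # [4 1 5 6 1 0 0 0 0 0]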
Code Example #25
File: dataset_factory.py  Project: ztpxy1997/models
    def pipeline(
            self,
            dataset: tf.data.Dataset,
            input_context: tf.distribute.InputContext = None
    ) -> tf.data.Dataset:
        """Build a pipeline fetching, shuffling, and preprocessing the dataset.

    Args:
      dataset: A `tf.data.Dataset` that loads raw files.
      input_context: An optional context provided by `tf.distribute` for
        cross-replica training. This isn't necessary if using Keras
        compile/fit.

    Returns:
      A TensorFlow dataset outputting batched images and labels.
    """
        if input_context and input_context.num_input_pipelines > 1:
            dataset = dataset.shard(input_context.num_input_pipelines,
                                    input_context.input_pipeline_id)

        if self.is_training and not self.config.cache:
            dataset = dataset.repeat()

        if self.config.builder == 'records':
            # Read the data from disk in parallel
            buffer_size = 8 * 1024 * 1024  # Use 8 MiB per file
            dataset = dataset.interleave(
                lambda name: tf.data.TFRecordDataset(name,
                                                     buffer_size=buffer_size),
                cycle_length=16,
                num_parallel_calls=tf.data.experimental.AUTOTUNE)

        dataset = dataset.prefetch(self.global_batch_size)

        if self.config.cache:
            dataset = dataset.cache()

        if self.is_training:
            dataset = dataset.shuffle(self.config.shuffle_buffer_size)
            dataset = dataset.repeat()

        # Parse, pre-process, and batch the data in parallel
        if self.config.builder == 'records':
            preprocess = self.parse_record
        else:
            preprocess = self.preprocess
        dataset = dataset.map(preprocess,
                              num_parallel_calls=tf.data.experimental.AUTOTUNE)

        dataset = dataset.batch(self.batch_size,
                                drop_remainder=self.is_training)

        # Note: we could do image normalization here, but we defer it to the model
        # which can perform it much faster on a GPU/TPU
        # TODO(dankondratyuk): if we fix prefetching, we can do it here

        if self.is_training and self.config.deterministic_train is not None:
            options = tf.data.Options()
            options.experimental_deterministic = self.config.deterministic_train
            options.experimental_slack = self.config.use_slack
            options.experimental_optimization.parallel_batch = True
            options.experimental_optimization.map_fusion = True
            options.experimental_optimization.map_vectorization.enabled = True
            options.experimental_optimization.map_parallelization = True
            dataset = dataset.with_options(options)

        # Prefetch overlaps in-feed with training
        # Note: autotune here is not recommended, as this can lead to memory leaks.
        # Instead, use a constant prefetch size like the number of devices.
        dataset = dataset.prefetch(self.config.num_devices)

        return dataset
Code Example #26
def _pack_with_tf_ops(dataset: tf.data.Dataset, keys: List[str],
                      key2length: Dict[str, int]) -> tf.data.Dataset:
    """Helper-function for packing a dataset which has already been batched.

  Helper for pack_dataset(). Uses tf.while_loop.

  Args:
    dataset: a dataset containing padded batches of examples.
    keys: a list of strings
    key2length: a dict from feature-key to integer

  Returns:
    a dataset.
  """
    empty_example = {}
    for k in keys:
        empty_example[k] = tf.zeros([0], dtype=tf.int32)
        empty_example[k + '_position'] = tf.zeros([0], dtype=tf.int32)
    keys_etc = empty_example.keys()

    def write_packed_example(partial, outputs):
        new_partial = empty_example.copy()
        new_outputs = {}
        for k in keys_etc:
            new_outputs[k] = outputs[k].write(
                outputs[k].size(),
                tf.pad(partial[k], [[0, key2length[k] - tf.size(partial[k])]]))
        return new_partial, new_outputs

    def map_fn(x):
        """Internal function to flat_map over.

    Consumes a batch of input examples and produces a variable number of output
    examples.
    Args:
      x: a single example

    Returns:
      a tf.data.Dataset
    """
        partial = empty_example.copy()
        i = tf.zeros([], dtype=tf.int32)
        dynamic_batch_size = tf.shape(x[keys[0]])[0]
        outputs = {}
        for k in keys:
            outputs[k] = tf.TensorArray(tf.int32,
                                        size=0,
                                        dynamic_size=True,
                                        element_shape=[key2length[k]])
            outputs[k + '_position'] = tf.TensorArray(
                tf.int32,
                size=0,
                dynamic_size=True,
                element_shape=[key2length[k]])

        def body_fn(i, partial, outputs):
            """Body function for while_loop.

      Args:
        i: integer scalar
        partial: dictionary of Tensor (partially-constructed example)
        outputs: dictionary of TensorArray

      Returns:
        A triple containing the new values of the inputs.
      """
            can_append = True
            one_example = {}
            for k in keys:
                val = tf.cast(x[k][i], tf.int32)
                val = val[:tf.
                          reduce_sum(tf.cast(tf.not_equal(val, 0), tf.int32))]
                one_example[k] = val
            for k in keys:
                can_append = tf.logical_and(
                    can_append,
                    tf.less_equal(
                        tf.size(partial[k]) + tf.size(one_example[k]),
                        key2length[k]))

            def false_fn():
                return write_packed_example(partial, outputs)

            def true_fn():
                return partial, outputs

            partial, outputs = tf.cond(can_append, true_fn, false_fn)
            new_partial = {}
            for k in keys:
                new_seq = one_example[k][:key2length[k]]
                new_seq_len = tf.size(new_seq)
                new_partial[k] = tf.concat([partial[k], new_seq], 0)
                new_partial[k + '_position'] = tf.concat(
                    [partial[k + '_position'],
                     tf.range(new_seq_len)], 0)
            partial = new_partial
            return i + 1, partial, outputs

        # For loop over all examples in the batch.
        i, partial, outputs = tf.while_loop(
            cond=lambda *_: True,
            body=body_fn,
            loop_vars=(i, partial, outputs),
            shape_invariants=(
                tf.TensorShape([]),
                {k: tf.TensorShape([None])
                 for k in keys_etc},
                {k: tf.TensorShape(None)
                 for k in keys_etc},
            ),
            maximum_iterations=dynamic_batch_size)
        _, outputs = write_packed_example(partial, outputs)
        packed = {k: outputs[k].stack() for k in keys_etc}
        for k in keys:
            packed[k + '_segmentation'] = (tf.cumsum(
                tf.cast(tf.equal(packed[k + '_position'], 0), tf.int32),
                axis=1) * tf.cast(tf.not_equal(packed[k], 0), tf.int32))
        return packed

    dataset = dataset.map(map_fn, num_parallel_calls=AUTOTUNE)
    return dataset.unbatch()
Code Example #27
def pack_dataset(dataset: tf.data.Dataset,
                 key2length: Union[int, Dict[str, int]],
                 keys: Optional[List[str]] = None) -> tf.data.Dataset:
    """Creates a 'packed' version of a dataset on-the-fly.

  Adapted from the mesh-tf implementation.

  This is meant to replace the irritation of having to create a separate
  "packed" version of a dataset to train efficiently on TPU.
  Each example in the output dataset represents several examples in the
  input dataset.
  For each key in the input dataset, two additional keys are created:
  <key>_segmentation: an int32 tensor identifying the parts
     representing the original example.
  <key>_position: an int32 tensor identifying the position within the original
     example.
  Example:
  Two input examples get combined to form an output example.
  The input examples are:
  {"inputs": [8, 7, 1, 0], "targets":[4, 1, 0]}
  {"inputs": [2, 3, 4, 1], "targets":[5, 6, 1]}
  The output example is:
  {
                 "inputs": [8, 7, 1, 2, 3, 4, 1, 0, 0, 0]
    "inputs_segmentation": [1, 1, 1, 2, 2, 2, 2, 0, 0, 0]
        "inputs_position": [0, 1, 2, 0, 1, 2, 3, 0, 0, 0]
                "targets": [4, 1, 5, 6, 1, 0, 0, 0, 0, 0]
   "targets_segmentation": [1, 1, 2, 2, 2, 0, 0, 0, 0, 0]
       "targets_position": [0, 1, 0, 1, 2, 0, 0, 0, 0, 0]
  }
  0 represents padding in both the inputs and the outputs.
  Sequences in the incoming examples are truncated to length "length", and the
  sequences in the output examples all have fixed (padded) length "length".

  Args:
    dataset: a tf.data.Dataset
    key2length: an integer, or a dict from feature-key to integer
    keys: a list of strings (e.g. ["inputs", "targets"])

  Returns:
    a tf.data.Dataset
  """
    shapes = tf.nest.map_structure(lambda spec: spec.shape,
                                   dataset.element_spec)
    if keys is None:
        keys = list(shapes.keys())
    for k in keys:
        if k not in shapes:
            raise ValueError(
                'Key %s not found in dataset.  Available keys are %s' %
                (k, shapes.keys()))
        if not shapes[k].is_compatible_with(tf.TensorShape([None])):
            raise ValueError('Tensors to be packed must be one-dimensional.')
    # make sure that the length dictionary contains all keys as well as the
    # keys suffixed by "_segmentation" and "_position"
    if isinstance(key2length, int):
        key2length = {k: key2length for k in keys}
    for k in keys:
        for suffix in ['_segmentation', '_position']:
            key2length[k + suffix] = key2length[k]

    # trim to length
    dataset = dataset.map(lambda x: {k: x[k][:key2length[k]]
                                     for k in keys},
                          num_parallel_calls=AUTOTUNE)
    # Setting batch_size=length ensures that the concatenated sequences (if they
    # have length >=1) are sufficient to fill at least one packed example.
    batch_size = max(key2length.values())
    dataset = dataset.padded_batch(batch_size,
                                   padded_shapes={k: [-1]
                                                  for k in keys})
    dataset = _pack_with_tf_ops(dataset, keys, key2length)

    # Set the Tensor shapes correctly since they get lost in the process.
    def my_fn(x):
        return {k: tf.reshape(v, [key2length[k]]) for k, v in x.items()}

    return dataset.map(my_fn, num_parallel_calls=AUTOTUNE)
Code Example #28
 def _cast_output_features(self, dataset: tf.data.Dataset) -> tf.data.Dataset:
   """Cast output features to the specified dtypes, leaving others as-is."""
   dtypes = {k: f.dtype for k, f in self.output_features.items()}
   return dataset.map(
       lambda x: {k: tf.cast(v, dtypes.get(k, v.dtype)) for k, v in x.items()},
       num_parallel_calls=tf.data.experimental.AUTOTUNE)