Example 1
def broadcast_sample_weight_modes(target_structure, sample_weight_modes):
  """Match sample_weight_modes structure with output structure."""
  if target_structure is None or not nest.flatten(target_structure):
    return sample_weight_modes

  if isinstance(sample_weight_modes, str):
    if isinstance(target_structure, dict):
      return {key: sample_weight_modes for key in target_structure.keys()}
    return [sample_weight_modes for _ in target_structure]

  if sample_weight_modes:
    try:
      nest.assert_same_structure(
          training_utils.list_to_tuple(target_structure),
          training_utils.list_to_tuple(sample_weight_modes))
    except (ValueError, TypeError):
      target_str = str(nest.map_structure(lambda _: "...", target_structure))
      mode_str = str(nest.map_structure(lambda _: "...", sample_weight_modes))

      # Attempt to coerce sample_weight_modes to the target structure. This
      # implicitly depends on the fact that Model flattens outputs for its
      # internal representation.
      try:
        sample_weight_modes = nest.pack_sequence_as(
            target_structure, nest.flatten(sample_weight_modes))
        logging.warning(
            "sample_weight modes were coerced from\n  {}\n    to  \n  {}"
            .format(mode_str, target_str))
      except (ValueError, TypeError):
        raise ValueError(
            "Unable to match target structure and sample_weight_modes "
            "structure:\n  {}\n    to  \n  {}".format(target_str, mode_str))

  return sample_weight_modes
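For reference, the string-broadcast branch above can be sketched in isolation. The two-output structure and its key names below are made up purely for illustration; they are not part of the original code.

# Minimal sketch (hypothetical structure) of broadcasting a single string mode.
target_structure = {"main_out": None, "aux_out": None}  # stand-in for model outputs
mode = "temporal"

if isinstance(target_structure, dict):
    broadcast = {key: mode for key in target_structure}
else:
    broadcast = [mode for _ in target_structure]

print(broadcast)  # {'main_out': 'temporal', 'aux_out': 'temporal'}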
Example 2
def _process_numpy_inputs(inputs):
    """Process numpy array inputs.

  For numpy inputs, it is possible to be single numpy array, or list/dict of
  them. They could also be preprocessed by other lib to match with the order
  of position for the model. The result here should be something that can be
  used to build dataset.

  Args:
    inputs: single or list/tuple/dict of numpy array.
  Returns:
    numpy arrays can be used to build dataset.
  """
    if is_none_or_empty(inputs):
        return None
    flat_inputs = nest.flatten(inputs)
    if len(flat_inputs) == 1:
        return flat_inputs[0]

    def _convert_non_tensor(x):
        # Don't call `ops.convert_to_tensor` on all `inputs` because
        # `SparseTensors` can't be converted to `Tensor`.
        if isinstance(x, np.ndarray):
            return ops.convert_to_tensor(x)
        return x

    inputs = nest.map_structure(_convert_non_tensor, inputs)
    # For more complicated structures, we only convert the outermost list to a
    # tuple, since the dataset will stack a list but treat elements of a tuple
    # as individual elements.
    return training_utils.list_to_tuple(inputs)
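The outermost list-to-tuple conversion matters because tf.data treats the two containers differently: from_tensor_slices stacks a list into a single tensor but keeps tuple entries as separate components. A minimal sketch against the public tf.data API (array shapes chosen only for illustration):

import numpy as np
import tensorflow as tf

a = np.zeros((4, 2), dtype=np.float32)
b = np.ones((4, 3), dtype=np.float32)

# A list is stacked into one tensor, so each element is a full (4, 2) array.
list_ds = tf.data.Dataset.from_tensor_slices([a, a])
print(list_ds.element_spec)   # TensorSpec(shape=(4, 2), ...)

# A tuple keeps each array as its own component, sliced along the first axis.
tuple_ds = tf.data.Dataset.from_tensor_slices((a, b))
print(tuple_ds.element_spec)  # (TensorSpec(shape=(2,), ...), TensorSpec(shape=(3,), ...))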
Example 3
    def __init__(self,
                 x,
                 y=None,
                 sample_weights=None,
                 batch_size=None,
                 steps=None,
                 shuffle=False,
                 **kwargs):
        super(CompositeTensorDataAdapter, self).__init__(x, y, **kwargs)
        x = _process_numpy_inputs(x)
        y = _process_numpy_inputs(y)
        sample_weights = _process_numpy_inputs(sample_weights)

        # If sample_weights are not specified for an output, use 1.0 as weights.
        if (sample_weights is not None
                and any(sw is None for sw in sample_weights)):
            weight = next(s for s in sample_weights if s is not None)
            sample_weights = training_utils.list_to_tuple([
                array_ops.ones((weight.shape[0], )) if sw is None else sw
                for sw in sample_weights
            ])

        if y is not None and sample_weights is not None:
            inputs = (x, y, sample_weights)
        elif y is not None:
            # Sample weight is only needed for training, so if y is None, then
            # sample_weight is ignored.
            inputs = (x, y)
        else:
            inputs = (x, )

        dataset = dataset_ops.DatasetV2.from_tensor_slices(inputs)
        num_samples = int(nest.flatten(x)[0].shape[0])
        if shuffle:
            dataset = dataset.shuffle(num_samples)

        # If batch_size is not passed but steps is, calculate from the input data.
        if steps and not batch_size:
            batch_size = int(math.ceil(num_samples / steps))

        if not batch_size:
            raise ValueError(
                "`batch_size` or `steps` is required for `Tensor` or `NumPy`"
                " input data.")

        dataset = dataset.batch(batch_size)
        self._size = int(math.ceil(num_samples / batch_size))
        self._batch_size = batch_size
        self._has_partial_batch = (self._size != (num_samples // batch_size))

        self._partial_batch_size = None
        if self._has_partial_batch:
            self._partial_batch_size = (num_samples -
                                        (self._size - 1) * self._batch_size)

        self._dataset = dataset
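The batch and partial-batch bookkeeping above is plain arithmetic; a small worked sketch with made-up numbers (10 samples, 4 steps):

import math

num_samples, steps = 10, 4                                    # hypothetical figures
batch_size = int(math.ceil(num_samples / steps))              # 3
size = int(math.ceil(num_samples / batch_size))               # 4 batches per epoch
has_partial_batch = size != num_samples // batch_size         # True
partial_batch_size = num_samples - (size - 1) * batch_size    # 1 sample in the last batch

print(batch_size, size, has_partial_batch, partial_batch_size)  # 3 4 True 1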
Example 4
def _process_numpy_inputs(inputs):
    """Process numpy array inputs.

  For numpy inputs, it is possible to be single numpy array, or list/dict of
  them. They could also be preprocessed by other lib to match with the order
  of position for the model. The result here should be something that can be
  used to build dataset.

  Args:
    inputs: single or list/tuple/dict of numpy array.
  Returns:
    numpy arrays can be used to build dataset.
  """
    if is_none_or_empty(inputs):
        return None
    flat_inputs = nest.flatten(inputs)
    if len(flat_inputs) == 1:
        return flat_inputs[0]
    # For more complicated structures, we only convert the outermost list to a
    # tuple, since the dataset will stack a list but treat elements of a tuple
    # as individual elements.
    return training_utils.list_to_tuple(inputs)
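The early return above relies on nest.flatten collapsing any container down to its leaves, so a single array comes back unchanged even when wrapped in a dict or list. A minimal sketch using the public tf.nest API (the "input" key is illustrative):

import numpy as np
import tensorflow as tf

arr = np.zeros((4, 2))
print(len(tf.nest.flatten(arr)))             # 1 -> the bare array is returned as-is
print(len(tf.nest.flatten({"input": arr})))  # 1 -> still the single-array fast path
print(len(tf.nest.flatten([arr, arr])))      # 2 -> falls through to list_to_tuple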
Example 5
def handle_partial_sample_weights(outputs, sample_weights,
                                  sample_weight_modes):
    """Adds 1.0 as sample weights for the outputs for which there is no weight.

  Args:
    outputs: List of model outputs.
    sample_weights: List of sample weight inputs.
    sample_weight_modes: List of sample weight modes or None.

  Returns:
    Tuple of sample weights, one sample weight for every output.
  """
    new_sample_weights = []
    for i, sw in enumerate(sample_weights):
        if sw is None:
            output_shape = outputs[i].shape
            is_temporal = (sample_weight_modes is not None
                           and sample_weight_modes[i] == "temporal")
            sw_shape = (output_shape[0], output_shape[1]) if is_temporal else (
                output_shape[0], )
            new_sample_weights.append(array_ops.ones(sw_shape))
        else:
            new_sample_weights.append(sw)
    return training_utils.list_to_tuple(new_sample_weights)
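As a standalone illustration of the filling behavior, the sketch below uses a hypothetical two-output model where only the first output has a weight; the second output uses "temporal" mode, so its placeholder weight is 2-D:

import numpy as np
import tensorflow as tf

outputs = [np.zeros((8, 5)), np.zeros((8, 3))]     # hypothetical model outputs
sample_weights = [np.full(8, 0.5), None]           # no weight given for the second output
sample_weight_modes = [None, "temporal"]

filled = []
for out, sw, mode in zip(outputs, sample_weights, sample_weight_modes):
    if sw is None:
        shape = out.shape[:2] if mode == "temporal" else out.shape[:1]
        sw = tf.ones(shape)
    filled.append(sw)

print([tuple(w.shape) for w in filled])  # [(8,), (8, 3)]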
Example 6
    def __init__(self,
                 x,
                 y=None,
                 sample_weights=None,
                 batch_size=None,
                 epochs=1,
                 steps=None,
                 shuffle=False,
                 **kwargs):
        super(TensorLikeDataAdapter, self).__init__(x, y, **kwargs)
        x = _process_numpy_inputs(x)
        y = _process_numpy_inputs(y)
        sample_weights = _process_numpy_inputs(sample_weights)

        # If sample_weights are not specified for an output, use 1.0 as weights.
        if sample_weights is not None and any(w is None
                                              for w in sample_weights):
            weight = next(s for s in sample_weights if s is not None)
            sample_weights = training_utils.list_to_tuple([
                array_ops.ones((weight.shape[0], )) if sw is None else sw
                for sw in sample_weights
            ])

        if y is not None and sample_weights is not None:
            inputs = (x, y, sample_weights)
        elif y is not None:
            # Sample weight is only needed for training, so if y is None, then
            # sample_weight is ignored.
            inputs = (x, y)
        else:
            inputs = (x, )

        num_samples = set(int(i.shape[0]) for i in nest.flatten(inputs))
        if len(num_samples) > 1:
            msg = "Data cardinality is ambiguous:\n"
            for label, data in zip(["x", "y", "sample_weight"], inputs):
                msg += "  {} sizes: {}\n".format(
                    label,
                    ", ".join([str(i.shape[0]) for i in nest.flatten(data)]))
            msg += "Please provide data which shares the same first dimension."
            raise ValueError(msg)
        num_samples = num_samples.pop()

        # If batch_size is not passed but steps is, calculate from the input data.
        if steps and not batch_size:
            batch_size = int(math.ceil(num_samples / steps))

        if not batch_size:
            raise ValueError(
                "`batch_size` or `steps` is required for `Tensor` or `NumPy`"
                " input data.")

        self._size = int(math.ceil(num_samples / batch_size))
        self._batch_size = batch_size

        num_full_batches = int(num_samples // batch_size)
        self._partial_batch_size = num_samples % batch_size

        # Vectorized version of shuffle.
        # This is a performance improvement over using `from_tensor_slices`.
        # The indices of the data are shuffled and batched, and these indices
        # are then zipped with the data and used to extract a batch of the data
        # at each step. The performance improvements here come from:
        # 1. vectorized batch using gather
        # 2. parallelized map
        # 3. pipelined permutation generation
        # 4. optimized permutation batching
        # 5. disabled static optimizations

        indices_dataset = dataset_ops.DatasetV2.range(1).repeat()

        def permutation(_):
            # It turns out to be more performant to make a new set of indices rather
            # than reusing the same range Tensor. (presumably because of buffer
            # forwarding.)
            indices = math_ops.range(num_samples, dtype=dtypes.int64)
            if shuffle:
                indices = random_ops.random_shuffle(indices)
            return indices

        # We prefetch a single element. Computing large permutations can take quite
        # a while so we don't want to wait for prefetching over an epoch boundary to
        # trigger the next permutation. On the other hand, too many simultaneous
        # shuffles can contend on a hardware level and degrade all performance.
        indices_dataset = indices_dataset.map(permutation).prefetch(1)

        def slice_batch_indices(indices):
            """Convert a Tensor of indices into a dataset of batched indices.

      This step can be accomplished in several ways. The most natural is to
      slice the Tensor in a Dataset map. (With a condition on the upper index to
      handle the partial batch.) However it turns out that coercing the Tensor
      into a shape which is divisible by the batch size (and handling the last
      partial batch separately) allows for a much more favorable memory access
      pattern and improved performance.

      Args:
        indices: Tensor which determines the data order for an entire epoch.

      Returns:
        A Dataset of batched indices.
      """
            num_in_full_batch = num_full_batches * batch_size
            first_k_indices = array_ops.slice(indices, [0],
                                              [num_in_full_batch])
            first_k_indices = array_ops.reshape(first_k_indices,
                                                [num_full_batches, batch_size])

            flat_dataset = dataset_ops.DatasetV2.from_tensor_slices(
                first_k_indices)
            if self._partial_batch_size:
                index_remainder = dataset_ops.DatasetV2.from_tensors(
                    array_ops.slice(indices, [num_in_full_batch],
                                    [self._partial_batch_size]))
                flat_dataset = flat_dataset.concatenate(index_remainder)
            return flat_dataset

        indices_dataset = indices_dataset.flat_map(slice_batch_indices)
        dataset = dataset_ops.DatasetV2.zip(
            (indices_dataset,
             dataset_ops.DatasetV2.from_tensors(inputs).repeat()))

        def grab_batch(i, data):
            return nest.map_structure(lambda d: array_ops.gather(d, i, axis=0),
                                      data)

        dataset = dataset.map(grab_batch,
                              num_parallel_calls=dataset_ops.AUTOTUNE)

        # Default optimizations are disabled to avoid the overhead of (unnecessary)
        # input pipeline graph serialization and deserialization
        options = dataset_ops.Options()
        options.experimental_optimization.apply_default_optimizations = False
        dataset = dataset.with_options(options)
        self._dataset = dataset
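The shuffle-via-gather idea above can be reduced to a short standalone sketch with the public tf.data API. It is simplified: the original reshapes the index Tensor into full batches instead of calling .batch, and disables the default static optimizations; the toy data and sizes here are made up.

import numpy as np
import tensorflow as tf

x = np.arange(10, dtype=np.float32).reshape(10, 1)   # toy data
batch_size = 4

# One fresh permutation of indices per epoch, split into batches of indices.
indices_ds = (tf.data.Dataset.range(1).repeat()
              .map(lambda _: tf.random.shuffle(tf.range(10, dtype=tf.int64)))
              .prefetch(1)
              .flat_map(lambda idx: tf.data.Dataset.from_tensor_slices(idx)
                        .batch(batch_size)))

# Zip the index batches with the full data and gather rows in a vectorized way.
data_ds = tf.data.Dataset.from_tensors(x).repeat()
batches = tf.data.Dataset.zip((indices_ds, data_ds)).map(
    lambda i, d: tf.gather(d, i, axis=0))

for batch in batches.take(3):
    print(batch.shape)  # (4, 1), (4, 1), then the partial (2, 1) batch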
Example 7
    def __init__(self,
                 x,
                 y=None,
                 sample_weights=None,
                 batch_size=None,
                 epochs=1,
                 steps=None,
                 shuffle=False,
                 **kwargs):
        super(TensorLikeDataAdapter, self).__init__(x, y, **kwargs)
        x = _process_numpy_inputs(x)
        y = _process_numpy_inputs(y)
        sample_weights = _process_numpy_inputs(sample_weights)

        # If sample_weights are not specified for an output, use 1.0 as weights.
        if sample_weights is not None and any(w is None
                                              for w in sample_weights):
            weight = next(s for s in sample_weights if s is not None)
            sample_weights = training_utils.list_to_tuple([
                array_ops.ones((weight.shape[0], )) if sw is None else sw
                for sw in sample_weights
            ])

        if y is not None and sample_weights is not None:
            inputs = (x, y, sample_weights)
        elif y is not None:
            # Sample weight is only needed for training, so if y is None, then
            # sample_weight is ignored.
            inputs = (x, y)
        else:
            inputs = (x, )

        num_samples = int(nest.flatten(x)[0].shape[0])

        # If batch_size is not passed but steps is, calculate from the input data.
        if steps and not batch_size:
            batch_size = int(math.ceil(num_samples / steps))

        if not batch_size:
            raise ValueError(
                "`batch_size` or `steps` is required for `Tensor` or `NumPy`"
                " input data.")

        self._size = int(math.ceil(num_samples / batch_size))
        self._batch_size = batch_size
        self._has_partial_batch = (self._size != (num_samples // batch_size))

        self._partial_batch_size = None
        if self._has_partial_batch:
            self._partial_batch_size = (num_samples -
                                        (self._size - 1) * self._batch_size)

        # Vectorized version of shuffle.
        # This is a performance improvement over using `from_tensor_slices`.
        # The indices of the data are shuffled and batched, and these indices
        # are then zipped with the data and used to extract a batch of the data
        # at each step. The performance improvements here come from:
        # 1. vectorized batch using gather
        # 2. parallelized map
        # 3. vectorized shuffle by using reshape and unbatch
        # 4. disabled static optimizations
        indices_ds = None
        for _ in range(epochs):
            indices = np.arange(num_samples)
            if shuffle:
                np.random.shuffle(indices)

            full_batch_indices = np.reshape(
                indices[:(num_samples // batch_size) * batch_size],
                [-1, batch_size])
            partial_batch_indices = indices[(num_samples // batch_size) *
                                            batch_size:]

            epoch_indices_ds = dataset_ops.DatasetV2.from_tensors(
                full_batch_indices).unbatch()
            if partial_batch_indices.size:
                epoch_indices_ds = epoch_indices_ds.concatenate(
                    dataset_ops.DatasetV2.from_tensors(partial_batch_indices))

            if indices_ds is None:
                indices_ds = epoch_indices_ds
            else:
                indices_ds = indices_ds.concatenate(epoch_indices_ds)

        data_ds = dataset_ops.DatasetV2.from_tensors(inputs).repeat()
        dataset = dataset_ops.DatasetV2.zip((data_ds, indices_ds))

        def _nested_grab_batch(data, indices):
            """Grabs batches of Tensors in `data` based on `indices`."""
            def _grab_batch(x):
                """Grabs a batch of `x`."""
                x_batch = array_ops.gather(x, indices)
                x_shape = x.shape.as_list()

                if not self._has_partial_batch:
                    # Recover the batch shape info.
                    x_shape[0] = self._batch_size
                    x_batch.set_shape(x_shape)
                elif self._partial_batch_size >= num_samples:
                    # Only one batch per epoch.
                    x_shape[0] = self._partial_batch_size
                    x_batch.set_shape(x_shape)
                return x_batch

            return nest.map_structure(_grab_batch, data)

        dataset = dataset.map(_nested_grab_batch,
                              num_parallel_calls=dataset_ops.AUTOTUNE)

        # Default optimizations are disabled to avoid the overhead of (unnecessary)
        # input pipeline graph serialization and deserialization
        options = dataset_ops.Options()
        options.experimental_optimization.apply_default_optimizations = False
        dataset = dataset.with_options(options)
        self._dataset = dataset
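The per-epoch index preparation above boils down to a reshape of the shuffled indices into full batches plus a trailing remainder; a small numpy sketch with made-up sizes:

import numpy as np

num_samples, batch_size = 10, 4        # hypothetical sizes
indices = np.arange(num_samples)
np.random.shuffle(indices)

full_batch_indices = np.reshape(
    indices[:(num_samples // batch_size) * batch_size], [-1, batch_size])
partial_batch_indices = indices[(num_samples // batch_size) * batch_size:]

print(full_batch_indices.shape)     # (2, 4): two full batches of shuffled indices
print(partial_batch_indices.shape)  # (2,): the final partial batch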