Beispiel #1
0
def _TopKGrad(op, grad, _):
  """Return the gradients for TopK.

  Args:
    op: The TopKOp for which we need to generate gradients.
    grad: Tensor. The gradients passed to the TopKOp.

  Returns:
    A list of two tensors, the first being the gradient w.r.t to the input and
    TopK, and the second being the gradient w.r.t. to the indices (all zero).
  """
  in_shape = array_ops.shape(op.inputs[0])
  ind_shape = array_ops.shape(op.outputs[1])

  ind_lastdim = array_ops.gather(ind_shape, array_ops.size(ind_shape) - 1)
  # Flatten indices to 2D.
  ind_2d = array_ops.reshape(op.outputs[1], array_ops.stack([-1, ind_lastdim]))

  in_lastdim = array_ops.gather(in_shape, array_ops.size(in_shape) - 1)
  outerdim = array_ops.shape(ind_2d)[0]
  # Compute linear indices (flattened to 1D).
  ind = array_ops.reshape(ind_2d + array_ops.expand_dims(
      math_ops.range(0, outerdim * in_lastdim, in_lastdim), -1), [-1])

  # Substitute grad to appropriate locations and fill the rest with zeros,
  # finally reshaping it to the original input shape.
  return [array_ops.reshape(
      sparse_ops.sparse_to_dense(ind,
                                 array_ops.reshape(
                                     math_ops.reduce_prod(in_shape), [1]),
                                 array_ops.reshape(grad, [-1]),
                                 validate_indices=False),
      in_shape), array_ops.zeros(
          [], dtype=dtypes.int32)]
Beispiel #2
0
  def _fused_batch_norm(self, inputs, training):
    """Returns the output of fused batch norm."""
    beta = self.beta if self.center else self._beta_const
    gamma = self.gamma if self.scale else self._gamma_const

    def _fused_batch_norm_training():
      return nn.fused_batch_norm(
          inputs,
          gamma,
          beta,
          epsilon=self.epsilon,
          data_format=self._data_format)

    def _fused_batch_norm_inference():
      return nn.fused_batch_norm(
          inputs,
          gamma,
          beta,
          mean=self.moving_mean,
          variance=self.moving_variance,
          epsilon=self.epsilon,
          is_training=False,
          data_format=self._data_format)

    output, mean, variance = tf_utils.smart_cond(
        training, _fused_batch_norm_training, _fused_batch_norm_inference)
    if not self._bessels_correction_test_only:
      # Remove Bessel's correction to be consistent with non-fused batch norm.
      # Note that the variance computed by fused batch norm is
      # with Bessel's correction.
      sample_size = math_ops.cast(
          array_ops.size(inputs) / array_ops.size(variance), variance.dtype)
      factor = (sample_size - math_ops.cast(1.0, variance.dtype)) / sample_size
      variance *= factor

    training_value = tf_utils.constant_value(training)
    if training_value is None:
      momentum = tf_utils.smart_cond(training,
                                     lambda: self.momentum,
                                     lambda: 1.0)
    else:
      momentum = ops.convert_to_tensor(self.momentum)
    if training_value or training_value is None:
      if distribution_strategy_context.in_cross_replica_context():
        strategy = distribution_strategy_context.get_strategy()
        mean_update = strategy.extended.update(
            self.moving_mean, self._assign_moving_average,
            (mean, self.momentum))
        variance_update = strategy.extended.update(
            self.moving_variance, self._assign_moving_average,
            (variance, self.momentum))
      else:
        mean_update = self._assign_moving_average(self.moving_mean, mean,
                                                  momentum)
        variance_update = self._assign_moving_average(self.moving_variance,
                                                      variance, momentum)
      self.add_update(mean_update, inputs=True)
      self.add_update(variance_update, inputs=True)

    return output
Beispiel #3
0
def _SparseDenseCwiseMulOrDivGrad(op, grad, is_mul):
  """Common code for SparseDenseCwise{Mul,Div} gradients."""
  x_indices = op.inputs[0]
  x_shape = op.inputs[2]
  y = op.inputs[3]

  y_shape = math_ops.to_int64(array_ops.shape(y))
  num_added_dims = array_ops.expand_dims(
      array_ops.size(x_shape) - array_ops.size(y_shape), 0)
  augmented_y_shape = array_ops.concat(
      [array_ops.ones(num_added_dims, ops.dtypes.int64), y_shape], 0)

  scaling = x_shape // augmented_y_shape
  scaled_indices = x_indices // scaling
  scaled_indices = array_ops.slice(scaled_indices,
                                   array_ops.concat([[0], num_added_dims], 0),
                                   [-1, -1])
  dense_vals = array_ops.gather_nd(y, scaled_indices)

  if is_mul:
    dx = grad * dense_vals
    dy_val = grad * op.inputs[1]
  else:
    dx = grad / dense_vals
    dy_val = grad * (-op.inputs[1] / math_ops.square(dense_vals))
  # indices can repeat after scaling, so we can't use sparse_to_dense().
  dy = sparse_ops.sparse_add(
      array_ops.zeros_like(y),
      sparse_tensor.SparseTensor(scaled_indices, dy_val, y_shape))

  # (sp_indices, sp_vals, sp_shape, dense)
  return (None, dx, None, dy)
  def quantiles_ready():
    """The subgraph for when the quantiles are ready."""
    quantized_feature = quantile_ops.quantiles([sparse_column_values], [],
                                               [quantile_buckets], [])
    quantized_feature = math_ops.cast(quantized_feature[0], dtypes.int64)
    quantized_feature = array_ops.reshape(quantized_feature, [-1])
    example_indices, _ = array_ops.split(
        sparse_column_indices, num_or_size_splits=2, axis=1)
    example_indices = array_ops.squeeze(example_indices, [1])
    filtered_gradients = array_ops.gather(gradients, example_indices)
    filtered_hessians = array_ops.gather(hessians, example_indices)
    filtered_partition_ids = array_ops.gather(example_partition_ids,
                                              example_indices)
    unique_partitions, mapped_partitions = array_ops.unique(
        example_partition_ids)

    # Compute aggregate stats for each partition.
    per_partition_gradients = math_ops.unsorted_segment_sum(
        gradients, mapped_partitions, array_ops.size(unique_partitions))
    per_partition_hessians = math_ops.unsorted_segment_sum(
        hessians, mapped_partitions, array_ops.size(unique_partitions))

    # Prepend a bias feature per partition that accumulates the stats for all
    # examples in that partition.
    bias_feature_ids = array_ops.fill(
        array_ops.shape(unique_partitions), _BIAS_FEATURE_ID)
    bias_feature_ids = math_ops.cast(bias_feature_ids, dtypes.int64)
    partition_ids = array_ops.concat(
        [unique_partitions, filtered_partition_ids], 0)
    filtered_gradients = array_ops.concat(
        [per_partition_gradients, filtered_gradients], 0)
    filtered_hessians = array_ops.concat(
        [per_partition_hessians, filtered_hessians], 0)
    bucket_ids = array_ops.concat([bias_feature_ids, quantized_feature], 0)
    return partition_ids, bucket_ids, filtered_gradients, filtered_hessians
Beispiel #5
0
  def _fused_batch_norm(self, inputs, training):
    """Returns the output of fused batch norm."""
    # TODO(reedwm): Add support for fp16 inputs.
    beta = self.beta if self.center else self._beta_const
    gamma = self.gamma if self.scale else self._gamma_const

    def _fused_batch_norm_training():
      return nn.fused_batch_norm(
          inputs,
          gamma,
          beta,
          epsilon=self.epsilon,
          data_format=self._data_format)

    def _fused_batch_norm_inference():
      return nn.fused_batch_norm(
          inputs,
          gamma,
          beta,
          mean=self.moving_mean,
          variance=self.moving_variance,
          epsilon=self.epsilon,
          is_training=False,
          data_format=self._data_format)

    output, mean, variance = utils.smart_cond(
        training, _fused_batch_norm_training, _fused_batch_norm_inference)
    mean = array_ops.reshape(mean, shape=self.moving_mean.get_shape())
    variance = array_ops.reshape(variance,
                                 shape=self.moving_variance.get_shape())
    if not self._bessels_correction_test_only:
      # Remove Bessel's correction to be consistent with non-fused batch norm.
      # Note that the variance computed by fused batch norm is
      # with Bessel's correction.
      sample_size = math_ops.cast(
          array_ops.size(inputs) / array_ops.size(variance), variance.dtype)
      factor = (sample_size - math_ops.cast(1.0, variance.dtype)) / sample_size
      variance *= factor

    training_value = utils.constant_value(training)
    if training_value is None:
      one_minus_decay = utils.smart_cond(training,
                                         lambda: self._one_minus_decay,
                                         lambda: 0.)
    else:
      one_minus_decay = ops.convert_to_tensor(self._one_minus_decay)
    if training_value or training_value is None:
      mean_update = self._assign_moving_average(self.moving_mean, mean,
                                                one_minus_decay)
      variance_update = self._assign_moving_average(self.moving_variance,
                                                    variance, one_minus_decay)
      if context.in_graph_mode():
        # Note that in Eager mode, the updates are already executed when running
        # assign_moving_averages. So we do not need to put them into
        # collections.
        self.add_update(mean_update, inputs=inputs)
        self.add_update(variance_update, inputs=inputs)

    return output
def _FoldFusedBatchNormGrad(op, unused_grad_y, grad_mean, grad_var, unused_1,
                            unused_2):
  x = op.inputs[0]
  n = math_ops.cast(
      array_ops.size(x) / array_ops.size(grad_mean), dtypes.float32)
  dmean_dx = grad_mean / n
  dvar_dx = 2 * grad_var * (x - op.outputs[1]) / (n - 1)
  return (dmean_dx + dvar_dx), None, None, None, None
Beispiel #7
0
def _GatherV2Grad(op, grad):
  """Gradient for GatherV2 op."""
  # params can be large, so colocate the shape calculation with it.
  #
  # params can be very large for sparse model, array_ops.shape raises
  # exception on the Windows platform when any dimension is larger than
  # int32. params_shape is not used in optimizer apply_sparse gradients,
  # so it's fine to convert it back to int32 regardless of truncation.
  params = op.inputs[0]
  with ops.colocate_with(params):
    params_shape = array_ops.shape(params, out_type=ops.dtypes.int64)
    params_shape = math_ops.to_int32(params_shape)

  indices = op.inputs[1]
  indices_size = array_ops.expand_dims(array_ops.size(indices), 0)
  axis = op.inputs[2]
  axis_static = tensor_util.constant_value(axis)

  # For axis 0 gathers, build an appropriately shaped IndexedSlices.
  if axis_static == 0:
    values_shape = array_ops.concat([indices_size, params_shape[1:]], 0)
    values = array_ops.reshape(grad, values_shape)
    indices = array_ops.reshape(indices, indices_size)
    return [ops.IndexedSlices(values, indices, params_shape), None, None]

  outer_shape = params_shape[:axis]
  outer_dims = array_ops.size(outer_shape)
  inner_shape = params_shape[axis:][1:]
  inner_dims = array_ops.size(inner_shape)

  outer_axes_indices = math_ops.range(outer_dims)
  inner_axes_indices = math_ops.range(outer_dims + 1,
                                      outer_dims + 1 + inner_dims)

  values_shape = array_ops.concat([outer_shape, indices_size, inner_shape], 0)
  values = array_ops.reshape(grad, values_shape)
  indices = array_ops.reshape(indices, indices_size)

  # We need to sum up every slice `values[..., i, ....]` corresponding to
  # `params[..., indices[i], ...]`. Since `unsorted_segment_sum` does not
  # support an axis parameter, we transpose the gather dimension to the front,
  # then use `unsorted_segment_sum` to build a
  # [gather_axis, outer_axes, inner_axes] tensor with all the gradients
  # affecting each index in `gather_axis` summed up.
  transpose_dims = array_ops.concat(
      [[outer_dims], outer_axes_indices, inner_axes_indices], 0)
  values_transpose = array_ops.transpose(values, transpose_dims)
  num_segments = params_shape[axis]

  params_grad = math_ops.unsorted_segment_sum(
      values_transpose, indices, num_segments)

  # Inverts the above transpose by moving dimension 0 back to its original
  # position.
  invert_transpose_dims = array_ops.concat(
      [outer_axes_indices + 1, [0], inner_axes_indices], 0)
  params_grad = array_ops.transpose(params_grad, invert_transpose_dims)
  return [params_grad, None, None]
 def testSize(self):
   with self.test_scope():
     self.assertEqual(
         1, array_ops.size(constant_op.constant(1.0)).numpy())
     self.assertEqual(
         3, array_ops.size(constant_op.constant([1.0, 2.0, 3.0])).numpy())
     self.assertEqual(
         4, array_ops.size(
             constant_op.constant([[1.0, 2.0], [3.0, 4.0]])).numpy())
 def _compareSize(self, x, use_gpu=False):
   np_ans = np.asarray(np.size(x))
   with self.test_session(use_gpu=use_gpu):
     tf_ans = array_ops.size(x)
     result = tf_ans.eval()
     tf_ans_64 = array_ops.size(x, out_type=dtypes.int64)
     result_64 = tf_ans_64.eval()
   self.assertAllEqual(np_ans, result)
   self.assertAllEqual(np_ans, result_64)
   self.assertShapeEqual(np_ans, tf_ans)
Beispiel #10
0
  def testDenseShape(self):
    t_value = [[0, 42], [24, 0]]
    self.assertAllEqual((2, 2), self.evaluate(array_ops.shape(t_value)))
    self.assertEqual(4, self.evaluate(array_ops.size(t_value)))
    self.assertEqual(2, self.evaluate(array_ops.rank(t_value)))

    t = constant_op.constant(t_value)
    self.assertAllEqual((2, 2), self.evaluate(array_ops.shape(t)))
    self.assertEqual(4, self.evaluate(array_ops.size(t)))
    self.assertEqual(2, self.evaluate(array_ops.rank(t)))
  def testDenseShape(self):
    with self.test_session():
      t_value = [[0, 42], [24, 0]]
      self.assertAllEqual((2, 2), array_ops.shape(t_value).eval())
      self.assertEqual(4, array_ops.size(t_value).eval())
      self.assertEqual(2, array_ops.rank(t_value).eval())

      t = constant_op.constant(t_value)
      self.assertAllEqual((2, 2), array_ops.shape(t).eval())
      self.assertEqual(4, array_ops.size(t).eval())
      self.assertEqual(2, array_ops.rank(t).eval())
Beispiel #12
0
  def testSparseShape(self):
    sp_value = sparse_tensor.SparseTensorValue(
        indices=((0, 1), (1, 0)), values=(42, 24), dense_shape=(2, 2))
    self.assertAllEqual((2, 2), self.evaluate(array_ops.shape(sp_value)))
    self.assertEqual(4, self.evaluate(array_ops.size(sp_value)))
    self.assertEqual(2, self.evaluate(array_ops.rank(sp_value)))

    sp = sparse_tensor.SparseTensor.from_value(sp_value)
    self.assertAllEqual((2, 2), self.evaluate(array_ops.shape(sp)))
    self.assertEqual(4, self.evaluate(array_ops.size(sp)))
    self.assertEqual(2, self.evaluate(array_ops.rank(sp)))
    def active_inputs():
      """The normal flow when the handler is active."""
      # Remove the second column of example indices matrix since it is not
      # useful.
      example_indices, _ = array_ops.split(
          self._sparse_int_column.indices, num_or_size_splits=2, axis=1)
      example_indices = array_ops.squeeze(example_indices, [1])

      filtered_gradients = array_ops.gather(gradients, example_indices)
      filtered_hessians = array_ops.gather(hessians, example_indices)
      filtered_partition_ids = array_ops.gather(example_partition_ids,
                                                example_indices)
      unique_partitions, mapped_partitions = array_ops.unique(
          example_partition_ids)

      # Compute aggregate stats for each partition.
      # The bias is computed on gradients and hessians (and not
      # filtered_gradients) which have exactly one value per example, so we
      # don't double count a gradient in multivalent columns.
      # Since unsorted_segment_sum can be numerically unstable, use 64bit
      # operation.
      gradients64 = math_ops.cast(gradients, dtypes.float64)
      hessians64 = math_ops.cast(hessians, dtypes.float64)
      per_partition_gradients = math_ops.unsorted_segment_sum(
          gradients64, mapped_partitions, array_ops.size(unique_partitions))
      per_partition_hessians = math_ops.unsorted_segment_sum(
          hessians64, mapped_partitions, array_ops.size(unique_partitions))
      per_partition_gradients = math_ops.cast(per_partition_gradients,
                                              dtypes.float32)
      per_partition_hessians = math_ops.cast(per_partition_hessians,
                                             dtypes.float32)
      # Prepend a bias feature per partition that accumulates the stats for all
      # examples in that partition.
      # Bias is added to the stats even if there are no examples with values in
      # the current sparse column. The reason is that the other example batches
      # might have values in these partitions so we have to keep the bias
      # updated.
      bias_feature_ids = array_ops.fill(
          array_ops.shape(unique_partitions), _BIAS_FEATURE_ID)
      bias_feature_ids = math_ops.cast(bias_feature_ids, dtypes.int64)
      partition_ids = array_ops.concat(
          [unique_partitions, filtered_partition_ids], 0)
      filtered_gradients = array_ops.concat(
          [per_partition_gradients, filtered_gradients], 0)
      filtered_hessians = array_ops.concat(
          [per_partition_hessians, filtered_hessians], 0)
      feature_ids = array_ops.concat(
          [bias_feature_ids, self._sparse_int_column.values], 0)
      # Dimension is always zero for sparse int features.
      dimension_ids = array_ops.zeros_like(feature_ids, dtype=dtypes.int64)
      feature_ids_and_dimensions = array_ops.stack(
          [feature_ids, dimension_ids], axis=1)
      return (partition_ids, feature_ids_and_dimensions, filtered_gradients,
              filtered_hessians)
  def testSparseShape(self):
    with self.test_session():
      sp_value = sparse_tensor.SparseTensorValue(
          indices=((0, 1), (1, 0)), values=(42, 24), dense_shape=(2, 2))
      self.assertAllEqual((2, 2), array_ops.shape(sp_value).eval())
      self.assertEqual(4, array_ops.size(sp_value).eval())
      self.assertEqual(2, array_ops.rank(sp_value).eval())

      sp = sparse_tensor.SparseTensor.from_value(sp_value)
      self.assertAllEqual((2, 2), array_ops.shape(sp).eval())
      self.assertEqual(4, array_ops.size(sp).eval())
      self.assertEqual(2, array_ops.rank(sp).eval())
Beispiel #15
0
  def pack(self, grouped_grads_and_vars):
    """Pack tensors."""
    self.grouped_grads_and_vars = grouped_grads_and_vars
    self.all_tower_shapes = []
    self.all_tower_sizes = []

    device_grad_packs = []
    for tower_grads_and_vars in grouped_grads_and_vars:
      with ops.colocate_with(tower_grads_and_vars[0][0]):
        # Flatten all the grads.
        flat_grads = [
            array_ops.reshape(g, [-1]) for g, _ in tower_grads_and_vars
        ]
        # Remember the original shape of all the grads.
        tower_shapes = [array_ops.shape(g) for g, _ in tower_grads_and_vars]
        # Remember the original sizes of all the grads.
        tower_sizes = [array_ops.size(g) for g, _ in tower_grads_and_vars]
        # Concat all the flat grads into a big flat tensor.
        concat_grads = array_ops.concat(flat_grads, 0)

        # Split the big tensor into num_splits packs. In cases where the
        # total size is not divisible num_splits, the last pack gets
        # more elements.
        # TODO(zhengxq): it is also possible to optimize away all the concat
        # as well.
        num_splits = self.num_packs

        # The array_ops.size function will sometimes remove static shapes. So if
        # all gradient shapes are defined, we use another method to get the
        # total size.
        # TODO(yuefengz): move this logic to array_ops.size.
        if all([g.shape.is_fully_defined() for g, _ in tower_grads_and_vars]):
          total_grad_size = sum(
              [g.shape.num_elements() for g, _ in tower_grads_and_vars])
        else:
          total_grad_size = array_ops.size(concat_grads)

        split_size = total_grad_size // num_splits
        split_size_last = total_grad_size - split_size * (num_splits - 1)
        split_sizes = [split_size] * (num_splits - 1) + [split_size_last]
        grad_packs = array_ops.split(concat_grads, split_sizes)

        # Ready to aggregate the repacked gradients, with fake variables.
        # TODO(zhengxq): It is hacky to have to use fake variables.
        # We should remove the need for variables in
        # aggregate_gradients_using*.
        device_grad_packs.append(zip(grad_packs, [None] * num_splits))
        self.all_tower_shapes.append(tower_shapes)
        self.all_tower_sizes.append(tower_sizes)

    return device_grad_packs
  def quantiles_ready():
    """The subgraph for when the quantiles are ready."""
    quantized_feature = quantile_ops.quantiles([], [sparse_column_values], [],
                                               [quantile_buckets],
                                               [sparse_column_indices])

    quantized_feature = math_ops.cast(quantized_feature[1], dtypes.int64)
    quantized_feature = array_ops.squeeze(quantized_feature, axis=0)

    example_indices, _ = array_ops.split(
        sparse_column_indices, num_or_size_splits=2, axis=1)
    example_indices = array_ops.squeeze(example_indices, [1])
    filtered_gradients = array_ops.gather(gradients, example_indices)
    filtered_hessians = array_ops.gather(hessians, example_indices)
    filtered_partition_ids = array_ops.gather(example_partition_ids,
                                              example_indices)
    unique_partitions, mapped_partitions = array_ops.unique(
        example_partition_ids)

    # Compute aggregate stats for each partition.
    # Since unsorted_segment_sum can be numerically unstable, use 64bit
    # operation.
    gradients64 = math_ops.cast(gradients, dtypes.float64)
    hessians64 = math_ops.cast(hessians, dtypes.float64)
    per_partition_gradients = math_ops.unsorted_segment_sum(
        gradients64, mapped_partitions, array_ops.size(unique_partitions))
    per_partition_hessians = math_ops.unsorted_segment_sum(
        hessians64, mapped_partitions, array_ops.size(unique_partitions))
    per_partition_gradients = math_ops.cast(per_partition_gradients,
                                            dtypes.float32)
    per_partition_hessians = math_ops.cast(per_partition_hessians,
                                           dtypes.float32)
    # Prepend a bias feature per partition that accumulates the stats for all
    # examples in that partition.
    bias_feature_ids = array_ops.fill(
        array_ops.shape(unique_partitions), _BIAS_FEATURE_ID)
    bias_feature_ids = math_ops.cast(bias_feature_ids, dtypes.int64)
    zeros = array_ops.zeros_like(bias_feature_ids)
    bias_feature_ids = array_ops.stack([bias_feature_ids, zeros], axis=1)

    partition_ids = array_ops.concat(
        [unique_partitions, filtered_partition_ids], 0)
    filtered_gradients = array_ops.concat(
        [per_partition_gradients, filtered_gradients], 0)
    filtered_hessians = array_ops.concat(
        [per_partition_hessians, filtered_hessians], 0)

    bucket_ids = array_ops.concat([bias_feature_ids, quantized_feature], 0)

    return partition_ids, bucket_ids, filtered_gradients, filtered_hessians
Beispiel #17
0
  def _update_mask(self, weights, threshold):
    """Updates the mask for a given weight tensor.

    This functions first computes the cdf of the weight tensor, and estimates
    the threshold value such that 'desired_sparsity' fraction of weights
    have magnitude less than the threshold.

    Args:
      weights: The weight tensor that needs to be masked.
      threshold: The current threshold value. The function will compute a new
        threshold and return the exponential moving average using the current
        value of threshold

    Returns:
      new_threshold: The new value of the threshold based on weights, and
        sparsity at the current global_step
      new_mask: A numpy array of the same size and shape as weights containing
        0 or 1 to indicate which of the values in weights falls below
        the threshold

    Raises:
      ValueError: if sparsity is not defined
    """
    if self._sparsity is None:
      raise ValueError('Sparsity variable undefined')

    sparsity = self._get_sparsity(weights.op.name)
    with ops.name_scope(weights.op.name + '_pruning_ops'):
      abs_weights = math_ops.abs(weights)
      k = math_ops.cast(
          math_ops.round(
              math_ops.cast(array_ops.size(abs_weights), dtypes.float32) *
              (1 - sparsity)), dtypes.int32)
      # Sort the entire array
      values, _ = nn_ops.top_k(
          array_ops.reshape(abs_weights, [-1]), k=array_ops.size(abs_weights))
      # Grab the (k-1) th value
      current_threshold = array_ops.gather(values, k - 1)
      smoothed_threshold = math_ops.add_n([
          math_ops.multiply(current_threshold, 1 - self._spec.threshold_decay),
          math_ops.multiply(threshold, self._spec.threshold_decay)
      ])

      new_mask = math_ops.cast(
          math_ops.greater_equal(abs_weights, smoothed_threshold),
          dtypes.float32)

    return smoothed_threshold, new_mask
Beispiel #18
0
  def __init__(self, embedding, start_tokens, end_token):
    """Initializer.

    Args:
      embedding: A callable that takes a vector tensor of `ids` (argmax ids),
        or the `params` argument for `embedding_lookup`. The returned tensor
        will be passed to the decoder input.
      start_tokens: `int32` vector shaped `[batch_size]`, the start tokens.
      end_token: `int32` scalar, the token that marks end of decoding.

    Raises:
      ValueError: if `start_tokens` is not a 1D tensor or `end_token` is not a
        scalar.
    """
    if callable(embedding):
      self._embedding_fn = embedding
    else:
      self._embedding_fn = (
          lambda ids: embedding_ops.embedding_lookup(embedding, ids))

    self._start_tokens = ops.convert_to_tensor(
        start_tokens, dtype=dtypes.int32, name="start_tokens")
    self._end_token = ops.convert_to_tensor(
        end_token, dtype=dtypes.int32, name="end_token")
    if self._start_tokens.get_shape().ndims != 1:
      raise ValueError("start_tokens must be a vector")
    self._batch_size = array_ops.size(start_tokens)
    if self._end_token.get_shape().ndims != 0:
      raise ValueError("end_token must be a scalar")
    self._start_inputs = self._embedding_fn(self._start_tokens)
  def _broadcast_uniform_partitioned_dimension(self, axis, lengths):
    """Broadcasts the partitioned dimension `axis` to match `lengths`."""
    axis_dim_size = self.dimension_size(axis)
    partitioned_sizes = list(self._partitioned_dim_sizes[:axis])

    if lengths.shape.ndims == 0:
      lengths = array_ops.where(
          math_ops.equal(axis_dim_size, 1), lengths, axis_dim_size)
      repeats = array_ops.where(math_ops.equal(axis_dim_size, 1), lengths, 1)
      splits = array_ops.stack([0, self.num_slices_in_dimension(axis)])
    else:
      splits = math_ops.range(
          array_ops.size(lengths, out_type=self.dim_size_dtype) + 1)
      repeats = lengths

    partitioned_sizes.append(lengths)

    for dim_size in self._partitioned_dim_sizes[axis + 1:]:
      if dim_size.shape.ndims == 0:
        partitioned_sizes.append(dim_size)
        splits *= dim_size
      else:
        partitioned_sizes.append(
            ragged_util.repeat_ranges(dim_size, splits, repeats))
        splits = array_ops.gather(
            ragged_util.lengths_to_splits(dim_size), splits)
    inner_sizes = self._inner_dim_sizes
    return RaggedTensorDynamicShape(partitioned_sizes, inner_sizes)
Beispiel #20
0
def _flip_vector_to_matrix_dynamic(vec, batch_shape):
  """flip_vector_to_matrix with dynamic shapes."""
  # Shapes associated with batch_shape
  batch_rank = array_ops.size(batch_shape)

  # Shapes associated with vec.
  vec = ops.convert_to_tensor(vec, name="vec")
  vec_shape = array_ops.shape(vec)
  vec_rank = array_ops.rank(vec)
  vec_batch_rank = vec_rank - 1

  m = vec_batch_rank - batch_rank
  # vec_shape_left = [M1,...,Mm] or [].
  vec_shape_left = array_ops.slice(vec_shape, [0], [m])
  # If vec_shape_left = [], then condensed_shape = [1] since reduce_prod([]) = 1
  # If vec_shape_left = [M1,...,Mm], condensed_shape = [M1*...*Mm]
  condensed_shape = [math_ops.reduce_prod(vec_shape_left)]
  k = array_ops.gather(vec_shape, vec_rank - 1)
  new_shape = array_ops.concat(0, (batch_shape, [k], condensed_shape))

  def _flip_front_dims_to_back():
    # Permutation corresponding to [N1,...,Nn] + [k, M1,...,Mm]
    perm = array_ops.concat(
        0, (math_ops.range(m, vec_rank), math_ops.range(0, m)))
    return array_ops.transpose(vec, perm=perm)

  x_flipped = control_flow_ops.cond(
      math_ops.less(0, m),
      _flip_front_dims_to_back,
      lambda: array_ops.expand_dims(vec, -1))

  return array_ops.reshape(x_flipped, new_shape)
def dense_make_stats_update(is_active, are_buckets_ready, float_column,
                            quantile_buckets, example_partition_ids, gradients,
                            hessians, weights, empty_gradients, empty_hessians):
  """Updates the state for dense split handler."""
  empty_float = constant_op.constant_v1([], dtype=dtypes.float32)

  quantile_values, quantile_weights = control_flow_ops.cond(
      is_active[1],  # For the next layer, this handler is inactive.
      lambda: (float_column, weights),
      lambda: (empty_float, empty_float))

  def ready_inputs_fn():
    """Branch to execute when quantiles are ready."""
    quantized_feature = quantile_ops.quantiles([float_column], [],
                                               [quantile_buckets], [], [])
    quantized_feature = math_ops.cast(quantized_feature[0], dtypes.int64)
    quantized_feature = array_ops.squeeze(quantized_feature, axis=0)
    return (example_partition_ids, quantized_feature, gradients, hessians)

  def not_ready_inputs_fn():
    return (constant_op.constant_v1([], dtype=dtypes.int32),
            constant_op.constant_v1([[]], dtype=dtypes.int64, shape=[1, 2]),
            empty_gradients, empty_hessians)

  example_partition_ids, feature_ids, gradients, hessians = (
      control_flow_ops.cond(
          math_ops.logical_and(
              math_ops.logical_and(are_buckets_ready,
                                   array_ops.size(quantile_buckets) > 0),
              is_active[0]), ready_inputs_fn, not_ready_inputs_fn))
  return (quantile_values, quantile_weights, example_partition_ids, feature_ids,
          gradients, hessians)
 def testEmptyInput(self):
   with self.test_session():
     x = array_ops.placeholder(dtypes.float32, shape=[0, 3])
     self.assertEqual(0, array_ops.size(x).eval())
     # reshape would raise if logits is empty
     with self.assertRaises(errors_impl.InvalidArgumentError):
       nn_ops.softmax(x, axis=0).eval()
 def _apply_sparse(self, grad, var):
   if len(grad.indices.get_shape()) == 1:
     grad_indices = grad.indices
     grad_values = grad.values
   else:
     grad_indices = array_ops.reshape(grad.indices, [-1])
     grad_values = array_ops.reshape(grad.values, [-1, grad.values.get_shape()[-1].value])
   gidxs, metagidxs = array_ops.unique(grad_indices)
   sizegidxs = array_ops.size(gidxs)
   gvals = math_ops.unsorted_segment_sum(grad_values, metagidxs, sizegidxs)
   # m_t = mu * m + (1 - mu) * g_t
   m = self.get_slot(var, "m")
   m_scaled_g_values = gvals * (1 - self._mu_t)
   m_t = state_ops.scatter_update(m, gidxs,
                                  array_ops.gather(m, gidxs) * self._mu_t,
                                  use_locking=self._use_locking)
   m_t = state_ops.scatter_add(m_t, gidxs, m_scaled_g_values,
                               use_locking=self._use_locking)
   m_t_ = array_ops.gather(m_t, gidxs) / (1 - self._mu2_t * self._mu_power)
   # m_bar = mu * m_t + (1 - mu) * g_t
   m_bar = self._mu2_t * m_t_ + m_scaled_g_values / (1 - self._mu_power)
   var_update = state_ops.scatter_sub(var, gidxs,
                                    self._lr_t * m_bar,
                                    use_locking=self._use_locking)
   return control_flow_ops.group(*[var_update, m_t])
Beispiel #24
0
def index_to_string_table_from_tensor(vocabulary_list,
                                      default_value="UNK",
                                      name=None):
  """Returns a lookup table that maps a `Tensor` of indices into strings.

  This operation constructs a lookup table to map int64 indices into string
  values. The mapping is initialized from a string `mapping` 1-D `Tensor` where
  each element is a value and the corresponding index within the tensor is the
  key.

  Any input which does not have a corresponding index in 'mapping'
  (an out-of-vocabulary entry) is assigned the `default_value`

  The underlying table must be initialized by calling
  `tf.tables_initializer.run()` or `table.init.run()` once.

  Elements in `mapping` cannot have duplicates, otherwise when executing the
  table initializer op, it will throw a `FailedPreconditionError`.

  Sample Usages:

  ```python
  vocabulary_list = tf.constant(["emerson", "lake", "palmer"])
  indices = tf.constant([1, 5], tf.int64)
  table = tf.contrib.lookup.index_to_string_table_from_tensor(
      vocabulary_list, default_value="UNKNOWN")
  values = table.lookup(indices)
  ...
  tf.tables_initializer().run()

  values.eval() ==> ["lake", "UNKNOWN"]
  ```

  Args:
    vocabulary_list: A 1-D string `Tensor` that specifies the strings to map
      from indices.
    default_value: The value to use for out-of-vocabulary indices.
    name: A name for this op (optional).

  Returns:
    The lookup table to map a string values associated to a given index `int64`
    `Tensors`.

  Raises:
    ValueError: when `vocabulary_list` is not set.
  """

  if vocabulary_list is None:
    raise ValueError("vocabulary_list must be specified.")

  with ops.name_scope(name, "index_to_string") as scope:
    vocabulary_list = ops.convert_to_tensor(vocabulary_list, dtypes.string)
    num_elements = array_ops.size(vocabulary_list)
    keys = math_ops.to_int64(math_ops.range(num_elements))

    shared_name = ""
    init = KeyValueTensorInitializer(
        keys, vocabulary_list, dtypes.int64, dtypes.string, name="table_init")
    # TODO(yleon): Use a more effienct structure.
    return HashTable(init, default_value, shared_name=shared_name, name=scope)
Beispiel #25
0
  def call(self, values, weights=None):
    """Accumulate statistics for computing the mean.

    For example, if values is [1, 3, 5, 7] then the mean is 4.
    If the weights were specified as [1, 1, 0, 0] then the mean would be 2.

    Args:
      values: Tensor with the per-example value.
      weights: Optional weighting of each example. Defaults to 1.
    """
    if not self.built:  # False only in the first call().
      self.numer = self.add_variable(name="numer", shape=(),
                                     dtype=dtypes.float64,
                                     initializer=init_ops.zeros_initializer)
      self.denom = self.add_variable(name="denom", shape=(),
                                     dtype=dtypes.float64,
                                     initializer=init_ops.zeros_initializer)
    if weights is None:
      self.denom.assign_add(
          math_ops.cast(array_ops.size(values), dtypes.float64))
      values = math_ops.reduce_sum(values)
      self.numer.assign_add(math_ops.cast(values, dtypes.float64))
    else:
      weights = math_ops.cast(weights, dtypes.float64)
      self.denom.assign_add(math_ops.reduce_sum(weights))
      values = math_ops.cast(values, dtypes.float64) * weights
      self.numer.assign_add(math_ops.reduce_sum(values))
 def testEmptyInput(self):
   with self.test_session():
     x = constant_op.constant([[]], shape=[0, 3])
     self.assertEqual(0, array_ops.size(x).eval())
     # reshape would raise if logits is empty
     with self.assertRaises(errors_impl.InvalidArgumentError):
       nn_ops.softmax(x, dim=0).eval()
Beispiel #27
0
  def call(self, values, weights=None):
    """Accumulate statistics for computing the mean.

    For example, if values is [1, 3, 5, 7] then the mean is 4.
    If the weights were specified as [1, 1, 0, 0] then the mean would be 2.

    Args:
      values: Tensor with the per-example value.
      weights: Optional weighting of each example. Defaults to 1.

    Returns:
      The arguments, for easy chaining.
    """
    if weights is None:
      self.denom.assign_add(
          math_ops.cast(array_ops.identity(array_ops.size(values)), self.dtype))
      values = math_ops.reduce_sum(values)
      self.numer.assign_add(math_ops.cast(values, self.dtype))
    else:
      weights = math_ops.cast(weights, self.dtype)
      self.denom.assign_add(math_ops.reduce_sum(weights))
      values = math_ops.cast(values, self.dtype) * weights
      self.numer.assign_add(math_ops.reduce_sum(values))
    if weights is None:
      return values
    return values, weights
Beispiel #28
0
 def approximate_hessian(self, grads_and_vars, name=None):
   """
   I haven't tested this yet so I have no idea if it works, but even if it
   does it's probably super slow, and either way nothing else has been modified
   to deal with it.
   """
   
   gv = 0
   var_refs = []
   for g_t, x_tm1 in grads_and_vars:
     var_refs.append(x_tm1.ref())
     if g_t is None:
       continue
     with ops.name_scope('update_' + x_tm1.op.name), ops.device(x_tm1.device):
       if isinstance(g_t, ops.Tensor):
         gv += math_ops.reduce_sum(g_t * random_ops.random_normal(g_t.get_shape()))
       else:
         idxs, idxs_ = array_ops.unique(g_t.indices)
         g_t_ = math_ops.unsorted_segment_sum(g_t.values, idxs_, array_ops.size(idxs))
         gv += math_ops.reduce_sum(g_t_ * random_ops.random_normal(g_t_.get_shape()))
   hesses = gradients.gradients(gv, var_refs,
                                gate_gradients=(gate_gradients == Optimizer.GATE_OP),
                                aggregation_method=aggregation_method,
                                colocate_gradients_with_ops=colocate_gradients_with_ops)
   return zip([g_t for g_t, _ in grads_and_vars], [x_tm1 for _, x_tm1 in grads_and_vars], hesses)
Beispiel #29
0
  def __init__(self, inputs, sequence_length, time_major=False, name=None):
    """Initializer.

    Args:
      inputs: A (structure of) input tensors.
      sequence_length: An int32 vector tensor.
      time_major: Python bool.  Whether the tensors in `inputs` are time major.
        If `False` (default), they are assumed to be batch major.
      name: Name scope for any created operations.

    Raises:
      ValueError: if `sequence_length` is not a 1D tensor.
    """
    with ops.name_scope(name, "TrainingHelper", [inputs, sequence_length]):
      inputs = ops.convert_to_tensor(inputs, name="inputs")
      if not time_major:
        inputs = nest.map_structure(_transpose_batch_time, inputs)

      self._input_tas = nest.map_structure(_unstack_ta, inputs)
      self._sequence_length = ops.convert_to_tensor(
          sequence_length, name="sequence_length")
      if self._sequence_length.get_shape().ndims != 1:
        raise ValueError(
            "Expected sequence_length to be a vector, but received shape: %s" %
            self._sequence_length.get_shape())

      self._zero_inputs = nest.map_structure(
          lambda inp: array_ops.zeros_like(inp[0, :]), inputs)

      self._batch_size = array_ops.size(sequence_length)
Beispiel #30
0
def _TileGrad(op, grad):
  """Sum reduces grad along the tiled dimensions."""
  input_shape = array_ops.shape(op.inputs[0])
  # We interleave multiples and input_shape to get split_shape,
  # reshape grad to split_shape, and reduce along all even
  # dimensions (the tiled dimensions) to get the result
  # with shape input_shape.  For example
  #   input_shape = [20, 30, 40]
  #   multiples = [2, 3, 4]
  #   split_shape = [2, 20, 3, 30, 4, 40]
  #   axes = [0, 2, 4]
  split_shape = array_ops.reshape(
      array_ops.transpose(array_ops.stack([op.inputs[1], input_shape])), [-1])
  axes = math_ops.range(0, array_ops.size(split_shape), 2)
  # Sum reduces grad along the first dimension for IndexedSlices
  if isinstance(grad, ops.IndexedSlices):
    grad = math_ops.unsorted_segment_sum(
        grad.values,
        math_ops.mod(grad.indices, input_shape[0]),
        input_shape[0])
    split_shape = array_ops.concat([[1], split_shape[1:]], axis=0)
  input_grad = math_ops.reduce_sum(array_ops.reshape(grad, split_shape), axes)
  # Fix shape inference
  if not context.executing_eagerly():
    input_grad.set_shape(op.inputs[0].get_shape())
  return [input_grad, None]
def _MatrixSquareRootGrad(op, grad):
  """Gradient for MatrixSquareRoot."""

  # Let A be an m x m square matrix (or batch of matrices)
  # Let R = sqrtm(A)
  # By definition, A = RR
  # Take the differential: dA = d(RR) = RdR + dRR
  # Solve the resulting Sylvester equation for dR

  # Used to find Kronecker products within the Sylvester equation
  def _KroneckerProduct(b1, b2):
    """Computes the Kronecker product of two batches of square matrices."""
    b1_shape = array_ops.shape(b1)
    b2_shape = array_ops.shape(b2)
    b1_order = b1_shape[-1]
    b2_order = b2_shape[-1]

    shape_slice_size = [math_ops.subtract(array_ops.size(b1_shape), 2)]
    shape_slice = array_ops.slice(b1_shape, [0],
                                  shape_slice_size)  # Same for both batches
    b1_reshape_shape = array_ops.concat(
        [shape_slice, [b1_order], [1], [b1_order], [1]], 0)
    b2_reshape_shape = array_ops.concat(
        [shape_slice, [1], [b2_order], [1], [b2_order]], 0)

    b1_reshape = array_ops.reshape(b1, b1_reshape_shape)
    b2_reshape = array_ops.reshape(b2, b2_reshape_shape)

    order_prod = b1_order * b2_order
    kprod_shape = array_ops.concat([shape_slice, [order_prod], [order_prod]], 0)
    return array_ops.reshape(b1_reshape * b2_reshape, kprod_shape)

  sqrtm = op.outputs[0]  # R
  shape = array_ops.shape(sqrtm)
  order = shape[-1]  # m
  matrix_count = math_ops.reduce_prod(shape[0:-2])

  # Get batch of m x m identity matrices
  eye = linalg_ops.eye(order, dtype=sqrtm.dtype)  # m x m identity matrix
  eye_flat = array_ops.reshape(eye, [-1])
  eye_tiled = array_ops.tile(eye_flat, [matrix_count])
  eye_batch = array_ops.reshape(eye_tiled, shape)

  # The transpose of R is taken in the k1 term instead of k2 in
  # order to prevent redundant transposition of R (i.e. (R')' = R)
  sqrtm_transpose = array_ops.matrix_transpose(sqrtm)
  k1 = _KroneckerProduct(eye_batch, sqrtm_transpose)
  k2 = _KroneckerProduct(sqrtm, eye_batch)
  ksum = math_ops.add(k1, k2)

  # Vectorize dA
  shape_slice_size = [math_ops.subtract(array_ops.size(shape), 2)]
  shape_slice = array_ops.slice(shape, [0], shape_slice_size)
  shape_vec_da = array_ops.concat([shape_slice, [order * order], [1]], 0)
  vec_da = array_ops.reshape(array_ops.matrix_transpose(grad), shape_vec_da)

  # Solve for vec(dR)
  vec_dsqrtm = linalg_ops.matrix_solve(ksum, vec_da)

  # Solve for dR by inverse vectorizing vec(dR)
  dsqrtm_transpose = array_ops.reshape(vec_dsqrtm, shape)
  return array_ops.matrix_transpose(dsqrtm_transpose)
Beispiel #32
0
def embedding_lookup(params, ids, partition_strategy="mod", name=None,
                     validate_indices=True):
  """Looks up `ids` in a list of embedding tensors.

  This function is used to perform parallel lookups on the list of
  tensors in `params`.  It is a generalization of
  [`tf.gather()`](../../api_docs/python/array_ops.md#gather), where `params` is
  interpreted as a partition of a larger embedding tensor.

  If `len(params) > 1`, each element `id` of `ids` is partitioned between
  the elements of `params` according to the `partition_strategy`.
  In all strategies, if the id space does not evenly divide the number of
  partitions, each of the first `(max_id + 1) % len(params)` partitions will
  be assigned one more id.

  If `partition_strategy` is `"mod"`, we assign each id to partition
  `p = id % len(params)`. For instance,
  13 ids are split across 5 partitions as:
  `[[0, 5, 10], [1, 6, 11], [2, 7, 12], [3, 8], [4, 9]]`

  If `partition_strategy` is `"div"`, we assign ids to partitions in a
  contiguous manner. In this case, 13 ids are split across 5 partitions as:
  `[[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10], [11, 12]]`

  The results of the lookup are concatenated into a dense
  tensor. The returned tensor has shape `shape(ids) + shape(params)[1:]`.

  Args:
    params: A list of tensors with the same type and which can be concatenated
      along dimension 0. Each `Tensor` must be appropriately sized for the given
      `partition_strategy`.
    ids: A `Tensor` with type `int32` or `int64` containing the ids to be looked
      up in `params`.
    partition_strategy: A string specifying the partitioning strategy, relevant
      if `len(params) > 1`. Currently `"div"` and `"mod"` are supported. Default
      is `"mod"`.
    name: A name for the operation (optional).
    validate_indices: Whether or not to validate gather indices.

  Returns:
    A `Tensor` with the same type as the tensors in `params`.

  Raises:
    ValueError: If `params` is empty.
  """
  if not isinstance(params, list):
    params = [params]
  with ops.op_scope(params + [ids], name, "embedding_lookup") as name:
    if not params:
      raise ValueError("Need at least one param")
    np = len(params)  # Number of partitions
    params = ops.convert_n_to_tensor_or_indexed_slices(params, name="params")
    if np == 1:
      with ops.colocate_with(params[0]):
        return array_ops.gather(params[0], ids, name=name,
                                validate_indices=validate_indices)
    else:
      ids = ops.convert_to_tensor(ids, name="ids")
      flat_ids = array_ops.reshape(ids, [-1])
      original_indices = math_ops.range(array_ops.size(flat_ids))

      # Create p_assignments and set new_ids depending on the strategy.
      if partition_strategy == "mod":
        p_assignments = flat_ids % np
        new_ids = flat_ids // np
      elif partition_strategy == "div":
        # Compute num_total_ids as the sum of dim-0 of params, then assign to
        # partitions based on a constant number of ids per partition. Optimize
        # if we already know the full shape statically.
        dim_0_size = params[0].get_shape()[0]
        for p in xrange(1, np):
          dim_0_size += params[p].get_shape()[0]
        if dim_0_size.value:
          num_total_ids = constant_op.constant(dim_0_size.value, flat_ids.dtype)
        else:
          dim_0_sizes = []
          for p in xrange(np):
            if params[p].get_shape()[0].value is not None:
              dim_0_sizes.append(params[p].get_shape()[0].value)
            else:
              with ops.colocate_with(params[p]):
                dim_0_sizes.append(array_ops.shape(params[p])[0])
          num_total_ids = math_ops.reduce_sum(
              math_ops.cast(array_ops.pack(dim_0_sizes), flat_ids.dtype))
        ids_per_partition = num_total_ids // np
        extras = num_total_ids % np

        p_assignments = math_ops.maximum(
            flat_ids // (ids_per_partition + 1),
            (flat_ids - extras) // ids_per_partition)

        # Emulate a conditional using a boolean indicator tensor
        is_in_first_extras_partitions = math_ops.cast(
            p_assignments < extras, flat_ids.dtype)
        new_ids = (
            is_in_first_extras_partitions * (
                flat_ids % (ids_per_partition + 1)) +
            (1 - is_in_first_extras_partitions) * (
                (flat_ids - extras) % ids_per_partition))
      else:
        raise ValueError("Unrecognized partition strategy: " +
                         partition_strategy)

      # Cast partition assignments to int32 for use in dynamic_partition.
      # There really should not be more than 2^32 partitions.
      p_assignments = math_ops.cast(p_assignments, dtypes.int32)
      # Partition list of ids based on assignments into np separate lists
      gather_ids = data_flow_ops.dynamic_partition(new_ids, p_assignments, np)
      # Similarly, partition the original indices.
      pindices = data_flow_ops.dynamic_partition(original_indices,
                                                 p_assignments, np)
      # Do np separate lookups, finding embeddings for plist[p] in params[p]
      partitioned_result = []
      for p in xrange(np):
        with ops.colocate_with(params[p]):
          partitioned_result.append(array_ops.gather(
              params[p], gather_ids[p],
              validate_indices=validate_indices))
      # Stitch these back together
      ret = data_flow_ops.dynamic_stitch(pindices, partitioned_result,
                                         name=name)
      # Reshape to reverse the flattening of ids.
      element_shape = params[0].get_shape()[1:]
      for p in params[1:]:
        element_shape = element_shape.merge_with(p.get_shape()[1:])
      if element_shape.is_fully_defined():
        ret = array_ops.reshape(ret, array_ops.concat(0, [
            array_ops.shape(ids), element_shape]))
      else:
        # It's important that we compute params[0].shape on the right device
        # to avoid data motion.
        with ops.colocate_with(params[0]):
          params_shape = array_ops.shape(params[0])
        ret = array_ops.reshape(ret, array_ops.concat(0, [
            array_ops.shape(ids), array_ops.slice(params_shape, [1], [-1])]))
      # output shape = ids.shape + params[*].shape[1:]
      # Normally the reshape is sufficient, but setting shape explicitly
      # teaches shape inference that params[1:].get_shape() matters.
      ret.set_shape(ids.get_shape().concatenate(element_shape))
      return ret
 def _shape(self):
   d_shape = array_ops.shape(self._diag)
   k = array_ops.gather(d_shape, array_ops.size(d_shape) - 1)
   return array_ops.concat_v2((d_shape, [k]), 0)
def _FindFusedBatchNorms(graph):
    """Finds all ops and tensors related to found FusedBatchNorms.

  Args:
    graph: Graph to inspect.

  Yields:
    _FusedBatchNormMatches.
  """
    input_pattern = graph_matcher.OpTypePattern('*')
    weight_pattern = graph_matcher.OpTypePattern('*')
    gamma_pattern = graph_matcher.OpTypePattern('*')
    beta_pattern = graph_matcher.OpTypePattern('*')
    mean_pattern = graph_matcher.OpTypePattern('*')
    variance_pattern = graph_matcher.OpTypePattern('*')

    moving_average_pattern = graph_matcher.OpTypePattern('*')
    bn_decay_pattern = graph_matcher.OpTypePattern('*')
    layer_pattern = graph_matcher.OpTypePattern(
        'Conv2D|DepthwiseConv2dNative|MatMul',
        inputs=[input_pattern, weight_pattern])
    # MatMul has a Reshape between it and FusedBatchNorm.
    matmul_reshape_pattern = graph_matcher.OpTypePattern(
        'Reshape', inputs=[layer_pattern,
                           graph_matcher.OpTypePattern('*')])

    batch_norm_pattern = graph_matcher.OpTypePattern(
        'FusedBatchNorm',
        inputs=[
            graph_matcher.OneofPattern([matmul_reshape_pattern,
                                        layer_pattern]), gamma_pattern,
            beta_pattern, mean_pattern, variance_pattern
        ])
    matmul_bn_output_reshape_pattern = graph_matcher.OpTypePattern(
        'Reshape',
        inputs=[batch_norm_pattern,
                graph_matcher.OpTypePattern('*')])

    bn_matcher = graph_matcher.GraphMatcher(
        graph_matcher.OneofPattern(
            [matmul_bn_output_reshape_pattern, batch_norm_pattern]))

    moving_average_sub_pattern = graph_matcher.OpTypePattern(
        'Sub', inputs=[moving_average_pattern, batch_norm_pattern])
    moving_average_mul_pattern = graph_matcher.OpTypePattern(
        'Mul', inputs=[moving_average_sub_pattern, bn_decay_pattern])

    moving_avg_mul_matcher = graph_matcher.GraphMatcher(
        moving_average_mul_pattern)

    for match_result in bn_matcher.match_graph(graph):
        moving_mean_tensor = None
        moving_variance_tensor = None
        bn_decay_mean_tensor = None
        bn_decay_var_tensor = None
        layer_op = match_result.get_op(layer_pattern)
        layer_tensor = match_result.get_tensor(layer_pattern)
        bn_op = match_result.get_op(batch_norm_pattern)
        batch_epsilon = bn_op.get_attr('epsilon')

        # In the MatMul case, the output of batch norm is reshaped back into a
        # 2D tensor, so the output_tensor is the output of the Reshape op.
        output_tensor = bn_op.outputs[0]
        if layer_op.type == 'MatMul':
            output_reshape_op = match_result.get_op(
                matmul_bn_output_reshape_pattern)
            # If the matcher didn't match matmul_bn_output_reshape, there will be
            # another match for this 'MatMul' later, so we can skip this one.
            if output_reshape_op is None:
                continue
            output_tensor = output_reshape_op.outputs[0]

        # Ensure that the output tensor has consumers, otherwise this is a dangling
        # node and not a match.
        if not output_tensor.consumers():
            continue

        input_tensor = match_result.get_tensor(input_pattern)
        weight_tensor = match_result.get_tensor(weight_pattern)
        gamma_tensor = match_result.get_tensor(gamma_pattern)
        beta_tensor = match_result.get_tensor(beta_pattern)
        # FusedBatchNorm in training is different from that in inference. It takes
        # empty 'mean' and empty 'variance', and produces the mean and the variance
        # of the batch. Therefore, when is_training is true, mean_tensor and
        # variance_tensor point to 1st and 2nd (0-based) output of bn_op,
        # respectively; when is_training is false, they point to bn_op's inputs.
        is_training = bn_op.get_attr('is_training')
        if is_training:
            # FusedBatchNormGrad doesn't compute gradients of the batch_mean and
            # batch_variance outputs, so we need to substitute our own custom
            # gradient.
            # TODO(suharshs, raghuramank): Find a way to avoid needing this hack.
            # pylint: disable=protected-access
            bn_op._set_attr(
                '_gradient_op_type',
                attr_value_pb2.AttrValue(
                    s=compat.as_bytes('FoldFusedBatchNormGrad')))
            # pylint: enable=protected-access
            mean_tensor = bn_op.outputs[1]
            # The batch variance used during forward and backward prop is biased,
            # i.e it is calculated as: V=sum(x(k)-mu)^2/N. For the moving average
            # calculation, the variance is corrected by the term N/N-1 (Bessel's
            # correction). The variance tensor read from FuseBatchNorm has Bessel's
            # correction applied, so we undo it here.
            scope, sep, _ = bn_op.name.rpartition('/')
            g = ops.get_default_graph()
            with g.as_default(), g.name_scope(scope + sep):
                n = math_ops.cast(
                    array_ops.size(layer_tensor) / array_ops.size(mean_tensor),
                    dtypes.float32)
                variance_tensor = math_ops.multiply(
                    bn_op.outputs[2], (n - 1) / n,
                    name='Undo_Bessel_Correction')
            # TODO(suharshs): Find a way to get rid of this inner match.
            for mul_match_result in moving_avg_mul_matcher.match_graph(graph):
                sub_op = mul_match_result.get_op(moving_average_sub_pattern)
                if sub_op.inputs[1].name == bn_op.outputs[1].name:
                    # During training: Batch Mean is bn_op.outputs[1]
                    moving_mean_tensor = sub_op.inputs[0]
                    bn_decay_mean_tensor = mul_match_result.get_tensor(
                        bn_decay_pattern)
                if sub_op.inputs[1].name == bn_op.outputs[2].name:
                    # During training: Batch Var is bn_op.outputs[2]
                    moving_variance_tensor = sub_op.inputs[0]
                    bn_decay_var_tensor = mul_match_result.get_tensor(
                        bn_decay_pattern)
        else:
            mean_tensor = match_result.get_tensor(mean_pattern)
            variance_tensor = match_result.get_tensor(variance_pattern)

        yield _BatchNormMatch(layer_op=layer_op,
                              bn_op=bn_op,
                              output_tensor=output_tensor,
                              input_tensor=input_tensor,
                              weight_tensor=weight_tensor,
                              gamma_tensor=gamma_tensor,
                              beta_tensor=beta_tensor,
                              mean_tensor=mean_tensor,
                              variance_tensor=variance_tensor,
                              moving_mean_tensor=moving_mean_tensor,
                              moving_variance_tensor=moving_variance_tensor,
                              bn_decay_mean_tensor=bn_decay_mean_tensor,
                              bn_decay_var_tensor=bn_decay_var_tensor,
                              batch_epsilon=batch_epsilon)
Beispiel #35
0
def _GatherV2Grad(op, grad):
    """Gradient for GatherV2 op."""
    # params can be large, so colocate the shape calculation with it.
    #
    # params can be very large for sparse model, array_ops.shape raises
    # exception on the Windows platform when any dimension is larger than
    # int32. params_shape is not used in optimizer apply_sparse gradients,
    # so it's fine to convert it back to int32 regardless of truncation.
    params = op.inputs[0]
    with ops.colocate_with(params):
        params_shape = array_ops.shape(params, out_type=ops.dtypes.int64)
        params_shape = math_ops.to_int32(params_shape)

    indices = op.inputs[1]
    indices_size = array_ops.expand_dims(array_ops.size(indices), 0)
    axis = op.inputs[2]
    axis_static = tensor_util.constant_value(axis)

    # For axis 0 gathers, build an appropriately shaped IndexedSlices.
    if axis_static == 0:
        if context.in_eager_mode():
            params_tail_shape = params_shape.as_cpu_tensor()[1:]
        else:
            params_tail_shape = params_shape[1:]
        values_shape = array_ops.concat([indices_size, params_tail_shape], 0)
        values = array_ops.reshape(grad, values_shape)
        indices = array_ops.reshape(indices, indices_size)
        return [ops.IndexedSlices(values, indices, params_shape), None, None]

    outer_shape = params_shape[:axis]
    outer_dims = array_ops.size(outer_shape)
    inner_shape = params_shape[axis:][1:]
    inner_dims = array_ops.size(inner_shape)

    outer_axes_indices = math_ops.range(outer_dims)
    inner_axes_indices = math_ops.range(outer_dims + 1,
                                        outer_dims + 1 + inner_dims)

    values_shape = array_ops.concat([outer_shape, indices_size, inner_shape],
                                    0)
    values = array_ops.reshape(grad, values_shape)
    indices = array_ops.reshape(indices, indices_size)

    # We need to sum up every slice `values[..., i, ....]` corresponding to
    # `params[..., indices[i], ...]`. Since `unsorted_segment_sum` does not
    # support an axis parameter, we transpose the gather dimension to the front,
    # then use `unsorted_segment_sum` to build a
    # [gather_axis, outer_axes, inner_axes] tensor with all the gradients
    # affecting each index in `gather_axis` summed up.
    transpose_dims = array_ops.concat(
        [[outer_dims], outer_axes_indices, inner_axes_indices], 0)
    values_transpose = array_ops.transpose(values, transpose_dims)
    num_segments = params_shape[axis]

    params_grad = math_ops.unsorted_segment_sum(values_transpose, indices,
                                                num_segments)

    # Inverts the above transpose by moving dimension 0 back to its original
    # position.
    invert_transpose_dims = array_ops.concat(
        [outer_axes_indices + 1, [0], inner_axes_indices], 0)
    params_grad = array_ops.transpose(params_grad, invert_transpose_dims)
    return [params_grad, None, None]
Beispiel #36
0
    def __init__(self,
                 dist,
                 cat,
                 validate_args=False,
                 allow_nan_stats=True,
                 name="Categorized"):
        """Construct categorized distributions.

    Args:
      dist: A 'Distribution' instance.
      cat: A `Categorical` instance.
      validate_args: `Boolean`, default `False`.  Whether to assert that
        `lambd > 0` as well as inputs to pmf computations are non-negative
        integers. If validate_args is `False`, then `pmf` computations might
        return `NaN`, but can be evaluated at any real value.
      allow_nan_stats: `Boolean`, default `True`.  If `False`, raise an
        exception if a statistic (e.g. mean/mode/etc...) is undefined for any
        batch member.  If `True`, batch members with valid parameters leading to
        undefined statistics will return NaN for this statistic.
      name: A name for this distribution.
    """
        parameters = locals()
        parameters.pop("self")

        if not isinstance(dist, distribution.Distribution):
            raise TypeError("dist must be Distribution instance"
                            " but saw: %s" % dist)

        if not isinstance(cat, categorical.Categorical):
            raise TypeError(
                "cat must be a Categorical distribution, but saw: %s" % cat)
        is_continuous = dist.is_continuous

        static_event_shape = dist.get_event_shape()
        static_batch_shape = cat.get_batch_shape()
        static_batch_shape = static_batch_shape.merge_with(
            dist.get_batch_shape())

        if static_event_shape.ndims is None:
            raise ValueError(
                "Expected to know rank(event_shape) from distribution, but "
                "it does not provide a static number of ndims")

        with ops.name_scope(name,
                            values=cat._graph_parents +
                            dist._graph_parents) as ns:

            cat_batch_shape = cat.batch_shape()
            cat_batch_rank = array_ops.size(cat_batch_shape)
            if validate_args:
                batch_shapes = dist.batch_shape()
                batch_ranks = [array_ops.size(bs) for bs in batch_shapes]
                check_message = ("components[%d] batch shape must match cat "
                                 "batch shape")
                self._assertions = [
                    check_ops.assert_equal(cat_batch_rank,
                                           batch_ranks[di],
                                           message=check_message % di)
                    for di in range(len(components))
                ]
                self._assertions += [
                    check_ops.assert_equal(cat_batch_shape,
                                           batch_shapes[di],
                                           message=check_message % di)
                    for di in range(len(components))
                ]
            else:
                self._assertions = []

            self._dist = dist
            self._cat = cat
            self._num_cat_classes = math_ops.cast(cat.num_classes - 1,
                                                  dtypes.float32)
            self._static_event_shape = static_event_shape
            self._static_batch_shape = static_batch_shape

        graph_parents = self._cat._graph_parents
        graph_parents += self._dist._graph_parents

        super(Categorized, self).__init__(dtype=dist.dtype,
                                          is_continuous=is_continuous,
                                          is_reparameterized=False,
                                          validate_args=validate_args,
                                          allow_nan_stats=allow_nan_stats,
                                          parameters=parameters,
                                          graph_parents=graph_parents,
                                          name=ns)
Beispiel #37
0
def update_medoid_per_cluster(
    pairwise_distances,
    pairwise_distances_subset,
    labels,
    chosen_ids,
    cluster_member_ids,
    cluster_idx,
    margin_multiplier,
    margin_type,
):
    """Updates the cluster medoid per cluster.

  Args:
    pairwise_distances: 2-D Tensor of pairwise distances.
    pairwise_distances_subset: 2-D Tensor of pairwise distances for one cluster.
    labels: 1-D Tensor of ground truth cluster assignment.
    chosen_ids: 1-D Tensor of cluster centroid indices.
    cluster_member_ids: 1-D Tensor of cluster member indices for one cluster.
    cluster_idx: Index of this one cluster.
    margin_multiplier: multiplication constant.
    margin_type: Type of structured margin to use. Default is nmi.

  Returns:
    chosen_ids: Updated 1-D Tensor of cluster centroid indices.
  """
    def func_cond(iteration, scores_margin):
        del scores_margin  # Unused variable scores_margin.
        return iteration < num_candidates

    def func_body(iteration, scores_margin):
        # swap the current medoid with the candidate cluster member
        candidate_medoid = math_ops.to_int32(cluster_member_ids[iteration])
        tmp_chosen_ids = update_1d_tensor(chosen_ids, cluster_idx,
                                          candidate_medoid)
        predictions = get_cluster_assignment(pairwise_distances,
                                             tmp_chosen_ids)
        metric_score = compute_clustering_score(labels, predictions,
                                                margin_type)
        pad_before = array_ops.zeros([iteration])
        pad_after = array_ops.zeros([num_candidates - 1 - iteration])
        return (
            iteration + 1,
            scores_margin +
            array_ops.concat([pad_before, [1.0 - metric_score], pad_after], 0),
        )

    # pairwise_distances_subset is of size [p, 1, 1, p],
    #   the intermediate dummy dimensions at
    #   [1, 2] makes this code work in the edge case where p=1.
    #   this happens if the cluster size is one.
    scores_fac = -1.0 * math_ops.reduce_sum(
        array_ops.squeeze(pairwise_distances_subset, [1, 2]), axis=0)

    iteration = array_ops.constant(0)
    num_candidates = array_ops.size(cluster_member_ids)
    scores_margin = array_ops.zeros([num_candidates])

    _, scores_margin = control_flow_ops.while_loop(func_cond, func_body,
                                                   [iteration, scores_margin])
    candidate_scores = math_ops.add(scores_fac,
                                    margin_multiplier * scores_margin)

    argmax_index = math_ops.to_int32(
        math_ops.argmax(candidate_scores, dimension=0))

    best_medoid = math_ops.to_int32(cluster_member_ids[argmax_index])
    chosen_ids = update_1d_tensor(chosen_ids, cluster_idx, best_medoid)
    return chosen_ids
Beispiel #38
0
    def _sample_n(self, n, seed=None):
        with ops.control_dependencies(self._assertions):
            n = ops.convert_to_tensor(n, name="n")
            static_n = tensor_util.constant_value(n)
            n = int(static_n) if static_n is not None else n
            cat_samples = self.cat.sample(n, seed=seed)

            static_samples_shape = cat_samples.get_shape()
            if static_samples_shape.is_fully_defined():
                samples_shape = static_samples_shape.as_list()
                samples_size = static_samples_shape.num_elements()
            else:
                samples_shape = array_ops.shape(cat_samples)
                samples_size = array_ops.size(cat_samples)
            static_batch_shape = self.get_batch_shape()
            if static_batch_shape.is_fully_defined():
                batch_shape = static_batch_shape.as_list()
                batch_size = static_batch_shape.num_elements()
            else:
                batch_shape = self.batch_shape()
                batch_size = array_ops.reduce_prod(batch_shape)
            static_event_shape = self.get_event_shape()
            if static_event_shape.is_fully_defined():
                event_shape = np.array(static_event_shape.as_list(),
                                       dtype=np.int32)
            else:
                event_shape = self.event_shape()

            # Get indices into the raw cat sampling tensor.  We will
            # need these to stitch sample values back out after sampling
            # within the component partitions.
            samples_raw_indices = array_ops.reshape(
                math_ops.range(0, samples_size), samples_shape)

            # Partition the raw indices so that we can use
            # dynamic_stitch later to reconstruct the samples from the
            # known partitions.
            partitioned_samples_indices = data_flow_ops.dynamic_partition(
                data=samples_raw_indices,
                partitions=cat_samples,
                num_partitions=self.num_components)

            # Copy the batch indices n times, as we will need to know
            # these to pull out the appropriate rows within the
            # component partitions.
            batch_raw_indices = array_ops.reshape(
                array_ops.tile(math_ops.range(0, batch_size), [n]),
                samples_shape)

            # Explanation of the dynamic partitioning below:
            #   batch indices are i.e., [0, 1, 0, 1, 0, 1]
            # Suppose partitions are:
            #     [1 1 0 0 1 1]
            # After partitioning, batch indices are cut as:
            #     [batch_indices[x] for x in 2, 3]
            #     [batch_indices[x] for x in 0, 1, 4, 5]
            # i.e.
            #     [1 1] and [0 0 0 0]
            # Now we sample n=2 from part 0 and n=4 from part 1.
            # For part 0 we want samples from batch entries 1, 1 (samples 0, 1),
            # and for part 1 we want samples from batch entries 0, 0, 0, 0
            #   (samples 0, 1, 2, 3).
            partitioned_batch_indices = data_flow_ops.dynamic_partition(
                data=batch_raw_indices,
                partitions=cat_samples,
                num_partitions=self.num_components)
            samples_class = [None for _ in range(self.num_components)]

            for c in range(self.num_components):
                n_class = array_ops.size(partitioned_samples_indices[c])
                seed = distribution_util.gen_new_seed(seed, "mixture")
                samples_class_c = self.components[c].sample(n_class, seed=seed)

                # Pull out the correct batch entries from each index.
                # To do this, we may have to flatten the batch shape.

                # For sample s, batch element b of component c, we get the
                # partitioned batch indices from
                # partitioned_batch_indices[c]; and shift each element by
                # the sample index.  The final lookup can be thought of as
                # a matrix gather along locations (s, b) in
                # samples_class_c where the n_class rows correspond to
                # samples within this component and the batch_size columns
                # correspond to batch elements within the component.
                #
                # Thus the lookup index is
                #   lookup[c, i] = batch_size * s[i] + b[c, i]
                # for i = 0 ... n_class[c] - 1.
                lookup_partitioned_batch_indices = (
                    batch_size * math_ops.range(n_class) +
                    partitioned_batch_indices[c])
                samples_class_c = array_ops.reshape(
                    samples_class_c,
                    array_ops.concat(([n_class * batch_size], event_shape), 0))
                samples_class_c = array_ops.gather(
                    samples_class_c,
                    lookup_partitioned_batch_indices,
                    name="samples_class_c_gather")
                samples_class[c] = samples_class_c

            # Stitch back together the samples across the components.
            lhs_flat_ret = data_flow_ops.dynamic_stitch(
                indices=partitioned_samples_indices, data=samples_class)
            # Reshape back to proper sample, batch, and event shape.
            ret = array_ops.reshape(
                lhs_flat_ret,
                array_ops.concat((samples_shape, self.event_shape()), 0))
            ret.set_shape(
                tensor_shape.TensorShape(static_samples_shape).concatenate(
                    self.get_event_shape()))
            return ret
def lifted_struct_loss(labels, embeddings, margin=1.0):
    """Computes the lifted structured loss.

  The loss encourages the positive distances (between a pair of embeddings
  with the same labels) to be smaller than any negative distances (between a
  pair of embeddings with different labels) in the mini-batch in a way
  that is differentiable with respect to the embedding vectors.
  See: https://arxiv.org/abs/1511.06452.

  Args:
    labels: 1-D tf.int32 `Tensor` with shape [batch_size] of
      multiclass integer labels.
    embeddings: 2-D float `Tensor` of embedding vectors. Embeddings should not
      be l2 normalized.
    margin: Float, margin term in the loss definition.

  Returns:
    lifted_loss: tf.float32 scalar.
  """
    # Reshape [batch_size] label tensor to a [batch_size, 1] label tensor.
    lshape = array_ops.shape(labels)
    assert lshape.shape == 1
    labels = array_ops.reshape(labels, [lshape[0], 1])

    # Build pairwise squared distance matrix.
    pairwise_distances = pairwise_distance(embeddings)

    # Build pairwise binary adjacency matrix.
    adjacency = math_ops.equal(labels, array_ops.transpose(labels))
    # Invert so we can select negatives only.
    adjacency_not = math_ops.logical_not(adjacency)

    batch_size = array_ops.size(labels)

    diff = margin - pairwise_distances
    mask = math_ops.cast(adjacency_not, dtype=dtypes.float32)
    # Safe maximum: Temporarily shift negative distances
    #   above zero before taking max.
    #     this is to take the max only among negatives.
    row_minimums = math_ops.reduce_min(diff, 1, keepdims=True)
    row_negative_maximums = math_ops.reduce_max(
        math_ops.multiply(diff - row_minimums,
                          mask), 1, keepdims=True) + row_minimums

    # Compute the loss.
    # Keep track of matrix of maximums where M_ij = max(m_i, m_j)
    #   where m_i is the max of alpha - negative D_i's.
    # This matches the Caffe loss layer implementation at:
    #   https://github.com/rksltnl/Caffe-Deep-Metric-Learning-CVPR16/blob/0efd7544a9846f58df923c8b992198ba5c355454/src/caffe/layers/lifted_struct_similarity_softmax_layer.cpp  # pylint: disable=line-too-long

    max_elements = math_ops.maximum(row_negative_maximums,
                                    array_ops.transpose(row_negative_maximums))
    diff_tiled = array_ops.tile(diff, [batch_size, 1])
    mask_tiled = array_ops.tile(mask, [batch_size, 1])
    max_elements_vect = array_ops.reshape(array_ops.transpose(max_elements),
                                          [-1, 1])

    loss_exp_left = array_ops.reshape(
        math_ops.reduce_sum(math_ops.multiply(
            math_ops.exp(diff_tiled - max_elements_vect), mask_tiled),
                            1,
                            keepdims=True), [batch_size, batch_size])

    loss_mat = max_elements + math_ops.log(loss_exp_left +
                                           array_ops.transpose(loss_exp_left))
    # Add the positive distance.
    loss_mat += pairwise_distances

    mask_positives = math_ops.cast(adjacency,
                                   dtype=dtypes.float32) - array_ops.diag(
                                       array_ops.ones([batch_size]))

    # *0.5 for upper triangular, and another *0.5 for 1/2 factor for loss^2.
    num_positives = math_ops.reduce_sum(mask_positives) / 2.0

    lifted_loss = math_ops.truediv(0.25 * math_ops.reduce_sum(
        math_ops.square(
            math_ops.maximum(math_ops.multiply(loss_mat, mask_positives),
                             0.0))),
                                   num_positives,
                                   name='liftedstruct_loss')
    return lifted_loss
def triplet_semihard_loss(labels, embeddings, margin=1.0):
    """Computes the triplet loss with semi-hard negative mining.

  The loss encourages the positive distances (between a pair of embeddings with
  the same labels) to be smaller than the minimum negative distance among
  which are at least greater than the positive distance plus the margin constant
  (called semi-hard negative) in the mini-batch. If no such negative exists,
  uses the largest negative distance instead.
  See: https://arxiv.org/abs/1503.03832.

  Args:
    labels: 1-D tf.int32 `Tensor` with shape [batch_size] of
      multiclass integer labels.
    embeddings: 2-D float `Tensor` of embedding vectors. Embeddings should
      be l2 normalized.
    margin: Float, margin term in the loss definition.

  Returns:
    triplet_loss: tf.float32 scalar.
  """
    # Reshape [batch_size] label tensor to a [batch_size, 1] label tensor.
    lshape = array_ops.shape(labels)
    assert lshape.shape == 1
    labels = array_ops.reshape(labels, [lshape[0], 1])

    # Build pairwise squared distance matrix.
    pdist_matrix = pairwise_distance(embeddings, squared=True)
    # Build pairwise binary adjacency matrix.
    adjacency = math_ops.equal(labels, array_ops.transpose(labels))
    # Invert so we can select negatives only.
    adjacency_not = math_ops.logical_not(adjacency)

    batch_size = array_ops.size(labels)

    # Compute the mask.
    pdist_matrix_tile = array_ops.tile(pdist_matrix, [batch_size, 1])
    mask = math_ops.logical_and(
        array_ops.tile(adjacency_not, [batch_size, 1]),
        math_ops.greater(
            pdist_matrix_tile,
            array_ops.reshape(array_ops.transpose(pdist_matrix), [-1, 1])))
    mask_final = array_ops.reshape(
        math_ops.greater(
            math_ops.reduce_sum(math_ops.cast(mask, dtype=dtypes.float32),
                                1,
                                keepdims=True), 0.0), [batch_size, batch_size])
    mask_final = array_ops.transpose(mask_final)

    adjacency_not = math_ops.cast(adjacency_not, dtype=dtypes.float32)
    mask = math_ops.cast(mask, dtype=dtypes.float32)

    # negatives_outside: smallest D_an where D_an > D_ap.
    negatives_outside = array_ops.reshape(
        masked_minimum(pdist_matrix_tile, mask), [batch_size, batch_size])
    negatives_outside = array_ops.transpose(negatives_outside)

    # negatives_inside: largest D_an.
    negatives_inside = array_ops.tile(
        masked_maximum(pdist_matrix, adjacency_not), [1, batch_size])
    semi_hard_negatives = array_ops.where(mask_final, negatives_outside,
                                          negatives_inside)

    loss_mat = math_ops.add(margin, pdist_matrix - semi_hard_negatives)

    mask_positives = math_ops.cast(adjacency,
                                   dtype=dtypes.float32) - array_ops.diag(
                                       array_ops.ones([batch_size]))

    # In lifted-struct, the authors multiply 0.5 for upper triangular
    #   in semihard, they take all positive pairs except the diagonal.
    num_positives = math_ops.reduce_sum(mask_positives)

    triplet_loss = math_ops.truediv(math_ops.reduce_sum(
        math_ops.maximum(math_ops.multiply(loss_mat, mask_positives), 0.0)),
                                    num_positives,
                                    name='triplet_semihard_loss')

    return triplet_loss
def hessians(ys,
             xs,
             name="hessians",
             colocate_gradients_with_ops=False,
             gate_gradients=False,
             aggregation_method=None):
    """Constructs the Hessian of sum of `ys` with respect to `x` in `xs`.

  `hessians()` adds ops to the graph to output the Hessian matrix of `ys`
  with respect to `xs`.  It returns a list of `Tensor` of length `len(xs)`
  where each tensor is the Hessian of `sum(ys)`.

  The Hessian is a matrix of second-order partial derivatives of a scalar
  tensor (see https://en.wikipedia.org/wiki/Hessian_matrix for more details).

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    name: Optional name to use for grouping all the gradient ops together.
      defaults to 'hessians'.
    colocate_gradients_with_ops: See `gradients()` documentation for details.
    gate_gradients: See `gradients()` documentation for details.
    aggregation_method: See `gradients()` documentation for details.

  Returns:
    A list of Hessian matrices of `sum(ys)` for each `x` in `xs`.

  Raises:
    LookupError: if one of the operations between `xs` and `ys` does not
      have a registered gradient function.
  """
    xs = gradients_util._AsList(xs)  # pylint: disable=protected-access
    kwargs = {
        "colocate_gradients_with_ops": colocate_gradients_with_ops,
        "gate_gradients": gate_gradients,
        "aggregation_method": aggregation_method
    }
    # Compute first-order derivatives and iterate for each x in xs.
    hessians = []
    _gradients = gradients(ys, xs, **kwargs)
    for gradient, x in zip(_gradients, xs):
        # change shape to one-dimension without graph branching
        gradient = array_ops.reshape(gradient, [-1])

        # Declare an iterator and tensor array loop variables for the gradients.
        n = array_ops.size(x)
        loop_vars = [
            array_ops.constant(0, dtypes.int32),
            tensor_array_ops.TensorArray(x.dtype, n)
        ]
        # Iterate over all elements of the gradient and compute second order
        # derivatives.
        _, hessian = control_flow_ops.while_loop(
            lambda j, _: j < n, lambda j, result:
            (j + 1, result.write(j,
                                 gradients(gradient[j], x)[0])), loop_vars)

        _shape = array_ops.shape(x)
        _reshaped_hessian = array_ops.reshape(
            hessian.stack(), array_ops.concat((_shape, _shape), 0))
        hessians.append(_reshaped_hessian)
    return hessians
Beispiel #42
0
def _BatchNormGrad(grad_y,
                   x,
                   scale,
                   pop_mean,
                   pop_var,
                   epsilon,
                   data_format,
                   is_training=True):
    """Returns the gradients for the 3 inputs of BatchNorm.

  Args:
    grad_y: A `Tensor` of 4 or 5 dimensions for gradient for y.
    x: A `Tensor` of 4 or 5 dimensions for x.
    scale: A `Tensor` of 1 dimension for scaling.
    pop_mean: A `Tensor` of 1 dimension for the population mean. Only used when
      is_training=False.
    pop_var: A `Tensor` of 1 dimension for the population variance. Only used
      when is_training=False.
    epsilon: A small float number added to the variance of x.
    data_format: The data format for input. Either b"NHWC" or b"NCHW".
    is_training: A bool value to indicate the operation is for training
      (default) or inference.

  Returns:
    A tuple (grad_x, grad_scale, grad_offset), where grad_x is the gradient
    for x, grad_scale the gradient for scale, and grad_offset the gradient
    for offset.
  """
    x_dtype = x.dtype.base_dtype
    if x_dtype == dtypes.float16:
        # float16 math is too imprecise, so we do the batch norm gradient
        # computations in float32.
        x = math_ops.cast(x, dtypes.float32)
        grad_y = math_ops.cast(grad_y, dtypes.float32)
    if is_training:
        if data_format == b"NHWC":
            keepdims = False
            reduce_axis = [0, 1, 2]
        elif data_format == b"NDHWC":
            keepdims = False
            reduce_axis = [0, 1, 2, 3]
        elif data_format == b"NCHW":
            keepdims = True
            reduce_axis = [0, 2, 3]
            shape = [1, array_ops.size(scale), 1, 1]
            scale = array_ops.reshape(scale, shape)
        else:
            keepdims = True
            reduce_axis = [0, 2, 3, 4]
            shape = [1, array_ops.size(scale), 1, 1, 1]
            scale = array_ops.reshape(scale, shape)
        mean_grad_y = math_ops.reduce_mean(grad_y,
                                           reduce_axis,
                                           keepdims=keepdims)
        mean_x = math_ops.reduce_mean(x, reduce_axis, keepdims=keepdims)
        var_x = math_ops.reduce_mean(math_ops.squared_difference(
            x, array_ops.stop_gradient(mean_x)),
                                     reduce_axis,
                                     keepdims=keepdims)
        grad_y_offset = grad_y - mean_grad_y
        x_offset = x - mean_x
        mean = math_ops.reduce_mean(grad_y * x_offset,
                                    axis=reduce_axis,
                                    keepdims=keepdims)
        grad_x = scale * math_ops.rsqrt(var_x + epsilon) * (
            grad_y_offset -
            math_ops.reciprocal(var_x + epsilon) * mean * x_offset)
        grad_scale = math_ops.rsqrt(var_x + epsilon) * math_ops.reduce_sum(
            grad_y * x_offset, axis=reduce_axis, keepdims=keepdims)
        if data_format == b"NCHW" or data_format == b"NCDHW":
            grad_scale = array_ops.squeeze(grad_scale)
        grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
        return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset
    else:
        if data_format == b"NHWC":
            reduce_axis = [0, 1, 2]
        elif data_format == b"NDHWC":
            reduce_axis = [0, 1, 2, 3]
        elif data_format == b"NCHW":
            reduce_axis = [0, 2, 3]
            shape = [1, array_ops.size(pop_mean), 1, 1]
            pop_mean = array_ops.reshape(pop_mean, shape)
            pop_var = array_ops.reshape(pop_var, shape)
            scale = array_ops.reshape(scale, shape)
        else:
            reduce_axis = [0, 2, 3, 4]
            shape = [1, array_ops.size(pop_mean), 1, 1, 1]
            pop_mean = array_ops.reshape(pop_mean, shape)
            pop_var = array_ops.reshape(pop_var, shape)
            scale = array_ops.reshape(scale, shape)

        grad_offset = math_ops.reduce_sum(grad_y, axis=reduce_axis)
        var_rsqrt = math_ops.rsqrt(pop_var + epsilon)
        grad_scale = math_ops.reduce_sum(grad_y * (x - pop_mean) * var_rsqrt,
                                         axis=reduce_axis)
        grad_x = grad_y * scale * var_rsqrt
        return math_ops.cast(grad_x, x_dtype), grad_scale, grad_offset
Beispiel #43
0
def _IFFT3DGrad(_, grad):
    rsize = 1. / math_ops.cast(array_ops.size(grad), dtypes.float32)
    return math_ops.fft3d(grad) * math_ops.complex(rsize, 0.)
Beispiel #44
0
    def _fused_batch_norm(self, inputs, training):
        """Returns the output of fused batch norm."""
        beta = self.beta if self.center else self._beta_const
        gamma = self.gamma if self.scale else self._gamma_const

        # TODO(b/129279393): Support zero batch input in non DistributionStrategy
        # code as well.
        if distribution_strategy_context.has_strategy(
        ) and not inputs.shape.is_fully_defined():
            inputs_size = array_ops.size(inputs)
        else:
            inputs_size = None

        def _fused_batch_norm_training():
            return nn.fused_batch_norm(inputs,
                                       gamma,
                                       beta,
                                       epsilon=self.epsilon,
                                       data_format=self._data_format)

        def _fused_batch_norm_inference():
            return nn.fused_batch_norm(inputs,
                                       gamma,
                                       beta,
                                       mean=self.moving_mean,
                                       variance=self.moving_variance,
                                       epsilon=self.epsilon,
                                       is_training=False,
                                       data_format=self._data_format)

        output, mean, variance = tf_utils.smart_cond(
            training, _fused_batch_norm_training, _fused_batch_norm_inference)
        if not self._bessels_correction_test_only:
            # Remove Bessel's correction to be consistent with non-fused batch norm.
            # Note that the variance computed by fused batch norm is
            # with Bessel's correction.
            sample_size = math_ops.cast(
                array_ops.size(inputs) / array_ops.size(variance),
                variance.dtype)
            factor = (sample_size -
                      math_ops.cast(1.0, variance.dtype)) / sample_size
            variance *= factor

        training_value = tf_utils.constant_value(training)
        if training_value is None:
            momentum = tf_utils.smart_cond(training, lambda: self.momentum,
                                           lambda: 1.0)
        else:
            momentum = ops.convert_to_tensor(self.momentum)
        if training_value or training_value is None:
            if distribution_strategy_context.in_cross_replica_context():
                strategy = distribution_strategy_context.get_strategy()
                mean_update = strategy.extended.update(
                    self.moving_mean, self._assign_moving_average,
                    (mean, self.momentum, inputs_size))
                variance_update = strategy.extended.update(
                    self.moving_variance, self._assign_moving_average,
                    (variance, self.momentum, inputs_size))
            else:
                mean_update = self._assign_moving_average(
                    self.moving_mean, mean, momentum, inputs_size)
                variance_update = self._assign_moving_average(
                    self.moving_variance, variance, momentum, inputs_size)
            self.add_update(mean_update, inputs=True)
            self.add_update(variance_update, inputs=True)

        return output
Beispiel #45
0
def average(a, axis=None, weights=None, returned=False):  # pylint: disable=missing-docstring
    if axis is not None and not isinstance(axis, six.integer_types):
        # TODO(wangpeng): Support tuple of ints as `axis`
        raise ValueError('`axis` must be an integer. Tuple of ints is not '
                         'supported yet. Got type: %s' % type(axis))
    a = np_array_ops.array(a)
    if weights is None:  # Treat all weights as 1
        if not np.issubdtype(a.dtype, np.inexact):
            a = a.astype(
                np_utils.result_type(a.dtype, np_dtypes.default_float_type()))
        avg = math_ops.reduce_mean(a.data, axis=axis)
        if returned:
            if axis is None:
                weights_sum = array_ops.size(a.data)
            else:
                weights_sum = array_ops.shape(a.data)[axis]
            weights_sum = math_ops.cast(weights_sum, a.data.dtype)
    else:
        if np.issubdtype(a.dtype, np.inexact):
            out_dtype = np_utils.result_type(a.dtype, weights)
        else:
            out_dtype = np_utils.result_type(a.dtype, weights,
                                             np_dtypes.default_float_type())
        a = np_array_ops.array(a, out_dtype).data
        weights = np_array_ops.array(weights, out_dtype).data

        def rank_equal_case():
            control_flow_ops.Assert(
                math_ops.reduce_all(
                    array_ops.shape(a) == array_ops.shape(weights)),
                [array_ops.shape(a),
                 array_ops.shape(weights)])
            weights_sum = math_ops.reduce_sum(weights, axis=axis)
            avg = math_ops.reduce_sum(a * weights, axis=axis) / weights_sum
            return avg, weights_sum

        if axis is None:
            avg, weights_sum = rank_equal_case()
        else:

            def rank_not_equal_case():
                control_flow_ops.Assert(
                    array_ops.rank(weights) == 1, [array_ops.rank(weights)])
                weights_sum = math_ops.reduce_sum(weights)
                axes = ops.convert_to_tensor([[axis], [0]])
                avg = math_ops.tensordot(a, weights, axes) / weights_sum
                return avg, weights_sum

            # We condition on rank rather than shape equality, because if we do the
            # latter, when the shapes are partially unknown but the ranks are known
            # and different, np_utils.cond will run shape checking on the true branch,
            # which will raise a shape-checking error.
            avg, weights_sum = np_utils.cond(
                math_ops.equal(array_ops.rank(a), array_ops.rank(weights)),
                rank_equal_case, rank_not_equal_case)

    avg = np_array_ops.array(avg)
    if returned:
        weights_sum = np_array_ops.broadcast_to(weights_sum,
                                                array_ops.shape(avg.data))
        return avg, weights_sum
    return avg
Beispiel #46
0
    def call(self, inputs, training=None):
        if training is None:
            training = K.learning_phase()

        in_eager_mode = context.executing_eagerly()
        if self.virtual_batch_size is not None:
            # Virtual batches (aka ghost batches) can be simulated by reshaping the
            # Tensor and reusing the existing batch norm implementation
            original_shape = [-1] + inputs.shape.as_list()[1:]
            expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:]

            # Will cause errors if virtual_batch_size does not divide the batch size
            inputs = array_ops.reshape(inputs, expanded_shape)

            def undo_virtual_batching(outputs):
                outputs = array_ops.reshape(outputs, original_shape)
                return outputs

        if self.fused:
            outputs = self._fused_batch_norm(inputs, training=training)
            if self.virtual_batch_size is not None:
                # Currently never reaches here since fused_batch_norm does not support
                # virtual batching
                outputs = undo_virtual_batching(outputs)
            return outputs

        # Compute the axes along which to reduce the mean / variance
        input_shape = inputs.shape
        ndims = len(input_shape)
        reduction_axes = [i for i in range(ndims) if i not in self.axis]
        if self.virtual_batch_size is not None:
            del reduction_axes[1]  # Do not reduce along virtual batch dim

        # Broadcasting only necessary for single-axis batch norm where the axis is
        # not the last dimension
        broadcast_shape = [1] * ndims
        broadcast_shape[self.axis[0]] = input_shape.dims[self.axis[0]].value

        def _broadcast(v):
            if (v is not None and len(v.shape) != ndims
                    and reduction_axes != list(range(ndims - 1))):
                return array_ops.reshape(v, broadcast_shape)
            return v

        scale, offset = _broadcast(self.gamma), _broadcast(self.beta)

        def _compose_transforms(scale, offset, then_scale, then_offset):
            if then_scale is not None:
                scale *= then_scale
                offset *= then_scale
            if then_offset is not None:
                offset += then_offset
            return (scale, offset)

        # Determine a boolean value for `training`: could be True, False, or None.
        training_value = tf_utils.constant_value(training)
        if training_value is not False:
            if self.adjustment:
                adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs))
                # Adjust only during training.
                adj_scale = tf_utils.smart_cond(
                    training, lambda: adj_scale,
                    lambda: array_ops.ones_like(adj_scale))
                adj_bias = tf_utils.smart_cond(
                    training, lambda: adj_bias,
                    lambda: array_ops.zeros_like(adj_bias))
                scale, offset = _compose_transforms(adj_scale, adj_bias, scale,
                                                    offset)

            # Some of the computations here are not necessary when training==False
            # but not a constant. However, this makes the code simpler.
            keep_dims = self.virtual_batch_size is not None or len(
                self.axis) > 1
            mean, variance = self._moments(math_ops.cast(
                inputs, self._param_dtype),
                                           reduction_axes,
                                           keep_dims=keep_dims)

            moving_mean = self.moving_mean
            moving_variance = self.moving_variance

            mean = tf_utils.smart_cond(training, lambda: mean,
                                       lambda: moving_mean)
            variance = tf_utils.smart_cond(training, lambda: variance,
                                           lambda: moving_variance)

            if self.virtual_batch_size is not None:
                # This isn't strictly correct since in ghost batch norm, you are
                # supposed to sequentially update the moving_mean and moving_variance
                # with each sub-batch. However, since the moving statistics are only
                # used during evaluation, it is more efficient to just update in one
                # step and should not make a significant difference in the result.
                new_mean = math_ops.reduce_mean(mean, axis=1, keepdims=True)
                new_variance = math_ops.reduce_mean(variance,
                                                    axis=1,
                                                    keepdims=True)
            else:
                new_mean, new_variance = mean, variance

            inputs_size = array_ops.size(inputs)
            if self.renorm:
                r, d, new_mean, new_variance = self._renorm_correction_and_moments(
                    new_mean, new_variance, training, inputs_size)
                # When training, the normalized values (say, x) will be transformed as
                # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
                # = x * (r * gamma) + (d * gamma + beta) with renorm.
                r = _broadcast(array_ops.stop_gradient(r, name='renorm_r'))
                d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
                scale, offset = _compose_transforms(r, d, scale, offset)

            if distribution_strategy_context.in_cross_replica_context():
                strategy = distribution_strategy_context.get_strategy()

                def _do_update(var, value):
                    """Compute the updates for mean and variance."""
                    if in_eager_mode and not self.trainable:
                        return
                    return strategy.extended.update(
                        var,
                        self._assign_moving_average,
                        (value, self.momentum, inputs_size),
                        group=False)

                # We need to unwrap the moving_mean or moving_variance in the case of
                # training being false to match the output of true_fn and false_fn
                # in the smart cond.
                mean_update = tf_utils.smart_cond(
                    training, lambda: _do_update(self.moving_mean, new_mean),
                    lambda: strategy.unwrap(self.moving_mean))
                variance_update = tf_utils.smart_cond(
                    training,
                    lambda: _do_update(self.moving_variance, new_variance),
                    lambda: strategy.unwrap(self.moving_variance))
            else:

                def _do_update(var, value):
                    """Compute the updates for mean and variance."""
                    if in_eager_mode and not self.trainable:
                        return
                    return self._assign_moving_average(var, value,
                                                       self.momentum,
                                                       inputs_size)

                mean_update = tf_utils.smart_cond(
                    training, lambda: _do_update(self.moving_mean, new_mean),
                    lambda: self.moving_mean)
                variance_update = tf_utils.smart_cond(
                    training,
                    lambda: _do_update(self.moving_variance, new_variance),
                    lambda: self.moving_variance)
            if not context.executing_eagerly():
                self.add_update(mean_update, inputs=True)
                self.add_update(variance_update, inputs=True)

        else:
            mean, variance = self.moving_mean, self.moving_variance

        mean = math_ops.cast(mean, inputs.dtype)
        variance = math_ops.cast(variance, inputs.dtype)
        if offset is not None:
            offset = math_ops.cast(offset, inputs.dtype)
        if scale is not None:
            scale = math_ops.cast(scale, inputs.dtype)
        # TODO(reedwm): Maybe do math in float32 if given float16 inputs, if doing
        # math in float16 hurts validation accuracy of popular models like resnet.
        outputs = nn.batch_normalization(inputs, _broadcast(mean),
                                         _broadcast(variance), offset, scale,
                                         self.epsilon)
        # If some components of the shape got lost due to adjustments, fix that.
        outputs.set_shape(input_shape)

        if self.virtual_batch_size is not None:
            outputs = undo_virtual_batching(outputs)
        return outputs
def triplet_loss_adapted_from_tf(y_true, y_pred):
    del y_true
    margin = 1.
    labels = y_pred[:, :1]

    labels = tf.cast(labels, dtype='int32')

    embeddings = y_pred[:, 1:]

    ### Code from Tensorflow function [tf.contrib.losses.metric_learning.triplet_semihard_loss] starts here:

    # Reshape [batch_size] label tensor to a [batch_size, 1] label tensor.
    # lshape=array_ops.shape(labels)
    # assert lshape.shape == 1
    # labels = array_ops.reshape(labels, [lshape[0], 1])

    # Build pairwise squared distance matrix.
    pdist_matrix = pairwise_distance(embeddings, squared=True)
    # Build pairwise binary adjacency matrix.
    adjacency = math_ops.equal(labels, array_ops.transpose(labels))
    # Invert so we can select negatives only.
    adjacency_not = math_ops.logical_not(adjacency)

    # global batch_size
    batch_size = array_ops.size(labels)  # was 'array_ops.size(labels)'

    # Compute the mask.
    pdist_matrix_tile = array_ops.tile(pdist_matrix, [batch_size, 1])
    mask = math_ops.logical_and(
        array_ops.tile(adjacency_not, [batch_size, 1]),
        math_ops.greater(
            pdist_matrix_tile,
            array_ops.reshape(array_ops.transpose(pdist_matrix), [-1, 1])))
    mask_final = array_ops.reshape(
        math_ops.greater(
            math_ops.reduce_sum(math_ops.cast(mask, dtype=dtypes.float32),
                                1,
                                keepdims=True), 0.0), [batch_size, batch_size])
    mask_final = array_ops.transpose(mask_final)

    adjacency_not = math_ops.cast(adjacency_not, dtype=dtypes.float32)
    mask = math_ops.cast(mask, dtype=dtypes.float32)

    # negatives_outside: smallest D_an where D_an > D_ap.
    negatives_outside = array_ops.reshape(
        masked_minimum(pdist_matrix_tile, mask), [batch_size, batch_size])
    negatives_outside = array_ops.transpose(negatives_outside)

    # negatives_inside: largest D_an.
    negatives_inside = array_ops.tile(
        masked_maximum(pdist_matrix, adjacency_not), [1, batch_size])
    semi_hard_negatives = array_ops.where(mask_final, negatives_outside,
                                          negatives_inside)

    loss_mat = math_ops.add(margin, pdist_matrix - semi_hard_negatives)

    mask_positives = math_ops.cast(adjacency,
                                   dtype=dtypes.float32) - array_ops.diag(
                                       array_ops.ones([batch_size]))

    # In lifted-struct, the authors multiply 0.5 for upper triangular
    #   in semihard, they take all positive pairs except the diagonal.
    num_positives = math_ops.reduce_sum(mask_positives)

    semi_hard_triplet_loss_distance = math_ops.truediv(
        math_ops.reduce_sum(
            math_ops.maximum(math_ops.multiply(loss_mat, mask_positives),
                             0.0)),
        num_positives,
        name='triplet_semihard_loss')

    ### Code from Tensorflow function semi-hard triplet loss ENDS here.
    return semi_hard_triplet_loss_distance
Beispiel #48
0
 def loop_fn(i):
     x_i = array_ops.gather(x, i)
     return array_ops.size(x_i), array_ops.size(x_i,
                                                out_type=dtypes.int64)
Beispiel #49
0
def _FFT3DGrad(_, grad):
    size = math_ops.cast(array_ops.size(grad), dtypes.float32)
    return math_ops.ifft3d(grad) * math_ops.complex(size, 0.)
Beispiel #50
0
  def _mini_batch_training_op(self, inputs, cluster_idx_list, cluster_centers,
                              total_counts):
    """Creates an op for training for mini batch case.

    Args:
      inputs: list of input Tensors.
      cluster_idx_list: A vector (or list of vectors). Each element in the
        vector corresponds to an input row in 'inp' and specifies the cluster id
        corresponding to the input.
      cluster_centers: Tensor Ref of cluster centers.
      total_counts: Tensor Ref of cluster counts.

    Returns:
      An op for doing an update of mini-batch k-means.
    """
    update_ops = []
    for inp, cluster_idx in zip(inputs, cluster_idx_list):
      with ops.colocate_with(inp, ignore_existing=True):
        assert total_counts is not None
        cluster_idx = array_ops.reshape(cluster_idx, [-1])
        # Dedupe the unique ids of cluster_centers being updated so that updates
        # can be locally aggregated.
        unique_ids, unique_idx = array_ops.unique(cluster_idx)
        num_unique_cluster_idx = array_ops.size(unique_ids)
        # Fetch the old values of counts and cluster_centers.
        with ops.colocate_with(total_counts, ignore_existing=True):
          old_counts = array_ops.gather(total_counts, unique_ids)
        # TODO(agarwal): This colocation seems to run into problems. Fix it.
        with ops.colocate_with(cluster_centers, ignore_existing=True):
          old_cluster_centers = array_ops.gather(cluster_centers, unique_ids)
        # Locally aggregate the increment to counts.
        count_updates = math_ops.unsorted_segment_sum(
            array_ops.ones_like(unique_idx, dtype=total_counts.dtype),
            unique_idx, num_unique_cluster_idx)
        # Locally compute the sum of inputs mapped to each id.
        # For a cluster with old cluster value x, old count n, and with data
        # d_1,...d_k newly assigned to it, we recompute the new value as
        # \\(x += (sum_i(d_i) - k * x) / (n + k)\\).
        # Compute \\(sum_i(d_i)\\), see comment above.
        cluster_center_updates = math_ops.unsorted_segment_sum(
            inp, unique_idx, num_unique_cluster_idx)
        # Shape to enable broadcasting count_updates and learning_rate to inp.
        # It extends the shape with 1's to match the rank of inp.
        broadcast_shape = array_ops.concat([
            array_ops.reshape(num_unique_cluster_idx, [1]),
            array_ops.ones(
                array_ops.reshape(array_ops.rank(inp) - 1, [1]),
                dtype=dtypes.int32)
        ], 0)
        # Subtract k * x, see comment above.
        cluster_center_updates -= math_ops.cast(
            array_ops.reshape(count_updates, broadcast_shape),
            inp.dtype) * old_cluster_centers
        learning_rate = math_ops.reciprocal(
            math_ops.cast(old_counts + count_updates, inp.dtype))
        learning_rate = array_ops.reshape(learning_rate, broadcast_shape)
        # scale by 1 / (n + k), see comment above.
        cluster_center_updates *= learning_rate
        # Apply the updates.
      update_counts = state_ops.scatter_add(total_counts, unique_ids,
                                            count_updates)
      update_cluster_centers = state_ops.scatter_add(
          cluster_centers, unique_ids, cluster_center_updates)
      update_ops.extend([update_counts, update_cluster_centers])
    return control_flow_ops.group(*update_ops)
 def testEmptyInput(self):
     x = array_ops.ones(shape=[0, 3], dtype=dtypes.float32)
     y = np.zeros(shape=[0, 3], dtype=np.float32)
     self.assertEqual(0, self.evaluate(array_ops.size(x)))
     self.assertAllEqual(y, self.evaluate(nn_ops.softmax(x, axis=0)))
def safe_embedding_lookup_sparse(
    embedding_weights,
    sparse_ids,
    sparse_weights=None,
    combiner="mean",
    default_id=None,
    name="safe_embedding_lookup_sparse",
    partition_strategy=None,  # no used
    max_norm=None,
    return_trainable=False,
):
    """Provides a dynamic version of `tf.nn.safe_embedding_lookup_sparse`.

    Lookup embedding results, accounting for empty features and invalid weights.

    Any IDs will be treated as valid include non-positive IDs.
    Invalid weights (<= 0) are pruned from input weights, as well as any IDs
    with non-positive weight. For an entry with no features, the embedding vector
    for `default_id` is returned, or the 0-vector if `default_id` is not supplied.

    The ids and weights may be multi-dimensional. Embeddings are always aggregated
    along the last dimension.

    Args:
      embedding_weights: A single `dynamic_embedding.Variable` instance
        representing the complete embedding tensor.
      sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the
        ids. `d_0` is typically batch size.
      sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing
        float weights corresponding to `sparse_ids`, or `None` if all weights are
        be assumed to be 1.0.
      combiner: A string specifying how to combine embedding results for each
        entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean" the
        default.
      default_id: The id to use for an entry with no features.
      name: A name for this operation (optional).
      partition_strategy: A string specifying the partitioning strategy. Currently
        `"div"` and `"mod"` are supported. Default is `"div"`.
      max_norm: If not `None`, all embeddings are l2-normalized to max_norm before
        combining.

    Returns:
      combined_embeddings:
        A dense `Tensor` of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`.
      trainable_wrap:
        A TrainableWrapper object used to fill the Optimizers `var_list`
          Only provided if `return_trainable` is True.

    Raises:
      ValueError: if `embedding_weights` is empty.
    """
    if embedding_weights is None:
        raise ValueError("Missing embedding_weights %s." % embedding_weights)

    if embedding_weights.key_dtype != sparse_ids.dtype:
        raise TypeError(
            "embedding_weights.key_dtype should be same with sparse_ids.dtype: "
            "{} vs. {}".format(embedding_weights.key_dtype, sparse_ids.dtype))

    weights_dtype = sparse_weights.dtype if sparse_weights is not None else None
    if weights_dtype and embedding_weights.value_dtype != weights_dtype:
        raise TypeError(
            "embedding_weights.value_dtype should be same with sparse_weights.dtype"
            ": {} vs. {}".format(embedding_weights.value_dtype, weights_dtype))

    scope = variable_scope.get_variable_scope()
    full_name = scope.name + "/" + name if scope.name else name
    with ops.name_scope(full_name + "/"):
        # Reshape higher-rank sparse ids and weights to linear segment ids.
        original_shape = sparse_ids.dense_shape
        original_rank_dim = tensor_shape.dimension_value(
            sparse_ids.dense_shape.get_shape()[0])
        original_rank = (array_ops.size(original_shape)
                         if original_rank_dim is None else original_rank_dim)
        sparse_ids = sparse_ops.sparse_reshape(
            sparse_ids,
            [
                math_ops.reduce_prod(
                    array_ops.slice(original_shape, [0], [original_rank - 1])),
                array_ops.gather(original_shape, original_rank - 1),
            ],
        )
        if sparse_weights is not None:
            sparse_weights = sparse_tensor.SparseTensor(
                sparse_ids.indices, sparse_weights.values,
                sparse_ids.dense_shape)

        # Prune invalid weights.
        if combiner != "sum":
            sparse_ids, sparse_weights = _prune_invalid_weights(
                sparse_ids, sparse_weights)

        # Fill in dummy values for empty features, if necessary.
        sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(
            sparse_ids, default_id or 0)
        if sparse_weights is not None:
            sparse_weights, _ = sparse_ops.sparse_fill_empty_rows(
                sparse_weights, 1.0)

        result, trainable_ = embedding_lookup_sparse(
            embedding_weights,
            sparse_ids,
            sparse_weights,
            combiner=combiner,
            partition_strategy=partition_strategy,
            name=name + "/embedding_lookup_sparse",
            max_norm=max_norm,
            return_trainable=True,
        )

        if default_id is None:
            # Broadcast is_row_empty to the same shape as embedding_lookup_result,
            # for use in Select.
            is_row_empty = array_ops.tile(
                array_ops.reshape(is_row_empty, [-1, 1]),
                array_ops.stack([1, array_ops.shape(result)[1]]),
            )

            result = array_ops.where(is_row_empty,
                                     array_ops.zeros_like(result),
                                     result,
                                     name="where")

        # Reshape back from linear ids back into higher-dimensional dense result.
        final_result = array_ops.reshape(
            result,
            array_ops.concat(
                [
                    array_ops.slice(
                        math_ops.cast(original_shape, dtypes.int32),
                        [0],
                        [original_rank - 1],
                    ),
                    array_ops.slice(array_ops.shape(result), [1], [-1]),
                ],
                0,
            ),
        )
        final_result.set_shape(
            tensor_shape.unknown_shape(
                (tensor_shape.Dimension(original_rank_dim) -
                 1).value).concatenate(result.get_shape()[1:]))
        return (final_result, trainable_) if return_trainable else final_result
Beispiel #53
0
    def batch_jacobian(self,
                       target,
                       source,
                       unconnected_gradients=UnconnectedGradients.NONE,
                       parallel_iterations=None,
                       experimental_use_pfor=True):
        """Computes and stacks per-example jacobians.

    See [wikipedia article](http://en.wikipedia.org/wiki/jacobian_matrix_and_determinant) for the
    definition of a Jacobian. This function is essentially an efficient
    implementation of the following:

    `tf.stack([self.jacobian(y[i], x[i]) for i in range(x.shape[0])])`.

    Note that compared to `GradientTape.jacobian` which computes gradient of
    each output value w.r.t each input value, this function is useful when
    `target[i,...]` is independent of `source[j,...]` for `j != i`. This
    assumption allows more efficient computation as compared to
    `GradientTape.jacobian`. The output, as well as intermediate activations,
    are lower dimensional and avoid a bunch of redundant zeros which would
    result in the jacobian computation given the independence assumption.

    Example usage:

    ```python
    with tf.GradientTape() as g:
      x = tf.constant([[1., 2.], [3., 4.]], dtype=tf.float32)
      g.watch(x)
      y = x * x
    batch_jacobian = g.batch_jacobian(y, x)
    # batch_jacobian is [[[2,  0], [0,  4]], [[6,  0], [0,  8]]]
    ```

    Args:
      target: A tensor with rank 2 or higher and with shape [b, y1, ..., y_n].
        `target[i,...]` should only depend on `source[i,...]`.
      source: A tensor with rank 2 or higher and with shape [b, x1, ..., x_m].
      unconnected_gradients: a value which can either hold 'none' or 'zero' and
        alters the value which will be returned if the target and sources are
        unconnected. The possible values and effects are detailed in
        'UnconnectedGradients' and it defaults to 'none'.
      parallel_iterations: A knob to control how many iterations are dispatched
        in parallel. This knob can be used to control the total memory usage.
      experimental_use_pfor: If true, uses pfor for computing the Jacobian. Else
        uses a tf.while_loop.

    Returns:
      A tensor `t` with shape [b, y_1, ..., y_n, x1, ..., x_m] where `t[i, ...]`
      is the jacobian of `target[i, ...]` w.r.t. `source[i, ...]`, i.e. stacked
      per-example jacobians.

    Raises:
      RuntimeError: If called on a non-persistent tape with eager execution
        enabled and without enabling experimental_use_pfor.
      ValueError: If vectorization of jacobian computation fails or if first
        dimension of `target` and `source` do not match.
    """
        target_shape = target.shape
        if target_shape.rank is None:
            dim = tensor_shape.Dimension(None)
        else:
            dim = target_shape.dims[0]
        if not (target_shape.with_rank_at_least(2)
                and source.shape.with_rank_at_least(2)
                and dim.is_compatible_with(source.shape[0])):
            raise ValueError("Need first dimension of target shape (%s) and "
                             "source shape (%s) to match." %
                             (target.shape, source.shape))
        if target_shape.is_fully_defined():
            batch_size = int(target_shape[0])
            target_row_size = target_shape.num_elements() // batch_size
        else:
            target_shape = array_ops.shape(target)
            batch_size = target_shape[0]
            target_row_size = array_ops.size(target) // batch_size
        source_shape = array_ops.shape(source)
        # Flatten target to 2-D.
        # Note that we push and pop the tape here and below. This is needed since we
        # need gradients through the enclosed operations.
        self._push_tape()
        with ops.control_dependencies(
            [check_ops.assert_equal(batch_size, source_shape[0])]):
            target = array_ops.reshape(target, [batch_size, target_row_size])
        self._pop_tape()

        def loop_fn(i):
            self._push_tape()
            y = array_ops.gather(target, i, axis=1)
            self._pop_tape()
            return self.gradient(y,
                                 source,
                                 unconnected_gradients=unconnected_gradients)

        if experimental_use_pfor:
            try:
                output = pfor_ops.pfor(loop_fn,
                                       target_row_size,
                                       parallel_iterations=parallel_iterations)
            except ValueError as err:
                six.reraise(
                    ValueError,
                    ValueError(
                        str(err) +
                        "\nEncountered an exception while vectorizing the "
                        "batch_jacobian computation. Vectorization can be disabled by "
                        "setting experimental_use_pfor to False."),
                    sys.exc_info()[2])
        else:
            if context.executing_eagerly() and not self._persistent:
                raise RuntimeError(
                    "GradientTape must be created with persistent=True"
                    " to compute the batch_jacobian with eager execution enabled and "
                    " with experimental_use_pfor set to False.")
            output = pfor_ops.for_loop(loop_fn,
                                       target.dtype,
                                       target_row_size,
                                       parallel_iterations=parallel_iterations)
        new_shape = array_ops.concat([target_shape, source_shape[1:]], axis=0)
        if output is None:
            return array_ops.zeros(new_shape)
        else:
            output = array_ops.reshape(output,
                                       [target_row_size, batch_size, -1])
            output = array_ops.transpose(output, [1, 0, 2])
            return array_ops.reshape(output, new_shape)
Beispiel #54
0
def embedding_lookup(params, ids, name=None):
    """Return a tensor of embedding values by looking up "ids" in "params".

  Args:
    params: List of tensors of the same shape.  A single tensor is
            treated as a singleton list.
    ids: Tensor of integers containing the ids to be looked up in
         'params'.  Let P be len(params).  If P > 1, then the ids are
         partitioned by id % P, and we do separate lookups in params[p]
         for 0 <= p < P, and then stitch the results back together into
         a single result tensor.
    name: Optional name for the op.

  Returns:
    A tensor of shape ids.shape + params[0].shape[1:] containing the
    values params[i % P][i] for each i in ids.

  Raises:
    ValueError: if some parameters are invalid.
  """
    if not isinstance(params, list):
        params = [params]
    with ops.op_scope(params + [ids], name, "embedding_lookup") as name:
        if not params:
            raise ValueError("Need at least one param")
        np = len(params)  # Number of partitions
        params = ops.convert_n_to_tensor_or_indexed_slices(params,
                                                           name="params")
        if np == 1:
            with ops.device(params[0].device):
                return array_ops.gather(params[0], ids, name=name)
        else:
            ids = ops.convert_to_tensor(ids, name="ids")
            flat_ids = array_ops.reshape(ids, [-1])
            original_indices = math_ops.range(0, array_ops.size(flat_ids))
            # Compute flat_ids % partitions for each id
            ids_mod_p = flat_ids % np
            if ids_mod_p.dtype != types.int32:
                ids_mod_p = math_ops.cast(ids_mod_p, types.int32)
            # Partition single list of ids based on ids % np into np separate lists
            plist = data_flow_ops.dynamic_partition(flat_ids, ids_mod_p, np)
            # Similarly, partition the original indices.
            pindices = data_flow_ops.dynamic_partition(original_indices,
                                                       ids_mod_p, np)
            # Do np separate lookups, finding embeddings for plist[p] in params[p]
            partitioned_result = []
            for p in range(np):
                # TODO(agarwal): handle device allocations here and later in the
                # colocate code.
                gather_ids = plist[p] / np
                with ops.device(params[p].device):
                    partitioned_result.append(
                        array_ops.gather(params[p], gather_ids))
            # Stitch these back together
            ret = data_flow_ops.dynamic_stitch(pindices,
                                               partitioned_result,
                                               name=name)
            # Reshape to reverse the flattening of ids.
            # It's important that we compute params[0].shape on the right device
            # to avoid data motion.
            with ops.device(params[0].device):
                params_shape = array_ops.shape(params[0])
            ret = array_ops.reshape(
                ret,
                array_ops.concat(0, [
                    array_ops.shape(ids),
                    array_ops.slice(params_shape, [1], [-1])
                ]))
            # output shape = ids.shape + params[*].shape[1:]
            # Normally the reshape is sufficient, but setting shape explicitly
            # teaches shape inference that params[1:].get_shape() matters.
            element_shape = params[0].get_shape()[1:]
            for p in params[1:]:
                element_shape = element_shape.merge_with(p.get_shape()[1:])
            ret.set_shape(ids.get_shape().concatenate(element_shape))
            return ret
Beispiel #55
0
  def _fused_batch_norm(self, inputs, training):
    """Returns the output of fused batch norm."""
    beta = self.beta if self.center else self._beta_const
    gamma = self.gamma if self.scale else self._gamma_const

    # TODO(b/129279393): Support zero batch input in non DistributionStrategy
    # code as well.
    if self._support_zero_size_input():
      inputs_size = array_ops.size(inputs)
    else:
      inputs_size = None

    def _fused_batch_norm_training():
      return nn.fused_batch_norm(
          inputs,
          gamma,
          beta,
          epsilon=self.epsilon,
          data_format=self._data_format)

    def _fused_batch_norm_inference():
      return nn.fused_batch_norm(
          inputs,
          gamma,
          beta,
          mean=self.moving_mean,
          variance=self.moving_variance,
          epsilon=self.epsilon,
          is_training=False,
          data_format=self._data_format)

    output, mean, variance = tf_utils.smart_cond(
        training, _fused_batch_norm_training, _fused_batch_norm_inference)
    if not self._bessels_correction_test_only:
      # Remove Bessel's correction to be consistent with non-fused batch norm.
      # Note that the variance computed by fused batch norm is
      # with Bessel's correction.
      sample_size = math_ops.cast(
          array_ops.size(inputs) / array_ops.size(variance), variance.dtype)
      factor = (sample_size - math_ops.cast(1.0, variance.dtype)) / sample_size
      variance *= factor

    training_value = tf_utils.constant_value(training)
    if training_value is None:
      momentum = tf_utils.smart_cond(training,
                                     lambda: self.momentum,
                                     lambda: 1.0)
    else:
      momentum = ops.convert_to_tensor(self.momentum)
    if training_value or training_value is None:
      def mean_update():
        return self._assign_moving_average(self.moving_mean, mean, momentum,
                                           inputs_size)

      def variance_update():
        """Update self.moving_variance with the most recent data point."""
        if self.renorm:
          # We apply epsilon as part of the moving_stddev to mirror the training
          # code path.
          moving_stddev = self._assign_moving_average(
              self.moving_stddev, math_ops.sqrt(variance + self.epsilon),
              momentum, inputs_size)
          return self._assign_new_value(
              self.moving_variance,
              # Apply relu in case floating point rounding causes it to go
              # negative.
              K.relu(moving_stddev * moving_stddev - self.epsilon))
        else:
          return self._assign_moving_average(self.moving_variance, variance,
                                             momentum, inputs_size)

      self.add_update(mean_update)
      self.add_update(variance_update)

    return output
Beispiel #56
0
def _embedding_lookup_and_transform(params,
                                    ids,
                                    partition_strategy="mod",
                                    name=None,
                                    max_norm=None,
                                    transform_fn=None):
  """Helper function for embedding_lookup and _compute_sampled_logits.

  This function is a generalization of embedding_lookup that optionally
  applies a caller-specified transformation to each embedding. This is
  done through the `transform_fn` argument. If provided, the function is
  applied to each partitioned tensor of retrieved embeddings, colocated
  with the embeddings. This function will be called with a single `Tensor`
  argument of the same type as the `params` tensor and should return a
  `Tensor`. The shape of the argument will be the same as `params` except
  for the size of the first dimension. The first dimension of the result's
  shape must be the same size as the argument's.

  Args:
    params: See embedding_lookup.
    ids: See embedding_lookup.
    partition_strategy: See embedding_lookup.
    name: See embedding_lookup.
    max_norm: See embedding_lookup.
    transform_fn: An optional function to apply to each retrieved embedding. If
      max_norm is provided, transform_fn is applied to the norm-limited
      embeddings.

  Returns:
    See embedding_lookup for details.
  Raises:
    ValueError: If `params` is empty.
  """
  if params is None:
    raise ValueError("params must be specified")
  if isinstance(params, (list, tuple)) and not params:
    raise ValueError("Need at least one param")
  if isinstance(params, variables.PartitionedVariable):
    params = list(params)  # Iterate to get the underlying Variables.
  if not isinstance(params, list):
    params = [params]

  with ops.name_scope(name, "embedding_lookup", params + [ids]) as name:
    np = len(params)  # Number of partitions
    # Preserve the resource variable status to avoid accidental dense reads.
    if not any(
        isinstance(p, resource_variable_ops.ResourceVariable) for p in params):
      params = ops.convert_n_to_tensor_or_indexed_slices(params, name="params")
    ids = ops.convert_to_tensor(ids, name="ids")
    if np == 1 and (not transform_fn or ids.get_shape().ndims == 1):
      with ops.colocate_with(params[0]):
        result = _clip(
            array_ops.gather(params[0], ids, name=name), ids, max_norm)
        if transform_fn:
          result = transform_fn(result)
      # Make sure the final result does not have colocation contraints on the
      # params. Similar to the case np > 1 where parallel_dynamic_stitch is
      # outside the scioe of all with ops.colocate_with(params[p]).
      return array_ops.identity(result)
    else:
      # Flatten the ids. There are two cases where we need to do this.
      # - There is more than one params tensor.
      # - There is a transform_fn and ids is not statically known to be 1-D.
      #   We must flatten in this case because transform_fn expects a flat
      #   tensor of embeddings.
      flat_ids = array_ops.reshape(ids, [-1])
      original_indices = math_ops.range(array_ops.size(flat_ids))

      # Create p_assignments and set new_ids depending on the strategy.
      if partition_strategy == "mod":
        p_assignments = flat_ids % np
        new_ids = flat_ids // np
      elif partition_strategy == "div":
        # Compute num_total_ids as the sum of dim-0 of params, then assign to
        # partitions based on a constant number of ids per partition. Optimize
        # if we already know the full shape statically.
        dim_0_size = tensor_shape.Dimension(
            tensor_shape.dimension_value(params[0].get_shape()[0]))
        for p in xrange(1, np):
          dim_0_size += tensor_shape.Dimension(
              tensor_shape.dimension_value(params[p].get_shape()[0]))
        if dim_0_size.value:
          num_total_ids = constant_op.constant(dim_0_size.value, flat_ids.dtype)
        else:
          dim_0_sizes = []
          for p in xrange(np):
            param_p_dim = tensor_shape.dimension_value(params[p].get_shape()[0])
            if param_p_dim is not None:
              dim_0_sizes.append(param_p_dim)
            else:
              with ops.colocate_with(params[p]):
                dim_0_sizes.append(array_ops.shape(params[p])[0])
          num_total_ids = math_ops.reduce_sum(
              math_ops.cast(array_ops.stack(dim_0_sizes), flat_ids.dtype))
        ids_per_partition = num_total_ids // np
        extras = num_total_ids % np

        p_assignments = math_ops.maximum(flat_ids // (ids_per_partition + 1),
                                         (flat_ids - extras) //
                                         ids_per_partition)

        # Emulate a conditional using a boolean indicator tensor
        new_ids = array_ops.where(p_assignments < extras,
                                  flat_ids % (ids_per_partition + 1),
                                  (flat_ids - extras) % ids_per_partition)
      else:
        raise ValueError("Unrecognized partition strategy: " +
                         partition_strategy)

      # Cast partition assignments to int32 for use in dynamic_partition.
      # There really should not be more than 2^32 partitions.
      p_assignments = math_ops.cast(p_assignments, dtypes.int32)
      # Partition list of ids based on assignments into np separate lists
      gather_ids = data_flow_ops.dynamic_partition(new_ids, p_assignments, np)
      # Similarly, partition the original indices.
      pindices = data_flow_ops.dynamic_partition(original_indices,
                                                 p_assignments, np)
      # Do np separate lookups, finding embeddings for plist[p] in params[p]
      partitioned_result = []
      for p in xrange(np):
        pids = gather_ids[p]
        with ops.colocate_with(params[p]):
          result = array_ops.gather(params[p], pids)
          if transform_fn:
            # If transform_fn is provided, the clip_by_norm precedes
            # the transform and hence must be co-located. See below
            # for the counterpart if transform_fn is not proveded.
            result = transform_fn(_clip(result, pids, max_norm))
        partitioned_result.append(result)
      # Stitch these back together
      ret = data_flow_ops.parallel_dynamic_stitch(
          pindices, partitioned_result, name=name)

      # Determine the static element shape.
      if transform_fn is None:
        element_shape_s = params[0].get_shape()[1:]
        for p in params[1:]:
          element_shape_s = element_shape_s.merge_with(p.get_shape()[1:])
      else:
        element_shape_s = ret.get_shape()[1:]

      # Compute the dynamic element shape.
      if element_shape_s.is_fully_defined():
        element_shape_d = element_shape_s
      elif transform_fn is None:
        # It's important that we compute params[0].shape on the right device
        # to avoid data motion.
        with ops.colocate_with(params[0]):
          params_shape = array_ops.shape(params[0])
        element_shape_d = params_shape[1:]
      else:
        element_shape_d = array_ops.shape(ret)[1:]

      # Reshape to reverse the flattening of ids.
      ret = array_ops.reshape(
          ret, array_ops.concat([array_ops.shape(ids), element_shape_d], 0))

      # Normally the reshape is sufficient, but setting shape explicitly
      # teaches shape inference that params[1:].get_shape() matters
      # (in the case that transform_fn is None).
      ret.set_shape(ids.get_shape().concatenate(element_shape_s))
      if not transform_fn:
        # If transform_fn was provided, the clip_by_norm was done above.
        ret = _clip(ret, ids, max_norm)
      return ret
Beispiel #57
0
    def __init__(self,
                 cat,
                 components,
                 validate_args=False,
                 allow_nan_stats=True,
                 name="Mixture"):
        """Initialize a Mixture distribution.

    A `Mixture` is defined by a `Categorical` (`cat`, representing the
    mixture probabilities) and a list of `Distribution` objects
    all having matching dtype, batch shape, event shape, and continuity
    properties (the components).

    The `num_classes` of `cat` must be possible to infer at graph construction
    time and match `len(components)`.

    Args:
      cat: A `Categorical` distribution instance, representing the probabilities
          of `distributions`.
      components: A list or tuple of `Distribution` instances.
        Each instance must have the same type, be defined on the same domain,
        and have matching `event_shape` and `batch_shape`.
      validate_args: `Boolean`, default `False`.  If `True`, raise a runtime
        error if batch or event ranks are inconsistent between cat and any of
        the distributions.  This is only checked if the ranks cannot be
        determined statically at graph construction time.
      allow_nan_stats: Boolean, default `True`.  If `False`, raise an
       exception if a statistic (e.g. mean/mode/etc...) is undefined for any
        batch member.  If `True`, batch members with valid parameters leading to
        undefined statistics will return NaN for this statistic.
      name: A name for this distribution (optional).

    Raises:
      TypeError: If cat is not a `Categorical`, or `components` is not
        a list or tuple, or the elements of `components` are not
        instances of `Distribution`, or do not have matching `dtype`.
      ValueError: If `components` is an empty list or tuple, or its
        elements do not have a statically known event rank.
        If `cat.num_classes` cannot be inferred at graph creation time,
        or the constant value of `cat.num_classes` is not equal to
        `len(components)`, or all `components` and `cat` do not have
        matching static batch shapes, or all components do not
        have matching static event shapes.
    """
        parameters = locals()
        parameters.pop("self")
        if not isinstance(cat, categorical.Categorical):
            raise TypeError(
                "cat must be a Categorical distribution, but saw: %s" % cat)
        if not components:
            raise ValueError("components must be a non-empty list or tuple")
        if not isinstance(components, (list, tuple)):
            raise TypeError("components must be a list or tuple, but saw: %s" %
                            components)
        if not all(
                isinstance(c, distribution.Distribution) for c in components):
            raise TypeError(
                "all entries in components must be Distribution instances"
                " but saw: %s" % components)

        dtype = components[0].dtype
        if not all(d.dtype == dtype for d in components):
            raise TypeError("All components must have the same dtype, but saw "
                            "dtypes: %s" % [(d.name, d.dtype)
                                            for d in components])
        is_continuous = components[0].is_continuous
        if not all(d.is_continuous == is_continuous for d in components):
            raise TypeError(
                "All components must either be continuous or not, but continuity "
                "values are: %s" % [(d.name, d.is_continuous)
                                    for d in components])
        static_event_shape = components[0].get_event_shape()
        static_batch_shape = cat.get_batch_shape()
        for d in components:
            static_event_shape = static_event_shape.merge_with(
                d.get_event_shape())
            static_batch_shape = static_batch_shape.merge_with(
                d.get_batch_shape())
        if static_event_shape.ndims is None:
            raise ValueError(
                "Expected to know rank(event_shape) from components, but "
                "none of the components provide a static number of ndims")

        # Ensure that all batch and event ndims are consistent.
        with ops.name_scope(name, values=[cat.logits]) as ns:
            num_components = cat.num_classes
            static_num_components = tensor_util.constant_value(num_components)
            if static_num_components is None:
                raise ValueError(
                    "Could not infer number of classes from cat and unable "
                    "to compare this value to the number of components passed in."
                )
            # Possibly convert from numpy 0-D array.
            static_num_components = int(static_num_components)
            if static_num_components != len(components):
                raise ValueError(
                    "cat.num_classes != len(components): %d vs. %d" %
                    (static_num_components, len(components)))

            cat_batch_shape = cat.batch_shape()
            cat_batch_rank = array_ops.size(cat_batch_shape)
            if validate_args:
                batch_shapes = [d.batch_shape() for d in components]
                batch_ranks = [array_ops.size(bs) for bs in batch_shapes]
                check_message = ("components[%d] batch shape must match cat "
                                 "batch shape")
                self._assertions = [
                    check_ops.assert_equal(cat_batch_rank,
                                           batch_ranks[di],
                                           message=check_message % di)
                    for di in range(len(components))
                ]
                self._assertions += [
                    check_ops.assert_equal(cat_batch_shape,
                                           batch_shapes[di],
                                           message=check_message % di)
                    for di in range(len(components))
                ]
            else:
                self._assertions = []

            self._cat = cat
            self._components = list(components)
            self._num_components = static_num_components
            self._static_event_shape = static_event_shape
            self._static_batch_shape = static_batch_shape

        # We let the Mixture distribution access _graph_parents since its arguably
        # more like a baseclass.
        graph_parents = self._cat._graph_parents  # pylint: disable=protected-access
        for c in self._components:
            graph_parents += c._graph_parents  # pylint: disable=protected-access

        super(Mixture, self).__init__(dtype=dtype,
                                      is_reparameterized=False,
                                      is_continuous=is_continuous,
                                      validate_args=validate_args,
                                      allow_nan_stats=allow_nan_stats,
                                      parameters=parameters,
                                      graph_parents=graph_parents,
                                      name=ns)
Beispiel #58
0
def safe_embedding_lookup_sparse(embedding_weights,
                                 sparse_ids,
                                 sparse_weights=None,
                                 combiner="mean",
                                 default_id=None,
                                 name=None,
                                 partition_strategy="div",
                                 max_norm=None):
  """Lookup embedding results, accounting for invalid IDs and empty features.

  The partitioned embedding in `embedding_weights` must all be the same shape
  except for the first dimension. The first dimension is allowed to vary as the
  vocabulary size is not necessarily a multiple of `P`.  `embedding_weights`
  may be a `PartitionedVariable` as returned by using
  `tf.compat.v1.get_variable()` with a
  partitioner.

  Invalid IDs (< 0) are pruned from input IDs and weights, as well as any IDs
  with non-positive weight. For an entry with no features, the embedding vector
  for `default_id` is returned, or the 0-vector if `default_id` is not supplied.

  The ids and weights may be multi-dimensional. Embeddings are always aggregated
  along the last dimension.

  Args:
    embedding_weights:  A list of `P` float `Tensor`s or values representing
      partitioned embedding `Tensor`s.  Alternatively, a `PartitionedVariable`
      created by partitioning along dimension 0.  The total unpartitioned shape
      should be `[e_0, e_1, ..., e_m]`, where `e_0` represents the vocab size
      and `e_1, ..., e_m` are the embedding dimensions.
    sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the
      ids. `d_0` is typically batch size.
    sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing
      float weights corresponding to `sparse_ids`, or `None` if all weights are
      be assumed to be 1.0.
    combiner: A string specifying how to combine embedding results for each
      entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean" the
      default.
    default_id: The id to use for an entry with no features.
    name: A name for this operation (optional).
    partition_strategy: A string specifying the partitioning strategy. Currently
      `"div"` and `"mod"` are supported. Default is `"div"`.
    max_norm: If not `None`, all embeddings are l2-normalized to max_norm before
      combining.

  Returns:
    Dense `Tensor` of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`.

  Raises:
    ValueError: if `embedding_weights` is empty.
  """
  if embedding_weights is None:
    raise ValueError("Missing embedding_weights %s." % embedding_weights)
  if isinstance(embedding_weights, variables.PartitionedVariable):
    embedding_weights = list(embedding_weights)  # get underlying Variables.
  if not isinstance(embedding_weights, list):
    embedding_weights = [embedding_weights]
  if len(embedding_weights) < 1:
    raise ValueError("Missing embedding_weights %s." % embedding_weights)

  dtype = sparse_weights.dtype if sparse_weights is not None else None
  embedding_weights = [
      w if (isinstance(w, resource_variable_ops.ResourceVariable)
            and dtype in (None, w.dtype))
      else ops.convert_to_tensor(w, dtype=dtype)
      for w in embedding_weights
  ]

  with ops.name_scope(name, "embedding_lookup", embedding_weights +
                      [sparse_ids, sparse_weights]) as scope:
    # Reshape higher-rank sparse ids and weights to linear segment ids.
    original_shape = sparse_ids.dense_shape
    original_rank_dim = tensor_shape.dimension_value(
        sparse_ids.dense_shape.get_shape()[0])
    original_rank = (
        array_ops.size(original_shape)
        if original_rank_dim is None else original_rank_dim)
    sparse_ids = sparse_ops.sparse_reshape(sparse_ids, [
        math_ops.reduce_prod(
            array_ops.slice(original_shape, [0], [original_rank - 1])),
        array_ops.gather(original_shape, original_rank - 1)
    ])
    if sparse_weights is not None:
      sparse_weights = sparse_tensor.SparseTensor(sparse_ids.indices,
                                                  sparse_weights.values,
                                                  sparse_ids.dense_shape)

    # Prune invalid ids and weights.
    sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights)
    if combiner != "sum":
      sparse_ids, sparse_weights = _prune_invalid_weights(
          sparse_ids, sparse_weights)

    # Fill in dummy values for empty features, if necessary.
    sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(
        sparse_ids, default_id or 0)
    if sparse_weights is not None:
      sparse_weights, _ = sparse_ops.sparse_fill_empty_rows(sparse_weights, 1.0)

    result = embedding_lookup_sparse(
        embedding_weights,
        sparse_ids,
        sparse_weights,
        combiner=combiner,
        partition_strategy=partition_strategy,
        name=None if default_id is None else scope,
        max_norm=max_norm)

    if default_id is None:
      # Broadcast is_row_empty to the same shape as embedding_lookup_result,
      # for use in Select.
      is_row_empty = array_ops.tile(
          array_ops.reshape(is_row_empty, [-1, 1]),
          array_ops.stack([1, array_ops.shape(result)[1]]))

      result = array_ops.where(
          is_row_empty, array_ops.zeros_like(result), result, name=scope)

    # Reshape back from linear ids back into higher-dimensional dense result.
    final_result = array_ops.reshape(
        result,
        array_ops.concat([
            array_ops.slice(
                math_ops.cast(original_shape, dtypes.int32), [0],
                [original_rank - 1]),
            array_ops.slice(array_ops.shape(result), [1], [-1])
        ], 0))
    final_result.set_shape(
        tensor_shape.unknown_shape(
            (tensor_shape.Dimension(original_rank_dim) - 1).value).concatenate(
                result.get_shape()[1:]))
    return final_result
Beispiel #59
0
  def __init__(self,
               cell,
               embedding,
               start_tokens,
               end_token,
               initial_state,
               beam_width,
               output_layer=None,
               length_penalty_weight=0.0,
               coverage_penalty_weight=0.0,
               reorder_tensor_arrays=True):
    """Initialize the BeamSearchDecoder.

    Args:
      cell: An `RNNCell` instance.
      embedding: A callable that takes a vector tensor of `ids` (argmax ids),
        or the `params` argument for `embedding_lookup`.
      start_tokens: `int32` vector shaped `[batch_size]`, the start tokens.
      end_token: `int32` scalar, the token that marks end of decoding.
      initial_state: A (possibly nested tuple of...) tensors and TensorArrays.
      beam_width:  Python integer, the number of beams.
      output_layer: (Optional) An instance of `tf.layers.Layer`, i.e.,
        `tf.layers.Dense`.  Optional layer to apply to the RNN output prior
        to storing the result or sampling.
      length_penalty_weight: Float weight to penalize length. Disabled with 0.0.
      coverage_penalty_weight: Float weight to penalize the coverage of source
        sentence. Disabled with 0.0.
      reorder_tensor_arrays: If `True`, `TensorArray`s' elements within the cell
        state will be reordered according to the beam search path. If the
        `TensorArray` can be reordered, the stacked form will be returned.
        Otherwise, the `TensorArray` will be returned as is. Set this flag to
        `False` if the cell state contains `TensorArray`s that are not amenable
        to reordering.

    Raises:
      TypeError: if `cell` is not an instance of `RNNCell`,
        or `output_layer` is not an instance of `tf.layers.Layer`.
      ValueError: If `start_tokens` is not a vector or
        `end_token` is not a scalar.
    """
    rnn_cell_impl.assert_like_rnncell("cell", cell)  # pylint: disable=protected-access
    if (output_layer is not None and
        not isinstance(output_layer, layers_base.Layer)):
      raise TypeError(
          "output_layer must be a Layer, received: %s" % type(output_layer))
    self._cell = cell
    self._output_layer = output_layer
    self._reorder_tensor_arrays = reorder_tensor_arrays

    if callable(embedding):
      self._embedding_fn = embedding
    else:
      self._embedding_fn = (
          lambda ids: embedding_ops.embedding_lookup(embedding, ids))

    self._start_tokens = ops.convert_to_tensor(
        start_tokens, dtype=dtypes.int32, name="start_tokens")
    if self._start_tokens.get_shape().ndims != 1:
      raise ValueError("start_tokens must be a vector")
    self._end_token = ops.convert_to_tensor(
        end_token, dtype=dtypes.int32, name="end_token")
    if self._end_token.get_shape().ndims != 0:
      raise ValueError("end_token must be a scalar")

    self._batch_size = array_ops.size(start_tokens)
    self._beam_width = beam_width
    self._length_penalty_weight = length_penalty_weight
    self._coverage_penalty_weight = coverage_penalty_weight
    self._initial_cell_state = nest.map_structure(
        self._maybe_split_batch_beams, initial_state, self._cell.state_size)
    self._start_tokens = array_ops.tile(
        array_ops.expand_dims(self._start_tokens, 1), [1, self._beam_width])
    self._start_inputs = self._embedding_fn(self._start_tokens)

    self._finished = array_ops.one_hot(
        array_ops.zeros([self._batch_size], dtype=dtypes.int32),
        depth=self._beam_width,
        on_value=False,
        off_value=True,
        dtype=dtypes.bool)
Beispiel #60
0
def safe_embedding_lookup_sparse(embedding_weights,
                                 sparse_ids,
                                 sparse_weights=None,
                                 combiner="mean",
                                 default_id=None,
                                 name=None,
                                 partition_strategy="div"):
  """Lookup embedding results, accounting for invalid IDs and empty features.

  The partitioned embedding in `embedding_weights` must all be the same shape
  except for the first dimension. The first dimension is allowed to vary as the
  vocabulary size is not necessarily a multiple of `P`.

  Invalid IDs (< 0) are pruned from input IDs and weights, as well as any IDs
  with non-positive weight. For an entry with no features, the embedding vector
  for `default_id` is returned, or the 0-vector if `default_id` is not supplied.

  The ids and weights may be multi-dimensional. Embeddings are always aggregated
  along the last dimension.

  Args:
    embedding_weights:  A list of `P` float tensors or values representing
        partitioned embedding tensors.  The total unpartitioned shape should be
        `[e_0, e_1, ..., e_m]`, where `e_0` represents the vocab size and
        `e_1, ..., e_m` are the embedding dimensions.
    sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the
        ids. `d_0` is typically batch size.
    sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing
        float weights corresponding to `sparse_ids`, or `None` if all weights
        are be assumed to be 1.0.
    combiner: A string specifying how to combine embedding results for each
        entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean"
        the default.
    default_id: The id to use for an entry with no features.
    name: A name for this operation (optional).
    partition_strategy: A string specifying the partitioning strategy.
        Currently `"div"` and `"mod"` are supported. Default is `"div"`.


  Returns:
    Dense tensor of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`.

  Raises:
    ValueError: if `embedding_weights` is empty.
  """
  if embedding_weights is None or len(embedding_weights) < 1:
    raise ValueError("Missing embedding_weights %s." % embedding_weights)

  dtype = sparse_weights.dtype if sparse_weights is not None else None
  embedding_weights = [
      ops.convert_to_tensor(w, dtype=dtype) for w in embedding_weights
  ]

  contrib_tensor_util.assert_same_float_dtype(embedding_weights +
                                              [sparse_weights])

  with ops.op_scope(embedding_weights + [sparse_ids, sparse_weights], name,
                    "embedding_lookup") as scope:
    # Reshape higher-rank sparse ids and weights to linear segment ids.
    original_shape = sparse_ids.shape
    original_rank_dim = sparse_ids.shape.get_shape()[0]
    original_rank = (
        array_ops.size(original_shape)
        if original_rank_dim.value is None
        else original_rank_dim.value)
    sparse_ids = sparse_ops.sparse_reshape(sparse_ids, [
        math_ops.reduce_prod(
            array_ops.slice(original_shape, [0], [original_rank - 1])),
        array_ops.gather(original_shape, original_rank - 1)])
    if sparse_weights is not None:
      sparse_weights = ops.SparseTensor(sparse_ids.indices,
                                        sparse_weights.values, sparse_ids.shape)

    # Prune invalid ids and weights.
    sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights)

    # Fill in dummy values for empty features, if necessary.
    sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(sparse_ids,
                                                                 default_id or
                                                                 0)
    if sparse_weights is not None:
      sparse_weights, _ = sparse_ops.sparse_fill_empty_rows(sparse_weights, 1.0)

    result = tf_embedding_ops.embedding_lookup_sparse(
        embedding_weights,
        sparse_ids,
        sparse_weights,
        combiner=combiner,
        partition_strategy=partition_strategy,
        name=None if default_id is None else scope)

    if default_id is None:
      # Broadcast is_row_empty to the same shape as embedding_lookup_result,
      # for use in Select.
      is_row_empty = array_ops.tile(
          array_ops.reshape(is_row_empty, [-1, 1]),
          array_ops.pack([1, array_ops.shape(result)[1]]))

      result = math_ops.select(is_row_empty,
                               array_ops.zeros_like(result),
                               result,
                               name=scope)

    # Reshape back from linear ids back into higher-dimensional dense result.
    final_result = array_ops.reshape(result, array_ops.concat(0, [
        array_ops.slice(
            math_ops.cast(original_shape, dtypes.int32),
            [0], [original_rank - 1]),
        array_ops.slice(array_ops.shape(result), [1], [-1])]))
    final_result.set_shape(tensor_shape.unknown_shape(
        (original_rank_dim - 1).value).concatenate(result.get_shape()[1:]))
    return final_result