def _log_survival_function(self, y):
    low = self._low
    high = self._high

    # Recall the promise:
    # survival_function(y) := P[Y > y]
    #                       = 0, if y >= high,
    #                       = 1, if y < low,
    #                       = P[X > y], otherwise.

    # P[Y > j] = P[ceiling(Y) > j] since mass is only at integers, not in
    # between.
    j = math_ops.ceil(y)

    # P[X > j], used when low < X < high.
    result_so_far = self.distribution.log_survival_function(j)

    # Broadcast, because it's possible that this is a single distribution being
    # evaluated on a number of samples, or something like that.
    j += array_ops.zeros_like(result_so_far)

    # Re-define values at the cutoffs.
    if low is not None:
      result_so_far = array_ops.where(j < low,
                                      array_ops.zeros_like(result_so_far),
                                      result_so_far)
    if high is not None:
      neg_inf = -np.inf * array_ops.ones_like(result_so_far)
      result_so_far = array_ops.where(j >= high, neg_inf, result_so_far)

    return result_so_far
  def _cdf(self, y):
    low = self._low
    high = self._high

    # Recall the promise:
    # cdf(y) := P[Y <= y]
    #         = 1, if y >= high,
    #         = 0, if y < low,
    #         = P[X <= y], otherwise.

    # P[Y <= j] = P[floor(Y) <= j] since mass is only at integers, not in
    # between.
    j = math_ops.floor(y)

    # P[X <= j], used when low < X < high.
    result_so_far = self.distribution.cdf(j)

    # Broadcast, because it's possible that this is a single distribution being
    # evaluated on a number of samples, or something like that.
    j += array_ops.zeros_like(result_so_far)

    # Re-define values at the cutoffs.
    if low is not None:
      result_so_far = array_ops.where(j < low,
                                      array_ops.zeros_like(result_so_far),
                                      result_so_far)
    if high is not None:
      result_so_far = array_ops.where(j >= high,
                                      array_ops.ones_like(result_so_far),
                                      result_so_far)

    return result_so_far
  def _survival_function(self, y):
    lower_cutoff = self._lower_cutoff
    upper_cutoff = self._upper_cutoff

    # Recall the promise:
    # survival_function(y) := P[Y > y]
    #                       = 0, if y >= upper_cutoff,
    #                       = 1, if y < lower_cutoff,
    #                       = P[X > y], otherwise.

    # P[Y > j] = P[ceiling(Y) > j] since mass is only at integers, not in
    # between.
    j = math_ops.ceil(y)

    # P[X > j], used when lower_cutoff < X < upper_cutoff.
    result_so_far = self.distribution.survival_function(j)

    # Broadcast, because it's possible that this is a single distribution being
    # evaluated on a number of samples, or something like that.
    j += array_ops.zeros_like(result_so_far)

    # Re-define values at the cutoffs.
    if lower_cutoff is not None:
      result_so_far = math_ops.select(j < lower_cutoff,
                                      array_ops.ones_like(result_so_far),
                                      result_so_far)
    if upper_cutoff is not None:
      result_so_far = math_ops.select(j >= upper_cutoff,
                                      array_ops.zeros_like(result_so_far),
                                      result_so_far)

    return result_so_far
  def _log_cdf(self, y):
    lower_cutoff = self._lower_cutoff
    upper_cutoff = self._upper_cutoff

    # Recall the promise:
    # cdf(y) := P[Y <= y]
    #         = 1, if y >= upper_cutoff,
    #         = 0, if y < lower_cutoff,
    #         = P[X <= y], otherwise.

    # P[Y <= j] = P[floor(Y) <= j] since mass is only at integers, not in
    # between.
    j = math_ops.floor(y)

    result_so_far = self.distribution.log_cdf(j)

    # Broadcast, because it's possible that this is a single distribution being
    # evaluated on a number of samples, or something like that.
    j += array_ops.zeros_like(result_so_far)

    # Re-define values at the cutoffs.
    if lower_cutoff is not None:
      neg_inf = -np.inf * array_ops.ones_like(result_so_far)
      result_so_far = math_ops.select(j < lower_cutoff, neg_inf, result_so_far)
    if upper_cutoff is not None:
      result_so_far = math_ops.select(j >= upper_cutoff,
                                      array_ops.zeros_like(result_so_far),
                                      result_so_far)

    return result_so_far
Example #5
0
def fwd_gradients(ys, xs, grad_xs=None, stop_gradients=None):
  """Compute forward-mode gradients."""
  # See b/37888268.

  # This version of forward-mode autodiff is based on code by Tim Cooijmans
  # and handles list arguments and certain special cases such as when the
  # ys doesn't depend on one or more of the xs, and when ops.IndexedSlices are
  # generated by the first gradients_impl.gradients call.

  us = [array_ops.zeros_like(y) + float("nan") for y in ys]
  dydxs = gradients_impl.gradients(
      ys, xs, grad_ys=us, stop_gradients=stop_gradients)

  # Deal with strange types that gradients_impl.gradients returns but can't
  # deal with.
  dydxs = [
      ops.convert_to_tensor(dydx)
      if isinstance(dydx, ops.IndexedSlices) else dydx for dydx in dydxs
  ]
  dydxs = [
      array_ops.zeros_like(x) if dydx is None else dydx
      for x, dydx in zip(xs, dydxs)
  ]

  dysdx = gradients_impl.gradients(dydxs, us, grad_ys=grad_xs)

  return dysdx
 def Loop(cell, w, i):
   x = array_ops.unstack(i, self.NUM_UNROLL)
   m = array_ops.zeros_like(x[0])
   c = array_ops.zeros_like(x[0])
   for i in range(self.NUM_UNROLL):
     m, c = cell(x[i], m, c, w)
   return m
 def LSTMLoop10(weights, inp):
   x = array_ops.unstack(inp, self.NUM_UNROLL)
   m = array_ops.zeros_like(x[0])
   c = array_ops.zeros_like(x[0])
   assert self.NUM_UNROLL % 10 == 0
   for i in range(0, self.NUM_UNROLL, 10):
     m, c = Loop10(weights, m, c, *x[i:i + 10])
   return m
  def _get_chol_and_x_compatible_shape(self, x):
    """Return self.chol and x, (possibly) broadcast to compatible shape."""
    # x and chol are "compatible" if their shape matches except for the last two
    # dimensions of chol are [k, k], and the last two of x are [k, 1].
    # E.g. x.shape = [A, B, k, 1], and chol.shape = [A, B, k, k]
    # This is required for the batch_triangular_solve, which does not broadcast.

    # TODO(langmore) This broadcast replicates matrices unnecesarily!  In the
    # case where
    # x.shape = [M1,...,Mr, N1,...,Nb, k], and chol.shape = [N1,...,Nb, k, k]
    # (which is common if x was sampled), the front dimensions of x can be
    # "flipped" to the end, making
    # x_flipped.shape = [N1,...,Nb, k, M1*...*Mr],
    # and this can be handled by the linear solvers.  This is preferred, because
    # it does not replicate the matrix, or create any new data.

    # We assume x starts without the trailing singleton dimension, e.g.
    # x.shape = [B, k].
    chol = self._chol
    with ops.op_scope([x] + self.inputs, 'get_chol_and_x_compatible_shape'):
      # If we determine statically that shapes match, we're done.
      if x.get_shape() == chol.get_shape()[:-1]:
        x_expanded = array_ops.expand_dims(x, -1)
        return chol, x_expanded

      # Dynamic check if shapes match or not.
      vector_shape = self.vector_shape()  # Shape of chol minus last dim.
      are_same_rank = math_ops.equal(
          array_ops.rank(x), array_ops.rank(vector_shape))

      def shapes_match_if_same_rank():
        return math_ops.reduce_all(math_ops.equal(
            array_ops.shape(x), vector_shape))

      shapes_match = control_flow_ops.cond(are_same_rank,
                                           shapes_match_if_same_rank,
                                           lambda: ops.convert_to_tensor(False))

      # Make tensors (never instantiated) holding the broadcast shape.
      # matrix_broadcast_dummy is the shape we will broadcast chol to.
      matrix_bcast_dummy = chol + array_ops.expand_dims(x, -1)
      # vector_bcast_dummy is the shape we will bcast x to, before we expand it.
      chol_minus_last_dim = math_ops.reduce_sum(chol, reduction_indices=[-1])
      vector_bcast_dummy = x + chol_minus_last_dim

      chol_bcast = chol + array_ops.zeros_like(matrix_bcast_dummy)
      x_bcast = x + array_ops.zeros_like(vector_bcast_dummy)

      chol_result = control_flow_ops.cond(shapes_match, lambda: chol,
                                          lambda: chol_bcast)
      chol_result.set_shape(matrix_bcast_dummy.get_shape())
      x_result = control_flow_ops.cond(shapes_match, lambda: x, lambda: x_bcast)
      x_result.set_shape(vector_bcast_dummy.get_shape())

      x_expanded = array_ops.expand_dims(x_result, -1)

      return chol_result, x_expanded
  def _forward(self, x):
    if self._unroll_loop:
      event_size = tensor_shape.dimension_value(
          x.shape.with_rank_at_least(1)[-1])
      if event_size is None:
        raise ValueError(
            "The final dimension of `x` must be known at graph construction "
            "time if `unroll_loop=True`. `x.shape: %r`" % x.shape)
      y = array_ops.zeros_like(x, name="y0")

      for _ in range(event_size):
        shift, log_scale = self._shift_and_log_scale_fn(y)
        # next_y = scale * x + shift
        next_y = x
        if log_scale is not None:
          next_y *= math_ops.exp(log_scale)
        if shift is not None:
          next_y += shift
        y = next_y
      return y

    event_size = array_ops.shape(x)[-1]
    # If the event size is available at graph construction time, we can inform
    # the graph compiler of the maximum number of steps. If not,
    # static_event_size will be None, and the maximum_iterations argument will
    # have no effect.
    static_event_size = tensor_shape.dimension_value(
        x.shape.with_rank_at_least(1)[-1])
    y0 = array_ops.zeros_like(x, name="y0")
    # call the template once to ensure creation
    _ = self._shift_and_log_scale_fn(y0)

    def _loop_body(index, y0):
      """While-loop body for autoregression calculation."""
      # Set caching device to avoid re-getting the tf.Variable for every while
      # loop iteration.
      with variable_scope_lib.variable_scope(
          variable_scope_lib.get_variable_scope()) as vs:
        if vs.caching_device is None:
          vs.set_caching_device(lambda op: op.device)
        shift, log_scale = self._shift_and_log_scale_fn(y0)
      y = x
      if log_scale is not None:
        y *= math_ops.exp(log_scale)
      if shift is not None:
        y += shift
      return index + 1, y

    _, y = control_flow_ops.while_loop(
        cond=lambda index, _: index < event_size,
        body=_loop_body,
        loop_vars=(0, y0),
        maximum_iterations=static_event_size)
    return y
Example #10
0
  def _cdf(self, counts):
    counts = self._maybe_assert_valid_sample(counts)
    probs = self.probs
    if not (counts.shape.is_fully_defined()
            and self.probs.shape.is_fully_defined()
            and counts.shape.is_compatible_with(self.probs.shape)):
      # If both shapes are well defined and equal, we skip broadcasting.
      probs += array_ops.zeros_like(counts)
      counts += array_ops.zeros_like(self.probs)

    return _bdtr(k=counts, n=self.total_count, p=probs)
Example #11
0
def fwd_gradients(ys, xs, grad_xs=None, assert_unused=False):
  """Computes forward-mode derivatives.

  This is accomplished in pure-python using tensorflow's existing (reverse-mode)
  gradients. There is additional overhead on graph construction, but runtime
  performance should be equal to a manual implementation [citation needed].

  See https://j-towns.github.io/2017/06/12/A-new-trick.html and
  https://github.com/HIPS/autograd/pull/175 for the original discussion of this
  method, and https://github.com/renmengye/tensorflow-forward-ad for a "direct"
  implementation.

  Args:
    ys: A list of tensors.
    xs: A list of tensors.
    grad_xs: An optional list of tensors. If provided, must have the same length
      and shapes compatible with xs.
    assert_unused: Add assertions that intermediate values are not computed.
  Returns:
    A list of tensors of the same shapes as ys. The directional derivatives of
    ys with respect to xs in the direction grad_xs. Leaving grad_xs unspecified
    is equivalent to passing in 1s for each x in xs.
  """
  # This version of forward-mode autodiff is based on code by Tim Cooijmans
  # and handles list arguments and certain special cases such as when the
  # ys doesn't depend on one or more of the xs, and when tf.IndexedSlices are
  # generated by the first tf.gradients call.

  us = [array_ops.zeros_like(y) + float('nan') for y in ys]

  dydxs = gradients(ys, xs, grad_ys=us)

  # deal with strange types that tf.gradients returns but can't deal with
  dydxs = [ops.convert_to_tensor(dydx) if isinstance(dydx, ops.IndexedSlices)
           else dydx for dydx in dydxs]

  if assert_unused:
    with ops.control_dependencies(dydxs):
      assert_unused = control_flow_ops.Assert(False, [1], name='fwd_gradients')
    with ops.control_dependencies([assert_unused]):
      dydxs = array_ops.identity_n(dydxs)

  dydxs = [array_ops.zeros_like(x) if dydx is None else dydx
           for x, dydx in zip(xs, dydxs)]
  for x, dydx in zip(xs, dydxs):
    dydx.set_shape(x.shape)

  dysdx = gradients(dydxs, us, grad_ys=grad_xs)

  return dysdx
Example #12
0
def sparsemax_loss(logits, sparsemax, labels, name=None):
  """Computes sparsemax loss function [1].

  [1]: https://arxiv.org/abs/1602.02068

  Args:
    logits: A `Tensor`. Must be one of the following types: `half`, `float32`,
      `float64`.
    sparsemax: A `Tensor`. Must have the same type as `logits`.
    labels: A `Tensor`. Must have the same type as `logits`.
    name: A name for the operation (optional).

  Returns:
    A `Tensor`. Has the same type as `logits`.
  """

  with ops.name_scope(name, "sparsemax_loss",
                      [logits, sparsemax, labels]) as name:
    logits = ops.convert_to_tensor(logits, name="logits")
    sparsemax = ops.convert_to_tensor(sparsemax, name="sparsemax")
    labels = ops.convert_to_tensor(labels, name="labels")

    # In the paper, they call the logits z.
    # A constant can be substracted from logits to make the algorithm
    # more numerically stable in theory. However, there are really no major
    # source numerical instability in this algorithm.
    z = logits

    # sum over support
    # Use a conditional where instead of a multiplication to support z = -inf.
    # If z = -inf, and there is no support (sparsemax = 0), a multiplication
    # would cause 0 * -inf = nan, which is not correct in this case.
    sum_s = array_ops.where(
        math_ops.logical_or(sparsemax > 0, math_ops.is_nan(sparsemax)),
        sparsemax * (z - 0.5 * sparsemax), array_ops.zeros_like(sparsemax))

    # - z_k + ||q||^2
    q_part = labels * (0.5 * labels - z)
    # Fix the case where labels = 0 and z = -inf, where q_part would
    # otherwise be 0 * -inf = nan. But since the lables = 0, no cost for
    # z = -inf should be consideredself.
    # The code below also coveres the case where z = inf. Howeverm in this
    # caose the sparsemax will be nan, which means the sum_s will also be nan,
    # therefor this case doesn't need addtional special treatment.
    q_part_safe = array_ops.where(
        math_ops.logical_and(math_ops.equal(labels, 0), math_ops.is_inf(z)),
        array_ops.zeros_like(z), q_part)

    return math_ops.reduce_sum(sum_s + q_part_safe, axis=1)
Example #13
0
  def __init__(self, inputs, sequence_length, time_major=False, name=None):
    """Initializer.

    Args:
      inputs: A (structure of) input tensors.
      sequence_length: An int32 vector tensor.
      time_major: Python bool.  Whether the tensors in `inputs` are time major.
        If `False` (default), they are assumed to be batch major.
      name: Name scope for any created operations.

    Raises:
      ValueError: if `sequence_length` is not a 1D tensor.
    """
    with ops.name_scope(name, "TrainingHelper", [inputs, sequence_length]):
      inputs = ops.convert_to_tensor(inputs, name="inputs")
      if not time_major:
        inputs = nest.map_structure(_transpose_batch_time, inputs)

      self._input_tas = nest.map_structure(_unstack_ta, inputs)
      self._sequence_length = ops.convert_to_tensor(
          sequence_length, name="sequence_length")
      if self._sequence_length.get_shape().ndims != 1:
        raise ValueError(
            "Expected sequence_length to be a vector, but received shape: %s" %
            self._sequence_length.get_shape())

      self._zero_inputs = nest.map_structure(
          lambda inp: array_ops.zeros_like(inp[0, :]), inputs)

      self._batch_size = array_ops.size(sequence_length)
Example #14
0
  def _logits_to_prediction(self, logits=None):
    predictions = {}
    predictions[PredictionKey.LOGITS] = logits
    logits = array_ops.concat(1, [array_ops.zeros_like(logits), logits])
    predictions[PredictionKey.CLASSES] = math_ops.argmax(logits, 1)

    return predictions
  def _compareZeros(self, dtype, fully_defined_shape, use_gpu):
    with self.test_session(use_gpu=use_gpu):
      # Creates a tensor of non-zero values with shape 2 x 3.
      # NOTE(kearnes): The default numpy dtype associated with tf.string is
      # np.object (and can't be changed without breaking a lot things), which
      # causes a TypeError in constant_op.constant below. Here we catch the
      # special case of tf.string and set the numpy dtype appropriately.
      if dtype == dtypes_lib.string:
        numpy_dtype = np.string_
      else:
        numpy_dtype = dtype.as_numpy_dtype
      if fully_defined_shape:
        d = constant_op.constant(
            np.ones((2, 3), dtype=numpy_dtype), dtype=dtype)
      else:
        d = array_ops.placeholder(dtype=dtype)
      # Constructs a tensor of zeros of the same dimensions and type as "d".
      z_var = array_ops.zeros_like(d)
      # Test that the type is correct
      self.assertEqual(z_var.dtype, dtype)
      # Test that the shape is correct
      if fully_defined_shape:
        self.assertEqual([2, 3], z_var.get_shape())

      # Test that the value is correct
      feed_dict = {}
      if not fully_defined_shape:
        feed_dict[d] = np.ones((2, 3), dtype=numpy_dtype)
      z_value = z_var.eval(feed_dict=feed_dict)
      self.assertFalse(np.any(z_value))
      self.assertEqual((2, 3), z_value.shape)
Example #16
0
 def _f():
   # Note that there is a race condition here, so we do a best effort
   # updates here. We reset update_in_steps first so that other workers
   # don't duplicate the updates. Also we update cluster_center_vars
   # before resetting total_counts to avoid large updates to
   # cluster_centers_updated based on partially updated
   # cluster_center_vars.
   with ops.control_dependencies([
       state_ops.assign(update_in_steps,
                        self._mini_batch_steps_per_iteration - 1)
   ]):
     with ops.colocate_with(
         cluster_centers_updated, ignore_existing=True):
       if self._distance_metric == COSINE_DISTANCE:
         cluster_centers = nn_impl.l2_normalize(
             cluster_centers_updated, dim=1)
       else:
         cluster_centers = cluster_centers_updated
     with ops.colocate_with(cluster_centers_var, ignore_existing=True):
       with ops.control_dependencies(
           [state_ops.assign(cluster_centers_var, cluster_centers)]):
         with ops.colocate_with(None, ignore_existing=True):
           with ops.control_dependencies([
               state_ops.assign(total_counts,
                                array_ops.zeros_like(total_counts))
           ]):
             return array_ops.identity(update_in_steps)
Example #17
0
  def testZerosLikeUninitialized(self):
    l0 = list_ops.tensor_list_reserve([], 3, element_dtype=dtypes.float32)
    l1 = list_ops.tensor_list_set_item(l0, 0, 1.)  # [1., _, _]
    zeros_1 = array_ops.zeros_like(l1)  # [0., _, _]
    l2 = list_ops.tensor_list_set_item(l1, 2, 2.)  # [1., _, 2.]
    zeros_2 = array_ops.zeros_like(l2)  # [0., _, 0.]

    # Gather indices with zeros in `zeros_1`.
    res_1 = list_ops.tensor_list_gather(
        zeros_1, [0], element_dtype=dtypes.float32)
    # Gather indices with zeros in `zeros_2`.
    res_2 = list_ops.tensor_list_gather(
        zeros_2, [0, 2], element_dtype=dtypes.float32)

    self.assertAllEqual(self.evaluate(res_1), [0.])
    self.assertAllEqual(self.evaluate(res_2), [0., 0.])
Example #18
0
def _num_present(losses, weights, per_batch=False):
  """Computes the number of elements in the loss function induced by `weights`.

  A given weights tensor induces different numbers of usable elements in the
  `losses` tensor. The `weights` tensor is broadcast across `losses` for all
  possible dimensions. For example, if `losses` is a tensor of dimension
  `[4, 5, 6, 3]` and `weights` is a tensor of shape `[4, 5]`, then `weights` is,
  in effect, tiled to match the shape of `losses`. Following this effective
  tile, the total number of present elements is the number of non-zero weights.

  Args:
    losses: `Tensor` of shape `[batch_size, d1, ... dN]`.
    weights: `Tensor` of shape `[]`, `[batch_size]` or
      `[batch_size, d1, ... dK]`, where K < N.
    per_batch: Whether to return the number of elements per batch or as a sum
      total.

  Returns:
    The number of present (non-zero) elements in the losses tensor. If
      `per_batch` is `True`, the value is returned as a tensor of size
      `[batch_size]`. Otherwise, a single scalar tensor is returned.
  """
  with ops.name_scope(None, "num_present", (losses, weights)) as scope:
    weights = math_ops.to_float(weights)
    present = array_ops.where(
        math_ops.equal(weights, 0.0),
        array_ops.zeros_like(weights),
        array_ops.ones_like(weights))
    present = weights_broadcast_ops.broadcast_weights(present, losses)
    if per_batch:
      return math_ops.reduce_sum(
          present, axis=math_ops.range(1, array_ops.rank(present)),
          keep_dims=True, name=scope)
    return math_ops.reduce_sum(present, name=scope)
Example #19
0
def power_sums_tensor(array_size, power_matrix, multiplier):
  r"""Computes \sum_{i=0}^{N-1} A^i B (A^i)^T for N=0..(array_size + 1).

  Args:
    array_size: The number of non-trivial sums to pre-compute.
    power_matrix: The "A" matrix above.
    multiplier: The "B" matrix above
  Returns:
    A Tensor with S[N] = \sum_{i=0}^{N-1} A^i B (A^i)^T
      S[0] is the zero matrix
      S[1] is B
      S[2] is A B A^T + B
      ...and so on
  """
  array_size = math_ops.cast(array_size, dtypes.int32)
  power_matrix = ops.convert_to_tensor(power_matrix)
  identity_like_power_matrix = linalg_ops.eye(
      array_ops.shape(power_matrix)[0], dtype=power_matrix.dtype)
  identity_like_power_matrix.set_shape(
      ops.convert_to_tensor(power_matrix).get_shape())
  transition_powers = functional_ops.scan(
      lambda previous_power, _: math_ops.matmul(previous_power, power_matrix),
      math_ops.range(array_size - 1),
      initializer=identity_like_power_matrix)
  summed = math_ops.cumsum(
      array_ops.concat([
          array_ops.expand_dims(multiplier, 0), math_ops.matmul(
              batch_times_matrix(transition_powers, multiplier),
              transition_powers,
              adjoint_b=True)
      ], 0))
  return array_ops.concat(
      [array_ops.expand_dims(array_ops.zeros_like(multiplier), 0), summed], 0)
Example #20
0
def _compute_gradients(tensor, var_list):
  grads = gradients.gradients(tensor, var_list)
  # tf.gradients sometimes returns `None` when it should return 0.
  return [
      grad if grad is not None else array_ops.zeros_like(var)
      for var, grad in zip(var_list, grads)
  ]
Example #21
0
def gcd(a, b, name=None):
  """Returns the greatest common divisor via Euclid's algorithm.

  Args:
    a: The dividend. A scalar integer `Tensor`.
    b: The divisor. A scalar integer `Tensor`.
    name: An optional name for the operation.

  Returns:
    A scalar `Tensor` representing the greatest common divisor between `a` and
    `b`.

  Raises:
    ValueError: If `a` or `b` are not scalar integers.
  """
  with ops.name_scope(name, 'gcd', [a, b]):
    a = ops.convert_to_tensor(a)
    b = ops.convert_to_tensor(b)

    a.shape.assert_has_rank(0)
    b.shape.assert_has_rank(0)

    if not a.dtype.is_integer:
      raise ValueError('a must be an integer type. Got: %s' % a.dtype)
    if not b.dtype.is_integer:
      raise ValueError('b must be an integer type. Got: %s' % b.dtype)

    cond = lambda _, b: math_ops.greater(b, array_ops.zeros_like(b))
    body = lambda a, b: [b, math_ops.mod(a, b)]
    a, b = control_flow_ops.while_loop(cond, body, [a, b], back_prop=False)
    return a
Example #22
0
    def Grad(op, *args):
      """The python grad function for the Forward function."""

      # NOTE: tf.gradient backprops None for int32/int64 while zeros
      # for float32/float64. For consistency, we always backprop
      # zeros.
      args = list(args)
      for i, dy in enumerate(args):
        if dy is None:
          args[i] = array_ops.zeros_like(op.outputs[i])
      # TODO(drpng): getting the extra state here?
      op_inputs = [x for x in op.inputs]
      op_struct = [
          self._theta, self._state, self._inputs, self._max_input_length,
          self._extras
      ]
      (theta, state0, inputs, max_input_length, _) = _Pack(op_inputs, op_struct)
      # acc_state and acc_extras are computed by the Forward pass and
      # needed by the Backward pass.
      acc_state, _, acc_extras = _Pack([x for x in op.outputs],
                                       [self._state, self._state, self._extras])

      # Forward computes acc_state, the final state and
      # acc_extras. tf.gradients gives us their gradients w.r.t. the
      # final loss. Because acc_extras are not exposed by Compute(),
      # it has no gradients w.r.t. the final loss (i.e., by
      # construction, it must be zeros).
      d_acc_state, d_state1, _ = _Pack(args,
                                       [self._state, self._state, self._extras])
      return Backward(*_Flatten([
          theta, state0, inputs, max_input_length, acc_state, acc_extras,
          d_acc_state, d_state1
      ]))
Example #23
0
 def _single_seq_fn():
   log_norm = math_ops.reduce_logsumexp(first_input, [1])
   # Mask `log_norm` of the sequences with length <= zero.
   log_norm = array_ops.where(math_ops.less_equal(sequence_lengths, 0),
                              array_ops.zeros_like(log_norm),
                              log_norm)
   return log_norm
Example #24
0
def _TensorScatterUpdateGrad(op, grad):
  indices = op.inputs[1]
  updates_grad = array_ops.gather_nd(grad, indices)
  tensor_grad = array_ops.tensor_scatter_update(
      array_ops.identity(grad), indices,
      array_ops.zeros_like(op.inputs[2], dtype=grad.dtype))
  return [tensor_grad, None, updates_grad]
 def _forward(self, x):
   event_size = array_ops.shape(x)[-1]
   y0 = array_ops.zeros_like(x, name="y0")
   # call the template once to ensure creation
   _ = self._shift_and_log_scale_fn(y0)
   def _loop_body(index, y0):
     """While-loop body for autoregression calculation."""
     # Set caching device to avoid re-getting the tf.Variable for every while
     # loop iteration.
     with variable_scope_lib.variable_scope(
         variable_scope_lib.get_variable_scope()) as vs:
       if vs.caching_device is None:
         vs.set_caching_device(lambda op: op.device)
       shift, log_scale = self._shift_and_log_scale_fn(y0)
     y = x
     if log_scale is not None:
       y *= math_ops.exp(log_scale)
     if shift is not None:
       y += shift
     return index + 1, y
   _, y = control_flow_ops.while_loop(
       cond=lambda index, _: index < event_size,
       body=_loop_body,
       loop_vars=[0, y0])
   return y
Example #26
0
  def full_exp_with_logits(name, labels=None, logits=None):
    """Computes exponential loss given `logits`.

    Args:
      name: A name for the operation (optional).
      labels: A `Tensor` of the same type and shape as `logits`.
      logits: A `Tensor` of type `float32` or `float64`.

    Returns:
      A `Tensor` of the same shape as `logits` with the componentwise
      exponential losses.

    Raises:
      ValueError: If `logits` and `labels` do not have the same shape.
    """
    with ops.name_scope(name, "exp_loss", [logits, labels]) as name:
      logits = ops.convert_to_tensor(logits, name="logits")
      labels = ops.convert_to_tensor(labels, name="labels")
      try:
        labels.get_shape().merge_with(logits.get_shape())
      except ValueError:
        raise ValueError("logits and labels must have the same shape (%s vs %s)"
                         % (logits.get_shape(), labels.get_shape()))

    # Default threshold of 0 to switch between classes
    zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
    ones = array_ops.ones_like(logits, dtype=logits.dtype)
    neg_ones = -array_ops.ones_like(logits, dtype=logits.dtype)

    # Convert labels to 1 and -1
    cond_labels = (labels > zeros)
    labels_converted = array_ops.where(cond_labels, ones, neg_ones)

    return math_ops.exp(-1.0 * logits * labels_converted)
Example #27
0
def _SparseDenseCwiseMulOrDivGrad(op, grad, is_mul):
  """Common code for SparseDenseCwise{Mul,Div} gradients."""
  x_indices = op.inputs[0]
  x_shape = op.inputs[2]
  y = op.inputs[3]

  y_shape = math_ops.to_int64(array_ops.shape(y))
  num_added_dims = array_ops.expand_dims(
      array_ops.size(x_shape) - array_ops.size(y_shape), 0)
  augmented_y_shape = array_ops.concat(
      [array_ops.ones(num_added_dims, ops.dtypes.int64), y_shape], 0)

  scaling = x_shape // augmented_y_shape
  scaled_indices = x_indices // scaling
  scaled_indices = array_ops.slice(scaled_indices,
                                   array_ops.concat([[0], num_added_dims], 0),
                                   [-1, -1])
  dense_vals = array_ops.gather_nd(y, scaled_indices)

  if is_mul:
    dx = grad * dense_vals
    dy_val = grad * op.inputs[1]
  else:
    dx = grad / dense_vals
    dy_val = grad * (-op.inputs[1] / math_ops.square(dense_vals))
  # indices can repeat after scaling, so we can't use sparse_to_dense().
  dy = sparse_ops.sparse_add(
      array_ops.zeros_like(y),
      sparse_tensor.SparseTensor(scaled_indices, dy_val, y_shape))

  # (sp_indices, sp_vals, sp_shape, dense)
  return (None, dx, None, dy)
Example #28
0
def _safe_div(numerator, denominator, name="value"):
  """Computes a safe divide which returns 0 if the denominator is zero.

  Note that the function contains an additional conditional check that is
  necessary for avoiding situations where the loss is zero causing NaNs to
  creep into the gradient computation.

  Args:
    numerator: An arbitrary `Tensor`.
    denominator: A `Tensor` whose shape matches `numerator` and whose values are
      assumed to be non-negative.
    name: An optional name for the returned op.

  Returns:
    The element-wise value of the numerator divided by the denominator.
  """
  if compat.forward_compatible(2018, 11, 1):
    return math_ops.div_no_nan(numerator, denominator, name=name)
  return array_ops.where(
      math_ops.greater(denominator, 0),
      math_ops.div(numerator,
                   array_ops.where(
                       math_ops.equal(denominator, 0),
                       array_ops.ones_like(denominator), denominator)),
      array_ops.zeros_like(numerator),
      name=name)
Example #29
0
  def pdf(self, x, name="pdf"):
    """The PDF of observations in `x` under these Uniform distribution(s).

    Args:
      x: tensor of dtype `dtype`, must be broadcastable with `a` and `b`.
      name: The name to give this op.

    Returns:
      pdf: tensor of dtype `dtype`, the pdf values of `x`. If `x` is `nan`, will
          return `nan`.
    """
    with ops.name_scope(self.name):
      with ops.op_scope([self.a, self.b, x], name):
        x = ops.convert_to_tensor(x, name="x")
        if x.dtype != self.dtype:
          raise TypeError("Input x dtype does not match dtype: %s vs. %s" %
                          (x.dtype, self.dtype))

        broadcasted_x = x * self._ones()
        return math_ops.select(
            math_ops.is_nan(broadcasted_x), broadcasted_x, math_ops.select(
                math_ops.logical_or(broadcasted_x < self.a,
                                    broadcasted_x > self.b),
                array_ops.zeros_like(broadcasted_x),
                (1.0 / self.range()) * array_ops.ones_like(broadcasted_x)))
Example #30
0
 def testMultipleOfNoneGradRaisesError(self):
   gradient = constant_op.constant(self._grad_vec, dtype=dtypes.float32)
   variable = variables_lib.Variable(array_ops.zeros_like(gradient))
   grad_to_var = (None, variable)
   gradient_multipliers = {variable: self._multiplier}
   with self.assertRaises(ValueError):
     learning.multiply_gradients(grad_to_var, gradient_multipliers)
Example #31
0
  def ZerosLikeV1WhileLoop(self, op, index):
    """Create zeros_like for the specified output of an op.

    If op is in a while loop that is part of gradients(), this method
    must be called in its grad loop context.

    Args:
      op: A tensorflow operation.
      index: the index for a specific output of the op.

    Returns:
      A zero tensor of the same shape of op.outputs[index].
    """
    if util.IsLoopSwitch(op):
      return None
    if op.graph.building_function:
      # The optimization here is tricky to apply to functions
      return array_ops.zeros_like(op.outputs[index])
    dead_branch = util.IsSwitch(op)
    forward_ctxt = util.GetWhileContext(op)
    grad_state = self._map.get(forward_ctxt)
    if grad_state is None:
      # op is not in a while loop that is part of gradients().
      return ZerosLike(op, index)
    op_ctxt = op._get_control_flow_context()
    val = ops.convert_to_tensor(op.outputs[index], name="tensor")
    shape = val.get_shape()
    if shape.is_fully_defined():
      # If the shape is known statically, just create a zero tensor with
      # the right shape in the grad loop context.
      if val.dtype == dtypes.resource:
        result = array_ops.zeros(
            resource_variable_ops.variable_shape(val),
            dtype=default_gradient.get_zeros_dtype(val))
      else:
        result = constant_op.constant(0, shape=shape.dims, dtype=val.dtype)
      if dead_branch:
        # op is a cond switch. Guard the zero tensor with a switch.
        pred = grad_state.history_map.get(op_ctxt.pred.name)
        branch = op_ctxt.branch
        result = control_flow_ops._SwitchRefOrTensor(result, pred)[1 - branch]
    else:
      # Unknown shape so keep a history of the shape at runtime.
      if dead_branch:
        # Need to add a special switch to guard the value.
        pred = op_ctxt.pred
        branch = op_ctxt.branch
        op_ctxt.outer_context.Enter()
        val = control_flow_ops._SwitchRefOrTensor(op.inputs[0],
                                                  pred)[1 - branch]
        zeros_shape = array_ops.shape_internal(val, optimize=False)
        op_ctxt.outer_context.Exit()
        val.op._set_control_flow_context(op_ctxt)
        zeros_shape.op._set_control_flow_context(op_ctxt)
      else:
        op_ctxt.Enter()
        zeros_shape = array_ops.shape_internal(val, optimize=False)
        op_ctxt.Exit()

      # Add forward accumulator for shape.
      grad_state.grad_context.Exit()
      history_zeros_shape = grad_state.AddForwardAccumulator(
          zeros_shape, dead_branch=dead_branch)
      grad_state.grad_context.Enter()

      # Create a zero tensor with the right shape.
      shape = grad_state.AddBackpropAccumulatedValue(history_zeros_shape,
                                                     zeros_shape, dead_branch)
      result = array_ops.zeros(shape, val.dtype)
    return result
Example #32
0
def kernel(step_size, n_leapfrog_steps, x, target_log_prob_fn, event_dims=(),
           x_log_prob=None, x_grad=None, skip_metropolis_step=False, name=None):
  """Runs one iteration of Hamiltonian Monte Carlo.

  Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC)
  algorithm that takes a series of gradient-informed steps to produce
  a Metropolis proposal. This function applies one step of HMC to
  randomly update the variable `x`.

  This function can update multiple chains in parallel. It assumes
  that all dimensions of `x` not specified in `event_dims` are
  independent, and should therefore be updated independently. The
  output of `target_log_prob_fn()` should sum log-probabilities across
  all event dimensions. Slices along dimensions not in `event_dims`
  may have different target distributions; for example, if
  `event_dims == (1,)`, then `x[0, :]` could have a different target
  distribution from x[1, :]. This is up to `target_log_prob_fn()`.

  Args:
    step_size: Scalar step size or array of step sizes for the
      leapfrog integrator. Broadcasts to the shape of
      `x`. Larger step sizes lead to faster progress, but
      too-large step sizes make rejection exponentially more likely.
      When possible, it's often helpful to match per-variable step
      sizes to the standard deviations of the target distribution in
      each variable.
    n_leapfrog_steps: Integer number of steps to run the leapfrog
      integrator for. Total progress per HMC step is roughly
      proportional to step_size * n_leapfrog_steps.
    x: Tensor containing the value(s) of the random variable(s) to update.
    target_log_prob_fn: Python callable which takes an argument like `initial_x`
      and returns its (possibly unnormalized) log-density under the target
      distribution.
    event_dims: List of dimensions that should not be treated as
      independent. This allows for multiple chains to be run independently
      in parallel. Default is (), i.e., all dimensions are independent.
    x_log_prob (optional): Tensor containing the cached output of a previous
      call to `target_log_prob_fn()` evaluated at `x` (such as that provided by
      a previous call to `kernel()`). Providing `x_log_prob` and
      `x_grad` saves one gradient computation per call to `kernel()`.
    x_grad (optional): Tensor containing the cached gradient of
      `target_log_prob_fn()` evaluated at `x` (such as that provided by
      a previous call to `kernel()`). Providing `x_log_prob` and
      `x_grad` saves one gradient computation per call to `kernel()`.
    skip_metropolis_step (optional): boolean specifying whether to skip the
      Metropolis-Hastings step and directly return the newly proposed values
      by the integrator. The acceptance probabilities returned remain unchanged.
    name: Python `str` name prefixed to Ops created by this function.

  Returns:
    updated_x: The updated variable(s) x. Has shape matching `initial_x`.
    acceptance_probs: Tensor with the acceptance probabilities for the final
      iteration. This is useful for diagnosing step size problems etc. Has
      shape matching `target_log_prob_fn(initial_x)`.
    new_log_prob: The value of `target_log_prob_fn()` evaluated at `updated_x`.
    new_grad: The value of the gradient of `target_log_prob_fn()` evaluated at
      `updated_x`.

  #### Examples:

  ```python
  # Tuning acceptance rates:
  target_accept_rate = 0.631
  def target_log_prob(x):
    # Standard normal
    return tf.reduce_sum(-0.5 * tf.square(x))
  initial_x = tf.zeros([10])
  initial_log_prob = target_log_prob(initial_x)
  initial_grad = tf.gradients(initial_log_prob, initial_x)[0]
  # Algorithm state
  x = tf.Variable(initial_x, name='x')
  step_size = tf.Variable(1., name='step_size')
  last_log_prob = tf.Variable(initial_log_prob, name='last_log_prob')
  last_grad = tf.Variable(initial_grad, name='last_grad')
  # Compute updates
  new_x, acceptance_prob, log_prob, grad = hmc.kernel(step_size, 3, x,
                                                      target_log_prob,
                                                      event_dims=[0],
                                                      x_log_prob=last_log_prob)
  x_update = tf.assign(x, new_x)
  log_prob_update = tf.assign(last_log_prob, log_prob)
  grad_update = tf.assign(last_grad, grad)
  step_size_update = tf.assign(step_size,
                               tf.where(acceptance_prob > target_accept_rate,
                                        step_size * 1.01, step_size / 1.01))
  adaptive_updates = [x_update, log_prob_update, grad_update, step_size_update]
  sampling_updates = [x_update, log_prob_update, grad_update]

  sess = tf.Session()
  sess.run(tf.global_variables_initializer())
  # Warm up the sampler and adapt the step size
  for i in xrange(500):
    sess.run(adaptive_updates)
  # Collect samples without adapting step size
  samples = np.zeros([500, 10])
  for i in xrange(500):
    x_val, _ = sess.run([new_x, sampling_updates])
    samples[i] = x_val
  ```

  ```python
  # Empirical-Bayes estimation of a hyperparameter by MCMC-EM:

  # Problem setup
  N = 150
  D = 10
  x = np.random.randn(N, D).astype(np.float32)
  true_sigma = 0.5
  true_beta = true_sigma * np.random.randn(D).astype(np.float32)
  y = x.dot(true_beta) + np.random.randn(N).astype(np.float32)

  def log_prior(beta, log_sigma):
    return tf.reduce_sum(-0.5 / tf.exp(2 * log_sigma) * tf.square(beta) -
                         log_sigma)
  def regression_log_joint(beta, log_sigma, x, y):
    # This function returns log p(beta | log_sigma) + log p(y | x, beta).
    means = tf.matmul(tf.expand_dims(beta, 0), x, transpose_b=True)
    means = tf.squeeze(means)
    log_likelihood = tf.reduce_sum(-0.5 * tf.square(y - means))
    return log_prior(beta, log_sigma) + log_likelihood
  def log_joint_partial(beta):
    return regression_log_joint(beta, log_sigma, x, y)
  # Our estimate of log(sigma)
  log_sigma = tf.Variable(0., name='log_sigma')
  # The state of the Markov chain
  beta = tf.Variable(tf.random_normal([x.shape[1]]), name='beta')
  new_beta, _, _, _ = hmc.kernel(0.1, 5, beta, log_joint_partial,
                                 event_dims=[0])
  beta_update = tf.assign(beta, new_beta)
  optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01)
  with tf.control_dependencies([beta_update]):
    log_sigma_update = optimizer.minimize(-log_prior(beta, log_sigma),
                                          var_list=[log_sigma])

  sess = tf.Session()
  sess.run(tf.global_variables_initializer())
  log_sigma_history = np.zeros(1000)
  for i in xrange(1000):
    log_sigma_val, _ = sess.run([log_sigma, log_sigma_update])
    log_sigma_history[i] = log_sigma_val
  # Should converge to something close to true_sigma
  plt.plot(np.exp(log_sigma_history))
  ```
  """
  with ops.name_scope(name, 'hmc_kernel', [step_size, n_leapfrog_steps, x]):
    potential_and_grad = _make_potential_and_grad(target_log_prob_fn)
    x = ops.convert_to_tensor(x, name='x')

    x_shape = array_ops.shape(x)
    m = random_ops.random_normal(x_shape, dtype=x.dtype)

    kinetic_0 = 0.5 * math_ops.reduce_sum(math_ops.square(m), event_dims)

    if (x_log_prob is not None) and (x_grad is not None):
      log_potential_0, grad_0 = -x_log_prob, -x_grad  # pylint: disable=invalid-unary-operand-type
    else:
      if x_log_prob is not None:
        logging.warn('x_log_prob was provided, but x_grad was not,'
                     ' so x_log_prob was not used.')
      if x_grad is not None:
        logging.warn('x_grad was provided, but x_log_prob was not,'
                     ' so x_grad was not used.')
      log_potential_0, grad_0 = potential_and_grad(x)

    new_x, new_m, log_potential_1, grad_1 = leapfrog_integrator(
        step_size, n_leapfrog_steps, x, m, potential_and_grad, grad_0)

    kinetic_1 = 0.5 * math_ops.reduce_sum(math_ops.square(new_m), event_dims)

    energy_change = log_potential_1 - log_potential_0 + kinetic_1 - kinetic_0
    # Treat NaN as infinite energy (and therefore guaranteed rejection).
    energy_change = array_ops.where(
        math_ops.is_nan(energy_change),
        array_ops.fill(array_ops.shape(energy_change),
                       energy_change.dtype.as_numpy_dtype(np.inf)),
        energy_change)
    acceptance_probs = math_ops.exp(math_ops.minimum(-energy_change, 0.))
    
    # If we are skipping the MH step directly return
    if skip_metropolis_step:
      return new_x, acceptance_probs, -log_potential_1, -grad_1
    
    accepted = (
        random_ops.random_uniform(
            array_ops.shape(acceptance_probs), dtype=x.dtype)
        < acceptance_probs)
    new_log_prob = -array_ops.where(accepted, log_potential_1, log_potential_0)

    # TODO(b/65738010): This should work, but it doesn't for now.
    # reduced_shape = math_ops.reduced_shape(x_shape, event_dims)
    reduced_shape = array_ops.shape(math_ops.reduce_sum(x, event_dims,
                                                        keep_dims=True))
    accepted = array_ops.reshape(accepted, reduced_shape)
    accepted = math_ops.logical_or(
        accepted, math_ops.cast(array_ops.zeros_like(x), dtypes.bool))
    new_x = array_ops.where(accepted, new_x, x)
    new_grad = -array_ops.where(accepted, grad_1, grad_0)

  # TODO(langmore) Gradients of acceptance_probs and new_log_prob with respect
  # to initial_x will propagate NaNs (see testNanFromGradsDontPropagate).  This
  # should be fixed.
  return new_x, acceptance_probs, new_log_prob, new_grad
Example #33
0
def safe_embedding_lookup_sparse(embedding_weights,
                                 sparse_ids,
                                 sparse_weights=None,
                                 combiner="mean",
                                 default_id=None,
                                 name=None,
                                 partition_strategy="div"):
  """Lookup embedding results, accounting for invalid IDs and empty features.

  The partitioned embedding in `embedding_weights` must all be the same shape
  except for the first dimension. The first dimension is allowed to vary as the
  vocabulary size is not necessarily a multiple of `P`.

  Invalid IDs (< 0) are pruned from input IDs and weights, as well as any IDs
  with non-positive weight. For an entry with no features, the embedding vector
  for `default_id` is returned, or the 0-vector if `default_id` is not supplied.

  The ids and weights may be multi-dimensional. Embeddings are always aggregated
  along the last dimension.

  Args:
    embedding_weights:  A list of `P` float tensors or values representing
        partitioned embedding tensors.  The total unpartitioned shape should be
        `[e_0, e_1, ..., e_m]`, where `e_0` represents the vocab size and
        `e_1, ..., e_m` are the embedding dimensions.
    sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the
        ids. `d_0` is typically batch size.
    sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing
        float weights corresponding to `sparse_ids`, or `None` if all weights
        are be assumed to be 1.0.
    combiner: A string specifying how to combine embedding results for each
        entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean"
        the default.
    default_id: The id to use for an entry with no features.
    name: A name for this operation (optional).
    partition_strategy: A string specifying the partitioning strategy.
        Currently `"div"` and `"mod"` are supported. Default is `"div"`.


  Returns:
    Dense tensor of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`.

  Raises:
    ValueError: if `embedding_weights` is empty.
  """
  if embedding_weights is None or len(embedding_weights) < 1:
    raise ValueError("Missing embedding_weights %s." % embedding_weights)

  dtype = sparse_weights.dtype if sparse_weights is not None else None
  embedding_weights = [
      ops.convert_to_tensor(w, dtype=dtype) for w in embedding_weights
  ]

  contrib_tensor_util.assert_same_float_dtype(embedding_weights +
                                              [sparse_weights])

  with ops.op_scope(embedding_weights + [sparse_ids, sparse_weights], name,
                    "embedding_lookup") as scope:
    # Reshape higher-rank sparse ids and weights to linear segment ids.
    original_shape = sparse_ids.shape
    original_rank_dim = sparse_ids.shape.get_shape()[0]
    original_rank = (
        array_ops.size(original_shape)
        if original_rank_dim.value is None
        else original_rank_dim.value)
    sparse_ids = sparse_ops.sparse_reshape(sparse_ids, [
        math_ops.reduce_prod(
            array_ops.slice(original_shape, [0], [original_rank - 1])),
        array_ops.gather(original_shape, original_rank - 1)])
    if sparse_weights is not None:
      sparse_weights = ops.SparseTensor(sparse_ids.indices,
                                        sparse_weights.values, sparse_ids.shape)

    # Prune invalid ids and weights.
    sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights)

    # Fill in dummy values for empty features, if necessary.
    sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(sparse_ids,
                                                                 default_id or
                                                                 0)
    if sparse_weights is not None:
      sparse_weights, _ = sparse_ops.sparse_fill_empty_rows(sparse_weights, 1.0)

    result = tf_embedding_ops.embedding_lookup_sparse(
        embedding_weights,
        sparse_ids,
        sparse_weights,
        combiner=combiner,
        partition_strategy=partition_strategy,
        name=None if default_id is None else scope)

    if default_id is None:
      # Broadcast is_row_empty to the same shape as embedding_lookup_result,
      # for use in Select.
      is_row_empty = array_ops.tile(
          array_ops.reshape(is_row_empty, [-1, 1]),
          array_ops.pack([1, array_ops.shape(result)[1]]))

      result = math_ops.select(is_row_empty,
                               array_ops.zeros_like(result),
                               result,
                               name=scope)

    # Reshape back from linear ids back into higher-dimensional dense result.
    final_result = array_ops.reshape(result, array_ops.concat(0, [
        array_ops.slice(
            math_ops.cast(original_shape, dtypes.int32),
            [0], [original_rank - 1]),
        array_ops.slice(array_ops.shape(result), [1], [-1])]))
    final_result.set_shape(tensor_shape.unknown_shape(
        (original_rank_dim - 1).value).concatenate(result.get_shape()[1:]))
    return final_result
Example #34
0
  def _apply_gradient(self, grad, var, indices=None):
    """The main function to update a variable.

    Args:
      grad: A Tensor containing gradient to apply.
      var: A Tensor containing the variable to update.
      indices: An array of integers, for sparse update.

    Returns:
      Updated variable var = var - learning_rate * preconditioner * grad

    If the gradient is dense, var and grad have the same shape.
    If the update is sparse, then the first dimension of the gradient and var
    may differ, others are all the same. In this case the indices array
    provides the set of indices of the variable which are to be updated with
    each row of the gradient.
    """
    global_step = self._global_step + 1

    # Update accumulated weighted average of gradients
    gbar = self.get_slot(var, "gbar")
    gbar_decay_t = GetParam(self._gbar_decay, global_step)
    gbar_weight_t = GetParam(self._gbar_weight, global_step)
    if indices is not None:
      # Note - the sparse update is not easily implemented, since the
      # algorithm needs all indices of gbar to be updated
      # if mat_gbar_decay != 1 or mat_gbar_decay != 0.
      # One way to make mat_gbar_decay = 1 is by rescaling.
      # If we want the update:
      #         G_{t+1} = a_{t+1} G_t + b_{t+1} w_t
      # define:
      #         r_{t+1} = a_{t+1} * r_t
      #         h_t = G_t / r_t
      # Then:
      #         h_{t+1} = h_t + (b_{t+1} / r_{t+1}) * w_t
      # So we get the mat_gbar_decay = 1 as desired.
      # We can implement this in a future version as needed.
      # However we still need gbar_decay = 0, otherwise all indices
      # of the variable will need to be updated.
      if self._gbar_decay != 0.0:
        tf_logging.warning("Not applying momentum for variable: %s" % var.name)
      gbar_updated = grad
    else:
      gbar_updated = self._weighted_average(gbar, self._gbar_decay,
                                            gbar_decay_t,
                                            gbar_weight_t * grad)

    # Update the preconditioners and compute the preconditioned gradient
    shape = var.get_shape()
    mat_g_list = []
    for i in range(len(shape)):
      mat_g_list.append(self.get_slot(var, "Gbar_" + str(i)))
    mat_gbar_decay_t = GetParam(self._mat_gbar_decay, global_step)
    mat_gbar_weight_t = GetParam(self._mat_gbar_weight, global_step)

    preconditioned_grad = gbar_updated
    v_rank = len(mat_g_list)
    neg_alpha = - GetParam(self._alpha, global_step) / v_rank
    svd_interval = GetParam(self._svd_interval, global_step)
    precond_update_interval = GetParam(self._precond_update_interval,
                                       global_step)
    for i, mat_g in enumerate(mat_g_list):
      # axes is the list of indices to reduce - everything but the current i.
      axes = list(range(i)) + list(range(i+1, v_rank))
      if shape[i] <= self._max_matrix_size:
        # If the tensor size is sufficiently small perform full Shampoo update
        # Note if precond_update_interval > 1 and mat_gbar_decay_t != 1, this
        # is not strictly correct. However we will use it for now, and
        # fix if needed. (G_1 = aG + bg ==> G_n = a^n G + (1+a+..+a^{n-1})bg)

        # pylint: disable=g-long-lambda,cell-var-from-loop
        mat_g_updated = control_flow_ops.cond(
            math_ops.mod(global_step, precond_update_interval) < 1,
            lambda: self._update_mat_g(
                mat_g, grad, axes, mat_gbar_decay_t,
                mat_gbar_weight_t * precond_update_interval, i),
            lambda: mat_g)

        mat_g_updated = mat_g_updated / float(shape[i].value)

        if self._svd_interval == 1:
          mat_h = self._compute_power(var, mat_g_updated, shape[i], neg_alpha)
        else:
          mat_h = control_flow_ops.cond(
              math_ops.mod(global_step, svd_interval) < 1,
              lambda: self._compute_power(var, mat_g_updated, shape[i],
                                          neg_alpha, "H_" + str(i)),
              lambda: self.get_slot(var, "H_" + str(i)))

        # mat_h is a square matrix of size d_i x d_i
        # preconditioned_grad is a d_i x ... x d_n x d_0 x ... d_{i-1} tensor
        # After contraction with a d_i x d_i tensor
        # it becomes a d_{i+1} x ... x d_n x d_0 x ... d_i tensor
        # (the first dimension is contracted out, and the second dimension of
        # mat_h is appended).  After going through all the indices, it becomes
        # a d_0 x ... x d_n tensor again.
        preconditioned_grad = math_ops.tensordot(preconditioned_grad, mat_h,
                                                 axes=([0], [0]),
                                                 name="precond_" + str(i))
      else:
        # Tensor size is too large -- perform diagonal Shampoo update
        # Only normalize non-vector cases.
        if axes:
          normalizer = 1.0 if indices is not None else float(shape[i].value)
          grad_outer = math_ops.reduce_sum(grad * grad, axis=axes) / normalizer
        else:
          grad_outer = grad * grad

        if i == 0 and indices is not None:
          assert self._mat_gbar_decay == 1.0
          mat_g_updated = state_ops.scatter_add(mat_g, indices,
                                                mat_gbar_weight_t * grad_outer)
          mat_g_updated_slice = array_ops.gather(mat_g_updated, indices)
          mat_h = array_ops.where(
              math_ops.greater(mat_g_updated_slice, 0),
              math_ops.pow(mat_g_updated_slice, neg_alpha),
              array_ops.zeros_like(mat_g_updated_slice))
        else:
          mat_g_updated = self._weighted_average(mat_g,
                                                 self._mat_gbar_decay,
                                                 mat_gbar_decay_t,
                                                 mat_gbar_weight_t * grad_outer)
          mat_h = array_ops.where(
              math_ops.greater(mat_g_updated, 0),
              math_ops.pow(mat_g_updated, neg_alpha),
              array_ops.zeros_like(mat_g_updated))

        # Need to do the transpose to ensure that the tensor becomes
        # a d_{i+1} x ... x d_n x d_0 x ... d_i tensor as described above.
        preconditioned_grad = array_ops.transpose(
            preconditioned_grad, perm=list(range(1, v_rank)) + [0]) * mat_h

    # Update the variable based on the Shampoo update
    learning_rate_t = GetParam(self._learning_rate, global_step)
    if indices is not None:
      var_updated = state_ops.scatter_add(
          var, indices, -learning_rate_t * preconditioned_grad)
    else:
      var_updated = state_ops.assign_sub(var,
                                         learning_rate_t * preconditioned_grad)
    return var_updated
Example #35
0
def TaylorExp(logits, labels, perturbW):
    """You can also add L2Loss to all the trainable variables.
        Add summary for "Loss" and "Loss/avg".
        Args:
        logits: Logits from inference().
        labels: Labels from distorted_inputs or inputs().
    
        Returns:
        Loss tensor of type float.
        """
    # Calculate the average cross entropy loss across the batch.
    labels = tf.cast(labels, tf.float32)
    # Differentially private sparse cross entropy error based on Taylor Expansion
    '''
        Computes differentially private sigmoid cross entropy given `logits`.
        
        Measures the probability error in discrete classification tasks in which each
        class is independent and not mutually exclusive.
        
        For brevity, let `x = logits`, `z = labels`.  The logistic loss is
        z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
        = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
        = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
        = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x))
        = (1 - z) * x + log(1 + exp(-x))
        = x - x * z + log(1 + exp(-x))
        
        For x < 0, to avoid overflow in exp(-x), we reformulate the above
        
        x - x * z + log(1 + exp(-x))
        = log(exp(x)) - x * z + log(1 + exp(-x))
        = - x * z + log(1 + exp(x))
        
        Hence, to ensure stability and avoid overflow, the implementation uses this
        equivalent formulation
        
        max(x, 0) - x * z + log(1 + exp(-abs(x)))
        
        `logits` and `labels` must have the same type and shape. Let denote neg_abs_logits = -abs(x) = -abs(h * W). By Applying Taylor Expansion, we have:
        
        Taylor = max(x, 0) - x * z + log(1 + exp(-abs(x)));
        = max(h * W, 0) - (z * h) * W + (math.log(2.0) + 0.5*neg_abs_logits + 1.0/8.0*neg_abs_logits**2)
        = max(h * W, 0) - (z * h) * W + (math.log(2.0) + 0.5*(-abs(h * W)) + 1.0/8.0*(-abs(h * W))**2)
        = F1 + F2
        where: F1 = max(h * W, 0) + (math.log(2.0) + 0.5*(-abs(h * W)) + 1.0/8.0*(-abs(h * W))**2) and F2 = - (z * h) * W
        
        To ensure that Taylor is differentially private, we need to perturb all the coefficients, including the terms h * W, z * h * W.
        Note that h is differentially private, since its computation on top of the DP Affine transformation does not access the original data.
        Therefore, F1 should be differentially private. We need to preserve DP in F2, which reads the groundtruth label z, as follows:
        
        By applying Funtional Mechanism, we perturb (z * h) * W as ((z * h) + perturbW) * W = (z * h) * W + (perturbW * W):
    
        perturbW = np.random.laplace(0.0, scale3, hk * 10)
        perturbW = np.reshape(perturbW, [hk, 10]);
          
        where scale3 = Delta3/(epsilon3*L) = 10*(hk + 1/4 * hk**2)/(epsilon3*L); (Lemma 5)
        
        To allow computing gradients at zero, we define custom versions of max and abs functions [Tensorflow].
        
        Source: https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/python/ops/nn_impl.py @ TensorFlow
        '''
    zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
    cond = (logits >= zeros)
    relu_logits = array_ops.where(cond, logits, zeros)
    neg_abs_logits = array_ops.where(cond, -logits, logits)
    #Taylor = math_ops.add(relu_logits - y_conv * y_, math_ops.log1p(math_ops.exp(neg_abs_logits)))
    Taylor = math_ops.add(
        relu_logits - logits * labels,
        math.log(2.0) + 0.5 * neg_abs_logits + 1.0 / 8.0 * neg_abs_logits**2)

    zeros1 = array_ops.zeros_like(perturbW, dtype=perturbW.dtype)
    cond1 = (perturbW >= zeros1)
    perturbW = array_ops.where(cond1, perturbW, -perturbW)

    cross_entropy_mean = tf.reduce_mean(
        Taylor, name='cross_entropy') + tf.reduce_mean(perturbW,
                                                       name='perturbW')

    tf.add_to_collection('losses', cross_entropy_mean)

    return tf.add_n(tf.get_collection('losses'), name='total_loss')
Example #36
0
 def call(self, inputs, training=None):
   if training is None:
     training = keras.backend.learning_phase()
   return tf_utils.smart_cond(training,
                              lambda: array_ops.ones_like(inputs),
                              lambda: array_ops.zeros_like(inputs))
Example #37
0
 def call(self, inputs):
   return keras.backend.in_train_phase(
       lambda: array_ops.ones_like(inputs),
       lambda: array_ops.zeros_like(inputs))
def random_gamma(shape,
                 alpha,
                 beta=None,
                 dtype=dtypes.float32,
                 seed=None,
                 name=None):
    """Draws `shape` samples from each of the given Gamma distribution(s).

  `alpha` is the shape parameter describing the distribution(s), and `beta` is
  the inverse scale parameter(s).

  Note: Because internal calculations are done using `float64` and casting has
  `floor` semantics, we must manually map zero outcomes to the smallest
  possible positive floating-point value, i.e., `np.finfo(dtype).tiny`.  This
  means that `np.finfo(dtype).tiny` occurs more frequently than it otherwise
  should.  This bias can only happen for small values of `alpha`, i.e.,
  `alpha << 1` or large values of `beta`, i.e., `beta >> 1`.

  The samples are differentiable w.r.t. alpha and beta.
  The derivatives are computed using the approach described in the paper

  [Michael Figurnov, Shakir Mohamed, Andriy Mnih.
  Implicit Reparameterization Gradients, 2018](https://arxiv.org/abs/1805.08498)

  Example:

  ```python
  samples = tf.random_gamma([10], [0.5, 1.5])
  # samples has shape [10, 2], where each slice [:, 0] and [:, 1] represents
  # the samples drawn from each distribution

  samples = tf.random_gamma([7, 5], [0.5, 1.5])
  # samples has shape [7, 5, 2], where each slice [:, :, 0] and [:, :, 1]
  # represents the 7x5 samples drawn from each of the two distributions

  alpha = tf.constant([[1.],[3.],[5.]])
  beta = tf.constant([[3., 4.]])
  samples = tf.random_gamma([30], alpha=alpha, beta=beta)
  # samples has shape [30, 3, 2], with 30 samples each of 3x2 distributions.

  loss = tf.reduce_mean(tf.square(samples))
  dloss_dalpha, dloss_dbeta = tf.gradients(loss, [alpha, beta])
  # unbiased stochastic derivatives of the loss function
  alpha.shape == dloss_dalpha.shape  # True
  beta.shape == dloss_dbeta.shape  # True
  ```

  Args:
    shape: A 1-D integer Tensor or Python array. The shape of the output samples
      to be drawn per alpha/beta-parameterized distribution.
    alpha: A Tensor or Python value or N-D array of type `dtype`. `alpha`
      provides the shape parameter(s) describing the gamma distribution(s) to
      sample. Must be broadcastable with `beta`.
    beta: A Tensor or Python value or N-D array of type `dtype`. Defaults to 1.
      `beta` provides the inverse scale parameter(s) of the gamma
      distribution(s) to sample. Must be broadcastable with `alpha`.
    dtype: The type of alpha, beta, and the output: `float16`, `float32`, or
      `float64`.
    seed: A Python integer. Used to create a random seed for the distributions.
      See
      `tf.set_random_seed`
      for behavior.
    name: Optional name for the operation.

  Returns:
    samples: a `Tensor` of shape
      `tf.concat([shape, tf.shape(alpha + beta)], axis=0)` with values of type
      `dtype`.
  """
    with ops.name_scope(name, "random_gamma", [shape, alpha, beta]):
        shape = ops.convert_to_tensor(shape, name="shape", dtype=dtypes.int32)
        alpha = ops.convert_to_tensor(alpha, name="alpha", dtype=dtype)
        beta = ops.convert_to_tensor(beta if beta is not None else 1,
                                     name="beta",
                                     dtype=dtype)
        alpha_broadcast = alpha + array_ops.zeros_like(beta)
        seed1, seed2 = random_seed.get_seed(seed)
        return math_ops.maximum(
            np.finfo(dtype.as_numpy_dtype).tiny,
            gen_random_ops.random_gamma(
                shape, alpha_broadcast, seed=seed1, seed2=seed2) / beta)
Example #39
0
def mean_pairwise_squared_error(predictions,
                                labels=None,
                                weights=1.0,
                                scope=None):
    """Adds a pairwise-errors-squared loss to the training procedure.

  Unlike `mean_squared_error`, which is a measure of the differences between
  corresponding elements of `predictions` and `labels`,
  `mean_pairwise_squared_error` is a measure of the differences between pairs of
  corresponding elements of `predictions` and `labels`.

  For example, if `labels`=[a, b, c] and `predictions`=[x, y, z], there are
  three pairs of differences are summed to compute the loss:
    loss = [ ((a-b) - (x-y)).^2 + ((a-c) - (x-z)).^2 + ((b-c) - (y-z)).^2 ] / 3

  Note that since the inputs are of size [batch_size, d0, ... dN], the
  corresponding pairs are computed within each batch sample but not across
  samples within a batch. For example, if `predictions` represents a batch of
  16 grayscale images of dimension [batch_size, 100, 200], then the set of pairs
  is drawn from each image, but not across images.

  `weights` acts as a coefficient for the loss. If a scalar is provided, then
  the loss is simply scaled by the given value. If `weights` is a tensor of size
  [batch_size], then the total loss for each sample of the batch is rescaled
  by the corresponding element in the `weights` vector.

  Args:
    predictions: The predicted outputs, a tensor of size [batch_size, d0, .. dN]
      where N+1 is the total number of dimensions in `predictions`.
    labels: The ground truth output tensor, whose shape must match the shape of
      the `predictions` tensor.
    weights: Coefficients for the loss a scalar, a tensor of shape [batch_size]
      or a tensor whose shape matches `predictions`.
    scope: The scope for the operations performed in computing the loss.

  Returns:
    A scalar `Tensor` representing the loss value.

  Raises:
    ValueError: If the shape of `predictions` doesn't match that of `labels` or
      if the shape of `weights` is invalid.
  """
    with ops.name_scope(scope, "mean_pairwise_squared_error",
                        [predictions, labels, weights]) as scope:
        predictions.get_shape().assert_is_compatible_with(labels.get_shape())
        predictions = math_ops.to_float(predictions)
        labels = math_ops.to_float(labels)
        weights = math_ops.to_float(ops.convert_to_tensor(weights))

        diffs = math_ops.subtract(predictions, labels)

        # Need to verify here since the function doesn't use compute_weighted_loss
        if diffs.get_shape().ndims is None:
            raise ValueError("diffs.get_shape().ndims cannot be None")
        if weights.get_shape().ndims is None:
            raise ValueError("weights.get_shape().ndims cannot be None")

        reduction_indices = list(range(1, diffs.get_shape().ndims))

        sum_squares_diff_per_batch = math_ops.reduce_sum(
            math_ops.square(diffs), reduction_indices=reduction_indices)
        num_present_per_batch = _num_present(diffs, weights, per_batch=True)

        term1 = 2.0 * _safe_div(sum_squares_diff_per_batch,
                                num_present_per_batch)

        sum_diff = math_ops.reduce_sum(diffs,
                                       reduction_indices=reduction_indices)
        term2 = 2.0 * _safe_div(math_ops.square(sum_diff),
                                math_ops.square(num_present_per_batch))

        loss = _scale_losses(term1 - term2, weights)

        mean_loss = array_ops.where(
            math_ops.reduce_sum(num_present_per_batch) > 0,
            loss,
            array_ops.zeros_like(loss),
            name="value")
        add_loss(mean_loss)
        return mean_loss
Example #40
0
def _SvdGrad(op, grad_s, grad_u, grad_v):
  """Gradient for the singular value decomposition."""

  # The derivation for the compute_uv=False case, and most of
  # the derivation for the full_matrices=True case, are in
  # Giles' paper (see reference at top of file).  A derivation for
  # the full_matrices=False case is available at
  # https://j-towns.github.io/papers/svd-derivative.pdf
  a = op.inputs[0]
  a_shape = a.get_shape().with_rank_at_least(2)
  grad_s_mat = array_ops.matrix_diag(grad_s)

  if not op.get_attr("compute_uv"):
    s, u, v = linalg_ops.svd(a, compute_uv=True)
    grad_a = math_ops.matmul(u, math_ops.matmul(grad_s_mat, v, adjoint_b=True))
    grad_a.set_shape(a_shape)
    return grad_a

  full_matrices = op.get_attr("full_matrices")

  # TODO(rmlarsen): Make this work with complex types.
  if a.dtype.is_complex:
    raise NotImplementedError(
        "SVD gradient is not implemented for complex types and "
        "compute_uv=True.")
  grad_u_shape = grad_u.get_shape().with_rank_at_least(2)
  grad_v_shape = grad_v.get_shape().with_rank_at_least(2)
  m = a_shape.dims[-2].merge_with(grad_u_shape[-2])
  n = a_shape.dims[-1].merge_with(grad_v_shape[-2])
  batch_shape = a_shape[:-2].merge_with(grad_u_shape[:-2]).merge_with(
      grad_v_shape[:-2])
  a_shape = batch_shape.concatenate([m, n])

  m = a_shape.dims[-2].value
  n = a_shape.dims[-1].value
  # TODO(rmlarsen): Make this work with placeholders.
  if m is None or n is None:
    raise NotImplementedError(
        "SVD gradient has not been implemented for input with unknown "
        "inner matrix shape.")

  s = op.outputs[0]
  u = op.outputs[1]
  v = op.outputs[2]

  use_adjoint = False
  if m > n:
    # Compute the gradient for A^H = V * S^T * U^H, and (implicitly) take the
    # Hermitian transpose of the gradient at the end.
    use_adjoint = True
    m, n = n, m
    u, v = v, u
    grad_u, grad_v = grad_v, grad_u

  with ops.control_dependencies([grad_s, grad_u, grad_v]):
    if full_matrices and abs(m - n) > 1:
      raise NotImplementedError(
          "svd gradient is not implemented for abs(m - n) > 1 "
          "when full_matrices is True")
    s_mat = array_ops.matrix_diag(s)
    s2 = math_ops.square(s)

    # NOTICE: Because of the term involving f, the gradient becomes
    # infinite (or NaN in practice) when singular values are not unique.
    # Mathematically this should not be surprising, since for (k-fold)
    # degenerate singular values, the corresponding singular vectors are
    # only defined up a (k-dimensional) subspace. In practice, this can
    # lead to numerical instability when singular values are close but not
    # exactly equal.
    f = array_ops.matrix_set_diag(
        math_ops.reciprocal(
            array_ops.expand_dims(s2, -2) - array_ops.expand_dims(s2, -1)),
        array_ops.zeros_like(s))
    s_inv_mat = array_ops.matrix_diag(math_ops.reciprocal(s))

    v1 = v[..., :, :m]
    grad_v1 = grad_v[..., :, :m]

    u_gu = math_ops.matmul(u, grad_u, adjoint_a=True)
    v_gv = math_ops.matmul(v1, grad_v1, adjoint_a=True)

    f_u = f * u_gu
    f_v = f * v_gv

    term1_nouv = (
        grad_s_mat + math_ops.matmul(f_u + _linalg.adjoint(f_u), s_mat) +
        math_ops.matmul(s_mat, f_v + _linalg.adjoint(f_v)))

    term1 = math_ops.matmul(u, math_ops.matmul(term1_nouv, v1, adjoint_b=True))

    if m == n:
      grad_a_before_transpose = term1
    else:
      gv1t = array_ops.matrix_transpose(grad_v1)
      gv1t_v1 = math_ops.matmul(gv1t, v1)
      term2_nous = gv1t - math_ops.matmul(gv1t_v1, v1, adjoint_b=True)

      if full_matrices:
        v2 = v[..., :, m:n]
        grad_v2 = grad_v[..., :, m:n]

        v1t_gv2 = math_ops.matmul(v1, grad_v2, adjoint_a=True)
        term2_nous -= math_ops.matmul(v1t_gv2, v2, adjoint_b=True)

      u_s_inv = math_ops.matmul(u, s_inv_mat)
      term2 = math_ops.matmul(u_s_inv, term2_nous)

      grad_a_before_transpose = term1 + term2

    if use_adjoint:
      grad_a = array_ops.matrix_transpose(grad_a_before_transpose)
    else:
      grad_a = grad_a_before_transpose

    grad_a.set_shape(a_shape)
    return grad_a
Example #41
0
 def loop_fn(i):
     x1 = array_ops.gather(x, i)
     z = array_ops.zeros_like(x1),
     return z, z + x1
Example #42
0
def _SvdGrad(op, grad_s, grad_u, grad_v):
    """Gradient for Svd based on Giles' algorithm. Reference at top of file."""
    def _Adjoint(x):
        return math_ops.conj(array_ops.matrix_transpose(x))

    if op.get_attr("compute_uv") and not op.get_attr("full_matrices"):
        raise NotImplementedError(
            "SVD gradient is not implemented for compute_uv=True and "
            "full_matrices=False.")

    a = op.inputs[0]
    a_shape = a.get_shape().with_rank_at_least(2)

    if op.get_attr("compute_uv"):
        # TODO(rmlarsen): Make this work with complex types.
        if a.dtype.is_complex:
            raise NotImplementedError(
                "SVD gradient is not implemented for complex types and "
                "compute_uv=True.")
        grad_u_shape = grad_u.get_shape().with_rank_at_least(2)
        grad_v_shape = grad_v.get_shape().with_rank_at_least(2)
        m = a_shape[-2].merge_with(grad_u_shape[-2])
        n = a_shape[-1].merge_with(grad_v_shape[-2])
        batch_shape = a_shape[:-2].merge_with(grad_u_shape[:-2]).merge_with(
            grad_v_shape[:-2])
        a_shape = batch_shape.concatenate([m, n])

    m = a_shape[-2].value
    n = a_shape[-1].value
    # TODO(rmlarsen): Make this work with placeholders.
    if m is None or n is None:
        raise NotImplementedError(
            "SVD gradient has not been implemented for input with unknown "
            "inner matrix shape.")

    if not op.get_attr("full_matrices") or not op.get_attr("compute_uv"):
        s, u, v = linalg_ops.svd(a, compute_uv=True, full_matrices=True)
    else:
        s = op.outputs[0]
        u = op.outputs[1]
        v = op.outputs[2]

    use_adjoint = False
    if m > n:
        # Compute the gradient for A^H = V * S^T * U^H, and (implicitly) take the
        # Hermitian transpose of the gradient at the end.
        use_adjoint = True
        m, n = n, m
        u, v = v, u
        grad_u, grad_v = grad_v, grad_u

    with ops.control_dependencies([grad_s, grad_u, grad_v]):
        grad_s_mat = array_ops.matrix_diag(grad_s)
        if not op.get_attr("compute_uv"):
            if use_adjoint:
                grad_a = math_ops.matmul(v[..., :, :m],
                                         math_ops.matmul(u, grad_s_mat),
                                         adjoint_b=True)
            else:
                grad_a = math_ops.matmul(
                    u,
                    math_ops.matmul(grad_s_mat, v[..., :, :m], adjoint_b=True))
            grad_a.set_shape(a_shape)
            return grad_a

        # TODO(rmlarsen): Define a gradient that is numerically stable for
        # abs(m-n) > 1. Currently this does not work because there are effectively
        # multiple singular values with value zero. I am not sure if this is a true
        # instability or if it simply throws off the finite difference gradient
        # checker.
        if abs(m - n) > 1:
            raise NotImplementedError(
                "svd gradient is not implemented for abs(m - n) > 1")
        s_mat = array_ops.matrix_diag(s)
        s2 = math_ops.square(s)

        # NOTICE: Because of the term involving f, the gradient becomes
        # infinite (or NaN in practice) when singular values are not unique.
        # Mathematically this should not be surprising, since for (k-fold)
        # degenerate singular values, the corresponding singular vectors are
        # only defined up a (k-dimensional) subspace. In practice, this can
        # lead to numerical instability when singular values are close but not
        # exactly equal.
        f = array_ops.matrix_set_diag(
            math_ops.reciprocal(
                array_ops.expand_dims(s2, -2) - array_ops.expand_dims(s2, -1)),
            array_ops.zeros_like(s))
        s_inv_mat = array_ops.matrix_diag(math_ops.reciprocal(s))
        u_gu = math_ops.matmul(u, grad_u, adjoint_a=True)
        v_gv = math_ops.matmul(v, grad_v, adjoint_a=True)

        if m == n:
            f_u = f * u_gu
            f_v = f * v_gv
        else:
            dv2 = array_ops.matrix_transpose(
                v_gv[..., m:n, :m]) - v_gv[..., :m, m:n]
            f_u = f * u_gu
            f_v = f * v_gv[..., :m, :m]

        grad_a_nouv = (grad_s_mat +
                       math_ops.matmul(f_u + _Adjoint(f_u), s_mat) +
                       math_ops.matmul(s_mat, f_v + _Adjoint(f_v)))

        if m != n:
            grad_a_nouv = array_ops.concat(
                [grad_a_nouv, math_ops.matmul(s_inv_mat, dv2)], -1)

        if use_adjoint:
            # Use (U X V^H)^H = V (U X)^H.
            grad_a = math_ops.matmul(v,
                                     math_ops.matmul(u, grad_a_nouv),
                                     adjoint_b=True)
        else:
            grad_a = math_ops.matmul(
                u, math_ops.matmul(grad_a_nouv, v, adjoint_b=True))

        grad_a.set_shape(a_shape)
        return grad_a
Example #43
0
def dynamic_decode(decoder,
                   output_time_major=False,
                   impute_finished=False,
                   maximum_iterations=None,
                   parallel_iterations=32,
                   swap_memory=False,
                   scope=None):
  """Perform dynamic decoding with `decoder`.
  Calls initialize() once and step() repeatedly on the Decoder object.
  Args:
    decoder: A `Decoder` instance.
    output_time_major: Python boolean.  Default: `False` (batch major).  If
      `True`, outputs are returned as time major tensors (this mode is faster).
      Otherwise, outputs are returned as batch major tensors (this adds extra
      time to the computation).
    impute_finished: Python boolean.  If `True`, then states for batch
      entries which are marked as finished get copied through and the
      corresponding outputs get zeroed out.  This causes some slowdown at
      each time step, but ensures that the final state and outputs have
      the correct values and that backprop ignores time steps that were
      marked as finished.
    maximum_iterations: `int32` scalar, maximum allowed number of decoding
       steps.  Default is `None` (decode until the decoder is fully done).
    parallel_iterations: Argument passed to `tf.while_loop`.
    swap_memory: Argument passed to `tf.while_loop`.
    scope: Optional variable scope to use.
  Returns:
    `(final_outputs, final_state, final_sequence_lengths)`.
  Raises:
    TypeError: if `decoder` is not an instance of `Decoder`.
    ValueError: if `maximum_iterations` is provided but is not a scalar.
  """
  if not isinstance(decoder, tf.contrib.seq2seq.Decoder):
    raise TypeError("Expected decoder to be type Decoder, but saw: %s" %
                    type(decoder))

  with variable_scope.variable_scope(scope, "decoder") as varscope:
    # Properly cache variable values inside the while_loop
    if varscope.caching_device is None:
      varscope.set_caching_device(lambda op: op.device)

    if maximum_iterations is not None:
      maximum_iterations = ops.convert_to_tensor(
          maximum_iterations, dtype=dtypes.int32, name="maximum_iterations")
      if maximum_iterations.get_shape().ndims != 0:
        raise ValueError("maximum_iterations must be a scalar")

    initial_finished, initial_inputs, initial_state, hist = decoder.initialize()

    zero_outputs = _create_zero_outputs(decoder.output_size,
                                        decoder.output_dtype,
                                        decoder.batch_size)

    if maximum_iterations is not None:
      initial_finished = math_ops.logical_or(
          initial_finished, 0 >= maximum_iterations)
    initial_sequence_lengths = array_ops.zeros_like(
        initial_finished, dtype=dtypes.int32)
    initial_time = constant_op.constant(0, dtype=dtypes.int32)

    def _shape(batch_size, from_shape):
      if not isinstance(from_shape, tensor_shape.TensorShape):
        return tensor_shape.TensorShape(None)
      else:
        batch_size = tensor_util.constant_value(
            ops.convert_to_tensor(
                batch_size, name="batch_size"))
        return tensor_shape.TensorShape([batch_size]).concatenate(from_shape)

    def _create_ta(s, d):
      return tensor_array_ops.TensorArray(
          dtype=d,
          size=0,
          dynamic_size=True,
          element_shape=_shape(decoder.batch_size, s))

    initial_outputs_ta = nest.map_structure(_create_ta, decoder.output_size,
                                            decoder.output_dtype)

    def condition(unused_time, unused_outputs_ta, unused_state, unused_inputs,
                  finished, unused_sequence_lengths, unused_hist):
      return math_ops.logical_not(math_ops.reduce_all(finished))

    def body(time, outputs_ta, state, inputs, finished, sequence_lengths, hist):
      """Internal while_loop body.
      Args:
        time: scalar int32 tensor.
        outputs_ta: structure of TensorArray.
        state: (structure of) state tensors and TensorArrays.
        inputs: (structure of) input tensors.
        finished: bool tensor (keeping track of what's finished).
        sequence_lengths: int32 tensor (keeping track of time of finish).
      Returns:
        `(time + 1, outputs_ta, next_state, next_inputs, next_finished,
          next_sequence_lengths)`.
        ```
      """
      (next_outputs, decoder_state, next_inputs,
       decoder_finished, hist) = decoder.step(time, inputs, state, hist)
      if decoder.tracks_own_finished:
        next_finished = decoder_finished
      else:
        next_finished = math_ops.logical_or(decoder_finished, finished)
      if maximum_iterations is not None:
        next_finished = math_ops.logical_or(
            next_finished, time + 1 >= maximum_iterations)
      next_sequence_lengths = array_ops.where(
          math_ops.logical_and(math_ops.logical_not(finished), next_finished),
          array_ops.fill(array_ops.shape(sequence_lengths), time + 1),
          sequence_lengths)

      nest.assert_same_structure(state, decoder_state)
      nest.assert_same_structure(outputs_ta, next_outputs)
      nest.assert_same_structure(inputs, next_inputs)

      # Zero out output values past finish
      if impute_finished:
        emit = nest.map_structure(
            lambda out, zero: array_ops.where(finished, zero, out),
            next_outputs,
            zero_outputs)
      else:
        emit = next_outputs

      # Copy through states past finish
      def _maybe_copy_state(new, cur):
        # TensorArrays and scalar states get passed through.
        if isinstance(cur, tensor_array_ops.TensorArray):
          pass_through = True
        else:
          new.set_shape(cur.shape)
          pass_through = (new.shape.ndims == 0)
        return new if pass_through else array_ops.where(finished, cur, new)

      if impute_finished:
        next_state = nest.map_structure(
            _maybe_copy_state, decoder_state, state)
      else:
        next_state = decoder_state

      outputs_ta = nest.map_structure(lambda ta, out: ta.write(time, out),
                                      outputs_ta, emit)
      return (time + 1, outputs_ta, next_state, next_inputs, next_finished,
              next_sequence_lengths, hist)

    res = control_flow_ops.while_loop(
        condition,
        body,
        loop_vars=[
            initial_time, initial_outputs_ta, initial_state, initial_inputs,
            initial_finished, initial_sequence_lengths, hist
        ],
        parallel_iterations=parallel_iterations,
        swap_memory=swap_memory)

    final_outputs_ta = res[1]
    final_state = res[2]
    final_sequence_lengths = res[5]

    final_outputs = nest.map_structure(lambda ta: ta.stack(), final_outputs_ta)

    try:
      final_outputs, final_state = decoder.finalize(
          final_outputs, final_state, final_sequence_lengths)
    except NotImplementedError:
      pass

    if not output_time_major:
      final_outputs = nest.map_structure(_transpose_batch_time, final_outputs)

  return final_outputs, final_state, final_sequence_lengths
Example #44
0
def histogram_fixed_width(values,
                          value_range,
                          nbins=100,
                          use_locking=True,
                          dtype=dtypes.int32,
                          name=None):
    """Return histogram of values.

  Given the tensor `values`, this operation returns a rank 1 histogram counting
  the number of entries in `values` that fell into every bin.  The bins are
  equal width and determined by the arguments `value_range` and `nbins`.

  Args:
    values:  Numeric `Tensor`.
    value_range:  Shape [2] `Tensor`.  new_values <= value_range[0] will be
      mapped to hist[0], values >= value_range[1] will be mapped to hist[-1].
      Must be same dtype as new_values.
    nbins:  Integer number of bins in this histogram.
    use_locking:  Boolean.
      If `True`, use locking during the operation (optional).
    dtype:  dtype for returned histogram.
    name:  A name for this operation (defaults to 'histogram_fixed_width').

  Returns:
    A `Variable` holding histogram of values.

  Examples:
  ```python
  # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
  nbins = 5
  value_range = [0.0, 5.0]
  new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]

  with tf.default_session() as sess:
    hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
    variables.initialize_all_variables().run()
    sess.run(hist) => [2, 1, 1, 0, 2]
  ```
  """
    with variable_scope.variable_op_scope([values, value_range], name,
                                          'histogram_fixed_width') as scope:
        values = ops.convert_to_tensor(values, name='values')
        values = array_ops.reshape(values, [-1])
        value_range = ops.convert_to_tensor(value_range, name='value_range')

        # Map tensor values that fall within value_range to [0, 1].
        scaled_values = math_ops.truediv(values - value_range[0],
                                         value_range[1] - value_range[0],
                                         name='scaled_values')

        # map tensor values within the open interval value_range to {0,.., nbins-1},
        # values outside the open interval will be zero or less, or nbins or more.
        indices = math_ops.floor(nbins * scaled_values, name='indices')

        # Clip edge cases (e.g. value = value_range[1]) or "outliers."
        indices = math_ops.cast(clip_ops.clip_by_value(indices, 0, nbins - 1),
                                dtypes.int32)

        # Dummy vector to scatter.
        # TODO(langmore) Replace non-ideal creation of large dummy vector once an
        # alternative to scatter is available.
        updates = array_ops.ones_like(indices, dtype=dtype)

        hist = variable_scope.get_variable(
            'hist',
            initializer=array_ops.zeros_initializer([nbins], dtype=dtype),
            trainable=False)
        hist_assign_zero = hist.assign(array_ops.zeros_like(hist))

        with ops.control_dependencies([hist_assign_zero]):
            return state_ops.scatter_add(hist,
                                         indices,
                                         updates,
                                         use_locking=use_locking,
                                         name=scope.name)
Example #45
0
def sigmoid_cross_entropy_with_logits(logits, targets, name=None):
    """Computes sigmoid cross entropy given `logits`.

  Measures the probability error in discrete classification tasks in which each
  class is independent and not mutually exclusive.  For instance, one could
  perform multilabel classification where a picture can contain both an elephant
  and a dog at the same time.

  For brevity, let `x = logits`, `z = targets`.  The logistic loss is

        z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
      = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
      = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
      = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x))
      = (1 - z) * x + log(1 + exp(-x))
      = x - x * z + log(1 + exp(-x))

  For x < 0, to avoid overflow in exp(-x), we reformulate the above

        x - x * z + log(1 + exp(-x))
      = log(exp(x)) - x * z + log(1 + exp(-x))
      = - x * z + log(1 + exp(x))

  Hence, to ensure stability and avoid overflow, the implementation uses this
  equivalent formulation

      max(x, 0) - x * z + log(1 + exp(-abs(x)))

  `logits` and `targets` must have the same type and shape.

  Args:
    logits: A `Tensor` of type `float32` or `float64`.
    targets: A `Tensor` of the same type and shape as `logits`.
    name: A name for the operation (optional).

  Returns:
    A `Tensor` of the same shape as `logits` with the componentwise
    logistic losses.

  Raises:
    ValueError: If `logits` and `targets` do not have the same shape.
  """
    with ops.name_scope(name, "logistic_loss", [logits, targets]) as name:
        logits = ops.convert_to_tensor(logits, name="logits")
        targets = ops.convert_to_tensor(targets, name="targets")
        try:
            targets.get_shape().merge_with(logits.get_shape())
        except ValueError:
            raise ValueError(
                "logits and targets must have the same shape (%s vs %s)" %
                (logits.get_shape(), targets.get_shape()))

        # The logistic loss formula from above is
        #   x - x * z + log(1 + exp(-x))
        # For x < 0, a more numerically stable formula is
        #   -x * z + log(1 + exp(x))
        # Note that these two expressions can be combined into the following:
        #   max(x, 0) - x * z + log(1 + exp(-abs(x)))
        # To allow computing gradients at zero, we define custom versions of max and
        # abs functions.
        zeros = array_ops.zeros_like(logits, dtype=logits.dtype)
        cond = (logits >= zeros)
        relu_logits = math_ops.select(cond, logits, zeros)
        neg_abs_logits = math_ops.select(cond, -logits, logits)
        return math_ops.add(relu_logits - logits * targets,
                            math_ops.log1p(math_ops.exp(neg_abs_logits)),
                            name=name)
Example #46
0
def _ComplexAbsGrad(op, grad):
    """Returns the gradient of ComplexAbs."""
    # TODO(b/27786104): The cast to complex could be removed once arithmetic
    # supports mixtures of complex64 and real values.
    return (math_ops.complex(grad, array_ops.zeros_like(grad)) *
            math_ops.sign(op.inputs[0]))
Example #47
0
    def call(self, inputs, training=None):
        original_training_value = training
        if training is None:
            training = K.learning_phase()

        in_eager_mode = context.executing_eagerly()
        if self.virtual_batch_size is not None:
            # Virtual batches (aka ghost batches) can be simulated by reshaping the
            # Tensor and reusing the existing batch norm implementation
            original_shape = [-1] + inputs.shape.as_list()[1:]
            expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:]

            # Will cause errors if virtual_batch_size does not divide the batch size
            inputs = array_ops.reshape(inputs, expanded_shape)

            def undo_virtual_batching(outputs):
                outputs = array_ops.reshape(outputs, original_shape)
                return outputs

        if self.fused:
            outputs = self._fused_batch_norm(inputs, training=training)
            if self.virtual_batch_size is not None:
                # Currently never reaches here since fused_batch_norm does not support
                # virtual batching
                outputs = undo_virtual_batching(outputs)
            if not context.executing_eagerly(
            ) and original_training_value is None:
                outputs._uses_learning_phase = True  # pylint: disable=protected-access
            return outputs

        # Compute the axes along which to reduce the mean / variance
        input_shape = inputs.get_shape()
        ndims = len(input_shape)
        reduction_axes = [i for i in range(ndims) if i not in self.axis]
        if self.virtual_batch_size is not None:
            del reduction_axes[1]  # Do not reduce along virtual batch dim

        # Broadcasting only necessary for single-axis batch norm where the axis is
        # not the last dimension
        broadcast_shape = [1] * ndims
        broadcast_shape[self.axis[0]] = input_shape[self.axis[0]].value

        def _broadcast(v):
            if (v is not None and len(v.get_shape()) != ndims
                    and reduction_axes != list(range(ndims - 1))):
                return array_ops.reshape(v, broadcast_shape)
            return v

        scale, offset = _broadcast(self.gamma), _broadcast(self.beta)

        def _compose_transforms(scale, offset, then_scale, then_offset):
            if then_scale is not None:
                scale *= then_scale
                offset *= then_scale
            if then_offset is not None:
                offset += then_offset
            return (scale, offset)

        # Determine a boolean value for `training`: could be True, False, or None.
        training_value = tf_utils.constant_value(training)
        if training_value is not False:
            if self.adjustment:
                adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs))
                # Adjust only during training.
                adj_scale = tf_utils.smart_cond(
                    training, lambda: adj_scale,
                    lambda: array_ops.ones_like(adj_scale))
                adj_bias = tf_utils.smart_cond(
                    training, lambda: adj_bias,
                    lambda: array_ops.zeros_like(adj_bias))
                scale, offset = _compose_transforms(adj_scale, adj_bias, scale,
                                                    offset)

            # Some of the computations here are not necessary when training==False
            # but not a constant. However, this makes the code simpler.
            keep_dims = self.virtual_batch_size is not None or len(
                self.axis) > 1
            mean, variance = nn.moments(inputs,
                                        reduction_axes,
                                        keep_dims=keep_dims)

            moving_mean = self.moving_mean
            moving_variance = self.moving_variance

            mean = tf_utils.smart_cond(training, lambda: mean,
                                       lambda: moving_mean)
            variance = tf_utils.smart_cond(training, lambda: variance,
                                           lambda: moving_variance)

            if self.renorm:
                r, d, new_mean, new_variance = self._renorm_correction_and_moments(
                    mean, variance, training)
                # When training, the normalized values (say, x) will be transformed as
                # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
                # = x * (r * gamma) + (d * gamma + beta) with renorm.
                r = _broadcast(array_ops.stop_gradient(r, name='renorm_r'))
                d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
                scale, offset = _compose_transforms(r, d, scale, offset)
            else:
                new_mean, new_variance = mean, variance

            if self.virtual_batch_size is not None:
                # This isn't strictly correct since in ghost batch norm, you are
                # supposed to sequentially update the moving_mean and moving_variance
                # with each sub-batch. However, since the moving statistics are only
                # used during evaluation, it is more efficient to just update in one
                # step and should not make a significant difference in the result.
                new_mean = math_ops.reduce_mean(new_mean,
                                                axis=1,
                                                keepdims=True)
                new_variance = math_ops.reduce_mean(new_variance,
                                                    axis=1,
                                                    keepdims=True)

            def _do_update(var, value):
                if in_eager_mode and not self.trainable:
                    return

                return self._assign_moving_average(var, value, self.momentum)

            mean_update = tf_utils.smart_cond(
                training, lambda: _do_update(self.moving_mean, new_mean),
                lambda: self.moving_mean)
            variance_update = tf_utils.smart_cond(
                training,
                lambda: _do_update(self.moving_variance, new_variance),
                lambda: self.moving_variance)
            if not context.executing_eagerly():
                self.add_update(mean_update, inputs=True)
                self.add_update(variance_update, inputs=True)

        else:
            mean, variance = self.moving_mean, self.moving_variance

        outputs = nn.batch_normalization(inputs, _broadcast(mean),
                                         _broadcast(variance), offset, scale,
                                         self.epsilon)
        # If some components of the shape got lost due to adjustments, fix that.
        outputs.set_shape(input_shape)

        if self.virtual_batch_size is not None:
            outputs = undo_virtual_batching(outputs)
        if not context.executing_eagerly() and original_training_value is None:
            outputs._uses_learning_phase = True  # pylint: disable=protected-access
        return outputs
Example #48
0
def _compute_sampled_logits(weights,
                            biases,
                            inputs,
                            labels,
                            num_sampled,
                            num_classes,
                            num_true=1,
                            sampled_values=None,
                            subtract_log_q=True,
                            remove_accidental_hits=False,
                            partition_strategy="mod",
                            name=None):

    if not isinstance(weights, list):
        weights = [weights]

    with ops.name_scope(name, "compute_sampled_logits",
                        weights + [biases, inputs, labels]):
        if labels.dtype != dtypes.int64:
            labels = math_ops.cast(labels, dtypes.int64)
        labels_flat = array_ops.reshape(labels, [-1])
        # Sample the negative labels.
        #   sampled shape: [num_sampled] tensor
        #   true_expected_count shape = [batch_size, 1] tensor
        #   sampled_expected_count shape = [num_sampled] tensor
        if sampled_values is None:
            sampled_values = candidate_sampling_ops.log_uniform_candidate_sampler(
                true_classes=labels,
                num_true=num_true,
                num_sampled=num_sampled,
                unique=True,
                range_max=num_classes)
        # NOTE: pylint cannot tell that 'sampled_values' is a sequence
        # pylint: disable=unpacking-non-sequence
        sampled, true_expected_count, sampled_expected_count = sampled_values
        # pylint: enable=unpacking-non-sequence

        # labels_flat is a [batch_size * num_true] tensor
        # sampled is a [num_sampled] int tensor
        all_ids = array_ops.concat(0, [labels_flat, sampled])

        # weights shape is [num_classes, dim]
        all_w = embedding_ops.embedding_lookup(
            weights, all_ids, partition_strategy=partition_strategy)
        all_b = embedding_ops.embedding_lookup(biases, all_ids)
        # true_w shape is [batch_size * num_true, dim]
        # true_b is a [batch_size * num_true] tensor
        true_w = array_ops.slice(all_w, [0, 0],
                                 array_ops.pack(
                                     [array_ops.shape(labels_flat)[0],
                                      -1]))  # 128*128
        true_b = array_ops.slice(all_b, [0], array_ops.shape(labels_flat))

        # inputs shape is [batch_size, dim]
        # true_w shape is [batch_size * num_true, dim]
        # row_wise_dots is [batch_size, num_true, dim]
        dim = array_ops.shape(true_w)[1:2]
        new_true_w_shape = array_ops.concat(0, [[-1, num_true], dim])
        row_wise_dots = math_ops.mul(
            array_ops.expand_dims(inputs, 1),  # 128*1*128
            array_ops.reshape(true_w, new_true_w_shape))  # 128*1*128
        # We want the row-wise dot plus biases which yields a
        # [batch_size, num_true] tensor of true_logits.
        dots_as_matrix = array_ops.reshape(row_wise_dots,
                                           array_ops.concat(0, [[-1], dim]))
        true_logits = array_ops.reshape(_sum_rows(dots_as_matrix),
                                        [-1, num_true])
        true_b = array_ops.reshape(true_b, [-1, num_true])
        true_logits += true_b

        # Lookup weights and biases for sampled labels.
        #   sampled_w shape is [num_sampled, dim]
        #   sampled_b is a [num_sampled] float tensor
        sampled_w = array_ops.slice(
            all_w, array_ops.pack([array_ops.shape(labels_flat)[0], 0]),
            [-1, -1])
        sampled_b = array_ops.slice(all_b, array_ops.shape(labels_flat), [-1])

        # inputs has shape [batch_size, dim]
        # sampled_w has shape [num_sampled, dim]
        # sampled_b has shape [num_sampled]
        # Apply X*W'+B, which yields [batch_size, num_sampled]
        sampled_logits = math_ops.matmul(inputs, sampled_w,
                                         transpose_b=True) + sampled_b
        if remove_accidental_hits:
            acc_hits = candidate_sampling_ops.compute_accidental_hits(
                labels, sampled, num_true=num_true)
            acc_indices, acc_ids, acc_weights = acc_hits

            # This is how SparseToDense expects the indices.
            acc_indices_2d = array_ops.reshape(acc_indices, [-1, 1])
            acc_ids_2d_int32 = array_ops.reshape(
                math_ops.cast(acc_ids, dtypes.int32), [-1, 1])
            sparse_indices = array_ops.concat(
                1, [acc_indices_2d, acc_ids_2d_int32], "sparse_indices")
            # Create sampled_logits_shape = [batch_size, num_sampled]
            sampled_logits_shape = array_ops.concat(0, [
                array_ops.shape(labels)[:1],
                array_ops.expand_dims(num_sampled, 0)
            ])
            if sampled_logits.dtype != acc_weights.dtype:
                acc_weights = math_ops.cast(acc_weights, sampled_logits.dtype)
            sampled_logits += sparse_ops.sparse_to_dense(
                sparse_indices,
                sampled_logits_shape,
                acc_weights,
                default_value=0.0,
                validate_indices=False)

        if subtract_log_q:
            # Subtract log of Q(l), prior probability that l appears in sampled.
            true_logits -= math_ops.log(true_expected_count)
            sampled_logits -= math_ops.log(sampled_expected_count)

        # Construct output logits and labels. The true labels/logits start at col 0.
        out_logits = array_ops.concat(1, [true_logits, sampled_logits])
        # true_logits is a float tensor, ones_like(true_logits) is a float tensor
        # of ones. We then divide by num_true to ensure the per-example labels sum
        # to 1.0, i.e. form a proper probability distribution.
        out_labels = array_ops.concat(1, [
            array_ops.ones_like(true_logits) / num_true,
            array_ops.zeros_like(sampled_logits)
        ])
    return out_logits, out_labels
Example #49
0
    def get_dp_train_ops(self, epsilon, data_size, learning_rate=0.1):
        # compute Laplace noise injected into coefficients h
        Delta = self.n_in * (self.n_out + 1 / 4 * self.n_out**2)
        perturbFM = np.random.laplace(0.0, Delta / (epsilon * data_size),
                                      self.n_out)
        perturbFM = np.reshape(perturbFM, [self.n_out])

        # compute h terms
        activation_h = self.propup(self.input) + perturbFM
        # reconstruct v terms
        activation_v = self.propdown(activation_h)
        '''
        Computes sigmoid cross entropy given `logits`, i.e., x = logits = activation_v, and z = self.input = labels.
            
        Measures the probability error in discrete classification tasks in which each
        class, i.e., each visible neuron, is independent and not mutually exclusive.
            
        The logistic loss is
        z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x))
        = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x)))
        = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x)))
        = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x))
        = (1 - z) * x + log(1 + exp(-x))
        = x - x * z + log(1 + exp(-x))
            
        For x < 0, to avoid overflow in exp(-x), we reformulate the above
            
        x - x * z + log(1 + exp(-x))
        = log(exp(x)) - x * z + log(1 + exp(-x))
        = - x * z + log(1 + exp(x))
            
        Hence, to ensure stability and avoid overflow, the implementation uses this
        equivalent formulation
            
        max(x, 0) - x * z + log(1 + exp(-abs(x)))
            
        `logits` and `labels` must have the same type and shape. Let denote neg_abs_logits = -abs(activation_v) = -abs(W^T * activation_h). By Applying Taylor Expansion, we have:
            
        Taylor = max(activation_v, 0) - activation_v * self.input + log(1 + exp(-abs(activation_v)));
            = max(activation_h * W^T, 0) -  self.input * (activation_h * W^T) + (math.log(2.0) + 0.5*neg_abs_logits + 1.0/8.0*neg_abs_logits**2)
            = max(activation_h * W^T, 0) -  self.input * (activation_h * W^T) + (math.log(2.0) + 0.5*(-abs(activation_h * W^T)) + 1.0/8.0*(-abs(activation_h * W^T))**2)
            
        To ensure that Taylor is differentially private, we need to perturb all the coefficients, including the terms activation_h * W^T, self.input * (W^T * activation_h).
            
        Since 'self.input *' is an element-wise multiplication, activation_h can be considered coefficients of the term self.input * (W^T * activation_h). By applying Funtional Mechanism, we perturb self.input * (W^T * activation_h) as tf.matmul(activation_h + perturbFM, W^T) * self.input:
            
        activation_h += perturbFM; # activation_h = self.propup(self.input) + perturbFM; # (Lemma 2) where
        
        Delta = self.n_in*(self.n_out + 1/4 * self.n_out**2);
        perturbFM = np.random.laplace(0.0, Delta/(epsilon*data_size), self.n_out)
        perturbFM = np.reshape(perturbFM, [self.n_out]);
            
        To allow computing gradients at zero, we define custom versions of max and abs functions [Tensorflow].
            
        Source: https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/python/ops/nn_impl.py @ TensorFlow
        '''

        zeros = array_ops.zeros_like(activation_v, dtype=self.input.dtype)
        cond = (activation_v >= zeros)
        relu_logits = array_ops.where(cond, activation_v, zeros)
        neg_abs_logits = array_ops.where(cond, -activation_v, activation_v)
        #Taylor = math_ops.add(relu_logits - activation_v * self.input, math.log(2.0) + 0.5*neg_abs_logits + 1.0/8.0*neg_abs_logits**2)
        self.cost = math_ops.add(
            relu_logits - activation_v * self.input,
            math.log(2.0) + 0.5 * neg_abs_logits +
            1.0 / 8.0 * neg_abs_logits**2)
        optimizer = tf.train.AdamOptimizer(learning_rate).minimize(
            self.cost, var_list=self.params)

        return optimizer
Example #50
0
def safe_embedding_lookup_sparse(embedding_weights,
                                 sparse_ids,
                                 sparse_weights=None,
                                 combiner="mean",
                                 default_id=None,
                                 name=None,
                                 partition_strategy="div",
                                 max_norm=None):
  """Lookup embedding results, accounting for invalid IDs and empty features.

  The partitioned embedding in `embedding_weights` must all be the same shape
  except for the first dimension. The first dimension is allowed to vary as the
  vocabulary size is not necessarily a multiple of `P`.  `embedding_weights`
  may be a `PartitionedVariable` as returned by using
  `tf.compat.v1.get_variable()` with a
  partitioner.

  Invalid IDs (< 0) are pruned from input IDs and weights, as well as any IDs
  with non-positive weight. For an entry with no features, the embedding vector
  for `default_id` is returned, or the 0-vector if `default_id` is not supplied.

  The ids and weights may be multi-dimensional. Embeddings are always aggregated
  along the last dimension.

  Args:
    embedding_weights:  A list of `P` float `Tensor`s or values representing
      partitioned embedding `Tensor`s.  Alternatively, a `PartitionedVariable`
      created by partitioning along dimension 0.  The total unpartitioned shape
      should be `[e_0, e_1, ..., e_m]`, where `e_0` represents the vocab size
      and `e_1, ..., e_m` are the embedding dimensions.
    sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the
      ids. `d_0` is typically batch size.
    sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing
      float weights corresponding to `sparse_ids`, or `None` if all weights are
      be assumed to be 1.0.
    combiner: A string specifying how to combine embedding results for each
      entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean" the
      default.
    default_id: The id to use for an entry with no features.
    name: A name for this operation (optional).
    partition_strategy: A string specifying the partitioning strategy. Currently
      `"div"` and `"mod"` are supported. Default is `"div"`.
    max_norm: If not `None`, all embeddings are l2-normalized to max_norm before
      combining.

  Returns:
    Dense `Tensor` of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`.

  Raises:
    ValueError: if `embedding_weights` is empty.
  """
  if embedding_weights is None:
    raise ValueError("Missing embedding_weights %s." % embedding_weights)
  if isinstance(embedding_weights, variables.PartitionedVariable):
    embedding_weights = list(embedding_weights)  # get underlying Variables.
  if not isinstance(embedding_weights, list):
    embedding_weights = [embedding_weights]
  if len(embedding_weights) < 1:
    raise ValueError("Missing embedding_weights %s." % embedding_weights)

  dtype = sparse_weights.dtype if sparse_weights is not None else None
  embedding_weights = [
      w if (isinstance(w, resource_variable_ops.ResourceVariable)
            and dtype in (None, w.dtype))
      else ops.convert_to_tensor(w, dtype=dtype)
      for w in embedding_weights
  ]

  with ops.name_scope(name, "embedding_lookup", embedding_weights +
                      [sparse_ids, sparse_weights]) as scope:
    # Reshape higher-rank sparse ids and weights to linear segment ids.
    original_shape = sparse_ids.dense_shape
    original_rank_dim = tensor_shape.dimension_value(
        sparse_ids.dense_shape.get_shape()[0])
    original_rank = (
        array_ops.size(original_shape)
        if original_rank_dim is None else original_rank_dim)
    sparse_ids = sparse_ops.sparse_reshape(sparse_ids, [
        math_ops.reduce_prod(
            array_ops.slice(original_shape, [0], [original_rank - 1])),
        array_ops.gather(original_shape, original_rank - 1)
    ])
    if sparse_weights is not None:
      sparse_weights = sparse_tensor.SparseTensor(sparse_ids.indices,
                                                  sparse_weights.values,
                                                  sparse_ids.dense_shape)

    # Prune invalid ids and weights.
    sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights)
    if combiner != "sum":
      sparse_ids, sparse_weights = _prune_invalid_weights(
          sparse_ids, sparse_weights)

    # Fill in dummy values for empty features, if necessary.
    sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(
        sparse_ids, default_id or 0)
    if sparse_weights is not None:
      sparse_weights, _ = sparse_ops.sparse_fill_empty_rows(sparse_weights, 1.0)

    result = embedding_lookup_sparse(
        embedding_weights,
        sparse_ids,
        sparse_weights,
        combiner=combiner,
        partition_strategy=partition_strategy,
        name=None if default_id is None else scope,
        max_norm=max_norm)

    if default_id is None:
      # Broadcast is_row_empty to the same shape as embedding_lookup_result,
      # for use in Select.
      is_row_empty = array_ops.tile(
          array_ops.reshape(is_row_empty, [-1, 1]),
          array_ops.stack([1, array_ops.shape(result)[1]]))

      result = array_ops.where(
          is_row_empty, array_ops.zeros_like(result), result, name=scope)

    # Reshape back from linear ids back into higher-dimensional dense result.
    final_result = array_ops.reshape(
        result,
        array_ops.concat([
            array_ops.slice(
                math_ops.cast(original_shape, dtypes.int32), [0],
                [original_rank - 1]),
            array_ops.slice(array_ops.shape(result), [1], [-1])
        ], 0))
    final_result.set_shape(
        tensor_shape.unknown_shape(
            (tensor_shape.Dimension(original_rank_dim) - 1).value).concatenate(
                result.get_shape()[1:]))
    return final_result
Example #51
0
def batch_matrix_pow(matrices, powers):
  """Compute powers of matrices, e.g. A^3 = matmul(matmul(A, A), A).

  Uses exponentiation by squaring, with O(log(p)) matrix multiplications to
  compute A^p.

  Args:
    matrices: [batch size x N x N]
    powers: Which integer power to raise each matrix to [batch size]
  Returns:
    The matrices raised to their respective powers, same dimensions as the
    "matrices" argument.
  """

  def terminate_when_all_zero(current_argument, residual_powers, accumulator):
    del current_argument, accumulator  # not used for condition
    do_exit = math_ops.reduce_any(
        math_ops.greater(residual_powers, array_ops.ones_like(residual_powers)))
    return do_exit

  def do_iteration(current_argument, residual_powers, accumulator):
    """Compute one step of iterative exponentiation by squaring.

    The recursive form is:
      power(A, p) = { power(matmul(A, A), p / 2) for even p
                    { matmul(A, power(matmul(A, A), (p - 1) / 2)) for odd p
      power(A, 0) = I

    The power(A, 0) = I case is handled by starting with accumulator set to the
    identity matrix; matrices with zero residual powers are passed through
    unchanged.

    Args:
      current_argument: On this step, what is the first argument (A^2..^2) to
          the (unrolled) recursive function? [batch size x N x N]
      residual_powers: On this step, what is the second argument (residual p)?
          [batch_size]
      accumulator: Accumulates the exterior multiplications from the odd
          powers (initially the identity matrix). [batch_size x N x N]
    Returns:
      Updated versions of each argument for one step of the unrolled
      computation. Does not change parts of the batch which have a residual
      power of zero.
    """
    is_even = math_ops.equal(residual_powers % 2,
                             array_ops.zeros(
                                 array_ops.shape(residual_powers),
                                 dtype=dtypes.int32))
    new_accumulator = array_ops.where(is_even, accumulator,
                                      math_ops.matmul(accumulator,
                                                      current_argument))
    new_argument = math_ops.matmul(current_argument, current_argument)
    do_update = math_ops.greater(residual_powers, 1)
    new_residual_powers = residual_powers - residual_powers % 2
    new_residual_powers //= 2
    # Stop updating if we've reached our base case; some batch elements may
    # finish sooner than others
    accumulator = array_ops.where(do_update, new_accumulator, accumulator)
    current_argument = array_ops.where(do_update, new_argument,
                                       current_argument)
    residual_powers = array_ops.where(do_update, new_residual_powers,
                                      residual_powers)
    return (current_argument, residual_powers, accumulator)

  matrices = ops.convert_to_tensor(matrices)
  powers = math_ops.cast(powers, dtype=dtypes.int32)
  ident = array_ops.expand_dims(
      array_ops.diag(
          array_ops.ones([array_ops.shape(matrices)[1]], dtype=matrices.dtype)),
      0)
  ident_tiled = array_ops.tile(ident, [array_ops.shape(matrices)[0], 1, 1])
  (final_argument,
   final_residual_power, final_accumulator) = control_flow_ops.while_loop(
       terminate_when_all_zero, do_iteration, [matrices, powers, ident_tiled])
  return array_ops.where(
      math_ops.equal(final_residual_power,
                     array_ops.zeros_like(
                         final_residual_power, dtype=dtypes.int32)),
      ident_tiled, math_ops.matmul(final_argument, final_accumulator))
Example #52
0
    def _renorm_correction_and_moments(self, mean, variance, training):
        """Returns the correction and update values for renorm."""
        stddev = math_ops.sqrt(variance + self.epsilon)
        # Compute the average mean and standard deviation, as if they were
        # initialized with this batch's moments.
        mixed_renorm_mean = (self.renorm_mean +
                             (1. - self.renorm_mean_weight) * mean)
        mixed_renorm_stddev = (self.renorm_stddev +
                               (1. - self.renorm_stddev_weight) * stddev)
        # Compute the corrections for batch renorm.
        r = stddev / mixed_renorm_stddev
        d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
        # Ensure the corrections use pre-update moving averages.
        with ops.control_dependencies([r, d]):
            mean = array_ops.identity(mean)
            stddev = array_ops.identity(stddev)
        rmin, rmax, dmax = [
            self.renorm_clipping.get(key) for key in ['rmin', 'rmax', 'dmax']
        ]
        if rmin is not None:
            r = math_ops.maximum(r, rmin)
        if rmax is not None:
            r = math_ops.minimum(r, rmax)
        if dmax is not None:
            d = math_ops.maximum(d, -dmax)
            d = math_ops.minimum(d, dmax)
        # When not training, use r=1, d=0.
        r = tf_utils.smart_cond(training, lambda: r,
                                lambda: array_ops.ones_like(r))
        d = tf_utils.smart_cond(training, lambda: d,
                                lambda: array_ops.zeros_like(d))

        def _update_renorm_variable(var, weight, value):
            """Updates a moving average and weight, returns the unbiased value."""
            value = array_ops.identity(value)

            def _do_update():
                """Updates the var and weight, returns their updated ratio."""
                # Update the variables without zero debiasing. The debiasing will be
                # accomplished by dividing the exponential moving average by the weight.
                # For example, after a single update, the moving average would be
                # (1-decay) * value. and the weight will be 1-decay, with their ratio
                # giving the value.
                # Make sure the weight is not updated until before r and d computation.
                with ops.control_dependencies([value]):
                    weight_value = array_ops.constant(1., dtype=weight.dtype)
                new_var = self._assign_moving_average(var, value,
                                                      self.renorm_momentum)
                new_weight = self._assign_moving_average(
                    weight, weight_value, self.renorm_momentum)
                # TODO(yuefengz): the updates to var and weighted can not be batched
                # together if we fetch their updated values here. Consider calculating
                # new values and delaying the updates.
                return new_var / new_weight

            def _fake_update():
                return array_ops.identity(var)

            return tf_utils.smart_cond(training, _do_update, _fake_update)

        # TODO(yuefengz): colocate the operations
        new_mean = _update_renorm_variable(self.renorm_mean,
                                           self.renorm_mean_weight, mean)
        new_stddev = _update_renorm_variable(self.renorm_stddev,
                                             self.renorm_stddev_weight, stddev)
        # Make sqrt(moving_variance + epsilon) = new_stddev.
        new_variance = math_ops.square(new_stddev) - self.epsilon

        return (r, d, new_mean, new_variance)
Example #53
0
  def call(self, inputs, training=None):
    training = self._get_training_value(training)

    if self.virtual_batch_size is not None:
      # Virtual batches (aka ghost batches) can be simulated by reshaping the
      # Tensor and reusing the existing batch norm implementation
      original_shape = [-1] + inputs.shape.as_list()[1:]
      expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:]

      # Will cause errors if virtual_batch_size does not divide the batch size
      inputs = array_ops.reshape(inputs, expanded_shape)

      def undo_virtual_batching(outputs):
        outputs = array_ops.reshape(outputs, original_shape)
        return outputs

    if self.fused:
      outputs = self._fused_batch_norm(inputs, training=training)
      if self.virtual_batch_size is not None:
        # Currently never reaches here since fused_batch_norm does not support
        # virtual batching
        outputs = undo_virtual_batching(outputs)
      return outputs

    # Compute the axes along which to reduce the mean / variance
    input_shape = inputs.shape
    ndims = len(input_shape)
    reduction_axes = [i for i in range(ndims) if i not in self.axis]
    if self.virtual_batch_size is not None:
      del reduction_axes[1]     # Do not reduce along virtual batch dim

    # Broadcasting only necessary for single-axis batch norm where the axis is
    # not the last dimension
    broadcast_shape = [1] * ndims
    broadcast_shape[self.axis[0]] = input_shape.dims[self.axis[0]].value
    def _broadcast(v):
      if (v is not None and len(v.shape) != ndims and
          reduction_axes != list(range(ndims - 1))):
        return array_ops.reshape(v, broadcast_shape)
      return v

    scale, offset = _broadcast(self.gamma), _broadcast(self.beta)

    def _compose_transforms(scale, offset, then_scale, then_offset):
      if then_scale is not None:
        scale *= then_scale
        offset *= then_scale
      if then_offset is not None:
        offset += then_offset
      return (scale, offset)

    # Determine a boolean value for `training`: could be True, False, or None.
    training_value = tf_utils.constant_value(training)
    if training_value == False:  # pylint: disable=singleton-comparison,g-explicit-bool-comparison
      mean, variance = self.moving_mean, self.moving_variance
    else:
      if self.adjustment:
        adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs))
        # Adjust only during training.
        adj_scale = tf_utils.smart_cond(training,
                                        lambda: adj_scale,
                                        lambda: array_ops.ones_like(adj_scale))
        adj_bias = tf_utils.smart_cond(training,
                                       lambda: adj_bias,
                                       lambda: array_ops.zeros_like(adj_bias))
        scale, offset = _compose_transforms(adj_scale, adj_bias, scale, offset)

      # Some of the computations here are not necessary when training==False
      # but not a constant. However, this makes the code simpler.
      keep_dims = self.virtual_batch_size is not None or len(self.axis) > 1
      mean, variance = self._moments(
          math_ops.cast(inputs, self._param_dtype),
          reduction_axes,
          keep_dims=keep_dims)

      moving_mean = self.moving_mean
      moving_variance = self.moving_variance

      mean = tf_utils.smart_cond(training,
                                 lambda: mean,
                                 lambda: ops.convert_to_tensor(moving_mean))
      variance = tf_utils.smart_cond(
          training,
          lambda: variance,
          lambda: ops.convert_to_tensor(moving_variance))

      if self.virtual_batch_size is not None:
        # This isn't strictly correct since in ghost batch norm, you are
        # supposed to sequentially update the moving_mean and moving_variance
        # with each sub-batch. However, since the moving statistics are only
        # used during evaluation, it is more efficient to just update in one
        # step and should not make a significant difference in the result.
        new_mean = math_ops.reduce_mean(mean, axis=1, keepdims=True)
        new_variance = math_ops.reduce_mean(variance, axis=1, keepdims=True)
      else:
        new_mean, new_variance = mean, variance

      if self._support_zero_size_input():
        inputs_size = array_ops.size(inputs)
      else:
        inputs_size = None
      if self.renorm:
        r, d, new_mean, new_variance = self._renorm_correction_and_moments(
            new_mean, new_variance, training, inputs_size)
        # When training, the normalized values (say, x) will be transformed as
        # x * gamma + beta without renorm, and (x * r + d) * gamma + beta
        # = x * (r * gamma) + (d * gamma + beta) with renorm.
        r = _broadcast(array_ops.stop_gradient(r, name='renorm_r'))
        d = _broadcast(array_ops.stop_gradient(d, name='renorm_d'))
        scale, offset = _compose_transforms(r, d, scale, offset)

      def _do_update(var, value):
        """Compute the updates for mean and variance."""
        return self._assign_moving_average(var, value, self.momentum,
                                           inputs_size)

      def mean_update():
        true_branch = lambda: _do_update(self.moving_mean, new_mean)
        false_branch = lambda: self.moving_mean
        return tf_utils.smart_cond(training, true_branch, false_branch)

      def variance_update():
        """Update the moving variance."""

        def true_branch_renorm():
          # We apply epsilon as part of the moving_stddev to mirror the training
          # code path.
          moving_stddev = _do_update(self.moving_stddev,
                                     math_ops.sqrt(new_variance + self.epsilon))
          return self._assign_new_value(
              self.moving_variance,
              # Apply relu in case floating point rounding causes it to go
              # negative.
              K.relu(moving_stddev * moving_stddev - self.epsilon))

        if self.renorm:
          true_branch = true_branch_renorm
        else:
          true_branch = lambda: _do_update(self.moving_variance, new_variance)

        false_branch = lambda: self.moving_variance
        return tf_utils.smart_cond(training, true_branch, false_branch)

      self.add_update(mean_update)
      self.add_update(variance_update)

    mean = math_ops.cast(mean, inputs.dtype)
    variance = math_ops.cast(variance, inputs.dtype)
    if offset is not None:
      offset = math_ops.cast(offset, inputs.dtype)
    if scale is not None:
      scale = math_ops.cast(scale, inputs.dtype)
    # TODO(reedwm): Maybe do math in float32 if given float16 inputs, if doing
    # math in float16 hurts validation accuracy of popular models like resnet.
    outputs = nn.batch_normalization(inputs,
                                     _broadcast(mean),
                                     _broadcast(variance),
                                     offset,
                                     scale,
                                     self.epsilon)
    # If some components of the shape got lost due to adjustments, fix that.
    outputs.set_shape(input_shape)

    if self.virtual_batch_size is not None:
      outputs = undo_virtual_batching(outputs)
    return outputs
Example #54
0
 def _zeros(self):
     return array_ops.zeros_like(self.a + self.b)
Example #55
0
 def _variance(self):
   return array_ops.zeros_like(self.loc)
def safe_embedding_lookup_sparse(
    embedding_weights,
    sparse_ids,
    sparse_weights=None,
    combiner="mean",
    default_id=None,
    name="safe_embedding_lookup_sparse",
    partition_strategy=None,  # no used
    max_norm=None,
    return_trainable=False,
):
    """Provides a dynamic version of `tf.nn.safe_embedding_lookup_sparse`.

    Lookup embedding results, accounting for empty features and invalid weights.

    Any IDs will be treated as valid include non-positive IDs.
    Invalid weights (<= 0) are pruned from input weights, as well as any IDs
    with non-positive weight. For an entry with no features, the embedding vector
    for `default_id` is returned, or the 0-vector if `default_id` is not supplied.

    The ids and weights may be multi-dimensional. Embeddings are always aggregated
    along the last dimension.

    Args:
      embedding_weights: A single `dynamic_embedding.Variable` instance
        representing the complete embedding tensor.
      sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the
        ids. `d_0` is typically batch size.
      sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing
        float weights corresponding to `sparse_ids`, or `None` if all weights are
        be assumed to be 1.0.
      combiner: A string specifying how to combine embedding results for each
        entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean" the
        default.
      default_id: The id to use for an entry with no features.
      name: A name for this operation (optional).
      partition_strategy: A string specifying the partitioning strategy. Currently
        `"div"` and `"mod"` are supported. Default is `"div"`.
      max_norm: If not `None`, all embeddings are l2-normalized to max_norm before
        combining.

    Returns:
      combined_embeddings:
        A dense `Tensor` of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`.
      trainable_wrap:
        A TrainableWrapper object used to fill the Optimizers `var_list`
          Only provided if `return_trainable` is True.

    Raises:
      ValueError: if `embedding_weights` is empty.
    """
    if embedding_weights is None:
        raise ValueError("Missing embedding_weights %s." % embedding_weights)

    if embedding_weights.key_dtype != sparse_ids.dtype:
        raise TypeError(
            "embedding_weights.key_dtype should be same with sparse_ids.dtype: "
            "{} vs. {}".format(embedding_weights.key_dtype, sparse_ids.dtype))

    weights_dtype = sparse_weights.dtype if sparse_weights is not None else None
    if weights_dtype and embedding_weights.value_dtype != weights_dtype:
        raise TypeError(
            "embedding_weights.value_dtype should be same with sparse_weights.dtype"
            ": {} vs. {}".format(embedding_weights.value_dtype, weights_dtype))

    scope = variable_scope.get_variable_scope()
    full_name = scope.name + "/" + name if scope.name else name
    with ops.name_scope(full_name + "/"):
        # Reshape higher-rank sparse ids and weights to linear segment ids.
        original_shape = sparse_ids.dense_shape
        original_rank_dim = tensor_shape.dimension_value(
            sparse_ids.dense_shape.get_shape()[0])
        original_rank = (array_ops.size(original_shape)
                         if original_rank_dim is None else original_rank_dim)
        sparse_ids = sparse_ops.sparse_reshape(
            sparse_ids,
            [
                math_ops.reduce_prod(
                    array_ops.slice(original_shape, [0], [original_rank - 1])),
                array_ops.gather(original_shape, original_rank - 1),
            ],
        )
        if sparse_weights is not None:
            sparse_weights = sparse_tensor.SparseTensor(
                sparse_ids.indices, sparse_weights.values,
                sparse_ids.dense_shape)

        # Prune invalid weights.
        if combiner != "sum":
            sparse_ids, sparse_weights = _prune_invalid_weights(
                sparse_ids, sparse_weights)

        # Fill in dummy values for empty features, if necessary.
        sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(
            sparse_ids, default_id or 0)
        if sparse_weights is not None:
            sparse_weights, _ = sparse_ops.sparse_fill_empty_rows(
                sparse_weights, 1.0)

        result, trainable_ = embedding_lookup_sparse(
            embedding_weights,
            sparse_ids,
            sparse_weights,
            combiner=combiner,
            partition_strategy=partition_strategy,
            name=name + "/embedding_lookup_sparse",
            max_norm=max_norm,
            return_trainable=True,
        )

        if default_id is None:
            # Broadcast is_row_empty to the same shape as embedding_lookup_result,
            # for use in Select.
            is_row_empty = array_ops.tile(
                array_ops.reshape(is_row_empty, [-1, 1]),
                array_ops.stack([1, array_ops.shape(result)[1]]),
            )

            result = array_ops.where(is_row_empty,
                                     array_ops.zeros_like(result),
                                     result,
                                     name="where")

        # Reshape back from linear ids back into higher-dimensional dense result.
        final_result = array_ops.reshape(
            result,
            array_ops.concat(
                [
                    array_ops.slice(
                        math_ops.cast(original_shape, dtypes.int32),
                        [0],
                        [original_rank - 1],
                    ),
                    array_ops.slice(array_ops.shape(result), [1], [-1]),
                ],
                0,
            ),
        )
        final_result.set_shape(
            tensor_shape.unknown_shape(
                (tensor_shape.Dimension(original_rank_dim) -
                 1).value).concatenate(result.get_shape()[1:]))
        return (final_result, trainable_) if return_trainable else final_result
 def testNonDictMultiplierRaisesError(self):
     gradient = constant_op.constant(self._grad_vec, dtype=dtypes.float32)
     variable = variables_lib.Variable(array_ops.zeros_like(gradient))
     grad_to_var = (gradient, variable)
     with self.assertRaises(ValueError):
         learning.multiply_gradients([grad_to_var], 3)
Example #58
0
    def _renorm_correction_and_moments(self, mean, variance, training):
        """Returns the correction and update values for renorm."""
        stddev = math_ops.sqrt(variance + self.epsilon)
        # Compute the average mean and standard deviation, as if they were
        # initialized with this batch's moments.
        mixed_renorm_mean = (self.renorm_mean +
                             (1. - self.renorm_mean_weight) * mean)
        mixed_renorm_stddev = (self.renorm_stddev +
                               (1. - self.renorm_stddev_weight) * stddev)
        # Compute the corrections for batch renorm.
        r = stddev / mixed_renorm_stddev
        d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
        # Ensure the corrections use pre-update moving averages.
        with ops.control_dependencies([r, d]):
            mean = array_ops.identity(mean)
            stddev = array_ops.identity(stddev)
        rmin, rmax, dmax = [
            self.renorm_clipping.get(key) for key in ['rmin', 'rmax', 'dmax']
        ]
        if rmin is not None:
            r = math_ops.maximum(r, rmin)
        if rmax is not None:
            r = math_ops.minimum(r, rmax)
        if dmax is not None:
            d = math_ops.maximum(d, -dmax)
            d = math_ops.minimum(d, dmax)
        # When not training, use r=1, d=0, and decay=1 meaning no updates.
        r = _smart_select(training, lambda: r, lambda: array_ops.ones_like(r))
        d = _smart_select(training, lambda: d, lambda: array_ops.zeros_like(d))
        decay = _smart_select(training, lambda: self.renorm_momentum,
                              lambda: 1.)

        def _update_renorm_variable(var, weight, value):
            """Updates a moving average and weight, returns the unbiased value."""
            # Update the variables without zero debiasing. The debiasing will be
            # accomplished by dividing the exponential moving average by the weight.
            # For example, after a single update, the moving average would be
            # (1-decay) * value. and the weight will be 1-decay, with their ratio
            # giving value.
            # Make sure the weight is not updated until before r and d computation.
            value = array_ops.identity(value)
            with ops.control_dependencies([value]):
                weight_value = array_ops.constant(1., dtype=weight.dtype)
            new_var = moving_averages.assign_moving_average(var,
                                                            value,
                                                            decay,
                                                            zero_debias=False)
            new_weight = moving_averages.assign_moving_average(
                weight, weight_value, decay, zero_debias=False)
            return new_var / new_weight

        with ops.colocate_with(self.moving_mean):
            new_mean = _update_renorm_variable(self.renorm_mean,
                                               self.renorm_mean_weight, mean)
        with ops.colocate_with(self.moving_variance):
            new_stddev = _update_renorm_variable(self.renorm_stddev,
                                                 self.renorm_stddev_weight,
                                                 stddev)
            # Make sqrt(moving_variance + epsilon) = new_stddev.
            new_variance = math_ops.square(new_stddev) - self.epsilon

        return (r, d, new_mean, new_variance)
Example #59
0
def _SelectGrad(op, grad):
    c = op.inputs[0]
    x = op.inputs[1]
    zeros = array_ops.zeros_like(x)
    return (None, array_ops.where(c, grad,
                                  zeros), array_ops.where(c, zeros, grad))
Example #60
0
 def testZerosLikePartialShape(self):
     d = array_ops.placeholder(dtypes_lib.float32, shape=[None, 4, None])
     z = array_ops.zeros_like(d)
     self.assertEqual(d.get_shape().as_list(), z.get_shape().as_list())