def _log_survival_function(self, y): low = self._low high = self._high # Recall the promise: # survival_function(y) := P[Y > y] # = 0, if y >= high, # = 1, if y < low, # = P[X > y], otherwise. # P[Y > j] = P[ceiling(Y) > j] since mass is only at integers, not in # between. j = math_ops.ceil(y) # P[X > j], used when low < X < high. result_so_far = self.distribution.log_survival_function(j) # Broadcast, because it's possible that this is a single distribution being # evaluated on a number of samples, or something like that. j += array_ops.zeros_like(result_so_far) # Re-define values at the cutoffs. if low is not None: result_so_far = array_ops.where(j < low, array_ops.zeros_like(result_so_far), result_so_far) if high is not None: neg_inf = -np.inf * array_ops.ones_like(result_so_far) result_so_far = array_ops.where(j >= high, neg_inf, result_so_far) return result_so_far
def _cdf(self, y): low = self._low high = self._high # Recall the promise: # cdf(y) := P[Y <= y] # = 1, if y >= high, # = 0, if y < low, # = P[X <= y], otherwise. # P[Y <= j] = P[floor(Y) <= j] since mass is only at integers, not in # between. j = math_ops.floor(y) # P[X <= j], used when low < X < high. result_so_far = self.distribution.cdf(j) # Broadcast, because it's possible that this is a single distribution being # evaluated on a number of samples, or something like that. j += array_ops.zeros_like(result_so_far) # Re-define values at the cutoffs. if low is not None: result_so_far = array_ops.where(j < low, array_ops.zeros_like(result_so_far), result_so_far) if high is not None: result_so_far = array_ops.where(j >= high, array_ops.ones_like(result_so_far), result_so_far) return result_so_far
def _survival_function(self, y): lower_cutoff = self._lower_cutoff upper_cutoff = self._upper_cutoff # Recall the promise: # survival_function(y) := P[Y > y] # = 0, if y >= upper_cutoff, # = 1, if y < lower_cutoff, # = P[X > y], otherwise. # P[Y > j] = P[ceiling(Y) > j] since mass is only at integers, not in # between. j = math_ops.ceil(y) # P[X > j], used when lower_cutoff < X < upper_cutoff. result_so_far = self.distribution.survival_function(j) # Broadcast, because it's possible that this is a single distribution being # evaluated on a number of samples, or something like that. j += array_ops.zeros_like(result_so_far) # Re-define values at the cutoffs. if lower_cutoff is not None: result_so_far = math_ops.select(j < lower_cutoff, array_ops.ones_like(result_so_far), result_so_far) if upper_cutoff is not None: result_so_far = math_ops.select(j >= upper_cutoff, array_ops.zeros_like(result_so_far), result_so_far) return result_so_far
def _log_cdf(self, y): lower_cutoff = self._lower_cutoff upper_cutoff = self._upper_cutoff # Recall the promise: # cdf(y) := P[Y <= y] # = 1, if y >= upper_cutoff, # = 0, if y < lower_cutoff, # = P[X <= y], otherwise. # P[Y <= j] = P[floor(Y) <= j] since mass is only at integers, not in # between. j = math_ops.floor(y) result_so_far = self.distribution.log_cdf(j) # Broadcast, because it's possible that this is a single distribution being # evaluated on a number of samples, or something like that. j += array_ops.zeros_like(result_so_far) # Re-define values at the cutoffs. if lower_cutoff is not None: neg_inf = -np.inf * array_ops.ones_like(result_so_far) result_so_far = math_ops.select(j < lower_cutoff, neg_inf, result_so_far) if upper_cutoff is not None: result_so_far = math_ops.select(j >= upper_cutoff, array_ops.zeros_like(result_so_far), result_so_far) return result_so_far
def fwd_gradients(ys, xs, grad_xs=None, stop_gradients=None): """Compute forward-mode gradients.""" # See b/37888268. # This version of forward-mode autodiff is based on code by Tim Cooijmans # and handles list arguments and certain special cases such as when the # ys doesn't depend on one or more of the xs, and when ops.IndexedSlices are # generated by the first gradients_impl.gradients call. us = [array_ops.zeros_like(y) + float("nan") for y in ys] dydxs = gradients_impl.gradients( ys, xs, grad_ys=us, stop_gradients=stop_gradients) # Deal with strange types that gradients_impl.gradients returns but can't # deal with. dydxs = [ ops.convert_to_tensor(dydx) if isinstance(dydx, ops.IndexedSlices) else dydx for dydx in dydxs ] dydxs = [ array_ops.zeros_like(x) if dydx is None else dydx for x, dydx in zip(xs, dydxs) ] dysdx = gradients_impl.gradients(dydxs, us, grad_ys=grad_xs) return dysdx
def Loop(cell, w, i): x = array_ops.unstack(i, self.NUM_UNROLL) m = array_ops.zeros_like(x[0]) c = array_ops.zeros_like(x[0]) for i in range(self.NUM_UNROLL): m, c = cell(x[i], m, c, w) return m
def LSTMLoop10(weights, inp): x = array_ops.unstack(inp, self.NUM_UNROLL) m = array_ops.zeros_like(x[0]) c = array_ops.zeros_like(x[0]) assert self.NUM_UNROLL % 10 == 0 for i in range(0, self.NUM_UNROLL, 10): m, c = Loop10(weights, m, c, *x[i:i + 10]) return m
def _get_chol_and_x_compatible_shape(self, x): """Return self.chol and x, (possibly) broadcast to compatible shape.""" # x and chol are "compatible" if their shape matches except for the last two # dimensions of chol are [k, k], and the last two of x are [k, 1]. # E.g. x.shape = [A, B, k, 1], and chol.shape = [A, B, k, k] # This is required for the batch_triangular_solve, which does not broadcast. # TODO(langmore) This broadcast replicates matrices unnecesarily! In the # case where # x.shape = [M1,...,Mr, N1,...,Nb, k], and chol.shape = [N1,...,Nb, k, k] # (which is common if x was sampled), the front dimensions of x can be # "flipped" to the end, making # x_flipped.shape = [N1,...,Nb, k, M1*...*Mr], # and this can be handled by the linear solvers. This is preferred, because # it does not replicate the matrix, or create any new data. # We assume x starts without the trailing singleton dimension, e.g. # x.shape = [B, k]. chol = self._chol with ops.op_scope([x] + self.inputs, 'get_chol_and_x_compatible_shape'): # If we determine statically that shapes match, we're done. if x.get_shape() == chol.get_shape()[:-1]: x_expanded = array_ops.expand_dims(x, -1) return chol, x_expanded # Dynamic check if shapes match or not. vector_shape = self.vector_shape() # Shape of chol minus last dim. are_same_rank = math_ops.equal( array_ops.rank(x), array_ops.rank(vector_shape)) def shapes_match_if_same_rank(): return math_ops.reduce_all(math_ops.equal( array_ops.shape(x), vector_shape)) shapes_match = control_flow_ops.cond(are_same_rank, shapes_match_if_same_rank, lambda: ops.convert_to_tensor(False)) # Make tensors (never instantiated) holding the broadcast shape. # matrix_broadcast_dummy is the shape we will broadcast chol to. matrix_bcast_dummy = chol + array_ops.expand_dims(x, -1) # vector_bcast_dummy is the shape we will bcast x to, before we expand it. chol_minus_last_dim = math_ops.reduce_sum(chol, reduction_indices=[-1]) vector_bcast_dummy = x + chol_minus_last_dim chol_bcast = chol + array_ops.zeros_like(matrix_bcast_dummy) x_bcast = x + array_ops.zeros_like(vector_bcast_dummy) chol_result = control_flow_ops.cond(shapes_match, lambda: chol, lambda: chol_bcast) chol_result.set_shape(matrix_bcast_dummy.get_shape()) x_result = control_flow_ops.cond(shapes_match, lambda: x, lambda: x_bcast) x_result.set_shape(vector_bcast_dummy.get_shape()) x_expanded = array_ops.expand_dims(x_result, -1) return chol_result, x_expanded
def _forward(self, x): if self._unroll_loop: event_size = tensor_shape.dimension_value( x.shape.with_rank_at_least(1)[-1]) if event_size is None: raise ValueError( "The final dimension of `x` must be known at graph construction " "time if `unroll_loop=True`. `x.shape: %r`" % x.shape) y = array_ops.zeros_like(x, name="y0") for _ in range(event_size): shift, log_scale = self._shift_and_log_scale_fn(y) # next_y = scale * x + shift next_y = x if log_scale is not None: next_y *= math_ops.exp(log_scale) if shift is not None: next_y += shift y = next_y return y event_size = array_ops.shape(x)[-1] # If the event size is available at graph construction time, we can inform # the graph compiler of the maximum number of steps. If not, # static_event_size will be None, and the maximum_iterations argument will # have no effect. static_event_size = tensor_shape.dimension_value( x.shape.with_rank_at_least(1)[-1]) y0 = array_ops.zeros_like(x, name="y0") # call the template once to ensure creation _ = self._shift_and_log_scale_fn(y0) def _loop_body(index, y0): """While-loop body for autoregression calculation.""" # Set caching device to avoid re-getting the tf.Variable for every while # loop iteration. with variable_scope_lib.variable_scope( variable_scope_lib.get_variable_scope()) as vs: if vs.caching_device is None: vs.set_caching_device(lambda op: op.device) shift, log_scale = self._shift_and_log_scale_fn(y0) y = x if log_scale is not None: y *= math_ops.exp(log_scale) if shift is not None: y += shift return index + 1, y _, y = control_flow_ops.while_loop( cond=lambda index, _: index < event_size, body=_loop_body, loop_vars=(0, y0), maximum_iterations=static_event_size) return y
def _cdf(self, counts): counts = self._maybe_assert_valid_sample(counts) probs = self.probs if not (counts.shape.is_fully_defined() and self.probs.shape.is_fully_defined() and counts.shape.is_compatible_with(self.probs.shape)): # If both shapes are well defined and equal, we skip broadcasting. probs += array_ops.zeros_like(counts) counts += array_ops.zeros_like(self.probs) return _bdtr(k=counts, n=self.total_count, p=probs)
def fwd_gradients(ys, xs, grad_xs=None, assert_unused=False): """Computes forward-mode derivatives. This is accomplished in pure-python using tensorflow's existing (reverse-mode) gradients. There is additional overhead on graph construction, but runtime performance should be equal to a manual implementation [citation needed]. See https://j-towns.github.io/2017/06/12/A-new-trick.html and https://github.com/HIPS/autograd/pull/175 for the original discussion of this method, and https://github.com/renmengye/tensorflow-forward-ad for a "direct" implementation. Args: ys: A list of tensors. xs: A list of tensors. grad_xs: An optional list of tensors. If provided, must have the same length and shapes compatible with xs. assert_unused: Add assertions that intermediate values are not computed. Returns: A list of tensors of the same shapes as ys. The directional derivatives of ys with respect to xs in the direction grad_xs. Leaving grad_xs unspecified is equivalent to passing in 1s for each x in xs. """ # This version of forward-mode autodiff is based on code by Tim Cooijmans # and handles list arguments and certain special cases such as when the # ys doesn't depend on one or more of the xs, and when tf.IndexedSlices are # generated by the first tf.gradients call. us = [array_ops.zeros_like(y) + float('nan') for y in ys] dydxs = gradients(ys, xs, grad_ys=us) # deal with strange types that tf.gradients returns but can't deal with dydxs = [ops.convert_to_tensor(dydx) if isinstance(dydx, ops.IndexedSlices) else dydx for dydx in dydxs] if assert_unused: with ops.control_dependencies(dydxs): assert_unused = control_flow_ops.Assert(False, [1], name='fwd_gradients') with ops.control_dependencies([assert_unused]): dydxs = array_ops.identity_n(dydxs) dydxs = [array_ops.zeros_like(x) if dydx is None else dydx for x, dydx in zip(xs, dydxs)] for x, dydx in zip(xs, dydxs): dydx.set_shape(x.shape) dysdx = gradients(dydxs, us, grad_ys=grad_xs) return dysdx
def sparsemax_loss(logits, sparsemax, labels, name=None): """Computes sparsemax loss function [1]. [1]: https://arxiv.org/abs/1602.02068 Args: logits: A `Tensor`. Must be one of the following types: `half`, `float32`, `float64`. sparsemax: A `Tensor`. Must have the same type as `logits`. labels: A `Tensor`. Must have the same type as `logits`. name: A name for the operation (optional). Returns: A `Tensor`. Has the same type as `logits`. """ with ops.name_scope(name, "sparsemax_loss", [logits, sparsemax, labels]) as name: logits = ops.convert_to_tensor(logits, name="logits") sparsemax = ops.convert_to_tensor(sparsemax, name="sparsemax") labels = ops.convert_to_tensor(labels, name="labels") # In the paper, they call the logits z. # A constant can be substracted from logits to make the algorithm # more numerically stable in theory. However, there are really no major # source numerical instability in this algorithm. z = logits # sum over support # Use a conditional where instead of a multiplication to support z = -inf. # If z = -inf, and there is no support (sparsemax = 0), a multiplication # would cause 0 * -inf = nan, which is not correct in this case. sum_s = array_ops.where( math_ops.logical_or(sparsemax > 0, math_ops.is_nan(sparsemax)), sparsemax * (z - 0.5 * sparsemax), array_ops.zeros_like(sparsemax)) # - z_k + ||q||^2 q_part = labels * (0.5 * labels - z) # Fix the case where labels = 0 and z = -inf, where q_part would # otherwise be 0 * -inf = nan. But since the lables = 0, no cost for # z = -inf should be consideredself. # The code below also coveres the case where z = inf. Howeverm in this # caose the sparsemax will be nan, which means the sum_s will also be nan, # therefor this case doesn't need addtional special treatment. q_part_safe = array_ops.where( math_ops.logical_and(math_ops.equal(labels, 0), math_ops.is_inf(z)), array_ops.zeros_like(z), q_part) return math_ops.reduce_sum(sum_s + q_part_safe, axis=1)
def __init__(self, inputs, sequence_length, time_major=False, name=None): """Initializer. Args: inputs: A (structure of) input tensors. sequence_length: An int32 vector tensor. time_major: Python bool. Whether the tensors in `inputs` are time major. If `False` (default), they are assumed to be batch major. name: Name scope for any created operations. Raises: ValueError: if `sequence_length` is not a 1D tensor. """ with ops.name_scope(name, "TrainingHelper", [inputs, sequence_length]): inputs = ops.convert_to_tensor(inputs, name="inputs") if not time_major: inputs = nest.map_structure(_transpose_batch_time, inputs) self._input_tas = nest.map_structure(_unstack_ta, inputs) self._sequence_length = ops.convert_to_tensor( sequence_length, name="sequence_length") if self._sequence_length.get_shape().ndims != 1: raise ValueError( "Expected sequence_length to be a vector, but received shape: %s" % self._sequence_length.get_shape()) self._zero_inputs = nest.map_structure( lambda inp: array_ops.zeros_like(inp[0, :]), inputs) self._batch_size = array_ops.size(sequence_length)
def _logits_to_prediction(self, logits=None): predictions = {} predictions[PredictionKey.LOGITS] = logits logits = array_ops.concat(1, [array_ops.zeros_like(logits), logits]) predictions[PredictionKey.CLASSES] = math_ops.argmax(logits, 1) return predictions
def _compareZeros(self, dtype, fully_defined_shape, use_gpu): with self.test_session(use_gpu=use_gpu): # Creates a tensor of non-zero values with shape 2 x 3. # NOTE(kearnes): The default numpy dtype associated with tf.string is # np.object (and can't be changed without breaking a lot things), which # causes a TypeError in constant_op.constant below. Here we catch the # special case of tf.string and set the numpy dtype appropriately. if dtype == dtypes_lib.string: numpy_dtype = np.string_ else: numpy_dtype = dtype.as_numpy_dtype if fully_defined_shape: d = constant_op.constant( np.ones((2, 3), dtype=numpy_dtype), dtype=dtype) else: d = array_ops.placeholder(dtype=dtype) # Constructs a tensor of zeros of the same dimensions and type as "d". z_var = array_ops.zeros_like(d) # Test that the type is correct self.assertEqual(z_var.dtype, dtype) # Test that the shape is correct if fully_defined_shape: self.assertEqual([2, 3], z_var.get_shape()) # Test that the value is correct feed_dict = {} if not fully_defined_shape: feed_dict[d] = np.ones((2, 3), dtype=numpy_dtype) z_value = z_var.eval(feed_dict=feed_dict) self.assertFalse(np.any(z_value)) self.assertEqual((2, 3), z_value.shape)
def _f(): # Note that there is a race condition here, so we do a best effort # updates here. We reset update_in_steps first so that other workers # don't duplicate the updates. Also we update cluster_center_vars # before resetting total_counts to avoid large updates to # cluster_centers_updated based on partially updated # cluster_center_vars. with ops.control_dependencies([ state_ops.assign(update_in_steps, self._mini_batch_steps_per_iteration - 1) ]): with ops.colocate_with( cluster_centers_updated, ignore_existing=True): if self._distance_metric == COSINE_DISTANCE: cluster_centers = nn_impl.l2_normalize( cluster_centers_updated, dim=1) else: cluster_centers = cluster_centers_updated with ops.colocate_with(cluster_centers_var, ignore_existing=True): with ops.control_dependencies( [state_ops.assign(cluster_centers_var, cluster_centers)]): with ops.colocate_with(None, ignore_existing=True): with ops.control_dependencies([ state_ops.assign(total_counts, array_ops.zeros_like(total_counts)) ]): return array_ops.identity(update_in_steps)
def testZerosLikeUninitialized(self): l0 = list_ops.tensor_list_reserve([], 3, element_dtype=dtypes.float32) l1 = list_ops.tensor_list_set_item(l0, 0, 1.) # [1., _, _] zeros_1 = array_ops.zeros_like(l1) # [0., _, _] l2 = list_ops.tensor_list_set_item(l1, 2, 2.) # [1., _, 2.] zeros_2 = array_ops.zeros_like(l2) # [0., _, 0.] # Gather indices with zeros in `zeros_1`. res_1 = list_ops.tensor_list_gather( zeros_1, [0], element_dtype=dtypes.float32) # Gather indices with zeros in `zeros_2`. res_2 = list_ops.tensor_list_gather( zeros_2, [0, 2], element_dtype=dtypes.float32) self.assertAllEqual(self.evaluate(res_1), [0.]) self.assertAllEqual(self.evaluate(res_2), [0., 0.])
def _num_present(losses, weights, per_batch=False): """Computes the number of elements in the loss function induced by `weights`. A given weights tensor induces different numbers of usable elements in the `losses` tensor. The `weights` tensor is broadcast across `losses` for all possible dimensions. For example, if `losses` is a tensor of dimension `[4, 5, 6, 3]` and `weights` is a tensor of shape `[4, 5]`, then `weights` is, in effect, tiled to match the shape of `losses`. Following this effective tile, the total number of present elements is the number of non-zero weights. Args: losses: `Tensor` of shape `[batch_size, d1, ... dN]`. weights: `Tensor` of shape `[]`, `[batch_size]` or `[batch_size, d1, ... dK]`, where K < N. per_batch: Whether to return the number of elements per batch or as a sum total. Returns: The number of present (non-zero) elements in the losses tensor. If `per_batch` is `True`, the value is returned as a tensor of size `[batch_size]`. Otherwise, a single scalar tensor is returned. """ with ops.name_scope(None, "num_present", (losses, weights)) as scope: weights = math_ops.to_float(weights) present = array_ops.where( math_ops.equal(weights, 0.0), array_ops.zeros_like(weights), array_ops.ones_like(weights)) present = weights_broadcast_ops.broadcast_weights(present, losses) if per_batch: return math_ops.reduce_sum( present, axis=math_ops.range(1, array_ops.rank(present)), keep_dims=True, name=scope) return math_ops.reduce_sum(present, name=scope)
def power_sums_tensor(array_size, power_matrix, multiplier): r"""Computes \sum_{i=0}^{N-1} A^i B (A^i)^T for N=0..(array_size + 1). Args: array_size: The number of non-trivial sums to pre-compute. power_matrix: The "A" matrix above. multiplier: The "B" matrix above Returns: A Tensor with S[N] = \sum_{i=0}^{N-1} A^i B (A^i)^T S[0] is the zero matrix S[1] is B S[2] is A B A^T + B ...and so on """ array_size = math_ops.cast(array_size, dtypes.int32) power_matrix = ops.convert_to_tensor(power_matrix) identity_like_power_matrix = linalg_ops.eye( array_ops.shape(power_matrix)[0], dtype=power_matrix.dtype) identity_like_power_matrix.set_shape( ops.convert_to_tensor(power_matrix).get_shape()) transition_powers = functional_ops.scan( lambda previous_power, _: math_ops.matmul(previous_power, power_matrix), math_ops.range(array_size - 1), initializer=identity_like_power_matrix) summed = math_ops.cumsum( array_ops.concat([ array_ops.expand_dims(multiplier, 0), math_ops.matmul( batch_times_matrix(transition_powers, multiplier), transition_powers, adjoint_b=True) ], 0)) return array_ops.concat( [array_ops.expand_dims(array_ops.zeros_like(multiplier), 0), summed], 0)
def _compute_gradients(tensor, var_list): grads = gradients.gradients(tensor, var_list) # tf.gradients sometimes returns `None` when it should return 0. return [ grad if grad is not None else array_ops.zeros_like(var) for var, grad in zip(var_list, grads) ]
def gcd(a, b, name=None): """Returns the greatest common divisor via Euclid's algorithm. Args: a: The dividend. A scalar integer `Tensor`. b: The divisor. A scalar integer `Tensor`. name: An optional name for the operation. Returns: A scalar `Tensor` representing the greatest common divisor between `a` and `b`. Raises: ValueError: If `a` or `b` are not scalar integers. """ with ops.name_scope(name, 'gcd', [a, b]): a = ops.convert_to_tensor(a) b = ops.convert_to_tensor(b) a.shape.assert_has_rank(0) b.shape.assert_has_rank(0) if not a.dtype.is_integer: raise ValueError('a must be an integer type. Got: %s' % a.dtype) if not b.dtype.is_integer: raise ValueError('b must be an integer type. Got: %s' % b.dtype) cond = lambda _, b: math_ops.greater(b, array_ops.zeros_like(b)) body = lambda a, b: [b, math_ops.mod(a, b)] a, b = control_flow_ops.while_loop(cond, body, [a, b], back_prop=False) return a
def Grad(op, *args): """The python grad function for the Forward function.""" # NOTE: tf.gradient backprops None for int32/int64 while zeros # for float32/float64. For consistency, we always backprop # zeros. args = list(args) for i, dy in enumerate(args): if dy is None: args[i] = array_ops.zeros_like(op.outputs[i]) # TODO(drpng): getting the extra state here? op_inputs = [x for x in op.inputs] op_struct = [ self._theta, self._state, self._inputs, self._max_input_length, self._extras ] (theta, state0, inputs, max_input_length, _) = _Pack(op_inputs, op_struct) # acc_state and acc_extras are computed by the Forward pass and # needed by the Backward pass. acc_state, _, acc_extras = _Pack([x for x in op.outputs], [self._state, self._state, self._extras]) # Forward computes acc_state, the final state and # acc_extras. tf.gradients gives us their gradients w.r.t. the # final loss. Because acc_extras are not exposed by Compute(), # it has no gradients w.r.t. the final loss (i.e., by # construction, it must be zeros). d_acc_state, d_state1, _ = _Pack(args, [self._state, self._state, self._extras]) return Backward(*_Flatten([ theta, state0, inputs, max_input_length, acc_state, acc_extras, d_acc_state, d_state1 ]))
def _single_seq_fn(): log_norm = math_ops.reduce_logsumexp(first_input, [1]) # Mask `log_norm` of the sequences with length <= zero. log_norm = array_ops.where(math_ops.less_equal(sequence_lengths, 0), array_ops.zeros_like(log_norm), log_norm) return log_norm
def _TensorScatterUpdateGrad(op, grad): indices = op.inputs[1] updates_grad = array_ops.gather_nd(grad, indices) tensor_grad = array_ops.tensor_scatter_update( array_ops.identity(grad), indices, array_ops.zeros_like(op.inputs[2], dtype=grad.dtype)) return [tensor_grad, None, updates_grad]
def _forward(self, x): event_size = array_ops.shape(x)[-1] y0 = array_ops.zeros_like(x, name="y0") # call the template once to ensure creation _ = self._shift_and_log_scale_fn(y0) def _loop_body(index, y0): """While-loop body for autoregression calculation.""" # Set caching device to avoid re-getting the tf.Variable for every while # loop iteration. with variable_scope_lib.variable_scope( variable_scope_lib.get_variable_scope()) as vs: if vs.caching_device is None: vs.set_caching_device(lambda op: op.device) shift, log_scale = self._shift_and_log_scale_fn(y0) y = x if log_scale is not None: y *= math_ops.exp(log_scale) if shift is not None: y += shift return index + 1, y _, y = control_flow_ops.while_loop( cond=lambda index, _: index < event_size, body=_loop_body, loop_vars=[0, y0]) return y
def full_exp_with_logits(name, labels=None, logits=None): """Computes exponential loss given `logits`. Args: name: A name for the operation (optional). labels: A `Tensor` of the same type and shape as `logits`. logits: A `Tensor` of type `float32` or `float64`. Returns: A `Tensor` of the same shape as `logits` with the componentwise exponential losses. Raises: ValueError: If `logits` and `labels` do not have the same shape. """ with ops.name_scope(name, "exp_loss", [logits, labels]) as name: logits = ops.convert_to_tensor(logits, name="logits") labels = ops.convert_to_tensor(labels, name="labels") try: labels.get_shape().merge_with(logits.get_shape()) except ValueError: raise ValueError("logits and labels must have the same shape (%s vs %s)" % (logits.get_shape(), labels.get_shape())) # Default threshold of 0 to switch between classes zeros = array_ops.zeros_like(logits, dtype=logits.dtype) ones = array_ops.ones_like(logits, dtype=logits.dtype) neg_ones = -array_ops.ones_like(logits, dtype=logits.dtype) # Convert labels to 1 and -1 cond_labels = (labels > zeros) labels_converted = array_ops.where(cond_labels, ones, neg_ones) return math_ops.exp(-1.0 * logits * labels_converted)
def _SparseDenseCwiseMulOrDivGrad(op, grad, is_mul): """Common code for SparseDenseCwise{Mul,Div} gradients.""" x_indices = op.inputs[0] x_shape = op.inputs[2] y = op.inputs[3] y_shape = math_ops.to_int64(array_ops.shape(y)) num_added_dims = array_ops.expand_dims( array_ops.size(x_shape) - array_ops.size(y_shape), 0) augmented_y_shape = array_ops.concat( [array_ops.ones(num_added_dims, ops.dtypes.int64), y_shape], 0) scaling = x_shape // augmented_y_shape scaled_indices = x_indices // scaling scaled_indices = array_ops.slice(scaled_indices, array_ops.concat([[0], num_added_dims], 0), [-1, -1]) dense_vals = array_ops.gather_nd(y, scaled_indices) if is_mul: dx = grad * dense_vals dy_val = grad * op.inputs[1] else: dx = grad / dense_vals dy_val = grad * (-op.inputs[1] / math_ops.square(dense_vals)) # indices can repeat after scaling, so we can't use sparse_to_dense(). dy = sparse_ops.sparse_add( array_ops.zeros_like(y), sparse_tensor.SparseTensor(scaled_indices, dy_val, y_shape)) # (sp_indices, sp_vals, sp_shape, dense) return (None, dx, None, dy)
def _safe_div(numerator, denominator, name="value"): """Computes a safe divide which returns 0 if the denominator is zero. Note that the function contains an additional conditional check that is necessary for avoiding situations where the loss is zero causing NaNs to creep into the gradient computation. Args: numerator: An arbitrary `Tensor`. denominator: A `Tensor` whose shape matches `numerator` and whose values are assumed to be non-negative. name: An optional name for the returned op. Returns: The element-wise value of the numerator divided by the denominator. """ if compat.forward_compatible(2018, 11, 1): return math_ops.div_no_nan(numerator, denominator, name=name) return array_ops.where( math_ops.greater(denominator, 0), math_ops.div(numerator, array_ops.where( math_ops.equal(denominator, 0), array_ops.ones_like(denominator), denominator)), array_ops.zeros_like(numerator), name=name)
def pdf(self, x, name="pdf"): """The PDF of observations in `x` under these Uniform distribution(s). Args: x: tensor of dtype `dtype`, must be broadcastable with `a` and `b`. name: The name to give this op. Returns: pdf: tensor of dtype `dtype`, the pdf values of `x`. If `x` is `nan`, will return `nan`. """ with ops.name_scope(self.name): with ops.op_scope([self.a, self.b, x], name): x = ops.convert_to_tensor(x, name="x") if x.dtype != self.dtype: raise TypeError("Input x dtype does not match dtype: %s vs. %s" % (x.dtype, self.dtype)) broadcasted_x = x * self._ones() return math_ops.select( math_ops.is_nan(broadcasted_x), broadcasted_x, math_ops.select( math_ops.logical_or(broadcasted_x < self.a, broadcasted_x > self.b), array_ops.zeros_like(broadcasted_x), (1.0 / self.range()) * array_ops.ones_like(broadcasted_x)))
def testMultipleOfNoneGradRaisesError(self): gradient = constant_op.constant(self._grad_vec, dtype=dtypes.float32) variable = variables_lib.Variable(array_ops.zeros_like(gradient)) grad_to_var = (None, variable) gradient_multipliers = {variable: self._multiplier} with self.assertRaises(ValueError): learning.multiply_gradients(grad_to_var, gradient_multipliers)
def ZerosLikeV1WhileLoop(self, op, index): """Create zeros_like for the specified output of an op. If op is in a while loop that is part of gradients(), this method must be called in its grad loop context. Args: op: A tensorflow operation. index: the index for a specific output of the op. Returns: A zero tensor of the same shape of op.outputs[index]. """ if util.IsLoopSwitch(op): return None if op.graph.building_function: # The optimization here is tricky to apply to functions return array_ops.zeros_like(op.outputs[index]) dead_branch = util.IsSwitch(op) forward_ctxt = util.GetWhileContext(op) grad_state = self._map.get(forward_ctxt) if grad_state is None: # op is not in a while loop that is part of gradients(). return ZerosLike(op, index) op_ctxt = op._get_control_flow_context() val = ops.convert_to_tensor(op.outputs[index], name="tensor") shape = val.get_shape() if shape.is_fully_defined(): # If the shape is known statically, just create a zero tensor with # the right shape in the grad loop context. if val.dtype == dtypes.resource: result = array_ops.zeros( resource_variable_ops.variable_shape(val), dtype=default_gradient.get_zeros_dtype(val)) else: result = constant_op.constant(0, shape=shape.dims, dtype=val.dtype) if dead_branch: # op is a cond switch. Guard the zero tensor with a switch. pred = grad_state.history_map.get(op_ctxt.pred.name) branch = op_ctxt.branch result = control_flow_ops._SwitchRefOrTensor(result, pred)[1 - branch] else: # Unknown shape so keep a history of the shape at runtime. if dead_branch: # Need to add a special switch to guard the value. pred = op_ctxt.pred branch = op_ctxt.branch op_ctxt.outer_context.Enter() val = control_flow_ops._SwitchRefOrTensor(op.inputs[0], pred)[1 - branch] zeros_shape = array_ops.shape_internal(val, optimize=False) op_ctxt.outer_context.Exit() val.op._set_control_flow_context(op_ctxt) zeros_shape.op._set_control_flow_context(op_ctxt) else: op_ctxt.Enter() zeros_shape = array_ops.shape_internal(val, optimize=False) op_ctxt.Exit() # Add forward accumulator for shape. grad_state.grad_context.Exit() history_zeros_shape = grad_state.AddForwardAccumulator( zeros_shape, dead_branch=dead_branch) grad_state.grad_context.Enter() # Create a zero tensor with the right shape. shape = grad_state.AddBackpropAccumulatedValue(history_zeros_shape, zeros_shape, dead_branch) result = array_ops.zeros(shape, val.dtype) return result
def kernel(step_size, n_leapfrog_steps, x, target_log_prob_fn, event_dims=(), x_log_prob=None, x_grad=None, skip_metropolis_step=False, name=None): """Runs one iteration of Hamiltonian Monte Carlo. Hamiltonian Monte Carlo (HMC) is a Markov chain Monte Carlo (MCMC) algorithm that takes a series of gradient-informed steps to produce a Metropolis proposal. This function applies one step of HMC to randomly update the variable `x`. This function can update multiple chains in parallel. It assumes that all dimensions of `x` not specified in `event_dims` are independent, and should therefore be updated independently. The output of `target_log_prob_fn()` should sum log-probabilities across all event dimensions. Slices along dimensions not in `event_dims` may have different target distributions; for example, if `event_dims == (1,)`, then `x[0, :]` could have a different target distribution from x[1, :]. This is up to `target_log_prob_fn()`. Args: step_size: Scalar step size or array of step sizes for the leapfrog integrator. Broadcasts to the shape of `x`. Larger step sizes lead to faster progress, but too-large step sizes make rejection exponentially more likely. When possible, it's often helpful to match per-variable step sizes to the standard deviations of the target distribution in each variable. n_leapfrog_steps: Integer number of steps to run the leapfrog integrator for. Total progress per HMC step is roughly proportional to step_size * n_leapfrog_steps. x: Tensor containing the value(s) of the random variable(s) to update. target_log_prob_fn: Python callable which takes an argument like `initial_x` and returns its (possibly unnormalized) log-density under the target distribution. event_dims: List of dimensions that should not be treated as independent. This allows for multiple chains to be run independently in parallel. Default is (), i.e., all dimensions are independent. x_log_prob (optional): Tensor containing the cached output of a previous call to `target_log_prob_fn()` evaluated at `x` (such as that provided by a previous call to `kernel()`). Providing `x_log_prob` and `x_grad` saves one gradient computation per call to `kernel()`. x_grad (optional): Tensor containing the cached gradient of `target_log_prob_fn()` evaluated at `x` (such as that provided by a previous call to `kernel()`). Providing `x_log_prob` and `x_grad` saves one gradient computation per call to `kernel()`. skip_metropolis_step (optional): boolean specifying whether to skip the Metropolis-Hastings step and directly return the newly proposed values by the integrator. The acceptance probabilities returned remain unchanged. name: Python `str` name prefixed to Ops created by this function. Returns: updated_x: The updated variable(s) x. Has shape matching `initial_x`. acceptance_probs: Tensor with the acceptance probabilities for the final iteration. This is useful for diagnosing step size problems etc. Has shape matching `target_log_prob_fn(initial_x)`. new_log_prob: The value of `target_log_prob_fn()` evaluated at `updated_x`. new_grad: The value of the gradient of `target_log_prob_fn()` evaluated at `updated_x`. #### Examples: ```python # Tuning acceptance rates: target_accept_rate = 0.631 def target_log_prob(x): # Standard normal return tf.reduce_sum(-0.5 * tf.square(x)) initial_x = tf.zeros([10]) initial_log_prob = target_log_prob(initial_x) initial_grad = tf.gradients(initial_log_prob, initial_x)[0] # Algorithm state x = tf.Variable(initial_x, name='x') step_size = tf.Variable(1., name='step_size') last_log_prob = tf.Variable(initial_log_prob, name='last_log_prob') last_grad = tf.Variable(initial_grad, name='last_grad') # Compute updates new_x, acceptance_prob, log_prob, grad = hmc.kernel(step_size, 3, x, target_log_prob, event_dims=[0], x_log_prob=last_log_prob) x_update = tf.assign(x, new_x) log_prob_update = tf.assign(last_log_prob, log_prob) grad_update = tf.assign(last_grad, grad) step_size_update = tf.assign(step_size, tf.where(acceptance_prob > target_accept_rate, step_size * 1.01, step_size / 1.01)) adaptive_updates = [x_update, log_prob_update, grad_update, step_size_update] sampling_updates = [x_update, log_prob_update, grad_update] sess = tf.Session() sess.run(tf.global_variables_initializer()) # Warm up the sampler and adapt the step size for i in xrange(500): sess.run(adaptive_updates) # Collect samples without adapting step size samples = np.zeros([500, 10]) for i in xrange(500): x_val, _ = sess.run([new_x, sampling_updates]) samples[i] = x_val ``` ```python # Empirical-Bayes estimation of a hyperparameter by MCMC-EM: # Problem setup N = 150 D = 10 x = np.random.randn(N, D).astype(np.float32) true_sigma = 0.5 true_beta = true_sigma * np.random.randn(D).astype(np.float32) y = x.dot(true_beta) + np.random.randn(N).astype(np.float32) def log_prior(beta, log_sigma): return tf.reduce_sum(-0.5 / tf.exp(2 * log_sigma) * tf.square(beta) - log_sigma) def regression_log_joint(beta, log_sigma, x, y): # This function returns log p(beta | log_sigma) + log p(y | x, beta). means = tf.matmul(tf.expand_dims(beta, 0), x, transpose_b=True) means = tf.squeeze(means) log_likelihood = tf.reduce_sum(-0.5 * tf.square(y - means)) return log_prior(beta, log_sigma) + log_likelihood def log_joint_partial(beta): return regression_log_joint(beta, log_sigma, x, y) # Our estimate of log(sigma) log_sigma = tf.Variable(0., name='log_sigma') # The state of the Markov chain beta = tf.Variable(tf.random_normal([x.shape[1]]), name='beta') new_beta, _, _, _ = hmc.kernel(0.1, 5, beta, log_joint_partial, event_dims=[0]) beta_update = tf.assign(beta, new_beta) optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.01) with tf.control_dependencies([beta_update]): log_sigma_update = optimizer.minimize(-log_prior(beta, log_sigma), var_list=[log_sigma]) sess = tf.Session() sess.run(tf.global_variables_initializer()) log_sigma_history = np.zeros(1000) for i in xrange(1000): log_sigma_val, _ = sess.run([log_sigma, log_sigma_update]) log_sigma_history[i] = log_sigma_val # Should converge to something close to true_sigma plt.plot(np.exp(log_sigma_history)) ``` """ with ops.name_scope(name, 'hmc_kernel', [step_size, n_leapfrog_steps, x]): potential_and_grad = _make_potential_and_grad(target_log_prob_fn) x = ops.convert_to_tensor(x, name='x') x_shape = array_ops.shape(x) m = random_ops.random_normal(x_shape, dtype=x.dtype) kinetic_0 = 0.5 * math_ops.reduce_sum(math_ops.square(m), event_dims) if (x_log_prob is not None) and (x_grad is not None): log_potential_0, grad_0 = -x_log_prob, -x_grad # pylint: disable=invalid-unary-operand-type else: if x_log_prob is not None: logging.warn('x_log_prob was provided, but x_grad was not,' ' so x_log_prob was not used.') if x_grad is not None: logging.warn('x_grad was provided, but x_log_prob was not,' ' so x_grad was not used.') log_potential_0, grad_0 = potential_and_grad(x) new_x, new_m, log_potential_1, grad_1 = leapfrog_integrator( step_size, n_leapfrog_steps, x, m, potential_and_grad, grad_0) kinetic_1 = 0.5 * math_ops.reduce_sum(math_ops.square(new_m), event_dims) energy_change = log_potential_1 - log_potential_0 + kinetic_1 - kinetic_0 # Treat NaN as infinite energy (and therefore guaranteed rejection). energy_change = array_ops.where( math_ops.is_nan(energy_change), array_ops.fill(array_ops.shape(energy_change), energy_change.dtype.as_numpy_dtype(np.inf)), energy_change) acceptance_probs = math_ops.exp(math_ops.minimum(-energy_change, 0.)) # If we are skipping the MH step directly return if skip_metropolis_step: return new_x, acceptance_probs, -log_potential_1, -grad_1 accepted = ( random_ops.random_uniform( array_ops.shape(acceptance_probs), dtype=x.dtype) < acceptance_probs) new_log_prob = -array_ops.where(accepted, log_potential_1, log_potential_0) # TODO(b/65738010): This should work, but it doesn't for now. # reduced_shape = math_ops.reduced_shape(x_shape, event_dims) reduced_shape = array_ops.shape(math_ops.reduce_sum(x, event_dims, keep_dims=True)) accepted = array_ops.reshape(accepted, reduced_shape) accepted = math_ops.logical_or( accepted, math_ops.cast(array_ops.zeros_like(x), dtypes.bool)) new_x = array_ops.where(accepted, new_x, x) new_grad = -array_ops.where(accepted, grad_1, grad_0) # TODO(langmore) Gradients of acceptance_probs and new_log_prob with respect # to initial_x will propagate NaNs (see testNanFromGradsDontPropagate). This # should be fixed. return new_x, acceptance_probs, new_log_prob, new_grad
def safe_embedding_lookup_sparse(embedding_weights, sparse_ids, sparse_weights=None, combiner="mean", default_id=None, name=None, partition_strategy="div"): """Lookup embedding results, accounting for invalid IDs and empty features. The partitioned embedding in `embedding_weights` must all be the same shape except for the first dimension. The first dimension is allowed to vary as the vocabulary size is not necessarily a multiple of `P`. Invalid IDs (< 0) are pruned from input IDs and weights, as well as any IDs with non-positive weight. For an entry with no features, the embedding vector for `default_id` is returned, or the 0-vector if `default_id` is not supplied. The ids and weights may be multi-dimensional. Embeddings are always aggregated along the last dimension. Args: embedding_weights: A list of `P` float tensors or values representing partitioned embedding tensors. The total unpartitioned shape should be `[e_0, e_1, ..., e_m]`, where `e_0` represents the vocab size and `e_1, ..., e_m` are the embedding dimensions. sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the ids. `d_0` is typically batch size. sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing float weights corresponding to `sparse_ids`, or `None` if all weights are be assumed to be 1.0. combiner: A string specifying how to combine embedding results for each entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean" the default. default_id: The id to use for an entry with no features. name: A name for this operation (optional). partition_strategy: A string specifying the partitioning strategy. Currently `"div"` and `"mod"` are supported. Default is `"div"`. Returns: Dense tensor of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`. Raises: ValueError: if `embedding_weights` is empty. """ if embedding_weights is None or len(embedding_weights) < 1: raise ValueError("Missing embedding_weights %s." % embedding_weights) dtype = sparse_weights.dtype if sparse_weights is not None else None embedding_weights = [ ops.convert_to_tensor(w, dtype=dtype) for w in embedding_weights ] contrib_tensor_util.assert_same_float_dtype(embedding_weights + [sparse_weights]) with ops.op_scope(embedding_weights + [sparse_ids, sparse_weights], name, "embedding_lookup") as scope: # Reshape higher-rank sparse ids and weights to linear segment ids. original_shape = sparse_ids.shape original_rank_dim = sparse_ids.shape.get_shape()[0] original_rank = ( array_ops.size(original_shape) if original_rank_dim.value is None else original_rank_dim.value) sparse_ids = sparse_ops.sparse_reshape(sparse_ids, [ math_ops.reduce_prod( array_ops.slice(original_shape, [0], [original_rank - 1])), array_ops.gather(original_shape, original_rank - 1)]) if sparse_weights is not None: sparse_weights = ops.SparseTensor(sparse_ids.indices, sparse_weights.values, sparse_ids.shape) # Prune invalid ids and weights. sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights) # Fill in dummy values for empty features, if necessary. sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows(sparse_ids, default_id or 0) if sparse_weights is not None: sparse_weights, _ = sparse_ops.sparse_fill_empty_rows(sparse_weights, 1.0) result = tf_embedding_ops.embedding_lookup_sparse( embedding_weights, sparse_ids, sparse_weights, combiner=combiner, partition_strategy=partition_strategy, name=None if default_id is None else scope) if default_id is None: # Broadcast is_row_empty to the same shape as embedding_lookup_result, # for use in Select. is_row_empty = array_ops.tile( array_ops.reshape(is_row_empty, [-1, 1]), array_ops.pack([1, array_ops.shape(result)[1]])) result = math_ops.select(is_row_empty, array_ops.zeros_like(result), result, name=scope) # Reshape back from linear ids back into higher-dimensional dense result. final_result = array_ops.reshape(result, array_ops.concat(0, [ array_ops.slice( math_ops.cast(original_shape, dtypes.int32), [0], [original_rank - 1]), array_ops.slice(array_ops.shape(result), [1], [-1])])) final_result.set_shape(tensor_shape.unknown_shape( (original_rank_dim - 1).value).concatenate(result.get_shape()[1:])) return final_result
def _apply_gradient(self, grad, var, indices=None): """The main function to update a variable. Args: grad: A Tensor containing gradient to apply. var: A Tensor containing the variable to update. indices: An array of integers, for sparse update. Returns: Updated variable var = var - learning_rate * preconditioner * grad If the gradient is dense, var and grad have the same shape. If the update is sparse, then the first dimension of the gradient and var may differ, others are all the same. In this case the indices array provides the set of indices of the variable which are to be updated with each row of the gradient. """ global_step = self._global_step + 1 # Update accumulated weighted average of gradients gbar = self.get_slot(var, "gbar") gbar_decay_t = GetParam(self._gbar_decay, global_step) gbar_weight_t = GetParam(self._gbar_weight, global_step) if indices is not None: # Note - the sparse update is not easily implemented, since the # algorithm needs all indices of gbar to be updated # if mat_gbar_decay != 1 or mat_gbar_decay != 0. # One way to make mat_gbar_decay = 1 is by rescaling. # If we want the update: # G_{t+1} = a_{t+1} G_t + b_{t+1} w_t # define: # r_{t+1} = a_{t+1} * r_t # h_t = G_t / r_t # Then: # h_{t+1} = h_t + (b_{t+1} / r_{t+1}) * w_t # So we get the mat_gbar_decay = 1 as desired. # We can implement this in a future version as needed. # However we still need gbar_decay = 0, otherwise all indices # of the variable will need to be updated. if self._gbar_decay != 0.0: tf_logging.warning("Not applying momentum for variable: %s" % var.name) gbar_updated = grad else: gbar_updated = self._weighted_average(gbar, self._gbar_decay, gbar_decay_t, gbar_weight_t * grad) # Update the preconditioners and compute the preconditioned gradient shape = var.get_shape() mat_g_list = [] for i in range(len(shape)): mat_g_list.append(self.get_slot(var, "Gbar_" + str(i))) mat_gbar_decay_t = GetParam(self._mat_gbar_decay, global_step) mat_gbar_weight_t = GetParam(self._mat_gbar_weight, global_step) preconditioned_grad = gbar_updated v_rank = len(mat_g_list) neg_alpha = - GetParam(self._alpha, global_step) / v_rank svd_interval = GetParam(self._svd_interval, global_step) precond_update_interval = GetParam(self._precond_update_interval, global_step) for i, mat_g in enumerate(mat_g_list): # axes is the list of indices to reduce - everything but the current i. axes = list(range(i)) + list(range(i+1, v_rank)) if shape[i] <= self._max_matrix_size: # If the tensor size is sufficiently small perform full Shampoo update # Note if precond_update_interval > 1 and mat_gbar_decay_t != 1, this # is not strictly correct. However we will use it for now, and # fix if needed. (G_1 = aG + bg ==> G_n = a^n G + (1+a+..+a^{n-1})bg) # pylint: disable=g-long-lambda,cell-var-from-loop mat_g_updated = control_flow_ops.cond( math_ops.mod(global_step, precond_update_interval) < 1, lambda: self._update_mat_g( mat_g, grad, axes, mat_gbar_decay_t, mat_gbar_weight_t * precond_update_interval, i), lambda: mat_g) mat_g_updated = mat_g_updated / float(shape[i].value) if self._svd_interval == 1: mat_h = self._compute_power(var, mat_g_updated, shape[i], neg_alpha) else: mat_h = control_flow_ops.cond( math_ops.mod(global_step, svd_interval) < 1, lambda: self._compute_power(var, mat_g_updated, shape[i], neg_alpha, "H_" + str(i)), lambda: self.get_slot(var, "H_" + str(i))) # mat_h is a square matrix of size d_i x d_i # preconditioned_grad is a d_i x ... x d_n x d_0 x ... d_{i-1} tensor # After contraction with a d_i x d_i tensor # it becomes a d_{i+1} x ... x d_n x d_0 x ... d_i tensor # (the first dimension is contracted out, and the second dimension of # mat_h is appended). After going through all the indices, it becomes # a d_0 x ... x d_n tensor again. preconditioned_grad = math_ops.tensordot(preconditioned_grad, mat_h, axes=([0], [0]), name="precond_" + str(i)) else: # Tensor size is too large -- perform diagonal Shampoo update # Only normalize non-vector cases. if axes: normalizer = 1.0 if indices is not None else float(shape[i].value) grad_outer = math_ops.reduce_sum(grad * grad, axis=axes) / normalizer else: grad_outer = grad * grad if i == 0 and indices is not None: assert self._mat_gbar_decay == 1.0 mat_g_updated = state_ops.scatter_add(mat_g, indices, mat_gbar_weight_t * grad_outer) mat_g_updated_slice = array_ops.gather(mat_g_updated, indices) mat_h = array_ops.where( math_ops.greater(mat_g_updated_slice, 0), math_ops.pow(mat_g_updated_slice, neg_alpha), array_ops.zeros_like(mat_g_updated_slice)) else: mat_g_updated = self._weighted_average(mat_g, self._mat_gbar_decay, mat_gbar_decay_t, mat_gbar_weight_t * grad_outer) mat_h = array_ops.where( math_ops.greater(mat_g_updated, 0), math_ops.pow(mat_g_updated, neg_alpha), array_ops.zeros_like(mat_g_updated)) # Need to do the transpose to ensure that the tensor becomes # a d_{i+1} x ... x d_n x d_0 x ... d_i tensor as described above. preconditioned_grad = array_ops.transpose( preconditioned_grad, perm=list(range(1, v_rank)) + [0]) * mat_h # Update the variable based on the Shampoo update learning_rate_t = GetParam(self._learning_rate, global_step) if indices is not None: var_updated = state_ops.scatter_add( var, indices, -learning_rate_t * preconditioned_grad) else: var_updated = state_ops.assign_sub(var, learning_rate_t * preconditioned_grad) return var_updated
def TaylorExp(logits, labels, perturbW): """You can also add L2Loss to all the trainable variables. Add summary for "Loss" and "Loss/avg". Args: logits: Logits from inference(). labels: Labels from distorted_inputs or inputs(). Returns: Loss tensor of type float. """ # Calculate the average cross entropy loss across the batch. labels = tf.cast(labels, tf.float32) # Differentially private sparse cross entropy error based on Taylor Expansion ''' Computes differentially private sigmoid cross entropy given `logits`. Measures the probability error in discrete classification tasks in which each class is independent and not mutually exclusive. For brevity, let `x = logits`, `z = labels`. The logistic loss is z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x))) = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x))) = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x)) = (1 - z) * x + log(1 + exp(-x)) = x - x * z + log(1 + exp(-x)) For x < 0, to avoid overflow in exp(-x), we reformulate the above x - x * z + log(1 + exp(-x)) = log(exp(x)) - x * z + log(1 + exp(-x)) = - x * z + log(1 + exp(x)) Hence, to ensure stability and avoid overflow, the implementation uses this equivalent formulation max(x, 0) - x * z + log(1 + exp(-abs(x))) `logits` and `labels` must have the same type and shape. Let denote neg_abs_logits = -abs(x) = -abs(h * W). By Applying Taylor Expansion, we have: Taylor = max(x, 0) - x * z + log(1 + exp(-abs(x))); = max(h * W, 0) - (z * h) * W + (math.log(2.0) + 0.5*neg_abs_logits + 1.0/8.0*neg_abs_logits**2) = max(h * W, 0) - (z * h) * W + (math.log(2.0) + 0.5*(-abs(h * W)) + 1.0/8.0*(-abs(h * W))**2) = F1 + F2 where: F1 = max(h * W, 0) + (math.log(2.0) + 0.5*(-abs(h * W)) + 1.0/8.0*(-abs(h * W))**2) and F2 = - (z * h) * W To ensure that Taylor is differentially private, we need to perturb all the coefficients, including the terms h * W, z * h * W. Note that h is differentially private, since its computation on top of the DP Affine transformation does not access the original data. Therefore, F1 should be differentially private. We need to preserve DP in F2, which reads the groundtruth label z, as follows: By applying Funtional Mechanism, we perturb (z * h) * W as ((z * h) + perturbW) * W = (z * h) * W + (perturbW * W): perturbW = np.random.laplace(0.0, scale3, hk * 10) perturbW = np.reshape(perturbW, [hk, 10]); where scale3 = Delta3/(epsilon3*L) = 10*(hk + 1/4 * hk**2)/(epsilon3*L); (Lemma 5) To allow computing gradients at zero, we define custom versions of max and abs functions [Tensorflow]. Source: https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/python/ops/nn_impl.py @ TensorFlow ''' zeros = array_ops.zeros_like(logits, dtype=logits.dtype) cond = (logits >= zeros) relu_logits = array_ops.where(cond, logits, zeros) neg_abs_logits = array_ops.where(cond, -logits, logits) #Taylor = math_ops.add(relu_logits - y_conv * y_, math_ops.log1p(math_ops.exp(neg_abs_logits))) Taylor = math_ops.add( relu_logits - logits * labels, math.log(2.0) + 0.5 * neg_abs_logits + 1.0 / 8.0 * neg_abs_logits**2) zeros1 = array_ops.zeros_like(perturbW, dtype=perturbW.dtype) cond1 = (perturbW >= zeros1) perturbW = array_ops.where(cond1, perturbW, -perturbW) cross_entropy_mean = tf.reduce_mean( Taylor, name='cross_entropy') + tf.reduce_mean(perturbW, name='perturbW') tf.add_to_collection('losses', cross_entropy_mean) return tf.add_n(tf.get_collection('losses'), name='total_loss')
def call(self, inputs, training=None): if training is None: training = keras.backend.learning_phase() return tf_utils.smart_cond(training, lambda: array_ops.ones_like(inputs), lambda: array_ops.zeros_like(inputs))
def call(self, inputs): return keras.backend.in_train_phase( lambda: array_ops.ones_like(inputs), lambda: array_ops.zeros_like(inputs))
def random_gamma(shape, alpha, beta=None, dtype=dtypes.float32, seed=None, name=None): """Draws `shape` samples from each of the given Gamma distribution(s). `alpha` is the shape parameter describing the distribution(s), and `beta` is the inverse scale parameter(s). Note: Because internal calculations are done using `float64` and casting has `floor` semantics, we must manually map zero outcomes to the smallest possible positive floating-point value, i.e., `np.finfo(dtype).tiny`. This means that `np.finfo(dtype).tiny` occurs more frequently than it otherwise should. This bias can only happen for small values of `alpha`, i.e., `alpha << 1` or large values of `beta`, i.e., `beta >> 1`. The samples are differentiable w.r.t. alpha and beta. The derivatives are computed using the approach described in the paper [Michael Figurnov, Shakir Mohamed, Andriy Mnih. Implicit Reparameterization Gradients, 2018](https://arxiv.org/abs/1805.08498) Example: ```python samples = tf.random_gamma([10], [0.5, 1.5]) # samples has shape [10, 2], where each slice [:, 0] and [:, 1] represents # the samples drawn from each distribution samples = tf.random_gamma([7, 5], [0.5, 1.5]) # samples has shape [7, 5, 2], where each slice [:, :, 0] and [:, :, 1] # represents the 7x5 samples drawn from each of the two distributions alpha = tf.constant([[1.],[3.],[5.]]) beta = tf.constant([[3., 4.]]) samples = tf.random_gamma([30], alpha=alpha, beta=beta) # samples has shape [30, 3, 2], with 30 samples each of 3x2 distributions. loss = tf.reduce_mean(tf.square(samples)) dloss_dalpha, dloss_dbeta = tf.gradients(loss, [alpha, beta]) # unbiased stochastic derivatives of the loss function alpha.shape == dloss_dalpha.shape # True beta.shape == dloss_dbeta.shape # True ``` Args: shape: A 1-D integer Tensor or Python array. The shape of the output samples to be drawn per alpha/beta-parameterized distribution. alpha: A Tensor or Python value or N-D array of type `dtype`. `alpha` provides the shape parameter(s) describing the gamma distribution(s) to sample. Must be broadcastable with `beta`. beta: A Tensor or Python value or N-D array of type `dtype`. Defaults to 1. `beta` provides the inverse scale parameter(s) of the gamma distribution(s) to sample. Must be broadcastable with `alpha`. dtype: The type of alpha, beta, and the output: `float16`, `float32`, or `float64`. seed: A Python integer. Used to create a random seed for the distributions. See `tf.set_random_seed` for behavior. name: Optional name for the operation. Returns: samples: a `Tensor` of shape `tf.concat([shape, tf.shape(alpha + beta)], axis=0)` with values of type `dtype`. """ with ops.name_scope(name, "random_gamma", [shape, alpha, beta]): shape = ops.convert_to_tensor(shape, name="shape", dtype=dtypes.int32) alpha = ops.convert_to_tensor(alpha, name="alpha", dtype=dtype) beta = ops.convert_to_tensor(beta if beta is not None else 1, name="beta", dtype=dtype) alpha_broadcast = alpha + array_ops.zeros_like(beta) seed1, seed2 = random_seed.get_seed(seed) return math_ops.maximum( np.finfo(dtype.as_numpy_dtype).tiny, gen_random_ops.random_gamma( shape, alpha_broadcast, seed=seed1, seed2=seed2) / beta)
def mean_pairwise_squared_error(predictions, labels=None, weights=1.0, scope=None): """Adds a pairwise-errors-squared loss to the training procedure. Unlike `mean_squared_error`, which is a measure of the differences between corresponding elements of `predictions` and `labels`, `mean_pairwise_squared_error` is a measure of the differences between pairs of corresponding elements of `predictions` and `labels`. For example, if `labels`=[a, b, c] and `predictions`=[x, y, z], there are three pairs of differences are summed to compute the loss: loss = [ ((a-b) - (x-y)).^2 + ((a-c) - (x-z)).^2 + ((b-c) - (y-z)).^2 ] / 3 Note that since the inputs are of size [batch_size, d0, ... dN], the corresponding pairs are computed within each batch sample but not across samples within a batch. For example, if `predictions` represents a batch of 16 grayscale images of dimension [batch_size, 100, 200], then the set of pairs is drawn from each image, but not across images. `weights` acts as a coefficient for the loss. If a scalar is provided, then the loss is simply scaled by the given value. If `weights` is a tensor of size [batch_size], then the total loss for each sample of the batch is rescaled by the corresponding element in the `weights` vector. Args: predictions: The predicted outputs, a tensor of size [batch_size, d0, .. dN] where N+1 is the total number of dimensions in `predictions`. labels: The ground truth output tensor, whose shape must match the shape of the `predictions` tensor. weights: Coefficients for the loss a scalar, a tensor of shape [batch_size] or a tensor whose shape matches `predictions`. scope: The scope for the operations performed in computing the loss. Returns: A scalar `Tensor` representing the loss value. Raises: ValueError: If the shape of `predictions` doesn't match that of `labels` or if the shape of `weights` is invalid. """ with ops.name_scope(scope, "mean_pairwise_squared_error", [predictions, labels, weights]) as scope: predictions.get_shape().assert_is_compatible_with(labels.get_shape()) predictions = math_ops.to_float(predictions) labels = math_ops.to_float(labels) weights = math_ops.to_float(ops.convert_to_tensor(weights)) diffs = math_ops.subtract(predictions, labels) # Need to verify here since the function doesn't use compute_weighted_loss if diffs.get_shape().ndims is None: raise ValueError("diffs.get_shape().ndims cannot be None") if weights.get_shape().ndims is None: raise ValueError("weights.get_shape().ndims cannot be None") reduction_indices = list(range(1, diffs.get_shape().ndims)) sum_squares_diff_per_batch = math_ops.reduce_sum( math_ops.square(diffs), reduction_indices=reduction_indices) num_present_per_batch = _num_present(diffs, weights, per_batch=True) term1 = 2.0 * _safe_div(sum_squares_diff_per_batch, num_present_per_batch) sum_diff = math_ops.reduce_sum(diffs, reduction_indices=reduction_indices) term2 = 2.0 * _safe_div(math_ops.square(sum_diff), math_ops.square(num_present_per_batch)) loss = _scale_losses(term1 - term2, weights) mean_loss = array_ops.where( math_ops.reduce_sum(num_present_per_batch) > 0, loss, array_ops.zeros_like(loss), name="value") add_loss(mean_loss) return mean_loss
def _SvdGrad(op, grad_s, grad_u, grad_v): """Gradient for the singular value decomposition.""" # The derivation for the compute_uv=False case, and most of # the derivation for the full_matrices=True case, are in # Giles' paper (see reference at top of file). A derivation for # the full_matrices=False case is available at # https://j-towns.github.io/papers/svd-derivative.pdf a = op.inputs[0] a_shape = a.get_shape().with_rank_at_least(2) grad_s_mat = array_ops.matrix_diag(grad_s) if not op.get_attr("compute_uv"): s, u, v = linalg_ops.svd(a, compute_uv=True) grad_a = math_ops.matmul(u, math_ops.matmul(grad_s_mat, v, adjoint_b=True)) grad_a.set_shape(a_shape) return grad_a full_matrices = op.get_attr("full_matrices") # TODO(rmlarsen): Make this work with complex types. if a.dtype.is_complex: raise NotImplementedError( "SVD gradient is not implemented for complex types and " "compute_uv=True.") grad_u_shape = grad_u.get_shape().with_rank_at_least(2) grad_v_shape = grad_v.get_shape().with_rank_at_least(2) m = a_shape.dims[-2].merge_with(grad_u_shape[-2]) n = a_shape.dims[-1].merge_with(grad_v_shape[-2]) batch_shape = a_shape[:-2].merge_with(grad_u_shape[:-2]).merge_with( grad_v_shape[:-2]) a_shape = batch_shape.concatenate([m, n]) m = a_shape.dims[-2].value n = a_shape.dims[-1].value # TODO(rmlarsen): Make this work with placeholders. if m is None or n is None: raise NotImplementedError( "SVD gradient has not been implemented for input with unknown " "inner matrix shape.") s = op.outputs[0] u = op.outputs[1] v = op.outputs[2] use_adjoint = False if m > n: # Compute the gradient for A^H = V * S^T * U^H, and (implicitly) take the # Hermitian transpose of the gradient at the end. use_adjoint = True m, n = n, m u, v = v, u grad_u, grad_v = grad_v, grad_u with ops.control_dependencies([grad_s, grad_u, grad_v]): if full_matrices and abs(m - n) > 1: raise NotImplementedError( "svd gradient is not implemented for abs(m - n) > 1 " "when full_matrices is True") s_mat = array_ops.matrix_diag(s) s2 = math_ops.square(s) # NOTICE: Because of the term involving f, the gradient becomes # infinite (or NaN in practice) when singular values are not unique. # Mathematically this should not be surprising, since for (k-fold) # degenerate singular values, the corresponding singular vectors are # only defined up a (k-dimensional) subspace. In practice, this can # lead to numerical instability when singular values are close but not # exactly equal. f = array_ops.matrix_set_diag( math_ops.reciprocal( array_ops.expand_dims(s2, -2) - array_ops.expand_dims(s2, -1)), array_ops.zeros_like(s)) s_inv_mat = array_ops.matrix_diag(math_ops.reciprocal(s)) v1 = v[..., :, :m] grad_v1 = grad_v[..., :, :m] u_gu = math_ops.matmul(u, grad_u, adjoint_a=True) v_gv = math_ops.matmul(v1, grad_v1, adjoint_a=True) f_u = f * u_gu f_v = f * v_gv term1_nouv = ( grad_s_mat + math_ops.matmul(f_u + _linalg.adjoint(f_u), s_mat) + math_ops.matmul(s_mat, f_v + _linalg.adjoint(f_v))) term1 = math_ops.matmul(u, math_ops.matmul(term1_nouv, v1, adjoint_b=True)) if m == n: grad_a_before_transpose = term1 else: gv1t = array_ops.matrix_transpose(grad_v1) gv1t_v1 = math_ops.matmul(gv1t, v1) term2_nous = gv1t - math_ops.matmul(gv1t_v1, v1, adjoint_b=True) if full_matrices: v2 = v[..., :, m:n] grad_v2 = grad_v[..., :, m:n] v1t_gv2 = math_ops.matmul(v1, grad_v2, adjoint_a=True) term2_nous -= math_ops.matmul(v1t_gv2, v2, adjoint_b=True) u_s_inv = math_ops.matmul(u, s_inv_mat) term2 = math_ops.matmul(u_s_inv, term2_nous) grad_a_before_transpose = term1 + term2 if use_adjoint: grad_a = array_ops.matrix_transpose(grad_a_before_transpose) else: grad_a = grad_a_before_transpose grad_a.set_shape(a_shape) return grad_a
def loop_fn(i): x1 = array_ops.gather(x, i) z = array_ops.zeros_like(x1), return z, z + x1
def _SvdGrad(op, grad_s, grad_u, grad_v): """Gradient for Svd based on Giles' algorithm. Reference at top of file.""" def _Adjoint(x): return math_ops.conj(array_ops.matrix_transpose(x)) if op.get_attr("compute_uv") and not op.get_attr("full_matrices"): raise NotImplementedError( "SVD gradient is not implemented for compute_uv=True and " "full_matrices=False.") a = op.inputs[0] a_shape = a.get_shape().with_rank_at_least(2) if op.get_attr("compute_uv"): # TODO(rmlarsen): Make this work with complex types. if a.dtype.is_complex: raise NotImplementedError( "SVD gradient is not implemented for complex types and " "compute_uv=True.") grad_u_shape = grad_u.get_shape().with_rank_at_least(2) grad_v_shape = grad_v.get_shape().with_rank_at_least(2) m = a_shape[-2].merge_with(grad_u_shape[-2]) n = a_shape[-1].merge_with(grad_v_shape[-2]) batch_shape = a_shape[:-2].merge_with(grad_u_shape[:-2]).merge_with( grad_v_shape[:-2]) a_shape = batch_shape.concatenate([m, n]) m = a_shape[-2].value n = a_shape[-1].value # TODO(rmlarsen): Make this work with placeholders. if m is None or n is None: raise NotImplementedError( "SVD gradient has not been implemented for input with unknown " "inner matrix shape.") if not op.get_attr("full_matrices") or not op.get_attr("compute_uv"): s, u, v = linalg_ops.svd(a, compute_uv=True, full_matrices=True) else: s = op.outputs[0] u = op.outputs[1] v = op.outputs[2] use_adjoint = False if m > n: # Compute the gradient for A^H = V * S^T * U^H, and (implicitly) take the # Hermitian transpose of the gradient at the end. use_adjoint = True m, n = n, m u, v = v, u grad_u, grad_v = grad_v, grad_u with ops.control_dependencies([grad_s, grad_u, grad_v]): grad_s_mat = array_ops.matrix_diag(grad_s) if not op.get_attr("compute_uv"): if use_adjoint: grad_a = math_ops.matmul(v[..., :, :m], math_ops.matmul(u, grad_s_mat), adjoint_b=True) else: grad_a = math_ops.matmul( u, math_ops.matmul(grad_s_mat, v[..., :, :m], adjoint_b=True)) grad_a.set_shape(a_shape) return grad_a # TODO(rmlarsen): Define a gradient that is numerically stable for # abs(m-n) > 1. Currently this does not work because there are effectively # multiple singular values with value zero. I am not sure if this is a true # instability or if it simply throws off the finite difference gradient # checker. if abs(m - n) > 1: raise NotImplementedError( "svd gradient is not implemented for abs(m - n) > 1") s_mat = array_ops.matrix_diag(s) s2 = math_ops.square(s) # NOTICE: Because of the term involving f, the gradient becomes # infinite (or NaN in practice) when singular values are not unique. # Mathematically this should not be surprising, since for (k-fold) # degenerate singular values, the corresponding singular vectors are # only defined up a (k-dimensional) subspace. In practice, this can # lead to numerical instability when singular values are close but not # exactly equal. f = array_ops.matrix_set_diag( math_ops.reciprocal( array_ops.expand_dims(s2, -2) - array_ops.expand_dims(s2, -1)), array_ops.zeros_like(s)) s_inv_mat = array_ops.matrix_diag(math_ops.reciprocal(s)) u_gu = math_ops.matmul(u, grad_u, adjoint_a=True) v_gv = math_ops.matmul(v, grad_v, adjoint_a=True) if m == n: f_u = f * u_gu f_v = f * v_gv else: dv2 = array_ops.matrix_transpose( v_gv[..., m:n, :m]) - v_gv[..., :m, m:n] f_u = f * u_gu f_v = f * v_gv[..., :m, :m] grad_a_nouv = (grad_s_mat + math_ops.matmul(f_u + _Adjoint(f_u), s_mat) + math_ops.matmul(s_mat, f_v + _Adjoint(f_v))) if m != n: grad_a_nouv = array_ops.concat( [grad_a_nouv, math_ops.matmul(s_inv_mat, dv2)], -1) if use_adjoint: # Use (U X V^H)^H = V (U X)^H. grad_a = math_ops.matmul(v, math_ops.matmul(u, grad_a_nouv), adjoint_b=True) else: grad_a = math_ops.matmul( u, math_ops.matmul(grad_a_nouv, v, adjoint_b=True)) grad_a.set_shape(a_shape) return grad_a
def dynamic_decode(decoder, output_time_major=False, impute_finished=False, maximum_iterations=None, parallel_iterations=32, swap_memory=False, scope=None): """Perform dynamic decoding with `decoder`. Calls initialize() once and step() repeatedly on the Decoder object. Args: decoder: A `Decoder` instance. output_time_major: Python boolean. Default: `False` (batch major). If `True`, outputs are returned as time major tensors (this mode is faster). Otherwise, outputs are returned as batch major tensors (this adds extra time to the computation). impute_finished: Python boolean. If `True`, then states for batch entries which are marked as finished get copied through and the corresponding outputs get zeroed out. This causes some slowdown at each time step, but ensures that the final state and outputs have the correct values and that backprop ignores time steps that were marked as finished. maximum_iterations: `int32` scalar, maximum allowed number of decoding steps. Default is `None` (decode until the decoder is fully done). parallel_iterations: Argument passed to `tf.while_loop`. swap_memory: Argument passed to `tf.while_loop`. scope: Optional variable scope to use. Returns: `(final_outputs, final_state, final_sequence_lengths)`. Raises: TypeError: if `decoder` is not an instance of `Decoder`. ValueError: if `maximum_iterations` is provided but is not a scalar. """ if not isinstance(decoder, tf.contrib.seq2seq.Decoder): raise TypeError("Expected decoder to be type Decoder, but saw: %s" % type(decoder)) with variable_scope.variable_scope(scope, "decoder") as varscope: # Properly cache variable values inside the while_loop if varscope.caching_device is None: varscope.set_caching_device(lambda op: op.device) if maximum_iterations is not None: maximum_iterations = ops.convert_to_tensor( maximum_iterations, dtype=dtypes.int32, name="maximum_iterations") if maximum_iterations.get_shape().ndims != 0: raise ValueError("maximum_iterations must be a scalar") initial_finished, initial_inputs, initial_state, hist = decoder.initialize() zero_outputs = _create_zero_outputs(decoder.output_size, decoder.output_dtype, decoder.batch_size) if maximum_iterations is not None: initial_finished = math_ops.logical_or( initial_finished, 0 >= maximum_iterations) initial_sequence_lengths = array_ops.zeros_like( initial_finished, dtype=dtypes.int32) initial_time = constant_op.constant(0, dtype=dtypes.int32) def _shape(batch_size, from_shape): if not isinstance(from_shape, tensor_shape.TensorShape): return tensor_shape.TensorShape(None) else: batch_size = tensor_util.constant_value( ops.convert_to_tensor( batch_size, name="batch_size")) return tensor_shape.TensorShape([batch_size]).concatenate(from_shape) def _create_ta(s, d): return tensor_array_ops.TensorArray( dtype=d, size=0, dynamic_size=True, element_shape=_shape(decoder.batch_size, s)) initial_outputs_ta = nest.map_structure(_create_ta, decoder.output_size, decoder.output_dtype) def condition(unused_time, unused_outputs_ta, unused_state, unused_inputs, finished, unused_sequence_lengths, unused_hist): return math_ops.logical_not(math_ops.reduce_all(finished)) def body(time, outputs_ta, state, inputs, finished, sequence_lengths, hist): """Internal while_loop body. Args: time: scalar int32 tensor. outputs_ta: structure of TensorArray. state: (structure of) state tensors and TensorArrays. inputs: (structure of) input tensors. finished: bool tensor (keeping track of what's finished). sequence_lengths: int32 tensor (keeping track of time of finish). Returns: `(time + 1, outputs_ta, next_state, next_inputs, next_finished, next_sequence_lengths)`. ``` """ (next_outputs, decoder_state, next_inputs, decoder_finished, hist) = decoder.step(time, inputs, state, hist) if decoder.tracks_own_finished: next_finished = decoder_finished else: next_finished = math_ops.logical_or(decoder_finished, finished) if maximum_iterations is not None: next_finished = math_ops.logical_or( next_finished, time + 1 >= maximum_iterations) next_sequence_lengths = array_ops.where( math_ops.logical_and(math_ops.logical_not(finished), next_finished), array_ops.fill(array_ops.shape(sequence_lengths), time + 1), sequence_lengths) nest.assert_same_structure(state, decoder_state) nest.assert_same_structure(outputs_ta, next_outputs) nest.assert_same_structure(inputs, next_inputs) # Zero out output values past finish if impute_finished: emit = nest.map_structure( lambda out, zero: array_ops.where(finished, zero, out), next_outputs, zero_outputs) else: emit = next_outputs # Copy through states past finish def _maybe_copy_state(new, cur): # TensorArrays and scalar states get passed through. if isinstance(cur, tensor_array_ops.TensorArray): pass_through = True else: new.set_shape(cur.shape) pass_through = (new.shape.ndims == 0) return new if pass_through else array_ops.where(finished, cur, new) if impute_finished: next_state = nest.map_structure( _maybe_copy_state, decoder_state, state) else: next_state = decoder_state outputs_ta = nest.map_structure(lambda ta, out: ta.write(time, out), outputs_ta, emit) return (time + 1, outputs_ta, next_state, next_inputs, next_finished, next_sequence_lengths, hist) res = control_flow_ops.while_loop( condition, body, loop_vars=[ initial_time, initial_outputs_ta, initial_state, initial_inputs, initial_finished, initial_sequence_lengths, hist ], parallel_iterations=parallel_iterations, swap_memory=swap_memory) final_outputs_ta = res[1] final_state = res[2] final_sequence_lengths = res[5] final_outputs = nest.map_structure(lambda ta: ta.stack(), final_outputs_ta) try: final_outputs, final_state = decoder.finalize( final_outputs, final_state, final_sequence_lengths) except NotImplementedError: pass if not output_time_major: final_outputs = nest.map_structure(_transpose_batch_time, final_outputs) return final_outputs, final_state, final_sequence_lengths
def histogram_fixed_width(values, value_range, nbins=100, use_locking=True, dtype=dtypes.int32, name=None): """Return histogram of values. Given the tensor `values`, this operation returns a rank 1 histogram counting the number of entries in `values` that fell into every bin. The bins are equal width and determined by the arguments `value_range` and `nbins`. Args: values: Numeric `Tensor`. value_range: Shape [2] `Tensor`. new_values <= value_range[0] will be mapped to hist[0], values >= value_range[1] will be mapped to hist[-1]. Must be same dtype as new_values. nbins: Integer number of bins in this histogram. use_locking: Boolean. If `True`, use locking during the operation (optional). dtype: dtype for returned histogram. name: A name for this operation (defaults to 'histogram_fixed_width'). Returns: A `Variable` holding histogram of values. Examples: ```python # Bins will be: (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf) nbins = 5 value_range = [0.0, 5.0] new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15] with tf.default_session() as sess: hist = tf.histogram_fixed_width(new_values, value_range, nbins=5) variables.initialize_all_variables().run() sess.run(hist) => [2, 1, 1, 0, 2] ``` """ with variable_scope.variable_op_scope([values, value_range], name, 'histogram_fixed_width') as scope: values = ops.convert_to_tensor(values, name='values') values = array_ops.reshape(values, [-1]) value_range = ops.convert_to_tensor(value_range, name='value_range') # Map tensor values that fall within value_range to [0, 1]. scaled_values = math_ops.truediv(values - value_range[0], value_range[1] - value_range[0], name='scaled_values') # map tensor values within the open interval value_range to {0,.., nbins-1}, # values outside the open interval will be zero or less, or nbins or more. indices = math_ops.floor(nbins * scaled_values, name='indices') # Clip edge cases (e.g. value = value_range[1]) or "outliers." indices = math_ops.cast(clip_ops.clip_by_value(indices, 0, nbins - 1), dtypes.int32) # Dummy vector to scatter. # TODO(langmore) Replace non-ideal creation of large dummy vector once an # alternative to scatter is available. updates = array_ops.ones_like(indices, dtype=dtype) hist = variable_scope.get_variable( 'hist', initializer=array_ops.zeros_initializer([nbins], dtype=dtype), trainable=False) hist_assign_zero = hist.assign(array_ops.zeros_like(hist)) with ops.control_dependencies([hist_assign_zero]): return state_ops.scatter_add(hist, indices, updates, use_locking=use_locking, name=scope.name)
def sigmoid_cross_entropy_with_logits(logits, targets, name=None): """Computes sigmoid cross entropy given `logits`. Measures the probability error in discrete classification tasks in which each class is independent and not mutually exclusive. For instance, one could perform multilabel classification where a picture can contain both an elephant and a dog at the same time. For brevity, let `x = logits`, `z = targets`. The logistic loss is z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x))) = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x))) = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x)) = (1 - z) * x + log(1 + exp(-x)) = x - x * z + log(1 + exp(-x)) For x < 0, to avoid overflow in exp(-x), we reformulate the above x - x * z + log(1 + exp(-x)) = log(exp(x)) - x * z + log(1 + exp(-x)) = - x * z + log(1 + exp(x)) Hence, to ensure stability and avoid overflow, the implementation uses this equivalent formulation max(x, 0) - x * z + log(1 + exp(-abs(x))) `logits` and `targets` must have the same type and shape. Args: logits: A `Tensor` of type `float32` or `float64`. targets: A `Tensor` of the same type and shape as `logits`. name: A name for the operation (optional). Returns: A `Tensor` of the same shape as `logits` with the componentwise logistic losses. Raises: ValueError: If `logits` and `targets` do not have the same shape. """ with ops.name_scope(name, "logistic_loss", [logits, targets]) as name: logits = ops.convert_to_tensor(logits, name="logits") targets = ops.convert_to_tensor(targets, name="targets") try: targets.get_shape().merge_with(logits.get_shape()) except ValueError: raise ValueError( "logits and targets must have the same shape (%s vs %s)" % (logits.get_shape(), targets.get_shape())) # The logistic loss formula from above is # x - x * z + log(1 + exp(-x)) # For x < 0, a more numerically stable formula is # -x * z + log(1 + exp(x)) # Note that these two expressions can be combined into the following: # max(x, 0) - x * z + log(1 + exp(-abs(x))) # To allow computing gradients at zero, we define custom versions of max and # abs functions. zeros = array_ops.zeros_like(logits, dtype=logits.dtype) cond = (logits >= zeros) relu_logits = math_ops.select(cond, logits, zeros) neg_abs_logits = math_ops.select(cond, -logits, logits) return math_ops.add(relu_logits - logits * targets, math_ops.log1p(math_ops.exp(neg_abs_logits)), name=name)
def _ComplexAbsGrad(op, grad): """Returns the gradient of ComplexAbs.""" # TODO(b/27786104): The cast to complex could be removed once arithmetic # supports mixtures of complex64 and real values. return (math_ops.complex(grad, array_ops.zeros_like(grad)) * math_ops.sign(op.inputs[0]))
def call(self, inputs, training=None): original_training_value = training if training is None: training = K.learning_phase() in_eager_mode = context.executing_eagerly() if self.virtual_batch_size is not None: # Virtual batches (aka ghost batches) can be simulated by reshaping the # Tensor and reusing the existing batch norm implementation original_shape = [-1] + inputs.shape.as_list()[1:] expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:] # Will cause errors if virtual_batch_size does not divide the batch size inputs = array_ops.reshape(inputs, expanded_shape) def undo_virtual_batching(outputs): outputs = array_ops.reshape(outputs, original_shape) return outputs if self.fused: outputs = self._fused_batch_norm(inputs, training=training) if self.virtual_batch_size is not None: # Currently never reaches here since fused_batch_norm does not support # virtual batching outputs = undo_virtual_batching(outputs) if not context.executing_eagerly( ) and original_training_value is None: outputs._uses_learning_phase = True # pylint: disable=protected-access return outputs # Compute the axes along which to reduce the mean / variance input_shape = inputs.get_shape() ndims = len(input_shape) reduction_axes = [i for i in range(ndims) if i not in self.axis] if self.virtual_batch_size is not None: del reduction_axes[1] # Do not reduce along virtual batch dim # Broadcasting only necessary for single-axis batch norm where the axis is # not the last dimension broadcast_shape = [1] * ndims broadcast_shape[self.axis[0]] = input_shape[self.axis[0]].value def _broadcast(v): if (v is not None and len(v.get_shape()) != ndims and reduction_axes != list(range(ndims - 1))): return array_ops.reshape(v, broadcast_shape) return v scale, offset = _broadcast(self.gamma), _broadcast(self.beta) def _compose_transforms(scale, offset, then_scale, then_offset): if then_scale is not None: scale *= then_scale offset *= then_scale if then_offset is not None: offset += then_offset return (scale, offset) # Determine a boolean value for `training`: could be True, False, or None. training_value = tf_utils.constant_value(training) if training_value is not False: if self.adjustment: adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs)) # Adjust only during training. adj_scale = tf_utils.smart_cond( training, lambda: adj_scale, lambda: array_ops.ones_like(adj_scale)) adj_bias = tf_utils.smart_cond( training, lambda: adj_bias, lambda: array_ops.zeros_like(adj_bias)) scale, offset = _compose_transforms(adj_scale, adj_bias, scale, offset) # Some of the computations here are not necessary when training==False # but not a constant. However, this makes the code simpler. keep_dims = self.virtual_batch_size is not None or len( self.axis) > 1 mean, variance = nn.moments(inputs, reduction_axes, keep_dims=keep_dims) moving_mean = self.moving_mean moving_variance = self.moving_variance mean = tf_utils.smart_cond(training, lambda: mean, lambda: moving_mean) variance = tf_utils.smart_cond(training, lambda: variance, lambda: moving_variance) if self.renorm: r, d, new_mean, new_variance = self._renorm_correction_and_moments( mean, variance, training) # When training, the normalized values (say, x) will be transformed as # x * gamma + beta without renorm, and (x * r + d) * gamma + beta # = x * (r * gamma) + (d * gamma + beta) with renorm. r = _broadcast(array_ops.stop_gradient(r, name='renorm_r')) d = _broadcast(array_ops.stop_gradient(d, name='renorm_d')) scale, offset = _compose_transforms(r, d, scale, offset) else: new_mean, new_variance = mean, variance if self.virtual_batch_size is not None: # This isn't strictly correct since in ghost batch norm, you are # supposed to sequentially update the moving_mean and moving_variance # with each sub-batch. However, since the moving statistics are only # used during evaluation, it is more efficient to just update in one # step and should not make a significant difference in the result. new_mean = math_ops.reduce_mean(new_mean, axis=1, keepdims=True) new_variance = math_ops.reduce_mean(new_variance, axis=1, keepdims=True) def _do_update(var, value): if in_eager_mode and not self.trainable: return return self._assign_moving_average(var, value, self.momentum) mean_update = tf_utils.smart_cond( training, lambda: _do_update(self.moving_mean, new_mean), lambda: self.moving_mean) variance_update = tf_utils.smart_cond( training, lambda: _do_update(self.moving_variance, new_variance), lambda: self.moving_variance) if not context.executing_eagerly(): self.add_update(mean_update, inputs=True) self.add_update(variance_update, inputs=True) else: mean, variance = self.moving_mean, self.moving_variance outputs = nn.batch_normalization(inputs, _broadcast(mean), _broadcast(variance), offset, scale, self.epsilon) # If some components of the shape got lost due to adjustments, fix that. outputs.set_shape(input_shape) if self.virtual_batch_size is not None: outputs = undo_virtual_batching(outputs) if not context.executing_eagerly() and original_training_value is None: outputs._uses_learning_phase = True # pylint: disable=protected-access return outputs
def _compute_sampled_logits(weights, biases, inputs, labels, num_sampled, num_classes, num_true=1, sampled_values=None, subtract_log_q=True, remove_accidental_hits=False, partition_strategy="mod", name=None): if not isinstance(weights, list): weights = [weights] with ops.name_scope(name, "compute_sampled_logits", weights + [biases, inputs, labels]): if labels.dtype != dtypes.int64: labels = math_ops.cast(labels, dtypes.int64) labels_flat = array_ops.reshape(labels, [-1]) # Sample the negative labels. # sampled shape: [num_sampled] tensor # true_expected_count shape = [batch_size, 1] tensor # sampled_expected_count shape = [num_sampled] tensor if sampled_values is None: sampled_values = candidate_sampling_ops.log_uniform_candidate_sampler( true_classes=labels, num_true=num_true, num_sampled=num_sampled, unique=True, range_max=num_classes) # NOTE: pylint cannot tell that 'sampled_values' is a sequence # pylint: disable=unpacking-non-sequence sampled, true_expected_count, sampled_expected_count = sampled_values # pylint: enable=unpacking-non-sequence # labels_flat is a [batch_size * num_true] tensor # sampled is a [num_sampled] int tensor all_ids = array_ops.concat(0, [labels_flat, sampled]) # weights shape is [num_classes, dim] all_w = embedding_ops.embedding_lookup( weights, all_ids, partition_strategy=partition_strategy) all_b = embedding_ops.embedding_lookup(biases, all_ids) # true_w shape is [batch_size * num_true, dim] # true_b is a [batch_size * num_true] tensor true_w = array_ops.slice(all_w, [0, 0], array_ops.pack( [array_ops.shape(labels_flat)[0], -1])) # 128*128 true_b = array_ops.slice(all_b, [0], array_ops.shape(labels_flat)) # inputs shape is [batch_size, dim] # true_w shape is [batch_size * num_true, dim] # row_wise_dots is [batch_size, num_true, dim] dim = array_ops.shape(true_w)[1:2] new_true_w_shape = array_ops.concat(0, [[-1, num_true], dim]) row_wise_dots = math_ops.mul( array_ops.expand_dims(inputs, 1), # 128*1*128 array_ops.reshape(true_w, new_true_w_shape)) # 128*1*128 # We want the row-wise dot plus biases which yields a # [batch_size, num_true] tensor of true_logits. dots_as_matrix = array_ops.reshape(row_wise_dots, array_ops.concat(0, [[-1], dim])) true_logits = array_ops.reshape(_sum_rows(dots_as_matrix), [-1, num_true]) true_b = array_ops.reshape(true_b, [-1, num_true]) true_logits += true_b # Lookup weights and biases for sampled labels. # sampled_w shape is [num_sampled, dim] # sampled_b is a [num_sampled] float tensor sampled_w = array_ops.slice( all_w, array_ops.pack([array_ops.shape(labels_flat)[0], 0]), [-1, -1]) sampled_b = array_ops.slice(all_b, array_ops.shape(labels_flat), [-1]) # inputs has shape [batch_size, dim] # sampled_w has shape [num_sampled, dim] # sampled_b has shape [num_sampled] # Apply X*W'+B, which yields [batch_size, num_sampled] sampled_logits = math_ops.matmul(inputs, sampled_w, transpose_b=True) + sampled_b if remove_accidental_hits: acc_hits = candidate_sampling_ops.compute_accidental_hits( labels, sampled, num_true=num_true) acc_indices, acc_ids, acc_weights = acc_hits # This is how SparseToDense expects the indices. acc_indices_2d = array_ops.reshape(acc_indices, [-1, 1]) acc_ids_2d_int32 = array_ops.reshape( math_ops.cast(acc_ids, dtypes.int32), [-1, 1]) sparse_indices = array_ops.concat( 1, [acc_indices_2d, acc_ids_2d_int32], "sparse_indices") # Create sampled_logits_shape = [batch_size, num_sampled] sampled_logits_shape = array_ops.concat(0, [ array_ops.shape(labels)[:1], array_ops.expand_dims(num_sampled, 0) ]) if sampled_logits.dtype != acc_weights.dtype: acc_weights = math_ops.cast(acc_weights, sampled_logits.dtype) sampled_logits += sparse_ops.sparse_to_dense( sparse_indices, sampled_logits_shape, acc_weights, default_value=0.0, validate_indices=False) if subtract_log_q: # Subtract log of Q(l), prior probability that l appears in sampled. true_logits -= math_ops.log(true_expected_count) sampled_logits -= math_ops.log(sampled_expected_count) # Construct output logits and labels. The true labels/logits start at col 0. out_logits = array_ops.concat(1, [true_logits, sampled_logits]) # true_logits is a float tensor, ones_like(true_logits) is a float tensor # of ones. We then divide by num_true to ensure the per-example labels sum # to 1.0, i.e. form a proper probability distribution. out_labels = array_ops.concat(1, [ array_ops.ones_like(true_logits) / num_true, array_ops.zeros_like(sampled_logits) ]) return out_logits, out_labels
def get_dp_train_ops(self, epsilon, data_size, learning_rate=0.1): # compute Laplace noise injected into coefficients h Delta = self.n_in * (self.n_out + 1 / 4 * self.n_out**2) perturbFM = np.random.laplace(0.0, Delta / (epsilon * data_size), self.n_out) perturbFM = np.reshape(perturbFM, [self.n_out]) # compute h terms activation_h = self.propup(self.input) + perturbFM # reconstruct v terms activation_v = self.propdown(activation_h) ''' Computes sigmoid cross entropy given `logits`, i.e., x = logits = activation_v, and z = self.input = labels. Measures the probability error in discrete classification tasks in which each class, i.e., each visible neuron, is independent and not mutually exclusive. The logistic loss is z * -log(sigmoid(x)) + (1 - z) * -log(1 - sigmoid(x)) = z * -log(1 / (1 + exp(-x))) + (1 - z) * -log(exp(-x) / (1 + exp(-x))) = z * log(1 + exp(-x)) + (1 - z) * (-log(exp(-x)) + log(1 + exp(-x))) = z * log(1 + exp(-x)) + (1 - z) * (x + log(1 + exp(-x)) = (1 - z) * x + log(1 + exp(-x)) = x - x * z + log(1 + exp(-x)) For x < 0, to avoid overflow in exp(-x), we reformulate the above x - x * z + log(1 + exp(-x)) = log(exp(x)) - x * z + log(1 + exp(-x)) = - x * z + log(1 + exp(x)) Hence, to ensure stability and avoid overflow, the implementation uses this equivalent formulation max(x, 0) - x * z + log(1 + exp(-abs(x))) `logits` and `labels` must have the same type and shape. Let denote neg_abs_logits = -abs(activation_v) = -abs(W^T * activation_h). By Applying Taylor Expansion, we have: Taylor = max(activation_v, 0) - activation_v * self.input + log(1 + exp(-abs(activation_v))); = max(activation_h * W^T, 0) - self.input * (activation_h * W^T) + (math.log(2.0) + 0.5*neg_abs_logits + 1.0/8.0*neg_abs_logits**2) = max(activation_h * W^T, 0) - self.input * (activation_h * W^T) + (math.log(2.0) + 0.5*(-abs(activation_h * W^T)) + 1.0/8.0*(-abs(activation_h * W^T))**2) To ensure that Taylor is differentially private, we need to perturb all the coefficients, including the terms activation_h * W^T, self.input * (W^T * activation_h). Since 'self.input *' is an element-wise multiplication, activation_h can be considered coefficients of the term self.input * (W^T * activation_h). By applying Funtional Mechanism, we perturb self.input * (W^T * activation_h) as tf.matmul(activation_h + perturbFM, W^T) * self.input: activation_h += perturbFM; # activation_h = self.propup(self.input) + perturbFM; # (Lemma 2) where Delta = self.n_in*(self.n_out + 1/4 * self.n_out**2); perturbFM = np.random.laplace(0.0, Delta/(epsilon*data_size), self.n_out) perturbFM = np.reshape(perturbFM, [self.n_out]); To allow computing gradients at zero, we define custom versions of max and abs functions [Tensorflow]. Source: https://github.com/tensorflow/tensorflow/blob/r1.4/tensorflow/python/ops/nn_impl.py @ TensorFlow ''' zeros = array_ops.zeros_like(activation_v, dtype=self.input.dtype) cond = (activation_v >= zeros) relu_logits = array_ops.where(cond, activation_v, zeros) neg_abs_logits = array_ops.where(cond, -activation_v, activation_v) #Taylor = math_ops.add(relu_logits - activation_v * self.input, math.log(2.0) + 0.5*neg_abs_logits + 1.0/8.0*neg_abs_logits**2) self.cost = math_ops.add( relu_logits - activation_v * self.input, math.log(2.0) + 0.5 * neg_abs_logits + 1.0 / 8.0 * neg_abs_logits**2) optimizer = tf.train.AdamOptimizer(learning_rate).minimize( self.cost, var_list=self.params) return optimizer
def safe_embedding_lookup_sparse(embedding_weights, sparse_ids, sparse_weights=None, combiner="mean", default_id=None, name=None, partition_strategy="div", max_norm=None): """Lookup embedding results, accounting for invalid IDs and empty features. The partitioned embedding in `embedding_weights` must all be the same shape except for the first dimension. The first dimension is allowed to vary as the vocabulary size is not necessarily a multiple of `P`. `embedding_weights` may be a `PartitionedVariable` as returned by using `tf.compat.v1.get_variable()` with a partitioner. Invalid IDs (< 0) are pruned from input IDs and weights, as well as any IDs with non-positive weight. For an entry with no features, the embedding vector for `default_id` is returned, or the 0-vector if `default_id` is not supplied. The ids and weights may be multi-dimensional. Embeddings are always aggregated along the last dimension. Args: embedding_weights: A list of `P` float `Tensor`s or values representing partitioned embedding `Tensor`s. Alternatively, a `PartitionedVariable` created by partitioning along dimension 0. The total unpartitioned shape should be `[e_0, e_1, ..., e_m]`, where `e_0` represents the vocab size and `e_1, ..., e_m` are the embedding dimensions. sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the ids. `d_0` is typically batch size. sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing float weights corresponding to `sparse_ids`, or `None` if all weights are be assumed to be 1.0. combiner: A string specifying how to combine embedding results for each entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean" the default. default_id: The id to use for an entry with no features. name: A name for this operation (optional). partition_strategy: A string specifying the partitioning strategy. Currently `"div"` and `"mod"` are supported. Default is `"div"`. max_norm: If not `None`, all embeddings are l2-normalized to max_norm before combining. Returns: Dense `Tensor` of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`. Raises: ValueError: if `embedding_weights` is empty. """ if embedding_weights is None: raise ValueError("Missing embedding_weights %s." % embedding_weights) if isinstance(embedding_weights, variables.PartitionedVariable): embedding_weights = list(embedding_weights) # get underlying Variables. if not isinstance(embedding_weights, list): embedding_weights = [embedding_weights] if len(embedding_weights) < 1: raise ValueError("Missing embedding_weights %s." % embedding_weights) dtype = sparse_weights.dtype if sparse_weights is not None else None embedding_weights = [ w if (isinstance(w, resource_variable_ops.ResourceVariable) and dtype in (None, w.dtype)) else ops.convert_to_tensor(w, dtype=dtype) for w in embedding_weights ] with ops.name_scope(name, "embedding_lookup", embedding_weights + [sparse_ids, sparse_weights]) as scope: # Reshape higher-rank sparse ids and weights to linear segment ids. original_shape = sparse_ids.dense_shape original_rank_dim = tensor_shape.dimension_value( sparse_ids.dense_shape.get_shape()[0]) original_rank = ( array_ops.size(original_shape) if original_rank_dim is None else original_rank_dim) sparse_ids = sparse_ops.sparse_reshape(sparse_ids, [ math_ops.reduce_prod( array_ops.slice(original_shape, [0], [original_rank - 1])), array_ops.gather(original_shape, original_rank - 1) ]) if sparse_weights is not None: sparse_weights = sparse_tensor.SparseTensor(sparse_ids.indices, sparse_weights.values, sparse_ids.dense_shape) # Prune invalid ids and weights. sparse_ids, sparse_weights = _prune_invalid_ids(sparse_ids, sparse_weights) if combiner != "sum": sparse_ids, sparse_weights = _prune_invalid_weights( sparse_ids, sparse_weights) # Fill in dummy values for empty features, if necessary. sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows( sparse_ids, default_id or 0) if sparse_weights is not None: sparse_weights, _ = sparse_ops.sparse_fill_empty_rows(sparse_weights, 1.0) result = embedding_lookup_sparse( embedding_weights, sparse_ids, sparse_weights, combiner=combiner, partition_strategy=partition_strategy, name=None if default_id is None else scope, max_norm=max_norm) if default_id is None: # Broadcast is_row_empty to the same shape as embedding_lookup_result, # for use in Select. is_row_empty = array_ops.tile( array_ops.reshape(is_row_empty, [-1, 1]), array_ops.stack([1, array_ops.shape(result)[1]])) result = array_ops.where( is_row_empty, array_ops.zeros_like(result), result, name=scope) # Reshape back from linear ids back into higher-dimensional dense result. final_result = array_ops.reshape( result, array_ops.concat([ array_ops.slice( math_ops.cast(original_shape, dtypes.int32), [0], [original_rank - 1]), array_ops.slice(array_ops.shape(result), [1], [-1]) ], 0)) final_result.set_shape( tensor_shape.unknown_shape( (tensor_shape.Dimension(original_rank_dim) - 1).value).concatenate( result.get_shape()[1:])) return final_result
def batch_matrix_pow(matrices, powers): """Compute powers of matrices, e.g. A^3 = matmul(matmul(A, A), A). Uses exponentiation by squaring, with O(log(p)) matrix multiplications to compute A^p. Args: matrices: [batch size x N x N] powers: Which integer power to raise each matrix to [batch size] Returns: The matrices raised to their respective powers, same dimensions as the "matrices" argument. """ def terminate_when_all_zero(current_argument, residual_powers, accumulator): del current_argument, accumulator # not used for condition do_exit = math_ops.reduce_any( math_ops.greater(residual_powers, array_ops.ones_like(residual_powers))) return do_exit def do_iteration(current_argument, residual_powers, accumulator): """Compute one step of iterative exponentiation by squaring. The recursive form is: power(A, p) = { power(matmul(A, A), p / 2) for even p { matmul(A, power(matmul(A, A), (p - 1) / 2)) for odd p power(A, 0) = I The power(A, 0) = I case is handled by starting with accumulator set to the identity matrix; matrices with zero residual powers are passed through unchanged. Args: current_argument: On this step, what is the first argument (A^2..^2) to the (unrolled) recursive function? [batch size x N x N] residual_powers: On this step, what is the second argument (residual p)? [batch_size] accumulator: Accumulates the exterior multiplications from the odd powers (initially the identity matrix). [batch_size x N x N] Returns: Updated versions of each argument for one step of the unrolled computation. Does not change parts of the batch which have a residual power of zero. """ is_even = math_ops.equal(residual_powers % 2, array_ops.zeros( array_ops.shape(residual_powers), dtype=dtypes.int32)) new_accumulator = array_ops.where(is_even, accumulator, math_ops.matmul(accumulator, current_argument)) new_argument = math_ops.matmul(current_argument, current_argument) do_update = math_ops.greater(residual_powers, 1) new_residual_powers = residual_powers - residual_powers % 2 new_residual_powers //= 2 # Stop updating if we've reached our base case; some batch elements may # finish sooner than others accumulator = array_ops.where(do_update, new_accumulator, accumulator) current_argument = array_ops.where(do_update, new_argument, current_argument) residual_powers = array_ops.where(do_update, new_residual_powers, residual_powers) return (current_argument, residual_powers, accumulator) matrices = ops.convert_to_tensor(matrices) powers = math_ops.cast(powers, dtype=dtypes.int32) ident = array_ops.expand_dims( array_ops.diag( array_ops.ones([array_ops.shape(matrices)[1]], dtype=matrices.dtype)), 0) ident_tiled = array_ops.tile(ident, [array_ops.shape(matrices)[0], 1, 1]) (final_argument, final_residual_power, final_accumulator) = control_flow_ops.while_loop( terminate_when_all_zero, do_iteration, [matrices, powers, ident_tiled]) return array_ops.where( math_ops.equal(final_residual_power, array_ops.zeros_like( final_residual_power, dtype=dtypes.int32)), ident_tiled, math_ops.matmul(final_argument, final_accumulator))
def _renorm_correction_and_moments(self, mean, variance, training): """Returns the correction and update values for renorm.""" stddev = math_ops.sqrt(variance + self.epsilon) # Compute the average mean and standard deviation, as if they were # initialized with this batch's moments. mixed_renorm_mean = (self.renorm_mean + (1. - self.renorm_mean_weight) * mean) mixed_renorm_stddev = (self.renorm_stddev + (1. - self.renorm_stddev_weight) * stddev) # Compute the corrections for batch renorm. r = stddev / mixed_renorm_stddev d = (mean - mixed_renorm_mean) / mixed_renorm_stddev # Ensure the corrections use pre-update moving averages. with ops.control_dependencies([r, d]): mean = array_ops.identity(mean) stddev = array_ops.identity(stddev) rmin, rmax, dmax = [ self.renorm_clipping.get(key) for key in ['rmin', 'rmax', 'dmax'] ] if rmin is not None: r = math_ops.maximum(r, rmin) if rmax is not None: r = math_ops.minimum(r, rmax) if dmax is not None: d = math_ops.maximum(d, -dmax) d = math_ops.minimum(d, dmax) # When not training, use r=1, d=0. r = tf_utils.smart_cond(training, lambda: r, lambda: array_ops.ones_like(r)) d = tf_utils.smart_cond(training, lambda: d, lambda: array_ops.zeros_like(d)) def _update_renorm_variable(var, weight, value): """Updates a moving average and weight, returns the unbiased value.""" value = array_ops.identity(value) def _do_update(): """Updates the var and weight, returns their updated ratio.""" # Update the variables without zero debiasing. The debiasing will be # accomplished by dividing the exponential moving average by the weight. # For example, after a single update, the moving average would be # (1-decay) * value. and the weight will be 1-decay, with their ratio # giving the value. # Make sure the weight is not updated until before r and d computation. with ops.control_dependencies([value]): weight_value = array_ops.constant(1., dtype=weight.dtype) new_var = self._assign_moving_average(var, value, self.renorm_momentum) new_weight = self._assign_moving_average( weight, weight_value, self.renorm_momentum) # TODO(yuefengz): the updates to var and weighted can not be batched # together if we fetch their updated values here. Consider calculating # new values and delaying the updates. return new_var / new_weight def _fake_update(): return array_ops.identity(var) return tf_utils.smart_cond(training, _do_update, _fake_update) # TODO(yuefengz): colocate the operations new_mean = _update_renorm_variable(self.renorm_mean, self.renorm_mean_weight, mean) new_stddev = _update_renorm_variable(self.renorm_stddev, self.renorm_stddev_weight, stddev) # Make sqrt(moving_variance + epsilon) = new_stddev. new_variance = math_ops.square(new_stddev) - self.epsilon return (r, d, new_mean, new_variance)
def call(self, inputs, training=None): training = self._get_training_value(training) if self.virtual_batch_size is not None: # Virtual batches (aka ghost batches) can be simulated by reshaping the # Tensor and reusing the existing batch norm implementation original_shape = [-1] + inputs.shape.as_list()[1:] expanded_shape = [self.virtual_batch_size, -1] + original_shape[1:] # Will cause errors if virtual_batch_size does not divide the batch size inputs = array_ops.reshape(inputs, expanded_shape) def undo_virtual_batching(outputs): outputs = array_ops.reshape(outputs, original_shape) return outputs if self.fused: outputs = self._fused_batch_norm(inputs, training=training) if self.virtual_batch_size is not None: # Currently never reaches here since fused_batch_norm does not support # virtual batching outputs = undo_virtual_batching(outputs) return outputs # Compute the axes along which to reduce the mean / variance input_shape = inputs.shape ndims = len(input_shape) reduction_axes = [i for i in range(ndims) if i not in self.axis] if self.virtual_batch_size is not None: del reduction_axes[1] # Do not reduce along virtual batch dim # Broadcasting only necessary for single-axis batch norm where the axis is # not the last dimension broadcast_shape = [1] * ndims broadcast_shape[self.axis[0]] = input_shape.dims[self.axis[0]].value def _broadcast(v): if (v is not None and len(v.shape) != ndims and reduction_axes != list(range(ndims - 1))): return array_ops.reshape(v, broadcast_shape) return v scale, offset = _broadcast(self.gamma), _broadcast(self.beta) def _compose_transforms(scale, offset, then_scale, then_offset): if then_scale is not None: scale *= then_scale offset *= then_scale if then_offset is not None: offset += then_offset return (scale, offset) # Determine a boolean value for `training`: could be True, False, or None. training_value = tf_utils.constant_value(training) if training_value == False: # pylint: disable=singleton-comparison,g-explicit-bool-comparison mean, variance = self.moving_mean, self.moving_variance else: if self.adjustment: adj_scale, adj_bias = self.adjustment(array_ops.shape(inputs)) # Adjust only during training. adj_scale = tf_utils.smart_cond(training, lambda: adj_scale, lambda: array_ops.ones_like(adj_scale)) adj_bias = tf_utils.smart_cond(training, lambda: adj_bias, lambda: array_ops.zeros_like(adj_bias)) scale, offset = _compose_transforms(adj_scale, adj_bias, scale, offset) # Some of the computations here are not necessary when training==False # but not a constant. However, this makes the code simpler. keep_dims = self.virtual_batch_size is not None or len(self.axis) > 1 mean, variance = self._moments( math_ops.cast(inputs, self._param_dtype), reduction_axes, keep_dims=keep_dims) moving_mean = self.moving_mean moving_variance = self.moving_variance mean = tf_utils.smart_cond(training, lambda: mean, lambda: ops.convert_to_tensor(moving_mean)) variance = tf_utils.smart_cond( training, lambda: variance, lambda: ops.convert_to_tensor(moving_variance)) if self.virtual_batch_size is not None: # This isn't strictly correct since in ghost batch norm, you are # supposed to sequentially update the moving_mean and moving_variance # with each sub-batch. However, since the moving statistics are only # used during evaluation, it is more efficient to just update in one # step and should not make a significant difference in the result. new_mean = math_ops.reduce_mean(mean, axis=1, keepdims=True) new_variance = math_ops.reduce_mean(variance, axis=1, keepdims=True) else: new_mean, new_variance = mean, variance if self._support_zero_size_input(): inputs_size = array_ops.size(inputs) else: inputs_size = None if self.renorm: r, d, new_mean, new_variance = self._renorm_correction_and_moments( new_mean, new_variance, training, inputs_size) # When training, the normalized values (say, x) will be transformed as # x * gamma + beta without renorm, and (x * r + d) * gamma + beta # = x * (r * gamma) + (d * gamma + beta) with renorm. r = _broadcast(array_ops.stop_gradient(r, name='renorm_r')) d = _broadcast(array_ops.stop_gradient(d, name='renorm_d')) scale, offset = _compose_transforms(r, d, scale, offset) def _do_update(var, value): """Compute the updates for mean and variance.""" return self._assign_moving_average(var, value, self.momentum, inputs_size) def mean_update(): true_branch = lambda: _do_update(self.moving_mean, new_mean) false_branch = lambda: self.moving_mean return tf_utils.smart_cond(training, true_branch, false_branch) def variance_update(): """Update the moving variance.""" def true_branch_renorm(): # We apply epsilon as part of the moving_stddev to mirror the training # code path. moving_stddev = _do_update(self.moving_stddev, math_ops.sqrt(new_variance + self.epsilon)) return self._assign_new_value( self.moving_variance, # Apply relu in case floating point rounding causes it to go # negative. K.relu(moving_stddev * moving_stddev - self.epsilon)) if self.renorm: true_branch = true_branch_renorm else: true_branch = lambda: _do_update(self.moving_variance, new_variance) false_branch = lambda: self.moving_variance return tf_utils.smart_cond(training, true_branch, false_branch) self.add_update(mean_update) self.add_update(variance_update) mean = math_ops.cast(mean, inputs.dtype) variance = math_ops.cast(variance, inputs.dtype) if offset is not None: offset = math_ops.cast(offset, inputs.dtype) if scale is not None: scale = math_ops.cast(scale, inputs.dtype) # TODO(reedwm): Maybe do math in float32 if given float16 inputs, if doing # math in float16 hurts validation accuracy of popular models like resnet. outputs = nn.batch_normalization(inputs, _broadcast(mean), _broadcast(variance), offset, scale, self.epsilon) # If some components of the shape got lost due to adjustments, fix that. outputs.set_shape(input_shape) if self.virtual_batch_size is not None: outputs = undo_virtual_batching(outputs) return outputs
def _zeros(self): return array_ops.zeros_like(self.a + self.b)
def _variance(self): return array_ops.zeros_like(self.loc)
def safe_embedding_lookup_sparse( embedding_weights, sparse_ids, sparse_weights=None, combiner="mean", default_id=None, name="safe_embedding_lookup_sparse", partition_strategy=None, # no used max_norm=None, return_trainable=False, ): """Provides a dynamic version of `tf.nn.safe_embedding_lookup_sparse`. Lookup embedding results, accounting for empty features and invalid weights. Any IDs will be treated as valid include non-positive IDs. Invalid weights (<= 0) are pruned from input weights, as well as any IDs with non-positive weight. For an entry with no features, the embedding vector for `default_id` is returned, or the 0-vector if `default_id` is not supplied. The ids and weights may be multi-dimensional. Embeddings are always aggregated along the last dimension. Args: embedding_weights: A single `dynamic_embedding.Variable` instance representing the complete embedding tensor. sparse_ids: `SparseTensor` of shape `[d_0, d_1, ..., d_n]` containing the ids. `d_0` is typically batch size. sparse_weights: `SparseTensor` of same shape as `sparse_ids`, containing float weights corresponding to `sparse_ids`, or `None` if all weights are be assumed to be 1.0. combiner: A string specifying how to combine embedding results for each entry. Currently "mean", "sqrtn" and "sum" are supported, with "mean" the default. default_id: The id to use for an entry with no features. name: A name for this operation (optional). partition_strategy: A string specifying the partitioning strategy. Currently `"div"` and `"mod"` are supported. Default is `"div"`. max_norm: If not `None`, all embeddings are l2-normalized to max_norm before combining. Returns: combined_embeddings: A dense `Tensor` of shape `[d_0, d_1, ..., d_{n-1}, e_1, ..., e_m]`. trainable_wrap: A TrainableWrapper object used to fill the Optimizers `var_list` Only provided if `return_trainable` is True. Raises: ValueError: if `embedding_weights` is empty. """ if embedding_weights is None: raise ValueError("Missing embedding_weights %s." % embedding_weights) if embedding_weights.key_dtype != sparse_ids.dtype: raise TypeError( "embedding_weights.key_dtype should be same with sparse_ids.dtype: " "{} vs. {}".format(embedding_weights.key_dtype, sparse_ids.dtype)) weights_dtype = sparse_weights.dtype if sparse_weights is not None else None if weights_dtype and embedding_weights.value_dtype != weights_dtype: raise TypeError( "embedding_weights.value_dtype should be same with sparse_weights.dtype" ": {} vs. {}".format(embedding_weights.value_dtype, weights_dtype)) scope = variable_scope.get_variable_scope() full_name = scope.name + "/" + name if scope.name else name with ops.name_scope(full_name + "/"): # Reshape higher-rank sparse ids and weights to linear segment ids. original_shape = sparse_ids.dense_shape original_rank_dim = tensor_shape.dimension_value( sparse_ids.dense_shape.get_shape()[0]) original_rank = (array_ops.size(original_shape) if original_rank_dim is None else original_rank_dim) sparse_ids = sparse_ops.sparse_reshape( sparse_ids, [ math_ops.reduce_prod( array_ops.slice(original_shape, [0], [original_rank - 1])), array_ops.gather(original_shape, original_rank - 1), ], ) if sparse_weights is not None: sparse_weights = sparse_tensor.SparseTensor( sparse_ids.indices, sparse_weights.values, sparse_ids.dense_shape) # Prune invalid weights. if combiner != "sum": sparse_ids, sparse_weights = _prune_invalid_weights( sparse_ids, sparse_weights) # Fill in dummy values for empty features, if necessary. sparse_ids, is_row_empty = sparse_ops.sparse_fill_empty_rows( sparse_ids, default_id or 0) if sparse_weights is not None: sparse_weights, _ = sparse_ops.sparse_fill_empty_rows( sparse_weights, 1.0) result, trainable_ = embedding_lookup_sparse( embedding_weights, sparse_ids, sparse_weights, combiner=combiner, partition_strategy=partition_strategy, name=name + "/embedding_lookup_sparse", max_norm=max_norm, return_trainable=True, ) if default_id is None: # Broadcast is_row_empty to the same shape as embedding_lookup_result, # for use in Select. is_row_empty = array_ops.tile( array_ops.reshape(is_row_empty, [-1, 1]), array_ops.stack([1, array_ops.shape(result)[1]]), ) result = array_ops.where(is_row_empty, array_ops.zeros_like(result), result, name="where") # Reshape back from linear ids back into higher-dimensional dense result. final_result = array_ops.reshape( result, array_ops.concat( [ array_ops.slice( math_ops.cast(original_shape, dtypes.int32), [0], [original_rank - 1], ), array_ops.slice(array_ops.shape(result), [1], [-1]), ], 0, ), ) final_result.set_shape( tensor_shape.unknown_shape( (tensor_shape.Dimension(original_rank_dim) - 1).value).concatenate(result.get_shape()[1:])) return (final_result, trainable_) if return_trainable else final_result
def testNonDictMultiplierRaisesError(self): gradient = constant_op.constant(self._grad_vec, dtype=dtypes.float32) variable = variables_lib.Variable(array_ops.zeros_like(gradient)) grad_to_var = (gradient, variable) with self.assertRaises(ValueError): learning.multiply_gradients([grad_to_var], 3)
def _renorm_correction_and_moments(self, mean, variance, training): """Returns the correction and update values for renorm.""" stddev = math_ops.sqrt(variance + self.epsilon) # Compute the average mean and standard deviation, as if they were # initialized with this batch's moments. mixed_renorm_mean = (self.renorm_mean + (1. - self.renorm_mean_weight) * mean) mixed_renorm_stddev = (self.renorm_stddev + (1. - self.renorm_stddev_weight) * stddev) # Compute the corrections for batch renorm. r = stddev / mixed_renorm_stddev d = (mean - mixed_renorm_mean) / mixed_renorm_stddev # Ensure the corrections use pre-update moving averages. with ops.control_dependencies([r, d]): mean = array_ops.identity(mean) stddev = array_ops.identity(stddev) rmin, rmax, dmax = [ self.renorm_clipping.get(key) for key in ['rmin', 'rmax', 'dmax'] ] if rmin is not None: r = math_ops.maximum(r, rmin) if rmax is not None: r = math_ops.minimum(r, rmax) if dmax is not None: d = math_ops.maximum(d, -dmax) d = math_ops.minimum(d, dmax) # When not training, use r=1, d=0, and decay=1 meaning no updates. r = _smart_select(training, lambda: r, lambda: array_ops.ones_like(r)) d = _smart_select(training, lambda: d, lambda: array_ops.zeros_like(d)) decay = _smart_select(training, lambda: self.renorm_momentum, lambda: 1.) def _update_renorm_variable(var, weight, value): """Updates a moving average and weight, returns the unbiased value.""" # Update the variables without zero debiasing. The debiasing will be # accomplished by dividing the exponential moving average by the weight. # For example, after a single update, the moving average would be # (1-decay) * value. and the weight will be 1-decay, with their ratio # giving value. # Make sure the weight is not updated until before r and d computation. value = array_ops.identity(value) with ops.control_dependencies([value]): weight_value = array_ops.constant(1., dtype=weight.dtype) new_var = moving_averages.assign_moving_average(var, value, decay, zero_debias=False) new_weight = moving_averages.assign_moving_average( weight, weight_value, decay, zero_debias=False) return new_var / new_weight with ops.colocate_with(self.moving_mean): new_mean = _update_renorm_variable(self.renorm_mean, self.renorm_mean_weight, mean) with ops.colocate_with(self.moving_variance): new_stddev = _update_renorm_variable(self.renorm_stddev, self.renorm_stddev_weight, stddev) # Make sqrt(moving_variance + epsilon) = new_stddev. new_variance = math_ops.square(new_stddev) - self.epsilon return (r, d, new_mean, new_variance)
def _SelectGrad(op, grad): c = op.inputs[0] x = op.inputs[1] zeros = array_ops.zeros_like(x) return (None, array_ops.where(c, grad, zeros), array_ops.where(c, zeros, grad))
def testZerosLikePartialShape(self): d = array_ops.placeholder(dtypes_lib.float32, shape=[None, 4, None]) z = array_ops.zeros_like(d) self.assertEqual(d.get_shape().as_list(), z.get_shape().as_list())