def _Update_global_variables():
  """Build the grouped op that syncs local variables with the global centers.

  Only variables whose gradient is not `None` take part. Each local center
  copy is first overwritten with its global center; then, for every
  participating variable, `moving_rate * (local - local_center)` is
  subtracted from the local variable and added to the global center.
  `global_step` is incremented when it is truthy.

  Returns:
    A single grouped op running all of the updates above.
  """
  local_vars = [var for grad, var in grads_and_vars if grad is not None]
  global_center_vars = [self._global_map[v] for v in local_vars]
  local_center_vars = [self._local_map[v] for v in local_vars]

  # Refresh every local center copy from its global counterpart.
  local_center_vars_update = [
      local_center.assign(global_center)
      for local_center, global_center in zip(local_center_vars,
                                             global_center_vars)
  ]

  differences = []
  update_ops = []
  # Everything below must observe the refreshed local centers.
  with ops.control_dependencies(local_center_vars_update):
    for local_var, local_center in zip(local_vars, local_center_vars):
      with ops.device(local_var.device):
        differences.append(math_ops.subtract(local_var, local_center))

    # Move each local variable towards the center ...
    for local_var, delta in zip(local_vars, differences):
      with ops.device(local_var.device):
        scaled = math_ops.multiply(self._moving_rate, delta)
        update_ops.append(state_ops.assign_sub(local_var, scaled))

    # ... and the center towards each local variable by the same amount.
    for global_center, delta in zip(global_center_vars, differences):
      with ops.device(global_center.device):
        scaled = math_ops.multiply(self._moving_rate, delta)
        update_ops.append(state_ops.assign_add(global_center, scaled))

    if global_step:
      with ops.colocate_with(global_step):
        update_ops.append(state_ops.assign_add(global_step, 1))

  return control_flow_ops.group(*update_ops)
def normalize_moments(counts, mean_ss, variance_ss, shift, name=None):
  """Calculate the mean and variance from the sufficient statistics.

  Args:
    counts: A `Tensor` containing the total count of the data (one value).
    mean_ss: A `Tensor` containing the mean sufficient statistics: the
      (possibly shifted) sum of the elements to average over.
    variance_ss: A `Tensor` containing the variance sufficient statistics: the
      (possibly shifted) squared sum of the data to compute the variance over.
    shift: A `Tensor` containing the value by which the data is shifted for
      numerical stability, or `None` if no shift was performed.
    name: Name used to scope the operations that compute the moments.

  Returns:
    Two `Tensor` objects: `mean` and `variance`.
  """
  with ops.name_scope(name, "normalize", [counts, mean_ss, variance_ss, shift]):
    divisor = math_ops.reciprocal(counts, name="divisor")
    if shift is None:
      # No shift: the scaled sum already is the mean.
      shifted_mean = math_ops.multiply(mean_ss, divisor, name="mean")
      mean = shifted_mean
    else:
      shifted_mean = math_ops.multiply(mean_ss, divisor, name="shifted_mean")
      mean = math_ops.add(shifted_mean, shift, name="mean")
    # variance = E[x^2] - E[x]^2, both taken on the (possibly shifted) data.
    mean_of_squares = math_ops.multiply(variance_ss, divisor)
    square_of_mean = math_ops.square(shifted_mean)
    variance = math_ops.subtract(
        mean_of_squares, square_of_mean, name="variance")
  return (mean, variance)
def Foo():
  """Return d(c * (x * c)) / dx for a constant `x`, with `c` from outer scope."""
  x = constant_op.constant(10.0, name="x")
  y = math_ops.multiply(x, c, name="y")
  # Regression test for b/122564611.
  z = math_ops.multiply(c, y, name="z")
  return gradients_impl.gradients(z, x)[0]
def setUp(self):
  """Test setup.

  Forward graph: `d = a * b`, `e = b * c`, `f = d * e`, where `a`, `b`
  and `c` are variables (so `b` feeds both `d` and `e`).

  A backward graph is constructed using the GradientDescentOptimizer, and
  the session is created with model pruning disabled.
  """
  self.a = variables.Variable(1.0, name="a")
  self.b = variables.Variable(2.0, name="b")
  self.c = variables.Variable(4.0, name="c")
  self.d = math_ops.multiply(self.a, self.b, name="d")
  self.e = math_ops.multiply(self.b, self.c, name="e")
  self.f = math_ops.multiply(self.d, self.e, name="f")

  # Gradient descent optimizer that minimizes f.
  optimizer = gradient_descent.GradientDescentOptimizer(0.01)
  optimizer.minimize(self.f, name="optim")

  # Keep every node in the graph by turning off model pruning.
  config = config_pb2.ConfigProto(
      graph_options=config_pb2.GraphOptions(
          rewrite_options=rewriter_config_pb2.RewriterConfig(
              disable_model_pruning=True)))

  self.sess = session.Session(config=config)
  self.sess.run(variables.global_variables_initializer())
def logloss(y_true, y_pred):
  """Binary log loss averaged over the last axis.

  `y_true` is cast to the dtype of `y_pred`; `K.epsilon()` is added inside
  both logarithms.
  """
  y_pred = ops.convert_to_tensor(y_pred)
  y_true = math_ops.cast(y_true, y_pred.dtype)
  positive_term = math_ops.multiply(y_true,
                                    math_ops.log(y_pred + K.epsilon()))
  negative_term = math_ops.multiply(
      (1 - y_true), math_ops.log(1 - y_pred + K.epsilon()))
  log_likelihood = positive_term + negative_term
  return K.mean(-log_likelihood, axis=-1)
def decayed_lr(learning_rate, global_step, decay_steps, end_learning_rate,
               power, cycle, name):
  """Helper to recompute learning rate; most helpful in eager-mode.

  Computes
  `(learning_rate - end_learning_rate) * (1 - step / decay_steps) ** power
  + end_learning_rate`.

  Args:
    learning_rate: Initial learning rate; its dtype is used for all math.
    global_step: Current step; cast to the learning rate dtype.
    decay_steps: Number of steps to decay over; cast to the learning rate
      dtype before any arithmetic, so integer values/tensors are accepted.
    end_learning_rate: Final learning rate.
    power: Exponent of the polynomial.
    cycle: If true, `decay_steps` is stretched to the first multiple of
      itself that is not smaller than `global_step` (multiplier 1 at step 0);
      otherwise `global_step` is clamped to `decay_steps`.
    name: Name for the scope and for the returned op.

  Returns:
    The decayed learning rate tensor.
  """
  with ops.name_scope(
      name, "PolynomialDecay",
      [learning_rate, global_step, decay_steps, end_learning_rate, power]
  ) as name:
    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    dtype = learning_rate.dtype
    end_learning_rate = math_ops.cast(end_learning_rate, dtype)
    power = math_ops.cast(power, dtype)

    global_step_recomp = math_ops.cast(global_step, dtype)
    # Always do arithmetic with the cast value: mixing the raw `decay_steps`
    # (possibly an integer tensor) with float tensors is a dtype mismatch.
    decay_steps_cast = math_ops.cast(decay_steps, dtype)
    decay_steps_recomp = decay_steps_cast
    if cycle:
      # Find the first multiple of decay_steps that is bigger than
      # global_step. If global_step is zero set the multiplier to 1
      multiplier = control_flow_ops.cond(
          math_ops.equal(global_step_recomp, 0), lambda: 1.0,
          lambda: math_ops.ceil(global_step_recomp / decay_steps_cast))
      decay_steps_recomp = math_ops.multiply(decay_steps_recomp, multiplier)
    else:
      # Make sure that the global_step used is not bigger than decay_steps.
      global_step_recomp = math_ops.minimum(global_step_recomp,
                                            decay_steps_recomp)

    p = math_ops.div(global_step_recomp, decay_steps_recomp)
    return math_ops.add(
        math_ops.multiply(learning_rate - end_learning_rate,
                          math_ops.pow(1 - p, power)),
        end_learning_rate,
        name=name)
def __call__(self, step):
  """Return the polynomially decayed learning rate for `step`.

  Computes
  `(initial_learning_rate - end_learning_rate)
  * (1 - step / decay_steps) ** power + end_learning_rate`, with all
  operands cast to the dtype of the initial learning rate.

  Args:
    step: Current step; cast to the learning rate dtype.

  Returns:
    The decayed learning rate tensor.
  """
  with ops.name_scope(
      self.name, "PolynomialDecay",
      [self.initial_learning_rate, step, self.decay_steps,
       self.end_learning_rate, self.power]
  ) as name:
    initial_learning_rate = ops.convert_to_tensor(
        self.initial_learning_rate, name="initial_learning_rate")
    dtype = initial_learning_rate.dtype
    end_learning_rate = math_ops.cast(self.end_learning_rate, dtype)
    power = math_ops.cast(self.power, dtype)

    global_step_recomp = math_ops.cast(step, dtype)
    # Always do arithmetic with the cast value: mixing the raw
    # `self.decay_steps` (possibly an integer tensor) with float tensors is a
    # dtype mismatch.
    decay_steps_cast = math_ops.cast(self.decay_steps, dtype)
    decay_steps_recomp = decay_steps_cast
    if self.cycle:
      # Find the first multiple of decay_steps that is bigger than
      # global_step. If global_step is zero set the multiplier to 1
      multiplier = control_flow_ops.cond(
          math_ops.equal(global_step_recomp, 0), lambda: 1.0,
          lambda: math_ops.ceil(global_step_recomp / decay_steps_cast))
      decay_steps_recomp = math_ops.multiply(decay_steps_recomp, multiplier)
    else:
      # Make sure that the global_step used is not bigger than decay_steps.
      global_step_recomp = math_ops.minimum(global_step_recomp,
                                            decay_steps_recomp)

    p = math_ops.div(global_step_recomp, decay_steps_recomp)
    return math_ops.add(
        math_ops.multiply(initial_learning_rate - end_learning_rate,
                          math_ops.pow(1 - p, power)),
        end_learning_rate,
        name=name)
def test_graph_replace_gradients(self):
  """graph_replace must handle gradient ops and remap their `_original_op`."""
  ops.reset_default_graph()
  w = variables.VariableV1(0.0, name="w")
  w_squared = math_ops.multiply(w, w, name="mul1")
  y = math_ops.multiply(w_squared, w, name="mul2")
  g = gradients_impl.gradients(y, w, name="grad")[0]

  # Extract the operations.
  replacement_ts = {w.value(): g}
  graph = ops.get_default_graph()
  original_mul1_grad = graph.get_operation_by_name("grad/mul1_grad/Mul_1")

  # Should not raise exception.
  res = ge.graph_replace(g, replacement_ts, dst_scope="res")

  # Extract the operations after graph_replace.
  graph = ops.get_default_graph()
  result_mul1_grad = graph.get_operation_by_name("res/grad/mul1_grad/Mul_1")

  # Make sure _original_ops are as expected.
  self.assertEqual(original_mul1_grad._original_op.name, u"mul1")
  self.assertEqual(result_mul1_grad._original_op.name, u"res/mul1")
  self.assertNotEqual(res.name, g.name)
  with session.Session() as sess:
    sess.run(variables.global_variables_initializer())
    g_val, res_val = sess.run([g, res])
  self.assertNear(g_val, 0.0, ERROR_TOLERANCE)
  self.assertNear(res_val, 0.0, ERROR_TOLERANCE)
def accuracy(predictions, labels, weights=None):
  """Computes the percentage of times that predictions matches labels.

  Args:
    predictions: the predicted values, a `Tensor` whose dtype and shape
      matches 'labels'.
    labels: the ground truth values, a `Tensor` of any shape and
      bool, integer, or string dtype.
    weights: None or `Tensor` of float values to reweight the accuracy.

  Returns:
    Accuracy `Tensor`.

  Raises:
    ValueError: if dtypes don't match or
      if dtype is not bool, integer, or string.
  """
  label_dtype = labels.dtype
  dtype_supported = (
      label_dtype.is_integer or label_dtype in (dtypes.bool, dtypes.string))
  if not dtype_supported:
    raise ValueError(
        'Labels should have bool, integer, or string dtype, not %r' %
        label_dtype)
  if not label_dtype.is_compatible_with(predictions.dtype):
    raise ValueError('Dtypes of predictions and labels should match. '
                     'Given: predictions (%r) and labels (%r)' %
                     (predictions.dtype, label_dtype))
  with ops.name_scope('accuracy', values=[predictions, labels]):
    matches = math_ops.equal(predictions, labels)
    is_correct = math_ops.cast(matches, dtypes.float32)
    if weights is None:
      # Unweighted: plain mean of the 0/1 indicator.
      return math_ops.reduce_mean(is_correct)
    # Weighted: sum(weight * correct) / sum(weight).
    is_correct = math_ops.multiply(is_correct, weights)
    num_values = math_ops.multiply(weights, array_ops.ones_like(is_correct))
    return math_ops.div(math_ops.reduce_sum(is_correct),
                        math_ops.reduce_sum(num_values))
def setUp(self):
  """Build two small graphs and a session with graph rewrites disabled."""
  self.a = variables.VariableV1(2.0, name="a")
  self.b = variables.VariableV1(3.0, name="b")

  self.c = math_ops.multiply(self.a, self.b, name="c")  # Should be 6.0.
  self.d = math_ops.multiply(self.a, self.a, name="d")  # Should be 4.0.

  self.e = math_ops.multiply(self.d, self.c, name="e")  # Should be 24.0.

  self.f_y = constant_op.constant(0.30, name="f_y")
  self.f = math_ops.div(self.b, self.f_y, name="f")  # Should be 10.0.

  # The three nodes x, y and z form a graph with "cross-links" in. I.e., x
  # and y are both direct inputs to z, but x is also a direct input to y.
  self.x = variables.VariableV1(2.0, name="x")  # Should be 2.0
  self.y = math_ops.negative(self.x, name="y")  # Should be -2.0.

  self.z = math_ops.multiply(self.x, self.y, name="z")  # Should be -4.0.

  # Turn off pruning, arithmetic optimization and constant folding.
  rewrite_off = rewriter_config_pb2.RewriterConfig.OFF
  rewriter_config = rewriter_config_pb2.RewriterConfig(
      disable_model_pruning=True,
      arithmetic_optimization=rewrite_off,
      constant_folding=rewrite_off)
  config = config_pb2.ConfigProto(
      graph_options=config_pb2.GraphOptions(rewrite_options=rewriter_config))
  self.sess = session.Session(config=config)
  self.sess.run(variables.global_variables_initializer())
def log_loss(predictions, labels=None, weights=1.0, epsilon=1e-7, scope=None):
  """Adds a Log Loss term to the training procedure.

  `weights` acts as a coefficient for the loss. If a scalar is provided, then
  the loss is simply scaled by the given value. If `weights` is a tensor of size
  [batch_size], then the total loss for each sample of the batch is rescaled
  by the corresponding element in the `weights` vector. If the shape of
  `weights` matches the shape of `predictions`, then the loss of each
  measurable element of `predictions` is scaled by the corresponding value of
  `weights`.

  Args:
    predictions: The predicted outputs.
    labels: The ground truth output tensor, same dimensions as 'predictions'.
      Required despite the `None` default.
    weights: Coefficients for the loss a scalar, a tensor of shape
      [batch_size] or a tensor whose shape matches `predictions`.
    epsilon: A small increment to add to avoid taking a log of zero.
    scope: The scope for the operations performed in computing the loss.

  Returns:
    A scalar `Tensor` representing the loss value.

  Raises:
    ValueError: If `labels` is None, if the shape of `predictions` doesn't
      match that of `labels`, or if the shape of `weights` is invalid.
  """
  # `labels` only defaults to None for signature reasons; fail with a clear
  # error instead of an AttributeError on `labels.get_shape()` below.
  if labels is None:
    raise ValueError("labels must not be None.")
  with ops.name_scope(scope, "log_loss",
                      [predictions, labels, weights]) as scope:
    predictions.get_shape().assert_is_compatible_with(labels.get_shape())
    predictions = math_ops.to_float(predictions)
    labels = math_ops.to_float(labels)
    # -(y * log(p + eps)) - (1 - y) * log(1 - p + eps)
    losses = -math_ops.multiply(
        labels, math_ops.log(predictions + epsilon)) - math_ops.multiply(
            (1 - labels), math_ops.log(1 - predictions + epsilon))
    return compute_weighted_loss(losses, weights, scope=scope)
def _SquareGrad(op, grad):
  """Gradient of square: `grad * (conj(x) * 2)`."""
  operand = op.inputs[0]
  # Added control dependencies to prevent 2*x from being computed too early.
  with ops.control_dependencies([grad]):
    conj_operand = math_ops.conj(operand)
    two = constant_op.constant(2.0, dtype=conj_operand.dtype)
    doubled = math_ops.multiply(conj_operand, two)
    return math_ops.multiply(grad, doubled)
def setUp(self):
  """Test setup.

  Forward graph: `d = a * b`, `e = b * c`, `f = d * e`, where `a`, `b`
  and `c` are variables (so `b` feeds both `d` and `e`).

  A backward graph is constructed using the GradientDescentOptimizer.
  """
  self.a = variables.Variable(1.0, name="a")
  self.b = variables.Variable(2.0, name="b")
  self.c = variables.Variable(4.0, name="c")
  self.d = math_ops.multiply(self.a, self.b, name="d")
  self.e = math_ops.multiply(self.b, self.c, name="e")
  self.f = math_ops.multiply(self.d, self.e, name="f")

  # Gradient descent optimizer that minimizes f.
  optimizer = gradient_descent.GradientDescentOptimizer(0.01)
  optimizer.minimize(self.f, name="optim")

  self.sess = session.Session()
  self.sess.run(variables.global_variables_initializer())
def huber_loss(y_true, y_pred, delta=1.0):
  """Computes Huber loss value.

  For each value x in `error = y_pred - y_true` (only |x| is used, so the
  sign convention does not matter), the following is calculated:

  ```
  0.5 * x^2                  if |x| <= d
  0.5 * d^2 + d * (|x| - d)  if |x| > d
  ```
  where d is `delta`. See: https://en.wikipedia.org/wiki/Huber_loss

  Args:
    y_true: tensor of true targets.
    y_pred: tensor of predicted targets.
    delta: A float, the point where the Huber loss function changes from a
      quadratic to linear.

  Returns:
    Tensor of element-wise Huber loss values (no reduction is applied here).
  """
  float_type = K.floatx()
  y_pred = math_ops.cast(y_pred, dtype=float_type)
  y_true = math_ops.cast(y_true, dtype=float_type)
  abs_error = math_ops.abs(math_ops.subtract(y_pred, y_true))
  # Split |x| into the part below delta and the excess above it.
  quadratic = math_ops.minimum(abs_error, delta)
  linear = math_ops.subtract(abs_error, quadratic)
  half = ops.convert_to_tensor(0.5, dtype=quadratic.dtype)
  quadratic_term = math_ops.multiply(
      half, math_ops.multiply(quadratic, quadratic))
  linear_term = math_ops.multiply(delta, linear)
  return math_ops.add(quadratic_term, linear_term)
def unregularized_loss(self, examples):
  """Add operations to compute the loss (without the regularization loss).

  Args:
    examples: Examples to compute unregularized loss on.

  Returns:
    An Operation that computes mean (unregularized) loss for given set of
    examples.

  Raises:
    ValueError: if examples are not well defined.
  """
  self._assertSpecified([
      'example_labels', 'example_weights', 'sparse_features', 'dense_features'
  ], examples)
  self._assertList(['sparse_features', 'dense_features'], examples)
  with name_scope('sdca/unregularized_loss'):
    predictions = math_ops.cast(
        self._linear_predictions(examples), dtypes.float64)
    labels = math_ops.cast(
        internal_convert_to_tensor(examples['example_labels']),
        dtypes.float64)
    weights = math_ops.cast(
        internal_convert_to_tensor(examples['example_weights']),
        dtypes.float64)

    def _weighted_mean(per_example_loss):
      # sum(loss_i * w_i) / sum(w_i)
      weighted_total = math_ops.reduce_sum(
          math_ops.multiply(per_example_loss, weights))
      return weighted_total / math_ops.reduce_sum(weights)

    loss_type = self._options['loss_type']

    if loss_type == 'logistic_loss':
      return _weighted_mean(
          sigmoid_cross_entropy_with_logits(labels=labels,
                                            logits=predictions))

    if loss_type == 'poisson_loss':
      return _weighted_mean(
          log_poisson_loss(targets=labels, log_input=predictions))

    if loss_type in ['hinge_loss', 'smooth_hinge_loss']:
      # hinge_loss = max{0, 1 - y_i w*x} where y_i \in {-1, 1}. So, we need to
      # first convert 0/1 labels into -1/1 labels.
      all_ones = array_ops.ones_like(predictions)
      adjusted_labels = math_ops.subtract(2 * labels, all_ones)
      # Tensor that contains (unweighted) error (hinge loss) per
      # example.
      margin = math_ops.subtract(
          all_ones, math_ops.multiply(adjusted_labels, predictions))
      return _weighted_mean(nn_ops.relu(margin))

    # squared loss
    residual = math_ops.subtract(labels, predictions)
    weighted_squared_err = math_ops.multiply(
        math_ops.square(residual), weights)
    # SDCA squared loss function is sum(err^2) / (2*sum(weights))
    return (math_ops.reduce_sum(weighted_squared_err) /
            (2.0 * math_ops.reduce_sum(weights)))
def testSmartCondTrue(self):
  """A literal True predicate must select the first branch (2 * 16)."""
  with ops.Graph().as_default(), session.Session():
    x = constant_op.constant(2)
    y = constant_op.constant(5)

    def _true_branch():
      return math_ops.multiply(x, 16)

    def _false_branch():
      return math_ops.multiply(y, 5)

    z = smart_cond.smart_cond(True, _true_branch, _false_branch)
    self.assertEqual(z.eval(), 32)
def testSmartCondFalse(self):
  """A literal False predicate must select the second branch (3 * 3)."""
  with ops.Graph().as_default(), session.Session():
    x = constant_op.constant(4)
    y = constant_op.constant(3)

    def _true_branch():
      return math_ops.multiply(x, 16)

    def _false_branch():
      return math_ops.multiply(y, 3)

    z = smart_cond.smart_cond(False, _true_branch, _false_branch)
    self.assertEqual(z.eval(), 9)
def fn():
  """Build `twenty = 2 * 10` and `thirty = 3 * 10` colocated with `twenty`.

  Returns:
    The tuple `(ten, twenty, thirty)`.
  """
  two = constant_op.constant(2.0, name='two')
  ten = constant_op.constant(10.0, name='ten')
  twenty = math_ops.multiply(two, ten, name='twenty')
  three = constant_op.constant(3.0, name='three')
  # `thirty` is requested on whatever device `twenty` lands on.
  with framework_ops.colocate_with(twenty):
    thirty = math_ops.multiply(three, ten, name='thirty')
  return ten, twenty, thirty
def decayed_lr():
  """Helper to recompute learning rate; most helpful in eager-mode.

  Returns `learning_rate * exp(-decay_rate * p)` where
  `p = global_step / decay_steps`, floored when `staircase` is set. All
  inputs come from the enclosing scope.
  """
  step = math_ops.cast(global_step, dtype)
  progress = step / decay_steps
  if staircase:
    progress = math_ops.floor(progress)
  neg_rate = math_ops.negative(decay_rate)
  decay_factor = math_ops.exp(math_ops.multiply(neg_rate, progress))
  return math_ops.multiply(learning_rate, decay_factor, name=name)
def testFeedOneHandleDirectly(self):
  """A session handle for `c` can be fed straight back in place of `c`."""
  with self.test_session() as sess:
    a = constant_op.constant(10.0)
    b = constant_op.constant(5.0)
    c = math_ops.multiply(a, b)
    d = math_ops.multiply(c, c)

    handle_op = session_ops.get_session_handle(c)
    h_c = sess.run(handle_op)

    squared = sess.run(d, feed_dict={c: h_c})
    self.assertAllClose(2500.0, squared)
def testScan_Simple(self):
  """scan with a product step yields running products, with/without init."""
  elems = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="data")
  v = constant_op.constant(2.0, name="v")

  def _product(acc, item):
    return math_ops.multiply(acc, item)

  running = functional_ops.scan(_product, elems)
  self.assertAllEqual([1., 2., 6., 24., 120., 720.], self.evaluate(running))

  running = functional_ops.scan(_product, elems, initializer=v)
  self.assertAllEqual([2., 4., 12., 48., 240., 1440.],
                      self.evaluate(running))
def testHandleAndValue(self):
  """One `run` call can return both a session handle and a plain value."""
  with self.test_session() as sess:
    # Return a handle and a value.
    a = constant_op.constant(10)
    b = constant_op.constant(5)
    c = math_ops.multiply(a, b)
    handle_op = session_ops.get_session_handle(c)
    value_op = math_ops.multiply(a, c)
    handle, value = sess.run([handle_op, value_op])
    self.assertEqual(50, handle.eval())
    self.assertEqual(500, value)
def log_loss(labels, predictions, weights=1.0, epsilon=1e-7, scope=None,
             loss_collection=ops.GraphKeys.LOSSES,
             reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
  """Adds a Log Loss term to the training procedure.

  `weights` acts as a coefficient for the loss. If a scalar is provided, then
  the loss is simply scaled by the given value. If `weights` is a tensor of size
  `[batch_size]`, then the total loss for each sample of the batch is rescaled
  by the corresponding element in the `weights` vector. If the shape of
  `weights` matches the shape of `predictions`, then the loss of each
  measurable element of `predictions` is scaled by the corresponding value of
  `weights`.

  Args:
    labels: The ground truth output tensor, same dimensions as 'predictions'.
    predictions: The predicted outputs.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
      be either `1`, or the same as the corresponding `losses` dimension).
    epsilon: A small increment to add to avoid taking a log of zero.
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.
    reduction: Type of reduction to apply to loss.

  Returns:
    Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
    shape as `labels`; otherwise, it is scalar.

  Raises:
    ValueError: If the shape of `predictions` doesn't match that of `labels` or
      if the shape of `weights` is invalid.  Also if `labels` or `predictions`
      is None.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
  if labels is None:
    raise ValueError("labels must not be None.")
  if predictions is None:
    raise ValueError("predictions must not be None.")
  with ops.name_scope(scope, "log_loss",
                      (predictions, labels, weights)) as scope:
    predictions = math_ops.to_float(predictions)
    labels = math_ops.to_float(labels)
    predictions.get_shape().assert_is_compatible_with(labels.get_shape())
    # -(y * log(p + eps)) - (1 - y) * log(1 - p + eps)
    positive_term = -math_ops.multiply(
        labels, math_ops.log(predictions + epsilon))
    negative_term = math_ops.multiply(
        (1 - labels), math_ops.log(1 - predictions + epsilon))
    losses = positive_term - negative_term
    return compute_weighted_loss(
        losses, weights, scope, loss_collection, reduction=reduction)
def testFold_Grad(self):
  """d(v * prod(elems)) / dv is 720 for both foldl and foldr."""
  with self.cached_session():
    elems = constant_op.constant([1.0, 2.0, 3.0, 4.0, 5.0, 6.0], name="data")
    v = constant_op.constant(2.0, name="v")

    def _product(acc, item):
      return math_ops.multiply(acc, item)

    folded = functional_ops.foldl(_product, elems, initializer=v)
    grad = gradients_impl.gradients(folded, v)[0]
    self.assertAllEqual(720.0, self.evaluate(grad))

    folded = functional_ops.foldr(_product, elems, initializer=v)
    grad = gradients_impl.gradients(folded, v)[0]
    self.assertAllEqual(720.0, self.evaluate(grad))
def testFoldl_Simple(self):
  """foldl with step `(a + x) * 2`, without and with an initializer."""
  elems = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")

  def _add_then_double(acc, item):
    return math_ops.multiply(math_ops.add(acc, item), 2)

  folded = functional_ops.foldl(_add_then_double, elems)
  self.assertAllEqual(208, self.evaluate(folded))

  folded = functional_ops.foldl(_add_then_double, elems, initializer=10)
  self.assertAllEqual(880, self.evaluate(folded))
def testHandleBasic(self):
  """A tensor handle can be produced by one run and fed into another."""
  with self.test_session() as sess:
    # Return a handle.
    a = constant_op.constant(10)
    b = constant_op.constant(5)
    c = math_ops.multiply(a, b)
    handle = sess.run(session_ops.get_session_handle(c))

    # Feed a tensor handle.
    feed_placeholder, restored = session_ops.get_session_tensor(
        handle.handle, dtypes.int32)
    y = math_ops.multiply(restored, 10)
    result = sess.run(y, feed_dict={feed_placeholder: handle.handle})
    self.assertEqual(500, result)
def _num_present(losses, weights, per_batch=False):
  """Computes the number of elements in the loss function induced by `weights`.

  A given weights tensor induces different numbers of usable elements in the
  `losses` tensor. The `weights` tensor is broadcast across `losses` for all
  possible dimensions. For example, if `losses` is a tensor of dimension
  `[4, 5, 6, 3]` and `weights` is a tensor of shape `[4, 5]`, then `weights` is,
  in effect, tiled to match the shape of `losses`. Following this effective
  tile, the total number of present elements is the number of non-zero weights.

  Args:
    losses: `Tensor` of shape `[batch_size, d1, ... dN]`.
    weights: `Tensor` of shape `[]`, `[batch_size]` or
      `[batch_size, d1, ... dK]`, where K < N.
    per_batch: Whether to return the number of elements per batch or as a sum
      total.

  Returns:
    The number of present (non-zero) elements in the losses tensor. If
      `per_batch` is `True`, the value is returned as a tensor of size
      `[batch_size]`. Otherwise, a single scalar tensor is returned.
  """
  weights_rank = weights.get_shape().ndims

  # If weights is a scalar, its easy to compute:
  if weights_rank == 0:
    if losses.get_shape().ndims == 0:
      batch_size = 1
    else:
      leading_dim = array_ops.slice(array_ops.shape(losses), [0], [1])
      batch_size = array_ops.reshape(leading_dim, [])
    total_elements = math_ops.to_float(array_ops.size(losses))
    num_per_batch = math_ops.div(total_elements,
                                 math_ops.to_float(batch_size))
    # A zero scalar weight masks out every element.
    num_per_batch = array_ops.where(
        math_ops.equal(weights, 0), 0.0, num_per_batch)
    per_batch_ones = array_ops.ones(array_ops.reshape(batch_size, [1]))
    num_per_batch = math_ops.multiply(per_batch_ones, num_per_batch)
    if per_batch:
      return num_per_batch
    return math_ops.reduce_sum(num_per_batch)

  # First, count the number of nonzero weights.
  if weights_rank >= 1:
    reduction_indices = list(range(1, weights_rank))
    nonzero_indicator = math_ops.to_float(math_ops.not_equal(weights, 0))
    num_nonzero_per_batch = math_ops.reduce_sum(
        nonzero_indicator, reduction_indices=reduction_indices)

  # Next, determine the number of elements that weight would broadcast to:
  broadcast_dims = array_ops.slice(
      array_ops.shape(losses), [weights_rank], [-1])
  num_to_broadcast = math_ops.to_float(math_ops.reduce_prod(broadcast_dims))

  num_per_batch = math_ops.multiply(num_nonzero_per_batch, num_to_broadcast)
  if per_batch:
    return num_per_batch
  return math_ops.reduce_sum(num_per_batch)
def testTrainingLoop(self):
  """d(x * w) / dw equals x, both initially and inside each new step."""
  with imperative_mode.ImperativeMode(self._target) as mode:
    w = variables.Variable(np.random.rand(3))
    x = constant_op.constant(np.random.rand(3))
    product = math_ops.multiply(x, w)
    grads = gradients_impl.gradients(product, w)
    self.assertAllClose(grads[0].value, x.value)

    for _ in range(3):
      with mode.new_step():
        x = constant_op.constant(np.random.rand(3))
        product = math_ops.multiply(x, w)
        grads = gradients_impl.gradients(product, w)
        self.assertAllClose(grads[0].value, x.value)
def testFoldr_Simple(self):
  """foldr with step `(a + x) * 2`, without and with an initializer."""
  with self.test_session():
    elems = constant_op.constant([1, 2, 3, 4, 5, 6], name="data")

    def _add_then_double(acc, item):
      return math_ops.multiply(math_ops.add(acc, item), 2)

    folded = functional_ops.foldr(_add_then_double, elems)
    self.assertAllEqual(450, folded.eval())

    folded = functional_ops.foldr(_add_then_double, elems, initializer=10)
    self.assertAllEqual(1282, folded.eval())
def _update_mask(self, weights, threshold):
  """Updates the mask for a given weight tensor.

  This functions first computes the cdf of the weight tensor, and estimates
  the threshold value such that 'desired_sparsity' fraction of weights
  have magnitude less than the threshold.

  Args:
    weights: The weight tensor that needs to be masked.
    threshold: The current threshold value. The function will compute a new
      threshold and return the exponential moving average using the current
      value of threshold

  Returns:
    new_threshold: The new value of the threshold based on weights, and
      sparsity at the current global_step
    new_mask: A float32 tensor, 1 where the absolute weight exceeds the new
      threshold and 0 elsewhere.

  Raises:
    ValueError: if sparsity is not defined
  """
  if self._sparsity is None:
    raise ValueError('Sparsity variable undefined')

  sparsity = self._get_sparsity(weights.op.name)

  with ops.name_scope(weights.op.name + '_pruning_ops'):
    abs_weights = math_ops.abs(weights)
    max_value = math_ops.reduce_max(abs_weights)
    # The TPU path uses a different cdf implementation.
    if self._spec.use_tpu:
      cdf_fn = pruning_utils.compute_cdf
    else:
      cdf_fn = pruning_utils.compute_cdf_from_histogram
    norm_cdf = cdf_fn(abs_weights, [0.0, max_value], nbins=self._spec.nbins)

    # Fraction of bins whose cdf is below the target sparsity, scaled to the
    # weight range.
    bins_below = math_ops.reduce_sum(
        math_ops.cast(math_ops.less(norm_cdf, sparsity), dtypes.float32))
    bin_fraction = math_ops.div(bins_below, float(self._spec.nbins))
    current_threshold = math_ops.multiply(bin_fraction, max_value)

    # Exponential moving average with the previous threshold.
    decay = self._spec.threshold_decay
    smoothed_threshold = math_ops.add_n([
        math_ops.multiply(current_threshold, 1 - decay),
        math_ops.multiply(threshold, decay)
    ])
    above_threshold = math_ops.greater(abs_weights, smoothed_threshold)
    new_mask = math_ops.cast(above_threshold, dtypes.float32)
  return smoothed_threshold, new_mask
def compute_weighted_loss(
    losses, weights=1.0, scope=None, loss_collection=ops.GraphKeys.LOSSES,
    reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
  """Computes the weighted loss.

  Args:
    losses: `Tensor` of shape `[batch_size, d1, ... dN]`.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `losses`, and must be broadcastable to `losses` (i.e., all dimensions must
      be either `1`, or the same as the corresponding `losses` dimension).
    scope: the scope for the operations performed in computing the loss.
    loss_collection: the loss will be added to these collections.
    reduction: Type of reduction to apply to loss.

  Returns:
    Weighted loss `Tensor` of the same type as `losses`. If `reduction` is
    `NONE`, this has the same shape as `losses`; otherwise, it is scalar.

  Raises:
    ValueError: If `weights` is `None` or the shape is not compatible with
      `losses`, or if the number of dimensions (rank) of either `losses` or
      `weights` is missing.

  Note:
    When calculating the gradient of a weighted loss contributions from
    both `losses` and `weights` are considered. If your `weights` depend
    on some model parameters but you do not want this to affect the loss
    gradient, you need to apply `tf.stop_gradient` to `weights` before
    passing them to `compute_weighted_loss`.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
  Reduction.validate(reduction)
  with ops.name_scope(scope, "weighted_loss", (losses, weights)):
    # Save the `reduction` argument for loss normalization when distributing
    # to multiple replicas. Used only for estimator + v1 optimizer flow.
    graph = ops.get_default_graph()
    graph._last_loss_reduction = reduction  # pylint: disable=protected-access

    broadcast_check = weights_broadcast_ops.assert_broadcastable(
        weights, losses)
    with ops.control_dependencies((broadcast_check,)):
      losses = ops.convert_to_tensor(losses)
      input_dtype = losses.dtype
      # All arithmetic is done in float32 and cast back at the end.
      losses = math_ops.cast(losses, dtype=dtypes.float32)
      weights = math_ops.cast(weights, dtype=dtypes.float32)
      weighted_losses = math_ops.multiply(losses, weights)

      loss = weighted_losses
      if reduction != Reduction.NONE:
        loss = math_ops.reduce_sum(weighted_losses)
        if reduction == Reduction.MEAN:
          total_weight = math_ops.reduce_sum(
              array_ops.ones_like(losses) * weights)
          loss = _safe_mean(loss, total_weight)
        elif reduction in (Reduction.SUM_BY_NONZERO_WEIGHTS,
                           Reduction.SUM_OVER_NONZERO_WEIGHTS):
          loss = _safe_mean(loss, _num_present(losses, weights))
        elif reduction == Reduction.SUM_OVER_BATCH_SIZE:
          loss = _safe_mean(loss, _num_elements(losses))

      # Convert the result back to the input type.
      loss = math_ops.cast(loss, input_dtype)
      util.add_loss(loss, loss_collection)
      return loss
def triplet_semihard_loss(labels, embeddings, margin=1.0):
  """Computes the triplet loss with semi-hard negative mining.

  The loss encourages the positive distances (between a pair of embeddings with
  the same labels) to be smaller than the minimum negative distance among
  which are at least greater than the positive distance plus the margin constant
  (called semi-hard negative) in the mini-batch. If no such negative exists,
  uses the largest negative distance instead.
  See: https://arxiv.org/abs/1503.03832.

  Args:
    labels: 1-D tf.int32 `Tensor` with shape [batch_size] of
      multiclass integer labels.
    embeddings: 2-D float `Tensor` of embedding vectors. Embeddings should
      be l2 normalized.
    margin: Float, margin term in the loss definition.

  Returns:
    triplet_loss: tf.float32 scalar.
  """
  # Reshape [batch_size] label tensor to a [batch_size, 1] label tensor.
  lshape = array_ops.shape(labels)
  assert lshape.shape == 1
  labels = array_ops.reshape(labels, [lshape[0], 1])

  # Build pairwise squared distance matrix.
  pdist_matrix = pairwise_distance(embeddings, squared=True)
  # Build pairwise binary adjacency matrix.
  adjacency = math_ops.equal(labels, array_ops.transpose(labels))
  # Invert so we can select negatives only.
  adjacency_not = math_ops.logical_not(adjacency)

  batch_size = array_ops.size(labels)
  tile_rows = [batch_size, 1]
  square_shape = [batch_size, batch_size]

  # Compute the mask.
  pdist_matrix_tile = array_ops.tile(pdist_matrix, tile_rows)
  negatives_tile = array_ops.tile(adjacency_not, tile_rows)
  positive_dists = array_ops.reshape(
      array_ops.transpose(pdist_matrix), [-1, 1])
  farther_than_positive = math_ops.greater(pdist_matrix_tile, positive_dists)
  mask = math_ops.logical_and(negatives_tile, farther_than_positive)

  # True where at least one negative lies outside the positive distance.
  outside_count = math_ops.reduce_sum(
      math_ops.cast(mask, dtype=dtypes.float32), 1, keep_dims=True)
  mask_final = array_ops.reshape(
      math_ops.greater(outside_count, 0.0), square_shape)
  mask_final = array_ops.transpose(mask_final)

  adjacency_not = math_ops.cast(adjacency_not, dtype=dtypes.float32)
  mask = math_ops.cast(mask, dtype=dtypes.float32)

  # negatives_outside: smallest D_an where D_an > D_ap.
  negatives_outside = array_ops.reshape(
      masked_minimum(pdist_matrix_tile, mask), square_shape)
  negatives_outside = array_ops.transpose(negatives_outside)

  # negatives_inside: largest D_an.
  negatives_inside = array_ops.tile(
      masked_maximum(pdist_matrix, adjacency_not), [1, batch_size])
  semi_hard_negatives = array_ops.where(
      mask_final, negatives_outside, negatives_inside)

  loss_mat = math_ops.add(margin, pdist_matrix - semi_hard_negatives)

  mask_positives = math_ops.cast(
      adjacency, dtype=dtypes.float32) - array_ops.diag(
          array_ops.ones([batch_size]))

  # In lifted-struct, the authors multiply 0.5 for upper triangular
  #   in semihard, they take all positive pairs except the diagonal.
  num_positives = math_ops.reduce_sum(mask_positives)

  hinged = math_ops.maximum(
      math_ops.multiply(loss_mat, mask_positives), 0.0)
  return math_ops.truediv(
      math_ops.reduce_sum(hinged),
      num_positives,
      name='triplet_semihard_loss')
def logloss(y_true, y_pred, epsilon=1e-7):
  """Binary log loss averaged over the last axis.

  `epsilon` is added inside both logarithms.
  """
  positive_term = math_ops.multiply(y_true, math_ops.log(y_pred + epsilon))
  negative_term = math_ops.multiply(
      (1 - y_true), math_ops.log(1 - y_pred + epsilon))
  log_likelihood = positive_term + negative_term
  return K.mean(-log_likelihood, axis=-1)
def _double_values(x):
  """Multiplies `x` by itself inside an `OpHint` region named "add_test".

  Note that, despite the function name, the wrapped computation is `x * x`.
  """
  hint = op_hint.OpHint("add_test")
  (x,) = hint.add_inputs(x)
  product = math_ops.multiply(x, x)
  (product,) = hint.add_outputs(product)
  return product
def npairs_loss_multilabel(sparse_labels, embeddings_anchor,
                           embeddings_positive, reg_lambda=0.002,
                           print_losses=False):
  r"""Computes the npairs loss with multilabel data.

  Npairs loss expects paired data where a pair is composed of samples from
  the same labels and each pair in the minibatch has different labels. The
  loss is the sum of two terms:

    * an L2 regularizer on the embedding vectors, and
    * a cross entropy term that uses each row of the pairwise similarity
      matrix as logits and the remapped labels as targets.

  The similarity is the dot product between two embedding vectors:
  S_{i,j} = f(x_i)^T f(x_j)

  For multilabel inputs the targets are built from the label-intersection
  counts, i.e.
  L_{i,j} = | set_of_labels_for(i) \cap set_of_labels_for(j) |
  and each row of that count matrix is then normalized to sum to one.

  Args:
    sparse_labels: List of 1-D Boolean `SparseTensor` of dense_shape
      [batch_size/2, num_classes] labels for the anchor-pos pairs.
    embeddings_anchor: 2-D `Tensor` of shape [batch_size/2, embedding_dim]
      for the embedding vectors for the anchor images. Embeddings should not
      be l2 normalized.
    embeddings_positive: 2-D `Tensor` of shape [batch_size/2, embedding_dim]
      for the embedding vectors for the positive images. Embeddings should
      not be l2 normalized.
    reg_lambda: Float. L2 regularization term on the embedding vectors.
    print_losses: Boolean. Option to print the xent and l2loss.

  Returns:
    npairs_loss: tf.float32 scalar.

  Raises:
    TypeError: When the specified sparse_labels is not a `SparseTensor`.
  """
  every_label_is_sparse = all(
      isinstance(label, sparse_tensor.SparseTensor) for label in sparse_labels)
  if not every_label_is_sparse:
    raise TypeError(
        'sparse_labels must be a list of SparseTensors, but got %s' % str(
            sparse_labels))

  with ops.name_scope('NpairsLossMultiLabel'):
    # L2 regularizer: mean squared norm of the anchor and positive embeddings.
    anchor_sq_norms = math_ops.reduce_sum(
        math_ops.square(embeddings_anchor), 1)
    reg_anchor = math_ops.reduce_mean(anchor_sq_norms)
    positive_sq_norms = math_ops.reduce_sum(
        math_ops.square(embeddings_positive), 1)
    reg_positive = math_ops.reduce_mean(positive_sq_norms)
    l2loss = math_ops.multiply(
        0.25 * reg_lambda, reg_anchor + reg_positive, name='l2loss')

    # Anchor-vs-positive dot products, used as logits.
    similarity_matrix = math_ops.matmul(
        embeddings_anchor, embeddings_positive,
        transpose_a=False, transpose_b=True)

    # TODO(coreylynch): need to check the sparse values
    # TODO(coreylynch): are composed only of 0's and 1's.
    adjacency_counts = _build_multilabel_adjacency(sparse_labels)
    labels_remapped = math_ops.to_float(adjacency_counts)
    # Normalize every row so it sums to one.
    row_totals = math_ops.reduce_sum(labels_remapped, 1, keep_dims=True)
    labels_remapped = labels_remapped / row_totals

    # Softmax cross entropy, averaged over rows.
    per_row_xent = nn.softmax_cross_entropy_with_logits(
        logits=similarity_matrix, labels=labels_remapped)
    xent_loss = math_ops.reduce_mean(per_row_xent, name='xentropy')

    if print_losses:
      xent_loss = logging_ops.Print(
          xent_loss, ['cross entropy:', xent_loss, 'l2loss:', l2loss])

    return l2loss + xent_loss
def testStripUnusedMultipleInputs(self):
  """Strips a two-input graph down to placeholders and re-runs it with feeds."""
  src_graph_name = "input_graph.pb"
  dst_graph_name = "output_graph.pb"

  # Build (c1 - 3) * (c2 - 5), plus a trailing add that is not needed to
  # compute the output node.
  with ops.Graph().as_default():
    const1 = constant_op.constant(1.0, name="constant_node1")
    const2 = constant_op.constant(2.0, name="constant_node2")
    in1 = math_ops.sub(const1, 3.0, name="input_node1")
    in2 = math_ops.sub(const2, 5.0, name="input_node2")
    out = math_ops.multiply(in1, in2, name="output_node")
    math_ops.add(out, 2.0, name="later_node")
    sess = session.Session()
    result = sess.run(out)
    self.assertNear(6.0, result, 0.00001)
    graph_io.write_graph(sess.graph, self.get_temp_dir(), src_graph_name)

  # The graph is now on disk; run the strip-unused routine on that file.
  src_graph_path = os.path.join(self.get_temp_dir(), src_graph_name)
  src_is_binary = False
  input_node_names = "input_node1,input_node2"
  input_node_types = [
      dtypes.float32.as_datatype_enum, dtypes.float32.as_datatype_enum
  ]
  dst_is_binary = True
  output_node_names = "output_node"
  dst_graph_path = os.path.join(self.get_temp_dir(), dst_graph_name)

  strip_unused_lib.strip_unused_from_files(
      src_graph_path, src_is_binary, dst_graph_path, dst_is_binary,
      input_node_names, output_node_names, input_node_types)

  # Reload the stripped graph: it must contain three nodes, none of them an
  # Add or Sub, and still produce the expected result when the inputs are fed.
  with ops.Graph().as_default():
    stripped_def = graph_pb2.GraphDef()
    with open(dst_graph_path, "rb") as f:
      stripped_def.ParseFromString(f.read())
      _ = importer.import_graph_def(stripped_def, name="")

    self.assertEqual(3, len(stripped_def.node))
    for node in stripped_def.node:
      self.assertNotEqual("Add", node.op)
      self.assertNotEqual("Sub", node.op)
      # NOTE(review): input_node_names is the comma-joined string
      # "input_node1,input_node2", so this equality cannot match a single
      # node name and the assertion below is never reached. Confirm whether
      # a per-name membership check was intended.
      if node.name == input_node_names:
        self.assertTrue("shape" in node.attr)

    with session.Session() as sess:
      fed_in1 = sess.graph.get_tensor_by_name("input_node1:0")
      fed_in2 = sess.graph.get_tensor_by_name("input_node2:0")
      fetched_out = sess.graph.get_tensor_by_name("output_node:0")
      result = sess.run(fetched_out,
                        feed_dict={
                            fed_in1: [10.0],
                            fed_in2: [-5.0]
                        })
      self.assertNear(-50.0, result, 0.00001)
def _einsum_reduction(t0, t0_axis_labels, t1, t1_axis_labels, axes_to_sum):
  """Helper for einsum() that computes the result of a two-argument einsum().

  Args:
    t0: a `Tensor`
    t0_axis_labels: a string of axis labels.  This string's length must equal
      the rank of t0.
    t1: a `Tensor`
    t1_axis_labels: a string of axis labels.  This string's length must equal
      the rank of t1.
    axes_to_sum: set of labels of axes to be summed over

  Returns:
    A pair `(product, product_axes)`: a `Tensor` whose elements are obtained
    by summing, over all axes in `axes_to_sum`, the corresponding elements of
    `t0` and `t1`, and a string with the axis labels of that tensor in order.

    For example, if t0_axis_labels == 'abijk', t1_axis_labels == 'acjkl', and
    axes_to_sum == {j,k}, this will return a tensor x where

      out[a,b,c,i,l] = sum_j sum_k t0[a,b,i,j,k] * t1[a,c,j,k,l]

  Raises:
    ValueError: if the rank of `t0` does not match the length of
      `t0_axis_labels`, or that of `t1` does not match the length of
      `t1_axis_labels`.
  """
  # Report which tensor is at fault and by how much, instead of raising a
  # bare ValueError that gives the caller nothing to act on.
  t0_rank = len(t0.get_shape())
  if len(t0_axis_labels) != t0_rank:
    raise ValueError(
        'Tensor t0 of rank %d does not match einsum axis labels %r of '
        'length %d' % (t0_rank, t0_axis_labels, len(t0_axis_labels)))
  t1_rank = len(t1.get_shape())
  if len(t1_axis_labels) != t1_rank:
    raise ValueError(
        'Tensor t1 of rank %d does not match einsum axis labels %r of '
        'length %d' % (t1_rank, t1_axis_labels, len(t1_axis_labels)))

  # This function computes the result of a two-argument einsum() using batch
  # matrix multiplication.  This involves
  # 1. transposing t0 and t1 so that axes are in the correct order for
  #    batch matrix multiplication, and
  # 2. reshaping t0 and t1 so that they are both of rank 3.

  # First, we divide axes into three groups:
  #  * "preserved" axes are present in both inputs and the output
  #  * "summed" axes are present in both inputs but not the output
  #  * "broadcast" axes are present in exactly one input and the output
  #
  # As an example, if the einsum is abijk,acjkl->abcil, then "a" is a
  # preserved axis, "b" and "c" are broadcast axes, and "j" and "k" are
  # summed axes.
  assert all(a in t0_axis_labels and a in t1_axis_labels for a in axes_to_sum)
  preserved_axes = (set(t0_axis_labels) & set(t1_axis_labels)) - axes_to_sum
  broadcast_axes = {}
  for i, sym_list in enumerate([t0_axis_labels, t1_axis_labels]):
    broadcast_axes[i] = set(sym_list) - preserved_axes - axes_to_sum

  # Reorder the axes so that:
  # 1. preserved axes come first in both inputs
  # 2. in input 0, broadcast axes come next, followed by summed axes
  # 3. in input 1, summed axes come next, followed by broadcast axes
  def sort_key(input_index, a):
    if a in preserved_axes:
      return (-1, a)
    elif ((input_index == 0 and a in broadcast_axes[0]) or
          (input_index == 1 and a in axes_to_sum)):
      return (0, a)
    else:
      return (1, a)

  axis_labels = [t0_axis_labels, t1_axis_labels]
  # The lambda is invoked by sorted() while `i` still has the value of the
  # current comprehension iteration, so late binding is not an issue here.
  sorted_axes = [
      sorted(sym_list, key=lambda a: sort_key(i, a))
      for i, sym_list in enumerate(axis_labels)
  ]
  inputs = [t0, t1]
  for i, axes_str in enumerate(axis_labels):
    perm = [axes_str.find(a) for a in sorted_axes[i]]
    inputs[i] = _transpose_if_necessary(inputs[i], perm)
  t0, t1 = inputs

  if not axes_to_sum:
    # In the special case where there are no axes to sum over, reduce to mul()
    # rather than to batch matrix multiplication.
    for _ in broadcast_axes[1]:
      t0 = array_ops.expand_dims(t0, -1)
    for _ in broadcast_axes[0]:
      t1 = array_ops.expand_dims(t1, len(preserved_axes))
    product = math_ops.multiply(t0, t1)
    product_axes = sorted_axes[0] + sorted_axes[1][len(preserved_axes):]
    return product, ''.join(product_axes)
  else:
    # Reduce to matmul().

    # Reshape both inputs so as to combine multiple broadcast axes
    # into a single axis, and combine multiple summed axes into a
    # single axis.

    t0_shape = tuple(x.value for x in t0.get_shape())
    num_broadcast_elements_t0 = _total_size(
        t0_shape[len(preserved_axes):-len(axes_to_sum)])
    num_summed_elements = _total_size(t0_shape[-len(axes_to_sum):])
    new_shape = t0_shape[:len(preserved_axes)] + (
        num_broadcast_elements_t0, num_summed_elements)
    t0 = _reshape_if_necessary(t0, new_shape)

    t1_shape = tuple(x.value for x in t1.get_shape())
    num_broadcast_elements_t1 = _total_size(
        t1_shape[len(preserved_axes) + len(axes_to_sum):])
    new_shape = t1_shape[:len(preserved_axes)] + (
        num_summed_elements, num_broadcast_elements_t1)
    t1 = _reshape_if_necessary(t1, new_shape)

    product = math_ops.matmul(t0, t1)

    # Undo compaction of broadcast axes
    uncompacted_shape = (
        t0_shape[:len(preserved_axes) + len(broadcast_axes[0])] +
        t1_shape[len(t1_shape) - len(broadcast_axes[1]):])
    product = _reshape_if_necessary(product, uncompacted_shape)

    product_axes = (
        sorted_axes[0][:len(preserved_axes) + len(broadcast_axes[0])] +
        sorted_axes[1][len(sorted_axes[1]) - len(broadcast_axes[1]):])

    return product, ''.join(product_axes)
def _FindFusedBatchNorms(graph):
  """Finds all ops and tensors related to found FusedBatchNorms.

  Matches a Conv2D / DepthwiseConv2dNative / MatMul op feeding a
  FusedBatchNorm op (optionally through the Reshape ops that surround the
  MatMul case) and collects the tensors involved. For batch norms with
  `is_training` set, it additionally installs a custom gradient op type on
  the batch norm op, undoes Bessel's correction on the batch variance, and
  looks up the moving-average update ops to find the moving mean/variance and
  their decay tensors.

  Args:
    graph: Graph to inspect.

  Yields:
    `_BatchNormMatch` objects, one per accepted match.
  """
  # Wildcard patterns; each one is kept in a variable so the matched
  # op/tensor can be retrieved from the match result afterwards.
  input_pattern = graph_matcher.OpTypePattern('*')
  weight_pattern = graph_matcher.OpTypePattern('*')
  gamma_pattern = graph_matcher.OpTypePattern('*')
  beta_pattern = graph_matcher.OpTypePattern('*')
  mean_pattern = graph_matcher.OpTypePattern('*')
  variance_pattern = graph_matcher.OpTypePattern('*')

  moving_average_pattern = graph_matcher.OpTypePattern('*')
  bn_decay_pattern = graph_matcher.OpTypePattern('*')
  layer_pattern = graph_matcher.OpTypePattern(
      'Conv2D|DepthwiseConv2dNative|MatMul',
      inputs=[input_pattern, weight_pattern])
  # MatMul has a Reshape between it and FusedBatchNorm.
  matmul_reshape_pattern = graph_matcher.OpTypePattern(
      'Reshape', inputs=[layer_pattern,
                         graph_matcher.OpTypePattern('*')])

  batch_norm_pattern = graph_matcher.OpTypePattern(
      'FusedBatchNorm',
      inputs=[
          graph_matcher.OneofPattern([matmul_reshape_pattern, layer_pattern]),
          gamma_pattern, beta_pattern, mean_pattern, variance_pattern
      ])
  matmul_bn_output_reshape_pattern = graph_matcher.OpTypePattern(
      'Reshape', inputs=[batch_norm_pattern,
                         graph_matcher.OpTypePattern('*')])

  bn_matcher = graph_matcher.GraphMatcher(
      graph_matcher.OneofPattern(
          [matmul_bn_output_reshape_pattern, batch_norm_pattern]))

  # Pattern for the moving-average update: Mul(Sub(moving_avg, bn), decay).
  moving_average_sub_pattern = graph_matcher.OpTypePattern(
      'Sub', inputs=[moving_average_pattern, batch_norm_pattern])
  moving_average_mul_pattern = graph_matcher.OpTypePattern(
      'Mul', inputs=[moving_average_sub_pattern, bn_decay_pattern])

  moving_avg_mul_matcher = graph_matcher.GraphMatcher(
      moving_average_mul_pattern)

  for match_result in bn_matcher.match_graph(graph):
    # These stay None unless the training branch below finds the
    # corresponding moving-average update ops.
    moving_mean_tensor = None
    moving_variance_tensor = None
    bn_decay_mean_tensor = None
    bn_decay_var_tensor = None
    layer_op = match_result.get_op(layer_pattern)
    layer_tensor = match_result.get_tensor(layer_pattern)
    bn_op = match_result.get_op(batch_norm_pattern)
    # Note: this is the value of the op's 'epsilon' attribute.
    batch_epsilon_tensor = bn_op.get_attr('epsilon')

    # In the MatMul case, the output of batch norm is reshaped back into a
    # 2D tensor, so the output_tensor is the output of the Reshape op.
    output_tensor = bn_op.outputs[0]
    if layer_op.type == 'MatMul':
      output_reshape_op = match_result.get_op(matmul_bn_output_reshape_pattern)
      # If the matcher didn't match matmul_bn_output_reshape, there will be
      # another match for this 'MatMul' later, so we can skip this one.
      if output_reshape_op is None:
        continue
      output_tensor = output_reshape_op.outputs[0]

    input_tensor = match_result.get_tensor(input_pattern)
    weight_tensor = match_result.get_tensor(weight_pattern)
    gamma_tensor = match_result.get_tensor(gamma_pattern)
    beta_tensor = match_result.get_tensor(beta_pattern)
    # FusedBatchNorm in training is different from that in inference. It takes
    # empty 'mean' and empty 'variance', and produces the mean and the variance
    # of the batch. Therefore, when is_training is true, mean_tensor and
    # variance_tensor point to 1st and 2nd (0-based) output of bn_op,
    # respectively; when is_training is false, they point to bn_op's inputs.
    is_training = bn_op.get_attr('is_training')
    if is_training:
      # FusedBatchNormGrad doesn't compute gradients of the batch_mean and
      # batch_variance outputs, so we need to substitute our own custom
      # gradient.
      # TODO(suharshs, raghuramank): Find a way to avoid needing this hack.
      # pylint: disable=protected-access
      bn_op._set_attr(
          '_gradient_op_type',
          attr_value_pb2.AttrValue(s=compat.as_bytes('FoldFusedBatchNormGrad')))
      # pylint: enable=protected-access
      mean_tensor = bn_op.outputs[1]
      # The batch variance used during forward and backward prop is biased,
      # i.e it is calculated as: V=sum(x(k)-mu)^2/N. For the moving average
      # calculation, the variance is corrected by the term N/N-1 (Bessel's
      # correction). The variance tensor read from FuseBatchNorm has bessel's
      # correction applied, so we undo it here.
      # The new ops are created under the same name scope as bn_op (everything
      # in bn_op.name up to and including the last '/').
      scope, sep, _ = bn_op.name.rpartition('/')
      g = ops.get_default_graph()
      with g.as_default(), g.name_scope(scope + sep):
        # n is the ratio of the layer output size to the batch mean size.
        n = math_ops.cast(
            array_ops.size(layer_tensor) / array_ops.size(mean_tensor),
            dtypes.float32)
        variance_tensor = math_ops.multiply(
            bn_op.outputs[2], (n - 1) / n, name='Undo_Bessel_Correction')
      # TODO(suharshs): Find a way to get rid of this inner match.
      for mul_match_result in moving_avg_mul_matcher.match_graph(graph):
        sub_op = mul_match_result.get_op(moving_average_sub_pattern)
        if sub_op.inputs[1].name == bn_op.outputs[1].name:
          # During training: Batch Mean is bn_op.outputs[1]
          moving_mean_tensor = sub_op.inputs[0]
          bn_decay_mean_tensor = mul_match_result.get_tensor(bn_decay_pattern)
        if sub_op.inputs[1].name == bn_op.outputs[2].name:
          # During training: Batch Var is bn_op.outputs[2]
          moving_variance_tensor = sub_op.inputs[0]
          bn_decay_var_tensor = mul_match_result.get_tensor(bn_decay_pattern)
    else:
      mean_tensor = match_result.get_tensor(mean_pattern)
      variance_tensor = match_result.get_tensor(variance_pattern)

    yield _BatchNormMatch(
        layer_op=layer_op,
        bn_op=bn_op,
        output_tensor=output_tensor,
        input_tensor=input_tensor,
        weight_tensor=weight_tensor,
        gamma_tensor=gamma_tensor,
        beta_tensor=beta_tensor,
        mean_tensor=mean_tensor,
        variance_tensor=variance_tensor,
        moving_mean_tensor=moving_mean_tensor,
        moving_variance_tensor=moving_variance_tensor,
        bn_decay_mean_tensor=bn_decay_mean_tensor,
        bn_decay_var_tensor=bn_decay_var_tensor,
        batch_epsilon_tensor=batch_epsilon_tensor)
def testArithmeticOptimizationActive(self):
  """Tests that tfdbg can dump the tensor from nodes created by Grappler."""
  session_config = _grappler_enabled_session_config()
  with session.Session(config=session_config) as sess:
    u = variables.VariableV1([[1, 2], [3, 4]], name="u", dtype=dtypes.float32)
    # u + u + u: Grappler should fuse these two adds into a single op,
    # either an AddN op or a Mul op.
    doubled = math_ops.add(u, u)
    tripled = math_ops.add(doubled, u)
    y = math_ops.multiply(tripled, u)

    sess.run(variables.global_variables_initializer())

    run_options = config_pb2.RunOptions(output_partition_graphs=True)
    debug_utils.watch_graph(
        run_options,
        sess.graph,
        debug_ops=["DebugIdentity"],
        debug_urls=[self._debug_url])

    run_metadata = config_pb2.RunMetadata()
    run_result = sess.run(y, options=run_options, run_metadata=run_metadata)
    self.assertAllClose(run_result, [[3, 12], [27, 48]])

    dump_data = debug_data.DebugDumpDir(
        self._dump_root,
        partition_graphs=run_metadata.partition_graphs,
        validate=True)

    nodes_before = {op.name for op in sess.graph.get_operations()}
    nodes_dumped = set(dump_data.nodes())
    created_by_grappler = nodes_dumped - nodes_before
    removed_by_grappler = nodes_before - nodes_dumped

    # Grappler must have both added new nodes and dropped original ones.
    self.assertTrue(created_by_grappler)
    self.assertTrue(removed_by_grappler)

    # Scan the Grappler-created nodes for the op that replaced the two adds
    # and check the tensor dumped for it.
    found_optimized_node = False
    for node_name in created_by_grappler:
      op_type = dump_data.node_op_type(node_name)
      is_fused_add = (
          (test_util.IsMklEnabled() and op_type in ("_MklAddN", "Mul")) or
          (op_type in ("AddN", "Mul")))
      if not is_fused_add:
        continue
      datum = dump_data.get_tensors(node_name, 0, "DebugIdentity")
      self.assertEqual(1, len(datum))
      self.assertAllClose(datum[0], [[3, 6], [9, 12]])
      found_optimized_node = True
      break
    self.assertTrue(
        found_optimized_node,
        "Failed to find optimized node created by Grappler's arithmetic "
        "optimization.")
def _ComputeBatchNormCorrections(context, match, freeze_batch_norm_delay,
                                 fused_batch_norm):
  """Computes batch norm correction params.

     Before batch normalization is frozen:
     We use batch statistics for batch norm.
       correction_scale = sigma_b/sigma_mv
       correction_recip = 1/correction_scale
       correction_offset = 0

     After batch normalization is frozen:
      correction_scale = sigma_b/sigma_mv
      correction_recip = 1
      correction_offset =  gamma*(mu_b/sigma_b-mu_mv/sigma_mv).

     Batch norm is frozen once the quantization step is greater than or equal
     to freeze_batch_norm_delay. If freeze_batch_norm_delay is None, batch
     norm is never frozen.

     The corrections ensure that:
     a) The weights are quantized after scaling by gamma/sigma_mv. This enables
     smoother training as the scaling on the weights changes slowly, rather than
     jump across mini-batches
     b) Changing the values of the corrections allows for one to switch between
     using batch statistics to using moving mean and average, without requiring
     changes to batch_norm

  Args:
    context: The scope under which we look for batch norm params
    match: Object containing required batch norm tensors for correction
      computation.
    freeze_batch_norm_delay: Delay in steps at which computation switches
      from regular batch norm to frozen mean and variance.
    fused_batch_norm: Bool, true if fused batch norm is used. The variance
      decay is only rerouted when this is exactly `False`.

  Returns:
    A tuple of correction_scale, correction_recip, correction_offset
  """

  g = ops.get_default_graph()
  with g.name_scope(context + '/batch_norm_correction'):
    # 1/sigma for the moving statistics and for the batch statistics.
    recip_sigma_mv = math_ops.rsqrt(
        match.moving_variance_tensor + match.batch_epsilon_tensor)
    recip_sigma = math_ops.rsqrt(
        match.variance_tensor + match.batch_epsilon_tensor)
    # (1/sigma_mv) / (1/sigma_b) == sigma_b / sigma_mv.
    correction_scale = math_ops.divide(
        recip_sigma_mv, recip_sigma, name='scale_compute')
    correction_scale = array_ops.identity(
        correction_scale, name='correction_scale')
    correction_recip = math_ops.reciprocal(
        correction_scale, name='reciprocal_compute')
    correction_offset = math_ops.multiply(
        match.gamma_tensor,
        match.mean_tensor * recip_sigma -
        match.moving_mean_tensor * recip_sigma_mv,
        name='offset_compute')

    if freeze_batch_norm_delay is not None:
      use_mv_avg = math_ops.greater_equal(
          common.CreateOrGetQuantizationStep(),
          freeze_batch_norm_delay,
          name='use_moving_average')
    else:
      use_mv_avg = False

    bn_decay_zero = 0.0
    # Capture the consumers before smart_cond below is built, so that only
    # these ops are passed as `can_modify` to the reroute.
    bn_decay_mean_consumers = list(match.bn_decay_mean_tensor.consumers())

    # Once frozen, the decay fed to the moving-mean update becomes zero.
    bn_decay_mean_out = utils.smart_cond(
        use_mv_avg,
        lambda: bn_decay_zero,
        lambda: match.bn_decay_mean_tensor,
        name='freeze_moving_mean')
    graph_editor.reroute_ts(
        [bn_decay_mean_out], [match.bn_decay_mean_tensor],
        can_modify=bn_decay_mean_consumers)

    # The variance decay is handled the same way, but only when
    # fused_batch_norm is exactly False (the identity check is deliberate and
    # preserved: None does not take this branch).
    if fused_batch_norm is False:
      bn_decay_var_consumers = list(match.bn_decay_var_tensor.consumers())
      bn_decay_var_out = utils.smart_cond(
          use_mv_avg,
          lambda: bn_decay_zero,
          lambda: match.bn_decay_var_tensor,
          name='freeze_moving_var')
      graph_editor.reroute_ts(
          [bn_decay_var_out], [match.bn_decay_var_tensor],
          can_modify=bn_decay_var_consumers)

    # Frozen: recip is all ones and the offset is active.
    # Not frozen: recip is 1/scale and the offset is all zeros.
    correction_recip = utils.smart_cond(
        use_mv_avg,
        lambda: array_ops.ones(correction_scale.shape),
        lambda: correction_recip,
        name='correction_recip')

    correction_offset = utils.smart_cond(
        use_mv_avg,
        lambda: correction_offset,
        lambda: array_ops.zeros(correction_offset.shape),
        name='correction_offset')
  return correction_scale, correction_recip, correction_offset
def _CreateFoldedOp(graph, context, has_scaling, freeze_batch_norm_delay,
                    is_training):
  """Folds in batch norm layer into preceding convolution or FC layer.

  Creates 3 new nodes, connects their inputs and adds them to the graph:
  mul is cloned into mul_fold, Conv2D or MatMul, or DepthwiseConv2d is cloned
  into respective *_Fold, add is cloned into add_fold.

  Args:
    graph: Graph to modify.
    context: String, batch norm context, i.e. node into which BatchNorm is
      nested.
    has_scaling: Whether the batch norm has scaling enabled.
    freeze_batch_norm_delay: How many steps to wait before freezing moving mean
      and variance and using them for batch normalization.
    is_training: Bool, true if training.

  Raises:
    ValueError: When operation type is not supported, or input and output tensor
      shapes mismatch for created operations: mul_fold, add_fold.

  Returns:
    A pair of Operations, the first is the original consumer node of the batch
      norm (../BatchNorm/batchnorm/add_1), the second is the consumer node of
      the folded graph (add_fold).
  """
  mul_scale_name = 'mul_1' if has_scaling else 'mul'
  mul_scale = graph.get_operation_by_name(context +
                                          '/BatchNorm/batchnorm/' +
                                          mul_scale_name)
  # The op producing the first input of the scale multiply is the layer being
  # folded; its second input is the weights tensor.
  op_below = mul_scale.inputs[0].op
  weights = op_below.inputs[1]
  match = _GetBatchNormParams(
      graph=graph, context=context, has_scaling=has_scaling)
  # Corrections are only computed (and later applied) when training.
  correction_scale, correction_recip, correction_offset = None, None, None
  if is_training:
    correction_scale, correction_recip, correction_offset = (
        _ComputeBatchNormCorrections(
            context=context,
            match=match,
            freeze_batch_norm_delay=freeze_batch_norm_delay,
            fused_batch_norm=False))
  # Special handling for weights of depthwise convolution.
  if op_below.type == 'DepthwiseConv2dNative':
    # Scale (and correction) are reshaped to the last two weight dimensions
    # before being multiplied with the weights.
    new_shape = [
        weights.get_shape().as_list()[2],
        weights.get_shape().as_list()[3]
    ]
    scale_name = 'mul' if has_scaling else 'Rsqrt'
    scale = graph.get_operation_by_name(
        context + '/BatchNorm/batchnorm/' + scale_name)
    scale = array_ops.reshape(scale.outputs[0], new_shape,
                              context + '/scale_reshape')

    if correction_scale is not None:
      correction_scale = array_ops.reshape(correction_scale, new_shape,
                                           context + '/correction_reshape')
      with ops.device(mul_scale.device):
        weights = math_ops.multiply(correction_scale, weights,
                                    context + '/correction_mult')

    mul_fold = _CloneOp(mul_scale, context + '/mul_fold', [(0, weights),
                                                           (1, scale)])
  elif op_below.type in ['Conv2D', 'MatMul']:

    if correction_scale is not None:
      with ops.device(mul_scale.device):
        weights = math_ops.multiply(correction_scale, weights,
                                    context + '/correction_mult')
    mul_fold = _CloneOp(mul_scale, context + '/mul_fold', [(0, weights)])
  else:
    # Report the op type, i.e. the same attribute the branches above dispatch
    # on (the message previously formatted `op_below.op`).
    raise ValueError('Cannot handle operation of type: %s' % op_below.type)
  _AssertShapesMatch('mul_fold', mul_fold.inputs[0], mul_fold.outputs[0])

  # Clone the layer op so that it consumes the folded weights.
  conv_or_fc_folded = _CloneOp(op_below, op_below.name + '_Fold',
                               [(1, mul_fold.outputs[0])])

  add_shift = graph.get_operation_by_name(
      context + '/BatchNorm/batchnorm/add_1')

  corrected_output = conv_or_fc_folded.outputs[0]
  if correction_offset is not None:
    with ops.device(conv_or_fc_folded.device):
      corrected_output = math_ops.multiply(correction_recip, corrected_output,
                                           context + '/post_conv_mul')
      corrected_output = math_ops.add(corrected_output, (correction_offset),
                                      context + '/correction_add')
  add_fold = _CloneOp(add_shift, context + '/add_fold',
                      [(0, corrected_output)])
  _AssertShapesMatch('add_fold', add_fold.inputs[0], add_fold.outputs[0])
  return add_shift, add_fold
def Inner():
  """Returns the gradient of `z = y * 3.0` with respect to `y`.

  `y` is not defined in this function; it is resolved from the enclosing
  scope.
  """
  z = math_ops.multiply(y, 3.0, name="z")
  return gradients_impl.gradients(z, y)[0]
def __rmul__(self, other):
  """Reflected `*`: returns `math_ops.multiply(other, self)`."""
  product = math_ops.multiply(other, self)
  return product
def _benchmark_tf_multiply_op(self, m, num_iters):
  """Hands a `math_ops.multiply(m, m)` callable to `self._run`.

  Args:
    m: Value used as both operands of the multiply.
    num_iters: Forwarded unchanged to `self._run`.
  """

  def multiply_by_itself():
    return math_ops.multiply(m, m)

  self._run(multiply_by_itself, num_iters)
def __mul__(self, other):
  """`*` operator: returns `math_ops.multiply(self, other)`."""
  product = math_ops.multiply(self, other)
  return product
def update_confusion_matrix_variables(variables_to_update,
                                      y_true,
                                      y_pred,
                                      thresholds,
                                      top_k=None,
                                      class_id=None,
                                      sample_weight=None,
                                      multi_label=False,
                                      label_weights=None):
  """Returns op to update the given confusion matrix variables.

  For every pair of values in y_true and y_pred:

  true_positive: y_true == True and y_pred > thresholds
  false_negatives: y_true == True and y_pred <= thresholds
  true_negatives: y_true == False and y_pred <= thresholds
  false_positive: y_true == False and y_pred > thresholds

  The results will be weighted and added together. When multiple thresholds are
  provided, we will repeat the same for every threshold.

  For estimation of these metrics over a stream of data, the function creates an
  `update_op` operation that updates the given variables.

  If `sample_weight` is `None`, weights default to 1.
  Use weights of 0 to mask values.

  Args:
    variables_to_update: Dictionary with 'tp', 'fn', 'tn', 'fp' as valid keys
      and corresponding variables to update as values.
    y_true: A `Tensor` whose shape matches `y_pred`. Will be cast to `bool`.
    y_pred: A floating point `Tensor` of arbitrary shape and whose values are in
      the range `[0, 1]`.
    thresholds: A float value, float tensor, python list, or tuple of float
      thresholds in `[0, 1]`, or NEG_INF (used when top_k is set).
    top_k: Optional int, indicates that the positive labels should be limited to
      the top k predictions.
    class_id: Optional int, limits the prediction and labels to the class
      specified by this argument.
    sample_weight: Optional `Tensor` whose rank is either 0, or the same rank as
      `y_true`, and must be broadcastable to `y_true` (i.e., all dimensions must
      be either `1`, or the same as the corresponding `y_true` dimension).
    multi_label: Optional boolean indicating whether multidimensional
      prediction/labels should be treated as multilabel responses, or flattened
      into a single label. When True, the values of `variables_to_update` must
      have a second dimension equal to the number of labels in y_true and
      y_pred, and those tensors must not be RaggedTensors.
    label_weights: (optional) tensor of non-negative weights for multilabel
      data. The weights are applied when calculating TP, FP, FN, and TN without
      explicit multilabel handling (i.e. when the data is to be flattened).

  Returns:
    Update op, or `None` when `variables_to_update` is `None`.

  Raises:
    ValueError: If `label_weights` is given together with `multi_label=True`,
      if `y_pred` and `y_true` have mismatched shapes, or if
      `sample_weight` is not `None` and its shape doesn't match `y_pred`, or if
      `variables_to_update` contains invalid keys.
  """
  if multi_label and label_weights is not None:
    raise ValueError('`label_weights` for multilabel data should be handled '
                     'outside of `update_confusion_matrix_variables` when '
                     '`multi_label` is True.')
  if variables_to_update is None:
    return
  if not any(
      key for key in variables_to_update if key in list(ConfusionMatrix)):
    raise ValueError(
        'Please provide at least one valid confusion matrix '
        'variable to update. Valid variable key options are: "{}". '
        'Received: "{}"'.format(
            list(ConfusionMatrix), variables_to_update.keys()))

  # All inputs are cast to the dtype of the first variable being updated.
  variable_dtype = list(variables_to_update.values())[0].dtype

  y_true = math_ops.cast(y_true, dtype=variable_dtype)
  y_pred = math_ops.cast(y_pred, dtype=variable_dtype)
  thresholds = ops.convert_to_tensor_v2_with_dispatch(
      thresholds, dtype=variable_dtype)
  num_thresholds = thresholds.shape[0]
  if multi_label:
    # True when `thresholds` has rank 1.
    one_thresh = math_ops.equal(
        math_ops.cast(1, dtype=dtypes.int32),
        array_ops.rank(thresholds),
        name='one_set_of_thresholds_cond')
  else:
    [y_pred,
     y_true], _ = ragged_assert_compatible_and_get_flat_values([y_pred, y_true],
                                                               sample_weight)
    one_thresh = math_ops.cast(True, dtype=dtypes.bool)

  invalid_keys = [
      key for key in variables_to_update if key not in list(ConfusionMatrix)
  ]
  if invalid_keys:
    raise ValueError(
        'Invalid keys: {}. Valid variable key options are: "{}"'.format(
            invalid_keys, list(ConfusionMatrix)))

  # The range assertions on y_pred are attached as control dependencies of the
  # ops created inside this block.
  with ops.control_dependencies([
      check_ops.assert_greater_equal(
          y_pred,
          math_ops.cast(0.0, dtype=y_pred.dtype),
          message='predictions must be >= 0'),
      check_ops.assert_less_equal(
          y_pred,
          math_ops.cast(1.0, dtype=y_pred.dtype),
          message='predictions must be <= 1')
  ]):
    if sample_weight is None:
      y_pred, y_true = losses_utils.squeeze_or_expand_dimensions(
          y_pred, y_true)
    else:
      sample_weight = math_ops.cast(sample_weight, dtype=variable_dtype)
      y_pred, y_true, sample_weight = (
          losses_utils.squeeze_or_expand_dimensions(
              y_pred, y_true, sample_weight=sample_weight))
  y_pred.shape.assert_is_compatible_with(y_true.shape)

  if top_k is not None:
    y_pred = _filter_top_k(y_pred, top_k)
  if class_id is not None:
    # Keep only the entries for `class_id` along the last axis.
    y_true = y_true[..., class_id]
    y_pred = y_pred[..., class_id]

  pred_shape = array_ops.shape(y_pred)
  num_predictions = pred_shape[0]
  if y_pred.shape.ndims == 1:
    num_labels = 1
  else:
    # Product of all dimensions after the first.
    num_labels = gen_math_ops.Prod(input=pred_shape[1:], axis=0)
  thresh_label_tile = control_flow_ops.cond(
      one_thresh, lambda: num_labels,
      lambda: math_ops.cast(1, dtype=dtypes.int32))

  # Reshape predictions and labels, adding a dim for thresholding.
  if multi_label:
    predictions_extra_dim = array_ops.expand_dims(y_pred, 0)
    labels_extra_dim = array_ops.expand_dims(
        math_ops.cast(y_true, dtype=dtypes.bool), 0)
  else:
    # Flatten predictions and labels when not multilabel.
    predictions_extra_dim = array_ops.reshape(y_pred, [1, -1])
    labels_extra_dim = array_ops.reshape(
        math_ops.cast(y_true, dtype=dtypes.bool), [1, -1])

  # Tile the thresholds for every prediction.
  if multi_label:
    thresh_pretile_shape = [num_thresholds, 1, -1]
    thresh_tiles = [1, num_predictions, thresh_label_tile]
    data_tiles = [num_thresholds, 1, 1]
  else:
    thresh_pretile_shape = [num_thresholds, -1]
    thresh_tiles = [1, num_predictions * num_labels]
    data_tiles = [num_thresholds, 1]

  thresh_tiled = array_ops.tile(
      array_ops.reshape(thresholds, thresh_pretile_shape),
      array_ops.stack(thresh_tiles))

  # Tile the predictions for every threshold.
  preds_tiled = array_ops.tile(predictions_extra_dim, data_tiles)

  # Compare predictions and threshold.
  pred_is_pos = math_ops.greater(preds_tiled, thresh_tiled)

  # Tile labels by number of thresholds
  label_is_pos = array_ops.tile(labels_extra_dim, data_tiles)

  if sample_weight is not None:
    sample_weight = weights_broadcast_ops.broadcast_weights(
        math_ops.cast(sample_weight, dtype=variable_dtype), y_pred)
    weights_tiled = array_ops.tile(
        array_ops.reshape(sample_weight, thresh_tiles), data_tiles)
  else:
    weights_tiled = None

  if label_weights is not None and not multi_label:
    label_weights = array_ops.expand_dims(label_weights, 0)
    label_weights = weights_broadcast_ops.broadcast_weights(
        label_weights, y_pred)
    label_weights_tiled = array_ops.tile(
        array_ops.reshape(label_weights, thresh_tiles), data_tiles)
    # Combine with the sample weights when both are present.
    if weights_tiled is None:
      weights_tiled = label_weights_tiled
    else:
      weights_tiled = math_ops.multiply(weights_tiled, label_weights_tiled)

  update_ops = []

  def weighted_assign_add(label, pred, weights, var):
    # Adds to `var` the (optionally weighted) count of positions where both
    # `label` and `pred` are True, summed over axis 1.
    label_and_pred = math_ops.cast(
        math_ops.logical_and(label, pred), dtype=var.dtype)
    if weights is not None:
      label_and_pred *= math_ops.cast(weights, dtype=var.dtype)
    return var.assign_add(math_ops.reduce_sum(label_and_pred, 1))

  # Maps each confusion-matrix entry to the (label, prediction) masks whose
  # conjunction it counts. Negated masks are only built when an entry that
  # needs them was requested.
  loop_vars = {
      ConfusionMatrix.TRUE_POSITIVES: (label_is_pos, pred_is_pos),
  }
  update_tn = ConfusionMatrix.TRUE_NEGATIVES in variables_to_update
  update_fp = ConfusionMatrix.FALSE_POSITIVES in variables_to_update
  update_fn = ConfusionMatrix.FALSE_NEGATIVES in variables_to_update

  if update_fn or update_tn:
    pred_is_neg = math_ops.logical_not(pred_is_pos)
    loop_vars[ConfusionMatrix.FALSE_NEGATIVES] = (label_is_pos, pred_is_neg)

  if update_fp or update_tn:
    label_is_neg = math_ops.logical_not(label_is_pos)
    loop_vars[ConfusionMatrix.FALSE_POSITIVES] = (label_is_neg, pred_is_pos)
    if update_tn:
      loop_vars[ConfusionMatrix.TRUE_NEGATIVES] = (label_is_neg, pred_is_neg)

  # Only entries that are actually present in variables_to_update get an
  # update op.
  for matrix_cond, (label, pred) in loop_vars.items():

    if matrix_cond in variables_to_update:
      update_ops.append(
          weighted_assign_add(label, pred, weights_tiled,
                              variables_to_update[matrix_cond]))

  return control_flow_ops.group(update_ops)
def lifted_struct_loss(labels, embeddings, margin=1.0):
  """Computes the lifted structured loss.

  For each pair of embeddings that share a label, the loss pushes their
  distance below the distances to embeddings with a different label in the
  same mini-batch. Negatives are aggregated with a log-sum-exp so that the
  result is differentiable with respect to the embedding vectors.
  See: https://arxiv.org/abs/1511.06452.

  Args:
    labels: 1-D tf.int32 `Tensor` with shape [batch_size] of
      multiclass integer labels.
    embeddings: 2-D float `Tensor` of embedding vectors. Embeddings should not
      be l2 normalized.
    margin: Float, margin term in the loss definition.

  Returns:
    lifted_loss: tf.float32 scalar.
  """
  # Turn the [batch_size] label vector into a [batch_size, 1] column.
  label_shape = array_ops.shape(labels)
  assert label_shape.shape == 1
  label_column = array_ops.reshape(labels, [label_shape[0], 1])

  # Distances between every pair of embeddings, as given by pairwise_distance.
  distances = pairwise_distance(embeddings)

  # same_label[i, j] is True when examples i and j carry the same label.
  same_label = math_ops.equal(label_column, array_ops.transpose(label_column))
  # The complement selects the negative pairs.
  different_label = math_ops.logical_not(same_label)

  batch_size = array_ops.size(label_column)

  margin_diff = margin - distances
  negative_mask = math_ops.cast(different_label, dtype=dtypes.float32)

  # Row-wise maximum restricted to negatives: shift each row so every entry is
  # non-negative, zero out the positives with the mask, take the max, and then
  # undo the shift.
  row_min = math_ops.reduce_min(margin_diff, 1, keep_dims=True)
  shifted_negatives = math_ops.multiply(margin_diff - row_min, negative_mask)
  row_negative_max = math_ops.reduce_max(
      shifted_negatives, 1, keep_dims=True) + row_min

  # pair_max[i, j] = max(m_i, m_j), with m_i the largest (margin - distance)
  # over the negatives of row i. It is subtracted before exponentiating and
  # added back after the log.
  # This matches the Caffe loss layer implementation at:
  # https://github.com/rksltnl/Caffe-Deep-Metric-Learning-CVPR16/blob/0efd7544a9846f58df923c8b992198ba5c355454/src/caffe/layers/lifted_struct_similarity_softmax_layer.cpp  # pylint: disable=line-too-long
  pair_max = math_ops.maximum(
      row_negative_max, array_ops.transpose(row_negative_max))
  tiled_diff = array_ops.tile(margin_diff, [batch_size, 1])
  tiled_mask = array_ops.tile(negative_mask, [batch_size, 1])
  pair_max_column = array_ops.reshape(array_ops.transpose(pair_max), [-1, 1])

  masked_exp = math_ops.multiply(
      math_ops.exp(tiled_diff - pair_max_column), tiled_mask)
  exp_row_sums = math_ops.reduce_sum(masked_exp, 1, keep_dims=True)
  exp_sum_left = array_ops.reshape(exp_row_sums, [batch_size, batch_size])

  exp_sum_both = exp_sum_left + array_ops.transpose(exp_sum_left)
  loss_matrix = pair_max + math_ops.log(exp_sum_both)
  # Add the distance of the positive pair itself.
  loss_matrix = loss_matrix + distances

  # Positive pairs are same-label pairs minus the diagonal (self pairs).
  same_label_float = math_ops.cast(same_label, dtype=dtypes.float32)
  positive_mask = same_label_float - array_ops.diag(
      array_ops.ones([batch_size]))

  # Each positive pair appears twice in the symmetric mask, hence the / 2.
  # The 0.25 below is 0.5 for counting only the upper triangle times another
  # 0.5 for the 1/2 factor of the squared loss.
  positive_pairs = math_ops.reduce_sum(positive_mask) / 2.0

  hinge = math_ops.maximum(
      math_ops.multiply(loss_matrix, positive_mask), 0.0)
  squared_hinge_sum = math_ops.reduce_sum(math_ops.square(hinge))
  return math_ops.truediv(
      0.25 * squared_hinge_sum, positive_pairs, name='liftedstruct_loss')
def polynomial_decay(learning_rate, global_step, decay_steps,
                     end_learning_rate=0.0001, power=1.0,
                     cycle=False, name=None):
  """Applies a polynomial decay to the learning rate.

  It is commonly observed that a monotonically decreasing learning rate, whose
  degree of change is carefully chosen, results in a better performing model.
  This function applies a polynomial decay function to a provided initial
  `learning_rate` to reach an `end_learning_rate` in the given `decay_steps`.

  It requires a `global_step` value to compute the decayed learning rate.  You
  can just pass a TensorFlow variable that you increment at each training step.

  The function returns the decayed learning rate.  It is computed as:

  ```python
  global_step = min(global_step, decay_steps)
  decayed_learning_rate = (learning_rate - end_learning_rate) *
                          (1 - global_step / decay_steps) ^ (power) +
                          end_learning_rate

  ```

  If `cycle` is True then a multiple of `decay_steps` is used, the first one
  that is not smaller than `global_step` (and at least one full period, so
  that step 0 yields `learning_rate` rather than a 0 / 0 division).

  ```python
  decay_steps = decay_steps * max(1, ceil(global_step / decay_steps))
  decayed_learning_rate = (learning_rate - end_learning_rate) *
                          (1 - global_step / decay_steps) ^ (power) +
                          end_learning_rate

  ```

  Example: decay from 0.1 to 0.01 in 10000 steps using sqrt (i.e. power=0.5):

  ```python
  ...
  global_step = tf.Variable(0, trainable=False)
  starter_learning_rate = 0.1
  end_learning_rate = 0.01
  decay_steps = 10000
  learning_rate = tf.train.polynomial_decay(starter_learning_rate, global_step,
                                            decay_steps, end_learning_rate,
                                            power=0.5)
  # Passing global_step to minimize() will increment it at each step.
  learning_step = (
      tf.train.GradientDescentOptimizer(learning_rate)
      .minimize(...my loss..., global_step=global_step)
  )
  ```

  Args:
    learning_rate: A scalar `float32` or `float64` `Tensor` or a
      Python number.  The initial learning rate.
    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
      Global step to use for the decay computation.  Must not be negative.
    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
      Must be positive.  See the decay computation above.
    end_learning_rate: A scalar `float32` or `float64` `Tensor` or a
      Python number.  The minimal end learning rate.
    power: A scalar `float32` or `float64` `Tensor` or a
      Python number.  The power of the polynomial. Defaults to linear, 1.0.
    cycle: A boolean, whether or not it should cycle beyond decay_steps.
    name: String.  Optional name of the operation. Defaults to
      'PolynomialDecay'.

  Returns:
    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    learning rate.

  Raises:
    ValueError: if `global_step` is not supplied.
  """
  if global_step is None:
    raise ValueError("global_step is required for polynomial_decay.")
  with ops.name_scope(
      name, "PolynomialDecay",
      [learning_rate, global_step, decay_steps, end_learning_rate, power
      ]) as name:
    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    # All arithmetic is carried out in the dtype of the learning rate.
    dtype = learning_rate.dtype
    global_step = math_ops.cast(global_step, dtype)
    decay_steps = math_ops.cast(decay_steps, dtype)
    end_learning_rate = math_ops.cast(end_learning_rate, dtype)
    power = math_ops.cast(power, dtype)
    if cycle:
      # Stretch decay_steps to the first multiple that covers global_step.
      # ceil() is 0 at global_step == 0, which would make decay_steps 0 and
      # the ratio below 0 / 0 (NaN); clamp the multiplier to at least 1.
      multiplier = math_ops.maximum(
          math_ops.ceil(global_step / decay_steps), math_ops.cast(1, dtype))
      decay_steps = math_ops.multiply(decay_steps, multiplier)
    else:
      # Make sure that the global_step used is not bigger than decay_steps.
      global_step = math_ops.minimum(global_step, decay_steps)

    # Fraction of the (possibly stretched) decay period already elapsed.
    p = math_ops.div(global_step, decay_steps)
    return math_ops.add(
        math_ops.multiply(learning_rate - end_learning_rate,
                          math_ops.pow(1 - p, power)),
        end_learning_rate,
        name=name)
def loss_fn(ev):
  """Returns the sum of twice the rows 0, 1, 2, 5, 6, 7 looked up in `ev`."""
  row_ids = math_ops.cast([0, 1, 2, 5, 6, 7], dtypes.int64)
  looked_up = embedding_ops.embedding_lookup(ev, row_ids)
  doubled = math_ops.multiply(looked_up, 2.0, name='multiply')
  return math_ops.reduce_sum(doubled, name='reduce_sum')
def weighted_moments(x, axes, frequency_weights, name=None, keep_dims=False):
  """Computes the mean and variance of `x` weighted by `frequency_weights`.

  Args:
    x: A tensor.
    axes: 1-d tensor of int32 values; these are the axes along which
      to compute mean and variance.
    frequency_weights: A tensor of positive weights which can be
      broadcast with x.
    name: Name used to scope the operation.
    keep_dims: Produce moments with the same dimensionality as the input.

  Returns:
    Two tensors: `weighted_mean` and `weighted_variance`.
  """
  with ops.name_scope(name, "weighted_moments", [x, frequency_weights, axes]):
    values = ops.convert_to_tensor(x, name="x")
    weights = ops.convert_to_tensor(
        frequency_weights, name="frequency_weights")

    # A plain two-pass computation (mean first, then squared deviations).
    # float16 input is widened to float32 for the arithmetic and narrowed
    # again just before returning.
    is_half = values.dtype == dtypes.float16
    if is_half:
      values = math_ops.cast(values, dtypes.float32)

    if weights.dtype != values.dtype:
      weights = math_ops.cast(weights, values.dtype)

    # Every reduction keeps the reduced dimensions, whatever `keep_dims` says,
    # so intermediate results stay broadcast-compatible with the inputs.
    weighted_sum = math_ops.reduce_sum(
        weights * values, axes, name="weighted_input_sum", keep_dims=True)

    # The weights only need to be broadcast-compatible with the values, not
    # the same shape. Adding zeros shaped like the values expands them to one
    # weight per element, so the total weight can be obtained by a plain
    # reduction without reasoning about broadcasting rules.
    full_weights = weights + array_ops.zeros_like(values)
    weight_total = math_ops.reduce_sum(
        full_weights, axes, name="sum_of_weights", keep_dims=True)
    inv_weight_total = math_ops.reciprocal(weight_total, name="inv_weight_sum")

    mean = math_ops.multiply(weighted_sum, inv_weight_total)

    # Second pass: weighted squared deviations from the mean.
    squared_deviation = math_ops.squared_difference(values, mean)
    weighted_sq_dev = math_ops.reduce_sum(
        weights * squared_deviation,
        axes,
        name="weighted_distsq",
        keep_dims=True)
    variance = math_ops.multiply(weighted_sq_dev, inv_weight_total)

    if not keep_dims:
      mean = array_ops.squeeze(mean, squeeze_dims=axes)
      variance = array_ops.squeeze(variance, squeeze_dims=axes)

    if is_half:
      mean = math_ops.cast(mean, dtypes.float16)
      variance = math_ops.cast(variance, dtypes.float16)

    return mean, variance
def inverse_time_decay(learning_rate, global_step, decay_steps, decay_rate,
                       staircase=False, name=None):
  """Applies inverse time decay to the initial learning rate.

  When training a model, it is often recommended to lower the learning rate as
  the training progresses.  This function applies an inverse decay function
  to a provided initial learning rate.  It requires a `global_step` value to
  compute the decayed learning rate.  You can just pass a TensorFlow variable
  that you increment at each training step.

  The function returns the decayed learning rate.  It is computed as:

  ```python
  decayed_learning_rate = learning_rate /
                          (1 + decay_rate * global_step / decay_steps)
  ```

  or, if `staircase` is `True`, as:

  ```python
  decayed_learning_rate = learning_rate /
                          (1 + decay_rate * floor(global_step / decay_steps))
  ```

  Example: decay 1/t with a rate of 0.5:

  ```python
  ...
  global_step = tf.Variable(0, trainable=False)
  learning_rate = 0.1
  decay_steps = 1.0
  decay_rate = 0.5
  learning_rate = tf.train.inverse_time_decay(learning_rate, global_step,
                                              decay_steps, decay_rate)

  # Passing global_step to minimize() will increment it at each step.
  learning_step = (
      tf.train.GradientDescentOptimizer(learning_rate)
      .minimize(...my loss..., global_step=global_step)
  )
  ```

  Args:
    learning_rate: A scalar `float32` or `float64` `Tensor` or a
      Python number.  The initial learning rate.
    global_step: A scalar numeric `Tensor` or a Python number.  Global step to
      use for the decay computation.  Must not be negative.  It is cast to
      the dtype of `learning_rate`.
    decay_steps: How often to apply decay.  It is cast to the dtype of
      `learning_rate` and divides `global_step`.
    decay_rate: A scalar numeric `Tensor` or a Python number.  The decay
      rate.  It is cast to the dtype of `learning_rate`.
    staircase: Whether to apply decay in a discrete staircase, as opposed to
      continuous, fashion.
    name: String.  Optional name of the operation.  Defaults to
      'InverseTimeDecay'.

  Returns:
    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    learning rate.

  Raises:
    ValueError: if `global_step` is not supplied.
  """
  if global_step is None:
    raise ValueError("global_step is required for inverse_time_decay.")
  # decay_steps is listed with the other inputs, as the sibling decay
  # functions do for their name scopes.
  with ops.name_scope(
      name, "InverseTimeDecay",
      [learning_rate, global_step, decay_steps, decay_rate]) as name:
    learning_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    # All arithmetic is carried out in the dtype of the learning rate.
    dtype = learning_rate.dtype
    global_step = math_ops.cast(global_step, dtype)
    decay_steps = math_ops.cast(decay_steps, dtype)
    decay_rate = math_ops.cast(decay_rate, dtype)
    # Number of decay periods elapsed; whole periods only when staircase.
    p = global_step / decay_steps
    if staircase:
      p = math_ops.floor(p)
    const = math_ops.cast(constant_op.constant(1), learning_rate.dtype)
    denom = math_ops.add(const, math_ops.multiply(decay_rate, p))
    return math_ops.div(learning_rate, denom, name=name)
def _CloneMul(self, op, inputs, new_name):
  """Builds a multiply of the first two `inputs` named `new_name`.

  Args:
    op: Ignored.
    inputs: Sequence whose first two elements are the operands.
    new_name: Name given to the new multiply.

  Returns:
    The `.op` of the tensor produced by `math_ops.multiply`.
  """
  del op  # Unused.
  lhs, rhs = inputs[0], inputs[1]
  product = math_ops.multiply(lhs, rhs, name=new_name)
  return product.op
def exponential_decay(learning_rate, global_step, decay_steps, decay_rate,
                      staircase=False, name=None):
  """Applies exponential decay to the learning rate.

  Lowering the learning rate as training progresses is a common
  recommendation.  This function scales a given initial learning rate by an
  exponentially decaying factor.  It needs a `global_step` value to do so; a
  TensorFlow variable that is incremented at each training step can be passed
  directly.

  The returned decayed learning rate is computed as:

  ```python
  decayed_learning_rate = learning_rate *
                          decay_rate ^ (global_step / decay_steps)

  ```

  If the argument `staircase` is `True`, then `global_step / decay_steps` is an
  integer division and the decayed learning rate follows a
  staircase function.

  Example: decay every 100000 steps with a base of 0.96:

  ```python
  ...
  global_step = tf.Variable(0, trainable=False)
  starter_learning_rate = 0.1
  learning_rate = tf.train.exponential_decay(starter_learning_rate, global_step,
                                             100000, 0.96, staircase=True)
  # Passing global_step to minimize() will increment it at each step.
  learning_step = (
      tf.train.GradientDescentOptimizer(learning_rate)
      .minimize(...my loss..., global_step=global_step)
  )
  ```

  Args:
    learning_rate: A scalar `float32` or `float64` `Tensor` or a
      Python number.  The initial learning rate.
    global_step: A scalar `int32` or `int64` `Tensor` or a Python number.
      Global step to use for the decay computation.  Must not be negative.
    decay_steps: A scalar `int32` or `int64` `Tensor` or a Python number.
      Must be positive.  See the decay computation above.
    decay_rate: A scalar `float32` or `float64` `Tensor` or a
      Python number.  The decay rate.
    staircase: Boolean.  If `True` decay the learning rate at discrete intervals
    name: String.  Optional name of the operation.  Defaults to
      'ExponentialDecay'.

  Returns:
    A scalar `Tensor` of the same type as `learning_rate`.  The decayed
    learning rate.

  Raises:
    ValueError: if `global_step` is not supplied.
  """
  if global_step is None:
    raise ValueError("global_step is required for exponential_decay.")
  scope_inputs = [learning_rate, global_step, decay_steps, decay_rate]
  with ops.name_scope(name, "ExponentialDecay", scope_inputs) as op_name:
    base_rate = ops.convert_to_tensor(learning_rate, name="learning_rate")
    # Everything else is brought to the dtype of the learning rate.
    rate_dtype = base_rate.dtype
    step = math_ops.cast(global_step, rate_dtype)
    period = math_ops.cast(decay_steps, rate_dtype)
    factor = math_ops.cast(decay_rate, rate_dtype)
    exponent = step / period
    if staircase:
      # Only whole decay periods count in staircase mode.
      exponent = math_ops.floor(exponent)
    decay = math_ops.pow(factor, exponent)
    return math_ops.multiply(base_rate, decay, name=op_name)
def huber_loss(labels, predictions, weights=1.0, delta=1.0, scope=None,
               loss_collection=ops.GraphKeys.LOSSES,
               reduction=Reduction.SUM_BY_NONZERO_WEIGHTS):
  """Adds a Huber Loss term to the training procedure.

  For each value x in `error = predictions - labels`, the following is
  calculated (the result depends on |x| only):

  ```
    0.5 * x^2                  if |x| <= d
    0.5 * d^2 + d * (|x| - d)  if |x| > d
  ```

  where d is `delta`.

  See: https://en.wikipedia.org/wiki/Huber_loss

  `weights` acts as a coefficient for the loss. If a scalar is provided, then
  the loss is simply scaled by the given value. If `weights` is a tensor of size
  `[batch_size]`, then the total loss for each sample of the batch is rescaled
  by the corresponding element in the `weights` vector. If the shape of
  `weights` matches the shape of `predictions`, then the loss of each
  measurable element of `predictions` is scaled by the corresponding value of
  `weights`.

  Args:
    labels: The ground truth output tensor, same dimensions as 'predictions'.
    predictions: The predicted outputs.
    weights: Optional `Tensor` whose rank is either 0, or the same rank as
      `labels`, and must be broadcastable to `labels` (i.e., all dimensions must
      be either `1`, or the same as the corresponding `losses` dimension).
    delta: `float`, the point where the huber loss function
      changes from a quadratic to linear.
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.
    reduction: Type of reduction to apply to loss.

  Returns:
    Weighted loss float `Tensor`. If `reduction` is `NONE`, this has the same
    shape as `labels`; otherwise, it is scalar.

  Raises:
    ValueError: If the shape of `predictions` doesn't match that of `labels` or
      if the shape of `weights` is invalid.  Also if `labels` or
      `predictions` is None.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
  if labels is None:
    raise ValueError("labels must not be None.")
  if predictions is None:
    raise ValueError("predictions must not be None.")
  with ops.name_scope(scope, "huber_loss",
                      (predictions, labels, weights)) as loss_scope:
    preds = math_ops.cast(predictions, dtype=dtypes.float32)
    targets = math_ops.cast(labels, dtype=dtypes.float32)
    preds.get_shape().assert_is_compatible_with(targets.get_shape())

    residual = math_ops.subtract(preds, targets)
    abs_residual = math_ops.abs(residual)
    # The part of |error| that falls in the quadratic region (capped at delta).
    clipped = math_ops.minimum(abs_residual, delta)
    # The remainder has the same value as tf.maximum(abs_error - delta, 0),
    # but its gradient at abs_error == delta is 0 where tf.maximum would give
    # 1. That keeps the gradient from being doubled, because the quadratic
    # term already contributes a nonzero gradient there.
    excess = math_ops.subtract(abs_residual, clipped)

    half = ops.convert_to_tensor(0.5, dtype=clipped.dtype)
    clipped_squared = math_ops.multiply(clipped, clipped)
    quadratic_term = math_ops.multiply(half, clipped_squared)
    linear_term = math_ops.multiply(delta, excess)
    per_element = math_ops.add(quadratic_term, linear_term)

    return compute_weighted_loss(
        per_element, weights, loss_scope, loss_collection,
        reduction=reduction)
def f(x):
  """Returns the tuple (obj.v * x, obj.v * (x + 1), None)."""
  scaled = math_ops.multiply(obj.v, x)
  scaled_next = math_ops.multiply(obj.v, (x + 1))
  return (scaled, scaled_next, None)
def _compute_sampled_logits(weights,
                            biases,
                            labels,
                            inputs,
                            num_sampled,
                            num_classes,
                            num_true=1,
                            sampled_values=None,
                            subtract_log_q=True,
                            remove_accidental_hits=False,
                            partition_strategy="mod",
                            name=None):
  """Helper function for nce_loss and sampled_softmax_loss functions.

  Computes sampled output training logits and labels suitable for implementing
  e.g. noise-contrastive estimation (see nce_loss) or sampled softmax (see
  sampled_softmax_loss).

  The true and sampled class ids are concatenated and looked up together in
  `weights` and `biases`; the result is then sliced back into its "true" and
  "sampled" parts to build the two groups of logits.

  Note: In the case where num_true > 1, we assign to each target class
  the target probability 1 / num_true so that the target probabilities
  sum to 1 per-example.

  Args:
    weights: A `Tensor` of shape `[num_classes, dim]`, or a list of `Tensor`
        objects whose concatenation along dimension 0 has shape
        `[num_classes, dim]`.  The (possibly-partitioned) class embeddings.
        A `PartitionedVariable` is converted to a list of its parts.
    biases: A `Tensor` of shape `[num_classes]`.  The class biases.
    labels: A `Tensor` of type `int64` and shape `[batch_size,
        num_true]`. The target classes.  Note that this format differs from
        the `labels` argument of `nn.softmax_cross_entropy_with_logits`.
        Other dtypes are cast to `int64`.
    inputs: A `Tensor` of shape `[batch_size, dim]`.  The forward
        activations of the input network.
    num_sampled: An `int`.  The number of classes to randomly sample per batch.
    num_classes: An `int`. The number of possible classes.
    num_true: An `int`.  The number of target classes per training example.
    sampled_values: a tuple of (`sampled_candidates`, `true_expected_count`,
        `sampled_expected_count`) returned by a `*_candidate_sampler` function.
        (if None, we default to `log_uniform_candidate_sampler`)
    subtract_log_q: A `bool`.  whether to subtract the log expected count of
        the labels in the sample to get the logits of the true labels.
        Default is True.  Turn off for Negative Sampling.
    remove_accidental_hits:  A `bool`.  whether to remove "accidental hits"
        where a sampled class equals one of the target classes.  Default is
        False.
    partition_strategy: A string specifying the partitioning strategy, relevant
        if `len(weights) > 1`. Currently `"div"` and `"mod"` are supported.
        Default is `"mod"`. See `tf.nn.embedding_lookup` for more details.
    name: A name for the operation (optional).

  Returns:
    out_logits, out_labels: `Tensor` objects each with shape
        `[batch_size, num_true + num_sampled]`, for passing to either
        `nn.sigmoid_cross_entropy_with_logits` (NCE) or
        `nn.softmax_cross_entropy_with_logits` (sampled softmax).
  """

  # Normalize `weights` to a plain Python list so that it can be concatenated
  # with the other inputs for the name scope and passed to embedding_lookup.
  if isinstance(weights, variables.PartitionedVariable):
    weights = list(weights)
  if not isinstance(weights, list):
    weights = [weights]

  with ops.name_scope(name, "compute_sampled_logits",
                      weights + [biases, inputs, labels]):
    if labels.dtype != dtypes.int64:
      labels = math_ops.cast(labels, dtypes.int64)
    labels_flat = array_ops.reshape(labels, [-1])

    # Sample the negative labels.
    #   sampled shape: [num_sampled] tensor
    #   true_expected_count shape = [batch_size, 1] tensor
    #   sampled_expected_count shape = [num_sampled] tensor
    if sampled_values is None:
      sampled_values = candidate_sampling_ops.log_uniform_candidate_sampler(
          true_classes=labels,
          num_true=num_true,
          num_sampled=num_sampled,
          unique=True,
          range_max=num_classes)
    # NOTE: pylint cannot tell that 'sampled_values' is a sequence
    # pylint: disable=unpacking-non-sequence
    sampled, true_expected_count, sampled_expected_count = sampled_values
    # pylint: enable=unpacking-non-sequence

    # labels_flat is a [batch_size * num_true] tensor
    # sampled is a [num_sampled] int tensor
    # The true ids come first, so the first len(labels_flat) rows of the
    # lookups below belong to the true classes and the rest to the samples.
    all_ids = array_ops.concat([labels_flat, sampled], 0)

    # weights shape is [num_classes, dim]
    all_w = embedding_ops.embedding_lookup(
        weights, all_ids, partition_strategy=partition_strategy)
    all_b = embedding_ops.embedding_lookup(biases, all_ids)
    # true_w shape is [batch_size * num_true, dim]
    # true_b is a [batch_size * num_true] tensor
    true_w = array_ops.slice(
        all_w, [0, 0],
        array_ops.stack([array_ops.shape(labels_flat)[0], -1]))
    true_b = array_ops.slice(all_b, [0], array_ops.shape(labels_flat))

    # inputs shape is [batch_size, dim]
    # true_w shape is [batch_size * num_true, dim]
    # row_wise_dots is [batch_size, num_true, dim]
    # `dim` is kept as a length-1 tensor ([1:2]) so it can be concatenated
    # into the target shapes below.
    dim = array_ops.shape(true_w)[1:2]
    new_true_w_shape = array_ops.concat([[-1, num_true], dim], 0)
    row_wise_dots = math_ops.multiply(
        array_ops.expand_dims(inputs, 1),
        array_ops.reshape(true_w, new_true_w_shape))
    # We want the row-wise dot plus biases which yields a
    # [batch_size, num_true] tensor of true_logits.
    dots_as_matrix = array_ops.reshape(row_wise_dots,
                                       array_ops.concat([[-1], dim], 0))
    true_logits = array_ops.reshape(_sum_rows(dots_as_matrix), [-1, num_true])
    true_b = array_ops.reshape(true_b, [-1, num_true])
    true_logits += true_b

    # Lookup weights and biases for sampled labels.
    #   sampled_w shape is [num_sampled, dim]
    #   sampled_b is a [num_sampled] float tensor
    # These are the rows of all_w / all_b that follow the true-class rows.
    sampled_w = array_ops.slice(
        all_w, array_ops.stack([array_ops.shape(labels_flat)[0], 0]), [-1, -1])
    sampled_b = array_ops.slice(all_b, array_ops.shape(labels_flat), [-1])

    # inputs has shape [batch_size, dim]
    # sampled_w has shape [num_sampled, dim]
    # sampled_b has shape [num_sampled]
    # Apply X*W'+B, which yields [batch_size, num_sampled]
    sampled_logits = math_ops.matmul(
        inputs, sampled_w, transpose_b=True) + sampled_b

    if remove_accidental_hits:
      acc_hits = candidate_sampling_ops.compute_accidental_hits(
          labels, sampled, num_true=num_true)
      acc_indices, acc_ids, acc_weights = acc_hits

      # This is how SparseToDense expects the indices.
      acc_indices_2d = array_ops.reshape(acc_indices, [-1, 1])
      acc_ids_2d_int32 = array_ops.reshape(
          math_ops.cast(acc_ids, dtypes.int32), [-1, 1])
      sparse_indices = array_ops.concat([acc_indices_2d, acc_ids_2d_int32], 1,
                                        "sparse_indices")
      # Create sampled_logits_shape = [batch_size, num_sampled]
      sampled_logits_shape = array_ops.concat(
          [array_ops.shape(labels)[:1],
           array_ops.expand_dims(num_sampled, 0)], 0)
      if sampled_logits.dtype != acc_weights.dtype:
        acc_weights = math_ops.cast(acc_weights, sampled_logits.dtype)
      # Scatter acc_weights into a dense [batch_size, num_sampled] tensor and
      # add it to the sampled logits.
      # NOTE(review): acc_weights presumably holds large negative values so
      # that accidental hits are suppressed -- confirm against
      # compute_accidental_hits.
      sampled_logits += sparse_ops.sparse_to_dense(
          sparse_indices,
          sampled_logits_shape,
          acc_weights,
          default_value=0.0,
          validate_indices=False)

    if subtract_log_q:
      # Subtract log of Q(l), prior probability that l appears in sampled.
      true_logits -= math_ops.log(true_expected_count)
      sampled_logits -= math_ops.log(sampled_expected_count)

    # Construct output logits and labels. The true labels/logits start at col 0.
    out_logits = array_ops.concat([true_logits, sampled_logits], 1)
    # true_logits is a float tensor, ones_like(true_logits) is a float tensor
    # of ones. We then divide by num_true to ensure the per-example labels sum
    # to 1.0, i.e. form a proper probability distribution.
    out_labels = array_ops.concat([
        array_ops.ones_like(true_logits) / num_true,
        array_ops.zeros_like(sampled_logits)
    ], 1)

  return out_logits, out_labels
def Foo():
  """Returns d(z)/d(x) for z = (x * c) * 3.0 with the constant x = 10.0."""
  x_const = constant_op.constant(10.0, name="x")
  scaled = math_ops.multiply(x_const, c, name="y")
  tripled = math_ops.multiply(scaled, 3.0, name="z")
  return gradients_impl.gradients(tripled, x_const)[0]
def mean_pairwise_squared_error(
    labels, predictions, weights=1.0, scope=None,
    loss_collection=ops.GraphKeys.LOSSES):
  """Adds a pairwise-errors-squared loss to the training procedure.

  Unlike `mean_squared_error`, which is a measure of the differences between
  corresponding elements of `predictions` and `labels`,
  `mean_pairwise_squared_error` is a measure of the differences between pairs of
  corresponding elements of `predictions` and `labels`.

  For example, if `labels`=[a, b, c] and `predictions`=[x, y, z], three pairs
  of differences are summed to compute the loss:
    loss = [ ((a-b) - (x-y)).^2 + ((a-c) - (x-z)).^2 + ((b-c) - (y-z)).^2 ] / 3

  Note that since the inputs are of shape `[batch_size, d0, ... dN]`, the
  corresponding pairs are computed within each batch sample but not across
  samples within a batch. For example, if `predictions` represents a batch of
  16 grayscale images of dimension [batch_size, 100, 200], then the set of pairs
  is drawn from each image, but not across images.

  `weights` acts as a coefficient for the loss. If a scalar is provided, then
  the loss is simply scaled by the given value. If `weights` is a tensor of size
  `[batch_size]`, then the total loss for each sample of the batch is rescaled
  by the corresponding element in the `weights` vector.

  Args:
    labels: The ground truth output tensor, whose shape must match the shape of
      `predictions`.
    predictions: The predicted outputs, a tensor of size
      `[batch_size, d0, .. dN]` where N+1 is the total number of dimensions in
      `predictions`.
    weights: Coefficients for the loss a scalar, a tensor of shape
      `[batch_size]` or a tensor whose shape matches `predictions`.
    scope: The scope for the operations performed in computing the loss.
    loss_collection: collection to which the loss will be added.

  Returns:
    A scalar `Tensor` that returns the weighted loss.

  Raises:
    ValueError: If the shape of `predictions` doesn't match that of `labels` or
      if the shape of `weights` is invalid.  Also if `labels` or `predictions`
      is None.

  @compatibility(eager)
  The `loss_collection` argument is ignored when executing eagerly. Consider
  holding on to the return value or collecting losses via a `tf.keras.Model`.
  @end_compatibility
  """
  if labels is None:
    raise ValueError("labels must not be None.")
  if predictions is None:
    raise ValueError("predictions must not be None.")
  with ops.name_scope(scope, "mean_pairwise_squared_error",
                      (predictions, labels, weights)) as scope:
    weights_f = math_ops.cast(weights, dtype=dtypes.float32)
    labels_f = math_ops.cast(labels, dtype=dtypes.float32)
    broadcast_check = weights_broadcast_ops.assert_broadcastable(
        weights_f, labels_f)
    # Everything below is built under the broadcastability assertion.
    with ops.control_dependencies((broadcast_check,)):
      preds_f = math_ops.cast(predictions, dtype=dtypes.float32)
      preds_f.get_shape().assert_is_compatible_with(labels_f.get_shape())

      diffs = math_ops.subtract(preds_f, labels_f)

      # Reduce over every dimension except the leading (batch) one.
      non_batch_axes = math_ops.range(1, array_ops.rank(diffs))

      sum_sq_per_sample = math_ops.reduce_sum(
          math_ops.square(diffs), axis=non_batch_axes, keepdims=True)
      present_per_sample = _num_present(diffs, weights_f, per_batch=True)

      # First term: 2 * sum(d^2) / (n - 1); div_no_nan yields 0 where the
      # clamped denominator is 0.
      first_denominator = math_ops.maximum(present_per_sample - 1, 0)
      first_ratio = math_ops.div_no_nan(
          sum_sq_per_sample, first_denominator, name="value")
      term1 = 2.0 * first_ratio

      # Second term: 2 * (sum d)^2 / (n * (n - 1)).
      sum_per_sample = math_ops.reduce_sum(
          diffs, axis=non_batch_axes, keepdims=True)
      squared_sum = math_ops.square(sum_per_sample)
      pair_count = math_ops.multiply(present_per_sample,
                                     present_per_sample - 1)
      second_denominator = math_ops.maximum(pair_count, 0)
      second_ratio = math_ops.div_no_nan(
          squared_sum, second_denominator, name="value")
      term2 = 2.0 * second_ratio

      weighted = math_ops.multiply(term1 - term2, weights_f)
      total = math_ops.reduce_sum(weighted)

      # Report zero when no element is present at all.
      any_present = math_ops.reduce_sum(present_per_sample) > 0
      mean_loss = array_ops.where(
          any_present, total, array_ops.zeros_like(total), name="value")
      util.add_loss(mean_loss, loss_collection)
      return mean_loss
def Foo():
  """Returns the gradient of `var * 2.0` with respect to `var`."""
  doubled = math_ops.multiply(var, 2.0, name="y")
  return gradients_impl.gradients(doubled, var)[0]
def _FoldFusedBatchNorms(graph, is_training, freeze_batch_norm_delay):
  """Finds fused batch norm layers and folds them into preceding layers.

  Folding only affects the following layers: Conv2D, fully connected, depthwise
  convolution.

  For every match returned by `_FindFusedBatchNorms`, the layer op is cloned
  with weights scaled by the batch norm multiplier, a bias is added, and the
  consumers of the batch norm output are rerouted to the new tensor.

  Args:
    graph: Graph to walk and modify.
    is_training: Bool, true if training.
    freeze_batch_norm_delay: How many steps to wait before freezing moving mean
      and variance and using them for batch normalization.

  Raises:
    ValueError: When batch norm folding fails, i.e. when rerouting the batch
      norm output does not modify exactly one node.
  """
  for match in _FindFusedBatchNorms(graph):
    # Everything before the last '/' of the layer op name is its scope.
    scope, sep, _ = match.layer_op.name.rpartition('/')
    # Make sure new ops are added to `graph` and put on the same device as
    # `bn_op`. The '/' (i.e. `sep`) ensures that we reuse the existing scope
    # named `scope`. Otherwise, TF creates a unique scope whose name starts with
    # `scope`.
    with graph.as_default(), graph.name_scope(scope + sep):
      with graph.name_scope(scope + sep + 'BatchNorm_Fold' + sep):
        # new weights = old weights * gamma / sqrt(variance + epsilon)
        # new biases = -mean * gamma / sqrt(variance + epsilon) + beta
        multiplier_tensor = match.gamma_tensor * math_ops.rsqrt(
            match.variance_tensor + match.bn_op.get_attr('epsilon'))
        bias_tensor = math_ops.subtract(
            match.beta_tensor,
            match.mean_tensor * multiplier_tensor,
            name='bias')

        # Corrections are only computed when training; otherwise all three
        # stay None and the correction steps below are skipped.
        correction_scale, correction_recip, correction_offset = None, None, None
        if is_training:
          correction_scale, correction_recip, correction_offset = (
              _ComputeBatchNormCorrections(
                  context='',
                  match=match,
                  freeze_batch_norm_delay=freeze_batch_norm_delay,
                  fused_batch_norm=True))
      # The shape of depthwise weights is different, so we need to reshape the
      # multiplier_tensor to ensure that the scaled_weight_tensor has the
      # expected shape.
      weights = match.weight_tensor
      if match.layer_op.type == 'DepthwiseConv2dNative':
        # Use the last two dimensions (indices 2 and 3) of the weight shape.
        new_shape = [
            match.weight_tensor.get_shape().as_list()[2],
            match.weight_tensor.get_shape().as_list()[3]
        ]
        multiplier_tensor = array_ops.reshape(
            multiplier_tensor, new_shape, name='scale_reshape')

        if correction_scale is not None:
          correction_scale = array_ops.reshape(
              correction_scale, new_shape, name='correction_reshape')

      if correction_scale is not None:
        weights = math_ops.multiply(
            correction_scale, weights, name='correction_mult')

      scaled_weight_tensor = math_ops.multiply(
          weights, multiplier_tensor, name='mul_fold')
      # Clone the layer op so that it consumes the folded (scaled) weights.
      new_layer_tensor = _CloneWithNewOperands(
          match.layer_op, match.input_tensor, scaled_weight_tensor)

      if correction_recip is not None:
        # Apply the reciprocal correction and the offset to the layer output.
        new_layer_tensor = math_ops.multiply(
            correction_recip, new_layer_tensor, name='post_conv_mul')
        new_layer_tensor = math_ops.add(new_layer_tensor, (correction_offset),
                                        'correction_add')

      bias_add_tensor = math_ops.add(
          new_layer_tensor, bias_tensor, name='add_fold')

      # Point the consumers of the original batch norm output at the folded
      # result; exactly one modified node is expected.
      nodes_modified_count = graph_editor.reroute_ts(bias_add_tensor,
                                                     match.output_tensor)
      if nodes_modified_count != 1:
        raise ValueError(
            'Unexpected inputs to op: %s' % match.output_tensor.name)