def testUnknownUnconnectedGradientsValueGiven(self):
    """An unrecognised `unconnected_gradients` value is rejected."""
    expected_message = "Unknown value for unconnected_gradients: 'nonsense'"
    with ops.Graph().as_default():
        source = constant(1.0)
        target = constant(1.0)
        # The test expects the option to be validated while the gradient
        # graph is being built, so no session is needed.
        with self.assertRaisesRegexp(ValueError, expected_message):
            gradients.gradients(
                [target], [source], unconnected_gradients="nonsense")
def testRealOnly(self):
    """Differentiating a complex tensor without `grad_ys` is a TypeError."""
    complex_input = constant_op.constant(7+3j, dtype=dtypes.complex64)
    squared = math_ops.square(complex_input)
    error_pattern = (r"Gradients of complex tensors must set grad_ys "
                     r"\(y\.dtype = tf\.complex64\)")
    with self.assertRaisesRegexp(TypeError, error_pattern):
        gradients.gradients(squared, complex_input)
def testPartialDerivatives(self):
    """Compare total derivatives with partials taken via `stop_gradients`."""
    with self.test_session():
        x = constant_op.constant(1.)
        y = 2 * x
        z = x + y

        # Total derivative: dz/dx also receives the contribution through y.
        total_grads = gradients.gradients(z, [x, y])
        total_values = [grad.eval() for grad in total_grads]
        self.assertEqual([3.0, 1.0], total_values)

        # Stopping at x and y makes each derivative a partial one.
        partial_grads = gradients.gradients(z, [x, y], stop_gradients=[x, y])
        partial_values = [grad.eval() for grad in partial_grads]
        self.assertEqual([1.0, 1.0], partial_values)
def testFloorDivGrad(self):
    """True-division ops yield gradients; floordiv yields `None` for both inputs."""
    with self.test_session():
        numerator = variables.Variable(2.0)
        denominator = variables.Variable(4.0)
        with self.test_session() as sess:
            sess.run(variables.initialize_all_variables())

            # Both division flavours are expected to give d(a/b) = (1/b, -a/b^2).
            for divide_fn in (math_ops.div_deprecated, math_ops.div):
                quotient_grads = gradients.gradients(
                    divide_fn(numerator, denominator),
                    [numerator, denominator])
                self.assertAllEqual(
                    [grad.eval() for grad in quotient_grads], [0.25, -0.125])

            floor_grads = gradients.gradients(
                math_ops.floordiv(numerator, denominator),
                [numerator, denominator])
            evaluated = []
            for grad in floor_grads:
                evaluated.append(None if grad is None else grad.eval())
            self.assertAllEqual(evaluated, [None, None])
def testFloorDivGrad(self):
    """`divide`/`div` yield gradients; floordiv yields `None` for both inputs."""
    with self.test_session():
        numerator = variables.Variable(2.)
        denominator = variables.Variable(4.)
        with self.test_session() as sess:
            sess.run(variables.global_variables_initializer())

            # Both division flavours are expected to give d(a/b) = (1/b, -a/b^2).
            for divide_fn in (math_ops.divide, math_ops.div):
                quotient_grads = gradients.gradients(
                    divide_fn(numerator, denominator),
                    [numerator, denominator])
                self.assertAllEqual(
                    [grad.eval() for grad in quotient_grads], [.25, -.125])

            floor_grads = gradients.gradients(
                math_ops.floordiv(numerator, denominator),
                [numerator, denominator])
            evaluated = []
            for grad in floor_grads:
                evaluated.append(None if grad is None else grad.eval())
            self.assertAllEqual(evaluated, [None, None])
def testDependentYs(self):
    """Gradients of several ys that depend on one another are summed correctly."""
    # With x = 3: d(x^2 + x^8)/dx = 2*3 + 8*3^7 = 17502.
    expected = 17502.0
    with self.test_session():
        x = constant_op.constant(3.0)
        x_pow2 = math_ops.square(x)
        x_pow4 = math_ops.square(x_pow2)
        x_pow8 = math_ops.square(x_pow4)

        # ys given as a list.
        list_grads = gradients.gradients([x_pow2, x_pow8], x)
        self.assertAllClose(expected, list_grads[0].eval())

        # ys given as an explicit sum.
        sum_grads = gradients.gradients(x_pow2 + x_pow8, x)
        self.assertAllClose(expected, sum_grads[0].eval())

        # ys routed through identity ops.
        ident_pow2 = array_ops.identity(x_pow2)
        ident_pow8 = array_ops.identity(x_pow8)
        ident_grads = gradients.gradients([ident_pow2, ident_pow8], x)
        self.assertAllClose(expected, ident_grads[0].eval())
def test_jacobian_fixed_shape(self):
    """Both jacobian implementations match per-element `gradients` calls."""
    x = random_ops.random_uniform([2, 2])
    y = math_ops.matmul(x, x, transpose_a=True)

    jacobian_pfor = gradients.jacobian(y, x, use_pfor=True)
    jacobian_while = gradients.jacobian(y, x, use_pfor=False)

    # Reference: differentiate each of the four entries of y separately,
    # visiting them in row-major order.
    per_element = [
        [gradient_ops.gradients(y[row][col], x)[0] for col in range(2)]
        for row in range(2)
    ]
    answer = ops.convert_to_tensor(per_element)

    self.run_and_assert_equal(answer, jacobian_pfor)
    self.run_and_assert_equal(answer, jacobian_while)
def testColocateGradientsWithAggregation(self):
    """Check the device of an aggregated gradient with and without colocation.

    `w` is created under "/gpu:1" and feeds two matmuls, so its gradient is
    an aggregation of two terms. The test asserts that the aggregated
    gradient reports "/gpu:1" when `colocate_gradients_with_ops=True` and
    reports no device when it is False.
    """
    with ops.Graph().as_default() as g:
        with g.device("/gpu:1"):
            w = constant(1.0, shape=[1, 1])
        x = constant(1.0, shape=[1, 2])
        y = constant(1.0, shape=[1, 2])
        wx = math_ops.matmul(w, x)
        wy = math_ops.matmul(w, y)
        with g.device("/gpu:0"):
            z = wx + wy

        # `assertEquals` is a deprecated alias that newer Python versions
        # no longer provide; use the canonical assertion names instead.
        gw1 = gradients.gradients(z, [w], colocate_gradients_with_ops=True)[0]
        self.assertEqual("/gpu:1", gw1.device)

        gw2 = gradients.gradients(z, [w], colocate_gradients_with_ops=False)[0]
        self.assertIsNone(gw2.device)
def testCustomGradientErrors(self):
    """An exception raised inside a custom gradient function propagates."""

    @custom_gradient.custom_gradient
    def F(x):
        # Forward pass is the identity; the backward pass always fails.
        def Grad(_):
            raise RuntimeError("x")

        return x, Grad

    with ops.Graph().as_default():
        source = constant(1.0)
        wrapped = F(source)
        with self.assertRaises(RuntimeError):
            gradients.gradients(wrapped, source)
def testBatchNormGradImpl(self):
    """Compare the batch-norm gradient kernel with gradients of `_opsBatchNorm`.

    For CPU and GPU sessions and for both values of
    `scale_after_normalization`, the outputs of
    `gen_nn_ops._batch_norm_with_global_normalization_grad` are checked
    against `gradients.gradients` applied to `self._opsBatchNorm`.
    """
    x_shape = [7, 5, 4, 6]
    param_shape = [6]
    np.random.seed(1)  # Make it reproducible.

    def _random_float32(shape):
        return np.random.random_sample(shape).astype(np.float32)

    # The draw order below fixes the values each tensor receives.
    x_val = _random_float32(x_shape)
    m_val = _random_float32(param_shape)
    v_val = _random_float32(param_shape)
    beta_val = _random_float32(param_shape)
    gamma_val = _random_float32(param_shape)
    backprop_val = _random_float32(x_shape)

    for use_gpu in [False, True]:
        with self.test_session(use_gpu=use_gpu) as sess:
            x = constant_op.constant(x_val, name="x")
            m = constant_op.constant(m_val, name="m")
            v = constant_op.constant(v_val, name="v")
            beta = constant_op.constant(beta_val, name="beta")
            gamma = constant_op.constant(gamma_val, name="gamma")
            backprop = constant_op.constant(backprop_val, name="backprop")
            epsilon = 0.001
            for scale_after_normalization in [True, False]:
                kernel_grads = list(
                    gen_nn_ops._batch_norm_with_global_normalization_grad(
                        x, m, v, gamma, backprop, epsilon,
                        scale_after_normalization))
                on = self._opsBatchNorm(x, m, v, beta, gamma, epsilon,
                                        scale_after_normalization)
                reference_grads = list(
                    gradients.gradients([on], [x, m, v, beta, gamma],
                                        [backprop]))
                to_check = ["dx", "dm", "dv", "db", "dg"]
                if not scale_after_normalization:
                    # The gamma gradient is not compared in this mode.
                    kernel_grads = kernel_grads[:4]
                    reference_grads = reference_grads[:4]
                    to_check = to_check[:4]

                all_grads = sess.run(kernel_grads + reference_grads)
                offset = len(to_check)
                for idx, label in enumerate(to_check):
                    print(label)
                    self.assertAllClose(
                        all_grads[idx + offset], all_grads[idx], atol=0.000001)
def get_gradients(self, loss, params):
    """Computes gradients of `loss` for `params`, applying optional clipping.

    Arguments:
      loss: Loss tensor.
      params: Variables (any nesting; flattened with `nest.flatten`).

    Returns:
      List of gradient tensors, one per flattened parameter. When the
      instance has a `clipnorm` attribute each gradient is clipped by norm,
      and when it has a `clipvalue` attribute each gradient is then clipped
      to `[-clipvalue, clipvalue]`.

    Raises:
      ValueError: If the gradient for any parameter is `None`.
    """
    flat_params = nest.flatten(params)
    with backend.get_graph().as_default():
        grads = gradients.gradients(loss, flat_params)

    for index, grad in enumerate(grads):
        if grad is not None:
            continue
        raise ValueError("Variable {} has `None` for gradient. "
                         "Please make sure that all of your ops have a "
                         "gradient defined (i.e. are differentiable). "
                         "Common ops without gradient: "
                         "K.argmax, K.round, K.eval.".format(flat_params[index]))

    if hasattr(self, "clipnorm"):
        grads = [clip_ops.clip_by_norm(grad, self.clipnorm) for grad in grads]
    if hasattr(self, "clipvalue"):
        clipped = []
        for grad in grads:
            clipped.append(
                clip_ops.clip_by_value(grad, -self.clipvalue, self.clipvalue))
        grads = clipped
    return grads
def testThatBackpropRuns(self):
    """Runs a few optimizer steps to confirm gradients can be computed."""
    batch_size, image_height, image_width = 1, 9, 12
    image_shape = [batch_size, image_height, image_width, 3]
    image = variables.Variable(
        np.float32(np.random.uniform(size=image_shape)))

    # A single control point, given a leading batch dimension.
    source_locations = constant_op.constant(
        np.float32(np.expand_dims([[3., 3.]], 0)))
    displacements = constant_op.constant(
        np.float32(np.expand_dims([[0.25, -0.5]], 0)))

    warped_image, _ = sparse_image_warp.sparse_image_warp(
        image,
        source_locations,
        source_locations + displacements,
        num_boundary_points=3)

    loss = math_ops.reduce_mean(math_ops.abs(warped_image - image))
    optimizer = momentum.MomentumOptimizer(0.001, 0.9)
    raw_grads = gradients.gradients(loss, [image])
    clipped_grads, _ = clip_ops.clip_by_global_norm(raw_grads, 1.0)
    train_op = optimizer.apply_gradients(zip(clipped_grads, [image]))
    init_op = variables.global_variables_initializer()

    with self.test_session() as sess:
        sess.run(init_op)
        for _ in range(5):
            sess.run([loss, train_op])
def _compute_gradients(tensor, var_list):
    """Returns gradients of `tensor` for `var_list`, with `None` replaced by zeros.

    Args:
      tensor: Tensor to differentiate.
      var_list: Variables to differentiate with respect to.

    Returns:
      A list with one entry per variable: the gradient, or
      `zeros_like(variable)` where `gradients.gradients` returned `None`.
    """
    raw_grads = gradients.gradients(tensor, var_list)
    # tf.gradients sometimes returns `None` when it should return 0.
    filled = []
    for variable, grad in zip(var_list, raw_grads):
        if grad is None:
            grad = array_ops.zeros_like(variable)
        filled.append(grad)
    return filled
def _get_train_ops(self, features, targets):
    """See base class.

    Builds the loss, computes gradients for the DNN and linear variables in
    one `gradients.gradients` call, optionally clips them by global norm,
    and returns `(train_op, loss)` where `train_op` increments the global
    step after the combined training op has run.
    """
    global_step = contrib_variables.get_global_step()
    assert global_step

    logits = self._logits(features, is_training=True)
    centered_bias_step = (
        [self._centered_bias_step(targets, features)]
        if self._enable_centered_bias else [])
    with ops.control_dependencies(centered_bias_step):
        loss = self._loss(logits, targets, features)
    logging_ops.scalar_summary("loss", loss)

    linear_vars = self._get_linear_vars()
    dnn_vars = self._get_dnn_vars()
    all_grads = gradients.gradients(loss, dnn_vars + linear_vars)
    if self._gradient_clip_norm:
        all_grads, _ = clip_ops.clip_by_global_norm(
            all_grads, self._gradient_clip_norm)

    # Gradients come back in the order requested: DNN first, then linear.
    split_at = len(dnn_vars)
    dnn_grads = all_grads[:split_at]
    linear_grads = all_grads[split_at:]

    linear_train_ops = self._get_linear_training_ops(linear_grads, linear_vars)
    dnn_train_ops = self._get_dnn_training_ops(dnn_grads, dnn_vars)
    train_ops = linear_train_ops + dnn_train_ops

    train_step = control_flow_ops.group(*train_ops, name="combined_training_op")
    with ops.control_dependencies([train_step]):
        with ops.get_default_graph().colocate_with(global_step):
            return state_ops.assign_add(global_step, 1).op, loss
def testAggregateGradients(self):
    """Eager gradient aggregation matches the graph-mode `gradients` result."""
    index_sets = ([0, 1], [2, 3], [1, 3])

    def fn(x):
        indices = [tensor.Tensor(np.array(ids)) for ids in index_sets]
        # A mixture of IndexedSlices and dense tensor to aggregate.
        g1, g2, g3 = [embedding_ops.embedding_lookup(x, ind) for ind in indices]
        g4 = math_ops.reduce_sum(x * tensor.Tensor(2.0))
        return g1 * g2 * g3 * g4

    var_np = np.random.rand(4, 2).astype(np.float32)
    var = tensor.Tensor(var_np)
    grad = backprop.gradients_function(fn, [0])(var)[0]

    with context.graph_mode(), self.test_session():
        tf_var = array_ops.constant(var_np, dtypes.float32)
        tf_indices = [array_ops.constant(ids) for ids in index_sets]
        tf_g1, tf_g2, tf_g3 = [
            embedding_ops.embedding_lookup(tf_var, ind) for ind in tf_indices
        ]
        tf_g4 = math_ops.reduce_sum(tf_var * 2.0, reduction_indices=(0, 1))
        tf_y = tf_g1 * tf_g2 * tf_g3 * tf_g4
        tf_grad = gradients.gradients(tf_y, [tf_var])[0]

        # Densify the sparse graph-mode gradient before comparing.
        tf_dense_grad = math_ops.unsorted_segment_sum(
            tf_grad.values, tf_grad.indices, tf_grad.dense_shape[0])

        self.assertAllClose(grad.numpy(), tf_dense_grad.eval())
def test_zero_grad_tf_gradients(self):
    """The gradient of `pow(x, 2)` is `2 * x`, including at `x == 0`."""
    if context.executing_eagerly():
        self.skipTest("tf.gradients not supported in eager.")

    inputs = constant_op.constant([-1., 0., 1.])
    squared = math_ops.pow(inputs, 2)
    grad_op = gradients.gradients(squared, inputs)[0]
    grad_value = self.evaluate(grad_op)
    self.assertAllClose([-2., 0., 2.], grad_value)
def _Gradient(tensors, devices):
    """Differentiates the `nccl_reduce` outputs with respect to fresh placeholders.

    Args:
      tensors: Tensors whose dtype and shape define the placeholders; they
        are also passed to `_DeviceTensors` to build the incoming gradients.
      devices: Forwarded to `nccl_reduce`.

    Returns:
      The gradients that are not `None`.
    """
    inputs = []
    for t in tensors:
        inputs.append(array_ops.placeholder(t.dtype, t.shape))

    reduce_tensors = nccl_reduce(inputs, devices)
    reduce_devices = [reduced.device for reduced in reduce_tensors]
    losses = _DeviceTensors(tensors, reduce_devices)

    grads = gradients.gradients(
        reduce_tensors, inputs, losses, colocate_gradients_with_ops=True)
    return [grad for grad in grads if grad is not None]
def test_interpolation_gradient(self):
    """Make sure that backprop can run. Correctness of gradients is assumed.

    Here, we use a small 'training' set and a more densely-sampled set of
    query points, for which we know the true value in advance. The goal is
    to choose x locations for the training data such that interpolating
    using this training data yields the best reconstruction for the function
    values at the query points. The training data locations are optimized
    iteratively using gradient descent.
    """
    problem = _QuadraticPlusSinProblemND()
    (query_points, query_values, train_points,
     train_values) = problem.get_problem(optimizable=True)

    regularization = 0.001
    for interpolation_order in (1, 2, 3, 4):
        interpolator = interpolate_spline.interpolate_spline(
            train_points, train_values, query_points, interpolation_order,
            regularization)

        residual = query_values - interpolator
        loss = math_ops.reduce_mean(math_ops.square(residual))

        optimizer = momentum.MomentumOptimizer(0.001, 0.9)
        raw_grads = gradients.gradients(loss, [train_points])
        clipped_grads, _ = clip_ops.clip_by_global_norm(raw_grads, 1.0)
        train_op = optimizer.apply_gradients(zip(clipped_grads, [train_points]))
        init_op = variables.global_variables_initializer()

        with self.cached_session() as sess:
            sess.run(init_op)
            for _ in range(100):
                sess.run([loss, train_op])
def get_gradients(self, loss, params):
    """Returns gradients of `loss` with respect to `params`.

    The loss is first passed through `self._scale_loss`. When the instance
    has a `clipnorm` attribute each gradient is clipped by norm, and when
    it has a `clipvalue` attribute each gradient is then clipped to
    `[-clipvalue, clipvalue]`.

    Arguments:
      loss: Loss tensor.
      params: List of variables.

    Returns:
      List of gradient tensors.

    Raises:
      ValueError: In case any gradient cannot be computed (e.g. if gradient
        function not implemented).
    """
    loss = self._scale_loss(loss)
    grads = gradients.gradients(loss, params)
    # Test by identity. `None in grads` compares every element with `==`,
    # which depends on how the tensor type implements equality.
    if any(grad is None for grad in grads):
        raise ValueError("An operation has `None` for gradient. "
                         "Please make sure that all of your ops have a "
                         "gradient defined (i.e. are differentiable). "
                         "Common ops without gradient: "
                         "K.argmax, K.round, K.eval.")
    if hasattr(self, "clipnorm"):
        grads = [clip_ops.clip_by_norm(g, self.clipnorm) for g in grads]
    if hasattr(self, "clipvalue"):
        grads = [
            clip_ops.clip_by_value(g, -self.clipvalue, self.clipvalue)
            for g in grads
        ]
    return grads
def testPlaysNicelyWithDefunSeparateGradientScope(self):
    """A compiled Defun with separate gradient compilation gets its own scope."""
    with self.test_session(graph=ops.Graph()) as sess:
        with jit.experimental_jit_scope(True):

            @function.Defun(
                compiled=True, noinline=True, separate_compiled_gradients=True)
            def mulop(x1, x2):
                return x1 * x2

            x = constant_op.constant(1.0)
            r = mulop(x, x)
            g_r = gradients.gradients(r, x, name="GA")[0]

        # Ensure the forward function is compiled.
        forward_attrs = r.graph.as_graph_def().library.function[0].attr
        self.assertTrue(forward_attrs["_XlaCompile"].b)
        self.assertEqual(b"jit_scope_0", forward_attrs["_XlaScope"].s)

        # Ensure the gradient (SymbolicGradient) is compiled, with a different
        # _XlaScope from the function itself.
        symbolic_grad_op = g_r.op.inputs[0].op
        self.assertTrue(symbolic_grad_op.get_attr("_XlaCompile"))
        self.assertEqual(b"jit_scope_0_grad_GA",
                         symbolic_grad_op.get_attr("_XlaScope"))

        # Ensure the ops run: grad(x1*x1) = 2*x1
        fetched = sess.run([x, r, g_r])
        self.assertAllClose([1.0, 1.0, 2.0], fetched)
def testColocateGradientsWithAggregation(self):
    """Compare colocation groups of an aggregated gradient with those of `wx`.

    With `colocate_gradients_with_ops=True` the gradient op for `w` must
    share `wx`'s colocation groups; with False the groups must differ.
    """
    with ops.Graph().as_default() as graph:
        with graph.device("/device:GPU:1"):
            w = constant(1.0, shape=[1, 1])
        x = constant(1.0, shape=[1, 2])
        y = constant(1.0, shape=[1, 2])
        wx = math_ops.matmul(w, x)
        wy = math_ops.matmul(w, y)
        with graph.device("/device:GPU:0"):
            z = wx + wy

        colocated_grad = gradients.gradients(
            z, [w], colocate_gradients_with_ops=True)[0]
        self.assertEqual(colocated_grad.op.colocation_groups(),
                         wx.op.colocation_groups())

        free_grad = gradients.gradients(
            z, [w], colocate_gradients_with_ops=False)[0]
        self.assertTrue(
            wx.op.colocation_groups() != free_grad.op.colocation_groups())
def test_tensor_array_grad(self):
    """pfor over a TensorArray while_loop matches the hand-written equivalent."""
    inp = constant_op.constant(np.random.rand(3, 4, 2), dtype=dtypes.float32)
    ta = tensor_array_ops.TensorArray(dtypes.float32, size=3).unstack(inp)

    def loop_fn(i):

        def keep_going(j, _):
            return j < 3

        def body(j, x):
            gathered = ta.gather([j])
            row = array_ops.gather(array_ops.reshape(gathered, [4, 2]), i)
            return j + 1, x + row

        _, accumulated = control_flow_ops.while_loop(
            keep_going, body, (0, array_ops.zeros([2])))
        out = math_ops.reduce_prod(accumulated)
        return out, gradient_ops.gradients(out, inp)[0]

    pfor_out, pfor_out_grad = pfor_control_flow_ops.pfor(loop_fn, 4)

    # Note that tf.while_loop does not work in the setup above. So we manually
    # construct the equivalent computation of the above loops here.
    real_out = math_ops.reduce_prod(
        math_ops.reduce_sum(inp, axis=[0]), axis=[1])

    # Note that gradients of real_out will accumulate the gradients across the
    # output value. Hence we do the same aggregation on pfor_out_grad.
    real_out_grad = gradient_ops.gradients(real_out, inp)[0]
    sum_pfor_out_grad = math_ops.reduce_sum(pfor_out_grad, axis=[0])

    with session.Session() as sess:
        fetches = [pfor_out, real_out, sum_pfor_out_grad, real_out_grad]
        v1, v2, v1_grad, v2_grad = sess.run(fetches)
        self.assertAllClose(v1, v2)
        self.assertAllClose(v1_grad, v2_grad)
def approximate_hessian(self, grads_and_vars, name=None, gate_gradients=None,
                        aggregation_method=None,
                        colocate_gradients_with_ops=False):
    """Differentiates a random projection of the gradients w.r.t. the variables.

    Each non-`None` gradient is multiplied elementwise by a fresh
    `random_normal` tensor of the same shape and summed; the running total
    is then differentiated with respect to every variable's `ref()`.

    Original author's caveat: this has not been tested, is probably very
    slow, and nothing else has been adapted to consume its output.

    Args:
      grads_and_vars: Iterable of (gradient, variable) pairs. It is iterated
        more than once, so it must be re-iterable (e.g. a list). A gradient
        may be `None`, a Tensor, or an object with `indices` and `values`
        attributes (duplicate indices are summed before projecting).
      name: Unused.
      gate_gradients: Gating mode; gating is requested from
        `gradients.gradients` only when this equals `Optimizer.GATE_OP`.
        `None` (the default) is treated as `Optimizer.GATE_OP`.
      aggregation_method: Forwarded to `gradients.gradients`.
      colocate_gradients_with_ops: Forwarded to `gradients.gradients`.

    Returns:
      An iterable of (gradient, variable, second-order term) triples in the
      order of `grads_and_vars`.
    """
    # These three options used to be read as free names that were never
    # defined in this function; they are now explicit keyword parameters.
    if gate_gradients is None:
        gate_gradients = Optimizer.GATE_OP

    gv = 0
    var_refs = []
    for g_t, x_tm1 in grads_and_vars:
        # Every variable is differentiated against, even when its own
        # gradient is missing.
        var_refs.append(x_tm1.ref())
        if g_t is None:
            continue
        with ops.name_scope('update_' + x_tm1.op.name), ops.device(x_tm1.device):
            if isinstance(g_t, ops.Tensor):
                gv += math_ops.reduce_sum(
                    g_t * random_ops.random_normal(g_t.get_shape()))
            else:
                # Sparse gradient: merge rows that share an index first.
                idxs, idxs_ = array_ops.unique(g_t.indices)
                g_t_ = math_ops.unsorted_segment_sum(
                    g_t.values, idxs_, array_ops.size(idxs))
                gv += math_ops.reduce_sum(
                    g_t_ * random_ops.random_normal(g_t_.get_shape()))

    hesses = gradients.gradients(
        gv, var_refs,
        gate_gradients=(gate_gradients == Optimizer.GATE_OP),
        aggregation_method=aggregation_method,
        colocate_gradients_with_ops=colocate_gradients_with_ops)
    return zip([g_t for g_t, _ in grads_and_vars],
               [x_tm1 for _, x_tm1 in grads_and_vars],
               hesses)
def loop_fn(i):
    """Returns the gradients of example `i`'s loss for all trainable variables."""
    example_image = array_ops.gather(images, i)
    example_label = array_ops.gather(labels, i)
    model_output = model(example_image, training=training)
    flat_logits = array_ops.reshape(model_output, [-1])
    example_loss = losses.softmax_cross_entropy(
        logits=flat_logits,
        onehot_labels=example_label,
        reduction=losses.Reduction.NONE)
    return gradient_ops.gradients(example_loss, variables.trainable_variables())
def compute_gradients(self, loss, var_list=None, gate_gradients=GATE_OP,
                      aggregation_method=None,
                      colocate_gradients_with_ops=False):
    """Computes gradients of `loss` for the variables in `var_list`.

    Args:
      loss: Tensor to differentiate; its dtype is checked with
        `_assert_valid_dtypes`.
      var_list: List of `variables.Variable`. When `None`,
        `variables.trainable_variables()` is used.
      gate_gradients: One of `Optimizer.GATE_NONE`, `Optimizer.GATE_OP` or
        `Optimizer.GATE_GRAPH`.
      aggregation_method: Forwarded to `gradients.gradients`.
      colocate_gradients_with_ops: Forwarded to `gradients.gradients`.

    Returns:
      A list of (gradient, variable) pairs.

    Raises:
      ValueError: If `gate_gradients` is not a recognised value or there
        are no variables to optimize.
      TypeError: If an entry of `var_list` is not a `variables.Variable`.
    """
    # Error checking
    valid_gates = [Optimizer.GATE_NONE, Optimizer.GATE_OP,
                   Optimizer.GATE_GRAPH]
    if gate_gradients not in valid_gates:
        raise ValueError(
            "gate_gradients must be one of: Optimizer.GATE_NONE, "
            "Optimizer.GATE_OP, Optimizer.GATE_GRAPH. Not %s" % gate_gradients)
    self._assert_valid_dtypes([loss])
    if var_list is None:
        var_list = variables.trainable_variables()
    for candidate in var_list:
        if not isinstance(candidate, variables.Variable):
            raise TypeError("Argument is not a tf.Variable: %s" % candidate)
    if not var_list:
        raise ValueError("No variables to optimize")

    # The actual stuff
    var_refs = [variable.ref() for variable in var_list]
    gate_per_op = (gate_gradients == Optimizer.GATE_OP)
    grads = gradients.gradients(
        loss, var_refs,
        gate_gradients=gate_per_op,
        aggregation_method=aggregation_method,
        colocate_gradients_with_ops=colocate_gradients_with_ops)
    if gate_gradients == Optimizer.GATE_GRAPH:
        grads = control_flow_ops.tuple(grads)

    grads_and_vars = list(zip(grads, var_list))
    # Only variables that actually received a gradient are dtype-checked.
    vars_with_grads = [
        variable for grad, variable in grads_and_vars if grad is not None
    ]
    self._assert_valid_dtypes(vars_with_grads)
    return grads_and_vars
def testAggregateGradients(self):
    """`gradients_function` aggregation matches the graph-mode `gradients` result."""
    index_sets = ([0, 1], [2, 3], [1, 3])

    def fn(x):
        indices = [constant_op.constant(np.array(ids)) for ids in index_sets]
        # A mixture of IndexedSlices and dense tensor to aggregate.
        g1, g2, g3 = [embedding_ops.embedding_lookup(x, ind) for ind in indices]
        g4 = math_ops.reduce_sum(x * constant_op.constant(2.0))
        return g1 * g2 * g3 * g4

    var_np = np.random.rand(4, 2).astype(np.float32)
    var = constant_op.constant(var_np)
    grad = backprop.gradients_function(fn, [0])(var)[0]
    grad = self.evaluate(ops.convert_to_tensor(grad))

    # The graph-mode comparison only runs outside eager execution.
    if not context.executing_eagerly():
        tf_var = array_ops.constant(var_np, dtypes.float32)
        tf_indices = [array_ops.constant(ids) for ids in index_sets]
        tf_g1, tf_g2, tf_g3 = [
            embedding_ops.embedding_lookup(tf_var, ind) for ind in tf_indices
        ]
        tf_g4 = math_ops.reduce_sum(tf_var * 2.0, axis=(0, 1))
        tf_y = tf_g1 * tf_g2 * tf_g3 * tf_g4
        tf_grad = gradients.gradients(tf_y, [tf_var])[0]

        # Densify the sparse graph-mode gradient before comparing.
        tf_dense_grad = math_ops.unsorted_segment_sum(
            tf_grad.values, tf_grad.indices, tf_grad.dense_shape[0])

        self.assertAllClose(grad, self.evaluate(tf_dense_grad))
def testUnconnectedGradientsNoneUnconnectedGradients(self):
    """With `unconnected_gradients="none"`, an unconnected gradient is `None`."""
    with ops.Graph().as_default():
        source = constant(1.0, shape=[2, 2])
        unrelated = constant(3.0, shape=[3, 1])
        grads = gradients.gradients(
            [unrelated], [source], unconnected_gradients="none")
    self.assertIsNone(grads[0])
def compute_gradients(self, loss, var_list=None, gate_gradients=GATE_OP,
                      aggregation_method=None,
                      colocate_gradients_with_ops=False, grad_loss=None):
    """Compute gradients of `loss` for the variables in `var_list`.

    This is the first part of `minimize()`. The result is a list of
    (gradient, variable) pairs in which "gradient" may be a `Tensor`, an
    `IndexedSlices`, or `None` when there is no gradient for the variable.

    Args:
      loss: A Tensor containing the value to minimize.
      var_list: Optional list of variables to update to minimize `loss`.
        When `None`, `variables.trainable_variables()` plus the
        `TRAINABLE_RESOURCE_VARIABLES` collection is used.
      gate_gradients: How to gate the computation of gradients. Can be
        `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`.
      aggregation_method: Specifies the method used to combine gradient
        terms; forwarded to `gradients.gradients`.
      colocate_gradients_with_ops: Forwarded to `gradients.gradients`.
      grad_loss: Optional. A `Tensor` holding the gradient computed for
        `loss`; passed as `grad_ys`.

    Returns:
      A list of (gradient, variable) pairs. Variable is always present, but
      gradient can be `None`.

    Raises:
      ValueError: If `gate_gradients` is not a recognised value or there
        are no variables to optimize.
    """
    allowed_gates = [Optimizer.GATE_NONE, Optimizer.GATE_OP,
                     Optimizer.GATE_GRAPH]
    if gate_gradients not in allowed_gates:
        message = ("gate_gradients must be one of: Optimizer.GATE_NONE, "
                   "Optimizer.GATE_OP, Optimizer.GATE_GRAPH. Not %s")
        raise ValueError(message % gate_gradients)

    self._assert_valid_dtypes([loss])
    if grad_loss is not None:
        self._assert_valid_dtypes([grad_loss])

    if var_list is None:
        var_list = (
            variables.trainable_variables() +
            ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
    # Processors are built before the emptiness check, as in the original
    # statement order.
    processors = [_get_processor(variable) for variable in var_list]
    if not var_list:
        raise ValueError("No variables to optimize.")
    var_refs = [processor.target() for processor in processors]

    gate_per_op = (gate_gradients == Optimizer.GATE_OP)
    grads = gradients.gradients(
        loss, var_refs, grad_ys=grad_loss,
        gate_gradients=gate_per_op,
        aggregation_method=aggregation_method,
        colocate_gradients_with_ops=colocate_gradients_with_ops)
    if gate_gradients == Optimizer.GATE_GRAPH:
        grads = control_flow_ops.tuple(grads)

    grads_and_vars = list(zip(grads, var_list))
    # Only variables that actually received a gradient are dtype-checked.
    vars_with_grads = [
        variable for grad, variable in grads_and_vars if grad is not None
    ]
    self._assert_valid_dtypes(vars_with_grads)
    return grads_and_vars
def test_gradients_exist(self):
    """Check that backprop can run.

    The correctness of the gradients is assumed, since the forward
    propagation is tested to be correct and we only use built-in tf ops.
    However, we perform a simple test to make sure that backprop can
    actually run. We treat the flows as a tf.Variable and optimize them to
    minimize the difference between the interpolated image and the input
    image.
    """
    batch_size, height, width, numchannels = [4, 5, 6, 7]
    image = random_ops.random_normal([batch_size, height, width, numchannels])

    flow_shape = [batch_size, height, width, 2]
    initial_flows = np.float32(np.random.normal(size=flow_shape) * 0.25)
    flows = variables.Variable(initial_flows)

    warped = dense_image_warp.dense_image_warp(image, flows)
    loss = math_ops.reduce_mean(math_ops.square(warped - image))

    optimizer = adam.AdamOptimizer(1.0)
    flow_grads = gradients.gradients(loss, [flows])
    train_op = optimizer.apply_gradients(zip(flow_grads, [flows]))
    init_op = variables.global_variables_initializer()

    with self.test_session() as sess:
        sess.run(init_op)
        for _ in range(10):
            sess.run(train_op)
def testCustomGradientWithVariables(self):
    """A custom gradient can return its own gradients for captured variables."""

    @custom_gradient.custom_gradient
    def F(x):
        out = core_layers.dense(x, 3, use_bias=False)

        def Grad(out_grad, variables=None):  # pylint: disable=redefined-outer-name
            self.assertEqual(1, len(variables))
            input_and_kernel_grads = gradients.gradients(
                out, [x, variables[0]], grad_ys=out_grad)
            # The variable gradient is replaced by ones of shape (4, 3);
            # the final assertion expects its sum, 12.
            return input_and_kernel_grads[0], [array_ops.ones((4, 3))]

        return out, Grad

    with ops.Graph().as_default():
        x = array_ops.ones((2, 4))
        with variable_scope.variable_scope("f", use_resource=True) as vs:
            y = F(x)
            all_vars = vs.global_variables()
            assert len(all_vars) == 1
        grads = gradients.gradients(y, [x, all_vars[0]])
        for grad in grads:
            self.assertTrue(grad is not None)
        with session.Session() as sess:
            sess.run(variables.global_variables_initializer())
            variable_grad_sum = sess.run(math_ops.reduce_sum(grads[1]))
            self.assertEqual(12., variable_grad_sum)
def compute_gradients(self, loss, var_list, aggregation_method=None,
                      colocate_gradients_with_ops=False, grad_loss=None,
                      stop_gradients=None):
    """Compute gradients of `loss` for the variables in `var_list`.

    This is the first part of `minimize()`. It returns a list of
    (gradient, variable) pairs where "gradient" is the gradient for
    "variable". Note that "gradient" can be a `Tensor`, an `IndexedSlices`,
    or `None` if there is no gradient for the given variable.

    Args:
      loss: A Tensor containing the value to minimize, or a callable taking
        no arguments which returns the value to minimize. When eager
        execution is enabled it must be a callable.
      var_list: Variables to differentiate with respect to; flattened with
        `nest.flatten` before use.
      aggregation_method: Specifies the method used to combine gradient
        terms. Only used when `loss` is not callable.
      colocate_gradients_with_ops: Forwarded to `gradients.gradients`. Only
        used when `loss` is not callable.
      grad_loss: Optional. A `Tensor` holding the gradient computed for
        `loss`.
      stop_gradients: Optional. A Tensor or list of tensors not to
        differentiate through. Only used when `loss` is not callable.

    Returns:
      A list of (gradient, variable) pairs. Variable is always present, but
      gradient can be `None`.

    Raises:
      RuntimeError: If called with eager execution enabled and `loss` is
        not callable.

    @compatibility(eager)
    When eager execution is enabled, `aggregation_method`, and
    `colocate_gradients_with_ops` are ignored.
    @end_compatibility
    """
    var_list = nest.flatten(var_list)
    # TODO(josh11b): Test that we handle weight decay in a reasonable way.
    if not callable(loss):
        if context.executing_eagerly():
            raise RuntimeError("`loss` passed to Optimizer.compute_gradients "
                               "should be a function when eager execution is "
                               "enabled.")
        self._assert_valid_dtypes([loss])
        if grad_loss is not None:
            self._assert_valid_dtypes([grad_loss])
        grads = gradients.gradients(
            loss,
            var_list,
            grad_ys=grad_loss,
            aggregation_method=aggregation_method,
            colocate_gradients_with_ops=colocate_gradients_with_ops,
            stop_gradients=stop_gradients)
    else:
        # Callable loss: record its evaluation on a tape and differentiate.
        with backprop.GradientTape() as tape:
            tape.watch(var_list)
            loss_value = loss()
        grads = tape.gradient(loss_value, var_list, grad_loss)

    grads_and_vars = list(zip(grads, var_list))
    # Variables with no gradient, or with resource dtype, are not dtype-checked.
    checkable_vars = []
    for grad, variable in grads_and_vars:
        if grad is not None and variable.dtype != dtypes.resource:
            checkable_vars.append(variable)
    self._assert_valid_dtypes(checkable_vars)
    return grads_and_vars
def _linear_classifier_model_fn(features, targets, mode, params):
    """Linear classifier model_fn.

    Args:
      features: `Tensor` or dict of `Tensor` (depends on data passed to
        `fit`). A non-dict value is wrapped as `{"": features}`.
      targets: `Tensor` of shape [batch_size, 1] or [batch_size] target
        labels of dtype `int32` or `int64` in the range `[0, n_classes)`.
      mode: Defines whether this is training, evaluation or prediction.
        See `ModeKeys`.
      params: A dict of hyperparameters.
        The following hyperparameters are expected:
        * feature_columns: An iterable containing all the feature columns
            used by the model.
        * n_classes: number of target classes.
        * weight_column_name: A string defining the weight feature column,
            or None if there are no weights.
        * optimizer: the optimizer whose `apply_gradients` is used for
            training.
        * gradient_clip_norm: Optional (default None). If truthy, gradients
            are clipped to their global norm with this clipping ratio.
        * enable_centered_bias: Optional bool (default True). If True, a
            centered bias is added to the logits and a centered-bias
            training step is added in TRAIN mode.
        * num_ps_replicas: Optional (default 0). The number of parameter
            server replicas.
        * joint_weights: Optional bool (default False). If True, the
            weights for all columns will be stored in a single (possibly
            partitioned) variable. It's more efficient, but it's
            incompatible with SDCAOptimizer, and requires all feature
            columns are sparse and use the 'sum' combiner.

    Returns:
      predictions: A dict of `Tensor` objects.
      loss: A scalar containing the loss of the step, or None in INFER mode.
      train_op: The op for training (a group of the collected train ops).
    """
    feature_columns = params["feature_columns"]
    n_classes = params["n_classes"]
    weight_column_name = params["weight_column_name"]
    optimizer = params["optimizer"]
    gradient_clip_norm = params.get("gradient_clip_norm")
    enable_centered_bias = params.get("enable_centered_bias", True)
    num_ps_replicas = params.get("num_ps_replicas", 0)
    joint_weights = params.get("joint_weights", False)

    if not isinstance(features, dict):
        features = {"": features}

    parent_scope = "linear"
    # Two-class problems use a single logit column and log loss.
    if n_classes == 2:
        num_label_columns = 1
        loss_fn = _log_loss_with_two_classes
    else:
        num_label_columns = n_classes
        loss_fn = _softmax_cross_entropy_loss

    partitioner = partitioned_variables.min_max_variable_partitioner(
        max_partitions=num_ps_replicas,
        min_slice_size=64 << 20)
    with variable_scope.variable_op_scope(
        features.values(), parent_scope, partitioner=partitioner) as scope:
        if joint_weights:
            weighted_sum_fn = layers.joint_weighted_sum_from_feature_columns
        else:
            weighted_sum_fn = layers.weighted_sum_from_feature_columns
        logits, _, _ = weighted_sum_fn(
            columns_to_tensors=features,
            feature_columns=feature_columns,
            num_outputs=num_label_columns,
            weight_collections=[parent_scope],
            scope=scope)

    if enable_centered_bias:
        logits = nn.bias_add(logits, _centered_bias(num_label_columns))

    loss = None
    if mode != estimator.ModeKeys.INFER:
        loss = loss_fn(logits, targets)
        if weight_column_name:
            weight_tensor = array_ops.reshape(
                math_ops.to_float(features[weight_column_name]), shape=(-1,))
            loss = _weighted_loss(loss, weight_tensor)
        else:
            loss = math_ops.reduce_mean(loss, name="loss")
        logging_ops.scalar_summary("loss", loss)

    train_ops = []
    if mode == estimator.ModeKeys.TRAIN:
        global_step = contrib_variables.get_global_step()

        my_vars = ops.get_collection("linear")
        grads = gradients.gradients(loss, my_vars)
        if gradient_clip_norm:
            grads, _ = clip_ops.clip_by_global_norm(grads, gradient_clip_norm)
        train_ops.append(
            optimizer.apply_gradients(
                zip(grads, my_vars), global_step=global_step))
        if enable_centered_bias:
            train_ops.append(
                _centered_bias_step(targets, loss_fn, num_label_columns))

    predictions = {}
    if n_classes == 2:
        predictions[_LOGISTIC] = math_ops.sigmoid(logits)
        # Prepend a zero column so softmax/argmax see two classes.
        logits = array_ops.concat(1, [array_ops.zeros_like(logits), logits])
    predictions[_PROBABILITIES] = nn.softmax(logits)
    predictions[_CLASSES] = math_ops.argmax(logits, 1)

    return predictions, loss, control_flow_ops.group(*train_ops)
def loop_fn(i):
    """Returns the gradient of element `i` of `out` with respect to `x`."""
    selected = array_ops.gather(out, i)
    grads = gradient_ops.gradients(selected, x)
    return grads[0]
def testVariableAsGraphElementGradient(self):
    """`gradients` accepts a variable's graph element as the target."""
    with ops.Graph().as_default() as graph:
        initial_value = constant_op.constant(100.0)
        var = variables.Variable(initial_value)
        graph_element = graph.as_graph_element(var)
        gradient = gradients.gradients(graph_element, var)
        self.assertIsNotNone(gradient)
def _linear_classifier_model_fn(features, targets, mode, params):
  """Estimator's linear model_fn.

  Args:
    features: Either a dict of feature tensors or a single value; a non-dict
      value is wrapped as `{"": features}`.
    targets: Passed to the loss function and, when centered bias is enabled
      in training mode, to `_centered_bias_step`.
    mode: Compared against `estimator.ModeKeys.INFER` (no loss is built) and
      `estimator.ModeKeys.TRAIN` (train ops are built).
    params: Dict of hyperparameters. Required keys: "n_classes",
      "weight_column_name", "feature_columns", "optimizer". Optional keys
      (with defaults): "gradient_clip_norm" (None), "enable_centered_bias"
      (True), "num_ps_replicas" (0), "joint_weights" (False).

  Returns:
    A `(predictions, loss, train_op)` tuple. `predictions` is a dict keyed by
    `_PROBABILITIES` and `_CLASSES`, plus `_LOGISTIC` when `n_classes == 2`.
    `loss` is `None` in INFER mode. `train_op` groups whatever train ops were
    created (none outside TRAIN mode).
  """
  n_classes = params["n_classes"]
  weight_column_name = params["weight_column_name"]
  feature_columns = params["feature_columns"]
  optimizer = params["optimizer"]
  gradient_clip_norm = params.get("gradient_clip_norm", None)
  enable_centered_bias = params.get("enable_centered_bias", True)
  num_ps_replicas = params.get("num_ps_replicas", 0)
  joint_weights = params.get("joint_weights", False)

  # From here on `features` is always a dict, so no further type check is
  # needed before calling `.values()`.
  if not isinstance(features, dict):
    features = {"": features}

  # Single name shared by the variable scope, the weight collection the
  # layers write into, and the collection the optimizer reads back.
  parent_scope = "linear"

  # Binary classification uses a single logit column and the two-class loss.
  num_label_columns = 1 if n_classes == 2 else n_classes
  loss_fn = _softmax_cross_entropy_loss
  if n_classes == 2:
    loss_fn = _log_loss_with_two_classes

  partitioner = partitioned_variables.min_max_variable_partitioner(
      max_partitions=num_ps_replicas,
      min_slice_size=64 << 20)
  with variable_scope.variable_op_scope(
      features.values(), parent_scope, partitioner=partitioner) as scope:
    if joint_weights:
      logits, _, _ = (
          layers.joint_weighted_sum_from_feature_columns(
              columns_to_tensors=features,
              feature_columns=feature_columns,
              num_outputs=num_label_columns,
              weight_collections=[parent_scope],
              scope=scope))
    else:
      logits, _, _ = (
          layers.weighted_sum_from_feature_columns(
              columns_to_tensors=features,
              feature_columns=feature_columns,
              num_outputs=num_label_columns,
              weight_collections=[parent_scope],
              scope=scope))

  if enable_centered_bias:
    logits = nn.bias_add(logits, _centered_bias(num_label_columns))

  loss = None
  if mode != estimator.ModeKeys.INFER:
    loss = loss_fn(logits, targets)
    if weight_column_name:
      weight_tensor = array_ops.reshape(
          math_ops.to_float(features[weight_column_name]), shape=(-1,))
      loss = _weighted_loss(loss, weight_tensor)
    else:
      loss = math_ops.reduce_mean(loss, name="loss")
    logging_ops.scalar_summary("loss", loss)

  train_ops = []
  if mode == estimator.ModeKeys.TRAIN:
    global_step = contrib_variables.get_global_step()

    # Only the variables registered in the linear weight collection are
    # differentiated and updated here.
    my_vars = ops.get_collection(parent_scope)
    grads = gradients.gradients(loss, my_vars)
    if gradient_clip_norm:
      grads, _ = clip_ops.clip_by_global_norm(grads, gradient_clip_norm)
    train_ops.append(optimizer.apply_gradients(
        zip(grads, my_vars), global_step=global_step))
    if enable_centered_bias:
      train_ops.append(
          _centered_bias_step(targets, loss_fn, num_label_columns))

  predictions = {}
  if n_classes == 2:
    predictions[_LOGISTIC] = math_ops.sigmoid(logits)
    # Prepend a zero column so the single logit can go through softmax/argmax
    # as a two-column tensor.
    logits = array_ops.concat(1, [array_ops.zeros_like(logits), logits])
  predictions[_PROBABILITIES] = nn.softmax(logits)
  predictions[_CLASSES] = math_ops.argmax(logits, 1)

  return predictions, loss, control_flow_ops.group(*train_ops)
def model_fn(inps, init_state):
  """Runs `cell` over `inps` and differentiates an L2 loss on the final state.

  Args:
    inps: Iterable of per-step inputs fed to `cell` in order.
    init_state: State passed to the first `cell` call.

  Returns:
    The result of `gradient_ops.gradients` for `nn.l2_loss` of the final
    state's `c` attribute with respect to all trainable variables.
  """
  current_state = init_state
  for step_input in inps:
    unused_output, current_state = cell(step_input, current_state)
  loss = nn.l2_loss(current_state.c)
  trainable = variables.trainable_variables()
  return gradient_ops.gradients(loss, trainable)
def _TestCompareFoldAndUnfolded(self, relu, relu_op_name, with_bypass,
                                has_scaling, fused_batch_norm,
                                freeze_batch_norm_delay):
  """Tests that running folded and unfolded BN returns the same results.

  Builds a conv2d + batch norm (+ optional bypass add) + relu graph, makes a
  copy via `self._CopyGraph`, applies `fold_batch_norms.FoldBatchNorms` to the
  copy, and compares the relu output and its gradient with respect to the
  inputs between the two graphs.

  Args:
    relu: Callable that returns an Operation, a factory method for the Relu*.
    relu_op_name: String, name of the Relu* operation.
    with_bypass: Bool, when true there is an extra connection added from
      inputs to just before Relu*.
    has_scaling: Bool, when true the batch norm has scaling.
    fused_batch_norm: Bool, when true the batch norm is fused.
    freeze_batch_norm_delay: None or the number of steps after which training
      switches to using frozen mean and variance
  """
  random_seed.set_random_seed(1234)
  unfolded_g = ops.Graph()
  with unfolded_g.as_default():
    batch_size, height, width = 5, 128, 128
    inputs = random_ops.random_uniform(
        (batch_size, height, width, 3), dtype=dtypes.float32, seed=1234)
    # With a bypass the conv output is added back onto `inputs`, so it uses
    # depth 3 and stride 1, and the activation is applied after the add
    # instead of inside conv2d.
    out_depth = 3 if with_bypass else 32
    stride = 1 if with_bypass else 2
    activation_fn = None if with_bypass else relu
    scope = 'test/test2' if with_bypass else 'test'
    node = conv2d(
        inputs,
        out_depth, [5, 5],
        stride=stride,
        padding='SAME',
        weights_initializer=self._WeightInit(0.09),
        activation_fn=activation_fn,
        normalizer_fn=batch_norm,
        normalizer_params=self._BatchNormParams(
            scale=has_scaling, fused=fused_batch_norm),
        scope=scope)
    if with_bypass:
      node = math_ops.add(inputs, node, name='test/Add')
    relu_node = relu(node, name='test/' + relu_op_name)

  # Fold batch norms in a copy so the unfolded graph stays available as the
  # reference.
  folded_g = self._CopyGraph(unfolded_g)
  with folded_g.as_default():
    fold_batch_norms.FoldBatchNorms(
        folded_g,
        is_training=True,
        freeze_batch_norm_delay=freeze_batch_norm_delay)

  # Forward value and input gradient from the unfolded graph.
  with session.Session(graph=unfolded_g) as sess:
    sess.run(variables.global_variables_initializer())
    grad_node = gradients.gradients(relu_node, inputs)
    results = sess.run([relu_node, grad_node])
    unfolded_forward, unfolded_backward = results[0], results[1]

  # Same quantities from the folded graph; the tensors are looked up there by
  # the names they have in the unfolded graph.
  with session.Session(graph=folded_g) as sess:
    sess.run(variables.global_variables_initializer())
    relu_node = folded_g.get_tensor_by_name(relu_node.name)
    inputs = folded_g.get_tensor_by_name(inputs.name)
    grad_node = gradients.gradients(relu_node, inputs)
    results = sess.run([relu_node, grad_node])
    folded_forward, folded_backward = results[0], results[1]

  # Check that the folded and unfolded results match.
  self.assertAllClose(unfolded_forward, folded_forward, atol=1e-3)
  self.assertAllClose(unfolded_backward, folded_backward, atol=1e-3)
def compute_gradients(self, loss, var_list=None,
                      gate_gradients=GATE_OP,
                      aggregation_method=None,
                      colocate_gradients_with_ops=False,
                      grad_loss=None):
  """Compute gradients of `loss` for the variables in `var_list`.

  This is the first half of `minimize()`. The result is a list of
  (gradient, variable) pairs, where each "gradient" may be a `Tensor`, an
  `IndexedSlices`, or `None` when the variable has no gradient.

  Args:
    loss: A Tensor containing the value to minimize, or a callable taking no
      arguments which returns the value to minimize. When eager execution is
      enabled it must be a callable.
    var_list: Optional list or tuple of `tf.Variable` to update to minimize
      `loss`. Defaults to the list of variables collected in the graph under
      the key `GraphKeys.TRAINABLE_VARIABLES`.
    gate_gradients: How to gate the computation of gradients. Can be
      `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`.
    aggregation_method: Specifies the method used to combine gradient terms.
      Valid values are defined in the class `AggregationMethod`.
    colocate_gradients_with_ops: If True, try colocating gradients with the
      corresponding op.
    grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.

  Returns:
    A list of (gradient, variable) pairs. Variable is always present, but
    gradient can be `None`.

  Raises:
    TypeError: If `var_list` contains anything else than `Variable` objects.
    ValueError: If some arguments are invalid.
    RuntimeError: If called with eager execution enabled and `loss` is
      not callable.

  @compatibility(eager)
  When eager execution is enabled, `gate_gradients`, `aggregation_method`,
  and `colocate_gradients_with_ops` are ignored.
  @end_compatibility
  """
  if callable(loss):
    # Callable loss: record the forward pass on a tape and differentiate it.
    with backprop.GradientTape() as tape:
      if var_list is not None:
        tape.watch(var_list)
      loss_value = loss()
    if var_list is None:
      var_list = tape.watched_variables()
    tape_grads = tape.gradient(loss_value, var_list, grad_loss)
    return list(zip(tape_grads, var_list))

  if context.in_eager_mode():
    raise RuntimeError(
        "`loss` passed to Optimizer.compute_gradients should "
        "be a function when eager execution is enabled.")

  allowed_gates = (
      Optimizer.GATE_NONE, Optimizer.GATE_OP, Optimizer.GATE_GRAPH)
  if gate_gradients not in allowed_gates:
    raise ValueError(
        "gate_gradients must be one of: Optimizer.GATE_NONE, "
        "Optimizer.GATE_OP, Optimizer.GATE_GRAPH.  Not %s" % gate_gradients)

  self._assert_valid_dtypes([loss])
  if grad_loss is not None:
    self._assert_valid_dtypes([grad_loss])

  if var_list is None:
    var_list = (
        variables.trainable_variables() +
        ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
  else:
    var_list = nest.flatten(var_list)
  # pylint: disable=protected-access
  var_list.extend(ops.get_collection(ops.GraphKeys._STREAMING_MODEL_PORTS))
  # pylint: enable=protected-access

  var_processors = [_get_processor(v) for v in var_list]
  if not var_list:
    raise ValueError("No variables to optimize.")
  gradient_targets = [processor.target() for processor in var_processors]

  gate_per_op = (gate_gradients == Optimizer.GATE_OP)
  grads = gradients.gradients(
      loss,
      gradient_targets,
      grad_ys=grad_loss,
      gate_gradients=gate_per_op,
      aggregation_method=aggregation_method,
      colocate_gradients_with_ops=colocate_gradients_with_ops)
  if gate_gradients == Optimizer.GATE_GRAPH:
    grads = control_flow_ops.tuple(grads)

  grads_and_vars = list(zip(grads, var_list))

  # Only variables that actually received a gradient and are not of resource
  # dtype are dtype-checked.
  vars_to_check = []
  for grad, var in grads_and_vars:
    if grad is None:
      continue
    if var.dtype != dtypes.resource:
      vars_to_check.append(var)
  self._assert_valid_dtypes(vars_to_check)
  return grads_and_vars
def loop_fn(i):
  """Returns entry `i` of the gradient of `y[i]` with respect to `x`.

  `y` and `x` are free variables captured from the enclosing scope.
  """
  grad_of_entry = gradient_ops.gradients(array_ops.gather(y, i), x)[0]
  return array_ops.gather(grad_of_entry, i)
def loop_fn(i):
  """Returns the flattened gradient w.r.t. `x` of slice `i` of `out` on axis 1.

  `out` and `x` are free variables captured from the enclosing scope.
  """
  axis1_slice = array_ops.gather(out, i, axis=1)
  slice_grad = gradient_ops.gradients(axis1_slice, x)[0]
  return array_ops.reshape(slice_grad, [-1])
def _get_fx(self, f, i, x):
  """Returns a `(value, gradient)` pair for `f`.

  Args:
    f: Either a list whose first two items are already the value and the
      gradient, or a callable invoked as `f(i, x)`.
    i: First argument forwarded to `f` when it is called.
    x: Second argument forwarded to `f`; also the tensor the gradient is
      taken with respect to.

  Returns:
    `(f[0], f[1])` when `f` is a list; otherwise `f(i, x)` together with the
    first gradient of that value with respect to `x`.
  """
  if not isinstance(f, list):
    value = f(i, x)
    return value, gradients.gradients(value, x)[0]
  # Precomputed pair: pass it through untouched.
  return f[0], f[1]
def compute_gradients(self, loss, var_list=None,
                      gate_gradients=GATE_OP,
                      aggregation_method=None,
                      colocate_gradients_with_ops=False,
                      grad_loss=None):
  """Compute gradients of `loss` for the variables in `var_list`.

  This is the first half of `minimize()`. The result is a list of
  (gradient, variable) pairs, where each "gradient" may be a `Tensor`, an
  `IndexedSlices`, or `None` when the variable has no gradient.

  Args:
    loss: A Tensor containing the value to minimize.
    var_list: Optional list of `tf.Variable` to update to minimize
      `loss`.  Defaults to the list of variables collected in the graph
      under the key `GraphKey.TRAINABLE_VARIABLES`.
    gate_gradients: How to gate the computation of gradients.  Can be
      `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`.
    aggregation_method: Specifies the method used to combine gradient terms.
      Valid values are defined in the class `AggregationMethod`.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.
    grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.

  Returns:
    A list of (gradient, variable) pairs. Variable is always present, but
    gradient can be `None`.

  Raises:
    TypeError: If `var_list` contains anything else than `Variable` objects.
    ValueError: If some arguments are invalid.
  """
  allowed_gates = (
      Optimizer.GATE_NONE, Optimizer.GATE_OP, Optimizer.GATE_GRAPH)
  if gate_gradients not in allowed_gates:
    raise ValueError(
        "gate_gradients must be one of: Optimizer.GATE_NONE, "
        "Optimizer.GATE_OP, Optimizer.GATE_GRAPH.  Not %s" % gate_gradients)

  self._assert_valid_dtypes([loss])
  if grad_loss is not None:
    self._assert_valid_dtypes([grad_loss])

  if var_list is None:
    trainable = variables.trainable_variables()
    resource_trainable = ops.get_collection(
        ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES)
    var_list = trainable + resource_trainable

  var_processors = [_get_processor(v) for v in var_list]
  if not var_list:
    raise ValueError("No variables to optimize.")
  gradient_targets = [processor.target() for processor in var_processors]

  gate_per_op = (gate_gradients == Optimizer.GATE_OP)
  grads = gradients.gradients(
      loss,
      gradient_targets,
      grad_ys=grad_loss,
      gate_gradients=gate_per_op,
      aggregation_method=aggregation_method,
      colocate_gradients_with_ops=colocate_gradients_with_ops)
  if gate_gradients == Optimizer.GATE_GRAPH:
    grads = control_flow_ops.tuple(grads)

  grads_and_vars = list(zip(grads, var_list))

  # Only variables that actually received a gradient are dtype-checked.
  vars_with_grads = []
  for grad, var in grads_and_vars:
    if grad is not None:
      vars_with_grads.append(var)
  self._assert_valid_dtypes(vars_with_grads)
  return grads_and_vars
def _compute_gradients(tensor, var_list):
  """Returns gradients of `tensor` w.r.t. `var_list`, with `None` made zero.

  Args:
    tensor: The value to differentiate.
    var_list: The variables to differentiate with respect to.

  Returns:
    A list with one entry per variable: the gradient when one exists,
    otherwise `array_ops.zeros_like` of that variable.
  """
  dense_grads = []
  for var, grad in zip(var_list, gradients.gradients(tensor, var_list)):
    # tf.gradients sometimes returns `None` when it should return 0.
    if grad is None:
      grad = array_ops.zeros_like(var)
    dense_grads.append(grad)
  return dense_grads
def loop_fn(i):
  """Applies `nn.bias_add` to entry `i` of `x` and differentiates w.r.t. `bias`.

  `x`, `bias` and `data_format` are free variables captured from the
  enclosing scope.

  Returns:
    A pair of the biased output and the gradients, with respect to `bias`,
    of the sum of the squared output.
  """
  biased = nn.bias_add(
      array_ops.gather(x, i), bias, data_format=data_format)
  sum_of_squares = math_ops.reduce_sum(biased * biased)
  bias_grads = gradient_ops.gradients(sum_of_squares, bias)
  return biased, bias_grads
def model_fn(activation):
  """Feeds `activation` through `layers` and `projection`, then differentiates.

  `layers` and `projection` are free variables captured from the enclosing
  scope.

  Returns:
    The result of `gradient_ops.gradients` for `nn.l2_loss` of the projected
    output with respect to all trainable variables.
  """
  hidden = activation
  for layer in layers:
    hidden = layer(hidden)
  loss = nn.l2_loss(projection(hidden))
  trainable = variables.trainable_variables()
  return gradient_ops.gradients(loss, trainable)
def testPreventGradient(self):
  """Differentiating through `prevent_gradient` must raise a LookupError."""
  with ops.Graph().as_default():
    source = constant(1.0, shape=[100, 32], name="in")
    guarded = array_ops.prevent_gradient(source)
    with self.assertRaisesRegexp(LookupError, "explicitly disabled"):
      gradients.gradients(guarded, source)
# g = tf.GradientTape().__enter__() # model.variables # model.trainable_weights[0] # g.watch(model.variables) # tf.GradientTape().__exit__() # model.evaluate() # model.layers[0].layers[6].updates model.optimizer.get_gradients # thing.updates from tensorflow.python.ops import gradients #alternatively try pytorch and cross validate with TF results #aaah apparently there's no way, but GradientTape.jacobian does have it, it just doesnt support tf.conds!!! adkjshja grads = [gradients.gradients(model2.output[i],model2.variables) for i in range(25)] grads = [gradients.gradients(model(train_images[i:i+1]),model.variables) for i in range(25)] sess = tf.Session() sess.run(grads, feed_dict={model2.input:train_images[:25]}) grads = gradients.gradients(tf.expand_dims(tf.tile(model(train_images[:25]),tf.constant([1,25], tf.int32)),0),model.variables, grad_ys=tf.eye(25)) grads grads[0].shape tf.app.flags.DEFINE_string('f', '', 'kernel') model.build()
def Grad(out_grad, variables=None):  # pylint: disable=redefined-outer-name
  """Custom gradient function; expects exactly one entry in `variables`.

  `self`, `out` and `x` are free variables captured from the enclosing scope.

  Returns:
    A pair of the gradient of `out` with respect to `x` (seeded with
    `out_grad`) and a one-element list holding a (4, 3) tensor of ones as
    the variable gradient.
  """
  self.assertEqual(1, len(variables))
  input_grad = gradients.gradients(
      out, [x, variables[0]], grad_ys=out_grad)[0]
  variable_grads = [array_ops.ones((4, 3))]
  return input_grad, variable_grads
def testStopGradient(self):
  """Checks that the gradient through `array_ops.stop_gradient` is `None`."""
  with ops.Graph().as_default():
    inp = constant(1.0, shape=[100, 32], name="in")
    out = array_ops.stop_gradient(inp)
    igrad = gradients.gradients(out, inp)[0]
    # Use the unittest assertion instead of a bare `assert`: bare asserts are
    # stripped when Python runs with -O, which would make this test vacuous,
    # and the unittest helper reports the offending value on failure.
    self.assertIsNone(igrad)
def loop_fn(i):
  """Max-pools entry `i` of `x` and differentiates an L2 loss on the result.

  `x` and `ksize` are free variables captured from the enclosing scope.

  Returns:
    A pair of the pooled output and the gradients of its `nn.l2_loss` with
    respect to the gathered input.
  """
  gathered = array_ops.gather(x, i)
  pooled = nn.max_pool(
      gathered,
      ksize,
      strides=[1, 2, 2, 1],
      padding="VALID",
      data_format="NHWC")
  input_grads = gradient_ops.gradients(nn.l2_loss(pooled), gathered)
  return pooled, input_grads
def loop_fn(i):
  """Computes softmax cross-entropy for entry `i` and its logits gradient.

  `logits` and `labels` are free variables captured from the enclosing scope.

  Returns:
    A pair of the loss and the gradients of the summed loss with respect to
    the gathered logits.
  """
  entry_logits = array_ops.gather(logits, i)
  entry_labels = array_ops.gather(labels, i)
  xent = nn.softmax_cross_entropy_with_logits(
      labels=entry_labels, logits=entry_logits)
  total = math_ops.reduce_sum(xent)
  return xent, gradient_ops.gradients(total, entry_logits)
def testUnconnectedGradientsNoneUnconnectedGradients(self):
  """With `unconnected_gradients="none"`, an unconnected input yields `None`."""
  with ops.Graph().as_default():
    source = constant(1.0, shape=[2, 2])
    target = constant(3.0, shape=[3, 1])
    result = gradients.gradients(
        [target], [source], unconnected_gradients="none")
    self.assertIsNone(result[0])
def _xlogy_gradients(self, x, y):
  """Evaluates the gradients of `xlogy(x, y)` w.r.t. `x` and w.r.t. `y`.

  Returns:
    A pair `(gradient w.r.t. x, gradient w.r.t. y)`, each passed through
    `self.evaluate`.
  """
  grad_wrt_x = gradients.gradients(math_ops.xlogy(x, y), x)[0]
  xlogy_xgrad = self.evaluate(grad_wrt_x)
  grad_wrt_y = gradients.gradients(math_ops.xlogy(x, y), y)[0]
  xlogy_ygrad = self.evaluate(grad_wrt_y)
  return xlogy_xgrad, xlogy_ygrad
def _compute_gradients(tensor, var_list):
  """Returns gradients of `tensor` w.r.t. `var_list`, replacing `None` by zeros.

  Args:
    tensor: The value to differentiate.
    var_list: The variables to differentiate with respect to.

  Returns:
    A list with one entry per variable: the gradient when one exists,
    otherwise `array_ops.zeros_like` of that variable.
  """
  raw_grads = gradients.gradients(tensor, var_list)
  filled = []
  for var, grad in zip(var_list, raw_grads):
    filled.append(array_ops.zeros_like(var) if grad is None else grad)
  return filled
def loop_fn(i):
  """Returns the gradient w.r.t. `inp` of slice `i` of `output` on axis 1.

  `output` and `inp` are free variables captured from the enclosing scope.
  """
  axis1_slice = array_ops.gather(output, i, axis=1)
  slice_grads = gradient_ops.gradients(axis1_slice, inp)
  return slice_grads[0]
def _xdivy_gradients(self, x, y):
  """Evaluates the gradients of `xdivy(x, y)` w.r.t. `x` and w.r.t. `y`.

  Returns:
    A pair `(gradient w.r.t. x, gradient w.r.t. y)`, each passed through
    `self.evaluate`.
  """
  grad_wrt_x = gradients.gradients(math_ops.xdivy(x, y), x)[0]
  xdivy_xgrad = self.evaluate(grad_wrt_x)
  grad_wrt_y = gradients.gradients(math_ops.xdivy(x, y), y)[0]
  xdivy_ygrad = self.evaluate(grad_wrt_y)
  return xdivy_xgrad, xdivy_ygrad
def testVariableRefGradient(self):
  """Differentiates a variable's `_ref()` with respect to the variable."""
  with ops.Graph().as_default():
    var = variables.Variable(constant_op.constant(100.0))
    var_ref = var._ref()  # pylint: disable=protected-access
    gradient = gradients.gradients(var_ref, var)
    # NOTE(review): this checks the value returned by `gradients.gradients`
    # itself, not the individual entries inside it -- confirm that is intended.
    self.assertIsNotNone(gradient)
def loop_fn(i):
  """Returns the gradients w.r.t. `flat_inputs` of entry `i` of `output`.

  `output` and `flat_inputs` are free variables captured from the enclosing
  scope. The full result of `gradient_ops.gradients` is returned as is.
  """
  return gradient_ops.gradients(array_ops.gather(output, i), flat_inputs)
def _Gradients(ys, xs, **kwargs):
  """Wraps `gradients.gradients`, substituting `0. * x` for missing gradients.

  Args:
    ys: Forwarded to `gradients.gradients` as the values to differentiate.
    xs: Forwarded to `gradients.gradients` as the differentiation targets.
    **kwargs: Forwarded unchanged to `gradients.gradients`.

  Returns:
    A list with one entry per element of `xs`: the computed gradient, or
    `0. * x` where the computed gradient is `None`.
  """
  filled = []
  for x, dydx in zip(xs, gradients.gradients(ys, xs, **kwargs)):
    if dydx is None:
      filled.append(0. * x)
    else:
      filled.append(dydx)
  return filled
def loop_fn(i):
  """Applies `op` to entry `i` of `x` and differentiates the squared output.

  `x` and `op` are free variables captured from the enclosing scope.

  Returns:
    A triple of `op` applied to all of `x`, `op` applied to the gathered
    entry, and the gradients of the summed squared entry output with respect
    to the gathered entry.
  """
  gathered = array_ops.gather(x, i)
  entry_out = op(gathered)
  sum_of_squares = math_ops.reduce_sum(entry_out * entry_out)
  # `op(x)` is built before the gradient ops, matching the left-to-right
  # evaluation of the returned tuple.
  whole_out = op(x)
  return whole_out, entry_out, gradient_ops.gradients(sum_of_squares, gathered)