def testDefunCanBeDifferentiatedTwice(self):
  """Checks that a defun yields the same gradient on two separate passes."""
  v = resource_variable_ops.ResourceVariable(1.0)

  @function.defun
  def f():
    return v * v

  # Two independent implicit_grad passes over the same defun; the second
  # pass ensures that v is watched again and not only the first time.
  for _ in range(2):
    grads_and_vars = backprop.implicit_grad(f)()
    self.assertAllEqual(grads_and_vars[0][0], 2.0)
def testGradientOfGatherWithDefun(self):
  """Compares the gather gradient with and without wrapping in defun."""
  v = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0])

  def sum_gather():
    gathered = array_ops.gather(v, [1, 2])
    return math_ops.reduce_sum(gathered)

  plain_result = backprop.implicit_grad(sum_gather)()
  defun_result = backprop.implicit_grad(function.defun(sum_gather))()
  self.assertEqual(len(plain_result), len(defun_result))

  # Only the first gradient of each result is compared, field by field.
  plain_grad = plain_result[0][0]
  defun_grad = defun_result[0][0]
  self.assertAllEqual(plain_grad.values, defun_grad.values)
  self.assertAllEqual(plain_grad.indices, defun_grad.indices)
  self.assertAllEqual(plain_grad.dense_shape, defun_grad.dense_shape)
def testDefunDifferentiable(self):
  """Checks that implicit_grad can differentiate through a defun."""
  v = resource_variable_ops.ResourceVariable(1.0)

  @function.defun
  def f():
    return v * v

  grads_and_vars = backprop.implicit_grad(f)()
  self.assertAllEqual(grads_and_vars[0][0], 2.0)
def testUnconnectedNone(self):
  """Expects a None gradient when the output does not depend on the read."""
  v = resource_variable_ops.ResourceVariable(
      1.0, name='testUnconnectedNone')

  def f():
    # The variable is read, but the returned constant does not use the value.
    v.read_value()
    return constant_op.constant(1.0)

  grads_and_vars = backprop.implicit_grad(f)()
  first_grad = grads_and_vars[0][0]
  self.assertEqual(first_grad, None)
def _test_minimize_loss_graph(self, d, soft_placement=False,
                              learning_rate=0.2):
  """Runs ten graph-mode update steps under `d` and checks the error shrinks.

  Args:
    d: Object used as the distribution strategy; this test relies on its
      `scope`, `broadcast`, `call_for_each_tower`, `read_var`, `reduce` and
      `update` methods.
    soft_placement: Value assigned to `allow_soft_placement` on the session
      config.
    learning_rate: Factor applied to the gradient in each `assign_sub`.
  """
  session_config = config_pb2.ConfigProto()
  session_config.allow_soft_placement = soft_placement
  session_config.gpu_options.per_process_gpu_memory_fraction = 0.3

  with context.graph_mode(), \
       ops.Graph().as_default(), \
       self.test_session(config=session_config) as sess, \
       d.scope():
    dense = core.Dense(1, use_bias=False)

    def loss(x):
      # TODO(josh11b): What if this constant was instead a captured
      # value?  Would it need to be a value that has been passed
      # through d.broadcast()?
      residual = array_ops.reshape(dense(x), []) - constant_op.constant(1.)
      return residual * residual

    grad_fn = backprop.implicit_grad(loss)

    def update(v, g):
      return v.assign_sub(learning_rate * g)

    one = d.broadcast(constant_op.constant([[1.]]))

    def step():
      """Perform one optimization step."""
      # Forward and backward pass: yields the (gradient, variable) pairs.
      grads_and_vars = d.call_for_each_tower(grad_fn, one)

      # Read each variable before and after applying update() to it.
      reads_before, reads_after = [], []
      for grad, var in grads_and_vars:
        pre_update = d.read_var(var)
        reads_before.append(pre_update)
        with ops.control_dependencies([pre_update]):
          reduced = d.reduce(
              variable_scope.VariableAggregation.SUM, grad, destinations=var)
          update_ops = d.update(var, update, reduced, grouped=False)
          with ops.control_dependencies(update_ops):
            reads_after.append(d.read_var(var))
      return reads_before, reads_after

    before_out, after_out = step()
    variables.global_variables_initializer().run()
    for iteration in range(10):
      fetched_before, fetched_after = sess.run((before_out, after_out))
      if iteration == 0:
        # Keep the value seen before the very first update.
        (initial_value,) = fetched_before
      (final_value,) = fetched_after

    error_before = abs(initial_value - 1)
    error_after = abs(final_value - 1)
    # Error should go down
    self.assertLess(error_after, error_before)
def testVariableGradient(self):
  """Checks the gradient of `v0 * v0` computed inside the test scope."""
  with self.test_scope():
    v0 = resource_variable_ops.ResourceVariable(1.0)

    def f():
      return v0 * v0

    grads = backprop.implicit_grad(f)()
  first_grad = grads[0][0]
  self.assertEqual(2., first_grad.numpy())
def testGradients(self):
  """Checks implicit_grad over a graph_callable that owns its variable."""

  @graph_callable.graph_callable([])
  def my_function():
    v = variable_scope.get_variable(
        "v", initializer=init_ops.constant_initializer(3.), shape=())
    return v * v

  pairs = backprop.implicit_grad(my_function)()
  # Transpose the returned pairs so that index 0 selects the first
  # components of every pair.
  transposed = list(zip(*pairs))
  self.assertAllEqual(6., transposed[0][0])
def testImplicitGradWithResourceVariable(self):
  """Checks the gradient of `x + 2 + 3` when the handle of x is watched."""
  x = resource_variable_ops.ResourceVariable(
      initial_value=tensor.Tensor(1.0), name='x')

  def fn():
    tape.watch(x.handle)
    two = tensor.Tensor(2.0)
    partial_sum = math_ops.add(x.value(), two)
    return math_ops.add(partial_sum, tensor.Tensor(3.0))

  result = backprop.implicit_grad(fn)()
  # This test reads the gradient from position [0][1] of the result.
  grad = result[0][1]
  self.assertEqual(grad.numpy(), 1.0)
def _test_minimize_loss_graph(self, d, soft_placement=False,
                              learning_rate=0.2):
  """Runs ten graph-mode update steps under `d` and checks the error shrinks.

  Args:
    d: Object used as the distribution strategy; this test relies on its
      `scope` method and on `d.extended`.
    soft_placement: Value assigned to `allow_soft_placement` on the session
      config.
    learning_rate: Factor applied to the gradient in each `assign_sub`.
  """
  session_config = config_pb2.ConfigProto()
  session_config.allow_soft_placement = soft_placement
  session_config.gpu_options.per_process_gpu_memory_fraction = 0.3

  with context.graph_mode(), \
       ops.Graph().as_default(), \
       self.cached_session(config=session_config) as sess, \
       d.scope():
    dense = core.Dense(1, use_bias=False)

    def loss(x):
      residual = array_ops.reshape(dense(x), []) - constant_op.constant(1.)
      return residual * residual

    grad_fn = backprop.implicit_grad(loss)

    def update(v, g):
      return v.assign_sub(learning_rate * g)

    one = constant_op.constant([[1.]])

    def step():
      """Perform one optimization step."""
      # Forward and backward pass: yields the (gradient, variable) pairs.
      grads_and_vars = d.extended.call_for_each_replica(grad_fn, args=(one,))

      # Read each variable before and after applying update() to it.
      reads_before, reads_after = [], []
      for grad, var in grads_and_vars:
        pre_update = d.extended.read_var(var)
        reads_before.append(pre_update)
        with ops.control_dependencies([pre_update]):
          reduced = d.extended.reduce_to(
              reduce_util.ReduceOp.SUM, grad, destinations=var)
          update_ops = d.extended.update(
              var, update, args=(reduced,), group=False)
          with ops.control_dependencies(update_ops):
            reads_after.append(d.extended.read_var(var))
      return reads_before, reads_after

    before_out, after_out = step()
    variables.global_variables_initializer().run()
    for iteration in range(10):
      fetched_before, fetched_after = sess.run((before_out, after_out))
      if iteration == 0:
        # Keep the value seen before the very first update.
        (initial_value,) = fetched_before
      (final_value,) = fetched_after

    error_before = abs(initial_value - 1)
    error_after = abs(final_value - 1)
    # Error should go down
    self.assertLess(error_after, error_before)
def testReturningNonTensorRaisesError(self):
  """Expects a TypeError when a defun-wrapped apply_gradients is called."""
  optimizer = momentum.MomentumOptimizer(learning_rate=1.0, momentum=1.0)
  optimizer.apply_gradients = function.defun(optimizer.apply_gradients)
  v = resource_variable_ops.ResourceVariable(1.0)

  def squared(value):
    return value**2

  grad = backprop.implicit_grad(squared)(v)

  expected_message = '.*must return zero or more Tensors.*'
  with self.assertRaisesRegexp(TypeError, expected_message):
    # TODO(akshayka): We might want to allow defun-ing Python functions
    # that return operations (and just execute the op instead of running it).
    optimizer.apply_gradients(grad)
def testGPUImplicitGrad(self):
  """Checks the gradient of reading a variable placed on 'gpu:0'."""
  with context.device('gpu:0'):
    initial = constant_op.constant(1.0)
    v = resource_variable_ops.ResourceVariable(initial, name='v')

  def f():
    with context.device('gpu:0'):
      return v.read_value()

  grads_and_vars = backprop.implicit_grad(f)()
  # Copy the gradient to the CPU before converting it to a numpy value.
  host_grad = grads_and_vars[0][0].cpu()
  self.assertEqual(host_grad.numpy(), 1.0)
def testEarlyGradAggregation(self):
  """Counts AddN executions during backprop under different thresholds.

  The module-level thresholds `_MIN_AGGREGATE_COUNT` and
  `_MIN_AGGREGATE_BYTES` of `imperative_grad` are overridden while the test
  runs.  They are restored, and the post-execution callbacks cleared, in a
  `finally` block so that a failing assertion cannot leak the overrides or
  the callback into whatever runs next.
  """
  # Needs to be a list so mutations by the callback affect this function.
  add_n = []

  def callback(op_type, unused_1, unused_2, unused_3, unused_4):
    if compat.as_bytes(op_type) == compat.as_bytes('AddN'):
      add_n.append(1)

  # pylint: disable=protected-access
  # Remember the original thresholds before anything can fail.
  old_cnt = imperative_grad._MIN_AGGREGATE_COUNT
  old_bytes = imperative_grad._MIN_AGGREGATE_BYTES
  context.context().add_post_execution_callback(callback)
  try:
    v = resource_variable_ops.ResourceVariable(
        constant_op.constant(2.0), name='v')

    def fn():
      outputs = [v * constant_op.constant(2.0) for _ in range(20)]
      return math_ops.add_n(outputs)

    # With the thresholds untouched, two AddN executions are expected.
    _ = backprop.implicit_grad(fn)()[0][1]
    self.assertEqual(len(add_n), 2)
    del add_n[:]

    # Reduce the aggregation limit, cause the backprop to do some
    # early aggregation.
    imperative_grad._MIN_AGGREGATE_COUNT = 10
    imperative_grad._MIN_AGGREGATE_BYTES = 1
    _ = backprop.implicit_grad(fn)()
    self.assertEqual(len(add_n), 6)
    del add_n[:]

    # Aggregation is also limited by the memory.
    imperative_grad._MIN_AGGREGATE_BYTES = 10000
    _ = backprop.implicit_grad(fn)()
    self.assertEqual(len(add_n), 2)
  finally:
    # Undo every piece of global state touched above, even on failure.
    imperative_grad._MIN_AGGREGATE_COUNT = old_cnt
    imperative_grad._MIN_AGGREGATE_BYTES = old_bytes
    context.context().clear_post_execution_callbacks()
  # pylint: enable=protected-access
def testDifferentShapesEager(self):
  """Runs two CudnnGRU layers on differently shaped inputs, eagerly."""
  # Checks that kernel caching does not cause sharing of temporary storage
  # across different input shapes when executing eagerly.
  with context.eager_mode(), ops.device("gpu:0"):
    narrow_gru = cudnn_rnn.CudnnGRU(1, 100)
    first_output, _ = narrow_gru(array_ops.zeros([28, 100, 28]))
    wide_gru = cudnn_rnn.CudnnGRU(1, 100)
    second_output, _ = wide_gru(array_ops.zeros([28, 100, 100]))
    expected_shape = [28, 100, 100]
    self.assertAllEqual(expected_shape, first_output.shape)
    self.assertAllEqual(expected_shape, second_output.shape)

    def _LossFunc():
      # Fresh layers are built here; the ones created above are not reused.
      inner_narrow = cudnn_rnn.CudnnGRU(1, 100)
      narrow_out, _ = inner_narrow(array_ops.zeros([28, 100, 28]))
      inner_wide = cudnn_rnn.CudnnGRU(1, 100)
      wide_out, _ = inner_wide(array_ops.zeros([28, 100, 100]))
      return (math_ops.reduce_sum(narrow_out) +
              math_ops.reduce_sum(wide_out))

    # Only checks that the backward pass runs; the result is discarded.
    backprop.implicit_grad(_LossFunc)()
def testMultiValueConvertToTensor(self):
  """Checks the gradient when the same tensor is stacked with itself."""
  x = resource_variable_ops.ResourceVariable(
      initial_value=array_ops.constant([1.0]), name='x')

  def fn():
    shifted = math_ops.add(x.value(), 1.0)
    # Make sure convert_to_tensor works correctly with list of TensorNodes.
    stacked = array_ops.stack([shifted, shifted], axis=0)
    return math_ops.reduce_mean(stacked)

  grads_and_vars = backprop.implicit_grad(fn)()
  self.assertAllEqual([1.0], grads_and_vars[0][0])
def testGradientTensorConversionWithDefun(self):
  """Checks the gradient w.r.t. a variable captured by a traced function."""
  three = resource_variable_ops.ResourceVariable(3.0, name='v')

  @def_function.function
  def f(x):
    return math_ops.add(x, three)

  def g(x):
    return f(x)

  grads_and_vars = backprop.implicit_grad(g)(constant_op.constant(1.0))
  self.assertAllEqual(grads_and_vars[0][0], 1.0)
def testImplicitGradWithResourceVariable(self):
  """Checks both the gradient value and the variable paired with it.

  The first entry returned by `implicit_grad` must carry a gradient of 1.0
  and must reference the very same variable object `x`.
  """
  x = resource_variable_ops.ResourceVariable(
      initial_value=constant_op.constant(1.0), name='x')

  def fn():
    b = constant_op.constant(2.0)
    c = math_ops.add(x.value(), b)
    return math_ops.add(c, constant_op.constant(3.0))

  grads_and_vars = backprop.implicit_grad(fn)()
  grad, var = grads_and_vars[0][0], grads_and_vars[0][1]
  self.assertAllEqual(grad, 1.0)
  # Identity check: assertIs states the intent directly instead of comparing
  # id() values numerically.
  self.assertIs(var, x)
def testGradientTensorConversionWithDefun(self):
  """Checks the gradient w.r.t. a watched tensor captured by a defun."""
  three = tensor.Tensor(3.0)

  @function.defun
  def f(x):
    return math_ops.add(x, three)

  def g(x):
    tape.watch(three)
    return f(x)

  result = backprop.implicit_grad(g)(tensor.Tensor(1.0))
  # This test reads the gradient from position [0][1] of the result.
  grad = result[0][1]
  self.assertEqual(grad.numpy(), 1.0)
def step(self, inputs):
  """Computes filtered gradients of the loss and applies them.

  Args:
    inputs: Forwarded unchanged to `call_for_each_tower`.

  Returns:
    Whatever `self._optimizer._distributed_apply` returns.
  """
  with self._distribution.scope():
    filtered_grad_fn = optimizer_lib.get_filtered_grad_fn(
        backprop.implicit_grad(self._loss_fn))

    run_concurrently = self._is_run_concurrently
    grads_and_vars = self.distribution.call_for_each_tower(
        filtered_grad_fn, inputs, run_concurrently=run_concurrently)
    # If threads use layers, then we need to run the first step sequentially,
    # so that layers.build() is not executed in parallel.  Otherwise, multiple
    # sets of mirrored variables are going to be created.  Every call after
    # this one therefore sees the flag set to True.
    self._is_run_concurrently = True
    return self._optimizer._distributed_apply(  # pylint: disable=protected-access
        self.distribution, grads_and_vars)
def testGPUImplicitGrad(self):
  """Checks the gradient of reading a watched variable placed on 'gpu:0'."""
  gpu_count = context.context().num_gpus()
  if not gpu_count:
    self.skipTest('No GPU found')

  with context.device('gpu:0'):
    initial = constant_op.constant(1.0)
    v = resource_variable_ops.ResourceVariable(initial, name='v')

  def f():
    with context.device('gpu:0'):
      tape.watch_variable(v)
      return v.read_value()

  grads_and_vars = backprop.implicit_grad(f)()
  # Copy the gradient to the CPU before converting it to a numpy value.
  host_grad = grads_and_vars[0][0].cpu()
  self.assertEqual(host_grad.numpy(), 1.0)
def testMultiValueConvertToTensor(self):
  """Checks the gradient when the same tensor is stacked with itself."""
  x = resource_variable_ops.ResourceVariable(
      initial_value=array_ops.constant([1.0]), name='x')

  def fn():
    tape.watch_variable(x)
    shifted = math_ops.add(x.value(), 1.0)
    # Make sure convert_to_tensor works correctly with list of TensorNodes.
    stacked = array_ops.stack([shifted, shifted], axis=0)
    return math_ops.reduce_mean(stacked)

  grads_and_vars = backprop.implicit_grad(fn)()
  self.assertAllEqual([1.0], grads_and_vars[0][0])
def testImplicitGradWithResourceVariable(self):
  """Checks both the gradient value and the variable paired with it.

  The variable is watched explicitly inside `fn`.  The first entry returned
  by `implicit_grad` must carry a gradient of 1.0 and must reference the very
  same variable object `x`.
  """
  x = resource_variable_ops.ResourceVariable(
      initial_value=constant_op.constant(1.0), name='x')

  def fn():
    tape.watch_variable(x)
    b = constant_op.constant(2.0)
    c = math_ops.add(x.value(), b)
    return math_ops.add(c, constant_op.constant(3.0))

  grads_and_vars = backprop.implicit_grad(fn)()
  grad, var = grads_and_vars[0][0], grads_and_vars[0][1]
  self.assertAllEqual(grad, 1.0)
  # Identity check: assertIs states the intent directly instead of comparing
  # id() values numerically.
  self.assertIs(var, x)
def testImplicitGradOrdering(self):
  """Checks the order in which variables are reported by implicit_grad.

  `f` uses `v1` before `v0`, yet the result is expected to list `v0` first
  and `v1` second.
  """
  v0 = resource_variable_ops.ResourceVariable(1.0)
  v1 = resource_variable_ops.ResourceVariable(2.0)

  def f():
    x = v1 * v1
    y = v0 * v0
    return x + y

  grads = backprop.implicit_grad(f)()
  ordered_variables = [pair[1] for pair in grads]
  # assertIs reports both objects on failure, unlike assertTrue(a is b).
  self.assertIs(ordered_variables[0], v0)
  self.assertIs(ordered_variables[1], v1)
def testGradientTensorConversionWithDefun(self):
  """Checks the gradient w.r.t. a watched variable captured by a defun."""
  three = resource_variable_ops.ResourceVariable(3.0)

  @function.defun
  def f(x):
    return math_ops.add(x, three)

  def g(x):
    tape.watch_variable(three)
    return f(x)

  grads_and_vars = backprop.implicit_grad(g)(constant_op.constant(1.0))
  self.assertAllEqual(grads_and_vars[0][0], 1.0)
def testGPUImplicitGrad(self):
  """Checks the gradient of reading a variable on 'gpu:0' via its handle."""
  gpu_count = context.context().num_gpus()
  if not gpu_count:
    self.skipTest('No GPU found')

  with context.device('gpu:0'):
    v = resource_variable_ops.ResourceVariable(tensor.Tensor(1.0), name='v')

  def f():
    with context.device('gpu:0'):
      tape.watch(v.handle)
      return v.read_value()

  result = backprop.implicit_grad(f)()
  # This test reads the gradient from position [0][1] and copies it to the
  # CPU before converting it to a numpy value.
  host_grad = result[0][1].as_cpu_tensor()
  self.assertEqual(host_grad.numpy(), 1.0)
def testGradientTensorConversionWithDefun(self):
  """Checks the gradient w.r.t. a watched variable captured by a defun."""
  three = resource_variable_ops.ResourceVariable(3.0)

  @function.defun
  def f(x):
    return math_ops.add(x, three)

  def g(x):
    tape.watch_variable(three)
    return f(x)

  grads_and_vars = backprop.implicit_grad(g)(constant_op.constant(1.0))
  first_grad = grads_and_vars[0][0]
  self.assertEqual(first_grad.numpy(), 1.0)
def step_fn(ctx, *inputs):
  """Function to run one iteration with one input.

  `ctx` is prepended to `inputs` and the combined tuple is passed to the
  filtered gradient function on every replica; the resulting gradients are
  then applied through the optimizer.
  """
  filtered_grad_fn = optimizer_lib.get_filtered_grad_fn(
      backprop.implicit_grad(self._loss_fn))

  replica_args = (ctx, ) + inputs
  grads_and_vars = self.distribution.call_for_each_replica(
      filtered_grad_fn, args=replica_args)
  # If threads use layers, then we need to run the first step
  # sequentially, so that layers.build() is not executed in parallel.
  # Otherwise, multiple sets of mirrored variables are going to be
  # created.
  return self._optimizer._distributed_apply(  # pylint: disable=protected-access
      self.distribution, grads_and_vars)
def step_fn(ctx, *inputs):
  """Function to run one iteration with one input.

  `ctx` is prepended to `inputs` and the combined tuple is passed to the
  filtered gradient function on every replica; the resulting gradients are
  then applied through the optimizer.
  """
  filtered_grad_fn = optimizer_lib.get_filtered_grad_fn(
      backprop.implicit_grad(self._loss_fn))

  replica_args = (ctx,) + inputs
  grads_and_vars = self.distribution.call_for_each_replica(
      filtered_grad_fn, args=replica_args)
  # If threads use layers, then we need to run the first step
  # sequentially, so that layers.build() is not executed in parallel.
  # Otherwise, multiple sets of mirrored variables are going to be
  # created.
  return self._optimizer._distributed_apply(  # pylint: disable=protected-access
      self.distribution, grads_and_vars)
def _test_minimize_loss_eager(self, d):
  """Runs ten eager update steps under `d` and checks the error shrinks.

  Args:
    d: Object used as the distribution strategy; this test relies on its
      `scope`, `broadcast`, `call_for_each_tower`, `read_var`, `reduce` and
      `update` methods.
  """
  with d.scope():
    dense = core.Dense(1, use_bias=False)

    def loss(x):
      # TODO(josh11b): What if this constant was instead a captured
      # value?  Would it need to be a value that has been passed
      # through d.broadcast()?
      residual = array_ops.reshape(dense(x), []) - constant_op.constant(1.)
      return residual * residual

    # TODO(isaprykin): Extract implicit_grad+get_filtered_grad_fn into a
    # common `implicit_grad` function and put it in DistributionStrategy.
    grad_fn = optimizer.get_filtered_grad_fn(backprop.implicit_grad(loss))

    def update(v, g):
      return v.assign_sub(0.2 * g)

    one = d.broadcast(constant_op.constant([[1.]]))

    def step():
      """Perform one optimization step."""
      # Forward and backward pass: yields the (gradient, variable) pairs.
      # Concurrency is requested only once the layer reports being built.
      grads_and_vars = d.call_for_each_tower(
          grad_fn, one, run_concurrently=dense.built)

      # Read each variable before and after applying update() to it.
      reads_before, reads_after = [], []
      for grad, var in grads_and_vars:
        pre_update = d.read_var(var)
        reads_before.append(pre_update)
        # control_dependencies irrelevant but harmless in eager execution
        with ops.control_dependencies([pre_update]):
          reduced = d.reduce(
              variable_scope.VariableAggregation.SUM, grad, destinations=var)
          update_ops = d.update(var, update, reduced, grouped=False)
          with ops.control_dependencies(update_ops):
            reads_after.append(d.read_var(var))
      return reads_before, reads_after

    for iteration in range(10):
      step_before, step_after = step()
      if iteration == 0:
        # Keep the value seen before the very first update.
        (initial_value,) = step_before  # pylint: disable=unbalanced-tuple-unpacking
      (final_value,) = step_after  # pylint: disable=unbalanced-tuple-unpacking

    error_before = abs(initial_value.numpy() - 1)
    error_after = abs(final_value.numpy() - 1)
    # Error should go down
    self.assertLess(error_after, error_before)
def testGPUImplicitGrad(self):
  """Checks the gradient of reading a watched variable placed on 'gpu:0'."""
  gpu_count = context.context().num_gpus()
  if not gpu_count:
    self.skipTest('No GPU found')

  with context.device('gpu:0'):
    initial = constant_op.constant(1.0)
    v = resource_variable_ops.ResourceVariable(initial, name='v')

  def f():
    with context.device('gpu:0'):
      tape.watch_variable(v)
      return v.read_value()

  grads_and_vars = backprop.implicit_grad(f)()
  # Copy the gradient to the CPU before converting it to a numpy value.
  host_grad = grads_and_vars[0][0].cpu()
  self.assertEqual(host_grad.numpy(), 1.0)
def _test_minimize_loss_eager(self, d):
  """Runs ten eager update steps under `d` and checks the error shrinks.

  Args:
    d: Object used as the distribution strategy; this test relies on its
      `scope` method and on `d.extended`.
  """
  with d.scope():
    kernel = create_variable_like_keras_layer(
        name="kernel", shape=(1, 1), dtype=dtypes.float32)

    def loss(x):
      product = gen_math_ops.mat_mul(x, kernel)
      residual = array_ops.reshape(product, []) - array_ops.identity(1.)
      return residual * residual

    # TODO(isaprykin): Extract implicit_grad+get_filtered_grad_fn into a
    # common `implicit_grad` function and put it in DistributionStrategy.
    grad_fn = optimizer.get_filtered_grad_fn(backprop.implicit_grad(loss))

    def update(v, g):
      return v.assign_sub(0.2 * g)

    one = array_ops.identity([[1.]])

    def step():
      """Perform one optimization step."""
      # Forward and backward pass: yields the (gradient, variable) pairs.
      grads_and_vars = d.extended.call_for_each_replica(grad_fn, args=(one,))

      # Read each variable before and after applying update() to it.
      reads_before, reads_after = [], []
      for grad, var in grads_and_vars:
        pre_update = d.extended.read_var(var)
        reads_before.append(pre_update)
        # control_dependencies irrelevant but harmless in eager execution
        with ops.control_dependencies([pre_update]):
          reduced = d.extended.reduce_to(
              reduce_util.ReduceOp.SUM, grad, destinations=var)
          update_ops = d.extended.update(
              var, update, args=(reduced,), group=False)
          with ops.control_dependencies(update_ops):
            reads_after.append(d.extended.read_var(var))
      return reads_before, reads_after

    for iteration in range(10):
      step_before, step_after = step()
      if iteration == 0:
        # Keep the value seen before the very first update.
        (initial_value,) = step_before  # pylint: disable=unbalanced-tuple-unpacking
      (final_value,) = step_after  # pylint: disable=unbalanced-tuple-unpacking

    error_before = abs(initial_value.numpy() - 1)
    error_after = abs(final_value.numpy() - 1)
    # Error should go down
    self.assertLess(error_after, error_before)
def test_feature_column_dense_features_gradient(self):
  """Checks the gradient of a scaled DenseFeatures embedding lookup."""
  with context.eager_mode():
    sparse_input = sparse_tensor.SparseTensor(
        indices=((0, 0), (1, 0), (2, 0)),
        values=(0, 1, 2),
        dense_shape=(3, 3))

    # Create feature columns (categorical and embedding).
    categorical_column = fc.categorical_column_with_identity(
        key='a', num_buckets=3)
    embedding_dimension = 2

    def _embedding_column_initializer(shape, dtype, partition_info=None):
      # None of the arguments is used; the table below is returned as is.
      del shape, dtype, partition_info
      return (
          (1, 0),  # id 0
          (0, 1),  # id 1
          (1, 1))  # id 2

    embedding_column = fc.embedding_column(
        categorical_column,
        dimension=embedding_dimension,
        initializer=_embedding_column_initializer)

    dense_features = df.DenseFeatures([embedding_column])
    features = {'a': sparse_input}

    def scale_matrix():
      return 2 * dense_features(features)

    # Sanity check: Verify that scale_matrix returns the correct output.
    self.assertAllEqual([[2, 0], [0, 2], [2, 2]], scale_matrix())

    # Check that the returned gradient is correct.
    grads_and_vars = backprop.implicit_grad(scale_matrix)()
    indexed_slice = grads_and_vars[0][0]
    gradient = grads_and_vars[0][0].values

    self.assertAllEqual([0, 1, 2], indexed_slice.indices)
    self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient)
def _test_minimize_loss_eager(self, d):
  """Runs ten eager update steps under `d` and checks the error shrinks.

  Args:
    d: Object used as the distribution strategy; this test relies on its
      `scope` method and on `d.extended`.
  """
  with d.scope():
    dense = core.Dense(1, use_bias=False)

    def loss(x):
      residual = array_ops.reshape(dense(x), []) - constant_op.constant(1.)
      return residual * residual

    # TODO(isaprykin): Extract implicit_grad+get_filtered_grad_fn into a
    # common `implicit_grad` function and put it in DistributionStrategy.
    grad_fn = optimizer.get_filtered_grad_fn(backprop.implicit_grad(loss))

    def update(v, g):
      return v.assign_sub(0.2 * g)

    one = constant_op.constant([[1.]])

    def step():
      """Perform one optimization step."""
      # Forward and backward pass: yields the (gradient, variable) pairs.
      grads_and_vars = d.extended.call_for_each_replica(grad_fn, args=(one,))

      # Read each variable before and after applying update() to it.
      reads_before, reads_after = [], []
      for grad, var in grads_and_vars:
        pre_update = d.extended.read_var(var)
        reads_before.append(pre_update)
        # control_dependencies irrelevant but harmless in eager execution
        with ops.control_dependencies([pre_update]):
          reduced = d.extended.reduce_to(
              reduce_util.ReduceOp.SUM, grad, destinations=var)
          update_ops = d.extended.update(
              var, update, args=(reduced,), group=False)
          with ops.control_dependencies(update_ops):
            reads_after.append(d.extended.read_var(var))
      return reads_before, reads_after

    for iteration in range(10):
      step_before, step_after = step()
      if iteration == 0:
        # Keep the value seen before the very first update.
        (initial_value,) = step_before  # pylint: disable=unbalanced-tuple-unpacking
      (final_value,) = step_after  # pylint: disable=unbalanced-tuple-unpacking

    error_before = abs(initial_value.numpy() - 1)
    error_after = abs(final_value.numpy() - 1)
    # Error should go down
    self.assertLess(error_after, error_before)
def testImplicitGradientsCustomGradientAndCachedVariableValue(self):
  """Checks that implicit_grad uses a custom gradient applied to a variable."""

  @custom_gradient.custom_gradient
  def my_square(x):
    result = math_ops.square(x)

    def grad(dr):
      # Deliberately not the true derivative: the extra +1 makes it
      # detectable that this function was used.
      return 2 * dr * x + 1

    return result, grad

  x = resource_variable_ops.ResourceVariable(
      initial_value=3, name='X.' + self.id())

  def f():
    return my_square(x)

  grads_and_vars = backprop.implicit_grad(f)()
  self.assertEqual(1, len(grads_and_vars))
  only_grad, only_var = grads_and_vars[0]
  self.assertAllEqual(7, only_grad)
  self.assertAllEqual(x, only_var)
def testImplicitGradOverEmbeddingLookup(self):
  """Compares eager and graph-mode gradients of an embedding lookup.

  The gradient from `implicit_grad` is checked against
  `gradients.gradients` in graph mode, and one gradient-descent step is
  applied on both sides before comparing the resulting embeddings.
  """
  batch_size = 8
  embedding_size = 512
  vocab_size = 1000
  lrn_rate = 0.1
  random_init = random_ops.random_uniform([vocab_size, embedding_size])

  x = array_ops.ones(batch_size, dtypes.int64)
  embedding = resource_variable_ops.ResourceVariable(
      initial_value=random_init, dtype=dtypes.float32, name='embedding')

  def f():
    tape.watch_variable(embedding)
    embedded_x = embedding_ops.embedding_lookup(embedding, x)
    return constant_op.constant(1.0, dtypes.float32) - embedded_x

  eager_pairs = backprop.implicit_grad(f)()
  grad = eager_pairs[0][0]
  opt = training.GradientDescentOptimizer(lrn_rate)

  with context.graph_mode(), self.test_session():
    tf_x = array_ops.ones(batch_size, dtypes.int64)
    # TODO(ashankar,apassos): Change to ResourceVariable.
    tf_embedding = variables.Variable(
        random_init.numpy(), name='tf_embedding')
    tf_embedded_x = embedding_ops.embedding_lookup(tf_embedding, tf_x)
    tf_y = 1.0 - tf_embedded_x
    tf_grad = gradients.gradients(tf_y, [tf_embedding])[0]
    tf_opt = training.GradientDescentOptimizer(0.1)
    tf_embedding.initializer.run()

    # The two gradients must agree on both indices and values.
    self.assertAllClose(tf_grad.indices.eval(), grad.indices)
    self.assertAllClose(tf_grad.values.eval(), grad.values)

    graph_update = tf_opt.apply_gradients([(tf_grad, tf_embedding)])
    graph_update.run()
    expected = tf_embedding.eval()

  # Apply the same step to the eager variable and compare the outcome.
  opt.apply_gradients([(grad, embedding)])
  self.assertAllClose(expected, embedding.read_value())
def testImplicitGradOverEmbeddingLookup(self):
  """Compares eager and graph-mode gradients of an embedding lookup.

  The gradient from `implicit_grad` is checked against
  `gradients.gradients` in graph mode, and one gradient-descent step is
  applied on both sides before comparing the resulting embeddings.
  """
  batch_size = 8
  embedding_size = 512
  vocab_size = 1000
  lrn_rate = 0.1
  random_init = random_ops.random_uniform([vocab_size, embedding_size])

  x = array_ops.ones(batch_size, dtypes.int64)
  embedding = resource_variable_ops.ResourceVariable(
      initial_value=random_init, dtype=dtypes.float32, name='embedding')

  def f():
    tape.watch_variable(embedding)
    embedded_x = embedding_ops.embedding_lookup(embedding, x)
    return constant_op.constant(1.0, dtypes.float32) - embedded_x

  eager_pairs = backprop.implicit_grad(f)()
  grad = eager_pairs[0][0]
  opt = training.GradientDescentOptimizer(lrn_rate)

  with context.graph_mode(), self.test_session():
    tf_x = array_ops.ones(batch_size, dtypes.int64)
    # TODO(ashankar,apassos): Change to ResourceVariable.
    tf_embedding = variables.Variable(random_init.numpy(),
                                      name='tf_embedding')
    tf_embedded_x = embedding_ops.embedding_lookup(tf_embedding, tf_x)
    tf_y = 1.0 - tf_embedded_x
    tf_grad = gradients.gradients(tf_y, [tf_embedding])[0]
    tf_opt = training.GradientDescentOptimizer(0.1)
    tf_embedding.initializer.run()

    # The two gradients must agree on both indices and values.
    self.assertAllClose(tf_grad.indices.eval(), grad.indices)
    self.assertAllClose(tf_grad.values.eval(), grad.values)

    graph_update = tf_opt.apply_gradients([(tf_grad, tf_embedding)])
    graph_update.run()
    expected = tf_embedding.eval()

  # Apply the same step to the eager variable and compare the outcome.
  opt.apply_gradients([(grad, embedding)])
  self.assertAllClose(expected, embedding.read_value())
def testImplicitGradientsCustomGradientAndCachedVariableValue(self):
  """Checks that implicit_grad uses a custom gradient applied to a variable."""

  @custom_gradient.custom_gradient
  def my_square(x):
    result = math_ops.square(x)

    def grad(dr):
      # Deliberately not the true derivative: the extra +1 makes it
      # detectable that this function was used.
      return 2 * dr * x + 1

    return result, grad

  x = resource_variable_ops.ResourceVariable(
      initial_value=3., name='X.' + self.id())

  def f():
    return my_square(x)

  grads_and_vars = backprop.implicit_grad(f)()
  self.assertEqual(1, len(grads_and_vars))
  only_grad, only_var = grads_and_vars[0]
  self.assertAllEqual(7, only_grad)
  self.assertAllEqual(x, only_var)
def step():
  """Returns element [0][0] of the implicit gradient of `v * v`."""

  def inner():
    return v * v

  grads_and_vars = backprop.implicit_grad(inner)()
  return grads_and_vars[0][0]
def step():
  """Returns element [0][0] of the implicit gradient of `v * v`."""

  def inner():
    # The variable is watched explicitly before it is used.
    tape.watch_variable(v)
    return v * v

  grads_and_vars = backprop.implicit_grad(inner)()
  return grads_and_vars[0][0]
def step():
  """Returns element [0][1] of the implicit gradient of `v * v`."""

  def inner():
    # The variable's handle is watched explicitly before it is used.
    tape.watch(v.handle)
    return v * v

  result = backprop.implicit_grad(inner)()
  return result[0][1]
def _test_minimize_loss_graph(self, d, soft_placement=False,
                              learning_rate=0.2):
  """Runs ten graph-mode update steps under `d` and checks the error shrinks.

  Args:
    d: Object used as the distribution strategy; this test relies on its
      `scope` and `broadcast` methods and on `d.extended`.
    soft_placement: Value assigned to `allow_soft_placement` on the session
      config.
    learning_rate: Factor applied to the gradient in each `assign_sub`.
  """
  session_config = config_pb2.ConfigProto()
  session_config.allow_soft_placement = soft_placement
  session_config.gpu_options.per_process_gpu_memory_fraction = 0.3

  with context.graph_mode(), \
       ops.Graph().as_default(), \
       self.cached_session(config=session_config) as sess, \
       d.scope():
    dense = core.Dense(1, use_bias=False)

    def loss(x):
      # TODO(josh11b): What if this constant was instead a captured
      # value?  Would it need to be a value that has been passed
      # through d.broadcast()?
      residual = array_ops.reshape(dense(x), []) - constant_op.constant(1.)
      return residual * residual

    grad_fn = backprop.implicit_grad(loss)

    def update(v, g):
      return v.assign_sub(learning_rate * g)

    one = d.broadcast(constant_op.constant([[1.]]))

    def step():
      """Perform one optimization step."""
      # Forward and backward pass: yields the (gradient, variable) pairs.
      grads_and_vars = d.extended.call_for_each_replica(
          grad_fn, args=(one, ))

      # Read each variable before and after applying update() to it.
      reads_before, reads_after = [], []
      for grad, var in grads_and_vars:
        pre_update = d.extended.read_var(var)
        reads_before.append(pre_update)
        with ops.control_dependencies([pre_update]):
          reduced = d.extended.reduce_to(
              reduce_util.ReduceOp.SUM, grad, destinations=var)
          update_ops = d.extended.update(
              var, update, args=(reduced, ), group=False)
          with ops.control_dependencies(update_ops):
            reads_after.append(d.extended.read_var(var))
      return reads_before, reads_after

    before_out, after_out = step()
    variables.global_variables_initializer().run()
    for iteration in range(10):
      fetched_before, fetched_after = sess.run((before_out, after_out))
      if iteration == 0:
        # Keep the value seen before the very first update.
        (initial_value,) = fetched_before
      (final_value,) = fetched_after

    error_before = abs(initial_value - 1)
    error_after = abs(final_value - 1)
    # Error should go down
    self.assertLess(error_after, error_before)
def train():
  """Computes the implicit gradients of `loss` and applies them."""
  grads_and_vars = backprop.implicit_grad(loss)()
  optimizer.apply_gradients(grads_and_vars)
def train():
  """Creates a variable, applies one gradient update, returns its value."""
  variable = resource_variable_ops.ResourceVariable(1.0)
  grads_and_vars = backprop.implicit_grad(loss)(variable)
  optimizer.apply_gradients(grads_and_vars)
  return variable.read_value()
def train():
  """Stores a new variable on `self`, updates it once, returns its value."""
  self.v = resource_variable_ops.ResourceVariable(1.0)
  grads_and_vars = backprop.implicit_grad(loss)(self.v)
  optimizer.apply_gradients(grads_and_vars)
  return self.v.read_value()
def compute_gradients(self, loss, var_list=None,
                      gate_gradients=GATE_OP,
                      aggregation_method=None,
                      colocate_gradients_with_ops=False,
                      grad_loss=None):
  """Compute gradients of `loss` for the variables in `var_list`.

  This is the first part of `minimize()`.  It returns a list
  of (gradient, variable) pairs where "gradient" is the gradient
  for "variable".  Note that "gradient" can be a `Tensor`, an
  `IndexedSlices`, or `None` if there is no gradient for the
  given variable.

  Args:
    loss: A Tensor containing the value to minimize.
    var_list: Optional list or tuple of `tf.Variable` to update to minimize
      `loss`.  Defaults to the list of variables collected in the graph
      under the key `GraphKeys.TRAINABLE_VARIABLES`.
    gate_gradients: How to gate the computation of gradients.  Can be
      `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`.
    aggregation_method: Specifies the method used to combine gradient terms.
      Valid values are defined in the class `AggregationMethod`.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.
    grad_loss: Optional. A `Tensor` holding the gradient computed for `loss`.

  Returns:
    A list of (gradient, variable) pairs. Variable is always present, but
    gradient can be `None`.

  Raises:
    TypeError: If `var_list` contains anything else than `Variable` objects.
    ValueError: If some arguments are invalid.
    RuntimeError: If called with eager execution enabled and if `grad_loss`
      is not `None` or `loss` is not callable.

  @compatibility(eager)
  When eager execution is enabled, `loss` should be a Python function that
  takes elements of `var_list` as arguments and computes the value to be
  minimized. If `var_list` is None, `loss` should take no arguments.
  Gradient computation is done with respect to the elements of `var_list` if
  not None, else with respect to any trainable variables created during the
  execution of the `loss` function.
  `gate_gradients`, `aggregation_method`, `colocate_gradients_with_ops` and
  `grad_loss` are ignored when eager execution is enabled.
  @end_compatibility
  """
  # Eager execution: `loss` is a callable and the gradient comes from the
  # backprop helpers; this branch always returns.
  if context.in_eager_mode():
    if grad_loss is not None:
      raise RuntimeError(
          "`grad_loss` argument to Optimizer.compute_gradients "
          "not supported when eager execution is enabled.")
    if not callable(loss):
      raise RuntimeError(
          "`loss` passed to Optimizer.compute_gradients should "
          "be a function when eager execution is enabled.")
    # TODO(agarwal): consider passing parameters to the `loss` function.
    # id:2636 gh:2637
    if var_list is None:
      return backprop.implicit_grad(loss)()
    eager_vars = nest.flatten(var_list)
    eager_grads = backprop.gradients_function(loss)(*eager_vars)
    return list(zip(eager_grads, eager_vars))

  # Graph construction from here on.
  allowed_gates = (
      Optimizer.GATE_NONE, Optimizer.GATE_OP, Optimizer.GATE_GRAPH)
  if gate_gradients not in allowed_gates:
    raise ValueError(
        "gate_gradients must be one of: Optimizer.GATE_NONE, "
        "Optimizer.GATE_OP, Optimizer.GATE_GRAPH.  Not %s" % gate_gradients)

  self._assert_valid_dtypes([loss])
  if grad_loss is not None:
    self._assert_valid_dtypes([grad_loss])

  if var_list is not None:
    targets = nest.flatten(var_list)
  else:
    targets = (
        variables.trainable_variables() +
        ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
  # pylint: disable=protected-access
  targets += ops.get_collection(ops.GraphKeys._STREAMING_MODEL_PORTS)
  # pylint: enable=protected-access
  if not targets:
    raise ValueError("No variables to optimize.")

  # Gradients are taken with respect to each processor's target().
  var_refs = [_get_processor(target).target() for target in targets]
  gate_each_op = (gate_gradients == Optimizer.GATE_OP)
  grads = gradients.gradients(
      loss, var_refs, grad_ys=grad_loss,
      gate_gradients=gate_each_op,
      aggregation_method=aggregation_method,
      colocate_gradients_with_ops=colocate_gradients_with_ops)
  if gate_gradients == Optimizer.GATE_GRAPH:
    grads = control_flow_ops.tuple(grads)

  grads_and_vars = list(zip(grads, targets))
  # Validate dtypes only for variables that got a gradient and are not
  # of resource dtype.
  checked_vars = [
      var for grad, var in grads_and_vars
      if grad is not None and var.dtype != dtypes.resource
  ]
  self._assert_valid_dtypes(checked_vars)
  return grads_and_vars
def _test_minimize_loss_graph(self, d, soft_placement=False,
                              learning_rate=0.2):
  """Runs ten graph-mode update steps under `d` and checks the error shrinks.

  Args:
    d: Object used as the distribution strategy; this test relies on its
      `scope` method and on `d.extended`.
    soft_placement: Value assigned to `allow_soft_placement` on the session
      config.
    learning_rate: Factor applied to the gradient in each `assign_sub`.
  """
  session_config = config_pb2.ConfigProto()
  session_config.allow_soft_placement = soft_placement
  session_config.gpu_options.per_process_gpu_memory_fraction = 0.3

  with context.graph_mode(), \
       ops.Graph().as_default(), \
       self.cached_session(config=session_config) as sess, \
       d.scope():
    kernel = create_variable_like_keras_layer(
        name="kernel", shape=(1, 1), dtype=dtypes.float32)

    def loss(x):
      product = gen_math_ops.mat_mul(x, kernel)
      residual = array_ops.reshape(product, []) - array_ops.identity(1.)
      return residual * residual

    grad_fn = backprop.implicit_grad(loss)

    def update(v, g):
      return v.assign_sub(learning_rate * g)

    one = array_ops.identity([[1.]])

    def step():
      """Perform one optimization step."""
      # Forward and backward pass: yields the (gradient, variable) pairs.
      grads_and_vars = d.extended.call_for_each_replica(
          grad_fn, args=(one, ))

      # Read each variable before and after applying update() to it.
      reads_before, reads_after = [], []
      for grad, var in grads_and_vars:
        pre_update = d.extended.read_var(var)
        reads_before.append(pre_update)
        with ops.control_dependencies([pre_update]):
          reduced = d.extended.reduce_to(
              reduce_util.ReduceOp.SUM, grad, destinations=var)
          update_ops = d.extended.update(
              var, update, args=(reduced, ), group=False)
          with ops.control_dependencies(update_ops):
            reads_after.append(d.extended.read_var(var))
      return reads_before, reads_after

    before_out, after_out = step()
    variables.global_variables_initializer().run()
    for iteration in range(10):
      fetched_before, fetched_after = sess.run((before_out, after_out))
      if iteration == 0:
        # Keep the value seen before the very first update.
        (initial_value,) = fetched_before
      (final_value,) = fetched_after

    error_before = abs(initial_value - 1)
    error_after = abs(final_value - 1)
    # Error should go down
    self.assertLess(error_after, error_before)