def testDefunCanBeDifferentiatedTwice(self):
    """Differentiates the same defun-wrapped function two times in a row.

    The function computes `v * v` with v = 1.0, so both passes expect 2.0.
    """
    v = resource_variable_ops.ResourceVariable(1.0)

    @function.defun
    def f():
      return v * v

    first_pass = backprop.implicit_grad(f)()
    self.assertAllEqual(first_pass[0][0], 2.0)
    # A second differentiation only works if v is watched again.
    second_pass = backprop.implicit_grad(f)()
    self.assertAllEqual(second_pass[0][0], 2.0)
  def testGradientOfGatherWithDefun(self):
    """Compares the gradient of a gather-sum with and without `defun`."""
    v = resource_variable_ops.ResourceVariable([0.0, 1.0, 2.0])

    def sum_gather():
      return math_ops.reduce_sum(array_ops.gather(v, [1, 2]))

    # Differentiate the plain Python function first, then its defun wrapper.
    plain_result = backprop.implicit_grad(sum_gather)()
    defun_result = backprop.implicit_grad(function.defun(sum_gather))()
    self.assertEqual(len(plain_result), len(defun_result))

    expected = plain_result[0][0]
    actual = defun_result[0][0]
    # Both gradients must agree component by component.
    for field in ('values', 'indices', 'dense_shape'):
      self.assertAllEqual(getattr(expected, field), getattr(actual, field))
  def testDefunDifferentiable(self):
    """Checks the implicit gradient of a defun that computes `v * v`."""
    v = resource_variable_ops.ResourceVariable(1.0)

    @function.defun
    def f():
      return v * v

    # d(v * v)/dv evaluated at v = 1.0 is 2.0.
    grads_and_vars = backprop.implicit_grad(f)()
    self.assertAllEqual(grads_and_vars[0][0], 2.0)
Exemple #4
0
  def testUnconnectedNone(self):
    """Checks that a variable unconnected to the output gets a None gradient.

    The variable is read inside `f`, but the read is discarded and `f` returns
    an unrelated constant.
    """
    v = resource_variable_ops.ResourceVariable(
        1.0, name='testUnconnectedNone')

    def f():
      v.read_value()
      return constant_op.constant(1.0)

    # Identity check: comparing with `==` would go through the gradient
    # object's own equality operator if it were not None.
    self.assertIsNone(backprop.implicit_grad(f)()[0][0])
  def _test_minimize_loss_graph(self, d, soft_placement=False,
                                learning_rate=0.2):
    """Runs ten manual gradient steps in graph mode and checks the error drops.

    Args:
      d: object providing `scope`, `broadcast`, `call_for_each_tower`,
        `read_var`, `reduce` and `update` (presumably a distribution
        strategy -- confirm against callers).
      soft_placement: value stored in the session config's
        `allow_soft_placement` field.
      learning_rate: factor applied to the gradient in each update.
    """
    config = config_pb2.ConfigProto()
    config.allow_soft_placement = soft_placement
    config.gpu_options.per_process_gpu_memory_fraction = 0.3
    with context.graph_mode(), \
         ops.Graph().as_default(), \
         self.test_session(config=config) as sess, \
         d.scope():
      l = core.Dense(1, use_bias=False)

      def loss(x):
        # Squared difference between the scalar layer output and 1.
        # TODO(josh11b): What if this constant was instead a captured
        # value?  Would it need to be a value that has been passed
        # through d.broadcast()?
        y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
        return y * y

      grad_fn = backprop.implicit_grad(loss)

      def update(v, g):
        # Plain gradient-descent step on v.
        return v.assign_sub(learning_rate * g)

      one = d.broadcast(constant_op.constant([[1.]]))

      def step():
        """Perform one optimization step."""
        # Run forward & backward to get gradients, variables list.
        g_v = d.call_for_each_tower(grad_fn, one)

        # Update the variables using the gradients and the update() function.
        before_list = []
        after_list = []
        for g, v in g_v:
          fetched = d.read_var(v)
          before_list.append(fetched)
          # The "before" read is a control dependency of the reduce/update, and
          # the update is a control dependency of the "after" read.
          with ops.control_dependencies([fetched]):
            g = d.reduce(
                variable_scope.VariableAggregation.SUM, g, destinations=v)
            with ops.control_dependencies(d.update(
                v, update, g, grouped=False)):
              after_list.append(d.read_var(v))
        return before_list, after_list

      before_out, after_out = step()
      variables.global_variables_initializer().run()
      for i in range(10):
        b, a = sess.run((before_out, after_out))
        # Keep the value from before the first step and after the last step;
        # the single-target unpacking requires exactly one variable.
        if i == 0:
          before, = b
        after, = a

      error_before = abs(before - 1)
      error_after = abs(after - 1)
      # Error should go down
      self.assertLess(error_after, error_before)
  def testVariableGradient(self):
    """Checks d(v0 * v0)/d(v0) == 2 at v0 = 1, computed inside test_scope."""
    with self.test_scope():
      v0 = resource_variable_ops.ResourceVariable(1.0)

      def f():
        return v0 * v0

      grads_and_vars = backprop.implicit_grad(f)()
    # The value is read back after the scope has been left.
    gradient_value = grads_and_vars[0][0].numpy()
    self.assertEqual(2., gradient_value)
  def testGradients(self):
    """Checks the implicit gradient of a graph_callable computing `v * v`."""
    @graph_callable.graph_callable([])
    def my_function():
      v = variable_scope.get_variable(
          "v", initializer=init_ops.constant_initializer(3.), shape=())
      return v * v

    pairs = backprop.implicit_grad(my_function)()
    # zip(*pairs) regroups the result so that entry 0 collects the first
    # element of every pair.
    regrouped = list(zip(*pairs))
    # v is initialised to 3, so the derivative 2 * v is 6.
    self.assertAllEqual(6., regrouped[0][0])
 def testImplicitGradWithResourceVariable(self):
   """Checks the implicit gradient of `x + 2 + 3` for a resource variable x."""
   x = resource_variable_ops.ResourceVariable(
       initial_value=tensor.Tensor(1.0), name='x')

   def fn():
     # Watch the variable's handle on the tape before reading its value.
     tape.watch(x.handle)
     two = tensor.Tensor(2.0)
     partial_sum = math_ops.add(x.value(), two)
     return math_ops.add(partial_sum, tensor.Tensor(3.0))

   result = backprop.implicit_grad(fn)()
   # NOTE(review): this snippet reads the gradient from index 1 of the first
   # entry -- confirm against the implicit_grad version in use.
   gradient = result[0][1]
   self.assertEqual(gradient.numpy(), 1.0)
  def _test_minimize_loss_graph(self,
                                d,
                                soft_placement=False,
                                learning_rate=0.2):
    """Runs ten manual gradient steps in graph mode and checks the error drops.

    Args:
      d: object providing `scope()` and an `extended` attribute with
        `call_for_each_replica`, `read_var`, `reduce_to` and `update`
        (presumably a distribution strategy -- confirm against callers).
      soft_placement: value stored in the session config's
        `allow_soft_placement` field.
      learning_rate: factor applied to the gradient in each update.
    """
    config = config_pb2.ConfigProto()
    config.allow_soft_placement = soft_placement
    config.gpu_options.per_process_gpu_memory_fraction = 0.3
    with context.graph_mode(), \
         ops.Graph().as_default(), \
         self.cached_session(config=config) as sess, \
         d.scope():
      l = core.Dense(1, use_bias=False)

      def loss(x):
        # Squared difference between the scalar layer output and 1.
        y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
        return y * y

      grad_fn = backprop.implicit_grad(loss)

      def update(v, g):
        # Plain gradient-descent step on v.
        return v.assign_sub(learning_rate * g)

      one = constant_op.constant([[1.]])

      def step():
        """Perform one optimization step."""
        # Run forward & backward to get gradients, variables list.
        g_v = d.extended.call_for_each_replica(grad_fn, args=(one,))

        # Update the variables using the gradients and the update() function.
        before_list = []
        after_list = []
        for g, v in g_v:
          fetched = d.extended.read_var(v)
          before_list.append(fetched)
          # The "before" read is a control dependency of the reduce/update, and
          # the update is a control dependency of the "after" read.
          with ops.control_dependencies([fetched]):
            g = d.extended.reduce_to(
                reduce_util.ReduceOp.SUM, g, destinations=v)
            with ops.control_dependencies(
                d.extended.update(v, update, args=(g,), group=False)):
              after_list.append(d.extended.read_var(v))
        return before_list, after_list

      before_out, after_out = step()
      variables.global_variables_initializer().run()
      for i in range(10):
        b, a = sess.run((before_out, after_out))
        # Keep the value from before the first step and after the last step;
        # the single-target unpacking requires exactly one variable.
        if i == 0:
          before, = b
        after, = a

      error_before = abs(before - 1)
      error_after = abs(after - 1)
      # Error should go down
      self.assertLess(error_after, error_before)
  def testReturningNonTensorRaisesError(self):
    """Checks that a defun-wrapped `apply_gradients` raises a TypeError."""
    opt = momentum.MomentumOptimizer(learning_rate=1.0, momentum=1.0)
    # Replace the bound method on this instance with its defun wrapper.
    opt.apply_gradients = function.defun(opt.apply_gradients)
    var = resource_variable_ops.ResourceVariable(1.0)
    grads_and_vars = backprop.implicit_grad(lambda t: t**2)(var)

    expected_message = '.*must return zero or more Tensors.*'
    with self.assertRaisesRegexp(TypeError, expected_message):
      # TODO(akshayka): We might want to allow defun-ing Python functions
      # that return operations (and just execute the op instead of running it).
      opt.apply_gradients(grads_and_vars)
  def testGPUImplicitGrad(self):
    """Checks the implicit gradient of a variable created and read on GPU 0.

    The test is skipped when no GPU is present, because both the variable
    creation and the read are pinned to 'gpu:0'.
    """
    if not context.context().num_gpus():
      self.skipTest('No GPU found')
    with context.device('gpu:0'):
      v = resource_variable_ops.ResourceVariable(
          constant_op.constant(1.0), name='v')

    def f():
      with context.device('gpu:0'):
        return v.read_value()

    # Copy the gradient back to the host before comparing.
    self.assertEqual(
        backprop.implicit_grad(f)()[0][0].cpu().numpy(), 1.0)
  def testEarlyGradAggregation(self):
    """Counts executed AddN ops under different aggregation thresholds.

    The test temporarily overrides `imperative_grad._MIN_AGGREGATE_COUNT` and
    `imperative_grad._MIN_AGGREGATE_BYTES` and registers a post-execution
    callback. Both are undone in a `finally` clause so that a failing
    assertion cannot leak the patched state into other tests.
    """
    # Needs to be a list so mutations by the callback affect this function.
    add_n = []
    def callback(op_type, unused_1, unused_2, unused_3, unused_4):
      if compat.as_bytes(op_type) == compat.as_bytes('AddN'):
        add_n.append(1)

    # pylint: disable=protected-access
    # Remember the original thresholds before anything is patched.
    old_cnt = imperative_grad._MIN_AGGREGATE_COUNT
    old_bytes = imperative_grad._MIN_AGGREGATE_BYTES
    context.context().add_post_execution_callback(callback)
    try:
      v = resource_variable_ops.ResourceVariable(constant_op.constant(2.0),
                                                 name='v')
      def fn():
        outputs = []
        for _ in range(20):
          outputs.append(v * constant_op.constant(2.0))
        return math_ops.add_n(outputs)

      # By default the aggregation count is 2.
      _ = backprop.implicit_grad(fn)()[0][1]
      self.assertEqual(len(add_n), 2)
      del add_n[:]

      # Reduce the aggregation limit, cause the backprop to do some
      # early aggregation.
      imperative_grad._MIN_AGGREGATE_COUNT = 10
      imperative_grad._MIN_AGGREGATE_BYTES = 1
      _ = backprop.implicit_grad(fn)()
      self.assertEqual(len(add_n), 6)
      del add_n[:]

      # Aggregation is also limited by the memory.
      imperative_grad._MIN_AGGREGATE_BYTES = 10000
      _ = backprop.implicit_grad(fn)()
      self.assertEqual(len(add_n), 2)
    finally:
      # Always restore global state, even when an assertion above fails.
      imperative_grad._MIN_AGGREGATE_COUNT = old_cnt
      imperative_grad._MIN_AGGREGATE_BYTES = old_bytes
      context.context().clear_post_execution_callbacks()
    # pylint: enable=protected-access
  def testDifferentShapesEager(self):
    """Runs two CudnnGRU layers on inputs of different depth, eagerly.

    Checks that kernel caching does not cause sharing of temporary storage
    across different input shapes when executing eagerly.
    """
    with context.eager_mode(), ops.device("gpu:0"):
      narrow_out, _ = cudnn_rnn.CudnnGRU(1, 100)(
          array_ops.zeros([28, 100, 28]))
      wide_out, _ = cudnn_rnn.CudnnGRU(1, 100)(
          array_ops.zeros([28, 100, 100]))
      expected_shape = [28, 100, 100]
      self.assertAllEqual(expected_shape, narrow_out.shape)
      self.assertAllEqual(expected_shape, wide_out.shape)

      def _LossFunc():
        narrow, _ = cudnn_rnn.CudnnGRU(1, 100)(
            array_ops.zeros([28, 100, 28]))
        wide, _ = cudnn_rnn.CudnnGRU(1, 100)(
            array_ops.zeros([28, 100, 100]))
        narrow_total = math_ops.reduce_sum(narrow)
        wide_total = math_ops.reduce_sum(wide)
        return narrow_total + wide_total

      # Only checks that the backward pass runs; the result is discarded.
      backprop.implicit_grad(_LossFunc)()
Exemple #14
0
  def testMultiValueConvertToTensor(self):
    """Checks the gradient through a stack built from a Python list."""
    x = resource_variable_ops.ResourceVariable(
        initial_value=array_ops.constant([1.0]), name='x')

    def fn():
      shifted = math_ops.add(x.value(), 1.0)
      # Make sure convert_to_tensor works correctly with list of TensorNodes.
      stacked = array_ops.stack([shifted, shifted], axis=0)
      return math_ops.reduce_mean(stacked)

    grads_and_vars = backprop.implicit_grad(fn)()
    self.assertAllEqual([1.0], grads_and_vars[0][0])
  def testGradientTensorConversionWithDefun(self):
    """Checks the gradient of a `def_function.function` capturing a variable."""
    three = resource_variable_ops.ResourceVariable(3.0, name='v')

    @def_function.function
    def f(x):
      return math_ops.add(x, three)

    def g(x):
      return f(x)

    grad_fn = backprop.implicit_grad(g)
    gradient = grad_fn(constant_op.constant(1.0))[0][0]
    self.assertAllEqual(gradient, 1.0)
Exemple #16
0
  def testImplicitGradWithResourceVariable(self):
    """Checks the gradient value and variable identity for `x + 2 + 3`."""
    x = resource_variable_ops.ResourceVariable(
        initial_value=constant_op.constant(1.0), name='x')

    def fn():
      two = constant_op.constant(2.0)
      partial_sum = math_ops.add(x.value(), two)
      return math_ops.add(partial_sum, constant_op.constant(3.0))

    first_pair = backprop.implicit_grad(fn)()[0]
    self.assertAllEqual(first_pair[0], 1.0)
    # The second element must be the very same variable object.
    self.assertAllEqual(id(first_pair[1]), id(x))
Exemple #17
0
    def testGradientTensorConversionWithDefun(self):
        """Checks the gradient w.r.t. a tensor captured by a defun."""
        three = tensor.Tensor(3.0)

        @function.defun
        def f(x):
            return math_ops.add(x, three)

        def g(x):
            # Watch the captured tensor before calling the defun.
            tape.watch(three)
            return f(x)

        grad_fn = backprop.implicit_grad(g)
        # NOTE(review): the gradient is read from index 1 of the first entry
        # here -- confirm against the implicit_grad version in use.
        gradient = grad_fn(tensor.Tensor(1.0))[0][1]
        self.assertEqual(gradient.numpy(), 1.0)
Exemple #18
0
  def step(self, inputs):
    """Computes filtered implicit gradients per tower and applies them.

    Args:
      inputs: passed to `call_for_each_tower` as the argument for the
        gradient function.

    Returns:
      The result of the optimizer's `_distributed_apply` call.
    """
    with self._distribution.scope():
      filtered_grad_fn = optimizer_lib.get_filtered_grad_fn(
          backprop.implicit_grad(self._loss_fn))

      per_tower_grads_and_vars = self.distribution.call_for_each_tower(
          filtered_grad_fn,
          inputs,
          run_concurrently=self._is_run_concurrently)
      # If threads use layers, then we need to run the first step sequentially,
      # so that layers.build() is not executed in parallel.  Otherwise, multiple
      # sets of mirrored variables are going to be created.
      self._is_run_concurrently = True
      apply_fn = self._optimizer._distributed_apply  # pylint: disable=protected-access
      return apply_fn(self.distribution, per_tower_grads_and_vars)
Exemple #19
0
    def testGPUImplicitGrad(self):
        """Checks the implicit gradient of a variable read on GPU 0."""
        gpu_count = context.context().num_gpus()
        if not gpu_count:
            self.skipTest('No GPU found')
        with context.device('gpu:0'):
            v = resource_variable_ops.ResourceVariable(
                constant_op.constant(1.0), name='v')

        def f():
            with context.device('gpu:0'):
                tape.watch_variable(v)
                return v.read_value()

        grads_and_vars = backprop.implicit_grad(f)()
        # Copy the gradient back to the host before comparing.
        host_value = grads_and_vars[0][0].cpu().numpy()
        self.assertEqual(host_value, 1.0)
Exemple #20
0
    def testMultiValueConvertToTensor(self):
        """Checks the gradient through a stack built from a Python list."""
        x = resource_variable_ops.ResourceVariable(
            initial_value=array_ops.constant([1.0]), name='x')

        def fn():
            tape.watch_variable(x)
            shifted = math_ops.add(x.value(), 1.0)
            # Make sure convert_to_tensor works correctly with list of TensorNodes.
            stacked = array_ops.stack([shifted, shifted], axis=0)
            return math_ops.reduce_mean(stacked)

        grads_and_vars = backprop.implicit_grad(fn)()
        self.assertAllEqual([1.0], grads_and_vars[0][0])
Exemple #21
0
    def testImplicitGradWithResourceVariable(self):
        """Checks the gradient value and variable identity for `x + 2 + 3`."""
        x = resource_variable_ops.ResourceVariable(
            initial_value=constant_op.constant(1.0), name='x')

        def fn():
            tape.watch_variable(x)
            two = constant_op.constant(2.0)
            partial_sum = math_ops.add(x.value(), two)
            return math_ops.add(partial_sum, constant_op.constant(3.0))

        first_pair = backprop.implicit_grad(fn)()[0]
        self.assertAllEqual(first_pair[0], 1.0)
        # The second element must be the very same variable object.
        self.assertAllEqual(id(first_pair[1]), id(x))
    def testImplicitGradOrdering(self):
        """Checks the order in which implicit_grad reports variables.

        `f` uses v1 before v0, yet the result is expected to list v0 (created
        first) ahead of v1.
        """
        v0 = resource_variable_ops.ResourceVariable(1.0)
        v1 = resource_variable_ops.ResourceVariable(2.0)

        def f():
            x = v1 * v1
            y = v0 * v0
            return x + y

        grads = backprop.implicit_grad(f)()
        ordered_variables = [pair[1] for pair in grads]
        # assertIs names both objects on failure, unlike assertTrue(a is b).
        self.assertIs(ordered_variables[0], v0)
        self.assertIs(ordered_variables[1], v1)
  def testGradientTensorConversionWithDefun(self):
    """Checks the gradient w.r.t. a variable captured by a defun."""
    three = resource_variable_ops.ResourceVariable(3.0)

    @function.defun
    def f(x):
      return math_ops.add(x, three)

    def g(x):
      # Watch the captured variable before calling the defun.
      tape.watch_variable(three)
      return f(x)

    grad_fn = backprop.implicit_grad(g)
    gradient = grad_fn(constant_op.constant(1.0))[0][0]
    self.assertAllEqual(gradient, 1.0)
Exemple #24
0
  def testGPUImplicitGrad(self):
    """Checks the implicit gradient of a variable read on GPU 0."""
    gpu_count = context.context().num_gpus()
    if not gpu_count:
      self.skipTest('No GPU found')
    with context.device('gpu:0'):
      v = resource_variable_ops.ResourceVariable(tensor.Tensor(1.0), name='v')

    def f():
      with context.device('gpu:0'):
        tape.watch(v.handle)
        return v.read_value()

    result = backprop.implicit_grad(f)()
    # Copy the gradient to a CPU tensor before comparing.
    host_value = result[0][1].as_cpu_tensor().numpy()
    self.assertEqual(host_value, 1.0)
  def testGradientTensorConversionWithDefun(self):
    """Checks the gradient w.r.t. a variable captured by a defun."""
    three = resource_variable_ops.ResourceVariable(3.0)

    @function.defun
    def f(x):
      return math_ops.add(x, three)

    def g(x):
      # Watch the captured variable before calling the defun.
      tape.watch_variable(three)
      return f(x)

    grad_fn = backprop.implicit_grad(g)
    gradient = grad_fn(constant_op.constant(1.0))[0][0]
    self.assertEqual(gradient.numpy(), 1.0)
Exemple #26
0
            def step_fn(ctx, *inputs):
                """Runs one iteration: per-replica gradients, then apply.

                `ctx` is prepended to `inputs` and the combined tuple is
                handed to the gradient function on every replica.
                """
                raw_grad_fn = backprop.implicit_grad(self._loss_fn)
                filtered_grad_fn = optimizer_lib.get_filtered_grad_fn(
                    raw_grad_fn)

                replica_args = (ctx, ) + inputs
                per_replica_grads_and_vars = (
                    self.distribution.call_for_each_replica(
                        filtered_grad_fn, args=replica_args))
                # If threads use layers, then we need to run the first step
                # sequentially, so that layers.build() is not executed in parallel.
                # Otherwise, multiple sets of mirrored variables are going to be
                # created.
                apply_fn = self._optimizer._distributed_apply  # pylint: disable=protected-access
                return apply_fn(self.distribution, per_replica_grads_and_vars)
Exemple #27
0
  def testGradientTensorConversionWithDefun(self):
    """Checks the gradient w.r.t. a tensor captured by a defun."""
    three = tensor.Tensor(3.0)

    @function.defun
    def f(x):
      return math_ops.add(x, three)

    def g(x):
      # Watch the captured tensor before calling the defun.
      tape.watch(three)
      return f(x)

    grad_fn = backprop.implicit_grad(g)
    # NOTE(review): the gradient is read from index 1 of the first entry
    # here -- confirm against the implicit_grad version in use.
    gradient = grad_fn(tensor.Tensor(1.0))[0][1]
    self.assertEqual(gradient.numpy(), 1.0)
      def step_fn(ctx, *inputs):
        """Runs one iteration: per-replica gradients, then apply.

        `ctx` is prepended to `inputs` and the combined tuple is handed to the
        gradient function on every replica.
        """
        raw_grad_fn = backprop.implicit_grad(self._loss_fn)
        filtered_grad_fn = optimizer_lib.get_filtered_grad_fn(raw_grad_fn)

        replica_args = (ctx,) + inputs
        per_replica_grads_and_vars = self.distribution.call_for_each_replica(
            filtered_grad_fn, args=replica_args)
        # If threads use layers, then we need to run the first step
        # sequentially, so that layers.build() is not executed in parallel.
        # Otherwise, multiple sets of mirrored variables are going to be
        # created.
        apply_fn = self._optimizer._distributed_apply  # pylint: disable=protected-access
        return apply_fn(self.distribution, per_replica_grads_and_vars)
Exemple #29
0
  def testImplicitGradOrdering(self):
    """Checks the order in which implicit_grad reports variables.

    `f` uses v1 before v0, yet the result is expected to list v0 (created
    first) ahead of v1.
    """
    v0 = resource_variable_ops.ResourceVariable(1.0)
    v1 = resource_variable_ops.ResourceVariable(2.0)

    def f():
      x = v1 * v1
      y = v0 * v0
      return x + y

    grads = backprop.implicit_grad(f)()
    ordered_variables = [pair[1] for pair in grads]
    # assertIs names both objects on failure, unlike assertTrue(a is b).
    self.assertIs(ordered_variables[0], v0)
    self.assertIs(ordered_variables[1], v1)
Exemple #30
0
  def _test_minimize_loss_eager(self, d):
    """Runs ten manual gradient steps and checks that the error decreases.

    Args:
      d: object providing `scope`, `broadcast`, `call_for_each_tower`,
        `read_var`, `reduce` and `update` (presumably a distribution
        strategy -- confirm against callers).
    """
    with d.scope():
      l = core.Dense(1, use_bias=False)

      def loss(x):
        # Squared difference between the scalar layer output and 1.
        # TODO(josh11b): What if this constant was instead a captured
        # value?  Would it need to be a value that has been passed
        # through d.broadcast()?
        y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
        return y * y
      # TODO(isaprykin): Extract implicit_grad+get_filtered_grad_fn into a
      # common `implicit_grad` function and put it in DistributionStrategy.
      grad_fn = backprop.implicit_grad(loss)
      grad_fn = optimizer.get_filtered_grad_fn(grad_fn)

      def update(v, g):
        # Gradient-descent step with a fixed rate of 0.2.
        return v.assign_sub(0.2 * g)

      one = d.broadcast(constant_op.constant([[1.]]))

      def step():
        """Perform one optimization step."""
        # Run forward & backward to get gradients, variables list.
        # `l.built` is passed as run_concurrently, so it is falsy until the
        # layer has been built.
        g_v = d.call_for_each_tower(grad_fn, one, run_concurrently=l.built)

        # Update the variables using the gradients and the update() function.
        before_list = []
        after_list = []
        for g, v in g_v:
          fetched = d.read_var(v)
          before_list.append(fetched)
          # control_dependencies irrelevant but harmless in eager execution
          with ops.control_dependencies([fetched]):
            g = d.reduce(
                variable_scope.VariableAggregation.SUM, g, destinations=v)
            with ops.control_dependencies(d.update(
                v, update, g, grouped=False)):
              after_list.append(d.read_var(v))
        return before_list, after_list

      for i in range(10):
        b, a = step()
        # Keep the value from before the first step and after the last step;
        # the single-target unpacking requires exactly one variable.
        if i == 0:
          before, = b  # pylint: disable=unbalanced-tuple-unpacking
        after, = a  # pylint: disable=unbalanced-tuple-unpacking

      error_before = abs(before.numpy() - 1)
      error_after = abs(after.numpy() - 1)
      # Error should go down
      self.assertLess(error_after, error_before)
Exemple #31
0
    def testGPUImplicitGrad(self):
        """Checks the implicit gradient of a variable read on GPU 0."""
        gpu_count = context.context().num_gpus()
        if not gpu_count:
            self.skipTest('No GPU found')
        with context.device('gpu:0'):
            v = resource_variable_ops.ResourceVariable(
                tensor.Tensor(1.0), name='v')

        def f():
            with context.device('gpu:0'):
                tape.watch(v.handle)
                return v.read_value()

        result = backprop.implicit_grad(f)()
        # Copy the gradient to a CPU tensor before comparing.
        host_value = result[0][1].as_cpu_tensor().numpy()
        self.assertEqual(host_value, 1.0)
  def testGPUImplicitGrad(self):
    """Checks the implicit gradient of a variable read on GPU 0."""
    gpu_count = context.context().num_gpus()
    if not gpu_count:
      self.skipTest('No GPU found')
    with context.device('gpu:0'):
      v = resource_variable_ops.ResourceVariable(
          constant_op.constant(1.0), name='v')

    def f():
      with context.device('gpu:0'):
        tape.watch_variable(v)
        return v.read_value()

    grads_and_vars = backprop.implicit_grad(f)()
    # Copy the gradient back to the host before comparing.
    host_value = grads_and_vars[0][0].cpu().numpy()
    self.assertEqual(host_value, 1.0)
Exemple #33
0
    def step(self, inputs):
        """Computes filtered implicit gradients per tower and applies them.

        Args:
          inputs: passed to `call_for_each_tower` as the argument for the
            gradient function.

        Returns:
          The result of the optimizer's `_distributed_apply` call.
        """
        with self._distribution.scope():
            filtered_grad_fn = optimizer_lib.get_filtered_grad_fn(
                backprop.implicit_grad(self._loss_fn))

            per_tower_grads_and_vars = self.distribution.call_for_each_tower(
                filtered_grad_fn,
                inputs,
                run_concurrently=self._is_run_concurrently)
            # If threads use layers, then we need to run the first step sequentially,
            # so that layers.build() is not executed in parallel.  Otherwise, multiple
            # sets of mirrored variables are going to be created.
            self._is_run_concurrently = True
            apply_fn = self._optimizer._distributed_apply  # pylint: disable=protected-access
            return apply_fn(self.distribution, per_tower_grads_and_vars)
Exemple #34
0
  def _test_minimize_loss_eager(self, d):
    """Runs ten manual gradient steps and checks that the error decreases.

    Args:
      d: object providing `scope()` and an `extended` attribute with
        `call_for_each_replica`, `read_var`, `reduce_to` and `update`
        (presumably a distribution strategy -- confirm against callers).
    """
    with d.scope():
      kernel = create_variable_like_keras_layer(
          name="kernel", shape=(1, 1), dtype=dtypes.float32)
      def loss(x):
        # Squared difference between the scalar product x @ kernel and 1.
        y = array_ops.reshape(
            gen_math_ops.mat_mul(x, kernel), []) - array_ops.identity(1.)
        return y * y
      # TODO(isaprykin): Extract implicit_grad+get_filtered_grad_fn into a
      # common `implicit_grad` function and put it in DistributionStrategy.
      grad_fn = backprop.implicit_grad(loss)
      grad_fn = optimizer.get_filtered_grad_fn(grad_fn)

      def update(v, g):
        # Gradient-descent step with a fixed rate of 0.2.
        return v.assign_sub(0.2 * g)

      one = array_ops.identity([[1.]])

      def step():
        """Perform one optimization step."""
        # Run forward & backward to get gradients, variables list.
        g_v = d.extended.call_for_each_replica(grad_fn, args=(one,))

        # Update the variables using the gradients and the update() function.
        before_list = []
        after_list = []
        for g, v in g_v:
          fetched = d.extended.read_var(v)
          before_list.append(fetched)
          # control_dependencies irrelevant but harmless in eager execution
          with ops.control_dependencies([fetched]):
            g = d.extended.reduce_to(
                reduce_util.ReduceOp.SUM, g, destinations=v)
            with ops.control_dependencies(
                d.extended.update(v, update, args=(g,), group=False)):
              after_list.append(d.extended.read_var(v))
        return before_list, after_list

      for i in range(10):
        b, a = step()
        # Keep the value from before the first step and after the last step;
        # the single-target unpacking requires exactly one variable.
        if i == 0:
          before, = b  # pylint: disable=unbalanced-tuple-unpacking
        after, = a  # pylint: disable=unbalanced-tuple-unpacking

      error_before = abs(before.numpy() - 1)
      error_after = abs(after.numpy() - 1)
      # Error should go down
      self.assertLess(error_after, error_before)
Exemple #35
0
    def test_feature_column_dense_features_gradient(self):
        """Checks the implicit gradient of a scaled DenseFeatures output.

        An identity categorical column with three buckets feeds a
        two-dimensional embedding column with fixed initial values; the
        DenseFeatures output is doubled and differentiated.
        """
        with context.eager_mode():
            sparse_input = sparse_tensor.SparseTensor(
                indices=((0, 0), (1, 0), (2, 0)),
                values=(0, 1, 2),
                dense_shape=(3, 3))

            # Create feature columns (categorical and embedding).
            categorical_column = fc.categorical_column_with_identity(
                key='a', num_buckets=3)

            def _embedding_column_initializer(shape,
                                              dtype,
                                              partition_info=None):
                # All arguments are ignored; the table is hard-coded.
                del shape, dtype, partition_info
                return (
                    (1, 0),  # id 0
                    (0, 1),  # id 1
                    (1, 1))  # id 2

            embedding_column = fc.embedding_column(
                categorical_column,
                dimension=2,
                initializer=_embedding_column_initializer)

            dense_features = df.DenseFeatures([embedding_column])
            features = {'a': sparse_input}

            def scale_matrix():
                return 2 * dense_features(features)

            # Sanity check: Verify that scale_matrix returns the correct output.
            expected_output = [[2, 0], [0, 2], [2, 2]]
            self.assertAllEqual(expected_output, scale_matrix())

            # Check that the returned gradient is correct.
            grads_and_vars = backprop.implicit_grad(scale_matrix)()
            first_gradient = grads_and_vars[0][0]
            gradient_values = first_gradient.values

            self.assertAllEqual([0, 1, 2], first_gradient.indices)
            self.assertAllEqual([[2, 2], [2, 2], [2, 2]], gradient_values)
  def _test_minimize_loss_eager(self, d):
    """Runs ten manual gradient steps and checks that the error decreases.

    Args:
      d: object providing `scope()` and an `extended` attribute with
        `call_for_each_replica`, `read_var`, `reduce_to` and `update`
        (presumably a distribution strategy -- confirm against callers).
    """
    with d.scope():
      l = core.Dense(1, use_bias=False)

      def loss(x):
        # Squared difference between the scalar layer output and 1.
        y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
        return y * y
      # TODO(isaprykin): Extract implicit_grad+get_filtered_grad_fn into a
      # common `implicit_grad` function and put it in DistributionStrategy.
      grad_fn = backprop.implicit_grad(loss)
      grad_fn = optimizer.get_filtered_grad_fn(grad_fn)

      def update(v, g):
        # Gradient-descent step with a fixed rate of 0.2.
        return v.assign_sub(0.2 * g)

      one = constant_op.constant([[1.]])

      def step():
        """Perform one optimization step."""
        # Run forward & backward to get gradients, variables list.
        g_v = d.extended.call_for_each_replica(grad_fn, args=(one,))

        # Update the variables using the gradients and the update() function.
        before_list = []
        after_list = []
        for g, v in g_v:
          fetched = d.extended.read_var(v)
          before_list.append(fetched)
          # control_dependencies irrelevant but harmless in eager execution
          with ops.control_dependencies([fetched]):
            g = d.extended.reduce_to(
                reduce_util.ReduceOp.SUM, g, destinations=v)
            with ops.control_dependencies(
                d.extended.update(v, update, args=(g,), group=False)):
              after_list.append(d.extended.read_var(v))
        return before_list, after_list

      for i in range(10):
        b, a = step()
        # Keep the value from before the first step and after the last step;
        # the single-target unpacking requires exactly one variable.
        if i == 0:
          before, = b  # pylint: disable=unbalanced-tuple-unpacking
        after, = a  # pylint: disable=unbalanced-tuple-unpacking

      error_before = abs(before.numpy() - 1)
      error_after = abs(after.numpy() - 1)
      # Error should go down
      self.assertLess(error_after, error_before)
Exemple #37
0
    def testImplicitGradientsCustomGradientAndCachedVariableValue(self):
        """Checks that implicit_grad uses a custom gradient on a variable.

        The custom gradient returns `2 * dr * x + 1`; the test expects 7 for
        x = 3.
        """
        @custom_gradient.custom_gradient
        def my_square(x):
            def grad(dr):
                return 2 * dr * x + 1

            return math_ops.square(x), grad

        x = resource_variable_ops.ResourceVariable(
            initial_value=3, name='X.' + self.id())

        def f():
            return my_square(x)

        grads_and_vars = backprop.implicit_grad(f)()
        self.assertEqual(1, len(grads_and_vars))
        gradient, variable = grads_and_vars[0]
        self.assertAllEqual(7, gradient)
        self.assertAllEqual(x, variable)
  def testImplicitGradOverEmbeddingLookup(self):
    """Compares implicit gradients of an embedding lookup with graph mode.

    The same loss, `1.0 - embedding_lookup(embedding, x)`, is differentiated
    once with `backprop.implicit_grad` and once with `gradients.gradients` in
    graph mode. The gradients and the variables after one gradient-descent
    step must match. Both optimizers share `lrn_rate` so that the two sides
    cannot drift apart if the rate is ever changed.
    """
    batch_size = 8
    embedding_size = 512
    vocab_size = 1000
    lrn_rate = 0.1
    random_init = random_ops.random_uniform([vocab_size, embedding_size])

    x = array_ops.ones((batch_size), dtypes.int64)
    embedding = resource_variable_ops.ResourceVariable(
        initial_value=random_init, dtype=dtypes.float32, name='embedding')

    def f():
      tape.watch_variable(embedding)
      embedded_x = embedding_ops.embedding_lookup(embedding, x)
      return constant_op.constant(1.0, dtypes.float32) - embedded_x

    grad = backprop.implicit_grad(f)()[0][0]
    opt = training.GradientDescentOptimizer(lrn_rate)

    with context.graph_mode(), self.test_session():
      tf_x = array_ops.ones((batch_size), dtypes.int64)
      # TODO(ashankar,apassos): Change to ResourceVariable.
      # The graph variable starts from the same values as the eager one.
      tf_embedding = variables.Variable(
          random_init.numpy(), name='tf_embedding')
      tf_embedded_x = embedding_ops.embedding_lookup(tf_embedding, tf_x)
      tf_y = 1.0 - tf_embedded_x
      tf_grad = gradients.gradients(tf_y, [tf_embedding])[0]
      tf_opt = training.GradientDescentOptimizer(lrn_rate)
      tf_embedding.initializer.run()

      self.assertAllClose(tf_grad.indices.eval(), grad.indices)
      self.assertAllClose(tf_grad.values.eval(), grad.values)

      tf_opt.apply_gradients([(tf_grad, tf_embedding)]).run()
      expected = tf_embedding.eval()
    # Apply the eager gradient outside graph mode and compare the results.
    opt.apply_gradients([(grad, embedding)])
    self.assertAllClose(expected, embedding.read_value())
Exemple #39
0
    def testImplicitGradOverEmbeddingLookup(self):
        """Compares implicit gradients of an embedding lookup with graph mode.

        The same loss, `1.0 - embedding_lookup(embedding, x)`, is
        differentiated once with `backprop.implicit_grad` and once with
        `gradients.gradients` in graph mode. The gradients and the variables
        after one gradient-descent step must match. Both optimizers share
        `lrn_rate` so that the two sides cannot drift apart if the rate is
        ever changed.
        """
        batch_size = 8
        embedding_size = 512
        vocab_size = 1000
        lrn_rate = 0.1
        random_init = random_ops.random_uniform([vocab_size, embedding_size])

        x = array_ops.ones((batch_size), dtypes.int64)
        embedding = resource_variable_ops.ResourceVariable(
            initial_value=random_init, dtype=dtypes.float32, name='embedding')

        def f():
            tape.watch_variable(embedding)
            embedded_x = embedding_ops.embedding_lookup(embedding, x)
            return constant_op.constant(1.0, dtypes.float32) - embedded_x

        grad = backprop.implicit_grad(f)()[0][0]
        opt = training.GradientDescentOptimizer(lrn_rate)

        with context.graph_mode(), self.test_session():
            tf_x = array_ops.ones((batch_size), dtypes.int64)
            # TODO(ashankar,apassos): Change to ResourceVariable.
            # The graph variable starts from the same values as the eager one.
            tf_embedding = variables.Variable(random_init.numpy(),
                                              name='tf_embedding')
            tf_embedded_x = embedding_ops.embedding_lookup(tf_embedding, tf_x)
            tf_y = 1.0 - tf_embedded_x
            tf_grad = gradients.gradients(tf_y, [tf_embedding])[0]
            tf_opt = training.GradientDescentOptimizer(lrn_rate)
            tf_embedding.initializer.run()

            self.assertAllClose(tf_grad.indices.eval(), grad.indices)
            self.assertAllClose(tf_grad.values.eval(), grad.values)

            tf_opt.apply_gradients([(tf_grad, tf_embedding)]).run()
            expected = tf_embedding.eval()
        # Apply the eager gradient outside graph mode and compare the results.
        opt.apply_gradients([(grad, embedding)])
        self.assertAllClose(expected, embedding.read_value())
Exemple #40
0
  def testImplicitGradientsCustomGradientAndCachedVariableValue(self):
    """Checks that implicit_grad uses a custom gradient on a variable.

    The custom gradient returns `2 * dr * x + 1`; the test expects 7 for
    x = 3.
    """

    @custom_gradient.custom_gradient
    def my_square(x):
      def grad(dr):
        return 2 * dr * x + 1

      return math_ops.square(x), grad

    x = resource_variable_ops.ResourceVariable(
        initial_value=3., name='X.' + self.id())

    def f():
      return my_square(x)

    grads_and_vars = backprop.implicit_grad(f)()
    self.assertEqual(1, len(grads_and_vars))
    gradient, variable = grads_and_vars[0]
    self.assertAllEqual(7, gradient)
    self.assertAllEqual(x, variable)
    def step():
      """Returns element [0][0] of the implicit_grad result for `v * v`."""
      def inner():
        return v * v

      grads_and_vars = backprop.implicit_grad(inner)()
      return grads_and_vars[0][0]
Exemple #42
0
        def step():
            """Return the gradient from the first pair produced by implicit_grad.

            `inner` squares `v`, a name that must be supplied by the enclosing
            scope, after explicitly registering it via `tape.watch_variable`.
            """

            def inner():
                # Register the variable with the tape before it is used.
                tape.watch_variable(v)
                return v * v

            grads_and_vars = backprop.implicit_grad(inner)()
            return grads_and_vars[0][0]
Exemple #43
0
        def step():
            """Return the gradient from the first pair produced by implicit_grad.

            `inner` squares `v`, a name that must be supplied by the enclosing
            scope.
            """

            def inner():
                return v * v

            grads_and_vars = backprop.implicit_grad(inner)()
            return grads_and_vars[0][0]
Exemple #44
0
        def step():
            """Return element [0][1] of the result of implicit_grad(inner)().

            `inner` squares `v`, a name that must be supplied by the enclosing
            scope, after calling `tape.watch` on the variable's handle.

            NOTE(review): if implicit_grad yields (gradient, variable) pairs,
            index 1 selects the variable rather than the gradient -- confirm
            this is intended by the caller.
            """

            def inner():
                # Watch the underlying handle rather than the variable object.
                tape.watch(v.handle)
                return v * v

            grads_and_vars = backprop.implicit_grad(inner)()
            return grads_and_vars[0][1]
    def _test_minimize_loss_graph(self,
                                  d,
                                  soft_placement=False,
                                  learning_rate=0.2):
        """Run ten hand-written gradient steps in graph mode and check progress.

        A `core.Dense(1, use_bias=False)` layer is trained on the squared
        difference between its (reshaped) output and 1. The graph returned by
        `step()` is run ten times; the variable value fetched before the first
        update and the value fetched after the last update are compared by
        their absolute distance from 1, which must have decreased.

        Args:
          d: Object providing `scope()`, `broadcast()` and the `extended`
            API used below (presumably a distribution strategy -- confirm
            against callers).
          soft_placement: Value assigned to `allow_soft_placement` in the
            session config.
          learning_rate: Factor applied to the gradient in each update.
        """
        session_config = config_pb2.ConfigProto()
        session_config.allow_soft_placement = soft_placement
        session_config.gpu_options.per_process_gpu_memory_fraction = 0.3
        with context.graph_mode(), \
             ops.Graph().as_default(), \
             self.cached_session(config=session_config) as sess, \
             d.scope():
            dense = core.Dense(1, use_bias=False)

            def loss(x):
                # TODO(josh11b): What if this constant was instead a captured
                # value?  Would it need to be a value that has been passed
                # through d.broadcast()?
                residual = (array_ops.reshape(dense(x), []) -
                            constant_op.constant(1.))
                return residual * residual

            grad_fn = backprop.implicit_grad(loss)

            def update(var, grad):
                return var.assign_sub(learning_rate * grad)

            one = d.broadcast(constant_op.constant([[1.]]))

            def step():
                """Perform one optimization step."""
                # Forward and backward pass: yields (gradient, variable) pairs.
                grads_and_vars = d.extended.call_for_each_replica(
                    grad_fn, args=(one, ))

                # Apply update() to every variable, recording reads on both
                # sides of the update.
                values_before = []
                values_after = []
                for grad, var in grads_and_vars:
                    pre_update = d.extended.read_var(var)
                    values_before.append(pre_update)
                    # The reduce and the update must not run before the
                    # "before" read.
                    with ops.control_dependencies([pre_update]):
                        summed_grad = d.extended.reduce_to(
                            reduce_util.ReduceOp.SUM, grad, destinations=var)
                        update_ops = d.extended.update(
                            var, update, args=(summed_grad, ), group=False)
                        # The "after" read must wait for the update.
                        with ops.control_dependencies(update_ops):
                            values_after.append(d.extended.read_var(var))
                return values_before, values_after

            before_out, after_out = step()
            variables.global_variables_initializer().run()
            for iteration in range(10):
                fetched_before, fetched_after = sess.run(
                    (before_out, after_out))
                if iteration == 0:
                    # Remember the value from before the very first update.
                    (before,) = fetched_before
                (after,) = fetched_after

            error_before = abs(before - 1)
            error_after = abs(after - 1)
            # Error should go down
            self.assertLess(error_after, error_before)
 def train():
   """Compute implicit gradients of `loss` and pass them to `optimizer`.

   Both `loss` and `optimizer` come from the enclosing scope.
   """
   grads_and_vars = backprop.implicit_grad(loss)()
   optimizer.apply_gradients(grads_and_vars)
 def train():
   """Create a variable, apply one optimizer update, and return its value.

   A new ResourceVariable initialised to 1.0 is passed to the implicit
   gradient function of `loss`; the result is handed to
   `optimizer.apply_gradients`. `loss` and `optimizer` come from the
   enclosing scope.
   """
   var = resource_variable_ops.ResourceVariable(1.0)
   grad_fn = backprop.implicit_grad(loss)
   optimizer.apply_gradients(grad_fn(var))
   return var.read_value()
Exemple #48
0
 def train():
     """Create `self.v`, apply one optimizer update, and return its value.

     A new ResourceVariable initialised to 1.0 is stored on `self.v` and
     passed to the implicit gradient function of `loss`; the result is handed
     to `optimizer.apply_gradients`. `self`, `loss` and `optimizer` come from
     the enclosing scope.
     """
     self.v = resource_variable_ops.ResourceVariable(1.0)
     grad_fn = backprop.implicit_grad(loss)
     optimizer.apply_gradients(grad_fn(self.v))
     return self.v.read_value()
Exemple #49
0
 def train():
     """Compute implicit gradients of `loss` and pass them to `optimizer`.

     Both `loss` and `optimizer` come from the enclosing scope.
     """
     grads_and_vars = backprop.implicit_grad(loss)()
     optimizer.apply_gradients(grads_and_vars)
Exemple #50
0
    def compute_gradients(self,
                          loss,
                          var_list=None,
                          gate_gradients=GATE_OP,
                          aggregation_method=None,
                          colocate_gradients_with_ops=False,
                          grad_loss=None):
        """Compute gradients of `loss` for the variables in `var_list`.

        This is the first part of `minimize()`.  The result is a list of
        (gradient, variable) pairs, where "gradient" is the gradient for
        "variable".  A "gradient" may be a `Tensor`, an `IndexedSlices`, or
        `None` when there is no gradient for that variable.

        Args:
          loss: A Tensor containing the value to minimize.
          var_list: Optional list or tuple of `tf.Variable` to update to
            minimize `loss`.  When omitted in graph mode, the result of
            `variables.trainable_variables()` plus the contents of the
            `GraphKeys.TRAINABLE_RESOURCE_VARIABLES` collection are used.
          gate_gradients: How to gate the computation of gradients.  Can be
            `GATE_NONE`, `GATE_OP`, or `GATE_GRAPH`.
          aggregation_method: Specifies the method used to combine gradient
            terms.  Valid values are defined in the class `AggregationMethod`.
          colocate_gradients_with_ops: If True, try colocating gradients with
            the corresponding op.
          grad_loss: Optional. A `Tensor` holding the gradient computed for
            `loss`.

        Returns:
          A list of (gradient, variable) pairs. Variable is always present,
          but gradient can be `None`.

        Raises:
          TypeError: If `var_list` contains anything else than `Variable`
            objects.
          ValueError: If some arguments are invalid, e.g. `gate_gradients` is
            not one of the three gate constants or there are no variables to
            optimize.
          RuntimeError: If called with eager execution enabled and if
            `grad_loss` is not `None` or `loss` is not callable.

        @compatibility(eager)
        When eager execution is enabled, `loss` should be a Python function
        that takes elements of `var_list` as arguments and computes the value
        to be minimized. If `var_list` is None, `loss` should take no
        arguments. Gradient computation is done with respect to the elements
        of `var_list` if not None, else with respect to any trainable
        variables created during the execution of the `loss` function.
        `gate_gradients`, `aggregation_method` and
        `colocate_gradients_with_ops` are ignored when eager execution is
        enabled, and `grad_loss` must be left as `None`.
        @end_compatibility
        """
        # Eager execution: differentiate the `loss` callable directly.
        if context.in_eager_mode():
            if grad_loss is not None:
                raise RuntimeError(
                    "`grad_loss` argument to Optimizer.compute_gradients "
                    "not supported when eager execution is enabled.")
            if not callable(loss):
                raise RuntimeError(
                    "`loss` passed to Optimizer.compute_gradients should "
                    "be a function when eager execution is enabled.")
            # TODO (agarwal): consider passing parameters to the `loss` function. id:2636 gh:2637
            if var_list is None:
                return backprop.implicit_grad(loss)()
            eager_vars = nest.flatten(var_list)
            eager_grads = backprop.gradients_function(loss)(*eager_vars)
            return list(zip(eager_grads, eager_vars))

        # Graph mode from here on.
        valid_gates = (Optimizer.GATE_NONE, Optimizer.GATE_OP,
                       Optimizer.GATE_GRAPH)
        if gate_gradients not in valid_gates:
            raise ValueError(
                "gate_gradients must be one of: Optimizer.GATE_NONE, "
                "Optimizer.GATE_OP, Optimizer.GATE_GRAPH.  Not %s" %
                gate_gradients)

        self._assert_valid_dtypes([loss])
        if grad_loss is not None:
            self._assert_valid_dtypes([grad_loss])

        if var_list is not None:
            target_vars = nest.flatten(var_list)
        else:
            target_vars = (
                variables.trainable_variables() +
                ops.get_collection(ops.GraphKeys.TRAINABLE_RESOURCE_VARIABLES))
        # pylint: disable=protected-access
        target_vars += ops.get_collection(ops.GraphKeys._STREAMING_MODEL_PORTS)
        # pylint: enable=protected-access
        if not target_vars:
            raise ValueError("No variables to optimize.")

        processors = [_get_processor(var) for var in target_vars]
        var_refs = [processor.target() for processor in processors]
        # Per-op gating is delegated to gradients(); whole-graph gating is
        # applied afterwards via control_flow_ops.tuple.
        grads = gradients.gradients(
            loss,
            var_refs,
            grad_ys=grad_loss,
            gate_gradients=(gate_gradients == Optimizer.GATE_OP),
            aggregation_method=aggregation_method,
            colocate_gradients_with_ops=colocate_gradients_with_ops)
        if gate_gradients == Optimizer.GATE_GRAPH:
            grads = control_flow_ops.tuple(grads)

        grads_and_vars = list(zip(grads, target_vars))
        # Only variables that received a gradient and are not of resource
        # dtype are dtype-checked.
        checked_vars = [
            var for grad, var in grads_and_vars
            if grad is not None and var.dtype != dtypes.resource
        ]
        self._assert_valid_dtypes(checked_vars)
        return grads_and_vars
    def _test_minimize_loss_graph(self,
                                  d,
                                  soft_placement=False,
                                  learning_rate=0.2):
        """Run ten hand-written gradient steps in graph mode and check progress.

        A (1, 1) float32 "kernel" variable is trained on the squared
        difference between the (reshaped) product `x @ kernel` and 1. The
        graph returned by `step()` is run ten times; the variable value
        fetched before the first update and the value fetched after the last
        update are compared by their absolute distance from 1, which must
        have decreased.

        Args:
          d: Object providing `scope()` and the `extended` API used below
            (presumably a distribution strategy -- confirm against callers).
          soft_placement: Value assigned to `allow_soft_placement` in the
            session config.
          learning_rate: Factor applied to the gradient in each update.
        """
        session_config = config_pb2.ConfigProto()
        session_config.allow_soft_placement = soft_placement
        session_config.gpu_options.per_process_gpu_memory_fraction = 0.3
        with context.graph_mode(), \
             ops.Graph().as_default(), \
             self.cached_session(config=session_config) as sess, \
             d.scope():
            kernel = create_variable_like_keras_layer(
                name="kernel", shape=(1, 1), dtype=dtypes.float32)

            def loss(x):
                residual = (
                    array_ops.reshape(gen_math_ops.mat_mul(x, kernel), []) -
                    array_ops.identity(1.))
                return residual * residual

            grad_fn = backprop.implicit_grad(loss)

            def update(var, grad):
                return var.assign_sub(learning_rate * grad)

            one = array_ops.identity([[1.]])

            def step():
                """Perform one optimization step."""
                # Forward and backward pass: yields (gradient, variable) pairs.
                grads_and_vars = d.extended.call_for_each_replica(
                    grad_fn, args=(one, ))

                # Apply update() to every variable, recording reads on both
                # sides of the update.
                values_before = []
                values_after = []
                for grad, var in grads_and_vars:
                    pre_update = d.extended.read_var(var)
                    values_before.append(pre_update)
                    # The reduce and the update must not run before the
                    # "before" read.
                    with ops.control_dependencies([pre_update]):
                        summed_grad = d.extended.reduce_to(
                            reduce_util.ReduceOp.SUM, grad, destinations=var)
                        update_ops = d.extended.update(
                            var, update, args=(summed_grad, ), group=False)
                        # The "after" read must wait for the update.
                        with ops.control_dependencies(update_ops):
                            values_after.append(d.extended.read_var(var))
                return values_before, values_after

            before_out, after_out = step()
            variables.global_variables_initializer().run()
            for iteration in range(10):
                fetched_before, fetched_after = sess.run(
                    (before_out, after_out))
                if iteration == 0:
                    # Remember the value from before the very first update.
                    (before,) = fetched_before
                (after,) = fetched_after

            error_before = abs(before - 1)
            error_after = abs(after - 1)
            # Error should go down
            self.assertLess(error_after, error_before)