Example 1
  def testTrain(self, distribution):
    with distribution.scope():
      mock_model = MiniModel()
      mock_model.call = function.defun(mock_model.call)

      def loss_fn(ctx):
        del ctx
        return mock_model(array_ops.ones([1, 10]))

      gradients_fn = backprop.implicit_grad(loss_fn)
      gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)
      grads_and_vars = distribution.extended.call_for_each_replica(
          gradients_fn, args=(None,))

      optimizer = gradient_descent.GradientDescentOptimizer(0.25)
      update_ops = optimizer._distributed_apply(distribution, grads_and_vars)  # pylint: disable=protected-access

      if not context.executing_eagerly():
        self.evaluate(variables.global_variables_initializer())
        self.evaluate(update_ops)

      updated_var_values = self.evaluate(mock_model.variables)
      # All variables start at 1.0 and get two updates of 0.25.
      self.assertAllEqual(0.5 * np.ones([10, 1]), updated_var_values[0])
      self.assertAllEqual([0.5], updated_var_values[1])
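
In the snippet above, `optimizer_lib.get_filtered_grad_fn` wraps the gradient function so that pairs whose gradient is None are dropped before `_distributed_apply` sees them. A conceptual sketch of that filtering behaviour follows; the helper name is made up for illustration and this is not the library source:

def filtered_grad_fn_sketch(grad_fn):
  # Wrap grad_fn so that (None, variable) pairs are removed from its output.
  def filtered_grad_fn(*args, **kwargs):
    return [(g, v) for g, v in grad_fn(*args, **kwargs) if g is not None]
  return filtered_grad_fn

With a wrapper like this, the optimizer never has to special-case variables that received no gradient on a given replica.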
Example 2
    def _test_minimize_loss_eager(self, d):
        with d.scope():
            l = core.Dense(1, use_bias=False)

            def loss(x):
                # TODO(josh11b): What if this constant was instead a captured
                # value?  Would it need to be a value that has been passed
                # through d.broadcast()?
                y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
                return y * y

            # TODO(isaprykin): Extract implicit_grad+get_filtered_grad_fn into a
            # common `implicit_grad` function and put it in DistributionStrategy.
            grad_fn = backprop.implicit_grad(loss)
            grad_fn = optimizer.get_filtered_grad_fn(grad_fn)

            def update(v, g):
                return v.assign_sub(0.2 * g)

            one = d.broadcast(constant_op.constant([[1.]]))

            def step():
                """Perform one optimization step."""
                # Run forward & backward to get gradients, variables list.
                g_v = d.extended.call_for_each_replica(grad_fn, args=(one, ))

                # Update the variables using the gradients and the update() function.
                before_list = []
                after_list = []
                for g, v in g_v:
                    fetched = d.extended.read_var(v)
                    before_list.append(fetched)
                    # control_dependencies irrelevant but harmless in eager execution
                    with ops.control_dependencies([fetched]):
                        g = d.extended.reduce_to(reduce_util.ReduceOp.SUM,
                                                 g,
                                                 destinations=v)
                        with ops.control_dependencies(
                                d.extended.update(v,
                                                  update,
                                                  args=(g, ),
                                                  group=False)):
                            after_list.append(d.extended.read_var(v))
                return before_list, after_list

            for i in range(10):
                b, a = step()
                if i == 0:
                    before, = b  # pylint: disable=unbalanced-tuple-unpacking
                after, = a  # pylint: disable=unbalanced-tuple-unpacking

            error_before = abs(before.numpy() - 1)
            error_after = abs(after.numpy() - 1)
            # Error should go down
            self.assertLess(error_after, error_before)
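
The same minimize-a-scalar-loss loop can also be expressed with the public tf.distribute API instead of the internal d.extended hooks used in the test. A rough sketch, assuming TensorFlow 2.2 or later in eager mode; names such as replica_step are illustrative and not part of the test above:

import tensorflow as tf

strategy = tf.distribute.MirroredStrategy()
with strategy.scope():
  layer = tf.keras.layers.Dense(1, use_bias=False)
  layer.build(input_shape=(1, 1))  # create the kernel under the strategy scope
  sgd = tf.keras.optimizers.SGD(learning_rate=0.2)

def replica_step(x):
  # Per-replica forward and backward pass.
  with tf.GradientTape() as tape:
    y = tf.reshape(layer(x), []) - 1.0
    loss = y * y
  grads = tape.gradient(loss, layer.trainable_variables)
  # By default apply_gradients aggregates (sums) gradients across replicas
  # when called in a replica context.
  sgd.apply_gradients(zip(grads, layer.trainable_variables))
  return loss

one = tf.constant([[1.0]])
for _ in range(10):
  per_replica_loss = strategy.run(replica_step, args=(one,))
  # Sum the per-replica losses, e.g. for logging.
  loss_sum = strategy.reduce(tf.distribute.ReduceOp.SUM, per_replica_loss, axis=None)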
Example 3
    def _test_minimize_loss_eager(self, d):
        with d.scope():
            kernel = create_variable_like_keras_layer(name="kernel",
                                                      shape=(1, 1),
                                                      dtype=dtypes.float32)

            def loss(x):
                y = array_ops.reshape(gen_math_ops.mat_mul(x, kernel),
                                      []) - array_ops.identity(1.)
                return y * y

            # TODO(isaprykin): Extract implicit_grad+get_filtered_grad_fn into a
            # common `implicit_grad` function and put it in DistributionStrategy.
            grad_fn = backprop.implicit_grad(loss)
            grad_fn = optimizer.get_filtered_grad_fn(grad_fn)

            def update(v, g):
                return v.assign_sub(0.2 * g)

            one = array_ops.identity([[1.]])

            def step():
                """Perform one optimization step."""
                # Run forward & backward to get gradients, variables list.
                g_v = d.extended.call_for_each_replica(grad_fn, args=(one, ))

                # Update the variables using the gradients and the update() function.
                before_list = []
                after_list = []
                for g, v in g_v:
                    fetched = d.extended.read_var(v)
                    before_list.append(fetched)
                    # control_dependencies irrelevant but harmless in eager execution
                    with ops.control_dependencies([fetched]):
                        g = d.extended.reduce_to(reduce_util.ReduceOp.SUM,
                                                 g,
                                                 destinations=v)
                        with ops.control_dependencies(
                                d.extended.update(v,
                                                  update,
                                                  args=(g, ),
                                                  group=False)):
                            after_list.append(d.extended.read_var(v))
                return before_list, after_list

            for i in range(10):
                b, a = step()
                if i == 0:
                    before, = b  # pylint: disable=unbalanced-tuple-unpacking
                after, = a  # pylint: disable=unbalanced-tuple-unpacking

            error_before = abs(before.numpy() - 1)
            error_after = abs(after.numpy() - 1)
            # Error should go down
            self.assertLess(error_after, error_before)
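
create_variable_like_keras_layer is a helper defined elsewhere in the test file and not shown here. Judging by its name and by how its result is used as a matmul kernel, it presumably creates a trainable variable with a Keras-style initializer; a hypothetical stand-in could look like this (an assumption, not the actual helper):

import tensorflow as tf

def create_variable_like_keras_layer(name, shape, dtype):
  # Trainable variable initialized the way a Keras Dense kernel would be
  # (Glorot uniform); the real test helper may use a different initializer.
  initializer = tf.initializers.GlorotUniform()
  return tf.Variable(initializer(shape, dtype=dtype), name=name, trainable=True)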
Example 4
      def step_fn(ctx, *inputs):
        """Function to run one iteration with one input."""
        gradients_fn = backprop.implicit_grad(self._loss_fn)
        gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)

        grads_and_vars = self.distribution.call_for_each_replica(
            gradients_fn, args=(ctx,) + inputs)
        # If threads use layers, then we need to run the first step
        # sequentially, so that layers.build() is not executed in parallel.
        # Otherwise, multiple sets of mirrored variables are going to be
        # created.
        return self._optimizer._distributed_apply(  # pylint: disable=protected-access
            self.distribution, grads_and_vars)
Example 5
            def step_fn(ctx, inputs):
                """Function to run one iteration with one input."""
                gradients_fn = backprop.implicit_grad(self._loss_fn)
                gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)

                grads_and_vars = self.distribution.extended.call_for_each_replica(
                    gradients_fn, args=(ctx, inputs))
                # If threads use layers, then we need to run the first step
                # sequentially, so that layers.build() is not executed in parallel.
                # Otherwise, multiple sets of mirrored variables are going to be
                # created.
                return self._optimizer._distributed_apply(  # pylint: disable=protected-access
                    self.distribution, grads_and_vars)
Example 6
  def step(self, inputs):
    with self._distribution.scope():
      gradients_fn = backprop.implicit_grad(self._loss_fn)
      gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)

      grads_and_vars = self.distribution.call_for_each_tower(
          gradients_fn, inputs, run_concurrently=self._is_run_concurrently)
      # If threads use layers, then we need to run the first step sequentially,
      # so that layers.build() is not executed in parallel.  Otherwise, multiple
      # sets of mirrored variables are going to be created.
      self._is_run_concurrently = True
      return self._optimizer._distributed_apply(  # pylint: disable=protected-access
          self.distribution, grads_and_vars)
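
The _is_run_concurrently flag used above starts out False so that the very first step runs sequentially; layers built on that first step therefore create only one set of mirrored variables, and later steps can run the replicas concurrently. A hypothetical skeleton of the surrounding class, with attribute names taken from the snippet and everything else assumed:

class Step(object):
  """Hypothetical wrapper around one distributed training step."""

  def __init__(self, loss_fn, optimizer, distribution):
    self._loss_fn = loss_fn
    self._optimizer = optimizer
    self._distribution = distribution
    # The first call to step() must run sequentially so layers.build()
    # is not executed in parallel; step() flips this to True afterwards.
    self._is_run_concurrently = False

  @property
  def distribution(self):
    return self._distribution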
Example 7
  def _test_minimize_loss_eager(self, d):
    with d.scope():
      l = core.Dense(1, use_bias=False)

      def loss(x):
        # TODO(josh11b): What if this constant was instead a captured
        # value?  Would it need to be a value that has been passed
        # through d.broadcast()?
        y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
        return y * y
      # TODO(isaprykin): Extract implicit_grad+get_filtered_grad_fn into a
      # common `implicit_grad` function and put it in DistributionStrategy.
      grad_fn = backprop.implicit_grad(loss)
      grad_fn = optimizer.get_filtered_grad_fn(grad_fn)

      def update(v, g):
        return v.assign_sub(0.2 * g)

      one = d.broadcast(constant_op.constant([[1.]]))

      def step():
        """Perform one optimization step."""
        # Run forward & backward to get gradients, variables list.
        g_v = d.call_for_each_tower(grad_fn, one, run_concurrently=l.built)

        # Update the variables using the gradients and the update() function.
        before_list = []
        after_list = []
        for g, v in g_v:
          fetched = d.read_var(v)
          before_list.append(fetched)
          # control_dependencies irrelevant but harmless in eager execution
          with ops.control_dependencies([fetched]):
            g = d.reduce(
                variable_scope.VariableAggregation.SUM, g, destinations=v)
            with ops.control_dependencies(d.update(
                v, update, g, grouped=False)):
              after_list.append(d.read_var(v))
        return before_list, after_list

      for i in range(10):
        b, a = step()
        if i == 0:
          before, = b  # pylint: disable=unbalanced-tuple-unpacking
        after, = a  # pylint: disable=unbalanced-tuple-unpacking

      error_before = abs(before.numpy() - 1)
      error_after = abs(after.numpy() - 1)
      # Error should go down
      self.assertLess(error_after, error_before)
Example 8
    def step(self, inputs):
        with self._distribution.scope():
            gradients_fn = backprop.implicit_grad(self._loss_fn)
            gradients_fn = optimizer_lib.get_filtered_grad_fn(gradients_fn)

            grads_and_vars = self.distribution.call_for_each_tower(
                gradients_fn,
                inputs,
                run_concurrently=self._is_run_concurrently)
            # If threads use layers, then we need to run the first step sequentially,
            # so that layers.build() is not executed in parallel.  Otherwise, multiple
            # sets of mirrored variables are going to be created.
            self._is_run_concurrently = True
            return self._optimizer._distributed_apply(  # pylint: disable=protected-access
                self.distribution, grads_and_vars)
Example 9
  def _test_minimize_loss_eager(self, d):
    with d.scope():
      l = core.Dense(1, use_bias=False)

      def loss(x):
        y = array_ops.reshape(l(x), []) - constant_op.constant(1.)
        return y * y
      # TODO(isaprykin): Extract implicit_grad+get_filtered_grad_fn into a
      # common `implicit_grad` function and put it in DistributionStrategy.
      grad_fn = backprop.implicit_grad(loss)
      grad_fn = optimizer.get_filtered_grad_fn(grad_fn)

      def update(v, g):
        return v.assign_sub(0.2 * g)

      one = constant_op.constant([[1.]])

      def step():
        """Perform one optimization step."""
        # Run forward & backward to get gradients, variables list.
        g_v = d.extended.call_for_each_replica(grad_fn, args=(one,))

        # Update the variables using the gradients and the update() function.
        before_list = []
        after_list = []
        for g, v in g_v:
          fetched = d.extended.read_var(v)
          before_list.append(fetched)
          # control_dependencies irrelevant but harmless in eager execution
          with ops.control_dependencies([fetched]):
            g = d.extended.reduce_to(
                reduce_util.ReduceOp.SUM, g, destinations=v)
            with ops.control_dependencies(
                d.extended.update(v, update, args=(g,), group=False)):
              after_list.append(d.extended.read_var(v))
        return before_list, after_list

      for i in range(10):
        b, a = step()
        if i == 0:
          before, = b  # pylint: disable=unbalanced-tuple-unpacking
        after, = a  # pylint: disable=unbalanced-tuple-unpacking

      error_before = abs(before.numpy() - 1)
      error_after = abs(after.numpy() - 1)
      # Error should go down
      self.assertLess(error_after, error_before)
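
Examples 7 and 9 are the same test written against the older tower-based DistributionStrategy surface and the newer extended, replica-based one. The correspondence that can be read off the two snippets:

  # Legacy tower API (Example 7)                          ->  extended replica API (Example 9)
  # d.call_for_each_tower(grad_fn, one, run_concurrently=...)
  #                                                       ->  d.extended.call_for_each_replica(grad_fn, args=(one,))
  # d.reduce(variable_scope.VariableAggregation.SUM, g, destinations=v)
  #                                                       ->  d.extended.reduce_to(reduce_util.ReduceOp.SUM, g, destinations=v)
  # d.update(v, update, g, grouped=False)                 ->  d.extended.update(v, update, args=(g,), group=False)
  # d.read_var(v)                                         ->  d.extended.read_var(v)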
Example 10
    def apply_gradients(self, loss, grads_and_vars, global_step=None, name=None):
        """Apply gradients to variables.
        This is the second part of `minimize()`. It returns an `Operation` that
        applies gradients.
        Args:
        grads_and_vars: List of (gradient, variable) pairs as returned by
            `compute_gradients()`.
        global_step: Optional `Variable` to increment by one after the
            variables have been updated.
        name: Optional name for the returned operation.  Default to the
            name passed to the `Optimizer` constructor.
        Returns:
        An `Operation` that applies the specified gradients. If `global_step`
        was not None, that operation also increments `global_step`.
        Raises:
        TypeError: If `grads_and_vars` is malformed.
        ValueError: If none of the variables have gradients.
        RuntimeError: If you should use `_distributed_apply()` instead.
        """
        # This is a default implementation of apply_gradients() that can be shared
        # by most optimizers.  It relies on the subclass implementing the following
        # methods: _create_slots(), _prepare(), _apply_dense(), and _apply_sparse().

        # Handle DistributionStrategy case.
        if distribution_strategy_context.get_cross_tower_context():
            raise RuntimeError("Use `_distributed_apply()` instead of "
                               "`apply_gradients()` in a cross-tower context.")
        # TODO(isaprykin): Get rid of `has_distribution_strategy()` check by
        # always calling _distributed_apply(), using the default distribution
        # as needed.
        if distribution_strategy_context.has_distribution_strategy():
            grads_and_vars = optimizer.get_filtered_grad_fn(lambda: grads_and_vars)()
            return distribution_strategy_context.get_tower_context().merge_call(
                self._distributed_apply, grads_and_vars, global_step, name
            )

        # No DistributionStrategy case.
        grads_and_vars = tuple(grads_and_vars)  # Make sure repeat iteration works.
        if not grads_and_vars:
            raise ValueError("No variables provided.")
        converted_grads_and_vars = []
        for grad, var in grads_and_vars:
            if grad is not None:
                try:
                    # Convert the grad to Tensor or IndexedSlices if necessary.
                    grad = ops.convert_to_tensor_or_indexed_slices(grad)
                except TypeError:
                    raise TypeError("Gradient must be convertible to a Tensor or IndexedSlices, or None: %s" % grad)
                if not isinstance(grad, (ops.Tensor, ops.IndexedSlices)):
                    raise TypeError("Gradient must be a Tensor, IndexedSlices, or None: %s" % grad)
            processor = _get_processor(var)
            converted_grads_and_vars.append((grad, var, processor))

        converted_grads_and_vars = tuple(converted_grads_and_vars)
        var_list = [var for grad, var, _ in converted_grads_and_vars if grad is not None]
        if not var_list:
            raise ValueError("No gradients provided for any variable: %s." % ([str(var) for _, var, _ in converted_grads_and_vars],))
        with ops.init_scope():
            self._create_slots(var_list)
        update_ops = []
        with ops.name_scope(name, self._name) as name:
            self._prepare()
            for grad, var, processor in converted_grads_and_vars:
                if grad is None:
                    continue
                # We colocate all ops created in _apply_dense or _apply_sparse
                # on the same device as the variable.
                # TODO(apassos): figure out how to get the variable name here.
                if (context.executing_eagerly() or
                        (isinstance(var, resource_variable_ops.ResourceVariable) and
                         not var._in_graph_mode)):
                    scope_name = ""
                else:
                    scope_name = var.op.name
                with ops.name_scope("update_" + scope_name), ops.colocate_with(var):
                    update_ops.append(processor.update_op(self, loss, grad, global_step))
            if global_step is None:
                apply_updates = self._finish(update_ops, loss, name)
            else:
                with ops.control_dependencies([self._finish(update_ops, loss, "update")]):
                    with ops.colocate_with(global_step):
                        if isinstance(global_step, resource_variable_ops.ResourceVariable):
                            # TODO(apassos): the implicit read in assign_add is slow; consider
                            # making it less so.
                            apply_updates = resource_variable_ops.assign_add_variable_op(
                                resource=global_step.handle,
                                value=ops.convert_to_tensor(
                                    value=1,
                                    dtype=global_step.dtype
                                ),
                                name=name
                            )
                        else:
                            apply_updates = state_ops.assign_add(
                                ref=global_step,
                                value=1,
                                name=name
                            )

            if not context.executing_eagerly():
                if isinstance(apply_updates, ops.Tensor):
                    apply_updates = apply_updates.op
                train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
                if apply_updates not in train_op:
                    train_op.append(apply_updates)

            return apply_updates
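
Unlike the stock tf.compat.v1 Optimizer, this variant's apply_gradients also takes the loss tensor and threads it through to the per-variable update ops and _finish(). For comparison, a minimal sketch of the standard graph-mode call sequence with the unmodified v1 optimizer (variable and placeholder names are illustrative):

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()

x = tf.placeholder(tf.float32, shape=[None, 1])
w = tf.get_variable("w", shape=[1, 1], initializer=tf.ones_initializer())
loss = tf.reduce_mean(tf.square(tf.matmul(x, w) - 1.0))

global_step = tf.train.get_or_create_global_step()
opt = tf.train.GradientDescentOptimizer(0.2)
# compute_gradients() returns a list of (gradient, variable) pairs;
# apply_gradients() returns the update op and also increments global_step.
grads_and_vars = opt.compute_gradients(loss)
train_op = opt.apply_gradients(grads_and_vars, global_step=global_step)

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  for _ in range(10):
    sess.run(train_op, feed_dict={x: [[1.0]]})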