Example #1
  def _assign_func(self, *args, **kwargs):
    f = kwargs.pop("f")
    if distribution_strategy_context.get_cross_tower_context():
      update_device = distribute_lib.get_update_device()
      if update_device is not None:
        # We are calling an assign function in an update context.
        return f(self._v, *args, **kwargs)

      # We are calling an assign function in cross tower context, wrap it in an
      # update call.
      return distribution_strategy_context.get_distribution_strategy().update(
          self, f, *args, **kwargs)
    else:
      assert distribution_strategy_context.get_tower_context()
      # We are calling an assign function in tower context.
      # We reduce the value we want to assign/add/sub. More details about how we
      # handle the different use cases can be found in the _reduce method.
      # We call the function with the reduced value.
      if self._aggregation == vs.VariableAggregation.NONE:
        raise ValueError("You must specify an aggregation method to update a "
                         "a variable in Tower Context.")

      def merge_fn(strategy, value, *other_args, **other_kwargs):
        return strategy.update(
            self, f,
            strategy.reduce(
                aggregation=self._aggregation, value=value, destinations=self),
            *other_args, **other_kwargs)

      return distribution_strategy_context.get_tower_context().merge_call(
          merge_fn, *args, **kwargs)
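The tower-context branch above boils down to "reduce the per-tower values, then apply the assign once". A minimal, framework-free sketch of that behaviour with made-up numbers and MEAN aggregation (the real reduction happens through the strategy's `reduce`/`update` calls):

```python
# Illustrative only: per-tower deltas passed to assign_add on two towers.
per_tower_deltas = [1.0, 3.0]

# MEAN aggregation: reduce the per-tower values to a single value...
reduced = sum(per_tower_deltas) / len(per_tower_deltas)   # 2.0

# ...then update the variable once with the reduced value, which is what
# strategy.update(self, f, strategy.reduce(...)) does for every device copy.
variable = 10.0
variable += reduced
print(variable)   # 12.0 on every device, keeping the copies in sync
```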
Example #2
    def _assign_func(self, *args, **kwargs):
        f = kwargs.pop("f")
        if distribution_strategy_context.get_cross_tower_context():
            update_device = distribute_lib.get_update_device()
            if update_device is not None:
                # We are calling an assign function in an update context.
                return f(self._v, *args, **kwargs)

            # We are calling an assign function in cross tower context, wrap it in an
            # update call.
            return distribution_strategy_context.get_distribution_strategy(
            ).update(self, f, *args, **kwargs)
        else:
            assert distribution_strategy_context.get_tower_context()
            # We are calling an assign function in tower context.
            # We reduce the value we want to assign/add/sub. More details about how we
            # handle the different use cases can be found in the _reduce method.
            # We call the function with the reduced value.
            if self._aggregation == vs.VariableAggregation.NONE:
                raise ValueError(
                    "You must specify an aggregation method to update a "
                    "a variable in Tower Context.")

            def merge_fn(strategy, value, *other_args, **other_kwargs):
                return strategy.update(
                    self, f,
                    strategy.reduce(aggregation=self._aggregation,
                                    value=value,
                                    destinations=self), *other_args,
                    **other_kwargs)

            return distribution_strategy_context.get_tower_context(
            ).merge_call(merge_fn, *args, **kwargs)
Example #3
def _assert_in_default_state(t):
  t.assertIs(distribution_strategy_context._get_default_tower_context(),
             distribution_strategy_context.get_tower_context())
  t.assertIs(None, distribution_strategy_context.get_cross_tower_context())
  t.assertIs(distribution_strategy_context._get_default_distribution_strategy(),
             distribution_strategy_context.get_distribution_strategy())
  t.assertFalse(distribution_strategy_context.has_distribution_strategy())
Example #4
def vq_discrete_bottleneck(x, hparams):
    """Simple vector quantized discrete bottleneck."""
    bottleneck_size = 2**hparams.bottleneck_bits
    x_shape = commons.shape_list(x)
    x = tf.reshape(x, [-1, hparams.hidden_size])
    x_means_hot, e_loss = vq_nearest_neighbor(x, hparams)

    if hparams.bottleneck_kind == "mog":
        loss = hparams.beta * e_loss
    else:
        tf.logging.info("Using EMA with beta = {}".format(hparams.beta))
        means, ema_means, ema_count = (hparams.means, hparams.ema_means,
                                       hparams.ema_count)
        # Update the ema variables
        updated_ema_count = commons.assign_moving_average(ema_count,
                                                          tf.reduce_sum(
                                                              x_means_hot,
                                                              axis=0),
                                                          hparams.decay,
                                                          zero_debias=False)

        dw = tf.matmul(x_means_hot, x, transpose_a=True)
        updated_ema_means = commons.assign_moving_average(ema_means,
                                                          dw,
                                                          hparams.decay,
                                                          zero_debias=False)
        n = tf.reduce_sum(updated_ema_count, axis=-1, keepdims=True)
        updated_ema_count = ((updated_ema_count + hparams.epsilon) /
                             (n + bottleneck_size * hparams.epsilon) * n)
        # pylint: disable=g-no-augmented-assignment
        updated_ema_means = updated_ema_means / tf.expand_dims(
            updated_ema_count, axis=-1)
        # pylint: enable=g-no-augmented-assignment
        with tf.control_dependencies([e_loss]):
            # distribution_strategy
            def update_fn(v, value):
                return tf.assign(v, value)

            tower_context = distribution_strategy_context.get_tower_context()
            if tower_context:

                def merge_fn(strategy, v, value):
                    value = strategy.reduce(tf.VariableAggregation.MEAN, value,
                                            v)
                    return strategy.update(v, update_fn, value)

                update_means = tower_context.merge_call(
                    merge_fn, means, updated_ema_means)
            else:
                strategy = distribution_strategy_context.get_cross_tower_context(
                )
                update_means = strategy.update(means, update_fn,
                                               updated_ema_means)
            with tf.control_dependencies([update_means]):
                loss = hparams.beta * e_loss

    discrete = tf.reshape(x_means_hot, x_shape[:-1] + [bottleneck_size])
    return discrete, loss
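The count update in the EMA branch above is a Laplace-smoothing step: clusters with zero assignments keep a tiny share of the total mass instead of collapsing to zero. A quick NumPy sketch with made-up counts and epsilon:

```python
import numpy as np

ema_count = np.array([5.0, 0.0, 3.0, 2.0])   # hypothetical per-cluster counts
epsilon = 1e-5
bottleneck_size = ema_count.shape[0]

n = ema_count.sum()                          # total mass, preserved by the update
smoothed = (ema_count + epsilon) / (n + bottleneck_size * epsilon) * n

print(smoothed)        # the zero count becomes ~1e-5 rather than exactly 0
print(smoothed.sum())  # ~10.0, same total as before
```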
Example #5
 def op(self):
     # We want cross-tower code that does some var.op.X calls
     # to work (even if the current device isn't in self.devices), but
     # other uses of var.op in a cross-tower context to fail.
     if distribution_strategy_context.get_cross_tower_context():
         return DistributedVarOp(self._primary_var.op.name,
                                 self._primary_var.op.graph,
                                 self._primary_var.op.type)
     return self.get().op
Example #6
 def op(self):
   # We want cross-tower code that does some var.op.X calls
   # to work (even if the current device isn't in self.devices), but
   # other uses of var.op in a cross-tower context to fail.
   if distribution_strategy_context.get_cross_tower_context():
     return DistributedVarOp(self._primary_var.op.name,
                             self._primary_var.op.graph,
                             self._primary_var.op.type)
   return self.get().op
Example #7
 def set_non_tensor_output(self, name, output):
   """Set `output` with `name` to be captured as a non tensor output."""
   if distribution_strategy_context.get_cross_tower_context():
     self._non_tensor_outputs[name] = output
   else:
     def merge_fn(distribution, value):
       # NOTE(priyag): For non tensor outputs, we simply return all the values
       # in a list as aggregation doesn't make sense on non tensors.
       self._non_tensor_outputs[name] = distribution.unwrap(value)
     distribution_strategy_context.get_tower_context().merge_call(
         merge_fn, output)
Example #9
 def merge_fn(dist, s):
   self.assertIs(
       distribution_strategy_context._get_default_distribution_strategy(),
       dist)
   self.assertIs(None, distribution_strategy_context.get_tower_context())
   self.assertIs(dist,
                 distribution_strategy_context.get_cross_tower_context())
   self.assertIs(dist,
                 distribution_strategy_context.get_distribution_strategy())
   self.assertFalse(
       distribution_strategy_context.has_distribution_strategy())
   return "foo_" + s
Example #10
 def assign(self, *args, **kwargs):
   if distribution_strategy_context.get_cross_tower_context():
     # To preserve the sum across save and restore, we have to divide the
     # total across all devices when restoring a variable that was summed
     # when saving.
     tensor = args[0]
     if self._aggregation == vs.VariableAggregation.SUM:
       tensor *= 1. / len(self.devices)
     return control_flow_ops.group(
         [_assign_on_device(d, v, tensor)
          for d, v in six.iteritems(self._index)])
   else:
     _assert_tower_context()
     return self.get().assign(*args, **kwargs)
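The SUM branch above can be checked with plain arithmetic (hypothetical numbers): saving a SUM-aggregated variable stores the total across devices, so restoring must hand each device an equal share for the next save to reproduce the same total.

```python
devices = ["gpu:0", "gpu:1"]

saved_total = 3.0 + 3.0                   # each device held 3.0; the checkpoint stores 6.0
restored_per_device = saved_total * 1. / len(devices)   # the division done in assign()

# Summing across devices at the next save recovers the original checkpointed value.
assert restored_per_device * len(devices) == saved_total
print(restored_per_device)                # 3.0 on each device
```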
Example #11
 def run_fn():
   tower_context = distribution_strategy_context.get_tower_context()
   self.assertTrue(tower_context is not None)
   self.assertIs(None,
                 distribution_strategy_context.get_cross_tower_context())
   self.assertTrue(distribution_strategy_context.has_distribution_strategy())
   self.assertIs(dist,
                 distribution_strategy_context.get_distribution_strategy())
   self.assertEqual("foo", tower_context.merge_call(None, test_arg="foo"))
   expected_value = _get_test_variable(
       "bar", variable_scope.VariableSynchronization.AUTO,
       variable_scope.VariableAggregation.NONE)
   self.assertDictEqual(expected_value,
                        variable_scope.variable(1.0, name="bar"))
Example #12
    def _assign_func(self, *args, **kwargs):
        f = kwargs.pop("f")
        if distribution_strategy_context.get_cross_tower_context():
            update_device = distribute_lib.get_update_device()
            if update_device is not None:
                # We are calling an assign function on the mirrored variable in an
                # update context.
                v = self.get(device=update_device)
                return f(v, *args, **kwargs)

            # We are calling assign on the mirrored variable in cross tower context,
            # use update to update the variable.
            strategy = distribution_strategy_context.get_distribution_strategy(
            )
            updates = strategy.update(self, f, *args, **kwargs)
            grouped = strategy.group(updates)
            if isinstance(updates,
                          DistributedValues) and updates.is_tensor_like:
                # Make sure we run all updates. Without this, something like
                # session.run(mirrored_var.assign*(...)) may only update one tower.
                index = {}
                for d in updates.devices:
                    with ops.device(d), ops.control_dependencies([grouped]):
                        index[d] = array_ops.identity(updates.get(d))
                return Mirrored(index)
            else:
                return grouped
        else:
            _assert_tower_context()
            # We are calling an assign function on the mirrored variable in tower
            # context.
            # We reduce the value we want to assign/add/sub. More details about how we
            # handle the different use cases can be found in the _reduce method.
            # We call the function on each of the mirrored variables with the reduced
            # value.
            if self._aggregation == vs.VariableAggregation.NONE:
                raise ValueError(
                    "You must specify an aggregation method to update a "
                    "MirroredVariable in Tower Context.")

            def merge_fn(strategy, value, *other_args, **other_kwargs):
                return strategy.update(
                    self, f,
                    strategy.reduce(aggregation=self._aggregation,
                                    value=value,
                                    destinations=self), *other_args,
                    **other_kwargs)

            return distribution_strategy_context.get_tower_context(
            ).merge_call(merge_fn, *args, **kwargs)
Example #14
 def testScope(self):
   _assert_in_default_state(self)
   dist = _TestStrategy()
   with dist.scope():
     self.assertIs(None, distribution_strategy_context.get_tower_context())
     self.assertIs(dist,
                   distribution_strategy_context.get_cross_tower_context())
     self.assertTrue(distribution_strategy_context.has_distribution_strategy())
     self.assertIs(dist,
                   distribution_strategy_context.get_distribution_strategy())
     expected_value = _get_test_variable(
         "baz", variable_scope.VariableSynchronization.AUTO,
         variable_scope.VariableAggregation.NONE)
     self.assertDictEqual(expected_value,
                          variable_scope.variable(1.0, name="baz"))
   _assert_in_default_state(self)
Example #15
    def set_last_step_output(
            self,
            name,
            output,
            aggregation=variables_lib.VariableAggregation.NONE):
        """Set `output` with `name` to be outputted from the last step.

    Args:
      name: String, name to identify the output. Doesn't need to match tensor
        name.
      output: The tensors that should be outputted with `name`. See below for
        actual types supported.
      aggregation: Aggregation method to use to aggregate outputs from multiple
        towers. Required if `set_last_step_output` is called in a tower context.
        Optional in cross_tower_context.
        When present, the outputs from all the towers are aggregated using the
        current distribution strategy's `reduce` method. Hence, the type of
        `output` must be what's supported by the corresponding `reduce` method.
        For e.g. if using MirroredStrategy and aggregation is set, output
        must be a `PerDevice` value.
        The aggregation method is also recorded in a dictionary
        `_last_step_outputs_aggregations` for later interpreting of the
        outputs as already reduced or not.

    """
        if distribution_strategy_context.get_cross_tower_context():
            self._last_step_outputs_aggregations[name] = aggregation
            if aggregation is variables_lib.VariableAggregation.NONE:
                self._last_step_outputs[name] = output
            else:
                distribution = distribution_strategy_context.get_distribution_strategy(
                )
                self._last_step_outputs[name] = distribution.reduce(
                    aggregation, output, destinations="/device:CPU:0")
        else:
            assert aggregation is not variables_lib.VariableAggregation.NONE

            def merge_fn(distribution, value):
                self._last_step_outputs[name] = distribution.reduce(
                    aggregation, value, destinations="/device:CPU:0")
                # Setting this inside the `merge_fn` because all towers share the same
                # context object, so it's more robust to set it only once (even if all
                # the towers are trying to set the same value).
                self._last_step_outputs_aggregations[name] = aggregation

            distribution_strategy_context.get_tower_context().merge_call(
                merge_fn, output)
Example #16
  def _assign_func(self, *args, **kwargs):
    f = kwargs.pop("f")
    if distribution_strategy_context.get_cross_tower_context():
      update_device = distribute_lib.get_update_device()
      if update_device is not None:
        # We are calling an assign function on the mirrored variable in an
        # update context.
        v = self.get(device=update_device)
        return f(v, *args, **kwargs)

      # We are calling assign on the mirrored variable in cross tower context,
      # use update to update the variable.
      strategy = distribution_strategy_context.get_distribution_strategy()
      updates = strategy.update(self, f, *args, **kwargs)
      grouped = strategy.group(updates)
      if isinstance(updates, DistributedValues) and updates.is_tensor_like:
        # Make sure we run all updates. Without this, something like
        # session.run(mirrored_var.assign*(...)) may only update one tower.
        index = {}
        for d in updates.devices:
          with ops.device(d), ops.control_dependencies([grouped]):
            index[d] = array_ops.identity(updates.get(d))
        return Mirrored(index)
      else:
        return grouped
    else:
      _assert_tower_context()
      # We are calling an assign function on the mirrored variable in tower
      # context.
      # We reduce the value we want to assign/add/sub. More details about how we
      # handle the different use cases can be found in the _reduce method.
      # We call the function on each of the mirrored variables with the reduced
      # value.
      if self._aggregation == vs.VariableAggregation.NONE:
        raise ValueError("You must specify an aggregation method to update a "
                         "MirroredVariable in Tower Context.")

      def merge_fn(strategy, value, *other_args, **other_kwargs):
        return strategy.update(
            self, f,
            strategy.reduce(
                aggregation=self._aggregation, value=value, destinations=self),
            *other_args, **other_kwargs)

      return distribution_strategy_context.get_tower_context().merge_call(
          merge_fn, *args, **kwargs)
Example #17
  def set_last_step_output(self, name, output,
                           aggregation=variables_lib.VariableAggregation.NONE):
    """Set `output` with `name` to be outputted from the last step.

    Args:
      name: String, name to identify the output. Doesn't need to match tensor
        name.
      output: The tensors that should be outputted with `name`. See below for
        actual types supported.
      aggregation: Aggregation method to use to aggregate outputs from multiple
        towers. Required if `set_last_step_output` is called in a tower context.
        Optional in cross_tower_context.
        When present, the outputs from all the towers are aggregated using the
        current distribution strategy's `reduce` method. Hence, the type of
        `output` must be what's supported by the corresponding `reduce` method.
        For e.g. if using MirroredStrategy and aggregation is set, output
        must be a `PerDevice` value.
        The aggregation method is also recorded in a dictionary
        `_last_step_outputs_aggregations` for later interpreting of the
        outputs as already reduced or not.

    """
    if distribution_strategy_context.get_cross_tower_context():
      self._last_step_outputs_aggregations[name] = aggregation
      if aggregation is variables_lib.VariableAggregation.NONE:
        self._last_step_outputs[name] = output
      else:
        distribution = distribution_strategy_context.get_distribution_strategy()
        self._last_step_outputs[name] = distribution.reduce(
            aggregation, output, destinations="/device:CPU:0")
    else:
      assert aggregation is not variables_lib.VariableAggregation.NONE
      def merge_fn(distribution, value):
        self._last_step_outputs[name] = distribution.reduce(
            aggregation, value, destinations="/device:CPU:0")
        # Setting this inside the `merge_fn` because all towers share the same
        # context object, so it's more robust to set it only once (even if all
        # the towers are trying to set the same value).
        self._last_step_outputs_aggregations[name] = aggregation

      distribution_strategy_context.get_tower_context().merge_call(
          merge_fn, output)
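A small, framework-free mock of the bookkeeping above (hypothetical class and names, not the real step-context API, and simplified to ignore the tower/cross-tower split): outputs registered with `NONE` are stored as-is, anything else is reduced once, and the aggregation method is recorded alongside the value.

```python
from enum import Enum

class Aggregation(Enum):
    NONE = 0
    MEAN = 1

class FakeStepContext:
    """Toy stand-in for the last-step-output bookkeeping."""

    def __init__(self):
        self.last_step_outputs = {}
        self.last_step_outputs_aggregations = {}

    def set_last_step_output(self, name, per_tower_values, aggregation):
        self.last_step_outputs_aggregations[name] = aggregation
        if aggregation is Aggregation.NONE:
            # Kept as-is; the caller interprets these as per-tower values.
            self.last_step_outputs[name] = per_tower_values
        else:
            # Reduced once, standing in for distribution.reduce(...).
            self.last_step_outputs[name] = (
                sum(per_tower_values) / len(per_tower_values))

ctx = FakeStepContext()
ctx.set_last_step_output("per_tower_loss", [0.5, 0.7], Aggregation.NONE)
ctx.set_last_step_output("mean_loss", [0.5, 0.7], Aggregation.MEAN)
print(ctx.last_step_outputs)   # per_tower_loss kept per tower, mean_loss reduced to ~0.6
```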
Example #18
def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
  """Compute the moving average of a variable.

  The moving average of 'variable' updated with 'value' is:
    variable * decay + value * (1 - decay)

  The returned Operation sets 'variable' to the newly computed moving average,
  by performing this subtraction:
     variable -= (1 - decay) * (variable - value)

  Since variables that are initialized to a `0` value will be `0` biased,
  `zero_debias` optionally enables scaling by the mathematically correct
  debiasing factor of
    1 - decay ** num_updates
  See `ADAM: A Method for Stochastic Optimization` Section 3 for more details
  (https://arxiv.org/abs/1412.6980).

  The names of the debias shadow variables, by default, include both the scope
  they were created in and the scope of the variables they debias. They are also
  given a uniquifying-suffix.

  E.g.:

  ```
    with tf.variable_scope('scope1'):
      with tf.variable_scope('scope2'):
        var = tf.get_variable('foo')
        update_1 = tf.assign_moving_average(var, 0.0, 1.0)
        update_2 = tf.assign_moving_average(var, 0.0, 0.9)

    # var.name: 'scope1/scope2/foo'
    # shadow var names: 'scope1/scope2/scope1/scope2/foo/biased'
    #                   'scope1/scope2/scope1/scope2/foo/biased_1'
  ```

  Args:
    variable: A Variable.
    value: A tensor with the same shape as 'variable'.
    decay: A float Tensor or float value.  The moving average decay.
    zero_debias: A python bool. If true, assume the variable is 0-initialized
      and unbias it, as in https://arxiv.org/abs/1412.6980. See docstring in
      `_zero_debias` for more details.
    name: Optional name of the returned operation.

  Returns:
    A tensor which if evaluated will compute and return the new moving average.
  """
  def update_fn(v, value, decay=decay):
    decay = ops.convert_to_tensor(1.0 - decay, name="decay")
    if decay.dtype != v.dtype.base_dtype:
      decay = math_ops.cast(decay, v.dtype.base_dtype)
    if zero_debias:
      update_delta = _zero_debias(v, value, decay)
    else:
      update_delta = (v - value) * decay
    return state_ops.assign_sub(v, update_delta, name=scope)

  with ops.name_scope(name, "AssignMovingAvg",
                      [variable, value, decay]) as scope:
    tower_context = distribution_strategy_context.get_tower_context()
    if tower_context:
      # In a tower context, we update variable using the mean of value across
      # towers.
      def merge_fn(strategy, v, value):
        value = strategy.reduce(
            variable_scope.VariableAggregation.MEAN, value, v)
        return strategy.update(v, update_fn, value)

      return tower_context.merge_call(merge_fn, variable, value)
    else:
      strategy = distribution_strategy_context.get_cross_tower_context()
      return strategy.update(variable, update_fn, value)
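The two formulas in the docstring describe the same update; a quick numeric check with arbitrary numbers:

```python
variable, value, decay = 10.0, 4.0, 0.99

weighted_form = variable * decay + value * (1 - decay)            # 9.94
subtraction_form = variable - (1 - decay) * (variable - value)    # 9.94

assert abs(weighted_form - subtraction_form) < 1e-12
print(weighted_form, subtraction_form)
```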
Example #19
def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
    """Compute the moving average of a variable.
  https://github.com/tensorflow/tensorflow/blob/c966b5eed60a570f2121cb84ddb4ece84c413719/tensorflow/python/training/moving_averages.py
  """
    def _zero_debias(unbiased_var, value, decay):
        """Compute the delta required for a debiased Variable.
    """
        with tf.variable_scope(unbiased_var.op.name,
                               values=[unbiased_var, value, decay]) as scope:
            with tf.init_scope():
                biased_initializer = tf.zeros_initializer(
                    dtype=unbiased_var.dtype)(unbiased_var.get_shape())
                local_step_initializer = tf.zeros_initializer()

            def _maybe_get_unique(name):
                """Get name for a unique variable, if not `reuse=True`."""
                if tf.get_variable_scope().reuse:
                    return name
                vs_vars = [
                    x.op.name
                    for x in tf.get_variable_scope().global_variables()
                ]
                full_name = tf.get_variable_scope().name + "/" + name
                if full_name not in vs_vars: return name
                idx = 1
                while full_name + ("_%d" % idx) in vs_vars:
                    idx += 1
                return name + ("_%d" % idx)

            biased_var = tf.get_variable(_maybe_get_unique("biased"),
                                         initializer=biased_initializer,
                                         trainable=False)
            local_step = tf.get_variable(_maybe_get_unique("local_step"),
                                         shape=[],
                                         dtype=unbiased_var.dtype,
                                         initializer=local_step_initializer,
                                         trainable=False)

            # Get update ops for both shadow variables.
            update_biased = tf.assign_sub(biased_var,
                                          (biased_var - value) * decay,
                                          name=scope.name)
            update_local_step = local_step.assign_add(1)

            # Compute the value of the delta to update the unbiased EMA. Make sure to
            # use the new values of the biased variable and the local step.
            with tf.control_dependencies([update_biased, update_local_step]):
                # This function gets `1 - decay`, so use `1.0 - decay` in the exponent.
                unbiased_ema_delta = (
                    unbiased_var - biased_var.read_value() /
                    (1 - tf.pow(1.0 - decay, local_step.read_value())))

            return unbiased_ema_delta

    def update_fn(v, value, decay=decay):
        decay = tf.convert_to_tensor(1.0 - decay, name="decay")
        if decay.dtype != v.dtype.base_dtype:
            decay = tf.cast(decay, v.dtype.base_dtype)
        if zero_debias:
            update_delta = _zero_debias(v, value, decay)
        else:
            update_delta = (v - value) * decay
        return tf.assign_sub(v, update_delta, name=scope)

    with tf.name_scope(name, "AssignMovingAvg",
                       [variable, value, decay]) as scope:
        tower_context = distribution_strategy_context.get_tower_context()
        if tower_context:
            # In a tower context, we update variable using the mean of value across
            # towers.
            def merge_fn(strategy, v, value):
                try:
                    value = strategy.reduce(tf.VariableAggregation.MEAN, value,
                                            v)
                except:
                    pass  # Mirrored variables are loaded
                return strategy.update(v, update_fn, value)

            return tower_context.merge_call(merge_fn, variable, value)
        else:
            strategy = distribution_strategy_context.get_cross_tower_context()
            return strategy.update(variable, update_fn, value)
Example #20
  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Apply gradients to variables.

    This is the second part of `minimize()`. It returns an `Operation` that
    applies gradients.

    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        `compute_gradients()`.
      global_step: Optional `Variable` to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.  Default to the
        name passed to the `Optimizer` constructor.

    Returns:
      An `Operation` that applies the specified gradients. If `global_step`
      was not None, that operation also increments `global_step`.

    Raises:
      TypeError: If `grads_and_vars` is malformed.
      ValueError: If none of the variables have gradients.
      RuntimeError: If you should use `_distributed_apply()` instead.
    """
    # This is a default implementation of apply_gradients() that can be shared
    # by most optimizers.  It relies on the subclass implementing the following
    # methods: _create_slots(), _prepare(), _apply_dense(), and _apply_sparse().

    # Handle DistributionStrategy case.
    if distribution_strategy_context.get_cross_tower_context():
      raise RuntimeError("Use `_distributed_apply()` instead of "
                         "`apply_gradients()` in a cross-tower context.")
    # TODO(isaprykin): Get rid of `has_distribution_strategy()` check by
    # always calling _distributed_apply(), using the default distribution
    # as needed.
    if distribution_strategy_context.has_distribution_strategy():
      grads_and_vars = get_filtered_grad_fn(lambda: grads_and_vars)()
      return distribution_strategy_context.get_tower_context().merge_call(
          self._distributed_apply, grads_and_vars, global_step, name)

    # No DistributionStrategy case.
    grads_and_vars = tuple(grads_and_vars)  # Make sure repeat iteration works.
    if not grads_and_vars:
      raise ValueError("No variables provided.")
    converted_grads_and_vars = []
    for g, v in grads_and_vars:
      if g is not None:
        try:
          # Convert the grad to Tensor or IndexedSlices if necessary.
          g = ops.convert_to_tensor_or_indexed_slices(g)
        except TypeError:
          raise TypeError(
              "Gradient must be convertible to a Tensor"
              " or IndexedSlices, or None: %s" % g)
        if not isinstance(g, (ops.Tensor, ops.IndexedSlices)):
          raise TypeError(
              "Gradient must be a Tensor, IndexedSlices, or None: %s" % g)
      p = _get_processor(v)
      converted_grads_and_vars.append((g, v, p))

    converted_grads_and_vars = tuple(converted_grads_and_vars)
    var_list = [v for g, v, _ in converted_grads_and_vars if g is not None]
    if not var_list:
      raise ValueError("No gradients provided for any variable: %s." %
                       ([str(v) for _, v, _ in converted_grads_and_vars],))
    with ops.init_scope():
      self._create_slots(var_list)
    update_ops = []
    with ops.name_scope(name, self._name) as name:
      self._prepare()
      for grad, var, processor in converted_grads_and_vars:
        if grad is None:
          continue
        # We colocate all ops created in _apply_dense or _apply_sparse
        # on the same device as the variable.
        # TODO(apassos): figure out how to get the variable name here.
        if context.executing_eagerly() or isinstance(
            var,
            resource_variable_ops.ResourceVariable) and not var._in_graph_mode:  # pylint: disable=protected-access
          scope_name = ""
        else:
          scope_name = var.op.name
        with ops.name_scope("update_" + scope_name), ops.colocate_with(var):
          update_ops.append(processor.update_op(self, grad))
      if global_step is None:
        apply_updates = self._finish(update_ops, name)
      else:
        with ops.control_dependencies([self._finish(update_ops, "update")]):
          with ops.colocate_with(global_step):
            if isinstance(global_step, resource_variable_ops.ResourceVariable):
              # TODO(apassos): the implicit read in assign_add is slow; consider
              # making it less so.
              apply_updates = resource_variable_ops.assign_add_variable_op(
                  global_step.handle,
                  ops.convert_to_tensor(1, dtype=global_step.dtype),
                  name=name)
            else:
              apply_updates = state_ops.assign_add(global_step, 1, name=name)

      if not context.executing_eagerly():
        if isinstance(apply_updates, ops.Tensor):
          apply_updates = apply_updates.op
        train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
        if apply_updates not in train_op:
          train_op.append(apply_updates)

      return apply_updates
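A minimal usage sketch, assuming the graph-mode TF 1.x `tf.train` API that this optimizer code belongs to (toy variable, loss, and learning rate):

```python
import tensorflow as tf

x = tf.Variable(3.0, name="x")
loss = tf.square(x)                                   # toy objective: x**2
global_step = tf.train.get_or_create_global_step()

opt = tf.train.GradientDescentOptimizer(learning_rate=0.1)
grads_and_vars = opt.compute_gradients(loss)          # first half of minimize()
train_op = opt.apply_gradients(grads_and_vars,        # the method shown above
                               global_step=global_step)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(train_op)                                # x <- 3.0 - 0.1 * 2 * 3.0 = 2.4
    print(sess.run([x, global_step]))                 # [2.4, 1]
```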
Example #21
    def apply_gradients(self, loss, grads_and_vars, global_step=None, name=None):
        """Apply gradients to variables.
        This is the second part of `minimize()`. It returns an `Operation` that
        applies gradients.
        Args:
        grads_and_vars: List of (gradient, variable) pairs as returned by
            `compute_gradients()`.
        global_step: Optional `Variable` to increment by one after the
            variables have been updated.
        name: Optional name for the returned operation.  Default to the
            name passed to the `Optimizer` constructor.
        Returns:
        An `Operation` that applies the specified gradients. If `global_step`
        was not None, that operation also increments `global_step`.
        Raises:
        TypeError: If `grads_and_vars` is malformed.
        ValueError: If none of the variables have gradients.
        RuntimeError: If you should use `_distributed_apply()` instead.
        """
        # This is a default implementation of apply_gradients() that can be shared
        # by most optimizers.  It relies on the subclass implementing the following
        # methods: _create_slots(), _prepare(), _apply_dense(), and _apply_sparse().

        # Handle DistributionStrategy case.
        if distribution_strategy_context.get_cross_tower_context():
            raise RuntimeError("Use `_distributed_apply()` instead of `apply_gradients()` in a cross-tower context.")
        # TODO(isaprykin): Get rid of `has_distribution_strategy()` check by
        # always calling _distributed_apply(), using the default distribution
        # as needed.
        if distribution_strategy_context.has_distribution_strategy():
            grads_and_vars = optimizer.get_filtered_grad_fn(lambda: grads_and_vars)()
            return distribution_strategy_context.get_tower_context().merge_call(
                self._distributed_apply, grads_and_vars, global_step, name
            )

        # No DistributionStrategy case.
        grads_and_vars = tuple(grads_and_vars)  # Make sure repeat iteration works.
        if not grads_and_vars:
            raise ValueError("No variables provided.")
        converted_grads_and_vars = []
        for grad, var in grads_and_vars:
            if grad is not None:
                try:
                    # Convert the grad to Tensor or IndexedSlices if necessary.
                    grad = ops.convert_to_tensor_or_indexed_slices(grad)
                except TypeError:
                    raise TypeError("Gradient must be convertible to a Tensor or IndexedSlices, or None: %s" % grad)
                if not isinstance(grad, (ops.Tensor, ops.IndexedSlices)):
                    raise TypeError("Gradient must be a Tensor, IndexedSlices, or None: %s" % grad)
            processor = _get_processor(var)
            converted_grads_and_vars.append((grad, var, processor))

        converted_grads_and_vars = tuple(converted_grads_and_vars)
        var_list = [var for grad, var, _ in converted_grads_and_vars if grad is not None]
        if not var_list:
            raise ValueError("No gradients provided for any variable: %s." % ([str(var) for _, var, _ in converted_grads_and_vars],))
        with ops.init_scope():
            self._create_slots(var_list)
        update_ops = []
        with ops.name_scope(name, self._name) as name:
            self._prepare()
            for grad, var, processor in converted_grads_and_vars:
                if grad is None:
                    continue
                # We colocate all ops created in _apply_dense or _apply_sparse
                # on the same device as the variable.
                # TODO(apassos): figure out how to get the variable name here.
                if context.executing_eagerly() or isinstance(var, resource_variable_ops.ResourceVariable) and not var._in_graph_mode:
                    scope_name = ""
                else:
                    scope_name = var.op.name
                with ops.name_scope("update_" + scope_name), ops.colocate_with(var):
                    update_ops.append(processor.update_op(self, loss, grad, global_step))
            if global_step is None:
                apply_updates = self._finish(update_ops, loss, name)
            else:
                with ops.control_dependencies([self._finish(update_ops, loss, "update")]):
                    with ops.colocate_with(global_step):
                        if isinstance(global_step, resource_variable_ops.ResourceVariable):
                            # TODO(apassos): the implicit read in assign_add is slow; consider
                            # making it less so.
                            apply_updates = resource_variable_ops.assign_add_variable_op(
                                resource=global_step.handle,
                                value=ops.convert_to_tensor(
                                    value=1,
                                    dtype=global_step.dtype
                                ),
                                name=name
                            )
                        else:
                            apply_updates = state_ops.assign_add(
                                ref=global_step,
                                value=1,
                                name=name
                            )

            if not context.executing_eagerly():
                if isinstance(apply_updates, ops.Tensor):
                    apply_updates = apply_updates.op
                train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
                if apply_updates not in train_op:
                    train_op.append(apply_updates)

            return apply_updates
Example #22
def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
    """Compute the moving average of a variable.

  The moving average of 'variable' updated with 'value' is:
    variable * decay + value * (1 - decay)

  The returned Operation sets 'variable' to the newly computed moving average,
  by performing this subtraction:
     variable -= (1 - decay) * (variable - value)

  Since variables that are initialized to a `0` value will be `0` biased,
  `zero_debias` optionally enables scaling by the mathematically correct
  debiasing factor of
    1 - decay ** num_updates
  See `ADAM: A Method for Stochastic Optimization` Section 3 for more details
  (https://arxiv.org/abs/1412.6980).

  The names of the debias shadow variables, by default, include both the scope
  they were created in and the scope of the variables they debias. They are also
  given a uniquifying-suffix.

  E.g.:

  ```
    with tf.variable_scope('scope1'):
      with tf.variable_scope('scope2'):
        var = tf.get_variable('foo')
        update_1 = tf.assign_moving_average(var, 0.0, 1.0)
        update_2 = tf.assign_moving_average(var, 0.0, 0.9)

    # var.name: 'scope1/scope2/foo'
    # shadow var names: 'scope1/scope2/scope1/scope2/foo/biased'
    #                   'scope1/scope2/scope1/scope2/foo/biased_1'
  ```

  Args:
    variable: A Variable.
    value: A tensor with the same shape as 'variable'.
    decay: A float Tensor or float value.  The moving average decay.
    zero_debias: A python bool. If true, assume the variable is 0-initialized
      and unbias it, as in https://arxiv.org/abs/1412.6980. See docstring in
      `_zero_debias` for more details.
    name: Optional name of the returned operation.

  Returns:
    A tensor which if evaluated will compute and return the new moving average.
  """
    def update_fn(v, value, decay=decay):
        decay = ops.convert_to_tensor(1.0 - decay, name="decay")
        if decay.dtype != v.dtype.base_dtype:
            decay = math_ops.cast(decay, v.dtype.base_dtype)
        if zero_debias:
            update_delta = _zero_debias(v, value, decay)
        else:
            update_delta = (v - value) * decay
        return state_ops.assign_sub(v, update_delta, name=scope)

    with ops.name_scope(name, "AssignMovingAvg",
                        [variable, value, decay]) as scope:
        tower_context = distribution_strategy_context.get_tower_context()
        if tower_context:
            # In a tower context, we update variable using the mean of value across
            # towers.
            def merge_fn(strategy, v, value):
                value = strategy.reduce(
                    variable_scope.VariableAggregation.MEAN, value, v)
                return strategy.update(v, update_fn, value)

            return tower_context.merge_call(merge_fn, variable, value)
        else:
            strategy = distribution_strategy_context.get_cross_tower_context()
            return strategy.update(variable, update_fn, value)
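A small numeric illustration of the `1 - decay ** num_updates` debiasing factor from the docstring (arbitrary constant input): a zero-initialized EMA underestimates the signal early on, and dividing by the factor recovers an unbiased estimate.

```python
decay, value = 0.99, 5.0      # feed a constant signal for simplicity

biased_ema, num_updates = 0.0, 0
for _ in range(10):
    biased_ema = biased_ema * decay + value * (1 - decay)
    num_updates += 1

debias_factor = 1 - decay ** num_updates
print(biased_ema)                   # ~0.478, far below the true mean of 5.0
print(biased_ema / debias_factor)   # 5.0, the zero-debiased estimate
```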
Example #23
def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
    """Initializes current variables with tensors loaded from given checkpoint.

  Note: This overrides default initialization ops of specified variables and
  redefines dtype.

  Assignment map supports following syntax:

  * `'checkpoint_scope_name/': 'scope_name/'` - will load all variables in
    current `scope_name` from `checkpoint_scope_name` with matching tensor
    names.
  * `'checkpoint_scope_name/some_other_variable': 'scope_name/variable_name'` -
    will initialize `scope_name/variable_name` variable
    from `checkpoint_scope_name/some_other_variable`.
  * `'scope_variable_name': variable` - will initialize given `tf.Variable`
    object with tensor 'scope_variable_name' from the checkpoint.
  * `'scope_variable_name': list(variable)` - will initialize list of
    partitioned variables with tensor 'scope_variable_name' from the checkpoint.
  * `'/': 'scope_name/'` - will load all variables in current `scope_name` from
    checkpoint's root (e.g. no scope).

  Supports loading into partitioned variables, which are represented as
  `'<variable>/part_<part #>'`.

  Example:

  ```python

  # Say, '/tmp/model.ckpt' has the following tensors:
  #  -- name='old_scope_1/var1', shape=[20, 2]
  #  -- name='old_scope_1/var2', shape=[50, 4]
  #  -- name='old_scope_2/var3', shape=[100, 100]

  # Create new model's variables
  with tf.variable_scope('new_scope_1'):
    var1 = tf.get_variable('var1', shape=[20, 2],
                           initializer=tf.zeros_initializer())
  with tf.variable_scope('new_scope_2'):
    var2 = tf.get_variable('var2', shape=[50, 4],
                           initializer=tf.zeros_initializer())
    # Partition into 5 variables along the first axis.
    var3 = tf.get_variable(name='var3', shape=[100, 100],
                           initializer=tf.zeros_initializer(),
                           partitioner=lambda shape, dtype: [5, 1])

  # Initialize all variables in `new_scope_1` from `old_scope_1`.
  init_from_checkpoint('/tmp/model.ckpt', {'old_scope_1/': 'new_scope_1'})

  # Use names to specify which variables to initialize from checkpoint.
  init_from_checkpoint('/tmp/model.ckpt',
                       {'old_scope_1/var1': 'new_scope_1/var1',
                        'old_scope_1/var2': 'new_scope_2/var2'})

  # Or use tf.Variable objects to identify what to initialize.
  init_from_checkpoint('/tmp/model.ckpt',
                       {'old_scope_1/var1': var1,
                        'old_scope_1/var2': var2})

  # Initialize partitioned variables using variable's name
  init_from_checkpoint('/tmp/model.ckpt',
                       {'old_scope_2/var3': 'new_scope_2/var3'})

  # Or specify the list of tf.Variable objects.
  init_from_checkpoint('/tmp/model.ckpt',
                       {'old_scope_2/var3': var3._get_variable_list()})

  ```

  Args:
    ckpt_dir_or_file: Directory with checkpoints file or path to checkpoint.
    assignment_map: Dict, where keys are names of the variables in the
      checkpoint and values are current variables or names of current variables
      (in default graph).

  Raises:
    tf.errors.OpError: If missing checkpoints or tensors in checkpoints.
    ValueError: If missing variables in current graph.
  """
    if distribution_strategy_context.get_cross_tower_context():
        _init_from_checkpoint(None, ckpt_dir_or_file, assignment_map)
    else:
        distribution_strategy_context.get_tower_context().merge_call(
            _init_from_checkpoint, ckpt_dir_or_file, assignment_map)
Example #24
 def _as_graph_element(self):
   # pylint: disable=protected-access
   if distribution_strategy_context.get_cross_tower_context():
     return self._get_cross_tower()
   return self.get()._as_graph_element()
Example #25
 def _as_graph_element(self):
     # pylint: disable=protected-access
     if distribution_strategy_context.get_cross_tower_context():
         return self._get_cross_tower()
     return self.get()._as_graph_element()
Example #26
def init_from_checkpoint(ckpt_dir_or_file, assignment_map):
  """Initializes current variables with tensors loaded from given checkpoint.

  Note: This overrides default initialization ops of specified variables and
  redefines dtype.

  Assignment map supports following syntax:

  * `'checkpoint_scope_name/': 'scope_name/'` - will load all variables in
    current `scope_name` from `checkpoint_scope_name` with matching tensor
    names.
  * `'checkpoint_scope_name/some_other_variable': 'scope_name/variable_name'` -
    will initialize `scope_name/variable_name` variable
    from `checkpoint_scope_name/some_other_variable`.
  * `'scope_variable_name': variable` - will initialize given `tf.Variable`
    object with tensor 'scope_variable_name' from the checkpoint.
  * `'scope_variable_name': list(variable)` - will initialize list of
    partitioned variables with tensor 'scope_variable_name' from the checkpoint.
  * `'/': 'scope_name/'` - will load all variables in current `scope_name` from
    checkpoint's root (e.g. no scope).

  Supports loading into partitioned variables, which are represented as
  `'<variable>/part_<part #>'`.

  Example:

  ```python

  # Say, '/tmp/model.ckpt' has the following tensors:
  #  -- name='old_scope_1/var1', shape=[20, 2]
  #  -- name='old_scope_1/var2', shape=[50, 4]
  #  -- name='old_scope_2/var3', shape=[100, 100]

  # Create new model's variables
  with tf.variable_scope('new_scope_1'):
    var1 = tf.get_variable('var1', shape=[20, 2],
                           initializer=tf.zeros_initializer())
  with tf.variable_scope('new_scope_2'):
    var2 = tf.get_variable('var2', shape=[50, 4],
                           initializer=tf.zeros_initializer())
    # Partition into 5 variables along the first axis.
    var3 = tf.get_variable(name='var3', shape=[100, 100],
                           initializer=tf.zeros_initializer(),
                           partitioner=lambda shape, dtype: [5, 1])

  # Initialize all variables in `new_scope_1` from `old_scope_1`.
  init_from_checkpoint('/tmp/model.ckpt', {'old_scope_1/': 'new_scope_1'})

  # Use names to specify which variables to initialize from checkpoint.
  init_from_checkpoint('/tmp/model.ckpt',
                       {'old_scope_1/var1': 'new_scope_1/var1',
                        'old_scope_1/var2': 'new_scope_2/var2'})

  # Or use tf.Variable objects to identify what to initialize.
  init_from_checkpoint('/tmp/model.ckpt',
                       {'old_scope_1/var1': var1,
                        'old_scope_1/var2': var2})

  # Initialize partitioned variables using variable's name
  init_from_checkpoint('/tmp/model.ckpt',
                       {'old_scope_2/var3': 'new_scope_2/var3'})

  # Or specify the list of tf.Variable objects.
  init_from_checkpoint('/tmp/model.ckpt',
                       {'old_scope_2/var3': var3._get_variable_list()})

  ```

  Args:
    ckpt_dir_or_file: Directory with checkpoints file or path to checkpoint.
    assignment_map: Dict, where keys are names of the variables in the
      checkpoint and values are current variables or names of current variables
      (in default graph).

  Raises:
    tf.errors.OpError: If missing checkpoints or tensors in checkpoints.
    ValueError: If missing variables in current graph.
  """
  if distribution_strategy_context.get_cross_tower_context():
    _init_from_checkpoint(None, ckpt_dir_or_file, assignment_map)
  else:
    distribution_strategy_context.get_tower_context().merge_call(
        _init_from_checkpoint, ckpt_dir_or_file, assignment_map)