Example #1
0
  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Apply gradients to variables.

    This is the second part of `minimize()`. It returns an `Operation` that
    applies gradients.

    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        `compute_gradients()`.
      global_step: Optional `Variable` to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.  Default to the
        name passed to the `Optimizer` constructor.

    Returns:
      An `Operation` that applies the specified gradients. If `global_step`
      was not None, that operation also increments `global_step`.

    Raises:
      TypeError: If `grads_and_vars` is malformed.
      ValueError: If none of the variables have gradients.
    """
    # This is a default implementation of apply_gradients() that can be shared
    # by most optimizers.  It relies on the subclass implementing the following
    # methods: _create_slots(), _prepare(), _apply_dense(), and _apply_sparse().
    grads_and_vars = tuple(grads_and_vars)  # Make sure repeat iteration works
    for g, v in grads_and_vars:
      if not isinstance(g, (ops.Tensor, ops.IndexedSlices, type(None))):
        raise TypeError(
            "Gradient must be a Tensor, IndexedSlices, or None: %s" % g)
      if not isinstance(v, variables.Variable):
        raise TypeError(
            "Variable must be a tf.Variable: %s" % v)
      if g is not None:
        self._assert_valid_dtypes([g, v])
    var_list = [v for g, v in grads_and_vars if g is not None]
    if not var_list:
      raise ValueError("No gradients provided for any variable: %s" %
                       (grads_and_vars,))
    with ops.control_dependencies(None):
      self._create_slots(var_list)
    update_ops = []
    with ops.op_scope([], name, self._name) as name:
      self._prepare()
      for grad, var in grads_and_vars:
        if grad is None:
          continue
        # We colocate all ops created in _apply_dense or _apply_sparse
        # on the same device as the variable.
        with ops.name_scope("update_" + var.op.name), ops.colocate_with(var):
          if isinstance(grad, ops.Tensor):
            update_ops.append(self._apply_dense(grad, var))
          else:
            update_ops.append(self._apply_sparse(grad, var))
      if global_step is None:
        return self._finish(update_ops, name)
      else:
        with ops.control_dependencies([self._finish(update_ops, "update")]):
          with ops.colocate_with(global_step):
            return state_ops.assign_add(global_step, 1, name=name).op
Example #2
0
 def testMultiColocationGroups(self):
   a = constant_op.constant([2.0], name="a")
   b = constant_op.constant(3.0, name="b")
   with ops.colocate_with(a.op):
     with ops.colocate_with(b.op):
       c = constant_op.constant(4.0)
   self.assertEqual(set([b"loc:@a", b"loc:@b"]), set(c.op.colocation_groups()))
  def _create_variable(self, next_creator, *args, **kwargs):
    if self._num_replicas_in_sync > 1:
      aggregation = kwargs.pop("aggregation", vs.VariableAggregation.NONE)
      if aggregation not in (
          vs.VariableAggregation.NONE,
          vs.VariableAggregation.SUM,
          vs.VariableAggregation.MEAN,
          vs.VariableAggregation.ONLY_FIRST_REPLICA
      ):
        raise ValueError("Invalid variable aggregation mode: " + aggregation +
                         " for variable: " + kwargs["name"])

      def var_creator(*args, **kwargs):
        """Create an AggregatingVariable and fix up collections."""
        # Record what collections this variable should be added to.
        collections = kwargs.pop("collections", None)
        if collections is None:
          collections = [ops.GraphKeys.GLOBAL_VARIABLES]
        kwargs["collections"] = []

        # Create and wrap the variable.
        v = next_creator(*args, **kwargs)
        wrapped = values.AggregatingVariable(
            self._container_strategy(), v, aggregation)

        # Add the wrapped variable to the requested collections.
        # The handling of eager mode and the global step matches
        # ResourceVariable._init_from_args().
        if not context.executing_eagerly():
          g = ops.get_default_graph()
          # If "trainable" is True, next_creator() will add the contained
          # variable to the TRAINABLE_VARIABLES collection, so we manually
          # remove it and replace with the wrapper. We can't set "trainable"
          # to False for next_creator() since that causes functions like
          # implicit_gradients to skip those variables.
          if kwargs.get("trainable", True):
            collections.append(ops.GraphKeys.TRAINABLE_VARIABLES)
            l = g.get_collection_ref(ops.GraphKeys.TRAINABLE_VARIABLES)
            if v in l:
              l.remove(v)
          g.add_to_collections(collections, wrapped)
        elif ops.GraphKeys.GLOBAL_STEP in collections:
          ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, wrapped)

        return wrapped
    else:
      var_creator = next_creator

    if "colocate_with" in kwargs:
      colocate_with = kwargs["colocate_with"]
      if isinstance(colocate_with, numpy_dataset.SingleDevice):
        with ops.device(colocate_with.device):
          return var_creator(*args, **kwargs)
      with ops.device(None):
        with ops.colocate_with(colocate_with):
          return var_creator(*args, **kwargs)

    with ops.colocate_with(None, ignore_existing=True):
      with ops.device(self._variable_device):
        return var_creator(*args, **kwargs)
Example #4
0
 def testColocationIgnoreStack(self):
   a = constant_op.constant([2.0], name="a")
   b = constant_op.constant(3.0, name="b")
   with ops.colocate_with(a.op):
     with ops.colocate_with(b.op, ignore_existing=True):
       c = constant_op.constant(4.0)
   self.assertEqual(set([b"loc:@b"]), set(c.op.colocation_groups()))
Example #5
0
 def _f():
   # Note that there is a race condition here, so we do a best effort
   # updates here. We reset update_in_steps first so that other workers
   # don't duplicate the updates. Also we update cluster_center_vars
   # before resetting total_counts to avoid large updates to
   # cluster_centers_updated based on partially updated
   # cluster_center_vars.
   with ops.control_dependencies([
       state_ops.assign(update_in_steps,
                        self._mini_batch_steps_per_iteration - 1)
   ]):
     with ops.colocate_with(
         cluster_centers_updated, ignore_existing=True):
       if self._distance_metric == COSINE_DISTANCE:
         cluster_centers = nn_impl.l2_normalize(
             cluster_centers_updated, dim=1)
       else:
         cluster_centers = cluster_centers_updated
     with ops.colocate_with(cluster_centers_var, ignore_existing=True):
       with ops.control_dependencies(
           [state_ops.assign(cluster_centers_var, cluster_centers)]):
         with ops.colocate_with(None, ignore_existing=True):
           with ops.control_dependencies([
               state_ops.assign(total_counts,
                                array_ops.zeros_like(total_counts))
           ]):
             return array_ops.identity(update_in_steps)
Example #6
0
  def _full_batch_training_op(self, inputs, cluster_idx_list, cluster_centers):
    """Creates an op for training for full batch case.

    Args:
      inputs: list of input Tensors.
      cluster_idx_list: A vector (or list of vectors). Each element in the
        vector corresponds to an input row in 'inp' and specifies the cluster id
        corresponding to the input.
      cluster_centers: Tensor Ref of cluster centers.

    Returns:
      An op for doing an update of mini-batch k-means.
    """
    cluster_sums = []
    cluster_counts = []
    epsilon = constant_op.constant(1e-6, dtype=inputs[0].dtype)
    for inp, cluster_idx in zip(inputs, cluster_idx_list):
      with ops.colocate_with(inp):
        cluster_sums.append(
            math_ops.unsorted_segment_sum(inp, cluster_idx, self._num_clusters))
        cluster_counts.append(
            math_ops.unsorted_segment_sum(
                array_ops.reshape(
                    array_ops.ones(
                        array_ops.reshape(array_ops.shape(inp)[0], [-1])),
                    [-1, 1]), cluster_idx, self._num_clusters))
    with ops.colocate_with(cluster_centers):
      new_clusters_centers = math_ops.add_n(cluster_sums) / (math_ops.cast(
          math_ops.add_n(cluster_counts), cluster_sums[0].dtype) + epsilon)
      if self._clusters_l2_normalized():
        new_clusters_centers = nn_impl.l2_normalize(new_clusters_centers, dim=1)
    return state_ops.assign(cluster_centers, new_clusters_centers)
Example #7
0
  def testColocateWithBeforeCond(self):
    with ops.Graph().as_default() as g:
      with self.session(graph=g):

        a = constant_op.constant([2.0], name="a")
        b = constant_op.constant([2.0], name="b")

        def fn():
          c = constant_op.constant(3.0)
          self.assertEqual([b"loc:@a"], c.op.colocation_groups())
          return c

        with ops.colocate_with(a.op):
          self.assertEquals(
              cond_v2.cond_v2(constant_op.constant(True), fn, fn).eval(), 3)

        def fn2():
          c = constant_op.constant(3.0)
          self.assertEqual([b"loc:@a", b"loc:@b"], c.op.colocation_groups())
          return c

        with ops.colocate_with(a.op):
          with ops.colocate_with(b.op):
            self.assertEquals(
                cond_v2.cond_v2(constant_op.constant(True), fn2, fn2).eval(), 3)
  def _renorm_correction_and_moments(self, mean, variance, training):
    """Returns the correction and update values for renorm."""
    stddev = math_ops.sqrt(variance + self.epsilon)
    # Compute the average mean and standard deviation, as if they were
    # initialized with this batch's moments.
    mixed_renorm_mean = (self.renorm_mean +
                         (1. - self.renorm_mean_weight) * mean)
    mixed_renorm_stddev = (self.renorm_stddev +
                           (1. - self.renorm_stddev_weight) * stddev)
    # Compute the corrections for batch renorm.
    r = stddev / mixed_renorm_stddev
    d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
    # Ensure the corrections use pre-update moving averages.
    with ops.control_dependencies([r, d]):
      mean = array_ops.identity(mean)
      stddev = array_ops.identity(stddev)
    rmin, rmax, dmax = [self.renorm_clipping.get(key)
                        for key in ['rmin', 'rmax', 'dmax']]
    if rmin is not None:
      r = math_ops.maximum(r, rmin)
    if rmax is not None:
      r = math_ops.minimum(r, rmax)
    if dmax is not None:
      d = math_ops.maximum(d, -dmax)
      d = math_ops.minimum(d, dmax)
    # When not training, use r=1, d=0, and decay=1 meaning no updates.
    r = _smart_select(training, lambda: r, lambda: array_ops.ones_like(r))
    d = _smart_select(training, lambda: d, lambda: array_ops.zeros_like(d))
    decay = _smart_select(training, lambda: self.renorm_momentum, lambda: 1.)

    def _update_renorm_variable(var, weight, value):
      """Updates a moving average and weight, returns the unbiased value."""
      # Update the variables without zero debiasing. The debiasing will be
      # accomplished by dividing the exponential moving average by the weight.
      # For example, after a single update, the moving average would be
      # (1-decay) * value. and the weight will be 1-decay, with their ratio
      # giving value.
      # Make sure the weight is not updated until before r and d computation.
      value = array_ops.identity(value)
      with ops.control_dependencies([value]):
        weight_value = array_ops.constant(1., dtype=weight.dtype)
      new_var = moving_averages.assign_moving_average(
          var, value, decay, zero_debias=False)
      new_weight = moving_averages.assign_moving_average(
          weight, weight_value, decay, zero_debias=False)
      return new_var / new_weight

    with ops.colocate_with(self.moving_mean):
      new_mean = _update_renorm_variable(self.renorm_mean,
                                         self.renorm_mean_weight,
                                         mean)
    with ops.colocate_with(self.moving_variance):
      new_stddev = _update_renorm_variable(self.renorm_stddev,
                                           self.renorm_stddev_weight,
                                           stddev)
      # Make sqrt(moving_variance + epsilon) = new_stddev.
      new_variance = math_ops.square(new_stddev) - self.epsilon

    return (r, d, new_mean, new_variance)
Example #9
0
def batch_norm(input_,
               dim,
               name,
               scale=True,
               train=True,
               epsilon=1e-8,
               decay=.1,
               axes=[0],
               bn_lag=DEFAULT_BN_LAG):
    """Batch normalization."""
    # create variables
    with tf.variable_scope(name):
        var = variable_on_cpu(
            "var", [dim], tf.constant_initializer(1.), trainable=False)
        mean = variable_on_cpu(
            "mean", [dim], tf.constant_initializer(0.), trainable=False)
        step = variable_on_cpu("step", [], tf.constant_initializer(0.), trainable=False)
        if scale:
            gamma = variable_on_cpu("gamma", [dim], tf.constant_initializer(1.))
        beta = variable_on_cpu("beta", [dim], tf.constant_initializer(0.))
    # choose the appropriate moments
    if train:
        used_mean, used_var = tf.nn.moments(input_, axes, name="batch_norm")
        cur_mean, cur_var = used_mean, used_var
        if bn_lag > 0.:
            used_mean -= (1. - bn_lag) * (used_mean - tf.stop_gradient(mean))
            used_var -= (1 - bn_lag) * (used_var - tf.stop_gradient(var))
            used_mean /= (1. - bn_lag**(step + 1))
            used_var /= (1. - bn_lag**(step + 1))
    else:
        used_mean, used_var = mean, var
        cur_mean, cur_var = used_mean, used_var

    # normalize
    res = (input_ - used_mean) / tf.sqrt(used_var + epsilon)
    # de-normalize
    if scale:
        res *= gamma
    res += beta

    # update variables
    if train:
        with tf.name_scope(name, "AssignMovingAvg", [mean, cur_mean, decay]):
            with ops.colocate_with(mean):
                new_mean = tf.assign_sub(
                    mean,
                    tf.check_numerics(decay * (mean - cur_mean), "NaN in moving mean."))
        with tf.name_scope(name, "AssignMovingAvg", [var, cur_var, decay]):
            with ops.colocate_with(var):
                new_var = tf.assign_sub(
                    var,
                    tf.check_numerics(decay * (var - cur_var),
                                      "NaN in moving variance."))
        with tf.name_scope(name, "IncrementTime", [step]):
            with ops.colocate_with(step):
                new_step = tf.assign_add(step, 1.)
        res += 0. * new_mean * new_var * new_step

    return res
Example #10
0
 def testNestedColocateWith(self):
   a = constant_op.constant([2.0], name="a")
   with ops.colocate_with(a.op):
     b = constant_op.constant(3.0)
     with ops.colocate_with(b.op):
       c = constant_op.constant(4.0)
   self.assertEqual([b"loc:@a"], b.op.colocation_groups())
   self.assertEqual([b"loc:@a"], c.op.colocation_groups())
Example #11
0
  def add_variable(self, name, shape, dtype=None,
                   initializer=None, regularizer=None, trainable=True):
    """Adds a new variable to the layer, or gets an existing one; returns it.

    Arguments:
      name: variable name.
      shape: variable shape.
      dtype: The type of the variable. Defaults to `self.dtype`.
      initializer: initializer instance (callable).
      regularizer: regularizer instance (callable).
      trainable: whether the variable should be part of the layer's
        "trainable_variables" (e.g. variables, biases)
        or "non_trainable_variables" (e.g. BatchNorm mean, stddev).

    Returns:
      The created variable.
    """
    if dtype is None:
      dtype = self.dtype
    existing_variables = set(tf_variables.global_variables())

    self._set_scope(None)

    with vs.variable_scope(self._scope,
                           reuse=self.built or self._reuse) as scope:
      with ops.name_scope(scope.original_name_scope):
        variable = vs.get_variable(name,
                                   shape=shape,
                                   initializer=initializer,
                                   dtype=dtypes.as_dtype(dtype),
                                   trainable=trainable and self.trainable)
        if variable in existing_variables:
          return variable
        if regularizer:
          # To match the behavior of tf.get_variable(), we only
          # apply regularization if the variable is newly created.
          if isinstance(variable, tf_variables.PartitionedVariable):
            for v in variable:
              with ops.colocate_with(v.op):
                with ops.name_scope(name + '/Regularizer'):
                  regularization = regularizer(v)
              if regularization is not None:
                self.add_loss(regularization)
                _add_elements_to_collection(
                    regularization, ops.GraphKeys.REGULARIZATION_LOSSES)
          else:
            with ops.colocate_with(variable.op):
              with ops.name_scope(name + '/Regularizer'):
                regularization = regularizer(variable)
            if regularization is not None:
              self.add_loss(regularization)
              _add_elements_to_collection(
                  regularization, ops.GraphKeys.REGULARIZATION_LOSSES)
    if trainable:
      self._trainable_weights.append(variable)
    else:
      self._non_trainable_weights.append(variable)
    return variable
Example #12
0
def batch_norm_log_diff(input_,
                        dim,
                        name,
                        train=True,
                        epsilon=1e-8,
                        decay=.1,
                        axes=[0],
                        reuse=None,
                        bn_lag=DEFAULT_BN_LAG):
    """Batch normalization with corresponding log determinant Jacobian."""
    if reuse is None:
        reuse = not train
    # create variables
    with tf.variable_scope(name) as scope:
        if reuse:
            scope.reuse_variables()
        var = variable_on_cpu(
            "var", [dim], tf.constant_initializer(1.), trainable=False)
        mean = variable_on_cpu(
            "mean", [dim], tf.constant_initializer(0.), trainable=False)
        step = variable_on_cpu("step", [], tf.constant_initializer(0.), trainable=False)
    # choose the appropriate moments
    if train:
        used_mean, used_var = tf.nn.moments(input_, axes, name="batch_norm")
        cur_mean, cur_var = used_mean, used_var
        if bn_lag > 0.:
            used_var = stable_var(input_=input_, mean=used_mean, axes=axes)
            cur_var = used_var
            used_mean -= (1 - bn_lag) * (used_mean - tf.stop_gradient(mean))
            used_mean /= (1. - bn_lag**(step + 1))
            used_var -= (1 - bn_lag) * (used_var - tf.stop_gradient(var))
            used_var /= (1. - bn_lag**(step + 1))
    else:
        used_mean, used_var = mean, var
        cur_mean, cur_var = used_mean, used_var

    # update variables
    if train:
        with tf.name_scope(name, "AssignMovingAvg", [mean, cur_mean, decay]):
            with ops.colocate_with(mean):
                new_mean = tf.assign_sub(
                    mean,
                    tf.check_numerics(
                        decay * (mean - cur_mean), "NaN in moving mean."))
        with tf.name_scope(name, "AssignMovingAvg", [var, cur_var, decay]):
            with ops.colocate_with(var):
                new_var = tf.assign_sub(
                    var,
                    tf.check_numerics(decay * (var - cur_var),
                                      "NaN in moving variance."))
        with tf.name_scope(name, "IncrementTime", [step]):
            with ops.colocate_with(step):
                new_step = tf.assign_add(step, 1.)
        used_var += 0. * new_mean * new_var * new_step
    used_var += epsilon

    return used_mean, used_var
  def _create_variable(self, next_creator, *args, **kwargs):
    if "colocate_with" in kwargs:
      with ops.device(None):
        with ops.colocate_with(kwargs["colocate_with"]):
          return next_creator(*args, **kwargs)

    with ops.colocate_with(None, ignore_existing=True):
      with ops.device(self._variable_device):
        return next_creator(*args, **kwargs)
Example #14
0
    def _add_variable(
        self,
        name,
        shape,
        dtype=None,
        initializer=None,
        regularizer=None,
        trainable=True,
        variable_getter=vs.get_variable,
    ):
        """Adds a new variable to the layer.

    Arguments:
      name: variable name.
      shape: variable shape.
      dtype: The type of the variable. Defaults to `self.dtype`.
      initializer: initializer instance (callable).
      regularizer: regularizer instance (callable).
      trainable: whether the variable should be part of the layer's
        "trainable_variables" (e.g. variables, biases)
        or "non_trainable_variables" (e.g. BatchNorm mean, stddev).
      variable_getter: The getter to use for TensorFlow variables.

    Returns:
      The created variable.
    """
        if dtype is None:
            dtype = self.dtype
        existing_variables = set(tf_variables.global_variables())
        variable = variable_getter(
            name, shape=shape, initializer=initializer, dtype=dtype, trainable=trainable and self.trainable
        )
        # TODO(sguada) fix name = variable.op.name
        if regularizer:
            if not self._reuse and variable not in existing_variables:
                # To match the behavior of tf.get_variable(), we only
                # apply regularization if the variable is newly created.
                if isinstance(variable, tf_variables.PartitionedVariable):
                    for v in variable:
                        with ops.colocate_with(v.op):
                            with ops.name_scope(name + "/Regularizer"):
                                regularization = regularizer(v)
                        if regularization is not None:
                            self._losses.append(regularization)
                            _add_elements_to_collection(regularization, ops.GraphKeys.REGULARIZATION_LOSSES)
                else:
                    with ops.colocate_with(variable.op):
                        with ops.name_scope(name + "/Regularizer"):
                            regularization = regularizer(variable)
                    if regularization is not None:
                        self._losses.append(regularization)
                        _add_elements_to_collection(regularization, ops.GraphKeys.REGULARIZATION_LOSSES)
        if trainable:
            self._trainable_variables.append(variable)
        else:
            self._non_trainable_variables.append(variable)
        return variable
Example #15
0
def maybe_colocate_with(op):
  """Context to colocate with `op` if `COLOCATE_COV_OPS_WITH_INPUTS`."""
  if COLOCATE_COV_OPS_WITH_INPUTS:
    if isinstance(op, (list, tuple)):
      with tf_ops.colocate_with(op[0]):
        yield
    else:
      with tf_ops.colocate_with(op):
        yield
  else:
    yield
Example #16
0
def _maybe_colocate_with(op, colocate_cov_ops_with_inputs):
  """Context to colocate with `op` if `colocate_cov_ops_with_inputs`."""
  if colocate_cov_ops_with_inputs:
    if isinstance(op, (list, tuple)):
      with tf_ops.colocate_with(op[0]):
        yield
    else:
      with tf_ops.colocate_with(op):
        yield
  else:
    yield
Example #17
0
    def create_axis_ops(sp_input, num_items, update_fn, axis_name):
      """Creates book-keeping and training ops for a given axis.

      Args:
        sp_input: A SparseTensor corresponding to the row or column batch.
        num_items: An integer, the total number of items of this axis.
        update_fn: A function that takes one argument (`sp_input`), and that
        returns a tuple of
          * new_factors: A flot Tensor of the factor values after update.
          * update_op: a TensorFlow op which updates the factors.
          * loss: A float Tensor, the unregularized loss.
          * reg_loss: A float Tensor, the regularization loss.
          * sum_weights: A float Tensor, the sum of factor weights.
        axis_name: A string that specifies the name of the axis.

      Returns:
        A tuple consisting of:
          * reset_processed_items_op: A TensorFlow op, to be run before the
            beginning of any sweep. It marks all items as not-processed.
          * axis_train_op: A Tensorflow op, to be run during this axis' sweeps.
      """
      processed_items_init = array_ops.fill(dims=[num_items], value=False)
      with ops.colocate_with(processed_items_init):
        processed_items = variable_scope.variable(
            processed_items_init,
            collections=[ops.GraphKeys.GLOBAL_VARIABLES],
            trainable=False,
            name="processed_" + axis_name)
      reset_processed_items_op = state_ops.assign(
          processed_items, processed_items_init,
          name="reset_processed_" + axis_name)
      _, update_op, loss, reg, sum_weights = update_fn(sp_input)
      input_indices = sp_input.indices[:, 0]
      with ops.control_dependencies([
          update_op,
          state_ops.assign(loss_var, loss + reg),
          state_ops.assign(rwse_var, math_ops.sqrt(loss / sum_weights))]):
        with ops.colocate_with(processed_items):
          update_processed_items = state_ops.scatter_update(
              processed_items,
              input_indices,
              array_ops.ones_like(input_indices, dtype=dtypes.bool),
              name="update_processed_{}_indices".format(axis_name))
        with ops.control_dependencies([update_processed_items]):
          is_sweep_done = math_ops.reduce_all(processed_items)
          axis_train_op = control_flow_ops.group(
              global_step_incr_op,
              state_ops.assign(is_sweep_done_var, is_sweep_done),
              state_ops.assign_add(
                  completed_sweeps_var,
                  math_ops.cast(is_sweep_done, dtypes.int32)),
              name="{}_sweep_train_op".format(axis_name))
      return reset_processed_items_op, axis_train_op
Example #18
0
  def _cached_copy(self, var, name, pass_through=False):
    """Helper function to create a worker cached copy of a Variable.

    This assigns the var (either a single Variable or a list of Variables) to
    local transient cache Variable(s). Note that if var is a list of Variables,
    the assignment is done sequentially to minimize the memory overheads.
    Also note that if pass_through is set to True, this does not create new
    Variables but simply return the input back.

    Args:
      var: A Variable or a list of Variables to cache.
      name: name of cached Variable.
      pass_through: when set to True, this simply pass through the var back
        through identity operator and does not actually creates a cache.

    Returns:
      Tuple consisting of following three entries:
      cache: the new transient Variable or list of transient Variables
        corresponding one-to-one with var.
      cache_init: op to initialize the Variable or the list of Variables.
      cache_reset: op to reset the Variable or the list of Variables to some
        default value.
    """
    if var is None:
      return None, None, None
    elif pass_through:
      cache = var
      cache_init = control_flow_ops.no_op()
      cache_reset = control_flow_ops.no_op()
    elif isinstance(var, variables.Variable):
      cache = WALSModel._transient_var(name=name)
      with ops.colocate_with(cache):
        cache_init = state_ops.assign(cache, var, validate_shape=False)
        cache_reset = state_ops.assign(cache, 1.0, validate_shape=False)
    else:
      assert isinstance(var, list)
      assert var
      cache = [
          WALSModel._transient_var(name="%s_shard_%d" % (name, i))
          for i in xrange(len(var))
      ]
      reset_ops = []
      for i, c in enumerate(cache):
        with ops.colocate_with(c):
          if i == 0:
            cache_init = state_ops.assign(c, var[i], validate_shape=False)
          else:
            with ops.control_dependencies([cache_init]):
              cache_init = state_ops.assign(c, var[i], validate_shape=False)
          reset_ops.append(state_ops.assign(c, 1.0, validate_shape=False))
      cache_reset = control_flow_ops.group(*reset_ops)

    return cache, cache_init, cache_reset
Example #19
0
def _GatherGrad(op, grad):
  """Gradient for Gather op."""
  params = op.inputs[0]
  indices = op.inputs[1]
  # Colocate shape computations with params and indices
  with ops.colocate_with(params):
    params_shape = array_ops.shape(params)
  with ops.colocate_with(indices):
    size = array_ops.expand_dims(array_ops.size(indices), 0)
  values_shape = array_ops.concat(0, [size, params_shape[1:]])
  values = array_ops.reshape(grad, values_shape)
  indices = array_ops.reshape(indices, size)
  return [ops.IndexedSlices(values, indices, params_shape), None]
Example #20
0
  def _initialize_variables(self, data, initial_means=None):
    """Initializes variables.

    Args:
      data: a list of Tensors with data, each row is a new example.
      initial_means: a Tensor with a matrix of means.
    """
    first_shard = data[0]
    # Initialize means: num_classes X 1 X dimensions.
    if initial_means is not None:
      means = array_ops.expand_dims(initial_means, 1)
    else:
      # Sample data randomly
      means = array_ops.expand_dims(
          _init_clusters_random(data, self._num_classes, self._random_seed), 1)

    # Initialize covariances.
    if self._covariance_type == FULL_COVARIANCE:
      cov = _covariance(first_shard, False) + self._min_var
      # A matrix per class, num_classes X dimensions X dimensions
      covs = array_ops.tile(
          array_ops.expand_dims(cov, 0), [self._num_classes, 1, 1])
    elif self._covariance_type == DIAG_COVARIANCE:
      cov = _covariance(first_shard, True) + self._min_var
      # A diagonal per row, num_classes X dimensions.
      covs = array_ops.tile(
          array_ops.expand_dims(array_ops.diag_part(cov), 0),
          [self._num_classes, 1])

    with ops.colocate_with(self._cluster_centers_initialized):
      initialized = control_flow_ops.with_dependencies(
          [means, covs],
          array_ops.identity(self._cluster_centers_initialized))
    self._init_ops = []
    with ops.colocate_with(self._means):
      init_means = state_ops.assign(self._means, means, validate_shape=False)
      init_means = control_flow_ops.with_dependencies(
          [init_means],
          state_ops.assign(self._cluster_centers_initialized, True))
      self._init_ops.append(control_flow_ops.cond(initialized,
                                                  control_flow_ops.no_op,
                                                  lambda: init_means).op)
    with ops.colocate_with(self._covs):
      init_covs = state_ops.assign(self._covs, covs, validate_shape=False)
      init_covs = control_flow_ops.with_dependencies(
          [init_covs],
          state_ops.assign(self._cluster_centers_initialized, True))
      self._init_ops.append(control_flow_ops.cond(initialized,
                                                  control_flow_ops.no_op,
                                                  lambda: init_covs).op)
Example #21
0
 def _train_op_fn(loss):
   """Returns the op to optimize the loss."""
   update_op = gbdt_model_main.train(loss, predictions_dict, labels)
   with ops.control_dependencies(
       [update_op]), (ops.colocate_with(global_step)):
     update_op = state_ops.assign_add(global_step, 1).op
     return update_op
Example #22
0
def init_variable(v, init, name="init"):
  """Initializes variable with "init".

  This op does the following:
  if init is a Tensor, v = init
  if callable(init): v = init(VariableShape(v), v.dtype)

  Args:
    v: Variable to initialize
    init: Tensor to assign to v,
      Or an object convertible to Tensor e.g. nparray,
      Or an Initializer that generates a tensor given the shape and type of v.
      An "Initializer" is a callable that returns a tensor that "v" should be
      set to. It will be called as init(shape, dtype).
    name: Optional name for the op.

  Returns:
    The operation that initializes v.
  """
  with ops.name_scope(None, v.op.name + "/", [v, init]):
    with ops.name_scope(name) as scope:
      with ops.colocate_with(v):
        if callable(init):
          assert v.get_shape().is_fully_defined(), "Variable shape unknown."
          # TODO(mrry): Convert to v.shape when the property and
          # accessor are reconciled (and all initializers support
          # tf.TensorShape objects).
          value = init(v.get_shape().as_list(), v.dtype.base_dtype)
          value = ops.convert_to_tensor(value, name="value")
          return gen_state_ops.assign(v, value, name=scope)
        else:
          init = ops.convert_to_tensor(init, name="init")
          return gen_state_ops.assign(v, init, name=scope)
Example #23
0
 def _l2_normalize_data(cls, inputs):
   """Normalized the input data."""
   output = []
   for inp in inputs:
     with ops.colocate_with(inp):
       output.append(nn_impl.l2_normalize(inp, dim=1))
   return output
def _maybe_colocate_with(op, colocate_gradients_with_ops):
  """Context to colocate with `op` if `colocate_gradients_with_ops`."""
  if colocate_gradients_with_ops:
    with ops.colocate_with(op):
      yield
  else:
    yield
Example #25
0
  def _compute_euclidean_distance(cls, inputs, clusters):
    """Computes Euclidean distance between each input and each cluster center.

    Args:
      inputs: list of input Tensors.
      clusters: cluster Tensor.

    Returns:
      list of Tensors, where each element corresponds to each element in inputs.
      The value is the distance of each row to all the cluster centers.
    """
    output = []
    for inp in inputs:
      with ops.colocate_with(inp, ignore_existing=True):
        # Computes Euclidean distance. Note the first and third terms are
        # broadcast additions.
        squared_distance = (
            math_ops.reduce_sum(math_ops.square(inp), 1, keep_dims=True) -
            2 * math_ops.matmul(inp, clusters, transpose_b=True) +
            array_ops.transpose(
                math_ops.reduce_sum(
                    math_ops.square(clusters), 1, keep_dims=True)))
        output.append(squared_distance)

    return output
Example #26
0
def build_nccl_all_reduce(input_tensors, red_op, un_op=None):
  """Build a subgraph that does one full all-reduce, using NCCL.

  Args:
    input_tensors: list of T `tf.Tensor` of same-shape and type values to
      be reduced.
    red_op: binary elementwise reduction operator.  Must be one of
      {tf.add}
    un_op: optional unary elementwise Op to apply to fully-reduce values.

  Returns:
    list of T `tf.Tensor` of reduced values.

  Raises:
    ValueError: red_op not supported.
  """
  if red_op == math_ops.add:
    output_tensors = nccl_ops.all_sum(input_tensors)
  else:
    raise ValueError("red_op not supported by NCCL all-reduce: ", red_op)
  if un_op:
    un_op_wrapped = []
    for t in output_tensors:
      with ops.colocate_with(t):
        un_op_wrapped.append(un_op(t))
    output_tensors = un_op_wrapped
  return output_tensors
Example #27
0
  def apply_gradients(self, grads_and_vars, global_step=None, name=None):
    """Apply gradients to variables.

    Calls tpu_ops.cross_replica_sum() to sum gradient contributions across
    replicas, and then applies the real optimizer.

    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        compute_gradients().
      global_step: Optional Variable to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.  Default to the
        name passed to the Optimizer constructor.

    Returns:
      An `Operation` that applies the gradients. If `global_step` was not None,
      that operation also increments `global_step`.

    Raises:
      ValueError: If the grads_and_vars is malformed.
    """
    summed_grads_and_vars = []
    for (grad, var) in grads_and_vars:
      if grad is None:
        summed_grads_and_vars.append((grad, var))
      else:
        with ops.colocate_with(grad):
          summed_grads_and_vars.append((tpu_ops.cross_replica_sum(
              grad, self._group_assignment), var))
    return self._opt.apply_gradients(summed_grads_and_vars, global_step, name)
Example #28
0
def _SumGrad(op, grad):
  """Gradient for Sum."""
  # Fast path for when reducing to a scalar and ndims is known: adds only
  # Reshape and Tile ops (and possibly a Shape).
  input_0_shape = op.inputs[0]._shape_tuple()  # pylint: disable=protected-access
  if input_0_shape is not None:
    axes = tensor_util.constant_value(op.inputs[1])
    if axes is not None:
      rank = len(input_0_shape)
      if np.array_equal(axes, np.arange(rank)):  # Reduce all dims.
        grad = array_ops.reshape(grad, [1] * rank)
        # If shape is not fully defined (but rank is), we use Shape.
        if None not in input_0_shape:
          input_shape = input_0_shape
        else:
          input_shape = array_ops.shape(op.inputs[0])
        return [array_ops.tile(grad, input_shape), None]

  input_shape = array_ops.shape(op.inputs[0])
  # TODO(apassos) remove this once device placement for eager ops makes more
  # sense.
  with ops.colocate_with(input_shape):
    output_shape_kept_dims = math_ops.reduced_shape(input_shape, op.inputs[1])
    tile_scaling = _safe_shape_div(input_shape, output_shape_kept_dims)
  grad = array_ops.reshape(grad, output_shape_kept_dims)
  return [array_ops.tile(grad, tile_scaling), None]
Example #29
0
def _build_shuffle_gather(input_tensors, gather_devices, red_op, un_op=None):
  """Construct the gather (concentrate and reduce) phase of shuffle all-reduce.

  Args:
    input_tensors: list of T @(tf.Tensor} values to be reduced.
    gather_devices: list of names of devices on which reduction shards
      should be placed.
    red_op: the binary reduction Op
    un_op: optional elementwise unary Op to be applied to fully-reduced values.

  Returns:
    list of T `tf.Tensor` which are the fully reduced shards.

  Raises:
    ValueError: inputs not well-formed.
  """
  num_source_devices = len(input_tensors)
  num_gather_devices = len(gather_devices)
  shape = input_tensors[0].shape
  if len(shape) != 1:
    raise ValueError("input_tensors must be 1D")
  shards_by_source = []
  for d in range(0, num_source_devices):
    with ops.colocate_with(input_tensors[d]):
      shards_by_source.append(
          _ragged_split(input_tensors[d], num_gather_devices))
  reduced_shards = []
  for d in range(0, num_gather_devices):
    with ops.device(gather_devices[d]):
      values = [s[d] for s in shards_by_source]
      red_shard = red_op(values)
      if un_op:
        red_shard = un_op(red_shard)
      reduced_shards.append(red_shard)
  return reduced_shards
Example #30
0
def _flatten_tensors(tensors):
  """Check tensors for isomorphism and flatten.

  Args:
    tensors: list of T `tf.Tensor` which must all have the same shape.

  Returns:
    tensors: a list of T `tf.Tensor` which are flattened (1D) views of tensors
    shape: the original shape of each element of input tensors

  Raises:
    ValueError: tensors are empty or non-isomorphic or have unknown shape.
  """
  if not tensors:
    raise ValueError("tensors cannot be empty")
  shape = tensors[0].shape
  for tensor in tensors:
    shape = shape.merge_with(tensor.shape)
  if not shape.is_fully_defined():
    raise ValueError("Tensors must have statically known shape.")
  if len(shape) != 1:
    reshaped = []
    for t in tensors:
      with ops.colocate_with(t):
        reshaped.append(array_ops.reshape(t, [-1]))
    tensors = reshaped
  return tensors, shape
  def __init__(
      self,
      key_dtype=dtypes.int64,
      value_dtype=dtypes.float32,
      dim=1,
      devices=None,
      partitioner=default_partition_fn,
      shared_name=None,
      name="DynamicEmbedding_Variable",
      initializer=None,
      trainable=True,
      checkpoint=True,
      init_size=0,
      kv_creator=None,
      restrict_policy=None,
      bp_v2=False,
  ):
    """Creates an empty `Variable` object.

        Creates a group of tables placed on devices specified by `devices`,
        and the device placement mechanism of TensorFlow will be ignored,
        the type of its keys and values are specified by key_dtype
        and value_dtype, respectively.
        The environment variables 'TF_HASHTABLE_INIT_SIZE' can be used to set the
        inital size of each tables, which can help reduce rehash times.
        The default initial table size is 8,192

        Args:
          key_dtype: the type of the key tensors.
          value_dtype: the type of the value tensors.
          dim: the length of the value array for each key,
            on GPUs, `dim` should be less or equal to 200.
          devices: the list of devices holding the tables.
            One table will be created on each device. By default, `devices` is
            ['/CPU:0'] and when GPU is available, `devices` is ['/GPU:0']
          partitioner: partition function of keys,
            return the partition index for each key.

          Example partition func:
          ```python
          def default_partition_fn(keys, shard_num):
            return tf.cast(keys % shard_num, dtype=tf.int32)
          ```
          shared_name: No used.
          name: A name for the operation (optional).
          initializer: The value to use if a key is missing in the hash table.
            which can be a python number, numpy array or `tf.initializer` instances.
            If initializer is `None` (the default), `0` will be taken.
          trainable: Bool. If true, the variable will be treated as a trainable.
            Default is true.
          checkpoint: if True, the contents of the SparseVariable are
            saved to and restored from checkpoints.
            If `shared_name` is empty for a checkpointed table,
            it is shared using the table node name.
          init_size: initial size for the Variable and initial size of each hash
            tables will be int(init_size / N), N is the number of the devices.
          restrict_policy: a restrict policy to specify the rule to restrict the
            size of variable. If in training program, the variable is updated by
            optimizer, then the sparse slot variables in optimizer are also be
            restricted.
          bp_v2: By default with `bp_v2=False`, the optimizer will update
            dynamic embedding values by *setting* (key, value) after
            `optimizer.apply_gradient`. If one key is used by multiple workers
            at the same time, only one of them will be seen, while the others are
            overwritten. By setting `bp_v2=True`, the optimizer will update
            parameters by *adding delta* instead of *setting*, which solves the
            race condition problem among workers during backpropagation in
            large-scale distributed asynchronous training.

        Returns:
          A `Variable` object.
    """
    self.key_dtype = key_dtype
    self.value_dtype = value_dtype
    self.dim = dim
    self.bp_v2 = bp_v2

    def _get_default_devices():
      gpu_list = [
          x.name
          for x in device_lib.list_local_devices()
          if x.device_type == "GPU"
      ]
      return gpu_list[0:1] or [
          "/CPU:0",
      ]

    devices_ = devices or _get_default_devices()
    self.devices = (devices_ if isinstance(devices_, list) else [
        devices,
    ])
    self.partition_fn = partitioner
    self.name = name
    self.shared_name = shared_name or "shared_name.{}".format(name)

    self.initializer = None

    self.trainable = trainable
    self.checkpoint = checkpoint

    self._tables = data_structures.ListWrapper([])
    self._track_trackable(self._tables,
                          'tables_of_{}'.format(self.name),
                          overwrite=True)
    self.size_ops = []
    self._trainable_store = {}
    self.kv_creator = kv_creator if kv_creator else de.CuckooHashTableCreator()

    self.shard_num = len(self.devices)

    self.init_size = int(init_size)

    if restrict_policy is not None:
      if not issubclass(restrict_policy, de.RestrictPolicy):
        raise TypeError('restrict_policy must be subclass of RestrictPolicy.')
      self._restrict_policy = restrict_policy(self)
    else:
      self._restrict_policy = None

    valid_dtype_list = [[dtypes.int64, dtypes.float32],
                        [dtypes.int64,
                         dtypes.half], [dtypes.int64, dtypes.int32],
                        [dtypes.int64, dtypes.int8],
                        [dtypes.int64, dtypes.int64],
                        [dtypes.int64, dtypes.float64],
                        [dtypes.int64, dtypes.string],
                        [dtypes.int32, dtypes.float32],
                        [dtypes.int32, dtypes.int32],
                        [dtypes.int32, dtypes.float64],
                        [dtypes.string, dtypes.float32],
                        [dtypes.string, dtypes.half],
                        [dtypes.string, dtypes.int32],
                        [dtypes.string, dtypes.int8],
                        [dtypes.string, dtypes.int64],
                        [dtypes.string, dtypes.float64],
                        [dtypes.string, dtypes.bool]]
    if "GPU" in self.devices[0].upper():
      valid_dtype_list = [
          [dtypes.int64, dtypes.float32],
          [dtypes.int64, dtypes.half],
          [dtypes.int64, dtypes.int32],
          [dtypes.int64, dtypes.int8],
          [dtypes.int64, dtypes.int64],
          [dtypes.int32, dtypes.float32],
      ]
    if is_macos() and is_arm64():
      if value_dtype == dtypes.half:
        raise TypeError("""
          float16 value dtype is not supported on macOS with ARM64 architecture. Please try another type.
          """)
    if [key_dtype, value_dtype] not in valid_dtype_list:
      raise TypeError(
          "key-value dtype ({}-{}) is not support! The valid dtypes are \n{}\n".
          format(key_dtype, value_dtype, valid_dtype_list))

    _initializer = initializer
    if _initializer is None:
      _initializer = init_ops.zeros_initializer(dtype=self.value_dtype)
    static_default_value = self._convert_anything_to_init(_initializer, dim)
    scope_name = self.name.split("/")[-1]
    with ops.name_scope(scope_name, "DynamicEmbedding_Variable"):
      with ops.colocate_with(None, ignore_existing=True):
        for idx in range(len(self.devices)):
          with ops.device(self.devices[idx]):
            mht = None
            if not issubclass(self.kv_creator.__class__, de.KVCreator):
              raise TypeError("config should be instance of 'config', but got ",
                              str(type(self.kv_creator)))
            mht = self.kv_creator.create(
                key_dtype=self.key_dtype,
                value_dtype=self.value_dtype,
                default_value=static_default_value,
                name=self._make_name(idx),
                checkpoint=self.checkpoint,
                init_size=int(self.init_size / self.shard_num),
            )
            self._tables.append(mht)
Example #32
0
def _GatherV2Grad(op, grad):
    """Gradient for GatherV2 op."""
    # params can be large, so colocate the shape calculation with it.
    #
    # params can be very large for sparse model, array_ops.shape raises
    # exception on the Windows platform when any dimension is larger than
    # int32. params_shape is not used in optimizer apply_sparse gradients,
    # so it's fine to convert it back to int32 regardless of truncation.
    params = op.inputs[0]
    with ops.colocate_with(params):
        params_shape = array_ops.shape(params, out_type=ops.dtypes.int64)
        params_shape = math_ops.cast(params_shape, dtypes.int32)

    indices = op.inputs[1]
    indices_size = array_ops.expand_dims(array_ops.size(indices), 0)
    axis = op.inputs[2]
    axis_static = tensor_util.constant_value(axis)
    batch_dims = int(op.get_attr("batch_dims"))

    if batch_dims < 0:
        batch_dims += indices.shape.ndims

    # For axis 0 gathers, build an appropriately shaped IndexedSlices.
    if axis_static == 0:
        if context.executing_eagerly():
            params_tail_shape = params_shape.cpu()[1:]
        else:
            params_tail_shape = params_shape[1:]
        values_shape = array_ops.concat([indices_size, params_tail_shape], 0)
        values = array_ops.reshape(_IndexedSlicesToTensorNoWarning(grad),
                                   values_shape)
        indices = array_ops.reshape(indices, indices_size)
        params_grad = ops.IndexedSlices(values, indices, params_shape)
    else:
        # Handle axis by transposing the axis dimension to be the first non-batch
        # dimension, compute the gradiend and transpose the result back.
        outer_shape = params_shape[:axis]
        inner_shape = params_shape[axis:][1:]
        values_shape = array_ops.concat([outer_shape, [-1], inner_shape], 0)

        values_dims = array_ops.size(values_shape)
        axis_dims = array_ops.size(outer_shape)

        outer_batches_indices = math_ops.range(batch_dims)
        batch_axis_indices = math_ops.range(batch_dims, axis_dims)
        inner_axes_indices = math_ops.range(axis_dims + 1, values_dims)

        values = array_ops.reshape(_IndexedSlicesToTensorNoWarning(grad),
                                   values_shape)

        # Move values[axis] up to values[batch_dims]
        transpose_dims = array_ops.concat([
            outer_batches_indices, [axis_dims], batch_axis_indices,
            inner_axes_indices
        ], 0)
        values_transpose = array_ops.transpose(values, transpose_dims)

        params_grad = _BatchGatherGrad(params_shape, values_transpose, indices,
                                       batch_dims, params_shape[axis])

        # Inverts the above transpose by moving dimension batch_dims back to its
        # original position.
        invert_transpose_dims = array_ops.concat([
            outer_batches_indices, batch_axis_indices + 1, [batch_dims],
            inner_axes_indices
        ], 0)
        params_grad = array_ops.transpose(params_grad, invert_transpose_dims)

    return [params_grad, None, None]
    def _init_from_args(self,
                        initial_value=None,
                        trainable=True,
                        collections=None,
                        validate_shape=True,
                        caching_device=None,
                        name=None,
                        dtype=None,
                        constraint=None):
        """Creates a variable.

    Args:
      initial_value: A `Tensor`, or Python object convertible to a `Tensor`,
        which is the initial value for the Variable. The initial value must have
        a shape specified unless `validate_shape` is set to False. Can also be a
        callable with no argument that returns the initial value when called.
        (Note that initializer functions from init_ops.py must first be bound
         to a shape before being used here.)
      trainable: If `True`, the default, also adds the variable to the graph
        collection `GraphKeys.TRAINABLE_VARIABLES`. This collection is used as
        the default list of variables to use by the `Optimizer` classes.
      collections: List of graph collections keys. The new variable is added to
        these collections. Defaults to `[GraphKeys.GLOBAL_VARIABLES]`.
      validate_shape: Ignored. Provided for compatibility with tf.Variable.
      caching_device: Optional device string or function describing where the
        Variable should be cached for reading.  Defaults to the Variable's
        device.  If not `None`, caches on another device.  Typical use is to
        cache on the device where the Ops using the Variable reside, to
        deduplicate copying through `Switch` and other conditional statements.
      name: Optional name for the variable. Defaults to `'Variable'` and gets
        uniquified automatically.
      dtype: If set, initial_value will be converted to the given type.
        If None, either the datatype will be kept (if initial_value is
       a Tensor) or float32 will be used (if it is a Python object convertible
       to a Tensor).
      constraint: An optional projection function to be applied to the variable
        after being updated by an `Optimizer` (e.g. used to implement norm
        constraints or value constraints for layer weights). The function must
        take as input the unprojected Tensor representing the value of the
        variable and return the Tensor for the projected value
        (which must have the same shape). Constraints are not safe to
        use when doing asynchronous distributed training.

    Raises:
      ValueError: If the initial value is not specified, or does not have a
        shape and `validate_shape` is `True`.

    @compatibility(eager)
    When Eager Execution is enabled, variables are never added to collections.
    It is not implicitly added to the GLOBAL_VARIABLES or TRAINABLE_VARIABLES
    collections, and the `collections` argument is ignored.
    @end_compatibility
    """
        if initial_value is None:
            raise ValueError("initial_value must be specified.")
        init_from_fn = callable(initial_value)

        if collections is None:
            collections = [ops.GraphKeys.GLOBAL_VARIABLES]
        if not isinstance(collections, (list, tuple, set)):
            raise ValueError(
                "collections argument to Variable constructor must be a list, tuple, "
                "or set. Got %s of type %s" % (collections, type(collections)))
        if constraint is not None and not callable(constraint):
            raise ValueError("The `constraint` argument must be a callable.")

        self._trainable = trainable
        if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections:
            collections = list(collections) + [
                ops.GraphKeys.TRAINABLE_VARIABLES
            ]
        self._save_slice_info = None
        self._in_graph_mode = context.in_graph_mode()
        with ops.control_dependencies(None):
            with ops.name_scope(
                    name, "Variable",
                [] if init_from_fn else [initial_value]) as name:
                # pylint: disable=protected-access
                handle_name = ops._name_from_scope_name(name)
                if init_from_fn:
                    # Use attr_scope and device(None) to simulate the behavior of
                    # colocate_with when the variable we want to colocate with doesn't
                    # yet exist.
                    if self._in_graph_mode:
                        attr = attr_value_pb2.AttrValue(
                            list=attr_value_pb2.AttrValue.ListValue(
                                s=[compat.as_bytes("loc:@%s" % handle_name)]))
                        with ops.get_default_graph()._attr_scope(
                            {"_class": attr}):
                            with ops.name_scope("Initializer"), ops.device(
                                    None):
                                initial_value = ops.convert_to_tensor(
                                    initial_value(),
                                    name="initial_value",
                                    dtype=dtype)
                            self._handle = _eager_safe_variable_handle(
                                shape=initial_value.get_shape(),
                                dtype=initial_value.dtype.base_dtype,
                                shared_name=handle_name,
                                name=name,
                                graph_mode=self._in_graph_mode)
                            self._handle_device = (
                                self._handle.device if self._in_graph_mode else
                                context.get_default_context().device_name)
                    else:
                        initial_value = initial_value()
                        with ops.name_scope("Initializer"):
                            initial_value = ops.convert_to_tensor(
                                initial_value,
                                name="initial_value",
                                dtype=dtype)
                        self._handle = _eager_safe_variable_handle(
                            shape=initial_value.get_shape(),
                            dtype=initial_value.dtype.base_dtype,
                            shared_name=handle_name,
                            name=name,
                            graph_mode=False,
                            container="")
                        self._handle_device = (
                            self._handle.device if self._in_graph_mode else
                            context.get_default_context().device_name)
                # pylint: enable=protected-access

                # Or get the initial value from a Tensor or Python object.
                else:
                    with ops.name_scope("Initializer"):
                        initial_value = ops.convert_to_tensor(
                            initial_value, name="initial_value", dtype=dtype)
                    # pylint: disable=protected-access
                    if (self._in_graph_mode and initial_value is not None
                            and initial_value.op._get_control_flow_context()
                            is not None):
                        raise ValueError(
                            "Initializer for variable %s is from inside a control-flow "
                            "construct, such as a loop or conditional. When creating a "
                            "variable inside a loop or conditional, use a lambda as the "
                            "initializer." % name)
                    # pylint: enable=protected-access
                    self._handle = _eager_safe_variable_handle(
                        shape=initial_value.get_shape(),
                        dtype=initial_value.dtype.base_dtype,
                        shared_name=handle_name,
                        name=name,
                        graph_mode=self._in_graph_mode,
                        container="")
                    self._handle_device = (
                        self._handle.device if self._in_graph_mode else
                        context.get_default_context().device_name)

                self._initial_value = initial_value if self._in_graph_mode else None
                self._handle_name = handle_name + ":0"
                self._dtype = initial_value.dtype.base_dtype
                self._constraint = constraint

                if self._in_graph_mode:
                    with ops.name_scope("IsInitialized"):
                        self._is_initialized_op = (
                            gen_resource_variable_ops.var_is_initialized_op(
                                self._handle))
                    if initial_value is not None:
                        with ops.name_scope("Assign") as n, ops.colocate_with(
                                self._handle):
                            self._initializer_op = (
                                gen_resource_variable_ops.assign_variable_op(
                                    self._handle,
                                    self._build_initializer_expr(
                                        initial_value),
                                    name=n))
                    with ops.name_scope("Read"), ops.colocate_with(
                            self._handle):
                        # Manually assign reads to the handle's device to avoid log
                        # messages.
                        with ops.device(self._handle_device):
                            value = self._read_variable_op()
                        self._graph_element = value
                        if caching_device is not None:
                            # Variables may be created in a tf.device() or ops.colocate_with()
                            # context. At the same time, users would expect caching device to
                            # be independent of this context, and/or would not expect the
                            # current device context to be merged with the caching device
                            # spec.  Therefore we reset the colocation stack before creating
                            # the cached value. Note that resetting the colocation stack will
                            # also reset the device stack.
                            with ops.colocate_with(None, ignore_existing=True):
                                with ops.device(caching_device):
                                    self._cached_value = array_ops.identity(
                                        value)
                        else:
                            self._cached_value = None
                else:
                    gen_resource_variable_ops.assign_variable_op(
                        self._handle, initial_value)
                    self._is_initialized_op = None
                    self._initializer_op = None
                    self._graph_element = None
                    if caching_device:
                        with ops.device(caching_device):
                            self._cached_value = self._read_variable_op()
                    else:
                        self._cached_value = None
                if context.in_graph_mode():
                    ops.add_to_collections(collections, self)
                elif ops.GraphKeys.GLOBAL_STEP in collections:
                    ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, self)
Example #34
0
 def Body(v):
     with ops.colocate_with(external_t):
         return v * v
Example #35
0
    def _init_from_args(self,
                        initial_value=None,
                        trainable=True,
                        collections=None,
                        validate_shape=True,
                        caching_device=None,
                        name=None,
                        dtype=None):
        """Creates a new variable from arguments.

    Args:
      initial_value: A `Tensor`, or Python object convertible to a `Tensor`,
        which is the initial value for the Variable. The initial value must have
        a shape specified unless `validate_shape` is set to False. Can also be a
        callable with no argument that returns the initial value when called. In
        that case, `dtype` must be specified. (Note that initializer functions
        from init_ops.py must first be bound to a shape before being used here.)
      trainable: If `True`, the default, also adds the variable to the graph
        collection `GraphKeys.TRAINABLE_VARIABLES`. This collection is used as
        the default list of variables to use by the `Optimizer` classes.
      collections: List of graph collections keys. The new variable is added to
        these collections. Defaults to `[GraphKeys.VARIABLES]`.
      validate_shape: If `False`, allows the variable to be initialized with a
        value of unknown shape. If `True`, the default, the shape of
        `initial_value` must be known.
      caching_device: Optional device string or function describing where the
        Variable should be cached for reading.  Defaults to the Variable's
        device.  If not `None`, caches on another device.  Typical use is to
        cache on the device where the Ops using the Variable reside, to
        deduplicate copying through `Switch` and other conditional statements.
      name: Optional name for the variable. Defaults to `'Variable'` and gets
        uniquified automatically.
      dtype: If set, initial_value will be converted to the given type.
        If None, either the datatype will be kept (if initial_value is
       a Tensor) or float32 will be used (if it is a Python object convertible
       to a Tensor).

    Raises:
      ValueError: If the initial value is not specified, or does not have a
        shape and `validate_shape` is `True`.
    """
        if initial_value is None:
            raise ValueError("initial_value must be specified.")
        init_from_fn = callable(initial_value)
        if init_from_fn and dtype is None:
            raise ValueError(
                "dtype must also be specified when initial_value is callable.")

        if collections is None:
            collections = [ops.GraphKeys.VARIABLES]
        if not isinstance(collections, (list, tuple, set)):
            raise ValueError(
                "collections argument to Variable constructor must be a list, tuple, "
                "or set. Got %s of type %s" % (collections, type(collections)))
        if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections:
            collections = list(collections) + [
                ops.GraphKeys.TRAINABLE_VARIABLES
            ]
        with ops.control_dependencies(None):
            with ops.name_scope(
                    name, "Variable",
                [] if init_from_fn else [initial_value]) as name:

                # Get the initial value from a callable function. The real shape of the
                # variable will be set later, since under the init_from_fn case, the
                # shape won't be known until after the function is invoked.
                if init_from_fn:
                    self._variable = state_ops.variable_op([],
                                                           dtype.base_dtype,
                                                           set_shape=False,
                                                           name=name)
                    with ops.colocate_with(self._variable.op):
                        with ops.name_scope("Initializer"):
                            # Colocate the tensors created by the initial_value() function
                            # with the variable itself.
                            self._initial_value = ops.convert_to_tensor(
                                initial_value(),
                                name="initial_value",
                                dtype=dtype)

                # Or get the initial value from a Tensor or Python object.
                else:
                    self._initial_value = ops.convert_to_tensor(
                        initial_value, name="initial_value", dtype=dtype)
                    # In this case, the variable op can't be created until after the
                    # initial_value has been converted to a Tensor with a known type.
                    self._variable = state_ops.variable_op(
                        [],
                        self._initial_value.dtype.base_dtype,
                        set_shape=False,
                        name=name)

                # Manually overrides the variable's shape with the initial value's.
                if validate_shape:
                    initial_value_shape = self._initial_value.get_shape()
                    if not initial_value_shape.is_fully_defined():
                        raise ValueError(
                            "initial_value must have a shape specified: %s" %
                            self._initial_value)
                    self._variable.set_shape(initial_value_shape)
                    # TODO(b/28152992): Remove the below hack modifying the node_def shape
                    # directly once set_shape() handles it.
                    self._variable.op.node_def.attr["shape"].shape.CopyFrom(
                        initial_value_shape.as_proto())

                # Assigns initial value.
                self._initializer_op = state_ops.assign(
                    self._variable,
                    self._initial_value,
                    validate_shape=validate_shape).op

                # TODO(vrv): Change this class to not take caching_device, but
                # to take the op to colocate the snapshot with, so we can use
                # colocation rather than devices.
                if caching_device is not None:
                    with ops.device(caching_device):
                        self._snapshot = array_ops.identity(self._variable,
                                                            name="read")
                else:
                    with ops.colocate_with(self._variable.op):
                        self._snapshot = array_ops.identity(self._variable,
                                                            name="read")

        ops.add_to_collections(collections, self)
        self._caching_device = caching_device
        self._save_slice_info = None
Example #36
0
    def add_variable(self,
                     name,
                     shape,
                     dtype=None,
                     initializer=None,
                     regularizer=None,
                     trainable=True):
        """Adds a new variable to the layer, or gets an existing one; returns it.

    Arguments:
      name: variable name.
      shape: variable shape.
      dtype: The type of the variable. Defaults to `self.dtype`.
      initializer: initializer instance (callable).
      regularizer: regularizer instance (callable).
      trainable: whether the variable should be part of the layer's
        "trainable_variables" (e.g. variables, biases)
        or "non_trainable_variables" (e.g. BatchNorm mean, stddev).

    Returns:
      The created variable.
    """
        if dtype is None:
            dtype = self.dtype
        existing_variables = set(tf_variables.global_variables())

        self._set_scope(None)

        with vs.variable_scope(self._scope, reuse=self.built
                               or self._reuse) as scope:
            with ops.name_scope(scope.original_name_scope):
                variable = vs.get_variable(name,
                                           shape=shape,
                                           initializer=initializer,
                                           dtype=dtypes.as_dtype(dtype),
                                           trainable=trainable
                                           and self.trainable)
                if variable in existing_variables:
                    return variable
                if regularizer:
                    # To match the behavior of tf.get_variable(), we only
                    # apply regularization if the variable is newly created.
                    if isinstance(variable, tf_variables.PartitionedVariable):
                        for v in variable:
                            with ops.colocate_with(v.op):
                                with ops.name_scope(name + '/Regularizer'):
                                    regularization = regularizer(v)
                            if regularization is not None:
                                self.add_loss(regularization)
                                _add_elements_to_collection(
                                    regularization,
                                    ops.GraphKeys.REGULARIZATION_LOSSES)
                    else:
                        with ops.colocate_with(variable.op):
                            with ops.name_scope(name + '/Regularizer'):
                                regularization = regularizer(variable)
                        if regularization is not None:
                            self.add_loss(regularization)
                            _add_elements_to_collection(
                                regularization,
                                ops.GraphKeys.REGULARIZATION_LOSSES)
        if trainable:
            self._trainable_weights.append(variable)
        else:
            self._non_trainable_weights.append(variable)
        return variable
  def _mini_batch_training_op(self, inputs, cluster_idx_list,
                              cluster_centers, total_counts):
    """Creates an op for training for mini batch case.

    Args:
      inputs: list of input Tensors.
      cluster_idx_list: A vector (or list of vectors). Each element in the
        vector corresponds to an input row in 'inp' and specifies the cluster id
        corresponding to the input.
      cluster_centers: Tensor Ref of cluster centers.
      total_counts: Tensor Ref of cluster counts.

    Returns:
      An op for doing an update of mini-batch k-means.
    """
    update_ops = []
    for inp, cluster_idx in zip(inputs, cluster_idx_list):
      with ops.colocate_with(inp):
        assert total_counts is not None
        cluster_idx = array_ops.reshape(cluster_idx, [-1])
        # Dedupe the unique ids of cluster_centers being updated so that updates
        # can be locally aggregated.
        unique_ids, unique_idx = array_ops.unique(cluster_idx)
        num_unique_cluster_idx = array_ops.size(unique_ids)
        # Fetch the old values of counts and cluster_centers.
        with ops.colocate_with(total_counts):
          old_counts = array_ops.gather(total_counts, unique_ids)
        # TODO(agarwal): This colocation seems to run into problems. Fix it.
        # with ops.colocate_with(cluster_centers):
        old_cluster_centers = array_ops.gather(cluster_centers, unique_ids)
        # Locally aggregate the increment to counts.
        count_updates = math_ops.unsorted_segment_sum(
            array_ops.ones_like(
                unique_idx, dtype=total_counts.dtype),
            unique_idx,
            num_unique_cluster_idx)
        # Locally compute the sum of inputs mapped to each id.
        # For a cluster with old cluster value x, old count n, and with data
        # d_1,...d_k newly assigned to it, we recompute the new value as
        # x += (sum_i(d_i) - k * x) / (n + k).
        # Compute sum_i(d_i), see comment above.
        cluster_center_updates = math_ops.unsorted_segment_sum(
            inp, unique_idx, num_unique_cluster_idx)
        # Shape to enable broadcasting count_updates and learning_rate to inp.
        # It extends the shape with 1's to match the rank of inp.
        broadcast_shape = array_ops.concat(
            [
                array_ops.reshape(num_unique_cluster_idx, [1]), array_ops.ones(
                    array_ops.reshape(array_ops.rank(inp) - 1, [1]),
                    dtype=dtypes.int32)
            ],
            0)
        # Subtract k * x, see comment above.
        cluster_center_updates -= math_ops.cast(
            array_ops.reshape(count_updates, broadcast_shape),
            inp.dtype) * old_cluster_centers
        learning_rate = math_ops.reciprocal(
            math_ops.cast(old_counts + count_updates, inp.dtype))
        learning_rate = array_ops.reshape(learning_rate, broadcast_shape)
        # scale by 1 / (n + k), see comment above.
        cluster_center_updates *= learning_rate
        # Apply the updates.
      update_counts = state_ops.scatter_add(
          total_counts,
          unique_ids,
          count_updates)
      update_cluster_centers = state_ops.scatter_add(
          cluster_centers,
          unique_ids,
          cluster_center_updates)
      update_ops.extend([update_counts, update_cluster_centers])
    return control_flow_ops.group(*update_ops)
Example #38
0
 def value(self):
     """A cached operation which reads the value of this variable."""
     if self._cached_value is not None:
         return self._cached_value
     with ops.colocate_with(None, ignore_existing=True):
         return self._read_variable_op()
Example #39
0
    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        """Apply gradients to variables.

    This is the second part of `minimize()`. It returns an `Operation` that
    applies gradients.

    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        `compute_gradients()`.
      global_step: Optional `Variable` to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.  Default to the
        name passed to the `Optimizer` constructor.

    Returns:
      An `Operation` that applies the specified gradients. If `global_step`
      was not None, that operation also increments `global_step`.

    Raises:
      TypeError: If `grads_and_vars` is malformed.
      ValueError: If none of the variables have gradients.
    """
        # This is a default implementation of apply_gradients() that can be shared
        # by most optimizers.  It relies on the subclass implementing the following
        # methods: _create_slots(), _prepare(), _apply_dense(), and _apply_sparse().

        grads_and_vars = tuple(
            grads_and_vars)  # Make sure repeat iteration works.
        if not grads_and_vars:
            raise ValueError("No variables provided.")
        converted_grads_and_vars = []
        for g, v in grads_and_vars:
            if g is not None:
                try:
                    # Convert the grad to Tensor or IndexedSlices if necessary.
                    g = ops.convert_to_tensor_or_indexed_slices(g)
                except TypeError:
                    raise TypeError("Gradient must be convertible to a Tensor"
                                    " or IndexedSlices, or None: %s" % g)
                if not isinstance(g, (ops.Tensor, ops.IndexedSlices)):
                    raise TypeError(
                        "Gradient must be a Tensor, IndexedSlices, or None: %s"
                        % g)
            p = _get_processor(v)
            converted_grads_and_vars.append((g, v, p))

        converted_grads_and_vars = tuple(converted_grads_and_vars)
        var_list = [v for g, v, _ in converted_grads_and_vars if g is not None]
        if not var_list:
            raise ValueError("No gradients provided for any variable: %s." %
                             ([str(v)
                               for _, _, v in converted_grads_and_vars], ))
        with ops.init_scope():
            self._create_slots([_get_variable_for(v) for v in var_list])
        update_ops = []
        with ops.name_scope(name, self._name) as name:
            self._prepare()
            for grad, var, processor in converted_grads_and_vars:
                if grad is None:
                    continue
                # We colocate all ops created in _apply_dense or _apply_sparse
                # on the same device as the variable.
                # TODO(apassos): figure out how to get the variable name here.
                scope_name = "" if context.executing_eagerly() else var.op.name
                with ops.name_scope("update_" +
                                    scope_name), ops.colocate_with(var):
                    update_ops.append(processor.update_op(self, grad))
            if global_step is None:
                apply_updates = self._finish(update_ops, name)
            else:
                with ops.control_dependencies(
                    [self._finish(update_ops, "update")]):
                    with ops.colocate_with(global_step):
                        if isinstance(global_step,
                                      resource_variable_ops.ResourceVariable):
                            # TODO(apassos): the implicit read in assign_add is slow; consider
                            # making it less so.
                            apply_updates = resource_variable_ops.assign_add_variable_op(
                                global_step.handle,
                                ops.convert_to_tensor(1,
                                                      dtype=global_step.dtype),
                                name=name)
                        else:
                            apply_updates = state_ops.assign_add(global_step,
                                                                 1,
                                                                 name=name)

            if not context.executing_eagerly():
                if isinstance(apply_updates, ops.Tensor):
                    apply_updates = apply_updates.op
                train_op = ops.get_collection_ref(ops.GraphKeys.TRAIN_OP)
                if apply_updates not in train_op:
                    train_op.append(apply_updates)

            return apply_updates
    def _init_from_args(
        self,
        initial_value=None,
        trainable=None,
        collections=None,
        caching_device=None,
        name=None,
        dtype=None,
        constraint=None,
        synchronization=None,
        aggregation=None,
        distribute_strategy=None,
        shape=None,
    ):
        """Creates a variable.

        Args:
          initial_value: A `Tensor`, or Python object convertible to a `Tensor`,
            which is the initial value for the Variable. The initial value must have
            a shape specified unless `validate_shape` is set to False. Can also be a
            callable with no argument that returns the initial value when called.
            (Note that initializer functions from init_ops.py must first be bound
             to a shape before being used here.)
          trainable: If `True`, the default, also adds the variable to the graph
            collection `GraphKeys.TRAINABLE_VARIABLES`. This collection is used as
            the default list of variables to use by the `Optimizer` classes.
            Defaults to `True`, unless `synchronization` is set to `ON_READ`, in
            which case it defaults to `False`.
          collections: List of graph collections keys. The new variable is added to
            these collections. Defaults to `[GraphKeys.GLOBAL_VARIABLES]`.
          caching_device: Optional device string or function describing where the
            Variable should be cached for reading.  Defaults to the Variable's
            device.  If not `None`, caches on another device.  Typical use is to
            cache on the device where the Ops using the Variable reside, to
            deduplicate copying through `Switch` and other conditional statements.
          name: Optional name for the variable. Defaults to `'Variable'` and gets
            uniquified automatically.
          dtype: If set, initial_value will be converted to the given type.
            If None, either the datatype will be kept (if initial_value is
           a Tensor) or float32 will be used (if it is a Python object convertible
           to a Tensor).
          constraint: An optional projection function to be applied to the variable
            after being updated by an `Optimizer` (e.g. used to implement norm
            constraints or value constraints for layer weights). The function must
            take as input the unprojected Tensor representing the value of the
            variable and return the Tensor for the projected value
            (which must have the same shape). Constraints are not safe to
            use when doing asynchronous distributed training.
          synchronization: Indicates when a distributed a variable will be
            aggregated. Accepted values are constants defined in the class
            `tf.VariableSynchronization`. By default the synchronization is set to
            `AUTO` and the current `DistributionStrategy` chooses
            when to synchronize.
          aggregation: Indicates how a distributed variable will be aggregated.
            Accepted values are constants defined in the class
            `tf.VariableAggregation`.
          distribute_strategy: DistributionStrategy under which this variable
            was created.
          shape: (optional) The shape of this variable. If None, the shape of
            `initial_value` will be used. When setting this argument to
            `tf.TensorShape(None)` (representing an unspecified shape), the variable
            can be assigned with values of different shapes.

        Raises:
          ValueError: If the initial value is not specified, or does not have a
            shape and `validate_shape` is `True`.

        @compatibility(eager)
        When Eager Execution is enabled, variables are never added to collections.
        It is not implicitly added to the `GLOBAL_VARIABLES` or
        `TRAINABLE_VARIABLES` collections, and the `collections` argument is
        ignored.
        @end_compatibility
        """
        (
            synchronization,
            aggregation,
            trainable,
        ) = variables.validate_synchronization_aggregation_trainable(
            synchronization, aggregation, trainable, name)
        if initial_value is None:
            raise ValueError("initial_value must be specified.")
        init_from_fn = callable(initial_value)

        if (isinstance(initial_value, ops.Tensor)
                and hasattr(initial_value, "graph")
                and initial_value.graph.building_function):
            raise ValueError(
                "Tensor-typed variable initializers must either be "
                "wrapped in an init_scope or callable "
                "(e.g., `tf.Variable(lambda : "
                "tf.truncated_normal([10, 40]))`) when building "
                "functions. Please file a feature request if this "
                "restriction inconveniences you.")

        if collections is None:
            collections = [ops.GraphKeys.GLOBAL_VARIABLES]
        if not isinstance(collections, (list, tuple, set)):
            raise ValueError(
                "collections argument to Variable constructor must be a list, tuple, "
                "or set. Got %s of type %s" % (collections, type(collections)))
        if constraint is not None and not callable(constraint):
            raise ValueError("The `constraint` argument must be a callable.")

        if isinstance(initial_value, trackable.CheckpointInitialValue):
            self._maybe_initialize_trackable()
            self._update_uid = initial_value.checkpoint_position.restore_uid
            initial_value = initial_value.wrapped_value

        if trainable and ops.GraphKeys.TRAINABLE_VARIABLES not in collections:
            collections = list(collections) + [
                ops.GraphKeys.TRAINABLE_VARIABLES
            ]
        with ops.init_scope():
            self._in_graph_mode = not context.executing_eagerly()
            with ops.name_scope(
                    name, "TrainableWrapper",
                [] if init_from_fn else [initial_value]) as name:
                # pylint: disable=protected-access
                handle_name = ops.name_from_scope_name(name)
                handle_name = handle_name or "TrainableWrapperHandle"
                if self._in_graph_mode:
                    shared_name = handle_name
                    unique_id = shared_name
                else:
                    # When in eager mode use a uid for the shared_name, to prevent
                    # accidental sharing.
                    unique_id = "%s_%d" % (handle_name, ops.uid())
                    shared_name = None  # Never shared
                # Use attr_scope and device(None) to simulate the behavior of
                # colocate_with when the variable we want to colocate with doesn't
                # yet exist.
                device_context_manager = (ops.device if self._in_graph_mode
                                          else ops.NullContextmanager)
                attr = attr_value_pb2.AttrValue(
                    list=attr_value_pb2.AttrValue.ListValue(
                        s=[compat.as_bytes("loc:@%s" % handle_name)]))
                with ops.get_default_graph()._attr_scope({"_class": attr}):
                    with ops.name_scope("Initializer"), device_context_manager(
                            None):
                        initial_value = ops.convert_to_tensor(
                            initial_value() if init_from_fn else initial_value,
                            name="initial_value",
                            dtype=dtype,
                        )
                    if shape is None:
                        shape = initial_value.shape
                    handle = resource_variable_ops.eager_safe_variable_handle(
                        initial_value=initial_value,
                        shape=None,  # shape,
                        shared_name=shared_name,
                        name=name,
                        graph_mode=self._in_graph_mode,
                    )
                # pylint: disable=protected-access
                if (self._in_graph_mode and initial_value is not None
                        and initial_value.op._get_control_flow_context()
                        is not None):
                    raise ValueError(
                        "Initializer for variable %s is from inside a control-flow "
                        "construct, such as a loop or conditional. When creating a "
                        "variable inside a loop or conditional, use a lambda as the "
                        "initializer." % name)
                # pylint: enable=protected-access
                dtype = initial_value.dtype.base_dtype

                if self._in_graph_mode:
                    with ops.name_scope("IsInitialized"):
                        is_initialized_op = (gen_resource_variable_ops.
                                             var_is_initialized_op(handle))
                    if initial_value is not None:
                        # pylint: disable=g-backslash-continuation
                        with ops.name_scope("Assign") as n, ops.colocate_with(
                                None, ignore_existing=True), ops.device(
                                    handle.device):
                            # pylint: disable=protected-access
                            initializer_op = gen_resource_variable_ops.assign_variable_op(
                                handle,
                                variables.
                                _try_guard_against_uninitialized_dependencies(
                                    name, initial_value),
                                name=n,
                            )
                            # pylint: enable=protected-access
                        # pylint: enable=g-backslash-continuation
                    with ops.name_scope("Read"):
                        # Manually assign reads to the handle's device to avoid log
                        # messages.
                        with ops.device(handle.device):
                            with ops.control_dependencies([
                                    gen_resource_variable_ops.
                                    assign_variable_op(
                                        handle,
                                        self.prefetch_values(),
                                        name="AssignBeforeInitRead",
                                    )
                            ]):
                                value = gen_resource_variable_ops.read_variable_op(
                                    handle, dtype)
                        graph_element = value
                        if caching_device is not None:
                            # Variables may be created in a tf.device() or ops.colocate_with()
                            # context. At the same time, users would expect caching device to
                            # be independent of this context, and/or would not expect the
                            # current device context to be merged with the caching device
                            # spec.  Therefore we reset the colocation stack before creating
                            # the cached value. Note that resetting the colocation stack will
                            # also reset the device stack.
                            with ops.colocate_with(None, ignore_existing=True):
                                with ops.device(caching_device):
                                    cached_value = array_ops.identity(value)
                        else:
                            cached_value = None
                else:
                    gen_resource_variable_ops.assign_variable_op(
                        handle, initial_value)
                    is_initialized_op = None
                    initializer_op = None
                    graph_element = None
                    if caching_device:
                        with ops.device(caching_device):
                            with ops.control_dependencies([
                                    gen_resource_variable_ops.
                                    assign_variable_op(
                                        handle,
                                        self.prefetch_values(),
                                        name="AssignBeforeInitRead",
                                    )
                            ]):
                                cached_value = (
                                    gen_resource_variable_ops.read_variable_op(
                                        handle, dtype))
                    else:
                        cached_value = None
                if not context.executing_eagerly():
                    # Eager variables are only added to collections if they are part of an
                    # eager variable store (otherwise in an interactive session they would
                    # hog memory and cause OOM). This is done in ops/variable_scope.py.
                    ops.add_to_collections(collections, self)
                elif ops.GraphKeys.GLOBAL_STEP in collections:
                    ops.add_to_collections(ops.GraphKeys.GLOBAL_STEP, self)
            initial_value = initial_value if self._in_graph_mode else None
            super(resource_variable_ops.ResourceVariable, self).__init__(
                trainable=trainable,
                shape=shape,
                dtype=dtype,
                handle=handle,
                synchronization=synchronization,
                constraint=constraint,
                aggregation=aggregation,
                distribute_strategy=distribute_strategy,
                name=name,
                unique_id=unique_id,
                handle_name=handle_name,
                graph_element=graph_element,
                initial_value=initial_value,
                initializer_op=initializer_op,
                is_initialized_op=is_initialized_op,
                cached_value=cached_value,
            )
Example #41
0
    def _renorm_correction_and_moments(self, mean, variance, training):
        """Returns the correction and update values for renorm."""
        stddev = math_ops.sqrt(variance + self.epsilon)
        # Compute the average mean and standard deviation, as if they were
        # initialized with this batch's moments.
        mixed_renorm_mean = (self.renorm_mean +
                             (1. - self.renorm_mean_weight) * mean)
        mixed_renorm_stddev = (self.renorm_stddev +
                               (1. - self.renorm_stddev_weight) * stddev)
        # Compute the corrections for batch renorm.
        r = stddev / mixed_renorm_stddev
        d = (mean - mixed_renorm_mean) / mixed_renorm_stddev
        # Ensure the corrections use pre-update moving averages.
        with ops.control_dependencies([r, d]):
            mean = array_ops.identity(mean)
            stddev = array_ops.identity(stddev)
        rmin, rmax, dmax = [
            self.renorm_clipping.get(key) for key in ['rmin', 'rmax', 'dmax']
        ]
        if rmin is not None:
            r = math_ops.maximum(r, rmin)
        if rmax is not None:
            r = math_ops.minimum(r, rmax)
        if dmax is not None:
            d = math_ops.maximum(d, -dmax)
            d = math_ops.minimum(d, dmax)
        # When not training, use r=1, d=0.
        r = utils.smart_cond(training, lambda: r,
                             lambda: array_ops.ones_like(r))
        d = utils.smart_cond(training, lambda: d,
                             lambda: array_ops.zeros_like(d))

        def _update_renorm_variable(var, weight, value):
            """Updates a moving average and weight, returns the unbiased value."""
            value = array_ops.identity(value)

            def _do_update():
                # Update the variables without zero debiasing. The debiasing will be
                # accomplished by dividing the exponential moving average by the weight.
                # For example, after a single update, the moving average would be
                # (1-decay) * value. and the weight will be 1-decay, with their ratio
                # giving the value.
                # Make sure the weight is not updated until before r and d computation.
                with ops.control_dependencies([value]):
                    weight_value = array_ops.constant(1., dtype=weight.dtype)
                new_var = moving_averages.assign_moving_average(
                    var, value, self.renorm_momentum, zero_debias=False)
                new_weight = moving_averages.assign_moving_average(
                    weight,
                    weight_value,
                    self.renorm_momentum,
                    zero_debias=False)
                return new_var / new_weight

            def _fake_update():
                return array_ops.identity(var)

            return utils.smart_cond(training, _do_update, _fake_update)

        with ops.colocate_with(self.moving_mean):
            new_mean = _update_renorm_variable(self.renorm_mean,
                                               self.renorm_mean_weight, mean)
        with ops.colocate_with(self.moving_variance):
            new_stddev = _update_renorm_variable(self.renorm_stddev,
                                                 self.renorm_stddev_weight,
                                                 stddev)
            # Make sqrt(moving_variance + epsilon) = new_stddev.
            new_variance = math_ops.square(new_stddev) - self.epsilon

        return (r, d, new_mean, new_variance)
Example #42
0
def _embedding_lookup_and_transform(params,
                                    ids,
                                    partition_strategy="mod",
                                    name=None,
                                    max_norm=None,
                                    transform_fn=None):
  """Helper function for embedding_lookup and _compute_sampled_logits.

  This function is a generalization of embedding_lookup that optionally
  applies a caller-specified transformation to each embedding. This is
  done through the `transform_fn` argument. If provided, the function is
  applied to each partitioned tensor of retrieved embeddings, colocated
  with the embeddings. This function will be called with a single `Tensor`
  argument of the same type as the `params` tensor and should return a
  `Tensor`. The shape of the argument will be the same as `params` except
  for the size of the first dimension. The first dimension of the result's
  shape must be the same size as the argument's.

  Args:
    params: See embedding_lookup.
    ids: See embedding_lookup.
    partition_strategy: See embedding_lookup.
    name: See embedding_lookup.
    max_norm: See embedding_lookup.
    transform_fn: An optional function to apply to each retrieved embedding. If
      max_norm is provided, transform_fn is applied to the norm-limited
      embeddings.

  Returns:
    See embedding_lookup for details.
  Raises:
    ValueError: If `params` is empty.
  """
  if params is None:
    raise ValueError("params must be specified")
  if isinstance(params, (list, tuple)) and not params:
    raise ValueError("Need at least one param")
  if isinstance(params, variables.PartitionedVariable):
    params = list(params)  # Iterate to get the underlying Variables.
  if not isinstance(params, list):
    params = [params]

  with ops.name_scope(name, "embedding_lookup", params + [ids]) as name:
    np = len(params)  # Number of partitions
    # Preserve the resource variable status to avoid accidental dense reads.
    if not any(
        isinstance(p, resource_variable_ops.BaseResourceVariable)
        for p in params):
      params = ops.convert_n_to_tensor_or_indexed_slices(params, name="params")
    ids = ops.convert_to_tensor(ids, name="ids")
    if np == 1 and (not transform_fn or ids.get_shape().ndims == 1):
      with ops.colocate_with(params[0]):
        result = _clip(
            array_ops.gather(params[0], ids, name=name), ids, max_norm)
        if transform_fn:
          result = transform_fn(result)
      # Make sure the final result does not have colocation constraints on the
      # params. Similar to the case np > 1 where parallel_dynamic_stitch is
      # outside the scioe of all with ops.colocate_with(params[p]).
      return array_ops.identity(result)
    else:
      # Flatten the ids. There are two cases where we need to do this.
      # - There is more than one params tensor.
      # - There is a transform_fn and ids is not statically known to be 1-D.
      #   We must flatten in this case because transform_fn expects a flat
      #   tensor of embeddings.
      flat_ids = array_ops.reshape(ids, [-1])
      original_indices = math_ops.range(array_ops.size(flat_ids))

      # Create p_assignments and set new_ids depending on the strategy.
      if partition_strategy == "mod":
        p_assignments = flat_ids % np
        new_ids = flat_ids // np
      elif partition_strategy == "div":
        # Compute num_total_ids as the sum of dim-0 of params, then assign to
        # partitions based on a constant number of ids per partition. Optimize
        # if we already know the full shape statically.
        dim_0_size = tensor_shape.Dimension(
            tensor_shape.dimension_value(params[0].get_shape()[0]))
        for p in xrange(1, np):
          dim_0_size += tensor_shape.Dimension(
              tensor_shape.dimension_value(params[p].get_shape()[0]))
        if dim_0_size.value:
          num_total_ids = constant_op.constant(dim_0_size.value, flat_ids.dtype)
        else:
          dim_0_sizes = []
          for p in xrange(np):
            param_p_dim = tensor_shape.dimension_value(params[p].get_shape()[0])
            if param_p_dim is not None:
              dim_0_sizes.append(param_p_dim)
            else:
              with ops.colocate_with(params[p]):
                dim_0_sizes.append(array_ops.shape(params[p])[0])
          num_total_ids = math_ops.reduce_sum(
              math_ops.cast(array_ops.stack(dim_0_sizes), flat_ids.dtype))
        ids_per_partition = num_total_ids // np
        extras = num_total_ids % np

        p_assignments = math_ops.maximum(flat_ids // (ids_per_partition + 1),
                                         (flat_ids - extras) //
                                         ids_per_partition)

        # Emulate a conditional using a boolean indicator tensor
        new_ids = array_ops.where(p_assignments < extras,
                                  flat_ids % (ids_per_partition + 1),
                                  (flat_ids - extras) % ids_per_partition)
      else:
        raise ValueError("Unrecognized partition strategy: " +
                         partition_strategy)

      # Cast partition assignments to int32 for use in dynamic_partition.
      # There really should not be more than 2^32 partitions.
      p_assignments = math_ops.cast(p_assignments, dtypes.int32)
      # Partition list of ids based on assignments into np separate lists
      gather_ids = data_flow_ops.dynamic_partition(new_ids, p_assignments, np)
      # Similarly, partition the original indices.
      pindices = data_flow_ops.dynamic_partition(original_indices,
                                                 p_assignments, np)
      # Do np separate lookups, finding embeddings for plist[p] in params[p]
      partitioned_result = []
      for p in xrange(np):
        pids = gather_ids[p]
        with ops.device_v2(None):
          with ops.colocate_with(params[p]):
            result = array_ops.gather(params[p], pids)
            if transform_fn:
              # If transform_fn is provided, the clip_by_norm precedes
              # the transform and hence must be co-located. See below
              # for the counterpart if transform_fn is not provided.
              result = transform_fn(_clip(result, pids, max_norm))
        partitioned_result.append(result)
      # Stitch these back together
      ret = data_flow_ops.parallel_dynamic_stitch(
          pindices, partitioned_result, name=name)

      # Determine the static element shape.
      if transform_fn is None:
        element_shape_s = params[0].get_shape()[1:]
        for p in params[1:]:
          element_shape_s = element_shape_s.merge_with(p.get_shape()[1:])
      else:
        element_shape_s = ret.get_shape()[1:]

      # Compute the dynamic element shape.
      if element_shape_s.is_fully_defined():
        element_shape_d = element_shape_s
      elif transform_fn is None:
        # It's important that we compute params[0].shape on the right device
        # to avoid data motion.
        with ops.colocate_with(params[0]):
          params_shape = array_ops.shape(params[0])
        element_shape_d = params_shape[1:]
      else:
        element_shape_d = array_ops.shape(ret)[1:]

      # Reshape to reverse the flattening of ids.
      ret = array_ops.reshape(
          ret, array_ops.concat([array_ops.shape(ids), element_shape_d], 0))

      # Normally the reshape is sufficient, but setting shape explicitly
      # teaches shape inference that params[1:].get_shape() matters
      # (in the case that transform_fn is None).
      ret.set_shape(ids.get_shape().concatenate(element_shape_s))
      if not transform_fn:
        # If transform_fn was provided, the clip_by_norm was done above.
        ret = _clip(ret, ids, max_norm)
      return ret
Example #43
0
            def model_fn():
                if 'CPU' in compute_device:
                    tower_compute_device = '/device:CPU:0'
                else:
                    tower_compute_device = ('/device:GPU:%d' %
                                            distribution_strategy_context.
                                            get_tower_context().tower_id)
                tower_compute_device = device_util.canonicalize(
                    tower_compute_device)

                if 'CPU' in variable_device:
                    tower_variable_device = '/device:CPU:0'
                else:
                    tower_variable_device = ('/device:GPU:%d' %
                                             distribution_strategy_context.
                                             get_tower_context().tower_id)
                tower_variable_device = device_util.canonicalize(
                    tower_variable_device)

                a = constant_op.constant(1.0)
                b = constant_op.constant(2.0)
                c = a + b
                self.assertEqual(a.device, tower_compute_device)
                self.assertEqual(b.device, tower_compute_device)
                self.assertEqual(c.device, tower_compute_device)

                # The device scope is ignored for variables but not for normal ops.
                with ops.device('/device:GPU:2'):
                    x = variable_scope.get_variable('x', initializer=10.0)
                    x_add = x.assign_add(c)
                    e = a + c
                self.assertEqual(device_util.canonicalize(x.device),
                                 tower_variable_device)
                self.assertEqual(x_add.device, x.device)
                self.assertEqual(e.device,
                                 device_util.canonicalize('/device:GPU:2'))

                # The colocate_vars_with can override the distribution's device.
                with d.colocate_vars_with(x):
                    y = variable_scope.get_variable('y', initializer=20.0)
                y_add = y.assign_add(x_add)
                self.assertEqual(device_util.canonicalize(y.device),
                                 tower_variable_device)
                self.assertEqual(y_add.device, y.device)
                self.assertEqual(y.device, x.device)

                z = variable_scope.get_variable('z', initializer=10.0)
                self.assertEqual(device_util.canonicalize(z.device),
                                 tower_variable_device)

                with ops.control_dependencies([y_add]):
                    z_add = z.assign_add(y)
                with ops.control_dependencies([z_add]):
                    f = z + c
                self.assertEqual(f.device, tower_compute_device)

                # The device scope would merge with the default worker device.
                with ops.device('/CPU:1'):
                    g = e + 1.0
                self.assertEqual(g.device,
                                 device_util.canonicalize('/device:CPU:1'))

                # Ths ops.colocate_with will be ignored when defining a variale but not
                # for a normal tensor.
                with ops.colocate_with(x):
                    u = variable_scope.get_variable('u', initializer=30.0)
                    h = f + 1.0
                self.assertEqual(device_util.canonicalize(u.device),
                                 tower_variable_device)
                self.assertEqual(device_util.canonicalize(x.device), h.device)
                return y_add, z_add, f
 def _assign_singular_vector(self, variable, value):
     with K.name_scope('AssignSingularVector') as scope:
         with ops.colocate_with(variable):
             return state_ops.assign(variable, value, name=scope)
Example #45
0
 def fn():
   with ops.colocate_with(b.op):
     c = math_ops.add(a, a, name="c")
   return c
def gradients(ys,
              xs,
              grad_ys=None,
              name="gradients",
              colocate_gradients_with_ops=False,
              gate_gradients=False,
              aggregation_method=None,
              stop_gradients=None):
    """Constructs symbolic derivatives of sum of `ys` w.r.t. x in `xs`.

  `ys` and `xs` are each a `Tensor` or a list of tensors.  `grad_ys`
  is a list of `Tensor`, holding the gradients received by the
  `ys`. The list must be the same length as `ys`.

  `gradients()` adds ops to the graph to output the derivatives of `ys` with
  respect to `xs`.  It returns a list of `Tensor` of length `len(xs)` where
  each tensor is the `sum(dy/dx)` for y in `ys`.

  `grad_ys` is a list of tensors of the same length as `ys` that holds
  the initial gradients for each y in `ys`.  When `grad_ys` is None,
  we fill in a tensor of '1's of the shape of y for each y in `ys`.  A
  user can provide their own initial `grad_ys` to compute the
  derivatives using a different initial gradient for each y (e.g., if
  one wanted to weight the gradient differently for each value in
  each y).

  `stop_gradients` is a `Tensor` or a list of tensors to be considered constant
  with respect to all `xs`. These tensors will not be backpropagated through,
  as though they had been explicitly disconnected using `stop_gradient`.  Among
  other things, this allows computation of partial derivatives as opposed to
  total derivatives. For example:

  ```python
  a = tf.constant(0.)
  b = 2 * a
  g = tf.gradients(a + b, [a, b], stop_gradients=[a, b])
  ```

  Here the partial derivatives `g` evaluate to `[1.0, 1.0]`, compared to the
  total derivatives `tf.gradients(a + b, [a, b])`, which take into account the
  influence of `a` on `b` and evaluate to `[3.0, 1.0]`.  Note that the above is
  equivalent to:

  ```python
  a = tf.stop_gradient(tf.constant(0.))
  b = tf.stop_gradient(2 * a)
  g = tf.gradients(a + b, [a, b])
  ```

  `stop_gradients` provides a way of stopping gradient after the graph has
  already been constructed, as compared to `tf.stop_gradient` which is used
  during graph construction.  When the two approaches are combined,
  backpropagation stops at both `tf.stop_gradient` nodes and nodes in
  `stop_gradients`, whichever is encountered first.

  Args:
    ys: A `Tensor` or list of tensors to be differentiated.
    xs: A `Tensor` or list of tensors to be used for differentiation.
    grad_ys: Optional. A `Tensor` or list of tensors the same size as
      `ys` and holding the gradients computed for each y in `ys`.
    name: Optional name to use for grouping all the gradient ops together.
      defaults to 'gradients'.
    colocate_gradients_with_ops: If True, try colocating gradients with
      the corresponding op.
    gate_gradients: If True, add a tuple around the gradients returned
      for an operations.  This avoids some race conditions.
    aggregation_method: Specifies the method used to combine gradient terms.
      Accepted values are constants defined in the class `AggregationMethod`.
    stop_gradients: Optional. A `Tensor` or list of tensors not to differentiate
      through.

  Returns:
    A list of `sum(dy/dx)` for each x in `xs`.

  Raises:
    LookupError: if one of the operations between `x` and `y` does not
      have a registered gradient function.
    ValueError: if the arguments are invalid.
    RuntimeError: if called in Eager mode.

  """
    if context.in_eager_mode():
        raise RuntimeError("tf.gradients not supported in EAGER mode. Use "
                           "functions in tf.contrib.eager.backprop instead.")
    ys = _AsList(ys)
    xs = _AsList(xs)
    stop_gradients = [] if stop_gradients is None else _AsList(stop_gradients)
    if grad_ys is None:
        grad_ys = [None] * len(ys)
    else:
        grad_ys = _AsList(grad_ys)

    with ops.name_scope(
            name, "gradients",
            list(ys) + list(xs) + list(stop_gradients) +
            list(grad_ys)) as grad_scope:
        ys = ops.convert_n_to_tensor_or_indexed_slices(ys, name="y")
        xs = [
            x.handle
            if isinstance(x, resource_variable_ops.ResourceVariable) else x
            for x in xs
        ]
        xs = ops.internal_convert_n_to_tensor_or_indexed_slices(xs,
                                                                name="x",
                                                                as_ref=True)
        grad_ys = _DefaultGradYs(grad_ys, ys, colocate_gradients_with_ops)

        # The approach we take here is as follows: Create a list of all ops in the
        # subgraph between the ys and xs.  Visit these ops in reverse order of ids
        # to ensure that when we visit an op the gradients w.r.t its outputs have
        # been collected.  Then aggregate these gradients if needed, call the op's
        # gradient function, and add the generated gradients to the gradients for
        # its input.

        # Initialize the pending count for ops in the connected subgraph from ys
        # to the xs.
        if len(ys) > 1:
            ys = [array_ops.identity(y) if y.consumers() else y for y in ys]
        to_ops = [t.op for t in ys]
        from_ops = [t.op for t in xs]
        stop_gradient_ops = [t.op for t in stop_gradients]
        pending_count, loop_state = _PendingCount(ops.get_default_graph(),
                                                  to_ops, from_ops,
                                                  colocate_gradients_with_ops)

        # Iterate over the collected ops.
        #
        # grads: op => list of gradients received on each output endpoint of the
        # op.  The gradients for each endpoint are initially collected as a list.
        # When it is time to call the op's gradient function, for each endpoint we
        # aggregate the list of received gradients into a Add() Operation if there
        # is more than one.
        grads = {}

        # Add the initial gradients for the ys.
        for y, grad_y in zip(ys, grad_ys):
            _SetGrad(grads, y, grad_y)

        # Initialize queue with to_ops.
        queue = collections.deque()
        # Add the ops in 'to_ops' into the queue.
        to_ops_set = set()
        for op in to_ops:
            # 'ready' handles the case where one output gradient relies on
            # another output's gradient.
            # pylint: disable=protected-access
            ready = (pending_count[op._id] == 0)
            if ready and op._id not in to_ops_set:
                to_ops_set.add(op._id)
                queue.append(op)
            # pylint: enable=protected-access

        if loop_state:
            loop_exits = loop_state.ProcessUnusedLoopExits(
                pending_count, to_ops_set)
            for y in loop_exits:
                if _IsTrainable(y):
                    _SetGrad(grads, y, loop_state.ZerosLikeForExit(y))
                    queue.append(y.op)

        stop_ops = _StopOps(from_ops, stop_gradient_ops, pending_count)
        while queue:
            # generate gradient subgraph for op.
            op = queue.popleft()
            with _maybe_colocate_with(op, colocate_gradients_with_ops):
                if loop_state:
                    loop_state.EnterGradWhileContext(op, before=True)
                out_grads = _AggregatedGrads(grads, op, loop_state,
                                             aggregation_method)
                if loop_state:
                    loop_state.ExitGradWhileContext(op, before=True)

                grad_fn = None
                # pylint: disable=protected-access
                func_call = None
                is_func_call = ops.get_default_graph()._is_function(op.type)
                has_out_grads = any(
                    isinstance(g, ops.Tensor) or g for g in out_grads)
                if has_out_grads and (op._id not in stop_ops):
                    if is_func_call:
                        func_call = ops.get_default_graph()._get_function(
                            op.type)
                        grad_fn = func_call.python_grad_func
                        # pylint: enable=protected-access
                    else:
                        # A grad_fn must be defined, either as a function or as None
                        # for ops that do not have gradients.
                        try:
                            grad_fn = ops.get_gradient_function(op)
                        except LookupError:
                            raise LookupError(
                                "No gradient defined for operation '%s' (op type: %s)"
                                % (op.name, op.type))
                if loop_state:
                    loop_state.EnterGradWhileContext(op, before=False)
                if (grad_fn or is_func_call) and has_out_grads:
                    # NOTE: If _AggregatedGrads didn't compute a value for the i'th
                    # output, it means that the cost does not depend on output[i],
                    # therefore dC/doutput[i] is 0.
                    for i, out_grad in enumerate(out_grads):
                        if (not isinstance(out_grad, ops.Tensor)
                                and not out_grad) and (
                                    (not grad_fn and is_func_call)
                                    or _IsTrainable(op.outputs[i])):
                            # Only trainable outputs or outputs for a function call that
                            # will use SymbolicGradient get a zero gradient. Gradient
                            # functions should ignore the gradient for other outputs.
                            # TODO(apassos) gradients of resource handles might be an
                            # issue here because of zeros.
                            if loop_state:
                                out_grads[i] = loop_state.ZerosLike(op, i)
                            else:
                                out_grads[
                                    i] = control_flow_ops.ZerosLikeOutsideLoop(
                                        op, i)
                    with ops.name_scope(op.name + "_grad"):
                        # pylint: disable=protected-access
                        with ops.get_default_graph()._original_op(op):
                            # pylint: enable=protected-access
                            if grad_fn:
                                # If grad_fn was found, do not use SymbolicGradient even for
                                # functions.
                                in_grads = _MaybeCompile(
                                    grad_scope, op, func_call,
                                    lambda: grad_fn(op, *out_grads))
                            else:
                                # For function call ops, we add a 'SymbolicGradient'
                                # node to the graph to compute gradients.
                                in_grads = _MaybeCompile(
                                    grad_scope, op, func_call,
                                    lambda: _SymGrad(op, out_grads))
                            in_grads = _AsList(in_grads)
                            _VerifyGeneratedGradients(in_grads, op)
                            if gate_gradients and len(
                                [x for x in in_grads if x is not None]) > 1:
                                with ops.device(None):
                                    with ops.colocate_with(
                                            None, ignore_existing=True):
                                        in_grads = control_flow_ops.tuple(
                                            in_grads)
                    _LogOpGradients(op, out_grads, in_grads)
                else:
                    # If no grad_fn is defined or none of out_grads is available,
                    # just propagate a list of None backwards.
                    in_grads = [None] * len(op.inputs)
                for i, (t_in, in_grad) in enumerate(zip(op.inputs, in_grads)):
                    if in_grad is not None:
                        if (isinstance(in_grad, ops.Tensor)
                                and t_in.dtype != dtypes.resource):
                            try:
                                in_grad.set_shape(t_in.get_shape())
                            except ValueError:
                                raise ValueError(
                                    "Incompatible shapes between op input and calculated "
                                    "input gradient.  Forward operation: %s.  Input index: %d. "
                                    "Original input shape: %s.  "
                                    "Calculated input gradient shape: %s" %
                                    (op.name, i, t_in.shape, in_grad.shape))
                        _SetGrad(grads, t_in, in_grad)
                if loop_state:
                    loop_state.ExitGradWhileContext(op, before=False)

            # Update pending count for the inputs of op and enqueue ready ops.
            _UpdatePendingAndEnqueueReady(grads, op, queue, pending_count,
                                          loop_state)

    if loop_state:
        loop_state.PostProcessing()
    return [_GetGrad(grads, x) for x in xs]
Example #47
0
def _zero_debias(unbiased_var, value, decay):
  """Compute the delta required for a debiased Variable.

  All exponential moving averages initialized with Tensors are initialized to 0,
  and therefore are biased to 0. Variables initialized to 0 and used as EMAs are
  similarly biased. This function creates the debias updated amount according to
  a scale factor, as in https://arxiv.org/abs/1412.6980.

  To demonstrate the bias the results from 0-initialization, take an EMA that
  was initialized to `0` with decay `b`. After `t` timesteps of seeing the
  constant `c`, the variable have the following value:

  ```
    EMA = 0*b^(t) + c*(1 - b)*b^(t-1) + c*(1 - b)*b^(t-2) + ...
        = c*(1 - b^t)
  ```

  To have the true value `c`, we would divide by the scale factor `1 - b^t`.

  In order to perform debiasing, we use two shadow variables. One keeps track of
  the biased estimate, and the other keeps track of the number of updates that
  have occurred.

  Args:
    unbiased_var: A Variable representing the current value of the unbiased EMA.
    value: A Tensor representing the most recent value.
    decay: A Tensor representing `1-decay` for the EMA.

  Returns:
    The amount that the unbiased variable should be updated. Computing this
    tensor will also update the shadow variables appropriately.
  """
  with variable_scope.variable_scope(
      unbiased_var.op.name, values=[unbiased_var, value, decay]) as scope:
    with ops.colocate_with(unbiased_var):
      with ops.init_scope():
        biased_initializer = init_ops.zeros_initializer(
            dtype=unbiased_var.dtype)(unbiased_var.get_shape())
        local_step_initializer = init_ops.zeros_initializer()
      def _maybe_get_unique(name):
        """Get name for a unique variable, if not `reuse=True`."""
        if variable_scope.get_variable_scope().reuse:
          return name
        vs_vars = [x.op.name for x in
                   variable_scope.get_variable_scope().global_variables()]
        full_name = variable_scope.get_variable_scope().name + "/" + name
        if full_name not in vs_vars: return name
        idx = 1
        while full_name + ("_%d" % idx) in vs_vars:
          idx += 1
        return name + ("_%d" % idx)
      biased_var = variable_scope.get_variable(
          _maybe_get_unique("biased"), initializer=biased_initializer,
          trainable=False)
      local_step = variable_scope.get_variable(
          _maybe_get_unique("local_step"),
          shape=[],
          dtype=unbiased_var.dtype,
          initializer=local_step_initializer,
          trainable=False)

      # Get an update ops for both shadow variables.
      update_biased = state_ops.assign_sub(biased_var,
                                           (biased_var - value) * decay,
                                           name=scope.name)
      update_local_step = local_step.assign_add(1)

      # Compute the value of the delta to update the unbiased EMA. Make sure to
      # use the new values of the biased variable and the local step.
      with ops.control_dependencies([update_biased, update_local_step]):
        # This function gets `1 - decay`, so use `1.0 - decay` in the exponent.
        unbiased_ema_delta = (unbiased_var - biased_var.read_value() /
                              (1 - math_ops.pow(
                                  1.0 - decay, local_step.read_value())))

      return unbiased_ema_delta
Example #48
0
 def _parse_tensor_or_dict(self, features):
     if isinstance(features, dict):
         keys = sorted(features.keys())
         with ops.colocate_with(features[keys[0]]):
             features = array_ops.concat([features[k] for k in keys], 1)
     return features
Example #49
0
 def fn2():
   with ops.colocate_with(b.op):
     c = constant_op.constant(3.0)
     self.assertEqual([b"loc:@a", b"loc:@b"], c.op.colocation_groups())
     return c
Example #50
0
def _GatherV2Grad(op, grad):
    """Gradient for GatherV2 op."""
    # params can be large, so colocate the shape calculation with it.
    #
    # params can be very large for sparse model, array_ops.shape raises
    # exception on the Windows platform when any dimension is larger than
    # int32. params_shape is not used in optimizer apply_sparse gradients,
    # so it's fine to convert it back to int32 regardless of truncation.
    params = op.inputs[0]
    with ops.colocate_with(params):
        params_shape = array_ops.shape(params, out_type=ops.dtypes.int64)
        params_shape = math_ops.cast(params_shape, dtypes.int32)

    indices = op.inputs[1]
    indices_size = array_ops.expand_dims(array_ops.size(indices), 0)
    axis = op.inputs[2]
    axis_static = tensor_util.constant_value(axis)

    # For axis 0 gathers, build an appropriately shaped IndexedSlices.
    if axis_static == 0:
        if context.executing_eagerly():
            params_tail_shape = params_shape.cpu()[1:]
        else:
            params_tail_shape = params_shape[1:]
        values_shape = array_ops.concat([indices_size, params_tail_shape], 0)
        with warnings.catch_warnings():
            warnings.filterwarnings(
                "ignore",
                message="Converting sparse IndexedSlices to a dense Tensor.*")
            values = array_ops.reshape(grad, values_shape)
        indices = array_ops.reshape(indices, indices_size)
        return [ops.IndexedSlices(values, indices, params_shape), None, None]

    outer_shape = params_shape[:axis]
    outer_dims = array_ops.size(outer_shape)
    inner_shape = params_shape[axis:][1:]
    inner_dims = array_ops.size(inner_shape)

    outer_axes_indices = math_ops.range(outer_dims)
    inner_axes_indices = math_ops.range(outer_dims + 1,
                                        outer_dims + 1 + inner_dims)

    values_shape = array_ops.concat([outer_shape, indices_size, inner_shape],
                                    0)
    with warnings.catch_warnings():
        warnings.filterwarnings(
            "ignore",
            message="Converting sparse IndexedSlices to a dense Tensor.*")
        values = array_ops.reshape(grad, values_shape)
    indices = array_ops.reshape(indices, indices_size)

    # We need to sum up every slice `values[..., i, ....]` corresponding to
    # `params[..., indices[i], ...]`. Since `unsorted_segment_sum` does not
    # support an axis parameter, we transpose the gather dimension to the front,
    # then use `unsorted_segment_sum` to build a
    # [gather_axis, outer_axes, inner_axes] tensor with all the gradients
    # affecting each index in `gather_axis` summed up.
    transpose_dims = array_ops.concat(
        [[outer_dims], outer_axes_indices, inner_axes_indices], 0)
    values_transpose = array_ops.transpose(values, transpose_dims)
    num_segments = params_shape[axis]

    params_grad = math_ops.unsorted_segment_sum(values_transpose, indices,
                                                num_segments)

    # Inverts the above transpose by moving dimension 0 back to its original
    # position.
    invert_transpose_dims = array_ops.concat(
        [outer_axes_indices + 1, [0], inner_axes_indices], 0)
    params_grad = array_ops.transpose(params_grad, invert_transpose_dims)
    return [params_grad, None, None]
Example #51
0
 def restore(self, restored_tensors, unused_restored_shapes):
     # pylint: disable=protected-access
     with ops.colocate_with(self.op._table_ref):
         return gen_lookup_ops._lookup_table_import_v2(
             self.op._table_ref, restored_tensors[0],
             restored_tensors[1])
Example #52
0
            def model_fn():
                if num_gpus == 0:
                    last_part_device = 'device:CPU:0'
                else:
                    replica_id = _get_replica_id_integer()
                    last_part_device = ('device:GPU:%d' % replica_id)

                a = constant_op.constant(1.0)
                b = constant_op.constant(2.0)
                c = a + b
                self.assertEqual(a.device,
                                 worker_device + '/' + last_part_device)
                self.assertEqual(b.device,
                                 worker_device + '/' + last_part_device)
                self.assertEqual(c.device,
                                 worker_device + '/' + last_part_device)

                # The device scope is ignored for variables but not for normal ops.
                with ops.device('/job:worker/task:0'):
                    x = variable_scope.get_variable(
                        'x',
                        initializer=10.0,
                        aggregation=variable_scope.VariableAggregation.SUM)
                    x_add = x.assign_add(c)
                    e = a + c
                # The variable x is on the task 1 since the device_function has been
                # called once before the model_fn.
                self.assertEqual(x.device, '/job:ps/task:1')
                self.assertEqual(x_add.device, x.device)
                self.assertEqual(
                    e.device,
                    '/job:worker/replica:0/task:0/%s' % last_part_device)

                # The colocate_vars_with can override the distribution's device.
                with d.extended.colocate_vars_with(x):
                    y = variable_scope.get_variable(
                        'y',
                        initializer=20.0,
                        aggregation=variable_scope.VariableAggregation.SUM)
                # We add an identity here to avoid complaints about summing
                # non-distributed values.
                y_add = y.assign_add(array_ops.identity(x_add))
                self.assertEqual(y.device, '/job:ps/task:1')
                self.assertEqual(y_add.device, y.device)
                self.assertEqual(y.device, x.device)

                z = variable_scope.get_variable(
                    'z',
                    initializer=10.0,
                    aggregation=variable_scope.VariableAggregation.SUM)
                self.assertEqual(z.device, '/job:ps/task:0')
                self.assertNotEqual(z.device, x.device)

                with ops.control_dependencies([y_add]):
                    # We add an identity here to avoid complaints about summing
                    # non-distributed values.
                    z_add = z.assign_add(array_ops.identity(y))
                with ops.control_dependencies([z_add]):
                    f = z + c
                self.assertEqual(f.device,
                                 worker_device + '/' + last_part_device)

                # The device scope would merge with the default worker device.
                with ops.device('/CPU:1'):
                    g = e + 1.0
                self.assertEqual(g.device, worker_device + '/device:CPU:1')

                # Ths ops.colocate_with will be ignored when defining a variale but not
                # for a normal tensor.
                with ops.colocate_with(x):
                    u = variable_scope.get_variable('u', initializer=30.0)
                    v = variable_scope.get_variable('v', initializer=30.0)
                    h = f + 1.0
                self.assertIn('/job:ps/', u.device)
                self.assertIn('/job:ps/', v.device)
                # u and v are on different parameter servers.
                self.assertTrue(u.device != x.device or v.device != x.device)
                self.assertTrue(u.device == x.device or v.device == x.device)
                # Here h is not on one worker. Note h.device is canonical while x.device
                # is not but.
                self.assertIn('/job:ps/', h.device)
                return y_add, z_add, f
Example #53
0
    def minimize(self, global_step=None, name=None):
        """Add operations to train a linear model by minimizing the loss function.

    Args:
      global_step: Optional `Variable` to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.

    Returns:
      An Operation that updates the variables passed in the constructor.
    """
        # Technically, the op depends on a lot more than the variables,
        # but we'll keep the list short.
        with name_scope(name, 'sdca/minimize'):
            sparse_example_indices = []
            sparse_feature_indices = []
            sparse_features_values = []
            for sf in self._examples['sparse_features']:
                sparse_example_indices.append(sf.example_indices)
                sparse_feature_indices.append(sf.feature_indices)
                # If feature values are missing, sdca assumes a value of 1.0f.
                if sf.feature_values is not None:
                    sparse_features_values.append(sf.feature_values)

            # pylint: disable=protected-access
            example_ids_hashed = gen_sdca_ops.sdca_fprint(
                internal_convert_to_tensor(self._examples['example_ids']))
            # pylint: enable=protected-access
            example_state_data = self._hashtable.lookup(example_ids_hashed)
            # Solver returns example_state_update, new delta sparse_feature_weights
            # and delta dense_feature_weights.

            sparse_weights = []
            sparse_indices = []
            # If we have partitioned variables, keep a few dictionaries of Tensors
            # around that we need for the assign_add after the op call to
            # gen_sdca_ops.sdca_optimizer().  These are keyed because we may have a
            # mix of partitioned and un-partitioned variables.
            num_partitions_by_var = {}
            p_assignments_by_var = {}
            gather_ids_by_var = {}
            for v_num, (w, i) in enumerate(
                    zip(self._slots['unshrinked_sparse_features_weights'],
                        sparse_feature_indices)):
                # Append the sparse_indices (in full-variable space).
                sparse_idx = math_ops.cast(
                    array_ops.unique(math_ops.cast(i, dtypes.int32))[0],
                    dtypes.int64)
                sparse_indices.append(sparse_idx)
                if isinstance(w, list) or isinstance(
                        w, var_ops.PartitionedVariable):
                    num_partitions = len(w)
                    flat_ids = array_ops.reshape(sparse_idx, [-1])
                    # We use div partitioning, which is easiest to support downstream.
                    # Compute num_total_ids as the sum of dim-0 of w, then assign
                    # to partitions based on a constant number of ids per partition.
                    # Optimize if we already know the full shape statically.
                    dim_0_size = self._get_first_dimension_size_statically(
                        w, num_partitions)

                    if dim_0_size.value:
                        num_total_ids = constant_op.constant(
                            dim_0_size.value, flat_ids.dtype)
                    else:
                        dim_0_sizes = []
                        for p in range(num_partitions):
                            if w[p].get_shape()[0].value is not None:
                                dim_0_sizes.append(w[p].get_shape()[0].value)
                            else:
                                with ops.colocate_with(w[p]):
                                    dim_0_sizes.append(
                                        array_ops.shape(w[p])[0])
                        num_total_ids = math_ops.reduce_sum(
                            math_ops.cast(array_ops.stack(dim_0_sizes),
                                          flat_ids.dtype))
                    ids_per_partition = num_total_ids // num_partitions
                    extras = num_total_ids % num_partitions

                    p_assignments = math_ops.maximum(
                        flat_ids // (ids_per_partition + 1),
                        (flat_ids - extras) // ids_per_partition)

                    # Emulate a conditional using a boolean indicator tensor
                    new_ids = array_ops.where(
                        p_assignments < extras,
                        flat_ids % (ids_per_partition + 1),
                        (flat_ids - extras) % ids_per_partition)

                    # Cast partition assignments to int32 for use in dynamic_partition.
                    # There really should not be more than 2^32 partitions.
                    p_assignments = math_ops.cast(p_assignments, dtypes.int32)
                    # Partition list of ids based on assignments into num_partitions
                    # separate lists.
                    gather_ids = data_flow_ops.dynamic_partition(
                        new_ids, p_assignments, num_partitions)
                    # Add these into the dictionaries for use in the later update.
                    num_partitions_by_var[v_num] = num_partitions
                    p_assignments_by_var[v_num] = p_assignments
                    gather_ids_by_var[v_num] = gather_ids

                    # Gather the weights from each partition.
                    partition_gathered_weights = []
                    for p in range(num_partitions):
                        with ops.colocate_with(w[p]):
                            partition_gathered_weights.append(
                                array_ops.gather(w[p], gather_ids[p]))

                    # Stitch the weights back together in the same order they were before
                    # we dynamic_partitioned them.
                    condition_indices = data_flow_ops.dynamic_partition(
                        math_ops.range(array_ops.shape(new_ids)[0]),
                        p_assignments, num_partitions)
                    batch_gathered_weights = data_flow_ops.dynamic_stitch(
                        condition_indices, partition_gathered_weights)
                else:
                    w_as_tensor = internal_convert_to_tensor(w)
                    with ops.device(w_as_tensor.device):
                        batch_gathered_weights = array_ops.gather(
                            w_as_tensor, sparse_idx)
                sparse_weights.append(batch_gathered_weights)

            # pylint: disable=protected-access
            if compat.forward_compatible(year=2018, month=10, day=30):
                esu, sfw, dfw = gen_sdca_ops.sdca_optimizer_v2(
                    sparse_example_indices,
                    sparse_feature_indices,
                    sparse_features_values,
                    self._convert_n_to_tensor(
                        self._examples['dense_features']),
                    internal_convert_to_tensor(
                        self._examples['example_weights']),
                    internal_convert_to_tensor(
                        self._examples['example_labels']),
                    sparse_indices,
                    sparse_weights,
                    self._convert_n_to_tensor(
                        self._slots['unshrinked_dense_features_weights']),
                    example_state_data,
                    loss_type=self._options['loss_type'],
                    l1=self._options['symmetric_l1_regularization'],
                    l2=self._symmetric_l2_regularization(),
                    num_loss_partitions=self._num_loss_partitions(),
                    num_inner_iterations=1,
                    adaptive=self._adaptive())
            else:
                esu, sfw, dfw = gen_sdca_ops.sdca_optimizer(
                    sparse_example_indices,
                    sparse_feature_indices,
                    sparse_features_values,
                    self._convert_n_to_tensor(
                        self._examples['dense_features']),
                    internal_convert_to_tensor(
                        self._examples['example_weights']),
                    internal_convert_to_tensor(
                        self._examples['example_labels']),
                    sparse_indices,
                    sparse_weights,
                    self._convert_n_to_tensor(
                        self._slots['unshrinked_dense_features_weights']),
                    example_state_data,
                    loss_type=self._options['loss_type'],
                    l1=self._options['symmetric_l1_regularization'],
                    l2=self._symmetric_l2_regularization(),
                    num_loss_partitions=self._num_loss_partitions(),
                    num_inner_iterations=1,
                    adaptative=self._adaptive())
            # pylint: enable=protected-access

            with ops.control_dependencies([esu]):
                update_ops = [self._hashtable.insert(example_ids_hashed, esu)]
                # Update the weights before the proximal step.
                for v_num, (w, i, u) in enumerate(
                        zip(self._slots['unshrinked_sparse_features_weights'],
                            sparse_indices, sfw)):
                    if (isinstance(w, var_ops.PartitionedVariable)
                            or isinstance(w, list)):
                        update_ops += self._get_partitioned_update_ops(
                            v_num, num_partitions_by_var, p_assignments_by_var,
                            gather_ids_by_var, w, u, p_assignments,
                            num_partitions)
                    else:
                        update_ops.append(state_ops.scatter_add(w, i, u))
                for w, u in zip(
                        self._slots['unshrinked_dense_features_weights'], dfw):
                    if (isinstance(w, var_ops.PartitionedVariable)
                            or isinstance(w, list)):
                        split_updates = array_ops.split(
                            u,
                            num_or_size_splits=[
                                v.shape.as_list()[0] for v in w
                            ])
                        for v, split_update in zip(w, split_updates):
                            update_ops.append(
                                state_ops.assign_add(v, split_update))
                    else:
                        update_ops.append(state_ops.assign_add(w, u))
            if not global_step:
                return control_flow_ops.group(*update_ops)
            with ops.control_dependencies(update_ops):
                return state_ops.assign_add(global_step, 1, name=name).op
Example #54
0
            def model_fn():
                if 'CPU' in compute_device:
                    replica_compute_device = '/device:CPU:0'
                else:
                    replica_id = _get_replica_id_integer()
                    replica_compute_device = ('/device:GPU:%d' % replica_id)
                replica_compute_device = device_util.canonicalize(
                    replica_compute_device)

                if 'CPU' in variable_device:
                    replica_variable_device = '/device:CPU:0'
                else:
                    replica_id = _get_replica_id_integer()
                    replica_variable_device = ('/device:GPU:%d' % replica_id)
                replica_variable_device = device_util.canonicalize(
                    replica_variable_device)

                a = constant_op.constant(1.0)
                b = constant_op.constant(2.0)
                c = a + b
                self.assertEqual(a.device, replica_compute_device)
                self.assertEqual(b.device, replica_compute_device)
                self.assertEqual(c.device, replica_compute_device)

                # The device scope is ignored for variables but not for normal ops.
                with ops.device('/device:GPU:2'):
                    x = variable_scope.get_variable(
                        'x',
                        initializer=10.0,
                        aggregation=variable_scope.VariableAggregation.SUM)
                    x_add = x.assign_add(c)
                    e = a + c
                self.assertEqual(device_util.canonicalize(x.device),
                                 replica_variable_device)
                self.assertEqual(x_add.device, x.device)
                self.assertEqual(e.device,
                                 device_util.canonicalize('/device:GPU:2'))

                # The colocate_vars_with can override the distribution's device.
                with d.extended.colocate_vars_with(x):
                    y = variable_scope.get_variable(
                        'y',
                        initializer=20.0,
                        aggregation=variable_scope.VariableAggregation.SUM)
                # We add an identity here to avoid complaints about summing
                # non-distributed values.
                y_add = y.assign_add(array_ops.identity(x_add))
                self.assertEqual(device_util.canonicalize(y.device),
                                 replica_variable_device)
                self.assertEqual(y_add.device, y.device)
                self.assertEqual(y.device, x.device)

                z = variable_scope.get_variable(
                    'z',
                    initializer=10.0,
                    aggregation=variable_scope.VariableAggregation.SUM)
                self.assertEqual(device_util.canonicalize(z.device),
                                 replica_variable_device)

                with ops.control_dependencies([y_add]):
                    # We add an identity here to avoid complaints about summing
                    # non-distributed values.
                    z_add = z.assign_add(array_ops.identity(y))
                with ops.control_dependencies([z_add]):
                    f = z + c
                self.assertEqual(f.device, replica_compute_device)

                # The device scope would merge with the default worker device.
                with ops.device('/CPU:1'):
                    g = e + 1.0
                self.assertEqual(g.device,
                                 device_util.canonicalize('/device:CPU:1'))

                # Ths ops.colocate_with will be ignored when defining a variale but not
                # for a normal tensor.
                with ops.colocate_with(x):
                    u = variable_scope.get_variable('u', initializer=30.0)
                    h = f + 1.0
                self.assertEqual(device_util.canonicalize(u.device),
                                 replica_variable_device)
                self.assertEqual(device_util.canonicalize(x.device),
                                 device_util.canonicalize(h.device))
                return y_add, z_add, f
    def apply_gradients(self, grads_and_vars, global_step=None, name=None):
        """Apply gradients to variables.

    This contains most of the synchronization implementation and also wraps the
    apply_gradients() from the real optimizer.

    Args:
      grads_and_vars: List of (gradient, variable) pairs as returned by
        compute_gradients().
      global_step: Optional Variable to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.  Default to the
        name passed to the Optimizer constructor.

    Returns:
      train_op: The op to dequeue a token so the replicas can exit this batch
      and start the next one. This is executed by each replica.

    Raises:
      ValueError: If the grads_and_vars is empty.
      ValueError: If global step is not provided, the staleness cannot be
        checked.
    """
        if not grads_and_vars:
            raise ValueError("Must supply at least one variable")

        if global_step is None:
            raise ValueError("Global step is required to check staleness")

        self._global_step = global_step
        train_ops = []
        aggregated_grad = []
        var_list = []

        # local_anchor op will be placed on this worker task by default.
        local_anchor = control_flow_ops.no_op()
        # Colocating local_step variable prevents it being placed on the PS.
        with ops.colocate_with(local_anchor):
            self._local_step = variable_scope.variable(
                initial_value=0,
                trainable=False,
                collections=[ops.GraphKeys.LOCAL_VARIABLES],
                dtype=global_step.dtype.base_dtype,
                name="sync_rep_local_step")

        self.local_step_init_op = state_ops.assign(self._local_step,
                                                   global_step)
        chief_init_ops = [self.local_step_init_op]
        self.ready_for_local_init_op = variables.report_uninitialized_variables(
            variables.global_variables())

        with ops.name_scope(None, self._name):
            for grad, var in grads_and_vars:
                var_list.append(var)
                with ops.device(var.device):
                    # Dense gradients.
                    if grad is None:
                        aggregated_grad.append(None)  # pass-through.
                        continue
                    elif isinstance(grad, ops.Tensor):
                        grad_accum = data_flow_ops.ConditionalAccumulator(
                            grad.dtype,
                            shape=var.get_shape(),
                            shared_name=var.name + "/grad_accum")
                        train_ops.append(
                            grad_accum.apply_grad(grad,
                                                  local_step=self._local_step))
                        aggregated_grad.append(
                            grad_accum.take_grad(self._replicas_to_aggregate))
                    else:
                        if not isinstance(grad, ops.IndexedSlices):
                            raise ValueError("Unknown grad type!")
                        grad_accum = data_flow_ops.SparseConditionalAccumulator(
                            grad.dtype,
                            shape=(),
                            shared_name=var.name + "/grad_accum")
                        train_ops.append(
                            grad_accum.apply_indexed_slices_grad(
                                grad, local_step=self._local_step))
                        aggregated_grad.append(
                            grad_accum.take_indexed_slices_grad(
                                self._replicas_to_aggregate))

                    self._accumulator_list.append((grad_accum, var.device))

            aggregated_grads_and_vars = zip(aggregated_grad, var_list)

            # sync_op will be assigned to the same device as the global step.
            with ops.device(global_step.device), ops.name_scope(""):
                update_op = self._opt.apply_gradients(
                    aggregated_grads_and_vars, global_step)

            # Create token queue.
            with ops.device(global_step.device), ops.name_scope(""):
                sync_token_queue = (data_flow_ops.FIFOQueue(
                    -1,
                    global_step.dtype.base_dtype,
                    shapes=(),
                    name="sync_token_q",
                    shared_name="sync_token_q"))
                self._sync_token_queue = sync_token_queue

                # dummy_queue is passed to the queue runner. Don't use the real queues
                # because the queue runner doesn't automatically reopen it once it
                # closed queues in PS devices.
                dummy_queue = (data_flow_ops.FIFOQueue(
                    1,
                    types_pb2.DT_INT32,
                    shapes=(),
                    name="dummy_queue",
                    shared_name="dummy_queue"))

            with ops.device(global_step.device), ops.name_scope(""):
                # Replicas have to wait until they can get a token from the token queue.
                with ops.control_dependencies(train_ops):
                    token = sync_token_queue.dequeue()
                train_op = state_ops.assign(self._local_step, token)

                with ops.control_dependencies([update_op]):
                    # Sync_op needs to insert tokens to the token queue at the end of the
                    # step so the replicas can fetch them to start the next step.
                    tokens = array_ops.fill([self._tokens_per_step],
                                            global_step)
                    sync_op = sync_token_queue.enqueue_many((tokens, ))

                if self._variable_averages is not None:
                    with ops.control_dependencies([sync_op
                                                   ]), ops.name_scope(""):
                        sync_op = self._variable_averages.apply(
                            self._variables_to_average)

                self._chief_queue_runner = queue_runner.QueueRunner(
                    dummy_queue, [sync_op])
            for accum, dev in self._accumulator_list:
                with ops.device(dev):
                    chief_init_ops.append(
                        accum.set_global_step(global_step,
                                              name="SetGlobalStep"))
            self.chief_init_op = control_flow_ops.group(*(chief_init_ops))
            self._gradients_applied = True
            return train_op
Example #56
0
    def __init__(
            self,  # pylint: disable=super-init-not-called
            initial_value=None,
            trainable=None,
            caching_device=None,
            name=None,
            dtype=None,
            constraint=None,
            add_initializers_to=None,
            lifted_initializer_graph=None,
            **unused_kwargs):
        """Creates a variable.

    Args:
      initial_value: A `Tensor`, or Python object convertible to a `Tensor`,
        which is the initial value for the Variable. The initial value must have
        a shape specified unless `validate_shape` is set to False. Can also be a
        callable with no argument that returns the initial value when called.
        (Note that initializer functions from init_ops.py must first be bound
         to a shape before being used here.)
      trainable: If `True`, GradientTapes automatically watch uses of this
        Variable.
      caching_device: Optional device string or function describing where the
        Variable should be cached for reading.  Defaults to the Variable's
        device.  If not `None`, caches on another device.  Typical use is to
        cache on the device where the Ops using the Variable reside, to
        deduplicate copying through `Switch` and other conditional statements.
      name: Optional name for the variable. Defaults to `'Variable'` and gets
        uniquified automatically.
      dtype: If set, initial_value will be converted to the given type.
        If None, either the datatype will be kept (if initial_value is
       a Tensor) or float32 will be used (if it is a Python object convertible
       to a Tensor).
      constraint: An optional projection function to be applied to the variable
        after being updated by an `Optimizer` (e.g. used to implement norm
        constraints or value constraints for layer weights). The function must
        take as input the unprojected Tensor representing the value of the
        variable and return the Tensor for the projected value
        (which must have the same shape). Constraints are not safe to
        use when doing asynchronous distributed training.
      add_initializers_to: if not None and not in legacy graph mode, the
        initializer tensor will be added to this map in addition to adding the
        assignment to the function.
      lifted_initializer_graph: FuncGraph to try to lift initializers to.

    Raises:
      ValueError: If the initial value is not specified, or does not have a
        shape and `validate_shape` is `True`.
      RuntimeError: If called outside of a function definition.
    """
        if context.executing_eagerly():
            # If we've been init_scope()d out of the function definition nothing to do
            # here; we can't really do the capturing or conditional logic.
            resource_variable_ops.ResourceVariable.__init__(
                self,
                initial_value=initial_value,
                trainable=trainable,
                caching_device=caching_device,
                name=name,
                dtype=dtype,
                constraint=constraint)
            return
        with ops.init_scope():
            self._in_graph_mode = not context.executing_eagerly()
        if initial_value is None:
            raise ValueError("initial_value must be specified.")
        init_from_fn = callable(initial_value)

        if constraint is not None and not callable(constraint):
            raise ValueError("The `constraint` argument must be a callable.")

        if isinstance(initial_value, checkpointable.CheckpointInitialValue):
            self._maybe_initialize_checkpointable()
            self._update_uid = initial_value.checkpoint_position.restore_uid
            initial_value = initial_value.wrapped_value

        if trainable is None:
            trainable = True
        self._trainable = trainable
        self._save_slice_info = None
        self._initial_value = None
        self._initializer_op = None
        self._is_initialized_op = None
        self._graph_element = None
        self._cached_value = None
        # Store the graph key so optimizers know how to only retrieve variables from
        # this graph. Guaranteed to be the same as the eager graph_key.
        self._graph_key = ops.get_default_graph()._graph_key  # pylint: disable=protected-access
        with ops.name_scope(name, "Variable",
                            [] if init_from_fn else [initial_value]) as name:
            # pylint: disable=protected-access
            with ops.init_scope():
                handle_name = ops._name_from_scope_name(name)
                unique_id = "%s_%d" % (handle_name, ops.uid())
                shared_name = context.shared_name(unique_id)
            with ops.name_scope("Initializer"), ops.device(None):
                initial_value = ops.convert_to_tensor(
                    initial_value() if init_from_fn else initial_value,
                    name="initial_value",
                    dtype=dtype)
            with ops.init_scope():
                self._handle = resource_variable_ops.eager_safe_variable_handle(
                    initial_value=initial_value,
                    shared_name=shared_name,
                    name=name,
                    graph_mode=self._in_graph_mode)
            self._shape = initial_value.shape
            self._unique_id = unique_id
            self._handle_name = handle_name + ":0"
            self._dtype = initial_value.dtype.base_dtype
            self._constraint = constraint
            assert initial_value is not None
            if self._in_graph_mode:
                with ops.init_scope():
                    outer_graph = ops.get_default_graph()
                lifted_initializer = lift_to_graph.lift_to_graph(
                    initial_value, outer_graph)[initial_value]
                with ops.init_scope():
                    self._initial_value = lifted_initializer
                    with ops.name_scope("IsInitialized"):
                        self._is_initialized_op = (
                            resource_variable_ops.var_is_initialized_op(
                                self._handle))
                    if initial_value is not None:
                        with ops.name_scope("Assign") as n, ops.colocate_with(
                                self._handle):
                            self._initializer_op = resource_variable_ops.assign_variable_op(
                                self._handle, lifted_initializer, name=n)
                    with ops.name_scope("Read"), ops.colocate_with(
                            self._handle):
                        # Manually assign reads to the handle's device to avoid log
                        # messages.
                        with ops.device(self._handle.device):
                            value = self._read_variable_op()
                        self._graph_element = value
                    ops.add_to_collection(ops.GraphKeys.GLOBAL_VARIABLES, self)
            else:
                if add_initializers_to is not None:
                    add_initializers_to[self] = initial_value

                def assign_fn():
                    with ops.name_scope("Assign") as n, ops.colocate_with(
                            self._handle):
                        resource_variable_ops.assign_variable_op(self._handle,
                                                                 initial_value,
                                                                 name=n)
                        # Returning values to keep tf.cond happy.
                    return ops.convert_to_tensor(1)

                def not_assign_fn():
                    return ops.convert_to_tensor(0)

                # Note: this cond is always guaranteed to run because we're inside a
                # defun which will insert automatic control dependencies.
                control_flow_ops.cond(
                    resource_variable_ops.var_is_initialized_op(self._handle),
                    not_assign_fn, assign_fn)

        # After the handle has been created, set up a way to clean it up when
        # executing eagerly. We'll hold the only reference to the deleter, so that
        # when this object is garbage collected the deleter will be too. This
        # means ResourceVariables can be part of reference cycles without those
        # cycles being uncollectable.
        if not self._in_graph_mode:
            self._handle_deleter = resource_variable_ops.EagerResourceDeleter(
                handle=self._handle, handle_device=self._handle.device)
        self._cached_shape_as_list = None
Example #57
0
  def _create_variable(self, next_creator, **kwargs):
    """Implements StrategyExtendedV2._create_variable.

    Creates a `Variable` or a `ShardedVariable`. A `ShardedVariable` will be
    created if satisfying all the following criteria:
      1. `self._variable_partitioner` results in more than one partition on the
         first axis.
      2. variable's rank is greater than 0.
      3. variable is not colocated with another variable.
    Otherwise a `Variable` will be created.

    Args:
      next_creator: See `variable_scope.variable_creator_scope`; the next
        creator in the chain.
      **kwargs: Passed through to the next creator.

    Returns:
      A `Variable` or `ShardedVariable`.
    """

    var_creator = self._create_var_creator(next_creator, **kwargs)
    if "colocate_with" in kwargs:  # Never partition colocated_with variables.
      colocate_with = kwargs["colocate_with"]
      # Clear the variable scope to avoid possible conflicts between device
      # scope and colocation scope.
      with ops.device(None):
        with ops.colocate_with(colocate_with):
          var = var_creator(**kwargs)
          logging.debug(
              "Creating variable (name:%s, shape:%r) that colocates with %s",
              var.name, var.shape, kwargs["colocate_with"].name)
          return var

    if self._variable_partitioner is None:
      return self._create_variable_round_robin(var_creator, **kwargs)

    name = kwargs.get("name", None)
    dtype = kwargs.get("dtype", None)
    shape = kwargs.get("shape", None)
    initial_value = kwargs.get("initial_value", None)
    if initial_value is None:
      # If we are loading, next_creator will return an UninitializedVariable
      v = next_creator(**kwargs)
      if not isinstance(v, resource_variable_ops.UninitializedVariable):
        raise ValueError(
            "It looks like you are using `ParameterServerStrategy` with a "
            "`variable_partitioner`, and trying to create a variable without "
            "specifying `initial_value`. This is not allowed. Please specify the "
            "`initial_value`.")
      elif shape is None or dtype is None:
        raise ValueError(
            "It looks like you are trying to load a `SavedModel` using "
            "`tf.saved_model.load` within a `ParameterServerStrategy` scope, "
            "but the `SavedModel` is missing shape or dtype information.")
      else:
        def initializer(shape, dtype, **kwargs):
          if "partition_shape" in kwargs:
            shape = kwargs["partition_shape"]
          return array_ops.zeros(shape, dtype)
        initial_value = functools.partial(initializer, shape=shape, dtype=dtype)

    # Two cases where initial_value can be a callable:
    #   1. initial_value is passed as a callable, e.g, an `initializer` class.
    #   2. restoring from checkpoint, initial_value is a
    #     "CheckpointInitialValueCallable".
    init_from_fn = callable(initial_value)

    if init_from_fn and (shape is None or dtype is None):
      init_from_fn = False
      initial_value = initial_value()
    if not init_from_fn:
      # The initial_value is created on coordinator, it will need to be sent to
      # ps for variable initialization, which can be inefficient and can
      # potentially hit the 2GB limit on protobuf serialization.
      initial_value = ops.convert_to_tensor(initial_value, dtype=dtype)
      dtype = initial_value.dtype
      shape = initial_value.shape
    else:
      shape = tensor_shape.as_shape(shape)

    if shape.rank == 0:  # Skip partitioning rank-0 variable.
      return self._create_variable_round_robin(var_creator, **kwargs)

    num_partitions = self._variable_partitioner(shape=shape, dtype=dtype)
    if not num_partitions or num_partitions[0] == 0 or any(
        v != 1 for v in num_partitions[1:]):
      raise ValueError(
          "variable_partitioner must return a list/tuple whose elements are 1"
          " besides the first element (non-zero), got: %r" % num_partitions)

    if num_partitions[0] == 1:  # no partition
      return self._create_variable_round_robin(var_creator, **kwargs)

    # Use "div" partition strategy to partition the variable.
    num_partitions = min(num_partitions[0], shape[0])
    base = shape[0] // num_partitions
    extra = shape[0] % num_partitions
    # An example: num_partitions=4, shape[0]=10, partitions: [3, 3, 2, 2]
    # offsets: [0, 3, 6, 8, 10]
    offsets = []
    for i in range(num_partitions):
      if i == 0:
        offsets.append(0)
      else:
        prev_shard_size = base + (1 if i - 1 < extra else 0)
        offsets.append(offsets[i - 1] + prev_shard_size)
    offsets.append(shape[0])

    def init_shard_fn(shard_index):
      if not init_from_fn:
        logging.log_if(
            logging.WARN, _INEFFICIENT_INIT_WARNING % name, shard_index == 0 and
            shape.num_elements() > _LARGE_VARIABLE_NUM_ELEMENTS)
        return initial_value[offsets[shard_index]:offsets[shard_index + 1]]
      partition_shape = (offsets[shard_index + 1] -
                         offsets[shard_index],) + shape[1:]
      partition_offset = (offsets[shard_index],) + (0,) * len(shape[1:])
      arg_spec = tf_inspect.getfullargspec(initial_value)
      if ("shard_info" not in arg_spec.args and
          "shard_info" not in arg_spec.kwonlyargs):
        try:
          value = initial_value(
              partition_shape=partition_shape,
              partition_offset=partition_offset)
        except (TypeError, ValueError):
          # TypeError: Initializer doesn't accept kwargs
          # ValueError: Initializer doesn't accept partition kwargs
          # In both cases we go ahead creating the full value and then slice.
          value = initial_value()

        if value.shape == partition_shape:
          # Initializer supports partition: value is the partition value.
          return value
        else:
          # Initializer doesn't support partition: value is the full value
          # and needs to be sliced to get the partition value.
          logging.log_if(
              logging.WARN, _INEFFICIENT_INIT_WARNING % name,
              shard_index == 0 and
              shape.num_elements() > _LARGE_VARIABLE_NUM_ELEMENTS)
          return value[offsets[shard_index]:offsets[shard_index + 1]]
      else:
        # For compatibility with `CheckpointInitialValueCallable`.
        return initial_value(
            shard_info=trackable.ShardInfo(
                shape=tensor_shape.as_shape(partition_shape),
                offset=partition_offset))

    var_list = []
    for i in range(num_partitions):
      kwargs["shape"] = (offsets[i + 1] - offsets[i],) + shape[1:]
      kwargs["initial_value"] = lambda: init_shard_fn(i)
      if name is not None:
        kwargs["name"] = "{}/part_{}".format(name, i)
      var_list.append(self._create_variable_round_robin(var_creator, **kwargs))

    result = sharded_variable.ShardedVariable(var_list)
    return result
def _stitch(values, indices):
  if len(values) == 1:
    return values[0]
  with ops.colocate_with(indices[0], ignore_existing=True):
    all_values = data_flow_ops.dynamic_stitch(indices, values)
  return all_values
  def __init__(self,
               initial_value=None,
               trainable=None,
               caching_device=None,
               name=None,
               dtype=None,
               constraint=None,
               add_initializers_to=None,
               lifted_initializer_graph=None,
               synchronization=None,
               aggregation=None,
               shape=None,
               **unused_kwargs):
    """Creates a variable.

    Args:
      initial_value: A `Tensor`, or Python object convertible to a `Tensor`,
        which is the initial value for the Variable. The initial value must have
        a shape specified unless `validate_shape` is set to False. Can also be a
        callable with no argument that returns the initial value when called.
        (Note that initializer functions from init_ops.py must first be bound
         to a shape before being used here.)
      trainable: If `True`, GradientTapes automatically watch uses of this
        Variable.
      caching_device: Optional device string or function describing where the
        Variable should be cached for reading.  Defaults to the Variable's
        device.  If not `None`, caches on another device.  Typical use is to
        cache on the device where the Ops using the Variable reside, to
        deduplicate copying through `Switch` and other conditional statements.
      name: Optional name for the variable. Defaults to `'Variable'` and gets
        uniquified automatically.
      dtype: If set, initial_value will be converted to the given type.
        If None, either the datatype will be kept (if initial_value is
       a Tensor) or float32 will be used (if it is a Python object convertible
       to a Tensor).
      constraint: An optional projection function to be applied to the variable
        after being updated by an `Optimizer` (e.g. used to implement norm
        constraints or value constraints for layer weights). The function must
        take as input the unprojected Tensor representing the value of the
        variable and return the Tensor for the projected value
        (which must have the same shape). Constraints are not safe to
        use when doing asynchronous distributed training.
      add_initializers_to: if not None and not in legacy graph mode, the
        initializer tensor will be added to this map in addition to adding the
        assignment to the function.
      lifted_initializer_graph: FuncGraph to try to lift initializers to.
      synchronization: Indicates when a distributed a variable will be
        aggregated. Accepted values are constants defined in the class
        `tf.VariableSynchronization`. By default the synchronization is set to
        `AUTO` and the current `DistributionStrategy` chooses
        when to synchronize.
      aggregation: Indicates how a distributed variable will be aggregated.
        Accepted values are constants defined in the class
        `tf.VariableAggregation`.
      shape: (optional) The shape of this variable. If None, the shape of
        `initial_value` will be used. When setting this argument to
        `tf.TensorShape(None)` (representing an unspecified shape), the variable
        can be assigned with values of different shapes.

    Raises:
      ValueError: If the initial value is not specified, or does not have a
        shape and `validate_shape` is `True`.
      RuntimeError: If called outside of a function definition.
    """
    if not ops.inside_function():
      # If we've been init_scope()d out of the function definition nothing to do
      # here; we can't really do the capturing or conditional logic.
      resource_variable_ops.ResourceVariable.__init__(
          self, initial_value=initial_value, trainable=trainable,
          caching_device=caching_device, name=name, dtype=dtype,
          constraint=constraint)
      return
    if initial_value is None:
      raise ValueError("initial_value must be specified.")
    init_from_fn = callable(initial_value)

    if constraint is not None and not callable(constraint):
      raise ValueError("The `constraint` argument must be a callable.")

    if isinstance(initial_value, trackable.CheckpointInitialValue):
      self._maybe_initialize_trackable()
      self._update_uid = initial_value.checkpoint_position.restore_uid
      initial_value = initial_value.wrapped_value

    with ops.name_scope(name, "Variable", []
                        if init_from_fn else [initial_value]) as scope_name:
      with ops.name_scope("Initializer"), ops.device(None):
        initial_value = ops.convert_to_tensor(
            initial_value() if init_from_fn else initial_value,
            name="initial_value", dtype=dtype)
      assert initial_value is not None

      # Don't use `shape or initial_value.shape` since TensorShape has
      # overridden `__bool__`.
      if shape is None:
        shape = initial_value.shape

    # Use the constructor for UninitializedVariable to start. Outside the name
    # scope so we don't double up the prefix.
    super(UnliftedInitializerVariable, self).__init__(
        trainable=trainable,
        caching_device=caching_device,
        name=name,
        shape=shape,
        dtype=initial_value.dtype,
        constraint=constraint,
        synchronization=synchronization,
        aggregation=aggregation,
        extra_handle_data=initial_value,
        **unused_kwargs)

    with ops.name_scope(scope_name):
      if self._in_graph_mode:
        with ops.init_scope():
          outer_graph = ops.get_default_graph()
        func_graph = ops.get_default_graph()
        function_placeholders = (
            func_graph.inputs + func_graph.internal_captures)
        placeholder_ops = set(
            [tensor.op for tensor in function_placeholders])
        lifted_initializer = lift_to_graph.lift_to_graph(
            [initial_value], outer_graph,
            disallowed_placeholders=placeholder_ops)[initial_value]
        with ops.init_scope():
          self._initial_value = lifted_initializer
          with ops.name_scope("IsInitialized"):
            self._is_initialized_op = (
                resource_variable_ops.var_is_initialized_op(self._handle))
          if initial_value is not None:
            with ops.name_scope("Assign") as n, ops.colocate_with(self._handle):
              self._initializer_op = resource_variable_ops.assign_variable_op(
                  self._handle, lifted_initializer, name=n)
      else:
        if add_initializers_to is not None:
          add_initializers_to[self] = initial_value
        def assign_fn():
          with ops.name_scope("Assign") as n, ops.colocate_with(self._handle):
            resource_variable_ops.assign_variable_op(
                self._handle,
                initial_value,
                name=n)
            # Returning values to keep tf.cond happy.
          return ops.convert_to_tensor(1)
        def not_assign_fn():
          return ops.convert_to_tensor(0)
        # Note: this cond is always guaranteed to run because we're inside a
        # defun which will insert automatic control dependencies.
        control_flow_ops.cond(
            resource_variable_ops.var_is_initialized_op(self._handle),
            not_assign_fn, assign_fn)
Example #60
0
    def pack(self, grouped_grads_and_vars):
        """Pack tensors."""
        self.grouped_grads_and_vars = grouped_grads_and_vars
        self.all_tower_shapes = []
        self.all_tower_sizes = []

        device_grad_packs = []
        for tower_grads_and_vars in grouped_grads_and_vars:
            with ops.colocate_with(tower_grads_and_vars[0][0]):
                # Flatten all the grads.
                flat_grads = [
                    array_ops.reshape(g, [-1]) for g, _ in tower_grads_and_vars
                ]
                # Remember the original shape of all the grads.
                tower_shapes = [
                    array_ops.shape(g) for g, _ in tower_grads_and_vars
                ]
                # Remember the original sizes of all the grads.
                tower_sizes = [
                    array_ops.size(g) for g, _ in tower_grads_and_vars
                ]
                # Concat all the flat grads into a big flat tensor.
                concat_grads = array_ops.concat(flat_grads, 0)

                # Split the big tensor into num_splits packs. In cases where the
                # total size is not divisible num_splits, the last pack gets
                # more elements.
                # TODO(zhengxq): it is also possible to optimize away all the concat
                # as well.
                num_splits = self.num_packs

                # The array_ops.size function will sometimes remove static shapes. So if
                # all gradient shapes are defined, we use another method to get the
                # total size.
                # TODO(yuefengz): move this logic to array_ops.size.
                if all([
                        g.shape.is_fully_defined()
                        for g, _ in tower_grads_and_vars
                ]):
                    total_grad_size = sum([
                        g.shape.num_elements() for g, _ in tower_grads_and_vars
                    ])
                else:
                    total_grad_size = array_ops.size(concat_grads)

                split_size = total_grad_size // num_splits
                split_size_last = total_grad_size - split_size * (num_splits -
                                                                  1)
                split_sizes = [split_size] * (num_splits - 1) + [
                    split_size_last
                ]
                grad_packs = array_ops.split(concat_grads, split_sizes)

                # Ready to aggregate the repacked gradients, with fake variables.
                # TODO(zhengxq): It is hacky to have to use fake variables.
                # We should remove the need for variables in
                # aggregate_gradients_using*.
                device_grad_packs.append(zip(grad_packs, [None] * num_splits))
                self.all_tower_shapes.append(tower_shapes)
                self.all_tower_sizes.append(tower_sizes)

        return device_grad_packs