Example #1
 def _apply_sparse(self, grad, var):
   beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
   beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
   lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
   beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
   beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
   epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
   lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
   # m_t = beta1 * m + (1 - beta1) * g_t
   m = self.get_slot(var, "m")
   m_scaled_g_values = grad.values * (1 - beta1_t)
   m_t = state_ops.assign(m, m * beta1_t,
                          use_locking=self._use_locking)
   m_t = state_ops.scatter_add(m_t, grad.indices, m_scaled_g_values,
                               use_locking=self._use_locking)
   # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
   v = self.get_slot(var, "v")
   v_scaled_g_values = (grad.values * grad.values) * (1 - beta2_t)
   v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
   v_t = state_ops.scatter_add(v_t, grad.indices, v_scaled_g_values,
                               use_locking=self._use_locking)
   v_sqrt = math_ops.sqrt(v_t)
   var_update = state_ops.assign_sub(var,
                                     lr * m_t / (v_sqrt + epsilon_t),
                                     use_locking=self._use_locking)
   return control_flow_ops.group(*[var_update, m_t, v_t])
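The pattern above first assigns the decayed slot, then scatter-adds the scaled gradient into only the touched rows, which together realise m_t = beta1 * m + (1 - beta1) * g_t for a sparse gradient. A minimal NumPy sketch (illustrative values, not part of the example above) of what the two ops compute:

import numpy as np

beta1 = 0.9
m = np.array([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]])   # the "m" slot
indices = np.array([0, 2])                            # grad.indices
grad_values = np.ones((2, 2))                         # grad.values

m *= beta1                                            # state_ops.assign(m, m * beta1_t)
np.add.at(m, indices, grad_values * (1 - beta1))      # state_ops.scatter_add(m_t, indices, ...)
print(m)   # rows 0 and 2 become 0.9 * old + 0.1 * grad; row 1 is only decayed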
Example #2
 def _sparse_moving_average(self, x_tm1, idxs, b_t_, name, beta=.9):
   """
   Creates a moving average for a sparse variable.
   Inputs:
     x_tm1: the associated parameter (e.g. a weight matrix)
     idxs: the tensor representing the indices used
     b_t_: the value to accumulate (e.g. slices of the gradient)
     name: a string to use to retrieve it later (e.g. 'm')
     beta: the decay factor (defaults to .9)
   Outputs:
     a_t: the average after moving (same shape as x_tm1, not b_t_)
     t: the internal timestep (used to correct initialization bias)
   """
   
   a_tm1 = self._zeros_slot(x_tm1, '%s' % name, self._name)
   a_tm1_ = array_ops.gather(a_tm1, idxs)
   tm1 = self._zeros_idx_slot(x_tm1, '%s/tm1' % name, self._name)
   tm1_ = array_ops.gather(tm1, idxs)
   t = state_ops.scatter_add(tm1, idxs, tm1_*0+1, use_locking=self._use_locking)
   t_ = array_ops.gather(t, idxs)
   if beta < 1:
     beta_t = ops.convert_to_tensor(beta, name='%s/decay' % name)
     beta_t_ = beta_t * (1-beta_t**tm1_) / (1-beta_t**t_)
   else:
     beta_t_ = tm1_/t_
   a_t = state_ops.scatter_update(a_tm1, idxs, beta_t_*a_tm1_, use_locking=self._use_locking)
   a_t = state_ops.scatter_add(a_t, idxs, (1-beta_t)*b_t_, use_locking=self._use_locking)
   return a_t, t
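A small NumPy sketch (hypothetical scalar values) of the bias correction used above: applying beta_t = beta * (1 - beta**(t-1)) / (1 - beta**t) to the stored slice and (1 - beta_t) to the new value keeps the running average unbiased even at small per-index timesteps:

import numpy as np

beta = 0.9
a, t = 0.0, 0                      # stored average and per-index timestep
for g in [1.0, 1.0, 1.0]:
    t += 1
    beta_t = beta * (1 - beta**(t - 1)) / (1 - beta**t)
    a = beta_t * a + (1 - beta_t) * g
print(a)                           # stays at 1.0: no bias toward the zero init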
Example #3
    def _apply_sparse(self, grad, var):
        lr = (self._lr_t * math_ops.sqrt(1 - self._beta2_power) /
              (1 - self._beta1_power))
        # m_t = beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, "m")
        m_scaled_g_values = grad.values * (1 - self._beta1_t)
        m_t = state_ops.assign(m,
                               m * self._beta1_t,
                               use_locking=self._use_locking)

        m_t = state_ops.scatter_add(m_t,
                                    grad.indices,
                                    m_scaled_g_values,
                                    use_locking=self._use_locking)
        # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v = self.get_slot(var, "v")
        v_scaled_g_values = (grad.values * grad.values) * (1 - self._beta2_t)
        v_t = state_ops.assign(v,
                               v * self._beta2_t,
                               use_locking=self._use_locking)
        v_t = state_ops.scatter_add(v_t,
                                    grad.indices,
                                    v_scaled_g_values,
                                    use_locking=self._use_locking)
        v_sqrt = tf.pow(v_t, self._pow_t)
        var_update = state_ops.assign_sub(var,
                                          lr * m_t /
                                          (v_sqrt + self._epsilon_t),
                                          use_locking=self._use_locking)
        # regularization
        var_update = state_ops.assign_sub(var_update,
                                          self._sparse_regularization * var,
                                          use_locking=self._use_locking)

        return control_flow_ops.group(*[var_update, m_t, v_t])
Example #4
  def _apply_sparse_shared(self, grad, var, indices, scatter_add):
    beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
    # v_t = beta2 * v + beta2 * (1 - beta2) * ((g_t - m) * (g_t - m))
    v = self.get_slot(var, "v")
    m = self.get_slot(var, "m")
    v_scaled_g_values = beta2_t * (1 - beta2_t)
    m_t1 = state_ops.assign(m, -m, use_locking=self._use_locking)
    with ops.control_dependencies([m_t1]):
      m_t1 = state_ops.scatter_add(m_t1, indices, grad)
    m_t1 = state_ops.assign(m_t1, m_t1 * m_t1, use_locking=self._use_locking)
    m_t1 = state_ops.assign(m_t1, v_scaled_g_values * m_t1 , use_locking=self._use_locking)
    v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
    v_t = state_ops.assign_add(v_t, m_t1, use_locking=self._use_locking)

    # m_t = beta1 * m + (1 - beta1) * (g_t - m)
    #     = (2 * beta1 - 1) * m + (1 - beta1) * g_t
    m_scaled_g_values = grad * (1 - beta1_t)
    m_t = state_ops.assign(m, m * ( 2 * beta1_t - 1),
                           use_locking=self._use_locking)
    with ops.control_dependencies([m_t]):
      m_t = state_ops.scatter_add(m_t, indices, m_scaled_g_values)

    v_sqrt = math_ops.sqrt(v_t)
    var_update = state_ops.assign_sub(var,
                                      lr * m_t / (v_sqrt + epsilon_t),
                                      use_locking=self._use_locking)
    return control_flow_ops.group(*[var_update, m_t, v_t])
Example #5
 def _apply_sparse(self, grad, var):
     beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
     beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
     lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
     beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
     beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
     epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
     lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
     # m_t = beta1 * m + (1 - beta1) * g_t
     m = self.get_slot(var, "m")
     m_scaled_g_values = grad.values * (1 - beta1_t)
     m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
     m_t = state_ops.scatter_add(m_t,
                                 grad.indices,
                                 m_scaled_g_values,
                                 use_locking=self._use_locking)
     # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
     v = self.get_slot(var, "v")
     v_scaled_g_values = (grad.values * grad.values) * (1 - beta2_t)
     v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
     v_t = state_ops.scatter_add(v_t,
                                 grad.indices,
                                 v_scaled_g_values,
                                 use_locking=self._use_locking)
     v_sqrt = math_ops.sqrt(v_t)
     var_update = state_ops.assign_sub(var,
                                       lr * m_t / (v_sqrt + epsilon_t),
                                       use_locking=self._use_locking)
     return control_flow_ops.group(*[var_update, m_t, v_t])
Example #6
 def _sparse_moving_average(self, x_tm1, idxs, b_t_, name, beta=.9):
   """
   Creates a moving average for a sparse variable.
   Inputs:
     x_tm1: the associated parameter (e.g. a weight matrix)
     idxs: the tensor representing the indices used
     b_t_: the value to accumulate (e.g. slices of the gradient)
     name: a string to use to retrieve it later (e.g. 'm')
     beta: the decay factor (defaults to .9)
   Outputs:
     a_t: the average after moving (same shape as x_tm1, not b_t_)
     t: the internal timestep (used to correct initialization bias)
   """
   
   a_tm1 = self._zeros_slot(x_tm1, '%s' % name, self._name)
   a_tm1_ = array_ops.gather(a_tm1, idxs)
   tm1 = self._zeros_idx_slot(x_tm1, '%s/tm1' % name, self._name)
   tm1_ = array_ops.gather(tm1, idxs)
   t = state_ops.scatter_add(tm1, idxs, tm1_*0+1, use_locking=self._use_locking)
   t_ = array_ops.gather(t, idxs)
   if beta < 1:
     beta_t = ops.convert_to_tensor(beta, name='%s/decay' % name)
     beta_t_ = beta_t * (1-beta_t**tm1_) / (1-beta_t**t_)
   else:
     beta_t_ = tm1_/t_
   a_t = state_ops.scatter_update(a_tm1, idxs, beta_t_*a_tm1_, use_locking=self._use_locking)
   a_t = state_ops.scatter_add(a_t, idxs, (1-beta_t)*b_t_, use_locking=self._use_locking)
   return a_t, t
  def _get_coordinatewise_learning_rate(self, grad, var):
    # Compute the learning rate using a moving average for the diagonal of BB^T
    avg_first = self.get_slot(var, 'first_moment')
    avg_second = self.get_slot(var, 'second_moment')
    decay_tensor = math_ops.cast(self._decay_tensor, var.dtype)
    batch_size = math_ops.cast(self._batch_size_tensor, var.dtype)

    # Create an estimator for the moving average of gradient mean and variance
    # via Welford's algorithm
    if isinstance(grad, ops.Tensor):
      delta = grad - avg_first
      first_moment_update = avg_first.assign_add(
          array_ops.where(self._counter < 1, math_ops.cast(1, var.dtype),
                          1. - decay_tensor) * delta)

      with ops.control_dependencies([first_moment_update]):
        second_moment_update = avg_second.assign_add(
            math_ops.cast(self._counter < 1, var.dtype) *
            -(1. - decay_tensor) * (
                avg_second - decay_tensor  * math_ops.square(delta)))
      diag_preconditioner = control_flow_ops.with_dependencies(
          [second_moment_update],
          clip_ops.clip_by_value(avg_second, 1e-12, 1e12))
    elif isinstance(grad, ops.IndexedSlices):
      delta = grad.values - array_ops.gather_nd(avg_first, grad.indices)
      first_moment_update = state_ops.scatter_add(
          avg_first,
          grad.indices,
          array_ops.where(self._counter < 1,
                          math_ops.cast(1., var.dtype),
                          1. - decay_tensor) * delta)

      with ops.control_dependencies([first_moment_update]):
        avg_second = state_ops.scatter_add(
            avg_second,
            grad.indices,
            math_ops.cast(self._counter < 1, var.dtype) *
            -(1. - decay_tensor) * (
                array_ops.gather_nd(avg_second, grad.indices) - decay_tensor *
                math_ops.square(delta)))
        avg_second = array_ops.gather_nd(avg_second, grad.indices)
        # TODO(b/70783772)
        diag_preconditioner = clip_ops.clip_by_value(avg_second, 1e-12, 1e12)
    else:
      raise errors.InvalidArgumentError(
          None, None, 'grad must be of type Tensor or IndexedSlices')

    diag_preconditioner *= batch_size

    if self._use_single_learning_rate:
      diag_preconditioner = math_ops.reduce_mean(diag_preconditioner)

    # From Theorem 2 Corollary 1 of Mandt et al. 2017
    return 2. * batch_size / (
        math_ops.cast(self._total_num_examples, var.dtype.base_dtype) *
        diag_preconditioner)
Example #8
    def _get_coordinatewise_learning_rate(self, grad, var):
        # Compute the learning rate using a moving average for the diagonal of BB^T
        avg_first = self.get_slot(var, 'first_moment')
        avg_second = self.get_slot(var, 'second_moment')
        decay_tensor = math_ops.cast(self._decay_tensor, var.dtype)
        batch_size = math_ops.cast(self._batch_size_tensor, var.dtype)

        # Create an estimator for the moving average of gradient mean and variance
        # via Welford's algorithm
        if isinstance(grad, ops.Tensor):
            delta = grad - avg_first
            first_moment_update = avg_first.assign_add(
                array_ops.where(self._counter < 1, math_ops.cast(1, var.dtype),
                                1. - decay_tensor) * delta)

            with ops.control_dependencies([first_moment_update]):
                second_moment_update = avg_second.assign_add(
                    math_ops.cast(self._counter < 1, var.dtype) *
                    -(1. - decay_tensor) *
                    (avg_second - decay_tensor * math_ops.square(delta)))
            diag_preconditioner = control_flow_ops.with_dependencies(
                [second_moment_update],
                clip_ops.clip_by_value(avg_second, 1e-12, 1e12))
        elif isinstance(grad, ops.IndexedSlices):
            delta = grad.values - array_ops.gather_nd(avg_first, grad.indices)
            first_moment_update = state_ops.scatter_add(
                avg_first, grad.indices,
                array_ops.where(self._counter < 1, math_ops.cast(
                    1., var.dtype), 1. - decay_tensor) * delta)

            with ops.control_dependencies([first_moment_update]):
                avg_second = state_ops.scatter_add(
                    avg_second, grad.indices,
                    math_ops.cast(self._counter < 1, var.dtype) *
                    -(1. - decay_tensor) *
                    (array_ops.gather_nd(avg_second, grad.indices) -
                     decay_tensor * math_ops.square(delta)))
                avg_second = array_ops.gather_nd(avg_second, grad.indices)
                # TODO (b/70783772) id:488 gh:489
                diag_preconditioner = clip_ops.clip_by_value(
                    avg_second, 1e-12, 1e12)
        else:
            raise errors.InvalidArgumentError(
                None, None, 'grad must be of type Tensor or IndexedSlices')

        diag_preconditioner *= batch_size

        if self._use_single_learning_rate:
            diag_preconditioner = math_ops.reduce_mean(diag_preconditioner)

        # From Theorem 2 Corollary 1 of Mandt et al. 2017
        return 2. * batch_size / (
            math_ops.cast(self._total_num_examples, var.dtype.base_dtype) *
            diag_preconditioner)
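For a sense of scale, a toy NumPy sketch (made-up numbers) of the closing formula, 2 * batch_size / (total_num_examples * diag_preconditioner), where the preconditioner is the clipped moving second-moment estimate scaled by the batch size:

import numpy as np

batch_size = 32.0
total_num_examples = 50000.0
avg_second = np.array([1e-3, 2.5e-2, 4.0])            # moving second-moment estimate
diag_preconditioner = np.clip(avg_second, 1e-12, 1e12) * batch_size
lr = 2.0 * batch_size / (total_num_examples * diag_preconditioner)
print(lr)   # larger steps for coordinates with smaller estimated gradient noise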
Example #9
 def _apply_sparse(self, grad, var):
     # ms_t = decay * ms + (1 - decay) * (g_t * g_t)
     ms = self.get_slot(var, "rms")  # should not be named rms when it's ms
     print('---SPARSE TIME---')
     print('lr: ' + str(self._learning_rate_tensor.get_shape()))
     print('decay: ' + str(self._decay_tensor.get_shape()))
     print('momentum: ' + str(self._momentum_tensor.get_shape()))
     print('epsilon: ' + str(self._epsilon_tensor.get_shape()))
     print('ms: ' + str(ms.get_shape()))
     print('grad.values: ' + str(grad.values.get_shape()))
     ms_scaled_g_values = (grad.values * grad.values) * \
                          (1 - self._decay_tensor)
     print('ms_scaled_g_values:' + str(ms_scaled_g_values.get_shape()))
     # decay the accumulator, then scatter-add the scaled squared gradients into the updated rows
     ms_t = state_ops.assign(ms,
                             ms * self._decay_tensor,
                             use_locking=self._use_locking)
     print('ms_t: ' + str(ms_t.get_shape()))
     ms_t = state_ops.scatter_add(ms_t,
                                  grad.indices,
                                  ms_scaled_g_values,
                                  use_locking=self._use_locking)
     print('ms_t: ' + str(ms_t.get_shape()))
     rms = math_ops.sqrt(ms_t)
     print('rms: ' + str(rms.get_shape()))
     rms += self._epsilon_tensor
     print('rms: ' + str(rms.get_shape()))
     mom = self.get_slot(var, "momentum")
     print('mom: ' + str(mom.get_shape()))
     sparse_grad = self.get_slot(var, "sparse_grad")
     sparse_grad_t = state_ops.assign(sparse_grad,
                                      sparse_grad,
                                      use_locking=self._use_locking)
     sparse_grad_t = state_ops.scatter_add(sparse_grad,
                                           grad.indices,
                                           grad.values *
                                           self._learning_rate,
                                           use_locking=self._use_locking)
     mom_scaled_g_values = sparse_grad_t / rms
     print('mom_scaled_g_values: ' + str(mom.get_shape()))
     mom_t = state_ops.assign(mom,
                              mom * self._momentum_tensor,
                              use_locking=self._use_locking)
     print('mom_t: ' + str(mom_t.get_shape()))
     mom_t += mom_scaled_g_values
     #    mom_t = state_ops.scatter_add(mom_t, grad.indices, mom_scaled_g_values,
     #                                  use_locking=self._use_locking)
     print('mom_t: ' + str(mom_t.get_shape()))
     var_update = state_ops.assign_sub(var,
                                       mom_t,
                                       use_locking=self._use_locking)
     return control_flow_ops.group(*[var_update, ms_t, mom_t])
  def testWrongShape(self):
    # Indices and values mismatch.
    var = variables.Variable(
        array_ops.zeros(shape=[1024, 64, 64], dtype=dtypes.float32))
    indices = array_ops.placeholder(dtypes.int32, shape=[32])
    values = array_ops.placeholder(dtypes.float32, shape=[33, 64, 64])
    with self.assertRaises(ValueError):
      state_ops.scatter_add(var, indices, values)

    # Var and values mismatch.
    values = array_ops.placeholder(dtypes.float32, shape=[32, 64, 63])
    with self.assertRaises(ValueError):
      state_ops.scatter_add(var, indices, values)
  def testWrongShape(self):
    # Indices and values mismatch.
    var = variables.Variable(
        array_ops.zeros(shape=[1024, 64, 64], dtype=dtypes.float32))
    indices = array_ops.placeholder(dtypes.int32, shape=[32])
    values = array_ops.placeholder(dtypes.float32, shape=[33, 64, 64])
    with self.assertRaises(ValueError):
      state_ops.scatter_add(var, indices, values)

    # Var and values mismatch.
    values = array_ops.placeholder(dtypes.float32, shape=[32, 64, 63])
    with self.assertRaises(ValueError):
      state_ops.scatter_add(var, indices, values)
Example #12
 def _apply_sparse(self, grad, var):
   if len(grad.indices.get_shape()) == 1:
     grad_indices = grad.indices
     grad_values = grad.values
   else:
     grad_indices = array_ops.reshape(grad.indices, [-1])
     grad_values = array_ops.reshape(grad.values, [-1, grad.values.get_shape()[-1].value])
   gidxs, metagidxs = array_ops.unique(grad_indices)
   sizegidxs = array_ops.size(gidxs)
   gvals = math_ops.unsorted_segment_sum(grad_values, metagidxs, sizegidxs)
   # m_t = mu * m + (1 - mu) * g_t
   m = self.get_slot(var, "m")
   m_scaled_g_values = gvals * (1 - self._mu_t)
   m_t = state_ops.scatter_update(m, gidxs,
                                  array_ops.gather(m, gidxs) * self._mu_t,
                                  use_locking=self._use_locking)
   m_t = state_ops.scatter_add(m_t, gidxs, m_scaled_g_values,
                               use_locking=self._use_locking)
   m_t_ = array_ops.gather(m_t, gidxs) / (1 - self._mu2_t * self._mu_power)
   # m_bar = mu * m_t + (1 - mu) * g_t
   m_bar = self._mu2_t * m_t_ + m_scaled_g_values / (1 - self._mu_power)
   var_update = state_ops.scatter_sub(var, gidxs,
                                    self._lr_t * m_bar,
                                    use_locking=self._use_locking)
   return control_flow_ops.group(*[var_update, m_t])
Example #13
 def _apply_sparse(self, grad, var):
   return self._apply_sparse_shared(
       grad.values, var, grad.indices,
       lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
           x, i, v, use_locking=self._use_locking),
       lambda x, i, v: state_ops.scatter_update(  # pylint: disable=g-long-lambda
           x, i, v, use_locking=self._use_locking))
Example #14
 def _apply_sparse(self, grad, var):
   return self._apply_sparse_shared(
       grad.values, var, grad.indices,
       lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
           x, i, v, use_locking=self._use_locking),
       lambda x, i, v: state_ops.scatter_update(  # pylint: disable=g-long-lambda
           x, i, v, use_locking=self._use_locking))
Example #15
    def _apply_sparse(self, grad, var):
        """Add ops to apply sparse gradients to `var`.

        The IndexedSlices object passed to `grad` in this function is by default
        pre-processed in `_apply_sparse_duplicate_indices` to remove duplicate
        indices (see its docstring for details). Optimizers which can tolerate or
        have correct special cases for duplicate sparse indices may override
        `_apply_sparse_duplicate_indices` instead of this function, avoiding that
        overhead.

        Args:
          grad: `IndexedSlices`, with no repeated indices.
          var: A `Variable` object.

        Returns:
          An `Operation`.
        """
        return self._apply_sparse_shared(
            grad.values,
            var,
            grad.indices,
            lambda x, i, v: state_ops.scatter_add(  # pylint: disable=g-long-lambda
                x,
                i,
                v,
                use_locking=self._use_locking))
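As the docstring above explains, duplicate indices are removed before this is called because scatter_add and scatter_update disagree on repeated indices. A plain NumPy sketch (illustrative values) of the difference:

import numpy as np

base = np.zeros(4)
indices = np.array([1, 1, 3])
updates = np.array([10.0, 20.0, 5.0])

added = base.copy()
np.add.at(added, indices, updates)   # scatter_add-style: duplicates accumulate, index 1 -> 30
updated = base.copy()
updated[indices] = updates           # scatter_update-style: one duplicate wins, index 1 -> 20
print(added)     # [ 0. 30.  0.  5.]
print(updated)   # [ 0. 20.  0.  5.]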
Example #16
  def _apply_sparse(self, grad, var):
    beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_scaled_g_values = grad.values * (1 - beta1_t)
    m_t = state_ops.assign(m, m * beta1_t,
                           use_locking=self._use_locking)
    m_t = state_ops.scatter_add(m_t, grad.indices, m_scaled_g_values,
                               use_locking=self._use_locking)



    # u_t = max(beta_2 * u_{t-1}, L1(g_t))
    # theta_t = theta_{t-1} - alpha/(1-beta_1).m_t/u_t

    v = self.get_slot(var, "v")
    g_abs_values = math_ops.abs(grad.values)
    v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
    # scatter_max is the sparse counterpart of u_t = max(beta2 * u_{t-1}, |g_t|)
    v_t = state_ops.scatter_max(v_t, grad.indices, g_abs_values,
                                use_locking=self._use_locking)
    # epsilon_t guards rows whose max slot is still zero
    var_update = state_ops.assign_sub(var,
                                      lr * m_t / (v_t * (1 - beta1_t) + epsilon_t),
                                      use_locking=self._use_locking)

    return control_flow_ops.group(*[var_update, m_t, v_t])
Example #17
    def _apply_sparse(self, grad, var):
        lr = self._lr_t * math_ops.sqrt(1 - self._beta2_power) / (1 - self._beta1_power)
        # m_t = beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, "m")
        m_scaled_g_values = grad.values * (1 - self._beta1_t)
        m_t = state_ops.assign(m, m * self._beta1_t, use_locking=self._use_locking)

        m_t = state_ops.scatter_add(m_t, grad.indices, m_scaled_g_values, use_locking=self._use_locking)
        # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v = self.get_slot(var, "v")
        v_scaled_g_values = (grad.values * grad.values) * (1 - self._beta2_t)
        v_t = state_ops.assign(v, v * self._beta2_t, use_locking=self._use_locking)
        v_t = state_ops.scatter_add(v_t, grad.indices, v_scaled_g_values, use_locking=self._use_locking)
        v_sqrt = tf.pow(v_t, self._pow_t)
        var_update = state_ops.assign_sub(var, lr * m_t / (v_sqrt + self._epsilon_t), use_locking=self._use_locking)
        # regularization
        var_update = state_ops.assign_sub(var_update, self._sparse_regularization * var, use_locking=self._use_locking)

        return control_flow_ops.group(*[var_update, m_t, v_t])
 def _apply_sparse(self, grad, var):
     grad_t = self.get_slot(var, "gg")
     grad_t = state_ops.assign_sub(grad_t,
                                   tf.zeros_like(grad_t),
                                   use_locking=self._use_locking)
     grad_t = state_ops.scatter_add(grad_t,
                                    grad.indices,
                                    grad.values,
                                    use_locking=self._use_locking)
     return self._apply_dense(grad_t, var)
Example #19
  def _apply_sparse(self, grad, var):
    # ms_t = decay * ms + (1 - decay) * (g_t * g_t)
    ms = self.get_slot(var, "rms") # should not be named rms when it's ms
    print('---SPARSE TIME---')
    print('lr: ' + str(self._learning_rate_tensor.get_shape()))
    print('decay: ' + str(self._decay_tensor.get_shape()))
    print('momentum: ' + str(self._momentum_tensor.get_shape()))
    print('epsilon: ' + str(self._epsilon_tensor.get_shape()))
    print('ms: ' + str(ms.get_shape()))
    print('grad.values: ' + str(grad.values.get_shape()))
    ms_scaled_g_values = (grad.values * grad.values) * \
                         (1 - self._decay_tensor)
    print('ms_scaled_g_values:' + str(ms_scaled_g_values.get_shape()))
    # decay the accumulator, then scatter-add the scaled squared gradients into the updated rows
    ms_t = state_ops.assign(ms, ms * self._decay_tensor,
                            use_locking=self._use_locking)
    print('ms_t: ' + str(ms_t.get_shape()))
    ms_t = state_ops.scatter_add(ms_t, grad.indices, ms_scaled_g_values,
                                 use_locking=self._use_locking)
    print('ms_t: ' + str(ms_t.get_shape()))
    rms = math_ops.sqrt(ms_t)
    print('rms: ' + str(rms.get_shape()))
    rms += self._epsilon_tensor
    print('rms: ' + str(rms.get_shape()))
    mom = self.get_slot(var, "momentum")
    print('mom: ' + str(mom.get_shape()))
    sparse_grad = self.get_slot(var, "sparse_grad")
    sparse_grad_t = state_ops.assign(sparse_grad, sparse_grad, use_locking=self._use_locking)
    sparse_grad_t = state_ops.scatter_add(sparse_grad, grad.indices, grad.values*self._learning_rate, use_locking=self._use_locking)
    mom_scaled_g_values = sparse_grad_t / rms
    print('mom_scaled_g_values: ' + str(mom.get_shape()))
    mom_t = state_ops.assign(mom, mom * self._momentum_tensor,
                             use_locking=self._use_locking)
    print('mom_t: ' + str(mom_t.get_shape()))
    mom_t += mom_scaled_g_values
#    mom_t = state_ops.scatter_add(mom_t, grad.indices, mom_scaled_g_values,
#                                  use_locking=self._use_locking)
    print('mom_t: ' + str(mom_t.get_shape()))
    var_update = state_ops.assign_sub(var, mom_t,
                                      use_locking=self._use_locking)
    return control_flow_ops.group(*[var_update, ms_t, mom_t])
 def _apply_sparse(self, grad, var):
     # return self._apply_sparse_shared(
     #     grad.values, var, grad.indices,
     #     lambda x, i, v: state_ops.scatter_add(x, i, v, use_locking=self._use_locking))]
     grad_t = self.get_slot(var, "g%d" % self._keep_num)
     grad_t = state_ops.assign_sub(grad_t,
                                   tf.zeros_like(grad_t),
                                   use_locking=self._use_locking)
     grad_t = state_ops.scatter_add(grad_t,
                                    grad.indices,
                                    grad.values,
                                    use_locking=self._use_locking)
     return self._apply_dense(grad_t, var)
Example #21
    def testAdd(self):
        variable = variables.Variable(array_ops.ones([8], dtype=dtypes.int32))
        resource_variable = resource_variable_ops.ResourceVariable(
            array_ops.ones([8], dtype=dtypes.int32))
        indices = constant_op.constant([4, 3, 1, 7])
        updates = constant_op.constant([0, 2, -1, 3], dtype=dtypes.int32)

        for ref in (variable, resource_variable):
            add_result = state_ops.scatter_add(ref, indices, updates)
            self.evaluate(ref.initializer)

            expected_result = constant_op.constant([1, 0, 1, 3, 1, 1, 1, 4])
            self.assertAllEqual(self.evaluate(add_result), expected_result)
            self.assertAllEqual(self.evaluate(ref), expected_result)
Example #22
    def _apply_sparse(self, grad, var):
        gamma_power = math_ops.cast(self._gamma_power, var.dtype.base_dtype)
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
        gamma_t = math_ops.cast(self._gamma_t, var.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
        lr = lr_t * math_ops.sqrt(1 - gamma_power)
        # v_t = gamma * v + gamma * (1 - gamma) * ((g_t - m) * (g_t - m))
        v = self.get_slot(var, "v")
        m = self.get_slot(var, "m")
        v_scaled_g_values = gamma_t * (1 - gamma_t)
        m_t1 = state_ops.assign(m, -m, use_locking=self._use_locking)
        m_t1 = state_ops.scatter_add(m_t1,
                                     grad.indices,
                                     grad.values,
                                     use_locking=self._use_locking)
        m_t1 = state_ops.assign(m_t1,
                                m_t1 * m_t1,
                                use_locking=self._use_locking)
        m_t1 = state_ops.assign(m_t1,
                                v_scaled_g_values * m_t1,
                                use_locking=self._use_locking)
        v_t = state_ops.assign(v, v * gamma_t, use_locking=self._use_locking)
        v_t = state_ops.assign_add(v_t, m_t1, use_locking=self._use_locking)

        # m_t = gamma * m + (1 - gamma) * g_t
        m_scaled_g_values = grad.values * (1 - gamma_t)
        m_t = state_ops.assign(m, m * gamma_t, use_locking=self._use_locking)
        m_t = state_ops.scatter_add(m_t,
                                    grad.indices,
                                    m_scaled_g_values,
                                    use_locking=self._use_locking)

        v_sqrt = math_ops.sqrt(v_t)
        var_update = state_ops.assign_sub(var,
                                          lr * m_t / (v_sqrt + epsilon_t),
                                          use_locking=self._use_locking)
        return control_flow_ops.group(*[var_update, m_t, v_t])
Example #23
  def _apply_sparse(self, grad, var):
    beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    


    lr = (lr_t * (1 - beta2_power) / (1 - beta1_power))

    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_scaled_g_values = grad.values * (1 - beta1_t)
    m_t = state_ops.assign(m, m * beta1_t,
                           use_locking=self._use_locking)
    m_t = state_ops.scatter_add(m_t, grad.indices, m_scaled_g_values,
                               use_locking=self._use_locking)
    # gn_c = ((d/dy) * dLdy) * dydvar ** 2
    # gn_t = beta2 * gn + (1 - beta2) * (gn_c)
    dLdy = tf.gradients(self._loss_t, self._pred_t)
    sec_loss = tf.gradients(dLdy, self._pred_t)
    # tf.gradients returns a list; take the single gradient tensor before casting
    sec_loss_t = math_ops.cast(sec_loss[0], var.dtype.base_dtype)
    dydvar = tf.gradients(self._pred_t, var)
    dydvar_t = math_ops.cast(dydvar[0], var.dtype.base_dtype)
    gn_c = sec_loss_t * dydvar_t * dydvar_t
    gn = self.get_slot(var, "gn")
    gn_scaled_g_values = gn_c * (1 - beta2_t)
    gn_t = state_ops.assign(gn, gn * beta2_t, use_locking=self._use_locking)
    gn_t = state_ops.scatter_add(gn_t, grad.indices, gn_scaled_g_values,
                                 use_locking=self._use_locking)
    var_update = state_ops.assign_sub(var,
                                      lr * m_t / (gn_t + epsilon_t),
                                      use_locking=self._use_locking)
    return control_flow_ops.group(*[var_update, m_t, gn_t])
 def _get_partitioned_update_ops(self, v_num, num_partitions_by_var,
                                 p_assignments_by_var, gather_ids_by_var,
                                 weights, full_update, p_assignments,
                                 num_partitions):
     """Get updates for partitioned variables."""
     num_partitions = num_partitions_by_var[v_num]
     p_assignments = p_assignments_by_var[v_num]
     gather_ids = gather_ids_by_var[v_num]
     updates = data_flow_ops.dynamic_partition(full_update, p_assignments,
                                               num_partitions)
     update_ops = []
     for p in range(num_partitions):
         with ops.colocate_with(weights[p]):
             result = state_ops.scatter_add(weights[p], gather_ids[p],
                                            updates[p])
         update_ops.append(result)
     return update_ops
Example #25
 def _get_partitioned_update_ops(self,
                                 v_num,
                                 num_partitions_by_var,
                                 p_assignments_by_var,
                                 gather_ids_by_var,
                                 weights,
                                 full_update,
                                 p_assignments,
                                 num_partitions):
   """Get updates for partitioned variables."""
   num_partitions = num_partitions_by_var[v_num]
   p_assignments = p_assignments_by_var[v_num]
   gather_ids = gather_ids_by_var[v_num]
   updates = data_flow_ops.dynamic_partition(
       full_update, p_assignments, num_partitions)
   update_ops = []
   for p in range(num_partitions):
     with ops.colocate_with(weights[p]):
       result = state_ops.scatter_add(weights[p], gather_ids[p], updates[p])
     update_ops.append(result)
   return update_ops
Example #26
    def _apply_sparse(self, grad, var):
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
        lamb_t = math_ops.cast(self._lamb_t, var.dtype.base_dtype)

        vzero = self.get_slot(var, "vzero")
        preG = self.get_slot(var, "preG")
        wzero = self.get_slot(var, "wzero")

        v_n_s = self.get_slot(var, "temp")

        # v_n_s = grad - preG + vzero
        temp = state_ops.assign(v_n_s, grad.values)
        with ops.control_dependencies([temp]):
            vns_update = state_ops.scatter_add(
                v_n_s, grad.indices, vzero - preG)
        with ops.control_dependencies([vns_update]):
            v_update = state_ops.assign(vzero, temp)
        v_t = var - lr_t * temp
        #prox = tf_utils.prox_L2(var - lr_t * v_n_s, lamb_t)
        prox = tf_utils.prox_L2(v_t, wzero, lr_t, lamb_t)
        var_update = state_ops.assign(var, prox)

        return control_flow_ops.group(*[var_update, v_update, ])
    def _apply_sparse_shared(self, grad, var, indices):

        shadow = self.get_slot(var, "shadow_{0}".format(self.worker_index))
        # if shadow is None:
        #   raise ValueError("None shadow with index = " + str(self.worker_index) + " and var = " + str(var))
        lambda_ = math_ops.cast(self._lambda_tensor, var.dtype.base_dtype)
        lr = math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype)

        var_slice = array_ops.gather(var, indices)
        shadow_slice = array_ops.gather(shadow, indices)

        var_scaled_g_values = lr * (grad + lambda_ * grad * grad *
                                    (var_slice - shadow_slice))

        var_t = state_ops.scatter_add(var,
                                      indices,
                                      -var_scaled_g_values,
                                      use_locking=self._use_locking)

        with ops.control_dependencies([var_t]):
            shadow_t = state_ops.assign(shadow, var_t)

        return control_flow_ops.group(*[var_t, shadow_t])
  def _apply_gradient(self, grad, var, indices=None):
    """The main function to update a variable.

    Args:
      grad: A Tensor containing gradient to apply.
      var: A Tensor containing the variable to update.
      indices: An array of integers, for sparse update.

    Returns:
      Updated variable var = var - learning_rate * preconditioner * grad

    If the gradient is dense, var and grad have the same shape.
    If the update is sparse, then the first dimension of the gradient and var
    may differ, others are all the same. In this case the indices array
    provides the set of indices of the variable which are to be updated with
    each row of the gradient.
    """
    global_step = self._global_step + 1

    # Update accumulated weighted average of gradients
    gbar = self.get_slot(var, "gbar")
    gbar_decay_t = GetParam(self._gbar_decay, global_step)
    gbar_weight_t = GetParam(self._gbar_weight, global_step)
    if indices is not None:
      # Note - the sparse update is not easily implemented, since the
      # algorithm needs all indices of gbar to be updated
      # if mat_gbar_decay != 1 or mat_gbar_decay != 0.
      # One way to make mat_gbar_decay = 1 is by rescaling.
      # If we want the update:
      #         G_{t+1} = a_{t+1} G_t + b_{t+1} w_t
      # define:
      #         r_{t+1} = a_{t+1} * r_t
      #         h_t = G_t / r_t
      # Then:
      #         h_{t+1} = h_t + (b_{t+1} / r_{t+1}) * w_t
      # So we get the mat_gbar_decay = 1 as desired.
      # We can implement this in a future version as needed.
      # However we still need gbar_decay = 0, otherwise all indices
      # of the variable will need to be updated.
      if self._gbar_decay != 0.0:
        tf_logging.warning("Not applying momentum for variable: %s" % var.name)
      gbar_updated = grad
    else:
      gbar_updated = self._weighted_average(gbar, self._gbar_decay,
                                            gbar_decay_t,
                                            gbar_weight_t * grad)

    # Update the preconditioners and compute the preconditioned gradient
    shape = var.get_shape()
    mat_g_list = []
    for i in range(len(shape)):
      mat_g_list.append(self.get_slot(var, "Gbar_" + str(i)))
    mat_gbar_decay_t = GetParam(self._mat_gbar_decay, global_step)
    mat_gbar_weight_t = GetParam(self._mat_gbar_weight, global_step)

    preconditioned_grad = gbar_updated
    v_rank = len(mat_g_list)
    neg_alpha = - GetParam(self._alpha, global_step) / v_rank
    svd_interval = GetParam(self._svd_interval, global_step)
    precond_update_interval = GetParam(self._precond_update_interval,
                                       global_step)
    for i, mat_g in enumerate(mat_g_list):
      # axes is the list of indices to reduce - everything but the current i.
      axes = list(range(i)) + list(range(i+1, v_rank))
      if shape[i] <= self._max_matrix_size:
        # If the tensor size is sufficiently small perform full Shampoo update
        # Note if precond_update_interval > 1 and mat_gbar_decay_t != 1, this
        # is not strictly correct. However we will use it for now, and
        # fix if needed. (G_1 = aG + bg ==> G_n = a^n G + (1+a+..+a^{n-1})bg)

        # pylint: disable=g-long-lambda,cell-var-from-loop
        mat_g_updated = control_flow_ops.cond(
            math_ops.mod(global_step, precond_update_interval) < 1,
            lambda: self._update_mat_g(
                mat_g, grad, axes, mat_gbar_decay_t,
                mat_gbar_weight_t * precond_update_interval, i),
            lambda: mat_g)

        mat_g_updated = mat_g_updated / float(shape[i].value)

        if self._svd_interval == 1:
          mat_h = self._compute_power(var, mat_g_updated, shape[i], neg_alpha)
        else:
          mat_h = control_flow_ops.cond(
              math_ops.mod(global_step, svd_interval) < 1,
              lambda: self._compute_power(var, mat_g_updated, shape[i],
                                          neg_alpha, "H_" + str(i)),
              lambda: self.get_slot(var, "H_" + str(i)))

        # mat_h is a square matrix of size d_i x d_i
        # preconditioned_grad is a d_i x ... x d_n x d_0 x ... d_{i-1} tensor
        # After contraction with a d_i x d_i tensor
        # it becomes a d_{i+1} x ... x d_n x d_0 x ... d_i tensor
        # (the first dimension is contracted out, and the second dimension of
        # mat_h is appended).  After going through all the indices, it becomes
        # a d_0 x ... x d_n tensor again.
        preconditioned_grad = math_ops.tensordot(preconditioned_grad, mat_h,
                                                 axes=([0], [0]),
                                                 name="precond_" + str(i))
      else:
        # Tensor size is too large -- perform diagonal Shampoo update
        # Only normalize non-vector cases.
        if axes:
          normalizer = 1.0 if indices is not None else float(shape[i].value)
          grad_outer = math_ops.reduce_sum(grad * grad, axis=axes) / normalizer
        else:
          grad_outer = grad * grad

        if i == 0 and indices is not None:
          assert self._mat_gbar_decay == 1.0
          mat_g_updated = state_ops.scatter_add(mat_g, indices,
                                                mat_gbar_weight_t * grad_outer)
          mat_g_updated_slice = array_ops.gather(mat_g_updated, indices)
          mat_h = array_ops.where(
              math_ops.greater(mat_g_updated_slice, 0),
              math_ops.pow(mat_g_updated_slice, neg_alpha),
              array_ops.zeros_like(mat_g_updated_slice))
        else:
          mat_g_updated = self._weighted_average(mat_g,
                                                 self._mat_gbar_decay,
                                                 mat_gbar_decay_t,
                                                 mat_gbar_weight_t * grad_outer)
          mat_h = array_ops.where(
              math_ops.greater(mat_g_updated, 0),
              math_ops.pow(mat_g_updated, neg_alpha),
              array_ops.zeros_like(mat_g_updated))

        # Need to do the transpose to ensure that the tensor becomes
        # a d_{i+1} x ... x d_n x d_0 x ... d_i tensor as described above.
        preconditioned_grad = array_ops.transpose(
            preconditioned_grad, perm=list(range(1, v_rank)) + [0]) * mat_h

    # Update the variable based on the Shampoo update
    learning_rate_t = GetParam(self._learning_rate, global_step)
    if indices is not None:
      var_updated = state_ops.scatter_add(
          var, indices, -learning_rate_t * preconditioned_grad)
    else:
      var_updated = state_ops.assign_sub(var,
                                         learning_rate_t * preconditioned_grad)
    return var_updated
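The comment block inside _apply_gradient derives a rescaling trick that turns the decayed accumulation G_{t+1} = a_{t+1} G_t + b_{t+1} w_t into a pure sum by tracking r_{t+1} = a_{t+1} r_t and h_t = G_t / r_t. A toy check of that identity in plain Python (arbitrary decay and weight values, purely illustrative):

a = [0.9, 0.8, 0.95]          # decays a_{t+1}
b = [0.1, 0.2, 0.05]          # weights b_{t+1}
w = [1.0, 2.0, 3.0]           # incoming values w_t

G, r, h = 0.0, 1.0, 0.0
for a_t, b_t, w_t in zip(a, b, w):
    G = a_t * G + b_t * w_t   # decayed accumulation
    r = a_t * r               # rescaling factor
    h = h + (b_t / r) * w_t   # decay-free form (mat_gbar_decay effectively 1)
print(G, h * r)               # both 0.606 (up to float rounding): the two forms agree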
 def testScatterAddStateOps(self):
   with context.eager_mode():
     v = resource_variable_ops.ResourceVariable([1.0, 2.0], name="add")
     state_ops.scatter_add(v, [1], [3])
     self.assertAllEqual([1.0, 5.0], v.numpy())
Example #30
def histogram_fixed_width(hist,
                          new_values,
                          value_range,
                          use_locking=False,
                          name='histogram_fixed_width'):
  """Update histogram Variable with new values.

  This Op fills histogram with counts of values falling within fixed-width,
  half-open bins.

  Args:
    hist:  1-D mutable `Tensor`, e.g. a `Variable`.
    new_values:  Numeric `Tensor`.
    value_range:  Shape [2] `Tensor`.  new_values <= value_range[0] will be
      mapped to hist[0], values >= value_range[1] will be mapped to hist[-1].
      Must be same dtype as new_values.
    use_locking:  Boolean.
      If `True`, use locking during the operation (optional).
    name:  A name for this operation (optional).

  Returns:
    An op that updates `hist` with `new_values` when evaluated.

  Examples:
  ```python
  # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
  nbins = 5
  value_range = [0.0, 5.0]
  new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]

  with tf.default_session() as sess:
    hist = variables.Variable(array_ops.zeros(nbins, dtype=tf.int32))
    hist_update = histogram_ops.histogram_fixed_width(hist, new_values,
                                                      value_range)
    variables.initialize_all_variables().run()
    sess.run(hist_update) => [2, 1, 1, 0, 2]
  ```
  """
  with ops.op_scope([hist, new_values, value_range], name) as scope:
    new_values = ops.convert_to_tensor(new_values, name='new_values')
    new_values = array_ops.reshape(new_values, [-1])
    value_range = ops.convert_to_tensor(value_range, name='value_range')
    dtype = hist.dtype

    # Map tensor values that fall within value_range to [0, 1].
    scaled_values = math_ops.truediv(new_values - value_range[0],
                                     value_range[1] - value_range[0],
                                     name='scaled_values')
    nbins = math_ops.cast(hist.get_shape()[0], scaled_values.dtype)

    # map tensor values within the open interval value_range to {0,.., nbins-1},
    # values outside the open interval will be zero or less, or nbins or more.
    indices = math_ops.floor(nbins * scaled_values, name='indices')

    # Clip edge cases (e.g. value = value_range[1]) or "outliers."
    indices = math_ops.cast(
        clip_ops.clip_by_value(indices, 0, nbins - 1), dtypes.int32)

    # Dummy vector to scatter.
    # TODO(langmore) Replace non-ideal creation of large dummy vector once an
    # alternative to scatter is available.
    updates = array_ops.ones([indices.get_shape()[0]], dtype=dtype)
    return state_ops.scatter_add(hist,
                                 indices,
                                 updates,
                                 use_locking=use_locking,
                                 name=scope)
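The docstring example above can be reproduced in a few lines of NumPy (same bins and values), which also shows why the indices are clipped for out-of-range values:

import numpy as np

value_range = np.array([0.0, 5.0])
new_values = np.array([-1.0, 0.0, 1.5, 2.0, 5.0, 15.0])
nbins = 5

scaled_values = (new_values - value_range[0]) / (value_range[1] - value_range[0])
indices = np.clip(np.floor(nbins * scaled_values), 0, nbins - 1).astype(int)
hist = np.zeros(nbins, dtype=int)
np.add.at(hist, indices, 1)          # the scatter_add of the dummy ones vector
print(hist)                          # [2 1 1 0 2], matching the docstring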
Example #31
def histogram_fixed_width(hist,
                          new_values,
                          value_range,
                          use_locking=False,
                          name='histogram_fixed_width'):
    """Update histogram Variable with new values.

    This Op fills histogram with counts of values falling within fixed-width,
    half-open bins.

    Args:
      hist:  1-D mutable `Tensor`, e.g. a `Variable`.
      new_values:  Numeric `Tensor`.
      value_range:  Shape [2] `Tensor`.  new_values <= value_range[0] will be
        mapped to hist[0], values >= value_range[1] will be mapped to hist[-1].
        Must be same dtype as new_values.
      use_locking:  Boolean.
        If `True`, use locking during the operation (optional).
      name:  A name for this operation (optional).

    Returns:
      An op that updates `hist` with `new_values` when evaluated.

    Examples:
    ```python
    # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
    nbins = 5
    value_range = [0.0, 5.0]
    new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]

    with tf.default_session() as sess:
      hist = variables.Variable(array_ops.zeros(nbins, dtype=tf.int32))
      hist_update = histogram_ops.histogram_fixed_width(hist, new_values,
                                                        value_range)
      variables.initialize_all_variables().run()
      sess.run(hist_update) => [2, 1, 1, 0, 2]
    ```
    """
    with ops.op_scope([hist, new_values, value_range], name) as scope:
        new_values = ops.convert_to_tensor(new_values, name='new_values')
        new_values = array_ops.reshape(new_values, [-1])
        value_range = ops.convert_to_tensor(value_range, name='value_range')
        dtype = hist.dtype

        # Map tensor values that fall within value_range to [0, 1].
        scaled_values = math_ops.truediv(new_values - value_range[0],
                                         value_range[1] - value_range[0],
                                         name='scaled_values')
        nbins = math_ops.cast(hist.get_shape()[0], scaled_values.dtype)

        # map tensor values within the open interval value_range to {0,.., nbins-1},
        # values outside the open interval will be zero or less, or nbins or more.
        indices = math_ops.floor(nbins * scaled_values, name='indices')

        # Clip edge cases (e.g. value = value_range[1]) or "outliers."
        indices = math_ops.cast(clip_ops.clip_by_value(indices, 0, nbins - 1),
                                dtypes.int32)

        # Dummy vector to scatter.
        # TODO(langmore) Replace non-ideal creation of large dummy vector once an
        # alternative to scatter is available.
        updates = array_ops.ones([indices.get_shape()[0]], dtype=dtype)
        return state_ops.scatter_add(hist,
                                     indices,
                                     updates,
                                     use_locking=use_locking,
                                     name=scope)
Example #32
    def _mini_batch_training_op(self, inputs, cluster_idx_list,
                                cluster_centers, total_counts):
        """Creates an op for training for mini batch case.

    Args:
      inputs: list of input Tensors.
      cluster_idx_list: A vector (or list of vectors). Each element in the
        vector corresponds to an input row in 'inp' and specifies the cluster id
        corresponding to the input.
      cluster_centers: Tensor Ref of cluster centers.
      total_counts: Tensor Ref of cluster counts.

    Returns:
      An op for doing an update of mini-batch k-means.
    """
        update_ops = []
        for inp, cluster_idx in zip(inputs, cluster_idx_list):
            with ops.colocate_with(inp):
                assert total_counts is not None
                cluster_idx = array_ops.reshape(cluster_idx, [-1])
                # Dedupe the unique ids of cluster_centers being updated so that updates
                # can be locally aggregated.
                unique_ids, unique_idx = array_ops.unique(cluster_idx)
                num_unique_cluster_idx = array_ops.size(unique_ids)
                # Fetch the old values of counts and cluster_centers.
                with ops.colocate_with(total_counts, ignore_existing=True):
                    old_counts = array_ops.gather(total_counts, unique_ids)
                # TODO(agarwal): This colocation seems to run into problems. Fix it.
                with ops.colocate_with(cluster_centers, ignore_existing=True):
                    old_cluster_centers = array_ops.gather(
                        cluster_centers, unique_ids)
                # Locally aggregate the increment to counts.
                count_updates = math_ops.unsorted_segment_sum(
                    array_ops.ones_like(unique_idx, dtype=total_counts.dtype),
                    unique_idx, num_unique_cluster_idx)
                # Locally compute the sum of inputs mapped to each id.
                # For a cluster with old cluster value x, old count n, and with data
                # d_1,...d_k newly assigned to it, we recompute the new value as
                # x += (sum_i(d_i) - k * x) / (n + k).
                # Compute sum_i(d_i), see comment above.
                cluster_center_updates = math_ops.unsorted_segment_sum(
                    inp, unique_idx, num_unique_cluster_idx)
                # Shape to enable broadcasting count_updates and learning_rate to inp.
                # It extends the shape with 1's to match the rank of inp.
                broadcast_shape = array_ops.concat([
                    array_ops.reshape(num_unique_cluster_idx, [1]),
                    array_ops.ones(array_ops.reshape(
                        array_ops.rank(inp) - 1, [1]),
                                   dtype=dtypes.int32)
                ], 0)
                # Subtract k * x, see comment above.
                cluster_center_updates -= math_ops.cast(
                    array_ops.reshape(count_updates, broadcast_shape),
                    inp.dtype) * old_cluster_centers
                learning_rate = math_ops.reciprocal(
                    math_ops.cast(old_counts + count_updates, inp.dtype))
                learning_rate = array_ops.reshape(learning_rate,
                                                  broadcast_shape)
                # scale by 1 / (n + k), see comment above.
                cluster_center_updates *= learning_rate
                # Apply the updates.
            update_counts = state_ops.scatter_add(total_counts, unique_ids,
                                                  count_updates)
            update_cluster_centers = state_ops.scatter_add(
                cluster_centers, unique_ids, cluster_center_updates)
            update_ops.extend([update_counts, update_cluster_centers])
        return control_flow_ops.group(*update_ops)
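The incremental center update in the comments above, x += (sum_i(d_i) - k * x) / (n + k), is just the running mean rewritten so it can be applied with scatter_add. A quick NumPy check with made-up numbers:

import numpy as np

x, n = 2.0, 3                  # old cluster center and old count
d = np.array([4.0, 6.0])       # newly assigned points
k = len(d)

x_incremental = x + (d.sum() - k * x) / (n + k)
x_recomputed = (n * x + d.sum()) / (n + k)
print(x_incremental, x_recomputed)   # both 3.2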
Example #33
    def minimize(self, global_step=None, name=None):
        """Add operations to train a linear model by minimizing the loss function.

    Args:
      global_step: Optional `Variable` to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.

    Returns:
      An Operation that updates the variables passed in the constructor.
    """
        # Technically, the op depends on a lot more than the variables,
        # but we'll keep the list short.
        with name_scope(name, 'sdca/minimize'):
            sparse_example_indices = []
            sparse_feature_indices = []
            sparse_features_values = []
            for sf in self._examples['sparse_features']:
                sparse_example_indices.append(sf.example_indices)
                sparse_feature_indices.append(sf.feature_indices)
                # If feature values are missing, sdca assumes a value of 1.0f.
                if sf.feature_values is not None:
                    sparse_features_values.append(sf.feature_values)

            example_ids_hashed = sdca_fprint(
                convert_to_tensor(self._examples['example_ids']))
            example_state_data = self._hashtable.lookup(example_ids_hashed)
            # Solver returns example_state_update, new delta sparse_feature_weights
            # and delta dense_feature_weights.

            weights_tensor = self._convert_n_to_tensor(
                self._slots['unshrinked_sparse_features_weights'])
            sparse_weights = []
            sparse_indices = []
            for w, i in zip(weights_tensor, sparse_feature_indices):
                # Find the feature ids to lookup in the variables.
                with ops.device(w.device):
                    sparse_indices.append(
                        math_ops.cast(
                            array_ops.unique(math_ops.cast(i,
                                                           dtypes.int32))[0],
                            dtypes.int64))
                    sparse_weights.append(
                        array_ops.gather(w, sparse_indices[-1]))

            esu, sfw, dfw = sdca_optimizer(
                sparse_example_indices,
                sparse_feature_indices,
                sparse_features_values,
                self._convert_n_to_tensor(self._examples['dense_features']),
                convert_to_tensor(self._examples['example_weights']),
                convert_to_tensor(self._examples['example_labels']),
                sparse_indices,
                sparse_weights,
                self._convert_n_to_tensor(
                    self._slots['unshrinked_dense_features_weights']),
                example_state_data,
                loss_type=self._options['loss_type'],
                l1=self._options['symmetric_l1_regularization'],
                l2=self._symmetric_l2_regularization(),
                num_loss_partitions=self._num_loss_partitions(),
                num_inner_iterations=1)

            with ops.control_dependencies([esu]):
                update_ops = [self._hashtable.insert(example_ids_hashed, esu)]
                # Update the weights before the proximal step.
                for w, i, u in zip(
                        self._slots['unshrinked_sparse_features_weights'],
                        sparse_indices, sfw):
                    update_ops.append(state_ops.scatter_add(w, i, u))
                for w, u in zip(
                        self._slots['unshrinked_dense_features_weights'], dfw):
                    update_ops.append(w.assign_add(u))

                with ops.control_dependencies(update_ops):
                    update_ops = []
                    # Copy over unshrinked weights to user provided variables.
                    for i, name in enumerate(
                        ['sparse_features_weights', 'dense_features_weights']):
                        for var, slot_var in zip(
                                self._variables[name],
                                self._slots['unshrinked_' + name]):
                            update_ops.append(var.assign(slot_var))

                    update_group = control_flow_ops.group(*update_ops)

                    # Apply proximal step.
                    with ops.control_dependencies([update_group]):
                        shrink_ops = []
                        for name in [
                                'sparse_features_weights',
                                'dense_features_weights'
                        ]:
                            for var in self._variables[name]:
                                with ops.device(var.device):
                                    shrink_ops.append(
                                        sdca_shrink_l1(
                                            self._convert_n_to_tensor(
                                                [var], as_ref=True),
                                            l1=self._symmetric_l1_regularization(),
                                            l2=self._symmetric_l2_regularization()))
                        shrink_l1 = control_flow_ops.group(*shrink_ops)
            if not global_step:
                return shrink_l1
            with ops.control_dependencies([shrink_l1]):
                return state_ops.assign_add(global_step, 1, name=name).op
 def testScatterAddStateOps(self):
   with context.eager_mode():
     v = resource_variable_ops.ResourceVariable([1.0, 2.0], name="add")
     state_ops.scatter_add(v, [1], [3])
     self.assertAllEqual([1.0, 5.0], v.numpy())
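For readers trying this outside the TensorFlow source tree, a public counterpart of `state_ops.scatter_add` is `tf.Variable.scatter_add` (or `tf.compat.v1.scatter_add`). A minimal eager-mode sketch, which also shows that duplicate indices accumulate:

```python
import tensorflow as tf

v = tf.Variable([1.0, 2.0, 3.0])
delta = tf.IndexedSlices(values=tf.constant([3.0, 4.0, 0.5]),
                         indices=tf.constant([1, 1, 2]))
v.scatter_add(delta)   # duplicate index 1 receives 3.0 + 4.0
print(v.numpy())       # v is now [1.0, 9.0, 3.5]
```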
Exemple #35
0
def histogram_fixed_width(values, value_range, nbins=100, use_locking=True,
                          dtype=dtypes.int32, name=None):
    """Return histogram of values.

  Given the tensor `values`, this operation returns a rank 1 histogram counting
  the number of entries in `values` that fell into every bin.  The bins are
  equal width and determined by the arguments `value_range` and `nbins`.

  Args:
    values:  Numeric `Tensor`.
    value_range:  Shape [2] `Tensor`.  values <= value_range[0] will be
      mapped to hist[0], values >= value_range[1] will be mapped to hist[-1].
      Must be the same dtype as `values`.
    nbins:  Integer number of bins in this histogram.
    use_locking:  Boolean.
      If `True`, use locking during the operation (optional).
    dtype:  dtype for returned histogram.
    name:  A name for this operation (defaults to 'histogram_fixed_width').

  Returns:
    A `Variable` holding histogram of values.

  Examples:
  ```python
  # Bins will be:  (-inf, 1), [1, 2), [2, 3), [3, 4), [4, inf)
  nbins = 5
  value_range = [0.0, 5.0]
  new_values = [-1.0, 0.0, 1.5, 2.0, 5.0, 15]

  with tf.Session() as sess:
    hist = tf.histogram_fixed_width(new_values, value_range, nbins=5)
    tf.initialize_all_variables().run()
    sess.run(hist) => [2, 1, 1, 0, 2]
  ```
  """
    with variable_scope.variable_op_scope([values, value_range], name, "histogram_fixed_width") as scope:
        values = ops.convert_to_tensor(values, name="values")
        values = array_ops.reshape(values, [-1])
        value_range = ops.convert_to_tensor(value_range, name="value_range")

        # Map tensor values that fall within value_range to [0, 1].
        scaled_values = math_ops.truediv(values - value_range[0], value_range[1] - value_range[0], name="scaled_values")

        # map tensor values within the open interval value_range to {0,.., nbins-1},
        # values outside the open interval will be zero or less, or nbins or more.
        indices = math_ops.floor(nbins * scaled_values, name="indices")

        # Clip edge cases (e.g. value = value_range[1]) or "outliers."
        indices = math_ops.cast(clip_ops.clip_by_value(indices, 0, nbins - 1), dtypes.int32)

        # Dummy vector to scatter.
        # TODO(langmore) Replace non-ideal creation of large dummy vector once an
        # alternative to scatter is available.
        updates = array_ops.ones_like(indices, dtype=dtype)

        hist = variable_scope.get_variable(
            "hist", initializer=array_ops.zeros_initializer([nbins], dtype=dtype), trainable=False
        )
        hist_assign_zero = hist.assign(array_ops.zeros_like(hist))

        with ops.control_dependencies([hist_assign_zero]):
            return state_ops.scatter_add(hist, indices, updates, use_locking=use_locking, name=scope.name)
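To sanity-check the binning arithmetic above outside TensorFlow, the following NumPy sketch (a hypothetical helper, not part of any library) reproduces the scale → floor → clip → scatter-add pipeline and matches the docstring example:

```python
import numpy as np

def histogram_fixed_width_np(values, value_range, nbins=5):
    """NumPy re-implementation of the binning logic used above."""
    values = np.asarray(values, dtype=np.float64).reshape(-1)
    lo, hi = value_range
    # Map values into [0, 1], then into {0, ..., nbins - 1}.
    scaled = (values - lo) / (hi - lo)
    indices = np.clip(np.floor(nbins * scaled), 0, nbins - 1).astype(np.int64)
    hist = np.zeros(nbins, dtype=np.int64)
    np.add.at(hist, indices, 1)  # the scatter_add equivalent
    return hist

print(histogram_fixed_width_np([-1.0, 0.0, 1.5, 2.0, 5.0, 15.0], [0.0, 5.0]))
# -> [2 1 1 0 2], matching the docstring example.
```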
Exemple #36
0
  def minimize(self, global_step=None, name=None):
    """Add operations to train a linear model by minimizing the loss function.

    Args:
      global_step: Optional `Variable` to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.

    Returns:
      An Operation that updates the variables passed in the constructor.
    """
    # Technically, the op depends on a lot more than the variables,
    # but we'll keep the list short.
    with name_scope(name, 'sdca/minimize'):
      sparse_example_indices = []
      sparse_feature_indices = []
      sparse_features_values = []
      for sf in self._examples['sparse_features']:
        sparse_example_indices.append(sf.example_indices)
        sparse_feature_indices.append(sf.feature_indices)
        # If feature values are missing, sdca assumes a value of 1.0f.
        if sf.feature_values is not None:
          sparse_features_values.append(sf.feature_values)

      example_ids_hashed = sdca_fprint(
          convert_to_tensor(self._examples['example_ids']))
      example_state_data = self._hashtable.lookup(example_ids_hashed)
      # Solver returns example_state_update, new delta sparse_feature_weights
      # and delta dense_feature_weights.

      weights_tensor = self._convert_n_to_tensor(self._slots[
          'unshrinked_sparse_features_weights'])
      sparse_weights = []
      sparse_indices = []
      for w, i in zip(weights_tensor, sparse_feature_indices):
        # Find the feature ids to lookup in the variables.
        with ops.device(w.device):
          sparse_indices.append(
              math_ops.cast(
                  array_ops.unique(math_ops.cast(i, dtypes.int32))[0],
                  dtypes.int64))
          sparse_weights.append(array_ops.gather(w, sparse_indices[-1]))

      esu, sfw, dfw = sdca_optimizer(
          sparse_example_indices,
          sparse_feature_indices,
          sparse_features_values,
          self._convert_n_to_tensor(self._examples['dense_features']),
          convert_to_tensor(self._examples['example_weights']),
          convert_to_tensor(self._examples['example_labels']),
          sparse_indices,
          sparse_weights,
          self._convert_n_to_tensor(self._slots[
              'unshrinked_dense_features_weights']),
          example_state_data,
          loss_type=self._options['loss_type'],
          l1=self._options['symmetric_l1_regularization'],
          l2=self._symmetric_l2_regularization(),
          num_loss_partitions=self._num_loss_partitions(),
          num_inner_iterations=1)

      with ops.control_dependencies([esu]):
        update_ops = [self._hashtable.insert(example_ids_hashed, esu)]
        # Update the weights before the proximal step.
        for w, i, u in zip(self._slots['unshrinked_sparse_features_weights'],
                           sparse_indices, sfw):
          update_ops.append(state_ops.scatter_add(w, i, u))
        for w, u in zip(self._slots['unshrinked_dense_features_weights'], dfw):
          update_ops.append(w.assign_add(u))

      if not global_step:
        return control_flow_ops.group(*update_ops)
      with ops.control_dependencies(update_ops):
        return state_ops.assign_add(global_step, 1, name=name).op
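Both `minimize` variants rely on the same dedupe-gather-scatter pattern for the sparse weights: deduplicate the feature ids, gather only the touched weight rows for the solver, and scatter-add the returned deltas back. A standalone NumPy sketch of that pattern (illustrative values only):

```python
import numpy as np

feature_indices = np.array([4, 1, 4, 7, 1])
weights = np.arange(10, dtype=np.float64) * 0.1   # full weight vector

unique_ids = np.unique(feature_indices)           # [1, 4, 7]
gathered = weights[unique_ids]                    # rows handed to the solver

# After the solver returns per-row deltas, they are scattered back in place:
deltas = np.array([0.5, -0.2, 0.1])
np.add.at(weights, unique_ids, deltas)            # state_ops.scatter_add analogue
```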
 def _apply_sparse(self, grad,
                   var):  # sparse grad (only for the Shakespeare model)
     return self._apply_sparse_shared(
         grad.values, var, grad.indices,
         lambda x, i, v: state_ops.scatter_add(x, i, v))
Exemple #38
0
class GNOptimizer(optimizer.GradientDescentOptimizer):
  def __init__(self, loss_tensor, pred_tensor, learning_rate=0.001, beta1=0.9, 
  	           beta2=0.999, epsilon=1e-8, use_locking=False, name="GN"):
    self._beta1 = beta1
    self._beta2 = beta2
    self._epsilon = epsilon

    # Tensor versions of the constructor arguments, created in _prepare().
    self._lr_t = None
    self._beta1_t = None
    self._beta2_t = None
    self._epsilon_t = None

    # Tensors needed to calculate the update step
    self._loss_t = loss_tensor
    self._pred_t = pred_tensor


    # Variables to accumulate the powers of the beta parameters.
    # Created in _create_slots when we know the variables to optimize.
    self._beta1_power = None
    self._beta2_power = None

    # Created in SparseApply if needed.
    self._updated_lr = None
    super(GNOptimizer, self).__init__(learning_rate, use_locking, name)


  def _apply_sparse(self, grad, var):
    return self._apply_dense(grad, var)
  def _apply_dense(self, grad, var):
		"""Construct a new GN optimizer.

    Initialization:

    ```
    m_0 <- 0 (Initialize initial 1st moment vector)
    gn_0 <- 0 (Initialize initial 2nd moment vector)
    t <- 0 (Initialize timestep)
    ```

    This update rule looks very similar to Adam's, but it uses the 2nd
    derivative of the loss and does not take the square root of the
    second-moment estimate.

    ```
    t <- t + 1
    lr_t <- learning_rate * (1 - beta2^t) / (1 - beta1^t)
		
    dLdy <- tf.gradients(loss, y_hat)
    (d/dy) * dLdy <- tf.gradients(dLdy, y_hat)
    dydvar <- tf.gradients(y_hat, var)

    gn_curr <- ((d/dy) * dLdy) * dydvar ** 2


    m_t <- beta1 * m_{t-1} + (1 - beta1) * g
    gn_t <- beta2 * v_{t-1} + (1 - beta2) * gn_curr
    variable <- variable - lr_t * m_t / (gn_t + epsilon)
    ```

    The default value of 1e-8 for epsilon might not be a good default in
    general. For example, when training an Inception network on ImageNet a
    current good choice is 1.0 or 0.1.

    Note that in the dense implementation of this algorithm, m_t, gn_t and the
    variable are updated even if g is zero, but in the sparse implementation,
    m_t, gn_t and the variable are not updated in iterations where g is zero.

    Constructor arguments (for reference):
      learning_rate: A Tensor or a floating point value.  The learning rate.
      beta1: A float value or a constant float tensor.
        The exponential decay rate for the 1st moment estimates.
      beta2: A float value or a constant float tensor.
        The exponential decay rate for the 2nd moment estimates.
      epsilon: A small constant for numerical stability.
      use_locking: If True use locks for update operations.
      name: Optional name for the operations created when applying gradients.
        Defaults to "GN".
    """

    beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._learning_rate, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    lr = (lr_t * (1 - beta2_power) / (1 - beta1_power))

    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_scaled_g_values = grad.values * (1 - beta1_t)
    m_t = state_ops.assign(m, m * beta1_t,
                           use_locking=self._use_locking)
    m_t = state_ops.scatter_add(m_t, grad.indices, m_scaled_g_values,
                               use_locking=self._use_locking)
    # gn_c = ((d/dy) * dLdy) * dydvar ** 2
    # gn_t = beta2 * gn + (1 - beta2) * (gn_c)
    dLdy = tf.gradients(self._loss_t, self._pred_t)
    sec_loss = tf.gradients(dLdy, self._pred_t)[0]
    dydvar = tf.gradients(self._pred_t, var)[0]
    sec_loss_t = math_ops.cast(sec_loss, var.dtype.base_dtype)
    dydvar_t = math_ops.cast(dydvar, var.dtype.base_dtype)
    gn_c = sec_loss_t * dydvar_t * dydvar_t
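To make the update rule spelled out in the docstring above concrete, here is a scalar sketch of one GN step (an illustrative helper, not part of the optimizer class):

```python
def gn_step(var, m, gn, g, d2l_dy2, dy_dvar, t,
            lr=0.001, beta1=0.9, beta2=0.999, eps=1e-8):
    """One GN step for scalars, following the docstring's update rule."""
    gn_curr = d2l_dy2 * dy_dvar ** 2              # gn_c = ((d/dy) dLdy) * dydvar**2
    m = beta1 * m + (1 - beta1) * g               # 1st-moment estimate
    gn = beta2 * gn + (1 - beta2) * gn_curr       # 2nd-derivative estimate
    lr_t = lr * (1 - beta2 ** t) / (1 - beta1 ** t)
    var = var - lr_t * m / (gn + eps)             # no square root, unlike Adam
    return var, m, gn
```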
Exemple #39
0
 def _apply_sparse(self, grad, var):
     return self._apply_sparse_shared(
         grad.values,
         var,
         grad.indices,
         lambda x, i, v: state_ops.scatter_add(x, i, v, use_locking=self._use_locking))
Exemple #40
0
    m_t = state_ops.assign(m, m * beta1_t,
                           use_locking=self._use_locking)
    m_t = state_ops.scatter_add(m_t, grad.indices, m_scaled_g_values,
                               use_locking=self._use_locking)
    # gn_c = ((d/dy) * dLdy) * dydvar ** 2
    # gn_t = beta2 * gn + (1 - beta2) * (gn_c)
    dLdy = tf.gradients(self._loss_t, self._pred_t)
    sec_loss = tf.gradients(dLdy, self._pred_t)[0]
    dydvar = tf.gradients(self._pred_t, var)[0]
    sec_loss_t = math_ops.cast(sec_loss, var.dtype.base_dtype)
    dydvar_t = math_ops.cast(dydvar, var.dtype.base_dtype)
    gn_c = sec_loss_t * dydvar_t * dydvar_t
    gn = self.get_slot(var, "gn")
    gn_scaled_g_values = (gn_c) * (1 - beta2_t)
    gn_t = state_ops.assign(gn, gn * beta2_t, use_locking=self._use_locking)
    gn_t = state_ops.scatter_add(gn_t, grad.indices, gn_scaled_g_values,
                               use_locking=self._use_locking)
    var_update = state_ops.assign_sub(var,
                                      lr * m_t / (gn_t + epsilon_t),
                                      use_locking=self._use_locking)
    return control_flow_ops.group(*[var_update, m_t, gn_t])


    def minimize(self, global_step=None, name=None):
        """Add operations to train a linear model by minimizing the loss function.

    Args:
      global_step: Optional `Variable` to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.

    Returns:
      An Operation that updates the variables passed in the constructor.
    """
        # Technically, the op depends on a lot more than the variables,
        # but we'll keep the list short.
        with name_scope(name, 'sdca/minimize'):
            sparse_example_indices = []
            sparse_feature_indices = []
            sparse_features_values = []
            for sf in self._examples['sparse_features']:
                sparse_example_indices.append(sf.example_indices)
                sparse_feature_indices.append(sf.feature_indices)
                # If feature values are missing, sdca assumes a value of 1.0f.
                if sf.feature_values is not None:
                    sparse_features_values.append(sf.feature_values)

            # pylint: disable=protected-access
            example_ids_hashed = gen_sdca_ops.sdca_fprint(
                internal_convert_to_tensor(self._examples['example_ids']))
            # pylint: enable=protected-access
            example_state_data = self._hashtable.lookup(example_ids_hashed)
            # Solver returns example_state_update, new delta sparse_feature_weights
            # and delta dense_feature_weights.

            sparse_weights = []
            sparse_indices = []
            # If we have partitioned variables, keep a few dictionaries of Tensors
            # around that we need for the assign_add after the op call to
            # gen_sdca_ops.sdca_optimizer().  These are keyed because we may have a
            # mix of partitioned and un-partitioned variables.
            num_partitions_by_var = {}
            p_assignments_by_var = {}
            gather_ids_by_var = {}
            for v_num, (w, i) in enumerate(
                    zip(self._slots['unshrinked_sparse_features_weights'],
                        sparse_feature_indices)):
                # Append the sparse_indices (in full-variable space).
                sparse_idx = math_ops.cast(
                    array_ops.unique(math_ops.cast(i, dtypes.int32))[0],
                    dtypes.int64)
                sparse_indices.append(sparse_idx)
                if isinstance(w, list) or isinstance(
                        w, var_ops.PartitionedVariable):
                    num_partitions = len(w)
                    flat_ids = array_ops.reshape(sparse_idx, [-1])
                    # We use div partitioning, which is easiest to support downstream.
                    # Compute num_total_ids as the sum of dim-0 of w, then assign
                    # to partitions based on a constant number of ids per partition.
                    # Optimize if we already know the full shape statically.
                    dim_0_size = self._get_first_dimension_size_statically(
                        w, num_partitions)

                    if tensor_shape.dimension_value(dim_0_size):
                        num_total_ids = constant_op.constant(
                            tensor_shape.dimension_value(dim_0_size),
                            flat_ids.dtype)
                    else:
                        dim_0_sizes = []
                        for p in range(num_partitions):
                            if tensor_shape.dimension_value(
                                    w[p].shape[0]) is not None:
                                dim_0_sizes.append(
                                    tensor_shape.dimension_value(
                                        w[p].shape[0]))
                            else:
                                with ops.colocate_with(w[p]):
                                    dim_0_sizes.append(
                                        array_ops.shape(w[p])[0])
                        num_total_ids = math_ops.reduce_sum(
                            math_ops.cast(array_ops.stack(dim_0_sizes),
                                          flat_ids.dtype))
                    ids_per_partition = num_total_ids // num_partitions
                    extras = num_total_ids % num_partitions

                    p_assignments = math_ops.maximum(
                        flat_ids // (ids_per_partition + 1),
                        (flat_ids - extras) // ids_per_partition)

                    # Emulate a conditional using a boolean indicator tensor
                    new_ids = array_ops.where(
                        p_assignments < extras,
                        flat_ids % (ids_per_partition + 1),
                        (flat_ids - extras) % ids_per_partition)

                    # Cast partition assignments to int32 for use in dynamic_partition.
                    # There really should not be more than 2^32 partitions.
                    p_assignments = math_ops.cast(p_assignments, dtypes.int32)
                    # Partition list of ids based on assignments into num_partitions
                    # separate lists.
                    gather_ids = data_flow_ops.dynamic_partition(
                        new_ids, p_assignments, num_partitions)
                    # Add these into the dictionaries for use in the later update.
                    num_partitions_by_var[v_num] = num_partitions
                    p_assignments_by_var[v_num] = p_assignments
                    gather_ids_by_var[v_num] = gather_ids

                    # Gather the weights from each partition.
                    partition_gathered_weights = []
                    for p in range(num_partitions):
                        with ops.colocate_with(w[p]):
                            partition_gathered_weights.append(
                                array_ops.gather(w[p], gather_ids[p]))

                    # Stitch the weights back together in the same order they were before
                    # we dynamic_partitioned them.
                    condition_indices = data_flow_ops.dynamic_partition(
                        math_ops.range(array_ops.shape(new_ids)[0]),
                        p_assignments, num_partitions)
                    batch_gathered_weights = data_flow_ops.dynamic_stitch(
                        condition_indices, partition_gathered_weights)
                else:
                    w_as_tensor = internal_convert_to_tensor(w)
                    with ops.device(w_as_tensor.device):
                        batch_gathered_weights = array_ops.gather(
                            w_as_tensor, sparse_idx)
                sparse_weights.append(batch_gathered_weights)

            # pylint: disable=protected-access
            if compat.forward_compatible(year=2018, month=10, day=30):
                esu, sfw, dfw = gen_sdca_ops.sdca_optimizer_v2(
                    sparse_example_indices,
                    sparse_feature_indices,
                    sparse_features_values,
                    self._convert_n_to_tensor(
                        self._examples['dense_features']),
                    internal_convert_to_tensor(
                        self._examples['example_weights']),
                    internal_convert_to_tensor(
                        self._examples['example_labels']),
                    sparse_indices,
                    sparse_weights,
                    self._convert_n_to_tensor(
                        self._slots['unshrinked_dense_features_weights']),
                    example_state_data,
                    loss_type=self._options['loss_type'],
                    l1=self._options['symmetric_l1_regularization'],
                    l2=self._symmetric_l2_regularization(),
                    num_loss_partitions=self._num_loss_partitions(),
                    num_inner_iterations=1,
                    adaptive=self._adaptive())
            else:
                esu, sfw, dfw = gen_sdca_ops.sdca_optimizer(
                    sparse_example_indices,
                    sparse_feature_indices,
                    sparse_features_values,
                    self._convert_n_to_tensor(
                        self._examples['dense_features']),
                    internal_convert_to_tensor(
                        self._examples['example_weights']),
                    internal_convert_to_tensor(
                        self._examples['example_labels']),
                    sparse_indices,
                    sparse_weights,
                    self._convert_n_to_tensor(
                        self._slots['unshrinked_dense_features_weights']),
                    example_state_data,
                    loss_type=self._options['loss_type'],
                    l1=self._options['symmetric_l1_regularization'],
                    l2=self._symmetric_l2_regularization(),
                    num_loss_partitions=self._num_loss_partitions(),
                    num_inner_iterations=1,
                    adaptative=self._adaptive())
            # pylint: enable=protected-access

            with ops.control_dependencies([esu]):
                update_ops = [self._hashtable.insert(example_ids_hashed, esu)]
                # Update the weights before the proximal step.
                for v_num, (w, i, u) in enumerate(
                        zip(self._slots['unshrinked_sparse_features_weights'],
                            sparse_indices, sfw)):
                    if (isinstance(w, var_ops.PartitionedVariable)
                            or isinstance(w, list)):
                        update_ops += self._get_partitioned_update_ops(
                            v_num, num_partitions_by_var, p_assignments_by_var,
                            gather_ids_by_var, w, u, p_assignments,
                            num_partitions)
                    else:
                        update_ops.append(state_ops.scatter_add(w, i, u))
                for w, u in zip(
                        self._slots['unshrinked_dense_features_weights'], dfw):
                    if (isinstance(w, var_ops.PartitionedVariable)
                            or isinstance(w, list)):
                        split_updates = array_ops.split(
                            u,
                            num_or_size_splits=[
                                v.shape.as_list()[0] for v in w
                            ])
                        for v, split_update in zip(w, split_updates):
                            update_ops.append(
                                state_ops.assign_add(v, split_update))
                    else:
                        update_ops.append(state_ops.assign_add(w, u))
            if not global_step:
                return control_flow_ops.group(*update_ops)
            with ops.control_dependencies(update_ops):
                return state_ops.assign_add(global_step, 1, name=name).op
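The div-partitioning arithmetic used above maps each global feature id to a (partition, within-partition offset) pair. A small NumPy sketch of just that mapping (illustrative numbers):

```python
import numpy as np

def div_partition(flat_ids, num_total_ids, num_partitions):
    """Maps global ids to (partition assignment, within-partition id)."""
    ids_per_partition = num_total_ids // num_partitions
    extras = num_total_ids % num_partitions
    # The first `extras` partitions each hold one extra id.
    p_assignments = np.maximum(flat_ids // (ids_per_partition + 1),
                               (flat_ids - extras) // ids_per_partition)
    new_ids = np.where(p_assignments < extras,
                       flat_ids % (ids_per_partition + 1),
                       (flat_ids - extras) % ids_per_partition)
    return p_assignments, new_ids

print(div_partition(np.array([0, 3, 7, 9]), num_total_ids=10, num_partitions=3))
# -> partitions [0, 0, 2, 2] and offsets [0, 3, 0, 2]
```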
Exemple #42
0
 def _apply_sparse(self, grad, var):  # only used for the LSTM model
     return self._apply_sparse_shared(
         grad.values, var, grad.indices,
         lambda x, i, v: state_ops.scatter_add(x, i, v))
  def _mini_batch_training_op(self, inputs, cluster_idx_list, cluster_centers,
                              cluster_centers_var, total_counts):
    """Creates an op for training for mini batch case.

    Args:
      inputs: list of input Tensors.
      cluster_idx_list: A vector (or list of vectors). Each element in the
        vector corresponds to an input row in 'inp' and specifies the cluster id
        corresponding to the input.
      cluster_centers: Tensor of cluster centers, possibly normalized.
      cluster_centers_var: Tensor Ref of cluster centers.
      total_counts: Tensor Ref of cluster counts.

    Returns:
      An op for doing an update of mini-batch k-means.
    """
    update_ops = []
    for inp, cluster_idx in zip(inputs, cluster_idx_list):
      with ops.colocate_with(inp):
        assert total_counts is not None
        cluster_idx = array_ops.reshape(cluster_idx, [-1])
        # Dedupe the unique ids of cluster_centers being updated so that updates
        # can be locally aggregated.
        unique_ids, unique_idx = array_ops.unique(cluster_idx)
        num_unique_cluster_idx = array_ops.size(unique_ids)
        # Fetch the old values of counts and cluster_centers.
        with ops.colocate_with(total_counts):
          old_counts = array_ops.gather(total_counts, unique_ids)
        with ops.colocate_with(cluster_centers):
          old_cluster_centers = array_ops.gather(cluster_centers, unique_ids)
        # Locally aggregate the increment to counts.
        count_updates = math_ops.unsorted_segment_sum(
            array_ops.ones_like(
                unique_idx, dtype=total_counts.dtype),
            unique_idx,
            num_unique_cluster_idx)
        # Locally compute the sum of inputs mapped to each id.
        # For a cluster with old cluster value x, old count n, and with data
        # d_1,...d_k newly assigned to it, we recompute the new value as
        # x += (sum_i(d_i) - k * x) / (n + k).
        # Compute sum_i(d_i), see comment above.
        cluster_center_updates = math_ops.unsorted_segment_sum(
            inp, unique_idx, num_unique_cluster_idx)
        # Shape to enable broadcasting count_updates and learning_rate to inp.
        # It extends the shape with 1's to match the rank of inp.
        broadcast_shape = array_ops.concat(
            [
                array_ops.reshape(num_unique_cluster_idx, [1]), array_ops.ones(
                    array_ops.reshape(array_ops.rank(inp) - 1, [1]),
                    dtype=dtypes.int32)
            ],
            0)
        # Subtract k * x, see comment above.
        cluster_center_updates -= math_ops.cast(
            array_ops.reshape(count_updates, broadcast_shape),
            inp.dtype) * old_cluster_centers
        learning_rate = math_ops.reciprocal(
            math_ops.cast(old_counts + count_updates, inp.dtype))
        learning_rate = array_ops.reshape(learning_rate, broadcast_shape)
        # scale by 1 / (n + k), see comment above.
        cluster_center_updates *= learning_rate
        # Apply the updates.
      update_counts = state_ops.scatter_add(total_counts, unique_ids,
                                            count_updates)
      update_cluster_centers = state_ops.scatter_add(cluster_centers_var,
                                                     unique_ids,
                                                     cluster_center_updates)
      update_ops.extend([update_counts, update_cluster_centers])
    return control_flow_ops.group(*update_ops)
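The centre update encoded in the comments above, x += (sum_i(d_i) - k * x) / (n + k), keeps each centre equal to the running mean of every point ever assigned to it. A minimal NumPy sketch (illustrative helper name):

```python
import numpy as np

def mini_batch_center_update(center, old_count, assigned_points):
    """Applies x += (sum_i(d_i) - k * x) / (n + k) for a single cluster."""
    k = len(assigned_points)
    return center + (assigned_points.sum(axis=0) - k * center) / (old_count + k)

# Three old points with mean (1, 1), plus two newly assigned points:
c = mini_batch_center_update(np.array([1.0, 1.0]), 3,
                             np.array([[2.0, 0.0], [4.0, 2.0]]))
print(c)  # -> [1.8, 1.0], the mean of all five points
```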
Exemple #44
0
 def scatter_add(x, i, v):
     return state_ops.scatter_add(x,
                                  i,
                                  v,
                                  use_locking=self._use_locking)
Exemple #45
0
  def _apply_gradient(self, grad, var, indices=None):
    """The main function to update a variable.

    Args:
      grad: A Tensor containing gradient to apply.
      var: A Tensor containing the variable to update.
      indices: An array of integers, for sparse update.

    Returns:
      Updated variable var = var - learning_rate * preconditioner * grad

    If the gradient is dense, var and grad have the same shape.
    If the update is sparse, then the first dimension of the gradient and var
    may differ, others are all the same. In this case the indices array
    provides the set of indices of the variable which are to be updated with
    each row of the gradient.
    """
    global_step = self._global_step + 1

    # Update accumulated weighted average of gradients
    gbar = self.get_slot(var, "gbar")
    gbar_decay_t = GetParam(self._gbar_decay, global_step)
    gbar_weight_t = GetParam(self._gbar_weight, global_step)
    if indices is not None:
      # Note - the sparse update is not easily implemented, since the
      # algorithm needs all indices of gbar to be updated
      # if mat_gbar_decay != 1 or mat_gbar_decay != 0.
      # One way to make mat_gbar_decay = 1 is by rescaling.
      # If we want the update:
      #         G_{t+1} = a_{t+1} G_t + b_{t+1} w_t
      # define:
      #         r_{t+1} = a_{t+1} * r_t
      #         h_t = G_t / r_t
      # Then:
      #         h_{t+1} = h_t + (b_{t+1} / r_{t+1}) * w_t
      # So we get the mat_gbar_decay = 1 as desired.
      # We can implement this in a future version as needed.
      # However we still need gbar_decay = 0, otherwise all indices
      # of the variable will need to be updated.
      if self._gbar_decay != 0.0:
        tf_logging.warning("Not applying momentum for variable: %s" % var.name)
      gbar_updated = grad
    else:
      gbar_updated = self._weighted_average(gbar, self._gbar_decay,
                                            gbar_decay_t,
                                            gbar_weight_t * grad)

    # Update the preconditioners and compute the preconditioned gradient
    shape = var.get_shape()
    mat_g_list = []
    for i in range(len(shape)):
      mat_g_list.append(self.get_slot(var, "Gbar_" + str(i)))
    mat_gbar_decay_t = GetParam(self._mat_gbar_decay, global_step)
    mat_gbar_weight_t = GetParam(self._mat_gbar_weight, global_step)

    preconditioned_grad = gbar_updated
    v_rank = len(mat_g_list)
    neg_alpha = - GetParam(self._alpha, global_step) / v_rank
    svd_interval = GetParam(self._svd_interval, global_step)
    precond_update_interval = GetParam(self._precond_update_interval,
                                       global_step)
    for i, mat_g in enumerate(mat_g_list):
      # axes is the list of indices to reduce - everything but the current i.
      axes = list(range(i)) + list(range(i+1, v_rank))
      if shape[i] <= self._max_matrix_size:
        # If the tensor size is sufficiently small perform full Shampoo update
        # Note if precond_update_interval > 1 and mat_gbar_decay_t != 1, this
        # is not strictly correct. However we will use it for now, and
        # fix if needed. (G_1 = aG + bg ==> G_n = a^n G + (1+a+..+a^{n-1})bg)

        # pylint: disable=g-long-lambda,cell-var-from-loop
        mat_g_updated = control_flow_ops.cond(
            math_ops.mod(global_step, precond_update_interval) < 1,
            lambda: self._update_mat_g(
                mat_g, grad, axes, mat_gbar_decay_t,
                mat_gbar_weight_t * precond_update_interval, i),
            lambda: mat_g)

        if self._svd_interval == 1:
          mat_h = self._compute_power(var, mat_g_updated, shape[i], neg_alpha)
        else:
          mat_h = control_flow_ops.cond(
              math_ops.mod(global_step, svd_interval) < 1,
              lambda: self._compute_power(var, mat_g_updated, shape[i],
                                          neg_alpha, "H_" + str(i)),
              lambda: self.get_slot(var, "H_" + str(i)))

        # mat_h is a square matrix of size d_i x d_i
        # preconditioned_grad is a d_i x ... x d_n x d_0 x ... d_{i-1} tensor
        # After contraction with a d_i x d_i tensor
        # it becomes a d_{i+1} x ... x d_n x d_0 x ... d_i tensor
        # (the first dimension is contracted out, and the second dimension of
        # mat_h is appended).  After going through all the indices, it becomes
        # a d_0 x ... x d_n tensor again.
        preconditioned_grad = math_ops.tensordot(preconditioned_grad, mat_h,
                                                 axes=([0], [0]),
                                                 name="precond_" + str(i))
      else:
        # Tensor size is too large -- perform diagonal Shampoo update
        grad_outer = math_ops.reduce_sum(grad * grad, axis=axes)
        if i == 0 and indices is not None:
          assert self._mat_gbar_decay == 1.0
          mat_g_updated = state_ops.scatter_add(mat_g, indices,
                                                mat_gbar_weight_t * grad_outer)
          mat_h = math_ops.pow(
              array_ops.gather(mat_g_updated, indices) + self._epsilon,
              neg_alpha)
        else:
          mat_g_updated = self._weighted_average(mat_g,
                                                 self._mat_gbar_decay,
                                                 mat_gbar_decay_t,
                                                 mat_gbar_weight_t * grad_outer)
          mat_h = math_ops.pow(mat_g_updated + self._epsilon, neg_alpha)

        # Need to do the transpose to ensure that the tensor becomes
        # a d_{i+1} x ... x d_n x d_0 x ... d_i tensor as described above.
        preconditioned_grad = array_ops.transpose(
            preconditioned_grad, perm=list(range(1, v_rank)) + [0]) * mat_h

    # Update the variable based on the Shampoo update
    learning_rate_t = GetParam(self._learning_rate, global_step)
    if indices is not None:
      var_updated = state_ops.scatter_add(
          var, indices, -learning_rate_t * preconditioned_grad)
    else:
      var_updated = state_ops.assign_sub(var,
                                         learning_rate_t * preconditioned_grad)
    return var_updated
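For the oversized dimensions, the code above falls back to a diagonal preconditioner whose sparse path is built entirely from scatter-adds. Below is a simplified NumPy sketch for a 2-D variable with only its first dimension preconditioned, assuming `mat_gbar_decay == 1` as asserted above (illustrative only, not the library implementation):

```python
import numpy as np

def diagonal_shampoo_sparse_step(var, mat_g, grad_rows, indices,
                                 learning_rate=0.1, neg_alpha=-0.5, eps=1e-4):
    """One sparse diagonal-Shampoo step for a [num_rows, dim] variable.

    neg_alpha plays the role of -alpha / rank in the optimizer above.
    """
    # Row-wise squared-gradient statistics, accumulated with a scatter-add.
    grad_outer = np.sum(grad_rows * grad_rows, axis=1)
    np.add.at(mat_g, indices, grad_outer)
    # Per-row preconditioner (mat_g + eps) ** neg_alpha on the touched rows only.
    mat_h = (mat_g[indices] + eps) ** neg_alpha
    preconditioned = grad_rows * mat_h[:, None]
    # Sparse variable update, again a scatter-add.
    np.add.at(var, indices, -learning_rate * preconditioned)
    return var, mat_g
```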
Exemple #46
0
  def minimize(self, global_step=None, name=None):
    """Add operations to train a linear model by minimizing the loss function.

    Args:
      global_step: Optional `Variable` to increment by one after the
        variables have been updated.
      name: Optional name for the returned operation.

    Returns:
      An Operation that updates the variables passed in the constructor.
    """
    # Technically, the op depends on a lot more than the variables,
    # but we'll keep the list short.
    with name_scope(name, 'sdca/minimize'):
      sparse_example_indices = []
      sparse_feature_indices = []
      sparse_features_values = []
      for sf in self._examples['sparse_features']:
        sparse_example_indices.append(sf.example_indices)
        sparse_feature_indices.append(sf.feature_indices)
        # If feature values are missing, sdca assumes a value of 1.0f.
        if sf.feature_values is not None:
          sparse_features_values.append(sf.feature_values)

      # pylint: disable=protected-access
      example_ids_hashed = gen_sdca_ops.sdca_fprint(
          internal_convert_to_tensor(self._examples['example_ids']))
      # pylint: enable=protected-access
      example_state_data = self._hashtable.lookup(example_ids_hashed)
      # Solver returns example_state_update, new delta sparse_feature_weights
      # and delta dense_feature_weights.

      sparse_weights = []
      sparse_indices = []
      # If we have partitioned variables, keep a few dictionaries of Tensors
      # around that we need for the assign_add after the op call to
      # gen_sdca_ops.sdca_optimizer().  These are keyed because we may have a
      # mix of partitioned and un-partitioned variables.
      num_partitions_by_var = {}
      p_assignments_by_var = {}
      gather_ids_by_var = {}
      for v_num, (w, i) in enumerate(
          zip(self._slots['unshrinked_sparse_features_weights'],
              sparse_feature_indices)):
        # Append the sparse_indices (in full-variable space).
        sparse_idx = math_ops.cast(
            array_ops.unique(math_ops.cast(i, dtypes.int32))[0],
            dtypes.int64)
        sparse_indices.append(sparse_idx)
        if isinstance(w, list) or isinstance(w, var_ops.PartitionedVariable):
          num_partitions = len(w)
          flat_ids = array_ops.reshape(sparse_idx, [-1])
          # We use div partitioning, which is easiest to support downstream.
          # Compute num_total_ids as the sum of dim-0 of w, then assign
          # to partitions based on a constant number of ids per partition.
          # Optimize if we already know the full shape statically.
          dim_0_size = self._get_first_dimension_size_statically(
              w, num_partitions)

          if tensor_shape.dimension_value(dim_0_size):
            num_total_ids = constant_op.constant(
                tensor_shape.dimension_value(dim_0_size),
                flat_ids.dtype)
          else:
            dim_0_sizes = []
            for p in range(num_partitions):
              if tensor_shape.dimension_value(w[p].shape[0]) is not None:
                dim_0_sizes.append(tensor_shape.dimension_value(w[p].shape[0]))
              else:
                with ops.colocate_with(w[p]):
                  dim_0_sizes.append(array_ops.shape(w[p])[0])
            num_total_ids = math_ops.reduce_sum(
                math_ops.cast(array_ops.stack(dim_0_sizes), flat_ids.dtype))
          ids_per_partition = num_total_ids // num_partitions
          extras = num_total_ids % num_partitions

          p_assignments = math_ops.maximum(
              flat_ids // (ids_per_partition + 1),
              (flat_ids - extras) // ids_per_partition)

          # Emulate a conditional using a boolean indicator tensor
          new_ids = array_ops.where(p_assignments < extras,
                                    flat_ids % (ids_per_partition + 1),
                                    (flat_ids - extras) % ids_per_partition)

          # Cast partition assignments to int32 for use in dynamic_partition.
          # There really should not be more than 2^32 partitions.
          p_assignments = math_ops.cast(p_assignments, dtypes.int32)
          # Partition list of ids based on assignments into num_partitions
          # separate lists.
          gather_ids = data_flow_ops.dynamic_partition(new_ids,
                                                       p_assignments,
                                                       num_partitions)
          # Add these into the dictionaries for use in the later update.
          num_partitions_by_var[v_num] = num_partitions
          p_assignments_by_var[v_num] = p_assignments
          gather_ids_by_var[v_num] = gather_ids

          # Gather the weights from each partition.
          partition_gathered_weights = []
          for p in range(num_partitions):
            with ops.colocate_with(w[p]):
              partition_gathered_weights.append(
                  array_ops.gather(w[p], gather_ids[p]))

          # Stitch the weights back together in the same order they were before
          # we dynamic_partitioned them.
          condition_indices = data_flow_ops.dynamic_partition(
              math_ops.range(array_ops.shape(new_ids)[0]),
              p_assignments, num_partitions)
          batch_gathered_weights = data_flow_ops.dynamic_stitch(
              condition_indices, partition_gathered_weights)
        else:
          w_as_tensor = internal_convert_to_tensor(w)
          with ops.device(w_as_tensor.device):
            batch_gathered_weights = array_ops.gather(
                w_as_tensor, sparse_idx)
        sparse_weights.append(batch_gathered_weights)

      # pylint: disable=protected-access
      if compat.forward_compatible(year=2018, month=10, day=30):
        esu, sfw, dfw = gen_sdca_ops.sdca_optimizer_v2(
            sparse_example_indices,
            sparse_feature_indices,
            sparse_features_values,
            self._convert_n_to_tensor(self._examples['dense_features']),
            internal_convert_to_tensor(self._examples['example_weights']),
            internal_convert_to_tensor(self._examples['example_labels']),
            sparse_indices,
            sparse_weights,
            self._convert_n_to_tensor(self._slots[
                'unshrinked_dense_features_weights']),
            example_state_data,
            loss_type=self._options['loss_type'],
            l1=self._options['symmetric_l1_regularization'],
            l2=self._symmetric_l2_regularization(),
            num_loss_partitions=self._num_loss_partitions(),
            num_inner_iterations=1,
            adaptive=self._adaptive())
      else:
        esu, sfw, dfw = gen_sdca_ops.sdca_optimizer(
            sparse_example_indices,
            sparse_feature_indices,
            sparse_features_values,
            self._convert_n_to_tensor(self._examples['dense_features']),
            internal_convert_to_tensor(self._examples['example_weights']),
            internal_convert_to_tensor(self._examples['example_labels']),
            sparse_indices,
            sparse_weights,
            self._convert_n_to_tensor(self._slots[
                'unshrinked_dense_features_weights']),
            example_state_data,
            loss_type=self._options['loss_type'],
            l1=self._options['symmetric_l1_regularization'],
            l2=self._symmetric_l2_regularization(),
            num_loss_partitions=self._num_loss_partitions(),
            num_inner_iterations=1,
            adaptative=self._adaptive())
      # pylint: enable=protected-access

      with ops.control_dependencies([esu]):
        update_ops = [self._hashtable.insert(example_ids_hashed, esu)]
        # Update the weights before the proximal step.
        for v_num, (w, i, u) in enumerate(
            zip(self._slots['unshrinked_sparse_features_weights'],
                sparse_indices, sfw)):
          if (isinstance(w, var_ops.PartitionedVariable) or
              isinstance(w, list)):
            update_ops += self._get_partitioned_update_ops(
                v_num, num_partitions_by_var, p_assignments_by_var,
                gather_ids_by_var, w, u, p_assignments, num_partitions)
          else:
            update_ops.append(state_ops.scatter_add(w, i, u))
        for w, u in zip(self._slots['unshrinked_dense_features_weights'], dfw):
          if (isinstance(w, var_ops.PartitionedVariable) or
              isinstance(w, list)):
            split_updates = array_ops.split(
                u, num_or_size_splits=[v.shape.as_list()[0] for v in w])
            for v, split_update in zip(w, split_updates):
              update_ops.append(state_ops.assign_add(v, split_update))
          else:
            update_ops.append(state_ops.assign_add(w, u))
      if not global_step:
        return control_flow_ops.group(*update_ops)
      with ops.control_dependencies(update_ops):
        return state_ops.assign_add(global_step, 1, name=name).op