def testVarChange(self):
   with imperative_mode.ImperativeMode(self._target) as mode:
     x = variables.Variable(constant_op.constant(1.0))
     for i in range(10):
       with mode.new_step() as step:
         step.run(state_ops.assign_sub(x, 0.1))
         self.assertAllClose(array_ops.identity(x).value, 1.0 - (i + 1) * 0.1)
def assign_moving_average(variable, value, decay, name=None):
  """Compute the moving average of a variable.

  The moving average of 'variable' updated with 'value' is:
    variable * decay + value * (1 - decay)

  The returned Operation sets 'variable' to the newly computed moving average.

  The new value of 'variable' can be set with the 'AssignSub' op as:
     variable -= (1 - decay) * (variable - value)

  Args:
    variable: A Variable.
    value: A tensor with the same shape as 'variable'.
    decay: A float Tensor or float value.  The moving average decay.
    name: Optional name of the returned operation.

  Returns:
    An Operation that updates 'variable' with the newly computed
    moving average.
  """
  with ops.op_scope([variable, value, decay], name, "AssignMovingAvg") as scope:
    with ops.device(variable.device):
      decay = ops.convert_to_tensor(1.0 - decay, name="decay")
      if decay.dtype != variable.dtype.base_dtype:
        decay = math_ops.cast(decay, variable.dtype.base_dtype)
      return state_ops.assign_sub(variable, (variable - value) * decay,
                                  name=scope)
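The docstring above gives the moving-average update in two equivalent forms. A minimal NumPy sketch (purely illustrative, not TensorFlow) confirming that the `AssignSub` form `variable -= (1 - decay) * (variable - value)` equals `variable * decay + value * (1 - decay)`:

```
import numpy as np

variable = np.array([1.0, 2.0, 3.0])
value = np.array([0.5, 0.5, 0.5])
decay = 0.99

assign_sub_form = variable - (1.0 - decay) * (variable - value)
weighted_form = variable * decay + value * (1.0 - decay)
assert np.allclose(assign_sub_form, weighted_form)
```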
Example #3
    def _apply_dense(self, grad, var):
        beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
        beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
        beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

        lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))

        # m_t = beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, "m")
        m_scaled_g_values = grad * (1 - beta1_t)
        m_t = state_ops.assign(m, beta1_t * m + m_scaled_g_values, use_locking=self._use_locking)

        # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v = self.get_slot(var, "v")
        v_scaled_g_values = (grad * grad) * (1 - beta2_t)
        v_t = state_ops.assign(v, beta2_t * v + v_scaled_g_values, use_locking=self._use_locking)

        # amsgrad
        vhat = self.get_slot(var, "vhat")
        vhat_t = state_ops.assign(vhat, math_ops.maximum(v_t, vhat))
        v_sqrt = math_ops.sqrt(vhat_t)

        var_update = state_ops.assign_sub(var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
        return control_flow_ops.group(*[var_update, m_t, v_t, vhat_t])
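A hedged NumPy sketch of one dense AMSGrad step as implemented above (hyper-parameter values and shapes are illustrative, not taken from the snippet): the `vhat` slot keeps the element-wise running maximum of the second-moment estimate, and the step divides by `sqrt(vhat)` rather than `sqrt(v)`.

```
import numpy as np

beta1, beta2, eps, base_lr, t = 0.9, 0.999, 1e-8, 0.001, 1
var = np.array([0.5, -0.3])
grad = np.array([0.1, -0.2])
m = np.zeros_like(var)       # slot "m"
v = np.zeros_like(var)       # slot "v"
vhat = np.zeros_like(var)    # slot "vhat"

lr = base_lr * np.sqrt(1 - beta2**t) / (1 - beta1**t)
m = beta1 * m + (1 - beta1) * grad           # m_t
v = beta2 * v + (1 - beta2) * grad * grad    # v_t
vhat = np.maximum(v, vhat)                   # amsgrad max
var = var - lr * m / (np.sqrt(vhat) + eps)   # assign_sub
```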
Example #4
 def _apply_sparse_shared(self, grad, var, indices, scatter_add):
   beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
   beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
   lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
   beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
   beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
   epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
   lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
   # m_t = beta1 * m + (1 - beta1) * g_t
   m = self.get_slot(var, "m")
   m_scaled_g_values = grad * (1 - beta1_t)
   m_t = state_ops.assign(m, m * beta1_t,
                          use_locking=self._use_locking)
   with ops.control_dependencies([m_t]):
     m_t = scatter_add(m, indices, m_scaled_g_values)
   # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
   v = self.get_slot(var, "v")
   v_scaled_g_values = (grad * grad) * (1 - beta2_t)
   v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
   with ops.control_dependencies([v_t]):
     v_t = scatter_add(v, indices, v_scaled_g_values)
   v_sqrt = math_ops.sqrt(v_t)
   var_update = state_ops.assign_sub(var,
                                     lr * m_t / (v_sqrt + epsilon_t),
                                     use_locking=self._use_locking)
   return control_flow_ops.group(*[var_update, m_t, v_t])
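The sparse variant above first decays the whole slot with `assign`, then adds the scaled gradient only at the touched rows via `scatter_add`. A small NumPy sketch of that decay-then-scatter pattern for the first moment (shapes and values are illustrative):

```
import numpy as np

beta1 = 0.9
m = np.ones((4, 2))                   # slot "m"
indices = np.array([0, 2])            # rows present in the sparse gradient
grad_rows = np.array([[0.5, 0.5],
                      [1.0, 1.0]])

m *= beta1                                 # m_t = beta1 * m   (dense assign)
m[indices] += (1 - beta1) * grad_rows      # scatter_add of (1 - beta1) * g_t
```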
def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
  """Compute the moving average of a variable.

  The moving average of 'variable' updated with 'value' is:
    variable * decay + value * (1 - decay)

  The returned Operation sets 'variable' to the newly computed moving average.

  The new value of 'variable' can be set with the 'AssignSub' op as:
     variable -= (1 - decay) * (variable - value)

  Since variables that are initialized to a `0` value will be `0` biased,
  `zero_debias` optionally enables scaling by the mathematically correct
  debiasing factor of
    1 - decay ** num_updates
  See `ADAM: A Method for Stochastic Optimization` Section 3 for more details
  (https://arxiv.org/abs/1412.6980).

  The names of the debias shadow variables, by default, include both the scope
  they were created in and the scope of the variables they debias. They are
  also given a uniquifying suffix.

  E.g.:

  ```
    with tf.variable_scope('scope1'):
      with tf.variable_scope('scope2'):
        var = tf.get_variable('foo')
        tf.assign_moving_average(var, 0.0, 1.0)
        tf.assign_moving_average(var, 0.0, 0.9)

    # var.name: 'scope1/scope2/foo'
    # shadow var names: 'scope1/scope2/scope1/scope2/foo/biased'
    #                   'scope1/scope2/scope1/scope2/foo/biased_1'
  ```

  Args:
    variable: A Variable.
    value: A tensor with the same shape as 'variable'.
    decay: A float Tensor or float value.  The moving average decay.
    zero_debias: A python bool. If true, assume the variable is 0-initialized
      and unbias it, as in https://arxiv.org/abs/1412.6980. See docstring in
      `_zero_debias` for more details.
    name: Optional name of the returned operation.

  Returns:
    A reference to the input 'variable' tensor with the newly computed
    moving average.
  """
  with ops.name_scope(name, "AssignMovingAvg",
                      [variable, value, decay]) as scope:
    with ops.colocate_with(variable):
      decay = ops.convert_to_tensor(1.0 - decay, name="decay")
      if decay.dtype != variable.dtype.base_dtype:
        decay = math_ops.cast(decay, variable.dtype.base_dtype)
      if zero_debias:
        update_delta = _zero_debias(variable, value, decay)
      else:
        update_delta = (variable - value) * decay
      return state_ops.assign_sub(variable, update_delta, name=scope)
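A quick NumPy check of the debiasing factor quoted in the docstring: a zero-initialized EMA of a constant `c` equals `c * (1 - decay ** t)` after `t` updates, so dividing by `1 - decay ** t` recovers `c` (values are illustrative).

```
import numpy as np

decay, c, t = 0.9, 5.0, 10
ema = 0.0
for _ in range(t):
    ema = decay * ema + (1 - decay) * c

assert np.isclose(ema, c * (1 - decay ** t))
assert np.isclose(ema / (1 - decay ** t), c)
```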
 def _Update_global_variables():
   local_vars = [v for g, v in grads_and_vars if g is not None]
   global_center_vars = [self._global_map[var] for var in local_vars]
   local_center_vars = [self._local_map[var] for var in local_vars]
   local_center_vars_update = []
   for lvar, var in zip(local_center_vars, global_center_vars):
     local_center_vars_update.append(lvar.assign(var))
   update_ops = []
   differences = []
   with ops.control_dependencies(local_center_vars_update):
     for v, lv in zip(local_vars, local_center_vars):
       with ops.device(v.device):
         differences.append(math_ops.subtract(v, lv))
     for lvar, diff in zip(local_vars, differences):
       with ops.device(lvar.device):
         update_ops.append(
             state_ops.assign_sub(lvar,
                                  math_ops.multiply(self._moving_rate,
                                                    diff)))
     for var, diff in zip(global_center_vars, differences):
       with ops.device(var.device):
         update_ops.append(
             state_ops.assign_add(var,
                                  math_ops.multiply(self._moving_rate,
                                                    diff)))
     if global_step:
       with ops.colocate_with(global_step):
         update_ops.append(state_ops.assign_add(global_step, 1))
   variable_update = control_flow_ops.group(*(update_ops))
   return variable_update
  def testReadWrite(self):
    """Tests initialization, reading, and writing a resource variable."""
    for dtype in self.numeric_types:
      with self.test_session() as session:
        with self.test_scope():
          with variable_scope.variable_scope("ascope", use_resource=True):
            x = variable_scope.get_variable(
                "x",
                shape=[],
                dtype=dtype,
                initializer=init_ops.constant_initializer(2))
            a = x.read_value()
            with ops.control_dependencies([a]):
              b = state_ops.assign(x, dtype(47))
            with ops.control_dependencies([b]):
              c = x.read_value()
            with ops.control_dependencies([c]):
              d = state_ops.assign_add(x, np.array(6 + 2j).astype(dtype))
            with ops.control_dependencies([d]):
              e = state_ops.assign_sub(x, dtype(3))
            with ops.control_dependencies([e]):
              f = x.read_value()

        session.run(variables.global_variables_initializer())
        v1, v2, v3 = session.run([a, c, f])
        self.assertAllClose(dtype(2), v1)
        self.assertAllClose(dtype(47), v2)
        self.assertAllClose(np.array(50 + 2j).astype(dtype), v3)
Example #8
  def _apply_sparse(self, grad, var):
    beta1_power = math_ops.cast(self._beta1_power, var.dtype.base_dtype)
    beta2_power = math_ops.cast(self._beta2_power, var.dtype.base_dtype)
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    lr = (lr_t * math_ops.sqrt(1 - beta2_power) / (1 - beta1_power))
    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, "m")
    m_scaled_g_values = grad.values * (1 - beta1_t)
    m_t = state_ops.assign(m, m * beta1_t,
                           use_locking=self._use_locking)
    m_t = state_ops.scatter_add(m_t, grad.indices, m_scaled_g_values,
                               use_locking=self._use_locking)



    # u_t = max(beta_2 * u_{t-1}, |g_t|)
    # theta_t = theta_{t-1} - alpha / (1 - beta_1) * m_t / u_t

    v = self.get_slot(var, "v")
    g_abs_values = math_ops.abs(grad.values)
    v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
    v_t = state_ops.scatter_max(v_t, grad.indices, g_abs_values,
                                use_locking=self._use_locking)
    var_update = state_ops.assign_sub(var,
                                      lr * m_t / (v_t * (1 - beta1_t)),
                                      use_locking=self._use_locking)

    return control_flow_ops.group(*[var_update, m_t, v_t])
Example #9
  def _resource_apply_sparse(self, grad, var, indices):
    var_dtype = var.dtype.base_dtype
    lr_t = self._decayed_lr(var_dtype)
    beta_1_t = self._get_hyper('beta_1', var_dtype)
    beta_2_t = self._get_hyper('beta_2', var_dtype)
    local_step = math_ops.cast(self.iterations + 1, var_dtype)
    beta_1_power = math_ops.pow(beta_1_t, local_step)
    beta_2_power = math_ops.pow(beta_2_t, local_step)
    epsilon_t = self._get_hyper('epsilon', var_dtype)
    lr = (lr_t * math_ops.sqrt(1 - beta_2_power) / (1 - beta_1_power))

    # m_t = beta1 * m + (1 - beta1) * g_t
    m = self.get_slot(var, 'm')
    m_scaled_g_values = grad * (1 - beta_1_t)
    m_t = state_ops.assign(m, m * beta_1_t, use_locking=self._use_locking)
    with ops.control_dependencies([m_t]):
      m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)

    # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
    v = self.get_slot(var, 'v')
    v_scaled_g_values = (grad * grad) * (1 - beta_2_t)
    v_t = state_ops.assign(v, v * beta_2_t, use_locking=self._use_locking)
    with ops.control_dependencies([v_t]):
      v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)

    v_sqrt = math_ops.sqrt(v_t)
    var_update = state_ops.assign_sub(
        var, lr * m_t / (v_sqrt + epsilon_t), use_locking=self._use_locking)
    return control_flow_ops.group(*[var_update, m_t, v_t])
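The folded learning rate `lr = lr_t * sqrt(1 - beta_2^t) / (1 - beta_1^t)` used in these Adam-style snippets is algebraically equivalent to applying the usual bias corrections to `m` and `v`, provided the epsilon is read as the rescaled "epsilon hat". A NumPy check of that identity (values are illustrative):

```
import numpy as np

lr_t, b1, b2, eps, t = 0.001, 0.9, 0.999, 1e-8, 3
m, v = np.array([0.2, -0.1]), np.array([0.04, 0.01])

lr = lr_t * np.sqrt(1 - b2**t) / (1 - b1**t)
folded = lr * m / (np.sqrt(v) + eps)

m_hat, v_hat = m / (1 - b1**t), v / (1 - b2**t)
eps_hat = eps / np.sqrt(1 - b2**t)
unfolded = lr_t * m_hat / (np.sqrt(v_hat) + eps_hat)

assert np.allclose(folded, unfolded)
```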
def assign_moving_average(variable, value, decay, name=None):
    with ops.op_scope([variable, value, decay], name, "AssignMovingAvg") as name:
        with ops.device(variable.device):
            decay = ops.convert_to_tensor(1.0 - decay, name="decay")
            if decay.dtype != variable.dtype.base_dtype:
                decay = math_ops.cast(decay, variable.dtype.base_dtype)
            return state_ops.assign_sub(variable, (variable - value) * decay,
                                        name=name)
 def _initAssignSubFetch(self, x, y, use_gpu=False):
   """Initialize a param to init, and compute param -= y."""
   with self.test_session(use_gpu=use_gpu):
     p = variables.Variable(x)
     sub = state_ops.assign_sub(p, y)
     p.initializer.run()
     new_value = sub.eval()
     return p.eval(), new_value
 def _assign_moving_average(self, variable, value, momentum):
   with ops.name_scope(None, 'AssignMovingAvg',
                       [variable, value, momentum]) as scope:
     decay = ops.convert_to_tensor(1.0 - momentum, name='decay')
     if decay.dtype != variable.dtype.base_dtype:
       decay = math_ops.cast(decay, variable.dtype.base_dtype)
     update_delta = (variable - value) * decay
     return state_ops.assign_sub(variable, update_delta, name=scope)
 def update_fn(v, value, decay=decay):
   decay = ops.convert_to_tensor(1.0 - decay, name="decay")
   if decay.dtype != v.dtype.base_dtype:
     decay = math_ops.cast(decay, v.dtype.base_dtype)
   if zero_debias:
     update_delta = _zero_debias(v, value, decay)
   else:
     update_delta = (v - value) * decay
   return state_ops.assign_sub(v, update_delta, name=scope)
Example #14
    def _apply_dense(self, grad, var):
        lr = self._lr_t * math_ops.sqrt(1 - self._beta2_power) / (1 - self._beta1_power)
        # m_t = beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, "m")
        m_scaled_g_values = grad * (1 - self._beta1_t)
        m_t = m * self._beta1_t
        m_t = m_t + m_scaled_g_values
        # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v = self.get_slot(var, "v")
        v_scaled_g_values = tf.pow(grad, 2) * (1 - self._beta2_t)
        v_t = v * self._beta2_t
        v_t = v_t + v_scaled_g_values
        v_sqrt = tf.pow(v_t, self._pow_t)
        var_update = state_ops.assign_sub(var, lr * m_t / (v_sqrt + self._epsilon_t), use_locking=self._use_locking)
        # regularization
        var_update = state_ops.assign_sub(var_update, self._dense_regularization * var, use_locking=self._use_locking)

        return control_flow_ops.group(*[var_update, m_t, v_t])
  def update_fn(v, value, biased_var, local_step):
    update_biased = state_ops.assign_sub(biased_var,
                                         (biased_var - value) * decay)
    update_local_step = local_step.assign_add(1)

    # This function gets `1 - decay`, so use `1.0 - decay` in the exponent.
    bias_factor = 1 - math_ops.pow(1.0 - decay, update_local_step)
    return state_ops.assign(
        v, update_biased / bias_factor, name=ops.get_name_scope() + "/")
Example #16
    def _apply_sparse(self, grad, var):
        lr = self._lr_t * math_ops.sqrt(1 - self._beta2_power) / (1 - self._beta1_power)
        # m_t = beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, "m")
        m_scaled_g_values = grad.values * (1 - self._beta1_t)
        m_t = state_ops.assign(m, m * self._beta1_t, use_locking=self._use_locking)

        m_t = state_ops.scatter_add(m_t, grad.indices, m_scaled_g_values, use_locking=self._use_locking)
        # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v = self.get_slot(var, "v")
        v_scaled_g_values = (grad.values * grad.values) * (1 - self._beta2_t)
        v_t = state_ops.assign(v, v * self._beta2_t, use_locking=self._use_locking)
        v_t = state_ops.scatter_add(v_t, grad.indices, v_scaled_g_values, use_locking=self._use_locking)
        v_sqrt = tf.pow(v_t, self._pow_t)
        var_update = state_ops.assign_sub(var, lr * m_t / (v_sqrt + self._epsilon_t), use_locking=self._use_locking)
        # regularization
        var_update = state_ops.assign_sub(var_update, self._sparse_regularization * var, use_locking=self._use_locking)

        return control_flow_ops.group(*[var_update, m_t, v_t])
Example #17
  def _resource_apply_dense(self, grad, var):
    var_dtype = var.dtype.base_dtype
    lr_t = self._decayed_lr(var_dtype)
    epsilon = self._get_hyper('epsilon', var_dtype)
    acc = self.get_slot(var, 'accumulator')

    acc_t = state_ops.assign_add(
        acc, math_ops.square(grad), use_locking=self._use_locking)
    var_update = state_ops.assign_sub(
        var, lr_t * grad / (math_ops.sqrt(acc_t) + epsilon))
    return var_update
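A one-step NumPy sketch of the accumulator update above: squared gradients are accumulated and the step is scaled by their square root, Adagrad-style (values are illustrative).

```
import numpy as np

lr, eps = 0.1, 1e-7
var = np.array([1.0, -2.0])
acc = np.array([0.1, 0.1])      # slot "accumulator"
grad = np.array([0.5, -0.5])

acc = acc + grad ** 2                              # assign_add of grad^2
var = var - lr * grad / (np.sqrt(acc) + eps)       # assign_sub
```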
Example #18
 def _assign_moving_average(self, variable, value, one_minus_decay):
   with ops.name_scope(None, 'AssignMovingAvg',
                       [variable, value, one_minus_decay]) as scope:
     with ops.colocate_with(variable):
       update_delta = (variable.read_value() - value) * one_minus_decay
       if isinstance(variable, resource_variable_ops.ResourceVariable):
         # state_ops.assign_sub does an extra read_variable_op after the
         # assign. We avoid that here.
         return gen_resource_variable_ops.assign_sub_variable_op(
             variable.handle, update_delta, name=scope)
       else:
         return state_ops.assign_sub(variable, update_delta, name=scope)
Example #19
    def _apply_dense(self, grad, var):
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
        beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
        alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype)

        eps = 1e-7  # cap for moving average

        m = self.get_slot(var, "m")
        m_t = m.assign(tf.maximum(beta_t * m + eps, tf.abs(grad)))

        var_update = state_ops.assign_sub(var, lr_t * grad * (1.0 + alpha_t * tf.sign(grad) * tf.sign(m_t)))
        # Create an op that groups multiple operations
        # When this op finishes, all ops in input have finished
        return control_flow_ops.group(*[var_update, m_t])
Example #20
    def assign_sub(self, delta, use_locking=False):
        """Subtracts a value from this variable.

    This is essentially a shortcut for `assign_sub(self, delta)`.

    Args:
      delta: A `Tensor`. The value to subtract from this variable.
      use_locking: If `True`, use locking during the operation.

    Returns:
      A `Tensor` that will hold the new value of this variable after
      the subtraction has completed.
    """
        return state_ops.assign_sub(self._variable, delta, use_locking=use_locking)
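A minimal usage sketch of the method documented above, assuming an eager `tf.Variable` under TF 2.x (the wrapped variable class in the snippet may differ):

```
import tensorflow as tf

v = tf.Variable(10.0)
v.assign_sub(3.0)        # subtracts in place
print(v.numpy())         # 7.0
```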
 def _apply_dense(self, grad, var):
   # m_t = mu * m + (1 - mu) * g_t
   m = self.get_slot(var, "m")
   m_scaled_g_values = grad * (1 - self._mu_t)
   m_t = state_ops.assign(m, m * self._mu_t,
                          use_locking=self._use_locking)
   m_t = state_ops.assign_add(m_t, m_scaled_g_values,
                              use_locking=self._use_locking)
   m_t_ = m_t / (1 - self._mu2_t * self._mu_power)
   # m_bar = mu * m_t + (1 - mu) * g_t
   m_bar = self._mu2_t * m_t_ + m_scaled_g_values / (1 - self._mu_power)
   var_update = state_ops.assign_sub(var,
                                    self._lr_t * m_bar,
                                    use_locking=self._use_locking)
   return control_flow_ops.group(*[var_update, m_t])
Example #22
    def _apply_dense(self, grad, var):
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
        beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
        if var.dtype.base_dtype == tf.float16:
            eps = 1e-7  # Can't use 1e-8 due to underflow -- not sure if it makes a big difference.
        else:
            eps = 1e-8

        v = self.get_slot(var, "v")
        v_t = v.assign(beta1_t * v + (1. - beta1_t) * grad)
        m = self.get_slot(var, "m")
        m_t = m.assign(tf.maximum(beta2_t * m + eps, tf.abs(grad)))
        g_t = v_t / m_t

        var_update = state_ops.assign_sub(var, lr_t * g_t)
        return control_flow_ops.group(*[var_update, m_t, v_t])
def assign_moving_average(variable, value, decay, zero_debias=True, name=None):
  """Compute the moving average of a variable.

  The moving average of 'variable' updated with 'value' is:
    variable * decay + value * (1 - decay)

  The returned Operation sets 'variable' to the newly computed moving average.

  The new value of 'variable' can be set with the 'AssignSub' op as:
     variable -= (1 - decay) * (variable - value)

  Since variables that are initialized to a `0` value will be `0` biased,
  `zero_debias` optionally enables scaling by the mathematically correct
  debiasing factor of
    1 - decay ** num_updates
  See `ADAM: A Method for Stochastic Optimization` Section 3 for more details
  (https://arxiv.org/abs/1412.6980).

  Args:
    variable: A Variable.
    value: A tensor with the same shape as 'variable'.
    decay: A float Tensor or float value.  The moving average decay.
    zero_debias: A python bool. If true, assume the variable is 0-initialized and
      unbias it, as in https://arxiv.org/abs/1412.6980. See docstring in
      `_zero_debias` for more details.
    name: Optional name of the returned operation.

  Returns:
    An Operation that updates 'variable' with the newly computed
    moving average.
  """
  with ops.name_scope(name, "AssignMovingAvg",
                      [variable, value, decay]) as scope:
    with ops.colocate_with(variable):
      decay = ops.convert_to_tensor(1.0 - decay, name="decay")
      if decay.dtype != variable.dtype.base_dtype:
        decay = math_ops.cast(decay, variable.dtype.base_dtype)
      if zero_debias:
        update_delta = _zero_debias(variable, value, decay)
      else:
        update_delta = (variable - value) * decay
      return state_ops.assign_sub(variable, update_delta, name=scope)
Example #24
  def _apply_sparse(self, grad, var):
    # ms_t = decay * ms + (1 - decay) * (g_t * g_t)
    ms = self.get_slot(var, "rms") # should not be named rms when it's ms
    print('---SPARSE TIME---')
    print('lr: ' + str(self._learning_rate_tensor.get_shape()))
    print('decay: ' + str(self._decay_tensor.get_shape()))
    print('momentum: ' + str(self._momentum_tensor.get_shape()))
    print('epsilon: ' + str(self._epsilon_tensor.get_shape()))
    print('ms: ' + str(ms.get_shape()))
    print('grad.values: ' + str(grad.values.get_shape()))
    ms_scaled_g_values = (grad.values * grad.values) * \
                         (1 - self._decay_tensor)
    print('ms_scaled_g_values:' + str(ms_scaled_g_values.get_shape()))
    # decay the accumulator, then scatter-add the scaled squared gradient values
    ms_t = state_ops.assign(ms, ms * self._decay_tensor,
                            use_locking=self._use_locking)
    print('ms_t: ' + str(ms_t.get_shape()))
    ms_t = state_ops.scatter_add(ms_t, grad.indices, ms_scaled_g_values,
                                 use_locking=self._use_locking)
    print('ms_t: ' + str(ms_t.get_shape()))
    rms = math_ops.sqrt(ms_t)
    print('rms: ' + str(rms.get_shape()))
    rms += self._epsilon_tensor
    print('rms: ' + str(rms.get_shape()))
    mom = self.get_slot(var, "momentum")
    print('mom: ' + str(mom.get_shape()))
    sparse_grad = self.get_slot(var, "sparse_grad")
    sparse_grad_t = state_ops.assign(sparse_grad, sparse_grad, use_locking=self._use_locking)
    sparse_grad_t = state_ops.scatter_add(sparse_grad, grad.indices, grad.values*self._learning_rate, use_locking=self._use_locking)
    mom_scaled_g_values = sparse_grad_t / rms
    print('mom_scaled_g_values: ' + str(mom.get_shape()))
    mom_t = state_ops.assign(mom, mom * self._momentum_tensor,
                             use_locking=self._use_locking)
    print('mom_t: ' + str(mom_t.get_shape()))
    mom_t += mom_scaled_g_values
#    mom_t = state_ops.scatter_add(mom_t, grad.indices, mom_scaled_g_values,
#                                  use_locking=self._use_locking)
    print('mom_t: ' + str(mom_t.get_shape()))
    var_update = state_ops.assign_sub(var, mom_t,
                                      use_locking=self._use_locking)
    return control_flow_ops.group(*[var_update, ms_t, mom_t])
  def _mini_batch_sync_updates_op(self, update_in_steps, cluster_centers_var,
                                  cluster_centers_updated, total_counts):
    if self._use_mini_batch and self._mini_batch_steps_per_iteration > 1:
      assert update_in_steps is not None
      with ops.colocate_with(update_in_steps):

        def _f():
          # Note that there is a race condition here, so we only do best-effort
          # updates here. We reset update_in_steps first so that other workers
          # don't duplicate the updates. Also we update cluster_center_vars
          # before resetting total_counts to avoid large updates to
          # cluster_centers_updated based on partially updated
          # cluster_center_vars.
          with ops.control_dependencies([
              state_ops.assign(update_in_steps,
                               self._mini_batch_steps_per_iteration - 1)
          ]):
            with ops.colocate_with(
                cluster_centers_updated, ignore_existing=True):
              if self._distance_metric == COSINE_DISTANCE:
                cluster_centers = nn_impl.l2_normalize(
                    cluster_centers_updated, dim=1)
              else:
                cluster_centers = cluster_centers_updated
            with ops.colocate_with(cluster_centers_var):
              with ops.control_dependencies(
                  [state_ops.assign(cluster_centers_var, cluster_centers)]):
                with ops.colocate_with(
                    cluster_centers_var, ignore_existing=True):
                  with ops.control_dependencies([
                      state_ops.assign(total_counts,
                                       array_ops.zeros_like(total_counts))
                  ]):
                    return array_ops.identity(update_in_steps)

        return control_flow_ops.cond(
            update_in_steps <= 0, _f,
            lambda: state_ops.assign_sub(update_in_steps, 1))
    else:
      return control_flow_ops.no_op()
Example #26
    def _apply_dense(self, grad, var):
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
        beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

        # the following equations are given in [1]
        # m_t = beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, "m")
        m_t = state_ops.assign(m, beta1_t * m + (1. - beta1_t) * grad, use_locking=self._use_locking)

        # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v = self.get_slot(var, "v")
        v_t = state_ops.assign(v, beta2_t * v + (1. - beta2_t) * tf.square(grad), use_locking=self._use_locking)
        v_prime = self.get_slot(var, "v_prime")
        v_t_prime = state_ops.assign(v_prime, tf.maximum(v_prime, v_t))

        var_update = state_ops.assign_sub(var,
                                          lr_t * m_t / (tf.sqrt(v_t_prime) + epsilon_t),
                                          use_locking=self._use_locking)

        return control_flow_ops.group(*[var_update, m_t, v_t, v_t_prime])
Example #27
  def _apply_dense(self, grad, var):
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
    beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
    if var.dtype.base_dtype == tf.float16:
        eps = 1e-7  # Can't use 1e-8 due to underflow -- not sure if it makes a big difference.
    else:
        eps = 1e-8

    v = self.get_slot(var, "v")
    v_t = v.assign(beta2_t * v + (1. - beta2_t) * tf.square(grad))
    m = self.get_slot(var, "m")
    m_t = m.assign( beta1_t * m + (1. - beta1_t) * grad )
    v_t_hat = tf.div(v_t, 1. - beta2_t)
    m_t_hat = tf.div(m_t, 1. - beta1_t)
    
    g_t = tf.div( m_t, tf.sqrt(v_t)+eps )
    g_t_1 = self.get_slot(var, "g")
    g_t = g_t_1.assign( g_t )

    var_update = state_ops.assign_sub(var, 2. * lr_t * g_t - lr_t * g_t_1) #Adam would be lr_t * g_t
    return control_flow_ops.group(*[var_update, m_t, v_t, g_t])
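As the inline comment notes, plain Adam would step by `lr_t * g_t`; the snippet instead steps by `2 * lr_t * g_t - lr_t * g_t_1`, an extrapolation against the previously stored gradient slot. A tiny NumPy transcription of just that final update (values are illustrative; `g_prev` stands in for the `g` slot):

```
import numpy as np

lr = 0.001
var = np.array([0.4, -0.4])
g_t = np.array([0.2, -0.1])      # current preconditioned step
g_prev = np.array([0.1, -0.3])   # previous preconditioned step (slot "g")

var = var - (2.0 * lr * g_t - lr * g_prev)   # Adam would use: var - lr * g_t
```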
 def testAssignUpdateNoVarShape(self):
     var = state_ops.variable_op([1, 2], dtypes.float32, set_shape=False)
     added = state_ops.assign_add(var, [[2.0, 3.0]])
     self.assertEqual([1, 2], added.get_shape())
     subbed = state_ops.assign_sub(var, [[12.0, 13.0]])
     self.assertEqual([1, 2], subbed.get_shape())
Example #29
    def _resource_apply_dense(self, grad, var):
        step, beta1_power, beta2_power = self._get_beta_accumulators()
        beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
        beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)

        if self._initial_total_steps > 0:
            total_steps = math_ops.cast(self._total_steps_t,
                                        var.dtype.base_dtype)
            warmup_proportion = math_ops.cast(self._warmup_proportion_t,
                                              var.dtype.base_dtype)
            min_lr = math_ops.cast(self._min_lr_t, var.dtype.base_dtype)
            warmup_steps = total_steps * warmup_proportion
            decay_steps = math_ops.maximum(total_steps - warmup_steps, 1)
            decay_rate = (min_lr - lr_t) / decay_steps
            lr_t = tf.where(
                step <= warmup_steps,
                lr_t * (step / warmup_steps),
                lr_t + decay_rate *
                math_ops.minimum(step - warmup_steps, decay_steps),
            )

        beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

        v = self.get_slot(var, "v")

        if self.clip_gradients:
            clipVal = math_ops.sqrt(
                tf.reduce_sum(v) /
                (1.0 -
                 beta2_power)) * self.clip_multiplier_t + self.clip_epsilon_t
            grad = clip_ops.clip_by_norm(grad, clipVal)

        sma_inf = 2.0 / (1.0 - beta2_t) - 1.0
        sma_t = sma_inf - 2.0 * step * beta2_power / (1.0 - beta2_power)

        m = self.get_slot(var, "m")

        v_t = state_ops.assign(v,
                               beta2_t * v +
                               (1.0 - beta2_t) * math_ops.square(grad),
                               use_locking=self._use_locking)
        v_corr_t = math_ops.sqrt(v_t / (1.0 - beta2_power)) + epsilon_t
        grad_corr = grad / v_corr_t

        m_t = state_ops.assign(m,
                               beta1_t * m + (1.0 - beta1_t) * grad_corr,
                               use_locking=self._use_locking)
        m_corr_t = m_t / (1.0 - beta1_power)

        r_t = math_ops.sqrt((sma_t - 4.0) / (sma_inf - 4.0) * (sma_t - 2.0) /
                            (sma_inf - 2.0) * sma_inf / sma_t)

        var_t = tf.where(sma_t >= 5.0, r_t * m_corr_t, m_corr_t)

        if var in self.reg_vars:
            if self._initial_weight_decay > 0.0:
                var_t += math_ops.cast(self._weight_decay_t,
                                       var.dtype.base_dtype) * var
            if self._L1_decay > 0.0:
                var_t += math_ops.cast(
                    self._L1_decay, var.dtype.base_dtype) * math_ops.sign(var)

        with tf.control_dependencies([var_t]):
            var_update = state_ops.assign_sub(var,
                                              lr_t * var_t,
                                              use_locking=self._use_locking)

        updates = [var_update, m_t, v_t]
        return control_flow_ops.group(*updates)
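A NumPy sketch of the rectification term computed above: `sma_inf` and `sma_t` follow the formulas in the snippet, and the rectifier `r_t` is only applied when `sma_t >= 5` (hyper-parameter values are illustrative).

```
import numpy as np

beta2, step = 0.999, 100.0
beta2_power = beta2 ** step

sma_inf = 2.0 / (1.0 - beta2) - 1.0
sma_t = sma_inf - 2.0 * step * beta2_power / (1.0 - beta2_power)
r_t = np.sqrt((sma_t - 4.0) / (sma_inf - 4.0) *
              (sma_t - 2.0) / (sma_inf - 2.0) *
              sma_inf / sma_t)
use_rectifier = sma_t >= 5.0   # mirrors tf.where(sma_t >= 5.0, r_t * m_corr_t, m_corr_t)
```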
Example #30
    def _apply_dense(self, grad, var):
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
        var_update = state_ops.assign_sub(var, lr_t * grad)

        return control_flow_ops.group(*[var_update])
Example #31
    def _apply_gradient(self, grad, var, indices=None):
        """The main function to update a variable.

    Args:
      grad: A Tensor containing gradient to apply.
      var: A Tensor containing the variable to update.
      indices: An array of integers, for sparse update.

    Returns:
      Updated variable var = var - learning_rate * preconditioner * grad

    If the gradient is dense, var and grad have the same shape.
    If the update is sparse, the first dimension of the gradient and var may
    differ; the other dimensions are all the same. In this case the indices
    array provides the set of indices of the variable which are to be updated
    with each row of the gradient.
    """
        global_step = self._global_step + 1

        # Update accumulated weighted average of gradients
        gbar = self.get_slot(var, "gbar")
        gbar_decay_t = GetParam(self._gbar_decay, global_step)
        gbar_weight_t = GetParam(self._gbar_weight, global_step)
        if indices is not None:
            # Note - the sparse update is not easily implemented, since the
            # algorithm needs all indices of gbar to be updated
            # if mat_gbar_decay != 1 or mat_gbar_decay != 0.
            # One way to make mat_gbar_decay = 1 is by rescaling.
            # If we want the update:
            #         G_{t+1} = a_{t+1} G_t + b_{t+1} w_t
            # define:
            #         r_{t+1} = a_{t+1} * r_t
            #         h_t = G_t / r_t
            # Then:
            #         h_{t+1} = h_t + (b_{t+1} / r_{t+1}) * w_t
            # So we get the mat_gbar_decay = 1 as desired.
            # We can implement this in a future version as needed.
            # However we still need gbar_decay = 0, otherwise all indices
            # of the variable will need to be updated.
            if self._gbar_decay != 0.0:
                tf_logging.warning("Not applying momentum for variable: %s" %
                                   var.name)
            gbar_updated = grad
        else:
            gbar_updated = self._weighted_average(gbar, self._gbar_decay,
                                                  gbar_decay_t,
                                                  gbar_weight_t * grad)

        # Update the preconditioners and compute the preconditioned gradient
        shape = var.get_shape()
        mat_g_list = []
        for i in range(len(shape)):
            mat_g_list.append(self.get_slot(var, "Gbar_" + str(i)))
        mat_gbar_decay_t = GetParam(self._mat_gbar_decay, global_step)
        mat_gbar_weight_t = GetParam(self._mat_gbar_weight, global_step)

        preconditioned_grad = gbar_updated
        v_rank = len(mat_g_list)
        neg_alpha = -GetParam(self._alpha, global_step) / v_rank
        svd_interval = GetParam(self._svd_interval, global_step)
        precond_update_interval = GetParam(self._precond_update_interval,
                                           global_step)
        for i, mat_g in enumerate(mat_g_list):
            # axes is the list of indices to reduce - everything but the current i.
            axes = list(range(i)) + list(range(i + 1, v_rank))
            if shape[i] <= self._max_matrix_size:
                # If the tensor size is sufficiently small perform full Shampoo update
                # Note if precond_update_interval > 1 and mat_gbar_decay_t != 1, this
                # is not strictly correct. However we will use it for now, and
                # fix if needed. (G_1 = aG + bg ==> G_n = a^n G + (1+a+..+a^{n-1})bg)

                # pylint: disable=g-long-lambda,cell-var-from-loop
                mat_g_updated = control_flow_ops.cond(
                    math_ops.mod(global_step, precond_update_interval) < 1,
                    lambda: self._update_mat_g(
                        mat_g, grad, axes, mat_gbar_decay_t, mat_gbar_weight_t
                        * precond_update_interval, i), lambda: mat_g)

                mat_g_updated = mat_g_updated / float(shape[i].value)

                if self._svd_interval == 1:
                    mat_h = self._compute_power(var, mat_g_updated, shape[i],
                                                neg_alpha)
                else:
                    mat_h = control_flow_ops.cond(
                        math_ops.mod(global_step, svd_interval) < 1,
                        lambda: self._compute_power(var, mat_g_updated, shape[
                            i], neg_alpha, "H_" + str(i)),
                        lambda: self.get_slot(var, "H_" + str(i)))

                # mat_h is a square matrix of size d_i x d_i
                # preconditioned_grad is a d_i x ... x d_n x d_0 x ... d_{i-1} tensor
                # After contraction with a d_i x d_i tensor
                # it becomes a d_{i+1} x ... x d_n x d_0 x ... d_i tensor
                # (the first dimension is contracted out, and the second dimension of
                # mat_h is appended).  After going through all the indices, it becomes
                # a d_0 x ... x d_n tensor again.
                preconditioned_grad = math_ops.tensordot(preconditioned_grad,
                                                         mat_h,
                                                         axes=([0], [0]),
                                                         name="precond_" +
                                                         str(i))
            else:
                # Tensor size is too large -- perform diagonal Shampoo update
                # Only normalize non-vector cases.
                if axes:
                    normalizer = 1.0 if indices is not None else float(
                        shape[i].value)
                    grad_outer = math_ops.reduce_sum(grad * grad,
                                                     axis=axes) / normalizer
                else:
                    grad_outer = grad * grad

                if i == 0 and indices is not None:
                    assert self._mat_gbar_decay == 1.0
                    mat_g_updated = state_ops.scatter_add(
                        mat_g, indices, mat_gbar_weight_t * grad_outer)
                    mat_h = math_ops.pow(
                        array_ops.gather(mat_g_updated, indices) +
                        self._epsilon, neg_alpha)
                else:
                    mat_g_updated = self._weighted_average(
                        mat_g, self._mat_gbar_decay, mat_gbar_decay_t,
                        mat_gbar_weight_t * grad_outer)
                    mat_h = math_ops.pow(mat_g_updated + self._epsilon,
                                         neg_alpha)

                # Need to do the transpose to ensure that the tensor becomes
                # a d_{i+1} x ... x d_n x d_0 x ... d_i tensor as described above.
                preconditioned_grad = array_ops.transpose(
                    preconditioned_grad,
                    perm=list(range(1, v_rank)) + [0]) * mat_h

        # Update the variable based on the Shampoo update
        learning_rate_t = GetParam(self._learning_rate, global_step)
        if indices is not None:
            var_updated = state_ops.scatter_add(
                var, indices, -learning_rate_t * preconditioned_grad)
        else:
            var_updated = state_ops.assign_sub(
                var, learning_rate_t * preconditioned_grad)
        return var_updated
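For a rank-2 gradient, the repeated `tensordot(..., axes=([0], [0]))` contractions above cycle the leading axis to the back, so after every axis has been processed the result regains its original shape and equals the gradient preconditioned on both sides. A NumPy check with symmetric stand-ins for the `mat_h` factors (shapes are illustrative):

```
import numpy as np

grad = np.random.randn(2, 3)
a0, a1 = np.random.randn(2, 2), np.random.randn(3, 3)
h0, h1 = a0 + a0.T, a1 + a1.T   # symmetric preconditioners, like powers of mat_g

p = np.tensordot(grad, h0, axes=([0], [0]))   # shape (3, 2): axis 0 contracted, cycled to back
p = np.tensordot(p, h1, axes=([0], [0]))      # shape (2, 3): original layout restored

assert np.allclose(p, h0 @ grad @ h1)         # two-sided preconditioning of the gradient
```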
def _zero_debias(unbiased_var, value, decay):
  """Compute the delta required for a debiased Variable.

  All exponential moving averages initialized with Tensors are initialized to 0,
  and therefore are biased to 0. Variables initialized to 0 and used as EMAs are
  similarly biased. This function creates the debias update amount according to
  a scale factor, as in https://arxiv.org/abs/1412.6980.

  To demonstrate the bias that results from 0-initialization, take an EMA that
  was initialized to `0` with decay `b`. After `t` timesteps of seeing the
  constant `c`, the variable will have the following value:

  ```
    EMA = 0*b^(t) + c*(1 - b)*b^(t-1) + c*(1 - b)*b^(t-2) + ...
        = c*(1 - b^t)
  ```

  To have the true value `c`, we would divide by the scale factor `1 - b^t`.

  In order to perform debiasing, we use two shadow variables. One keeps track of
  the biased estimate, and the other keeps track of the number of updates that
  have occurred.

  Args:
    unbiased_var: A Variable representing the current value of the unbiased EMA.
    value: A Tensor representing the most recent value.
    decay: A Tensor representing `1-decay` for the EMA.

  Returns:
    The amount that the unbiased variable should be updated. Computing this
    tensor will also update the shadow variables appropriately.
  """
  with variable_scope.variable_scope(
      unbiased_var.op.name, values=[unbiased_var, value, decay]) as scope:
    with ops.colocate_with(unbiased_var):
      with ops.control_dependencies(None):
        biased_initializer = init_ops.zeros_initializer(
            dtype=unbiased_var.dtype)(unbiased_var.get_shape())
        local_step_initializer = init_ops.ones_initializer()
      biased_var = variable_scope.get_variable(
          "biased", initializer=biased_initializer, trainable=False)
      # Initializing the local_step to `0` would cause problems with the
      # debiasing equation, so we instead initialize to `1`.
      local_step = variable_scope.get_variable(
          "local_step",
          shape=[],
          dtype=unbiased_var.dtype,
          initializer=local_step_initializer,
          trainable=False)

      # Get an update ops for both shadow variables.
      update_biased = state_ops.assign_sub(biased_var,
                                           (biased_var - value) * decay,
                                           name=scope.name)
      update_local_step = local_step.assign_add(1)

      # Compute the value of the delta to update the unbiased EMA. Make sure to
      # use the new values of the biased variable and the local step.
      with ops.control_dependencies([update_biased, update_local_step]):
        # This function gets `1 - decay`, so use `1.0 - decay` in the exponent.
        unbiased_ema_delta = (unbiased_var - biased_var.read_value() /
                              (1 - math_ops.pow(
                                  1.0 - decay, local_step.read_value())))

      return unbiased_ema_delta
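A NumPy simulation of the two shadow variables above (`biased` mirrors `biased_var`, `step` mirrors `local_step`; `d` is the `1 - decay` value this function receives): subtracting the returned delta from the unbiased variable leaves exactly the debiased estimate `biased / (1 - (1 - d) ** step)`.

```
import numpy as np

d = 0.1                  # the `decay` argument, i.e. 1 - decay
value = 5.0
unbiased = 0.0           # the EMA variable being debiased
biased, step = 0.0, 0    # the two shadow variables

for _ in range(10):
    biased = biased - (biased - value) * d     # update_biased
    step += 1                                  # update_local_step
    delta = unbiased - biased / (1 - (1.0 - d) ** step)
    unbiased = unbiased - delta                # state_ops.assign_sub(variable, delta)

assert np.isclose(unbiased, value)             # the debiased EMA of a constant is the constant
```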
 def testInitRequiredAssignSub(self):
   with self.cached_session():
     p = variables.VariableV1(array_ops.fill([1024, 1024], 1), dtypes.int32)
     a = state_ops.assign_sub(p, array_ops.fill([1024, 1024], 0))
     with self.assertRaisesOpError("use uninitialized"):
       a.op.run()
Example #35
    def _finish(self, update_ops, name_scope):
        """"""

        caches = [update_op[0] for update_op in update_ops]
        update_ops = [update_op[1:] for update_op in update_ops]
        if self._noise is not None:
            for cache in caches:
                s_t, x_tm1 = cache[:2]
                s_t += random_ops.random_normal(
                    x_tm1.initialized_value().get_shape(), stddev=self._noise)
                cache[0] = s_t

        if self._clip is not None:
            S_t = [cache[0] for cache in caches]
            S_t, _ = clip_ops.clip_by_global_norm(S_t, self._clip)
            for cache, s_t in zip(caches, S_t):
                cache[0] = s_t

        new_update_ops = []
        for cache, update_op in zip(caches, update_ops):
            if len(cache) == 3:
                s_t, x_tm1 = cache[:2]
                with ops.name_scope('update_' + x_tm1.op.name), ops.device(
                        x_tm1.device):
                    x_t = state_ops.assign_sub(x_tm1,
                                               s_t,
                                               use_locking=self._use_locking)
                    cache.append(x_t)
            else:
                s_t_, x_tm1, idxs = cache[:3]
                with ops.name_scope('update_' + x_tm1.op.name), ops.device(
                        x_tm1.device):
                    x_t = state_ops.scatter_sub(x_tm1,
                                                idxs,
                                                s_t_,
                                                use_locking=self._use_locking)
                    cache.append(x_t)
            new_update_ops.append(control_flow_ops.group(*([x_t] + update_op)))

        with ops.control_dependencies(new_update_ops):
            more_update_ops = []
            if self._save_step:
                for cache in caches:
                    if len(cache) == 4:
                        s_t, x_tm1 = cache[:2]
                        s_tm1 = self.get_slot(x_tm1, 's')
                        with ops.name_scope('update_' +
                                            x_tm1.op.name), ops.device(
                                                x_tm1.device):
                            new_step_and_grads = []
                            s_t = state_ops.assign(
                                s_tm1, -s_t, use_locking=self._use_locking)
                    else:
                        s_t_, x_tm1, idxs = cache[:3]
                        s_tm1 = self.get_slot(x_tm1, 's')
                        with ops.name_scope('update_' +
                                            x_tm1.op.name), ops.device(
                                                x_tm1.device):
                            s_t = state_ops.scatter_update(
                                s_tm1,
                                idxs,
                                -s_t_,
                                use_locking=self._use_locking)
                    more_update_ops.append(s_t)
            if self._save_grad:
                for cache in caches:
                    if len(cache) == 4:
                        x_tm1, g_t = cache[1:3]
                        g_tm1 = self.get_slot(x_tm1, 'g')
                        with ops.name_scope('update_' +
                                            x_tm1.op.name), ops.device(
                                                x_tm1.device):
                            new_step_and_grads = []
                            g_t = state_ops.assign(
                                g_tm1, g_t, use_locking=self._use_locking)
                    else:
                        x_tm1, idxs, g_t_ = cache[1:4]
                        g_tm1 = self.get_slot(x_tm1, 'g')
                        with ops.name_scope('update_' +
                                            x_tm1.op.name), ops.device(
                                                x_tm1.device):
                            g_t = state_ops.scatter_update(
                                g_tm1,
                                idxs,
                                g_t_,
                                use_locking=self._use_locking)
                    more_update_ops.append(g_t)

            if self._chi > 0:
                for cache in caches:
                    if len(cache) == 4:
                        _, x_tm1, _, x_t = cache
                        with ops.name_scope('update_' +
                                            x_tm1.op.name), ops.device(
                                                x_tm1.device):
                            x_and_t = self._dense_moving_average(
                                x_tm1, x_t, 'x', self._chi)
                            more_update_ops.append(
                                control_flow_ops.group(*x_and_t))
                    else:
                        _, x_tm1, idxs, _, x_t = cache
                        with ops.name_scope('update_' +
                                            x_tm1.op.name), ops.device(
                                                x_tm1.device):
                            x_t_ = array_ops.gather(x_t, idxs)
                            x_and_t = self._sparse_moving_average(
                                x_tm1, idxs, x_t_, 'x', self._chi)
                            more_update_ops.append(
                                control_flow_ops.group(*x_and_t))

        return control_flow_ops.group(*(new_update_ops + more_update_ops),
                                      name=name_scope)
Example #36
    def _apply_dense(self, grad, var):

        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

        g = [self.get_slot(var, "g%d" % i) for i in range(self._keep_num + 1)]

        v = self.get_slot(var, "v")
        z = self.get_slot(var, "z")
        b2p = self.get_slot(var, "b2p")

        if self._pred_g_op == 'none':
            v_t = state_ops.assign(v,
                                   v * beta2_t + tf.square(g[0]) *
                                   (1 - beta2_t),
                                   use_locking=self._use_locking)
        elif self._pred_g_op == 'max':
            v_t = state_ops.assign(
                v,
                v * beta2_t + tf.reduce_max(tf.square(g[0])) * (1 - beta2_t),
                use_locking=self._use_locking)
        elif self._pred_g_op == 'mean':
            v_t = state_ops.assign(
                v,
                v * beta2_t + tf.reduce_mean(tf.square(g[0])) * (1 - beta2_t),
                use_locking=self._use_locking)
        else:
            assert False

        with ops.control_dependencies([v_t]):
            g_t = state_ops.assign(g[-1], grad, use_locking=self._use_locking)
            for i in range(self._keep_num):
                with ops.control_dependencies([g_t]):
                    g_t = state_ops.assign(g[i],
                                           g[i + 1],
                                           use_locking=self._use_locking)

        with ops.control_dependencies([g_t]):
            # m_t = tf.reduce_sum([g[-self._mov_num-1+i]*self.s[i] for i in range(self._mov_num)], axis=0)
            m_t = tf.reduce_sum(
                [g[-i - 2] * self.s[-i - 1] for i in range(self._mov_num)],
                axis=0)
            # m_t = tf.reduce_mean(g[:self._keep_num], axis=0)

        with ops.control_dependencies([v_t]):
            z_t = state_ops.assign(
                z, tf.cast(tf.logical_or(v_t > 0.0, z > 0.0), tf.float32))

        b2p_t = state_ops.assign(b2p,
                                 b2p * beta2_t * tf.sign(z_t) +
                                 (1.0 - tf.sign(z_t)),
                                 use_locking=self._use_locking)
        b2_fix = tf.maximum(1e-8, 1.0 - b2p_t)

        step_t = z_t * m_t / (math_ops.sqrt(v_t / b2_fix) + epsilon_t)

        # if var.name == self.first_var.name: #'discriminator/final_linear/w:0':
        #     idx = 0
        #     step_t = tf.Print(step_t, [z_t[idx]], 'z_t', summarize=1000)
        #     step_t = tf.Print(step_t, [g[i][idx] for i in range(len(g))], 'g', summarize=1000)
        #     step_t = tf.Print(step_t, [grad[idx]], 'grad', summarize=1000)
        #     step_t = tf.Print(step_t, [b2p_t[idx]], 'b2p_t', summarize=1000)
        #     step_t = tf.Print(step_t, [b2_fix], 'beta2_fix', summarize=1000)
        #     step_t = tf.Print(step_t, [m_t[idx]], 'm_t', summarize=1000)
        #     step_t = tf.Print(step_t, [tf.sqrt(v_t / b2_fix)[idx]], 'v_t', summarize=1000)
        #     step_t = tf.Print(step_t, [step_t], 'step', summarize=1000)

        var_update = state_ops.assign_sub(var,
                                          lr_t * step_t,
                                          use_locking=self._use_locking)
        return control_flow_ops.group(*([var_update]))
Example #37
            def run_and_check():
                # Assign float32 values
                self.assertAllClose(3.14, self.evaluate(x.assign(v1)))
                self.assertAllClose(3.14 * 2, self.evaluate(x.assign_add(v1)))
                self.assertAllClose(3.14, self.evaluate(x.assign_sub(v1)))

                # Attempt to assign float16 values
                with self.assertRaisesRegexp(
                        ValueError,
                        'conversion requested dtype float32 for Tensor with dtype float16'
                ):
                    self.evaluate(x.assign(v2))
                with self.assertRaisesRegexp(
                        ValueError,
                        'conversion requested dtype float32 for Tensor with dtype float16'
                ):
                    self.evaluate(x.assign_add(v2))
                with self.assertRaisesRegexp(
                        ValueError,
                        'conversion requested dtype float32 for Tensor with dtype float16'
                ):
                    self.evaluate(x.assign_sub(v2))

                # Assign Python floats
                self.assertAllClose(0., self.evaluate(x.assign(0.)))
                self.assertAllClose(3.14, self.evaluate(x.assign(3.14)))
                self.assertAllClose(3.14 * 2,
                                    self.evaluate(x.assign_add(3.14)))
                self.assertAllClose(3.14, self.evaluate(x.assign_sub(3.14)))

                # Assign multiple times
                assign = x.assign(1.)
                self.assertAllClose(1., self.evaluate(assign))
                self.assertAllClose(0., self.evaluate(assign.assign(0.)))
                assign_add = x.assign_add(3.14)
                self.assertAllClose(3.14, self.evaluate(assign_add))
                self.assertAllClose(
                    3.14 * 3,
                    self.evaluate(x.assign_add(3.14).assign_add(3.14)))
                self.assertAllClose(3.14 * 3, x)
                assign_sub = x.assign_sub(3.14)
                self.assertAllClose(3.14 * 2, self.evaluate(assign_sub))
                self.assertAllClose(
                    0., self.evaluate(x.assign_sub(3.14).assign_sub(3.14)))

                # Assign with read_value=False
                self.assertIsNone(self.evaluate(x.assign(1.,
                                                         read_value=False)))
                self.assertAllClose(1., self.evaluate(x))
                self.assertIsNone(
                    self.evaluate(x.assign_add(2., read_value=False)))
                self.assertAllClose(3., self.evaluate(x))
                self.assertIsNone(
                    self.evaluate(x.assign_sub(3., read_value=False)))
                self.assertAllClose(0., self.evaluate(x))

                # Use the tf.assign functions instead of the var.assign methods.
                self.assertAllClose(0., self.evaluate(state_ops.assign(x, 0.)))
                self.assertAllClose(3.14,
                                    self.evaluate(state_ops.assign(x, 3.14)))
                self.assertAllClose(
                    3.14 * 2, self.evaluate(state_ops.assign_add(x, 3.14)))
                self.assertAllClose(
                    3.14, self.evaluate(state_ops.assign_sub(x, 3.14)))
Example #38
 def update_fn(v, value):
     return state_ops.assign_sub(v, (v - value) * decay, name=scope)
Example #39
    def _resource_apply_sparse(self, grad, var, indices, apply_state=None):

        var_device, var_dtype = var.device, var.dtype.base_dtype
        coefficients = ((apply_state or {}).get((var_device, var_dtype))
                        or self._fallback_apply_state(var_device, var_dtype))
        """
        Adam
        """
        # m_t = beta1 * m + (1 - beta1) * g_t
        m = self.get_slot(var, 'm')
        m_scaled_g_values = grad * coefficients['one_minus_beta_1_t']
        m_t = state_ops.assign(m,
                               m * coefficients['beta_1_t'],
                               use_locking=self._use_locking)
        with ops.control_dependencies([m_t]):
            m_t = self._resource_scatter_add(m, indices, m_scaled_g_values)

        # v_t = beta2 * v + (1 - beta2) * (g_t * g_t)
        v = self.get_slot(var, 'v')
        v_scaled_g_values = (grad * grad) * coefficients['one_minus_beta_2_t']
        v_t = state_ops.assign(v,
                               v * coefficients['beta_2_t'],
                               use_locking=self._use_locking)
        with ops.control_dependencies([v_t]):
            v_t = self._resource_scatter_add(v, indices, v_scaled_g_values)

        denorm = (math_ops.sqrt(v_t) / math_ops.sqrt(
            coefficients['bias_correction2'])) + coefficients['epsilon']
        step_size = coefficients['lr'] / coefficients['bias_correction1']

        if self.nesterov:
            p_scaled_g_values = grad * coefficients['one_minus_beta_1_t']
            perturb = m_t * coefficients['beta_1_t']
            perturb = self._resource_scatter_add(perturb, indices,
                                                 p_scaled_g_values) / denorm

        else:
            perturb = m_t / denorm

        # Projection
        wd_ratio = 1
        if len(var.shape) > 1:
            perturb, wd_ratio = self._projection(var, grad, perturb,
                                                 coefficients['delta'],
                                                 coefficients['wd_ratio'],
                                                 coefficients['epsilon'])

        # Weight decay
        if self.weight_decay > 0:
            var = state_ops.assign(
                var,
                var *
                (1 -
                 coefficients['lr'] * coefficients['weight_decay'] * wd_ratio),
                use_locking=self._use_locking)

        var_update = state_ops.assign_sub(var,
                                          step_size * perturb,
                                          use_locking=self._use_locking)

        return control_flow_ops.group(*[var_update, m_t, v_t])
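The sparse branch above never builds a dense m_t: it decays the whole slot once, then scatter-adds the scaled gradient at the touched indices, which reproduces the dense Adam moment formula exactly on those rows. A minimal NumPy sketch of that equivalence, with made-up shapes and coefficients (nothing below comes from the snippet's coefficients dict):

```python
import numpy as np

rng = np.random.default_rng(0)
beta1 = 0.9
m = rng.normal(size=(5, 3))        # first-moment slot
grad = rng.normal(size=(2, 3))     # gradients for two rows only
indices = np.array([1, 4])         # rows that received gradients

# Step 1: decay the whole slot, like state_ops.assign(m, m * beta_1_t).
m_t = m * beta1
# Step 2: scatter-add (1 - beta1) * grad at the touched rows,
# like _resource_scatter_add(m, indices, m_scaled_g_values).
np.add.at(m_t, indices, (1.0 - beta1) * grad)

# On the touched rows this equals the dense formula beta1 * m + (1 - beta1) * g.
assert np.allclose(m_t[indices], beta1 * m[indices] + (1.0 - beta1) * grad)
# Untouched rows are only decayed -- the usual "lazy" behaviour of sparse Adam.
assert np.allclose(m_t[0], beta1 * m[0])
```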
Exemple #40
0
    def _apply_dense(self, grad, var):

        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
        beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

        m = self.get_slot(var, "m")
        v = self.get_slot(var, "v")
        g = self.get_slot(var, "g")
        z = self.get_slot(var, "z")
        b1p = self.get_slot(var, "b1p")
        b2p = self.get_slot(var, "b2p")

        m_t = state_ops.assign(m, beta1_t * m + grad * (1 - beta1_t), use_locking=self._use_locking)

        if self._pred_g_op == 'none':
            v_t = state_ops.assign(v, v * beta2_t + tf.square(g) * (1 - beta2_t), use_locking=self._use_locking)
        elif self._pred_g_op == 'max':
            v_t = state_ops.assign(v, v * beta2_t + tf.reduce_max(tf.square(g)) * (1 - beta2_t), use_locking=self._use_locking)
        elif self._pred_g_op == 'mean':
            v_t = state_ops.assign(v, v * beta2_t + tf.reduce_mean(tf.square(g)) * (1 - beta2_t), use_locking=self._use_locking)
        else:
            assert False

        # v_t = tf.cond(tf.less(self._current_iter, tf.constant(self._init_step)),
        #               lambda: state_ops.assign(v, v * beta2_t + (grad * grad) * (1 - beta2_t), use_locking=self._use_locking),
        #               lambda: state_ops.assign(v, v * beta2_t + (g * g) * (1 - beta2_t), use_locking=self._use_locking))

        # cond = (tf.sign(tf.cast(self._current_iter - tf.constant(self._init_step), tf.float32) + tf.constant(0.5)) + tf.constant(1.0)) / tf.constant(2.0)
        # v_a = v * beta2_t + (grad * grad) * (1 - beta2_t)
        # v_b = v * beta2_t + (g * g) * (1 - beta2_t)
        # v_t = state_ops.assign(v, v_a * (1 - cond) + v_b * cond, use_locking=self._use_locking)

        # cond = tf.abs(tf.sign(g))
        # v_t = state_ops.assign(v, v * (1 - cond) + (v * beta2_t + (g * g) * (1 - beta2_t)) * cond, use_locking=self._use_locking)

        # v_t = state_ops.assign(v, v * beta2_t + (g * g) * (1 - beta2_t), use_locking=self._use_locking)
        # v_t = state_ops.assign(v, tf.maximum(grad * grad * beta2_fix, v * beta2_t + (g * g) * (1 - beta2_t)), use_locking=self._use_locking)

        with ops.control_dependencies([v_t]):
            z_t = state_ops.assign(z, tf.cast(tf.logical_or(v_t > 0.0, z > 0.0), tf.float32))
            g_t = state_ops.assign(g, grad, use_locking=self._use_locking)

        b1p_t = state_ops.assign(b1p, b1p * beta1_t * tf.sign(z_t) + (1.0 - tf.sign(z_t)), use_locking=self._use_locking)
        b2p_t = state_ops.assign(b2p, b2p * beta2_t * tf.sign(z_t) + (1.0 - tf.sign(z_t)), use_locking=self._use_locking)

        b1_fix = tf.maximum(1e-8, 1.0 - b1p_t)
        b2_fix = tf.maximum(1e-8, 1.0 - b2p_t)

        step_t = z_t * (m_t / b1_fix) / (math_ops.sqrt(v_t / b2_fix) + epsilon_t)

        # if var.name == self.first_var.name: #'discriminator/final_linear/w:0':
        #     idx = 0
        #     step_t = tf.Print(step_t, [z_t[idx]], 'z_t', summarize=1000)
        #     step_t = tf.Print(step_t, [g[idx]], 'g', summarize=1000)
        #     step_t = tf.Print(step_t, [grad[idx]], 'grad', summarize=1000)
        #     step_t = tf.Print(step_t, [b2p_t[idx]], 'b2p_t', summarize=1000)
        #     step_t = tf.Print(step_t, [b2_fix], 'beta2_fix', summarize=1000)
        #     step_t = tf.Print(step_t, [tf.sqrt(v_t / b2_fix)[idx]], 'v_t', summarize=1000)
        #     step_t = tf.Print(step_t, [step_t], 'step', summarize=1000)

        var_update = state_ops.assign_sub(var, lr_t * step_t, use_locking=self._use_locking)
        return control_flow_ops.group(*[var_update, g_t])
Exemple #41
0
    def _resource_apply_dense(self, grad, var):
        step, beta1_power, beta2_power = self._get_beta_accumulators()
        beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
        beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)

        if self._initial_total_steps > 0:
            total_steps = math_ops.cast(self._total_steps_t,
                                        var.dtype.base_dtype)
            warmup_proportion = math_ops.cast(self._warmup_proportion_t,
                                              var.dtype.base_dtype)
            min_lr = math_ops.cast(self._min_lr_t, var.dtype.base_dtype)
            warmup_steps = total_steps * warmup_proportion
            lr_t = tf.where(
                step <= warmup_steps, lr_t * (step / warmup_steps),
                min_lr + (lr_t - min_lr) *
                (1.0 - math_ops.minimum(step, total_steps) / total_steps))

        beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

        sma_inf = 2.0 / (1.0 - beta2_t) - 1.0
        sma_t = sma_inf - 2.0 * step * beta2_power / (1.0 - beta2_power)

        m = self.get_slot(var, "m")
        m_t = state_ops.assign(m,
                               beta1_t * m + (1.0 - beta1_t) * grad,
                               use_locking=self._use_locking)
        m_corr_t = m_t / (1.0 - beta1_power)

        v = self.get_slot(var, "v")
        v_t = state_ops.assign(v,
                               beta2_t * v +
                               (1.0 - beta2_t) * math_ops.square(grad),
                               use_locking=self._use_locking)
        if self._amsgrad:
            vhat = self.get_slot(var, 'vhat')
            vhat_t = state_ops.assign(vhat,
                                      math_ops.maximum(vhat, v_t),
                                      use_locking=self._use_locking)
            v_corr_t = math_ops.sqrt(vhat_t / (1.0 - beta2_power) + epsilon_t)
        else:
            v_corr_t = math_ops.sqrt(v_t / (1.0 - beta2_power) + epsilon_t)

        r_t = math_ops.sqrt((sma_t - 4.0) / (sma_inf - 4.0) * (sma_t - 2.0) /
                            (sma_inf - 2.0) * sma_inf / sma_t)

        var_t = tf.where(sma_t > 5.0, r_t * m_corr_t / v_corr_t, m_corr_t)

        if self._initial_weight_decay > 0.0:
            var_t += math_ops.cast(self._weight_decay_t,
                                   var.dtype.base_dtype) * var

        var_update = state_ops.assign_sub(var,
                                          lr_t * var_t,
                                          use_locking=self._use_locking)

        updates = [var_update, m_t, v_t]
        if self._amsgrad:
            updates.append(vhat_t)
        return control_flow_ops.group(*updates)
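Two pieces of the dense update above are easier to see outside the graph code: the warmup-then-decay learning-rate schedule and the rectification term r_t that gates between the adaptive step and plain bias-corrected momentum. A hedged NumPy sketch of both, with illustrative hyperparameters rather than the optimizer's real attributes:

```python
import numpy as np

lr, min_lr = 1e-3, 1e-5
total_steps, warmup_proportion = 1000, 0.1
beta2 = 0.999
warmup_steps = total_steps * warmup_proportion

def scheduled_lr(step):
    # Mirrors the tf.where branch: linear warmup, then decay toward min_lr.
    if step <= warmup_steps:
        return lr * (step / warmup_steps)
    return min_lr + (lr - min_lr) * (1.0 - min(step, total_steps) / total_steps)

def rectified_factor(step):
    # sma_inf, sma_t and r_t as computed in the snippet (RAdam-style terms).
    beta2_power = beta2 ** step
    sma_inf = 2.0 / (1.0 - beta2) - 1.0
    sma_t = sma_inf - 2.0 * step * beta2_power / (1.0 - beta2_power)
    if sma_t <= 5.0:         # same threshold as tf.where(sma_t > 5.0, ...)
        return None          # fall back to the momentum-only update m_corr_t
    return np.sqrt((sma_t - 4.0) / (sma_inf - 4.0) *
                   (sma_t - 2.0) / (sma_inf - 2.0) * sma_inf / sma_t)

for step in (1, 10, 100, 1000):
    print(step, scheduled_lr(step), rectified_factor(step))
```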
Exemple #42
0
def sub(ref1, value1):
    return state_ops.assign_sub(ref1, value1)
Exemple #43
0
 def testAssignUpdateNoValueShape(self):
     var = state_ops.variable_op([1, 2], dtypes.float32)
     added = state_ops.assign_add(var, self._NewShapelessTensor())
     self.assertEqual([1, 2], added.get_shape())
     subbed = state_ops.assign_sub(var, self._NewShapelessTensor())
     self.assertEqual([1, 2], subbed.get_shape())
Exemple #44
0
 def testAssignUpdateNoShape(self):
     var = state_ops.variable_op([1, 2], dtypes.float32, set_shape=False)
     added = state_ops.assign_add(var, self._NewShapelessTensor())
     self.assertEqual(tensor_shape.unknown_shape(), added.get_shape())
     subbed = state_ops.assign_sub(var, self._NewShapelessTensor())
     self.assertEqual(tensor_shape.unknown_shape(), subbed.get_shape())
Exemple #45
0
    def _apply_dense(self, grad, var):

        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
        beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

        m = self.get_slot(var, "m")
        v = self.get_slot(var, "v")
        g = self.get_slot(var, "g")
        b2p = self.get_slot(var, "b2p")

        cond1 = tf.abs(tf.sign(grad))
        m_t = state_ops.assign(m,
                               (beta1_t * m + grad *
                                (1.0 - beta1_t)) * cond1 + m * (1.0 - cond1),
                               use_locking=self._use_locking)

        # g_square = tf.square(g)
        # def mean(g_square):
        #     return (tf.reduce_sum(g_square) - g_square) / (tf.reduce_prod(tf.shape(g_square))-1.0)
        #
        # def max(g_square):
        #     max_g_square = tf.reduce_max(g_square)
        #     cond = (g_square == max_g_square)
        #     max1_g_square = tf.reduce_max(g_square - cond * g_square)
        #     max_g_square = max_g_square * (1.0 - cond) + max1_g_square * cond
        #     return max_g_square
        #
        # gs = max(g_square)
        # gs = mean(g_square)

        gs = tf.maximum(tf.reduce_mean(tf.square(g)), tf.square(g))

        cond2 = tf.abs(tf.sign(gs))
        v_t = state_ops.assign(v,
                               (v * beta2_t + gs *
                                (1.0 - beta2_t)) * cond2 + v * (1.0 - cond2),
                               use_locking=self._use_locking)
        b2p_t = state_ops.assign(b2p,
                                 b2p * beta2_t * cond2 + (1.0 - cond2),
                                 use_locking=self._use_locking)
        b2_fix = tf.maximum(1.0 - self._beta2, 1.0 - b2p_t)

        with ops.control_dependencies([v_t]):
            g_t = state_ops.assign(g, grad, use_locking=self._use_locking)

        step_t = m_t / (math_ops.sqrt(v_t / b2_fix) +
                        epsilon_t) * cond2 * cond1

        if 'discriminator67345715' in var.name:
            step_t = tf.Print(step_t, [cond1[0]], var.name + ' cond1:')
            step_t = tf.Print(step_t, [cond2[0]], var.name + ' cond2:')
            step_t = tf.Print(step_t, [b2_fix[0]], var.name + ' b2_fix:')
            step_t = tf.Print(step_t, [grad[0]], var.name + ' grad:')

            step_t = tf.Print(step_t, [m_t[0]], var.name + ' m_t:')
            step_t = tf.Print(step_t, [math_ops.sqrt(v_t / b2_fix)[0]],
                              var.name + ' v_t_fix:')
            step_t = tf.Print(step_t, [step_t[0]], var.name + ' step_t:')
            step_t = tf.Print(step_t, [tf.reduce_max(step_t)],
                              var.name + ' max_step_t:')

        var_update = state_ops.assign_sub(var,
                                          lr_t * step_t,
                                          use_locking=self._use_locking)
        return control_flow_ops.group(*[var_update, g_t])
Exemple #46
0
    def _finish(self, state):
        var_dtype = self._variables[0].dtype.base_dtype
        # Update global step.
        global_step = self._get_global_step(state)
        update_global_step = state_ops.assign_add(global_step, 1.)

        # Update the first moment estimate.
        beta1 = state.get_hyper("beta1", dtype=var_dtype)
        moment1 = self._get_moment1(state)
        flat_grad = self._get_flat_grad(state)
        # moment1_t := beta1 * moment1_{t-1} + (1 - beta1) * flat_grad_t
        update_moment1 = moment1.assign(beta1 * moment1 +
                                        (1. - beta1) * flat_grad)

        # Update the gradient buffer.
        window = state.get_hyper("window")
        grad_buffer = self._get_grad_buffer(state)
        next_grad_index = math_ops.floormod(
            math_ops.to_int32(update_global_step - 1.), window)
        # grad_buffer[(t-1) % window] := moment1_t
        update_grad_buffer = state_ops.scatter_update(grad_buffer,
                                                      next_grad_index,
                                                      update_moment1)

        # Compute the update step.
        eps = state.get_hyper("eps", dtype=var_dtype)
        svd_eps = state.get_hyper("svd_eps", dtype=var_dtype)
        sigma_eps = state.get_hyper("sigma_eps", dtype=var_dtype)
        lr = state.get_hyper("lr", dtype=var_dtype)
        denom = math_ops.sqrt(
            math_ops.minimum(
                ops.convert_to_tensor(update_global_step),
                ops.convert_to_tensor(math_ops.cast(window, dtype=var_dtype))))
        moment1_2d = array_ops.expand_dims(update_moment1, -1)

        # m = grad_buffer^T / sqrt(min(t, window))
        # m has shape [model dimension, window], where model dimension is the sum
        # of the dimensions of the flattened variables.
        m = array_ops.transpose(math_ops.divide(update_grad_buffer, denom))

        # sigma, u, _ = SVD(m^Tm + I * svd_eps)
        mm = math_ops.matmul(m, m, transpose_a=True)
        damping = math_ops.cast(linalg_ops.eye(window),
                                dtype=var_dtype) * svd_eps
        sigma, u, _ = linalg_ops.svd(mm + damping)
        sigma_sqrt = math_ops.sqrt(sigma)
        sigma_sqrt_min = math_ops.reduce_min(sigma_sqrt)

        # sigma_sqrt_inv = 1 / (\sqrt{sigma} + sigma_eps) ^ 3
        # We add sigma_eps to alleviate numerical instability.
        # Note that (m^Tm)^(-3/2) = u diag(sigma_sqrt_inv) u^T.
        sigma_sqrt_inv = math_ops.divide(
            math_ops.cast(1.0, dtype=var_dtype),
            math_ops.pow(sigma_sqrt + sigma_eps, 3))

        # In full matrix AdaGrad, the update step computes (mm^T)^(-1/2)g, where the
        # inversion of a model dimension by model dimension matrix is needed. To
        # speed up this computation we calculate the following instead:
        # m(m^Tm)^(-3/2)m^T moment1 = m u diag(sigma_sqrt_inv) u^T m^T moment1.
        new_step = array_ops.expand_dims(
            array_ops.zeros(flat_grad.get_shape(), dtype=var_dtype), -1)
        head = math_ops.matmul(
            m,
            math_ops.matmul(
                u,
                math_ops.matmul(
                    array_ops.diag(sigma_sqrt_inv),
                    math_ops.matmul(u,
                                    math_ops.matmul(m,
                                                    moment1_2d,
                                                    transpose_a=True),
                                    transpose_a=True))))

        # When inverting (mm^t)^(1/2), we also add epsilon * I regularization for
        # degenerate cases. We expand ((mm^t)^(1/2) + epsilon * I)^(-1) using
        # Woodbury's identity.
        # For full derivation please see paper at
        # https://arxiv.org/pdf/1806.02958.pdf
        tail = moment1_2d - math_ops.matmul(
            m,
            math_ops.matmul(
                u,
                math_ops.matmul(
                    array_ops.diag(
                        math_ops.divide(math_ops.cast(1.0, dtype=var_dtype),
                                        sigma)),
                    math_ops.matmul(u,
                                    math_ops.matmul(
                                        m, moment1_2d, transpose_a=True),
                                    transpose_a=True))))
        scaled_tail = math_ops.divide(tail, sigma_sqrt_min)

        update_new_step = control_flow_ops.cond(
            sigma_sqrt_min > eps, lambda: math_ops.add(head, scaled_tail),
            lambda: math_ops.add(new_step, head))

        # Update each variable.
        update_step = []
        for var in self._variables:
            dim = self.shape_dict[var.name]
            start_index = self.index_dict[var.name]
            end_index = start_index + dim
            var_update_correct_shape = array_ops.reshape(
                update_new_step[start_index:end_index], var.get_shape())
            var_updated = state_ops.assign_sub(var,
                                               lr * var_update_correct_shape)
            update_step.append(var_updated)

        return control_flow_ops.group(update_step)
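The comments above contain the key trick: rather than inverting a model-dimension-sized matrix, the step works with the window x window Gram matrix m^T m, decomposes it, and applies (m m^T)^(-1/2) to the first moment as m u diag(sigma^(-3/2)) u^T m^T, with a Woodbury-style tail for the component outside the span of m. A small NumPy sketch of that computation on toy shapes (eigh stands in for the snippet's SVD call, which is equivalent here because the Gram matrix is symmetric positive semi-definite; all values are made up):

```python
import numpy as np

rng = np.random.default_rng(1)
model_dim, window = 200, 8
svd_eps, sigma_eps, eps = 1e-6, 1e-6, 1e-7

m = rng.normal(size=(model_dim, window))   # grad_buffer^T / sqrt(min(t, window))
moment1 = rng.normal(size=(model_dim, 1))  # flattened first moment

# Decompose the small window x window Gram matrix instead of the huge d x d one.
mm = m.T @ m + np.eye(window) * svd_eps
sigma, u = np.linalg.eigh(mm)              # symmetric PSD, so eigh suffices
sigma_sqrt = np.sqrt(sigma)
sigma_sqrt_inv = 1.0 / (sigma_sqrt + sigma_eps) ** 3

# head = m u diag(sigma^(-3/2)) u^T m^T moment1 ~ (m m^T)^(-1/2) moment1 on span(m).
head = m @ (u @ (np.diag(sigma_sqrt_inv) @ (u.T @ (m.T @ moment1))))

# tail handles the component of moment1 orthogonal to span(m) (Woodbury identity).
tail = moment1 - m @ (u @ (np.diag(1.0 / sigma) @ (u.T @ (m.T @ moment1))))
scaled_tail = tail / sigma_sqrt.min()

step = head + scaled_tail if sigma_sqrt.min() > eps else head
print(step.shape)   # (model_dim, 1), ready to be sliced back into each variable
```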
Exemple #47
0
 def update_loss_scale(self, if_finite_grads):
   return control_flow_ops.cond(
       if_finite_grads, lambda: state_ops.assign_add(self._loss_scale, 1),
       lambda: state_ops.assign_sub(self._loss_scale, 1))
Exemple #48
0
class INNAOptimizer(optimizer.Optimizer):
  """Optimizer that implements the INNA algorithm.
  See [Castera et al., 2019](https://arxiv.org/abs/1905.12278).
  """

  def __init__(self,
                 lr=0.01,
                 alpha=0.5,
                 beta=0.1,
                 decay=1.,
                 decaypower = 0.5,
                 speed_ini=1.0,
                 epsilon=1e-8,
                 use_locking=False,
                 name="INNA"):
   
    super(INNAOptimizer, self).__init__(use_locking,name)
    self._iterations = 0
    self._lr = lr
    self._alpha = alpha
    self._beta = beta
    self._epsilon = epsilon
    self._decay = decay
    self._decaypower = decaypower
    self._speed_ini = speed_ini

    # Tensor versions of the constructor arguments, created in _prepare().
    self._lr_t = None
    self._alpha_t = None
    self._beta_t = None
    self._epsilon_t = None
    self._decay_t = None
    self._decaypower_t = None
    self._speed_ini_t = None


  def _create_slots(self, var_list):
    # Create slots for the auxiliary variable.
    for v in var_list:
      self._zeros_slot(v, "v1", self._name)

  def _prepare(self):
    lr = self._call_if_callable(self._lr)
    alpha = self._call_if_callable(self._alpha)
    beta = self._call_if_callable(self._beta)
    epsilon = self._call_if_callable(self._epsilon)
    decay = self._call_if_callable(self._decay)
    decaypower = self._call_if_callable(self._decaypower)
    speed_ini = self._call_if_callable(self._speed_ini)
    

    self._lr_t = ops.convert_to_tensor(self._lr, name="lr")
    self._alpha_t = ops.convert_to_tensor(self._alpha, name="alpha")
    self._beta_t = ops.convert_to_tensor(self._beta, name="beta")
    self._epsilon_t = ops.convert_to_tensor(self._epsilon, name="epsilon")
    self._decay_t = ops.convert_to_tensor(self._decay, name="decay")
    self._decaypower_t = ops.convert_to_tensor(self._decaypower, name="decaypower")
    self._speed_ini_t = ops.convert_to_tensor(self._speed_ini, name="speed_ini")

  def _apply_dense(self, grad, var):
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    alpha_t = math_ops.cast(self._alpha_t, var.dtype.base_dtype)
    beta_t = math_ops.cast(self._beta_t, var.dtype.base_dtype)
    epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)
    decay_t = math_ops.cast(self._decay_t, var.dtype.base_dtype)
    decaypower_t = math_ops.cast(self._decaypower_t, var.dtype.base_dtype)
    speed_ini_t = math_ops.cast(self._speed_ini_t, var.dtype.base_dtype)
    

    v = self.get_slot(var, "v1")
    # Initialise v such that the initial speed is in the direction of -grad.
    # `cond`, `equal` and `num_iter()` are assumed to be helpers from the
    # original module (e.g. thin wrappers around tf.cond / tf.equal and an
    # iteration counter); they are not defined in this snippet.
    v_temp = cond(
        equal(num_iter(), 0),
        lambda: (1. - alpha_t * beta_t) * var - beta_t**2 * grad
                + beta_t * speed_ini_t * grad,
        lambda: v)

    # Step size: gamma_k = lr * decay / (k + 1)**decaypower
    gamma_t = (lr_t * decay_t /
               math_ops.pow(math_ops.cast(num_iter() + 1, var.dtype.base_dtype),
                            decaypower_t))

    # psi_{k+1} = psi_k - gamma_k * ((alpha - 1/beta) * theta_k + (1/beta) * psi_k)
    v_t = v.assign(
        v_temp - gamma_t * ((alpha_t - 1. / beta_t) * var + 1. / beta_t * v_temp))

    # theta_{k+1} = theta_k - gamma_k * ((alpha - 1/beta) * theta_k
    #                                    + (1/beta) * psi_k + beta * grad)
    # Update 'ref' (the variable) by subtracting 'value'.
    var_update = state_ops.assign_sub(
        var,
        gamma_t * ((alpha_t - 1. / beta_t) * var
                   + 1. / beta_t * v_temp + beta_t * grad))
    # Group both updates so the auxiliary variable is refreshed as well
    # (control_flow_ops is assumed to be imported in the original module).
    return control_flow_ops.group(*[var_update, v_t])
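Stripped of the graph plumbing, the snippet runs the INNA recurrences sketched in its comments with step size gamma_k = lr * decay / (k + 1)**decaypower. A plain NumPy rendering of one run on a toy quadratic (the loss and the hyperparameter values are illustrative only):

```python
import numpy as np

alpha, beta = 0.5, 0.1
lr, decay, decaypower, speed_ini = 0.01, 1.0, 0.5, 1.0

def grad_f(theta):               # toy quadratic loss f(theta) = 0.5 * ||theta||^2
    return theta

theta = np.array([2.0, -1.0])
for k in range(200):
    g = grad_f(theta)
    if k == 0:
        # Initialise psi so that the initial speed points along -grad.
        psi = (1.0 - alpha * beta) * theta - beta**2 * g + beta * speed_ini * g
    gamma = lr * decay / (k + 1) ** decaypower
    common = (alpha - 1.0 / beta) * theta + (1.0 / beta) * psi
    psi = psi - gamma * common                     # psi_{k+1}
    theta = theta - gamma * (common + beta * g)    # theta_{k+1}

print(theta)   # should have moved toward the minimiser at the origin
```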
Exemple #49
0
 def _resource_apply_dense(self, grad, var, apply_state=None):
     var_device, var_dtype = var.device, var.dtype.base_dtype
     coefficients = ((apply_state or {}).get((var_device, var_dtype))
                     or self._fallback_apply_state(var_device, var_dtype))
     non_zero = partial(self._non_zero,
                        epsilon=coefficients['epsilon'],
                        use_softplus=self.use_softplus,
                        beta_softplus=coefficients['beta_softplus'])
     # prepares gradient
     grad = self._gradient_clipping(grad, var, non_zero,
                                    coefficients['eps_clipping'],
                                    coefficients['threshold_clipping'])
     grad = self._gradient_normalization(grad, non_zero,
                                         self.centralize_gradients,
                                         self.normalize_gradients)
     # first moment estimation
     # using positive-negative momentum and bias correction
     prev_m = self.get_slot(var, 'prev_m')
     m = self.get_slot(var, 'm')
     m_scaled_g_values = grad * coefficients['one_minus_beta1_squared_t']
     prev_m_values = coefficients['beta1_squared'] * prev_m
     prev_m_t = state_ops.assign(prev_m, m, use_locking=self._use_locking)
     m_beta = coefficients['beta_3_t'] * m
     m_t = state_ops.assign(m,
                            prev_m_values + m_scaled_g_values,
                            use_locking=self._use_locking)
     m_ema = coefficients['one_plus_beta_3_t'] * m_t - m_beta
     m_ema_corr = m_ema / coefficients['one_minus_beta_1_power']
     # second moment estimation
     # using positive-negative momentum and bias correction
     v = self.get_slot(var, 'v')
     v_scaled_g_values = math_ops.square(
         grad) * coefficients['one_minus_beta_2_t']
     v_t = state_ops.assign(v,
                            v * coefficients['beta_2_t'] +
                            v_scaled_g_values,
                            use_locking=self._use_locking)
     v_hat = self.get_slot(var, 'vhat')
     v_hat_t = math_ops.maximum(v_hat, v_t)
     with ops.control_dependencies([v_hat_t]):
         v_hat_t = state_ops.assign(v_hat,
                                    v_hat_t,
                                    use_locking=self._use_locking)
     v_ema_hat_corr = v_hat_t / coefficients['one_minus_beta_2_power']
     # update vector
     # takes positive negative momentum into account
     denom = coefficients['pnm_noise_amplitude'] * math_ops.sqrt(
         v_ema_hat_corr)
     update = m_ema_corr / non_zero(denom)
     # weight decay
     # combining norm-loss and stable weight decay
     euclidian_norm = self._axis_aware_euclidian_norm(
         var)  # for norm-loss regularization
     effective_stepsize_inv = math_ops.sqrt(
         math_ops.reduce_mean(v_ema_hat_corr))  # for stable weight decay
     scaled_weight_decay = coefficients['weight_decay'] * (
         euclidian_norm - 1.) / non_zero(
             euclidian_norm * effective_stepsize_inv)
     update += scaled_weight_decay * var
     # applies update
     var_update = state_ops.assign_sub(
         var,
         update * coefficients['scheduled_learning_rate'],
         use_locking=self._use_locking)
     updates = [prev_m_t, m_t, v_t, v_hat_t, var_update]
     train_op = control_flow_ops.group(*updates)
     look_ahead_op = self._look_ahead(coefficients, train_op, var)
     return control_flow_ops.group(train_op, look_ahead_op)
Exemple #50
0
def assign_moving_mean_variance(
    mean_var, variance_var, value, decay, name=None):
  """Compute exponentially weighted moving {mean,variance} of a streaming value.

  The `value` updated exponentially weighted moving `mean_var` and
  `variance_var` are given by the following recurrence relations:

  ```python
  variance_var = decay * (variance_var + (1-decay) * (value - mean_var)**2)
  mean_var     = decay * mean_var + (1 - decay) * value
  ```

  Note: `mean_var` is updated *after* `variance_var`, i.e., `variance_var` uses
  the lag-1 mean.

  For derivation justification, see equation 143 of:
    T. Finch, Feb 2009. "Incremental calculation of weighted mean and variance".
    http://people.ds.cam.ac.uk/fanf2/hermes/doc/antiforgery/stats.pdf

  Args:
    mean_var: `float`-like `Variable` representing the exponentially weighted
      moving mean. Same shape as `variance_var` and `value`.
    variance_var: `float`-like `Variable` representing the
      exponentially weighted moving variance. Same shape as `mean_var` and
      `value`.
    value: `float`-like `Tensor`. Same shape as `mean_var` and `variance_var`.
    decay: A `float`-like `Tensor`. The moving mean decay. Typically close to
      `1.`, e.g., `0.999`.
    name: Optional name of the returned operation.

  Returns:
    mean_var: `Variable` representing the `value`-updated exponentially weighted
      moving mean.
    variance_var: `Variable` representing the `value`-updated
      exponentially weighted moving variance.

  Raises:
    TypeError: if `mean_var` does not have float type `dtype`.
    TypeError: if `mean_var`, `variance_var`, `value`, `decay` have different
      `base_dtype`.
  """
  with ops.name_scope(name, "assign_moving_mean_variance",
                      [variance_var, mean_var, value, decay]):
    with ops.colocate_with(variance_var):
      with ops.colocate_with(mean_var):
        base_dtype = mean_var.dtype.base_dtype
        if not base_dtype.is_floating:
          raise TypeError(
              "mean_var.base_dtype({}) does not have float type "
              "`dtype`.".format(base_dtype.name))
        if base_dtype != variance_var.dtype.base_dtype:
          raise TypeError(
              "mean_var.base_dtype({}) != variance_var.base_dtype({})".format(
                  base_dtype.name,
                  variance_var.dtype.base_dtype.name))
        value = ops.convert_to_tensor(value, dtype=base_dtype, name="value")
        decay = ops.convert_to_tensor(decay, dtype=base_dtype, name="decay")
        delta = value - mean_var
        with ops.control_dependencies([delta]):
          mean_var = state_ops.assign_add(
              mean_var,
              (1. - decay) * delta)
          variance_var = state_ops.assign_sub(
              variance_var,
              (1. - decay) * (variance_var - decay * math_ops.square(delta)))
        return mean_var, variance_var
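Because the op only encodes the two recurrences quoted in the docstring, a quick NumPy rendering shows what the pair of variables tracks for a stream of values (the decay and the distribution below are arbitrary):

```python
import numpy as np

rng = np.random.default_rng(2)
decay = 0.99
mean, variance = 0.0, 0.0    # EMAs start at zero, hence biased toward zero early on

for _ in range(5000):
    value = rng.normal(loc=3.0, scale=2.0)
    # Same order as the op: variance uses the lag-1 mean, then the mean is updated.
    variance = decay * (variance + (1.0 - decay) * (value - mean) ** 2)
    mean = decay * mean + (1.0 - decay) * value

print(mean, variance)   # roughly 3.0 and 4.0 for this stream
```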
Exemple #51
0
                               use_locking=self._use_locking)
    # gn_c = ((d/dy) * dLdy) * dydvar ** 2
    # gn_t = beta2 * gn + (1 - beta2) * (gn_c)
    dLdy = tf.gradients(self.loss_t, self.pred_t)
    sec_loss = tf.gradients(dLdy, self.pred_t)
    dydvar = tf.gradients(self.pred_t, var)
    sec_loss_t = math_ops.cast(sec_loss, var.dtype.base_dtype)
    dydvar_t = tf.gradients(dydvar, var)
    gn_c = sec_loss * dydvar * dydvar
    gn = self.get_slot(var, "gn")
    gn_scaled_g_values = (gn_c) * (1 - beta2_t)
    gn_t = state_ops.assign(gn, gn * beta2_t, use_locking=self._use_locking)
    gn_t = state_ops.scatter_add(gn_t, grad.indices, gn_scaled_g_values,
                               use_locking=self._use_locking)
    var_update = state_ops.assign_sub(var,
                                      lr * m_t / (gn_t + epsilon_t),
                                      use_locking=self._use_locking)
    return control_flow_ops.group(*[var_update, m_t, v_t])
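The fragment above estimates a diagonal Gauss-Newton style curvature: the second derivative of the loss with respect to the prediction, scaled by the squared sensitivity of the prediction to each weight, accumulated in the gn slot and used as the denominator in place of the usual squared-gradient term. A hedged NumPy illustration for a one-weight least-squares model (the model and numbers are made up; the m_t, v_t and learning-rate pieces cut off from the snippet are not reproduced):

```python
import numpy as np

# One-weight least-squares model: y = w * x, loss L = 0.5 * (y - t)^2.
x, t, w = 2.0, 1.0, 0.3
y = w * x

dLdy = y - t        # the snippet's tf.gradients(self.loss_t, self.pred_t)
d2Ldy2 = 1.0        # d^2 L / d y^2 (constant for a squared-error loss)
dydw = x            # d y / d w

# gn_c = (d^2 L / d y^2) * (d y / d w)^2 -- the diagonal Gauss-Newton term.
gn_c = d2Ldy2 * dydw * dydw

# For a model that is linear in w this equals the exact Hessian d^2 L / d w^2.
assert np.isclose(gn_c, x * x)
```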
Exemple #52
0
def _zero_debias(unbiased_var, value, decay):
    """Compute the delta required for a debiased Variable.

  All exponential moving averages initialized with Tensors are initialized to 0,
  and therefore are biased to 0. Variables initialized to 0 and used as EMAs are
  similarly biased. This function creates the debiased update amount according to
  a scale factor, as in https://arxiv.org/abs/1412.6980.

  To demonstrate the bias that results from 0-initialization, take an EMA that
  was initialized to `0` with decay `b`. After `t` timesteps of seeing the
  constant `c`, the variable has the following value:

  ```
    EMA = 0*b^(t) + c*(1 - b)*b^(t-1) + c*(1 - b)*b^(t-2) + ...
        = c*(1 - b^t)
  ```

  To have the true value `c`, we would divide by the scale factor `1 - b^t`.

  In order to perform debiasing, we use two shadow variables. One keeps track of
  the biased estimate, and the other keeps track of the number of updates that
  have occurred.

  Args:
    unbiased_var: A Variable representing the current value of the unbiased EMA.
    value: A Tensor representing the most recent value.
    decay: A Tensor representing `1-decay` for the EMA.

  Returns:
    The amount that the unbiased variable should be updated. Computing this
    tensor will also update the shadow variables appropriately.
  """
    with variable_scope.variable_scope(unbiased_var.op.name,
                                       values=[unbiased_var, value,
                                               decay]) as scope:
        with ops.colocate_with(unbiased_var):
            with ops.init_scope():
                biased_initializer = init_ops.zeros_initializer(
                    dtype=unbiased_var.dtype)(unbiased_var.get_shape())
                local_step_initializer = init_ops.zeros_initializer()

            def _maybe_get_unique(name):
                """Get name for a unique variable, if not `reuse=True`."""
                if variable_scope.get_variable_scope().reuse:
                    return name
                vs_vars = [
                    x.op.name for x in
                    variable_scope.get_variable_scope().global_variables()
                ]
                full_name = variable_scope.get_variable_scope(
                ).name + "/" + name
                if full_name not in vs_vars: return name
                idx = 1
                while full_name + ("_%d" % idx) in vs_vars:
                    idx += 1
                return name + ("_%d" % idx)

            biased_var = variable_scope.get_variable(
                _maybe_get_unique("biased"),
                initializer=biased_initializer,
                trainable=False)
            local_step = variable_scope.get_variable(
                _maybe_get_unique("local_step"),
                shape=[],
                dtype=unbiased_var.dtype,
                initializer=local_step_initializer,
                trainable=False)

            # Get an update ops for both shadow variables.
            update_biased = state_ops.assign_sub(biased_var,
                                                 (biased_var - value) * decay,
                                                 name=scope.name)
            update_local_step = local_step.assign_add(1)

            # Compute the value of the delta to update the unbiased EMA. Make sure to
            # use the new values of the biased variable and the local step.
            with ops.control_dependencies([update_biased, update_local_step]):
                # This function gets `1 - decay`, so use `1.0 - decay` in the exponent.
                unbiased_ema_delta = (
                    unbiased_var - biased_var.read_value() /
                    (1 - math_ops.pow(1.0 - decay, local_step.read_value())))

            return unbiased_ema_delta
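The docstring's claim is easy to check numerically: an EMA of a constant c that starts at zero equals c * (1 - b**t) after t steps, so dividing the biased estimate by 1 - b**t recovers c at every step. A short NumPy check (as in the function, the decay argument plays the role of 1 - b):

```python
import numpy as np

b = 0.9            # EMA decay
decay = 1.0 - b    # the function receives 1 - decay, as its docstring notes
c = 5.0            # constant stream of values

biased = 0.0
for t in range(1, 21):
    # biased := biased - decay * (biased - c), i.e. an EMA initialised at 0.
    biased -= decay * (biased - c)
    debiased = biased / (1.0 - (1.0 - decay) ** t)
    if t in (1, 5, 20):
        print(t, biased, debiased)

# The biased EMA approaches c only as b**t -> 0; the debiased value is c throughout.
assert np.isclose(debiased, c)
```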
Exemple #53
0
            def run_and_check():
                # Assign float32 values
                self.assertAllClose(3., self.evaluate(x.assign(v1)))
                self.assertAllClose(3. * 2, self.evaluate(x.assign_add(v1)))
                self.assertAllClose(3., self.evaluate(x.assign_sub(v1)))

                # Attempt to assign float16 values
                with self.assertRaisesRegex(
                        ValueError,
                        'conversion requested dtype float32 for Tensor with dtype float16'
                ):
                    self.evaluate(x.assign(v2))
                with self.assertRaisesRegex(
                        ValueError,
                        'conversion requested dtype float32 for Tensor with dtype float16'
                ):
                    self.evaluate(x.assign_add(v2))
                with self.assertRaisesRegex(
                        ValueError,
                        'conversion requested dtype float32 for Tensor with dtype float16'
                ):
                    self.evaluate(x.assign_sub(v2))

                # Assign Python floats
                self.assertAllClose(0., self.evaluate(x.assign(0.)))
                self.assertAllClose(3., self.evaluate(x.assign(3.)))
                self.assertAllClose(3. * 2, self.evaluate(x.assign_add(3.)))
                self.assertAllClose(3., self.evaluate(x.assign_sub(3.)))

                # Assign multiple times
                # This currently only works if no strategy is used
                if not ds_context.has_strategy():
                    assign = x.assign(1.)
                    self.assertAllClose(1., self.evaluate(assign))
                    self.assertAllClose(0., self.evaluate(assign.assign(0.)))
                    assign_add = x.assign_add(3.)
                    self.assertAllClose(3., self.evaluate(assign_add))
                    self.assertAllClose(
                        3. * 3, self.evaluate(x.assign_add(3.).assign_add(3.)))
                    self.assertAllClose(3. * 3, x)
                    assign_sub = x.assign_sub(3.)
                    self.assertAllClose(3. * 2, self.evaluate(assign_sub))
                    self.assertAllClose(
                        0., self.evaluate(x.assign_sub(3.).assign_sub(3.)))

                # Assign with read_value=False
                self.assertIsNone(self.evaluate(x.assign(1.,
                                                         read_value=False)))
                self.assertAllClose(1., self.evaluate(x))
                self.assertIsNone(
                    self.evaluate(x.assign_add(2., read_value=False)))
                self.assertAllClose(3., self.evaluate(x))
                self.assertIsNone(
                    self.evaluate(x.assign_sub(3., read_value=False)))
                self.assertAllClose(0., self.evaluate(x))

                # Use the tf.assign functions instead of the var.assign methods.
                self.assertAllClose(0., self.evaluate(state_ops.assign(x, 0.)))
                self.assertAllClose(3., self.evaluate(state_ops.assign(x, 3.)))
                self.assertAllClose(3. * 2,
                                    self.evaluate(state_ops.assign_add(x, 3.)))
                self.assertAllClose(3.,
                                    self.evaluate(state_ops.assign_sub(x, 3.)))
Exemple #54
0
    def _apply_sparse_shared(self, grad, var, indices, scatter_add):
        step, beta1_power, beta2_power = self._get_beta_accumulators()
        beta1_power = math_ops.cast(beta1_power, var.dtype.base_dtype)
        beta2_power = math_ops.cast(beta2_power, var.dtype.base_dtype)
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)

        if self._initial_total_steps > 0:
            total_steps = math_ops.cast(self._total_steps_t,
                                        var.dtype.base_dtype)
            warmup_proportion = math_ops.cast(self._warmup_proportion_t,
                                              var.dtype.base_dtype)
            min_lr = math_ops.cast(self._min_lr_t, var.dtype.base_dtype)
            warmup_steps = total_steps * warmup_proportion
            decay_steps = math_ops.maximum(total_steps - warmup_steps, 1)
            decay_rate = (min_lr - lr_t) / decay_steps
            lr_t = tf.where(
                step <= warmup_steps,
                lr_t * (step / warmup_steps),
                lr_t + decay_rate *
                math_ops.minimum(step - warmup_steps, decay_steps),
            )

        beta1_t = math_ops.cast(self._beta1_t, var.dtype.base_dtype)
        beta2_t = math_ops.cast(self._beta2_t, var.dtype.base_dtype)
        epsilon_t = math_ops.cast(self._epsilon_t, var.dtype.base_dtype)

        sma_inf = 2.0 / (1.0 - beta2_t) - 1.0
        sma_t = sma_inf - 2.0 * step * beta2_power / (1.0 - beta2_power)

        m = self.get_slot(var, "m")
        m_scaled_g_values = grad * (1 - beta1_t)
        m_t = state_ops.assign(m, m * beta1_t, use_locking=self._use_locking)
        with ops.control_dependencies([m_t]):
            m_t = scatter_add(m, indices, m_scaled_g_values)
        m_corr_t = m_t / (1.0 - beta1_power)

        v = self.get_slot(var, "v")
        v_scaled_g_values = (grad * grad) * (1 - beta2_t)
        v_t = state_ops.assign(v, v * beta2_t, use_locking=self._use_locking)
        with ops.control_dependencies([v_t]):
            v_t = scatter_add(v, indices, v_scaled_g_values)
        if self._amsgrad:
            vhat = self.get_slot(var, 'vhat')
            vhat_t = state_ops.assign(vhat,
                                      math_ops.maximum(vhat, v_t),
                                      use_locking=self._use_locking)
            v_corr_t = math_ops.sqrt(vhat_t / (1.0 - beta2_power))
        else:
            v_corr_t = math_ops.sqrt(v_t / (1.0 - beta2_power))

        r_t = math_ops.sqrt((sma_t - 4.0) / (sma_inf - 4.0) * (sma_t - 2.0) /
                            (sma_inf - 2.0) * sma_inf / sma_t)

        var_t = tf.where(sma_t >= 5.0, r_t * m_corr_t / (v_corr_t + epsilon_t),
                         m_corr_t)

        if self._initial_weight_decay > 0.0:
            var_t += math_ops.cast(self._weight_decay_t,
                                   var.dtype.base_dtype) * var

        var_update = state_ops.assign_sub(var,
                                          lr_t * var_t,
                                          use_locking=self._use_locking)

        updates = [var_update, m_t, v_t]
        if self._amsgrad:
            updates.append(vhat_t)
        return control_flow_ops.group(*updates)
Exemple #55
0
 def update_fn(v, update_delta):
   return state_ops.assign_sub(v, update_delta, name=scope)
Exemple #56
0
  def _apply_gradient(self, grad, var, indices=None):
    """The main function to update a variable.

    Args:
      grad: A Tensor containing gradient to apply.
      var: A Tensor containing the variable to update.
      indices: An array of integers, for sparse update.

    Returns:
      Updated variable var = var - learning_rate * preconditioner * grad

    If the gradient is dense, var and grad have the same shape.
    If the update is sparse, then the first dimension of the gradient and var
    may differ, others are all the same. In this case the indices array
    provides the set of indices of the variable which are to be updated with
    each row of the gradient.
    """
    global_step = self._global_step + 1

    # Update accumulated weighted average of gradients
    gbar = self.get_slot(var, "gbar")
    gbar_decay_t = GetParam(self._gbar_decay, global_step)
    gbar_weight_t = GetParam(self._gbar_weight, global_step)
    if indices is not None:
      # Note - the sparse update is not easily implemented, since the
      # algorithm needs all indices of gbar to be updated
      # if mat_gbar_decay != 1 or mat_gbar_decay != 0.
      # One way to make mat_gbar_decay = 1 is by rescaling.
      # If we want the update:
      #         G_{t+1} = a_{t+1} G_t + b_{t+1} w_t
      # define:
      #         r_{t+1} = a_{t+1} * r_t
      #         h_t = G_t / r_t
      # Then:
      #         h_{t+1} = h_t + (b_{t+1} / r_{t+1}) * w_t
      # So we get the mat_gbar_decay = 1 as desired.
      # We can implement this in a future version as needed.
      # However we still need gbar_decay = 0, otherwise all indices
      # of the variable will need to be updated.
      if self._gbar_decay != 0.0:
        tf_logging.warning("Not applying momentum for variable: %s" % var.name)
      gbar_updated = grad
    else:
      gbar_updated = self._weighted_average(gbar, self._gbar_decay,
                                            gbar_decay_t,
                                            gbar_weight_t * grad)

    # Update the preconditioners and compute the preconditioned gradient
    shape = var.get_shape()
    mat_g_list = []
    for i in range(len(shape)):
      mat_g_list.append(self.get_slot(var, "Gbar_" + str(i)))
    mat_gbar_decay_t = GetParam(self._mat_gbar_decay, global_step)
    mat_gbar_weight_t = GetParam(self._mat_gbar_weight, global_step)

    preconditioned_grad = gbar_updated
    v_rank = len(mat_g_list)
    neg_alpha = - GetParam(self._alpha, global_step) / v_rank
    svd_interval = GetParam(self._svd_interval, global_step)
    precond_update_interval = GetParam(self._precond_update_interval,
                                       global_step)
    for i, mat_g in enumerate(mat_g_list):
      # axes is the list of indices to reduce - everything but the current i.
      axes = list(range(i)) + list(range(i+1, v_rank))
      if shape[i] <= self._max_matrix_size:
        # If the tensor size is sufficiently small perform full Shampoo update
        # Note if precond_update_interval > 1 and mat_gbar_decay_t != 1, this
        # is not strictly correct. However we will use it for now, and
        # fix if needed. (G_1 = aG + bg ==> G_n = a^n G + (1+a+..+a^{n-1})bg)

        # pylint: disable=g-long-lambda,cell-var-from-loop
        mat_g_updated = control_flow_ops.cond(
            math_ops.mod(global_step, precond_update_interval) < 1,
            lambda: self._update_mat_g(
                mat_g, grad, axes, mat_gbar_decay_t,
                mat_gbar_weight_t * precond_update_interval, i),
            lambda: mat_g)

        if self._svd_interval == 1:
          mat_h = self._compute_power(var, mat_g_updated, shape[i], neg_alpha)
        else:
          mat_h = control_flow_ops.cond(
              math_ops.mod(global_step, svd_interval) < 1,
              lambda: self._compute_power(var, mat_g_updated, shape[i],
                                          neg_alpha, "H_" + str(i)),
              lambda: self.get_slot(var, "H_" + str(i)))

        # mat_h is a square matrix of size d_i x d_i
        # preconditioned_grad is a d_i x ... x d_n x d_0 x ... d_{i-1} tensor
        # After contraction with a d_i x d_i tensor
        # it becomes a d_{i+1} x ... x d_n x d_0 x ... d_i tensor
        # (the first dimension is contracted out, and the second dimension of
        # mat_h is appended).  After going through all the indices, it becomes
        # a d_0 x ... x d_n tensor again.
        preconditioned_grad = math_ops.tensordot(preconditioned_grad, mat_h,
                                                 axes=([0], [0]),
                                                 name="precond_" + str(i))
      else:
        # Tensor size is too large -- perform diagonal Shampoo update
        grad_outer = math_ops.reduce_sum(grad * grad, axis=axes)
        if i == 0 and indices is not None:
          assert self._mat_gbar_decay == 1.0
          mat_g_updated = state_ops.scatter_add(mat_g, indices,
                                                mat_gbar_weight_t * grad_outer)
          mat_h = math_ops.pow(
              array_ops.gather(mat_g_updated, indices) + self._epsilon,
              neg_alpha)
        else:
          mat_g_updated = self._weighted_average(mat_g,
                                                 self._mat_gbar_decay,
                                                 mat_gbar_decay_t,
                                                 mat_gbar_weight_t * grad_outer)
          mat_h = math_ops.pow(mat_g_updated + self._epsilon, neg_alpha)

        # Need to do the transpose to ensure that the tensor becomes
        # a d_{i+1} x ... x d_n x d_0 x ... d_i tensor as described above.
        preconditioned_grad = array_ops.transpose(
            preconditioned_grad, perm=list(range(1, v_rank)) + [0]) * mat_h

    # Update the variable based on the Shampoo update
    learning_rate_t = GetParam(self._learning_rate, global_step)
    if indices is not None:
      var_updated = state_ops.scatter_add(
          var, indices, -learning_rate_t * preconditioned_grad)
    else:
      var_updated = state_ops.assign_sub(var,
                                         learning_rate_t * preconditioned_grad)
    return var_updated
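The comments around the tensordot call describe a dimension-cycling bookkeeping trick: contracting the leading dimension with a d_i x d_i preconditioner and appending the result as the last dimension means that, after looping over every rank, the tensor is back in its original d_0 x ... x d_n layout with each mode preconditioned once. A small NumPy check of that bookkeeping (the matrices below are random stand-ins for the mat_h preconditioners):

```python
import numpy as np

rng = np.random.default_rng(3)
shape = (2, 3, 4)
grad = rng.normal(size=shape)
mat_h = [rng.normal(size=(d, d)) for d in shape]   # one d_i x d_i matrix per mode

pg = grad
for h in mat_h:
    # Contract the current leading dimension with h and append it at the end,
    # exactly like math_ops.tensordot(preconditioned_grad, mat_h, axes=([0], [0])).
    pg = np.tensordot(pg, h, axes=([0], [0]))

assert pg.shape == shape   # back to d_0 x d_1 x d_2 after all modes are processed

# Sanity check against the explicit mode-wise (Tucker) product via einsum.
expected = np.einsum('abc,ax,by,cz->xyz', grad, *mat_h)
assert np.allclose(pg, expected)
print(pg.shape)
```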
Exemple #57
0
 def assign_sub(self, delta, use_locking=False):
   return state_ops.assign_sub(self._variable, delta, use_locking=use_locking)