Example #1
    def _resource_apply_dense(self, grad, var):
        scaled_lr = self._get_hyper("learning_rate")
        decayed_grad = grad
        tf_logging.info("LARS: resource apply dense: %s", var.name)
        w_norm = tf.norm(var, ord=2)
        g_norm = tf.norm(grad, ord=2)
        if 'batch_normalization' not in var.name and 'bias' not in var.name:
            tf_logging.info("LARS: apply dense, decay: %s", var.name)
            trust_ratio = tf.where(
                tf.math.greater(w_norm, 0),
                tf.where(
                    tf.math.greater(g_norm, 0),
                    (self._eeta * w_norm /
                     (g_norm + self._weight_decay * w_norm + self._epsilon)),
                    1.0), 1.0)
            trust_ratio = tf.clip_by_value(trust_ratio, 0.0, 50)
            scaled_lr *= trust_ratio
            decayed_grad = grad + self._weight_decay * var

        decayed_grad = tf.clip_by_value(decayed_grad, -10.0, 10.0)
        mom = self.get_slot(var, "momentum")
        return training_ops.resource_apply_momentum(var.handle,
                                                    mom.handle,
                                                    scaled_lr,
                                                    decayed_grad,
                                                    self._momentum,
                                                    use_locking=False,
                                                    use_nesterov=False)
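
For intuition, the trust ratio above is the layer-wise scaling from the LARS paper: eeta * ||w|| / (||g|| + weight_decay * ||w|| + epsilon), falling back to 1.0 whenever either norm is zero. A minimal standalone sketch of that computation (the hyperparameter defaults are illustrative, not taken from the example):

    import tensorflow as tf

    def lars_trust_ratio(var, grad, eeta=0.001, weight_decay=1e-4, epsilon=1e-9):
        # eeta * ||w|| / (||g|| + wd * ||w|| + eps), with a 1.0 fallback
        # whenever either norm is zero, clipped as in Example #1.
        w_norm = tf.norm(var, ord=2)
        g_norm = tf.norm(grad, ord=2)
        ratio = tf.where(
            w_norm > 0,
            tf.where(g_norm > 0,
                     eeta * w_norm / (g_norm + weight_decay * w_norm + epsilon),
                     1.0),
            1.0)
        return tf.clip_by_value(ratio, 0.0, 50.0)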
Example #2
    def _resource_apply_dense(self, grad, var, apply_state=None):
        coefficients = self.update_coefficients(var, apply_state)
        eeta = coefficients['eeta']
        lr_t = coefficients['lr_t']
        epsilon = coefficients['epsilon']
        momentum = coefficients['momentum']
        weight_decay = coefficients['weight_decay']

        momentum_var = self.get_slot(var, 'momentum_var')

        compute_dtype = self.opt_dtypes[0]

        trust_ratio, grad = self.compute_trust_ratio(grad, var, eeta, epsilon,
                                                     weight_decay,
                                                     compute_dtype)
        scaled_lr = lr_t * trust_ratio

        return training_ops.resource_apply_momentum(
            var=var.handle,
            accum=momentum_var.handle,
            lr=math_ops.cast(1.0, var.dtype.base_dtype),
            grad=math_ops.cast(grad * scaled_lr, var.dtype.base_dtype),
            momentum=math_ops.cast(momentum, var.dtype.base_dtype),
            use_locking=False,
            use_nesterov=self.use_nesterov)
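
Note that Example #2 passes lr=1.0 to the kernel and folds scaled_lr into the gradient instead (several later examples do the same). Given the kernel's update accum = momentum * accum + grad_in; var -= lr_in * accum, the two placements coincide only while scaled_lr is constant: scaling the gradient bakes the current learning rate into the momentum accumulator, whereas scaling lr_in would also rescale previously accumulated gradients. A plain-Python sketch of the recurrence:

    def momentum_step(var, accum, lr, grad, momentum=0.9):
        # Mirrors resource_apply_momentum with use_nesterov=False.
        accum = momentum * accum + grad
        return var - lr * accum, accum

    # Variant A: lr folded into the gradient, kernel lr = 1.0 (as above).
    va, aa = momentum_step(1.0, 0.0, 1.0, 0.1 * 0.5)
    # Variant B: lr passed to the kernel.
    vb, ab = momentum_step(1.0, 0.0, 0.1, 0.5)
    assert va == vb  # identical on step one; they diverge once lr changes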
Example #3
 def _resource_apply_dense(self, grad, var):
   mom = self.get_slot(var, "momentum")
   return training_ops.resource_apply_momentum(
       var.handle, mom.handle,
       math_ops.cast(self._learning_rate_tensor, grad.dtype.base_dtype),
       grad,
       math_ops.cast(self._momentum_tensor, grad.dtype.base_dtype),
       use_locking=self._use_locking,
       use_nesterov=self._use_nesterov)
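
Example #3 (repeated verbatim as Example #4) is the stock TF1 MomentumOptimizer pattern: _learning_rate_tensor and _momentum_tensor are built once in _prepare and cast to the gradient dtype at apply time. A sketch of the matching _prepare, mirroring tf.compat.v1.train.MomentumOptimizer:

    def _prepare(self):
        learning_rate = self._learning_rate
        if callable(learning_rate):
            learning_rate = learning_rate()
        self._learning_rate_tensor = ops.convert_to_tensor(
            learning_rate, name="learning_rate")
        momentum = self._momentum
        if callable(momentum):
            momentum = momentum()
        self._momentum_tensor = ops.convert_to_tensor(momentum, name="momentum")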
Example #4
 def _resource_apply_dense(self, grad, var):
   mom = self.get_slot(var, "momentum")
   return training_ops.resource_apply_momentum(
       var.handle, mom.handle,
       math_ops.cast(self._learning_rate_tensor, grad.dtype.base_dtype),
       grad,
       math_ops.cast(self._momentum_tensor, grad.dtype.base_dtype),
       use_locking=self._use_locking,
       use_nesterov=self._use_nesterov)
Example #5
 def _resource_apply_dense(self, grad, var, state):
     mom = state.get_slot(var, "momentum")
     return training_ops.resource_apply_momentum(
         var.handle,
         mom.handle,
         state.get_hyper("learning_rate", var.dtype.base_dtype),
         grad,
         state.get_hyper("momentum", var.dtype.base_dtype),
         use_locking=self._use_locking,
         use_nesterov=self._use_nesterov)
Example #6
 def _resource_apply_dense(self, grad, var, state):
   mom = state.get_slot(var, "momentum")
   return training_ops.resource_apply_momentum(
       var.handle,
       mom.handle,
       state.get_hyper("learning_rate", var.dtype.base_dtype),
       grad,
       state.get_hyper("momentum", var.dtype.base_dtype),
       use_locking=self._use_locking,
       use_nesterov=self._use_nesterov)
Example #7
 def _resource_apply_dense(self, grad, var):
     scaled_lr, grad = self.compute_lr(grad, var)
     mom = self.get_slot(var, "momentum")
     return training_ops.resource_apply_momentum(
         var.handle,
         mom.handle,
         math_ops.cast(1.0, var.dtype.base_dtype),
         grad * scaled_lr,
         self._momentum,
         use_locking=False,
         use_nesterov=self._use_nesterov)
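
Examples #7 through #11 call a compute_lr helper that is not shown. Judging from Example #1's inline logic it computes the LARS trust ratio and weight-decayed gradient per variable, but the examples disagree on its signature (Example #7 unpacks a (scaled_lr, grad) pair, Example #8 uses a single return value), so the reconstruction below is an assumption, not the original source (assumes import tensorflow as tf):

    def compute_lr(self, grad, var):
        # Hypothetical reconstruction mirroring the decay branch of Example #1.
        scaled_lr = self._learning_rate
        if 'batch_normalization' not in var.name and 'bias' not in var.name:
            w_norm = tf.norm(var, ord=2)
            g_norm = tf.norm(grad, ord=2)
            trust_ratio = tf.where(
                tf.math.greater(w_norm, 0),
                tf.where(
                    tf.math.greater(g_norm, 0),
                    self._eeta * w_norm /
                    (g_norm + self._weight_decay * w_norm + self._epsilon),
                    1.0),
                1.0)
            scaled_lr = self._learning_rate * trust_ratio
            grad = grad + self._weight_decay * var
        return scaled_lr, grad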
Example #8
 def _resource_apply_dense(self, grad, var):
     scaled_lr = self.compute_lr(grad, var)
     mom = self.get_slot(var, "momentum")
     return training_ops.resource_apply_momentum(
         var.handle,
         mom.handle,
         scaled_lr,
         grad,
         self._momentum,
         use_locking=False,
         use_nesterov=self._use_nesterov)
Example #9
 def _resource_apply_dense(self, grad, var):
   scaled_lr = self.compute_lr(grad, var)
   mom = self.get_slot(var, "momentum")
   return training_ops.resource_apply_momentum(
       var.handle,
       mom.handle,
       scaled_lr,
       grad,
       self._momentum,
       use_locking=False,
       use_nesterov=self._use_nesterov)
Example #10
 def _resource_apply_dense(self, grad, var):
     dtype = var.dtype.base_dtype
     scaled_lr, grad = self.compute_lr(grad, var)
     mom = self.get_slot(var, 'momentum')
     momentum = self._get_hyper('momentum', dtype)
     return training_ops.resource_apply_momentum(
         var.handle,
         mom.handle,
         tf.cast(1, dtype),
         grad * scaled_lr,
         momentum,
         use_locking=self._use_locking,
         use_nesterov=self._use_nesterov)
Example #11
 def _resource_apply_dense(self, grad, var):
     scaled_lr, grad = self.compute_lr(grad, var)
     mom = self.get_slot(var, "momentum")
     momentum = self._serialize_hyperparameter("momentum")
     use_nesterov = bool(self._serialize_hyperparameter("use_nesterov"))
     return training_ops.resource_apply_momentum(
         var.handle,
         mom.handle,
         math_ops.cast(1.0, var.dtype.base_dtype),
         grad * scaled_lr,
         momentum,
         use_locking=False,
         use_nesterov=use_nesterov,
     )
Example #12
 def _resource_apply_sparse(self, grad, var, indices):
     logging.info('fallback to momentum optimizer for sparse tensors')
     dtype = var.dtype.base_dtype
     learning_rate = self._get_hyper('learning_rate', dtype)
     mom = self.get_slot(var, 'momentum')
     momentum = self._get_hyper('momentum', dtype)
      return training_ops.resource_sparse_apply_momentum(
         var.handle,
         mom.handle,
         learning_rate,
         grad,
         indices,
         momentum,
         use_locking=self._use_locking,
         use_nesterov=self._use_nesterov)
Example #13
    def _resource_apply_dense_without_v(self, var, m, v, beta1_power, beta1,
                                        beta2, grad):
        lr_t = math_ops.cast(self._lr_t, grad.dtype.base_dtype)
        update_v = v.assign_sub((1 - beta2) * (v - tf.square(grad)),
                                use_locking=self._use_locking)

        var_update = training_ops.resource_apply_momentum(
            var.handle,
            m.handle,
            lr_t / (1 - beta1_power),  # bias-corrected step size, as in Adam
            grad * (1 - beta1),  # so the accumulator tracks Adam's first moment
            beta1,
            use_locking=self._use_locking,
        )
        return control_flow_ops.group(var_update, update_v)
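
Example #13 reuses the momentum kernel to realize Adam's first-moment update: the kernel computes m <- beta1 * m + grad_in and var -= lr_in * m, so feeding grad_in = (1 - beta1) * grad turns m into Adam's exponential moving average m_t, and lr_in = lr_t / (1 - beta1_power) supplies the bias correction. A plain-Python restatement of that substitution:

    def adam_m_via_momentum(m, grad, lr, beta1, beta1_power):
        # Momentum kernel with grad_in = (1 - beta1) * grad and
        # lr_in = lr / (1 - beta1_power):
        m = beta1 * m + (1 - beta1) * grad   # Adam's biased first moment m_t
        step = (lr / (1 - beta1_power)) * m  # equals lr * m_hat after correction
        return m, step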
Example #14
 def _resource_apply_dense(self, grad, var):
   var_dtype = var.dtype.base_dtype
   lr_t = self._decayed_lr(var_dtype)
   if self._momentum:
     momentum_var = self.get_slot(var, "momentum")
     return training_ops.resource_apply_momentum(
         var.handle,
         momentum_var.handle,
         lr_t,
         grad,
         self._get_hyper("momentum", var_dtype),
         use_locking=self._use_locking,
         use_nesterov=self._nesterov)
   else:
     return training_ops.resource_apply_gradient_descent(
         var.handle, lr_t, grad, use_locking=self._use_locking)
Example #15
 def _resource_apply_dense(self, grad, var):
     var_dtype = var.dtype.base_dtype
     lr_t = self._decayed_lr(var_dtype)
     if self._momentum:
         momentum_var = self.get_slot(var, "momentum")
         return training_ops.resource_apply_momentum(
             var.handle,
             momentum_var.handle,
             lr_t,
             grad,
             self._get_hyper("momentum", var_dtype),
             use_locking=self._use_locking,
             use_nesterov=self._nesterov)
     else:
         return training_ops.resource_apply_gradient_descent(
             var.handle, lr_t, grad, use_locking=self._use_locking)
Example #16
  def _resource_apply_dense(self, grad, var, apply_state=None):
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))

    if self._momentum:
      momentum_var = self.get_slot(var, "momentum")
      return training_ops.resource_apply_momentum(
          var.handle,
          momentum_var.handle,
          coefficients["lr_t"],
          grad,
          coefficients["momentum"],
          use_locking=self._use_locking,
          use_nesterov=self.nesterov)
    else:
      return training_ops.resource_apply_gradient_descent(
          var.handle, coefficients["lr_t"], grad, use_locking=self._use_locking)
Example #17
 def _resource_apply_dense(self, grad, var):
   learning_rate = self._get_hyper("learning_rate")
   if self._momentum:
     momentum_var = self.get_slot(var, "momentum")
     return training_ops.resource_apply_momentum(
         var.handle,
         momentum_var.handle,
         math_ops.cast(learning_rate, grad.dtype.base_dtype),
         grad,
         math_ops.cast(self._get_hyper("momentum"), grad.dtype.base_dtype),
         use_locking=self._use_locking,
         use_nesterov=self._nesterov)
   else:
     return training_ops.resource_apply_gradient_descent(
         var.handle,
         math_ops.cast(learning_rate, grad.dtype.base_dtype),
         grad,
         use_locking=self._use_locking)
Example #18
 def _resource_apply_dense(self, grad, var):
     learning_rate = self._get_hyper("learning_rate")
     if self._momentum:
         momentum_var = self.get_slot(var, "momentum")
         return training_ops.resource_apply_momentum(
             var.handle,
             momentum_var.handle,
             math_ops.cast(learning_rate, grad.dtype.base_dtype),
             grad,
             math_ops.cast(self._get_hyper("momentum"),
                           grad.dtype.base_dtype),
             use_locking=self._use_locking,
             use_nesterov=self._nesterov)
     else:
         return training_ops.resource_apply_gradient_descent(
             var.handle,
             math_ops.cast(learning_rate, grad.dtype.base_dtype),
             grad,
             use_locking=self._use_locking)
Example #19
    def _resource_apply_dense(self, grad, var):
        momentum_buffer = self.get_slot(var, "momentum")
        learning_rate = math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype)
        momentum = math_ops.cast(self._momentum_tensor, var.dtype.base_dtype)
        nu = math_ops.cast(self._nu_tensor, var.dtype.base_dtype)

        momentum_op = training_ops.resource_apply_momentum(
            var.handle,
            momentum_buffer.handle,
            nu * (1.0 - momentum) * learning_rate,
            grad,
            momentum,
            use_locking=self._use_locking,
            use_nesterov=False,
        )

        with ops.control_dependencies([momentum_op]):
            gd_op = training_ops.resource_apply_gradient_descent(
                var.handle, (1.0 - nu) * learning_rate, grad, use_locking=self._use_locking
            )

        return control_flow_ops.group(momentum_op, gd_op)
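
Example #19 chains the two kernels into a quasi-hyperbolic-momentum-style update: nu interpolates between plain gradient descent (nu = 0) and normalized momentum SGD (nu = 1), with the momentum step applied at rate nu * (1 - momentum) * lr and the gradient step at (1 - nu) * lr. Collapsed into a single step:

    def qhm_step(var, buf, grad, lr, momentum, nu):
        # Single-step equivalent of the chained momentum + gradient-descent ops.
        buf = momentum * buf + grad
        update = nu * (1.0 - momentum) * buf + (1.0 - nu) * grad
        return var - lr * update, buf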