Ejemplo n.º 1
0
    def _apply_dense(self, grad, var):
        """LARS-style dense update: scale the learning rate by a per-layer
        trust ratio, apply weight decay to the gradient, then take a fused
        momentum step."""
        lr = self._get_hyper("learning_rate")
        update_grad = grad
        tf_logging.info("LARS: apply dense: %s", var.name)
        # Batch-norm parameters and biases are excluded from both the
        # trust-ratio scaling and the weight decay.
        excluded = ('batch_normalization' in var.name
                    or 'bias' in var.name)
        if not excluded:
            tf_logging.info("LARS: apply dense, decay: %s", var.name)
            weight_norm = tf.norm(var, ord=2)
            grad_norm = tf.norm(grad, ord=2)
            # trust ratio = eeta * ||w|| / (||g|| + wd * ||w|| + eps),
            # falling back to 1.0 whenever either norm is zero.
            ratio_if_w_nonzero = tf.where(
                tf.math.greater(grad_norm, 0),
                (self._eeta * weight_norm /
                 (grad_norm + self._weight_decay * weight_norm +
                  self._epsilon)),
                1.0)
            trust_ratio = tf.where(
                tf.math.greater(weight_norm, 0), ratio_if_w_nonzero, 1.0)
            trust_ratio = tf.clip_by_value(trust_ratio, 0.0, 50)
            lr *= trust_ratio
            update_grad = grad + self._weight_decay * var

        # Hard-clip the (possibly decayed) gradient for stability.
        update_grad = tf.clip_by_value(update_grad, -10.0, 10.0)
        momentum_slot = self.get_slot(var, "momentum")
        return training_ops.apply_momentum(var,
                                           momentum_slot,
                                           lr,
                                           update_grad,
                                           self._momentum,
                                           use_locking=False,
                                           use_nesterov=False)
Ejemplo n.º 2
0
 def _apply_dense(self, grad, var):
   """Dense momentum update via the fused kernel; returns the op."""
   momentum_slot = self.get_slot(var, "momentum")
   dtype = var.dtype.base_dtype
   lr = math_ops.cast(self._learning_rate_tensor, dtype)
   momentum = math_ops.cast(self._momentum_tensor, dtype)
   update = training_ops.apply_momentum(
       var, momentum_slot, lr, grad, momentum,
       use_locking=self._use_locking)
   return update.op
Ejemplo n.º 3
0
 def _apply_dense(self, grad, var):
     """Apply the classic momentum update to a dense variable."""
     accumulator = self.get_slot(var, "momentum")
     result = training_ops.apply_momentum(
         var,
         accumulator,
         self._learning_rate_tensor,
         grad,
         self._momentum_tensor,
         use_locking=self._use_locking)
     return result.op
Ejemplo n.º 4
0
 def _apply_dense(self, grad, var):
   """Dense momentum step; hyperparameters are cast to the var dtype."""
   compute_dtype = var.dtype.base_dtype
   return training_ops.apply_momentum(
       var,
       self.get_slot(var, "momentum"),
       math_ops.cast(self._learning_rate_tensor, compute_dtype),
       grad,
       math_ops.cast(self._momentum_tensor, compute_dtype),
       use_locking=self._use_locking).op
Ejemplo n.º 5
0
 def _apply_dense(self, grad, var, state):
     """Momentum update reading hyperparameters from optimizer `state`."""
     dtype = var.dtype.base_dtype
     lr = state.get_hyper("learning_rate", dtype)
     momentum = state.get_hyper("momentum", dtype)
     accumulator = state.get_slot(var, "momentum")
     update = training_ops.apply_momentum(var,
                                          accumulator,
                                          lr,
                                          grad,
                                          momentum,
                                          use_locking=self._use_locking,
                                          use_nesterov=self._use_nesterov)
     return update.op
Ejemplo n.º 6
0
 def _apply_dense(self, grad, var):
     """Momentum step using a buffer looked up from the `_zdic` mapping."""
     dtype = var.dtype.base_dtype
     accumulator = self._zdic[var.name]  # momentum buffer keyed by var name
     return training_ops.apply_momentum(
         var,
         accumulator,
         math_ops.cast(self._learning_rate_tensor, dtype),
         grad,
         math_ops.cast(self._momentum_tensor, dtype),
         use_locking=self._use_locking,
         use_nesterov=False).op
Ejemplo n.º 7
0
 def _apply_dense(self, grad, var):
     """Dense update with a per-variable learning rate from compute_lr."""
     per_var_lr = self.compute_lr(grad, var)
     accumulator = self.get_slot(var, "momentum")
     return training_ops.apply_momentum(
         var,
         accumulator,
         per_var_lr,
         grad,
         self._momentum,
         use_locking=False,
         use_nesterov=self._use_nesterov)
Ejemplo n.º 8
0
 def _apply_dense(self, grad, var, state):
   """Fused momentum update; hypers come from `state` in the var dtype."""
   dtype = var.dtype.base_dtype
   return training_ops.apply_momentum(
       var,
       state.get_slot(var, "momentum"),
       state.get_hyper("learning_rate", dtype),
       grad,
       state.get_hyper("momentum", dtype),
       use_locking=self._use_locking,
       use_nesterov=self._use_nesterov).op
Ejemplo n.º 9
0
 def _apply_dense(self, grad, var):
   """Scale the LR per variable, then take a fused momentum step."""
   lr = self.compute_lr(grad, var)
   buf = self.get_slot(var, "momentum")
   return training_ops.apply_momentum(
       var, buf, lr, grad, self._momentum,
       use_locking=False, use_nesterov=self._use_nesterov)
Ejemplo n.º 10
0
 def _apply_dense(self, grad, var):
     """Momentum step with the learning rate folded into the gradient.

     compute_lr returns both a scale and a (possibly transformed)
     gradient; the fused op is given lr=1 and the pre-scaled gradient.
     """
     scale, new_grad = self.compute_lr(grad, var)
     accumulator = self.get_slot(var, "momentum")
     unit_lr = math_ops.cast(1.0, var.dtype.base_dtype)
     return training_ops.apply_momentum(var,
                                        accumulator,
                                        unit_lr,
                                        new_grad * scale,
                                        self._momentum,
                                        use_locking=False,
                                        use_nesterov=self._use_nesterov)
Ejemplo n.º 11
0
 def momentum_apply_dense(self, grad, var):
     """Momentum branch of the optimizer: fused momentum update op."""
     dtype = var.dtype.base_dtype
     accumulator = self.get_slot(var, "m")
     return training_ops.apply_momentum(
         var,
         accumulator,
         math_ops.cast(self._lr_t, dtype),
         grad,
         math_ops.cast(self._beta1_t, dtype),  # beta1 plays the momentum role
         use_locking=self._use_locking,
         use_nesterov=self._use_nesterov).op
Ejemplo n.º 12
0
 def _apply_dense(self, grad, var):
     """Momentum update with per-variable hyperparameters."""
     velocity = self.get_slot(var, "velocity")
     lr, momentum, locking, nesterov = self._params_for_var(var)
     dtype = var.dtype.base_dtype
     update = training_ops.apply_momentum(var,
                                          velocity,
                                          math_ops.cast(lr, dtype),
                                          grad,
                                          math_ops.cast(momentum, dtype),
                                          use_locking=locking,
                                          use_nesterov=nesterov)
     return update.op
 def _apply_dense(self, grad, var):
   """Momentum update followed by a soft-thresholding pass.

   NOTE(review): `apply_momentum(...).op` yields a tf.Operation, not a
   tensor, yet it is handed to `self.soft_tfresholding` — confirm that
   helper accepts an Operation, or whether `.op` should be dropped.
   NOTE(review): `soft_tfresholding` looks like a typo for
   `soft_thresholding`; verify against the method's actual definition.
   `THRESHOLD` is presumably a module-level constant — confirm it is
   defined where this method lives.
   """
   mom = self.get_slot(var, "momentum")
   apply_momentum_foo = training_ops.apply_momentum(
       var, mom,
       math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
       grad,
       math_ops.cast(self._momentum_tensor, var.dtype.base_dtype),
       use_locking=self._use_locking,
       use_nesterov=self._use_nesterov).op
   # print('mom', apply_momentum_foo.get_shape())
   return self.soft_tfresholding(apply_momentum_foo, THRESHOLD)
Ejemplo n.º 14
0
 def _apply_dense(self, grad, var):
     """Momentum step with the LR premultiplied into the gradient."""
     scale, new_grad = self.compute_lr(grad, var)
     accumulator = self.get_slot(var, "momentum")
     # Hyperparameters are fetched as plain Python values.
     momentum_value = self._serialize_hyperparameter("momentum")
     nesterov = bool(self._serialize_hyperparameter("use_nesterov"))
     unit_lr = math_ops.cast(1.0, var.dtype.base_dtype)
     return training_ops.apply_momentum(
         var,
         accumulator,
         unit_lr,  # lr of 1: the real rate is already inside the gradient
         new_grad * scale,
         momentum_value,
         use_locking=False,
         use_nesterov=nesterov,
     )
Ejemplo n.º 15
0
    def _apply_dense_without_v(self, var, m, v, beta1_power, beta1, beta2,
                               grad):
        """Adam-style update expressed through the momentum kernel.

        The second-moment slot `v` is refreshed with an EMA step while the
        variable is advanced by apply_momentum, whose lr and gradient are
        rescaled to match the Adam formula.
        """
        lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
        # v <- v - (1 - beta2) * (v - grad^2): EMA of squared gradients.
        v_update = v.assign_sub((1 - beta2) * (v - tf.square(grad)),
                                use_locking=self._use_locking)

        # Bias-corrected lr and (1 - beta1)-scaled gradient reproduce the
        # Adam first-moment recursion inside apply_momentum.
        momentum_update = training_ops.apply_momentum(
            var,
            m,
            lr_t / (1 - beta1_power),  # to adapt adam formula
            grad * (1 - beta1),  # to adapt adam formula
            beta1,
            use_locking=self._use_locking,
        ).op
        return control_flow_ops.group(momentum_update, v_update)
Ejemplo n.º 16
0
  def _apply_dense(self, grad, var, apply_state=None):
    """Dense update: per-variable scaled LR folded into the gradient."""
    var_device = var.device
    var_dtype = var.dtype.base_dtype
    state = apply_state or {}
    # Falsy/missing cached coefficients fall back to a fresh computation.
    coefficients = (state.get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))

    scale, new_grad = self.compute_lr(grad, var, coefficients)
    buf = self.get_slot(var, "momentum")
    return training_ops.apply_momentum(
        var,
        buf,
        tf.cast(1.0, var.dtype.base_dtype),  # real LR baked into gradient
        new_grad * scale,
        self.momentum,
        use_locking=False,
        use_nesterov=self.use_nesterov)
Ejemplo n.º 17
0
    def _apply_dense(self, grad, var):
        """Dense update blending a momentum step with a plain SGD step.

        The two components are weighted by `nu`: the momentum op uses
        nu * (1 - momentum) * lr and the gradient-descent op (1 - nu) * lr,
        with the SGD step forced to run after the momentum step.
        """
        dtype = var.dtype.base_dtype
        buf = self.get_slot(var, "momentum")
        lr = math_ops.cast(self._learning_rate_tensor, dtype)
        momentum = math_ops.cast(self._momentum_tensor, dtype)
        nu = math_ops.cast(self._nu_tensor, dtype)

        # Momentum component.
        momentum_op = training_ops.apply_momentum(
            var,
            buf,
            nu * (1.0 - momentum) * lr,
            grad,
            momentum,
            use_locking=self._use_locking,
            use_nesterov=False,
        ).op

        # Plain gradient-descent component, ordered after the momentum op.
        with ops.control_dependencies([momentum_op]):
            gd_op = training_ops.apply_gradient_descent(
                var,
                (1.0 - nu) * lr,
                grad,
                use_locking=self._use_locking,
            ).op

        return control_flow_ops.group(momentum_op, gd_op)
Ejemplo n.º 18
0
 def _apply_dense(self, grad, var):
   mom = self.get_slot(var, "momentum")
   return training_ops.apply_momentum(
       var, mom,
       self._learning_rate_tensor, grad, self._momentum_tensor,
       use_locking=self._use_locking).op