def _apply_dense(self, grad, var):
    """Apply a LARS (Layer-wise Adaptive Rate Scaling) momentum update to `var`.

    For variables that are not batch-norm parameters or biases (matched by
    substring on the variable name), the learning rate is scaled by a
    per-layer trust ratio derived from the weight and gradient norms, and
    weight decay is folded into the gradient. Other variables get a plain
    momentum update with the unscaled learning rate.

    Args:
      grad: gradient tensor for `var`.
      var: variable to update.

    Returns:
      The op returned by `training_ops.apply_momentum`.
    """
    scaled_lr = self._get_hyper("learning_rate")
    decayed_grad = grad
    tf_logging.info("LARS: apply dense: %s", var.name)
    # Skip LARS scaling for normalization parameters and biases, matching
    # the usual LARS recipe (these layers use the raw learning rate).
    if 'batch_normalization' not in var.name and 'bias' not in var.name:
        tf_logging.info("LARS: apply dense, decay: %s", var.name)
        w_norm = tf.norm(var, ord=2)
        g_norm = tf.norm(grad, ord=2)
        # trust_ratio = eeta * ||w|| / (||g|| + wd * ||w|| + eps); fall back
        # to 1.0 whenever either norm is zero (fresh or dead parameters).
        trust_ratio = tf.where(
            tf.math.greater(w_norm, 0),
            tf.where(
                tf.math.greater(g_norm, 0),
                (self._eeta * w_norm /
                 (g_norm + self._weight_decay * w_norm + self._epsilon)),
                1.0),
            1.0)
        # Clamp the ratio to guard against extreme layer-wise scaling.
        trust_ratio = tf.clip_by_value(trust_ratio, 0.0, 50.0)
        scaled_lr *= trust_ratio
        # Decoupled weight decay folded into the gradient, then clipped
        # element-wise for stability.
        decayed_grad = grad + self._weight_decay * var
        decayed_grad = tf.clip_by_value(decayed_grad, -10.0, 10.0)
    mom = self.get_slot(var, "momentum")
    return training_ops.apply_momentum(var, mom, scaled_lr, decayed_grad,
                                       self._momentum,
                                       use_locking=False,
                                       use_nesterov=False)
def _apply_dense(self, grad, var):
    """Run the fused momentum update op for a dense gradient on `var`."""
    dtype = var.dtype.base_dtype
    # Hyperparameter tensors are cast to the variable's base dtype so the
    # fused kernel sees matching types (e.g. for float16 variables).
    lr = math_ops.cast(self._learning_rate_tensor, dtype)
    momentum = math_ops.cast(self._momentum_tensor, dtype)
    accumulator = self.get_slot(var, "momentum")
    update = training_ops.apply_momentum(
        var, accumulator, lr, grad, momentum,
        use_locking=self._use_locking)
    return update.op
def _apply_dense(self, grad, var):
    """Apply the fused momentum kernel to `var` using its momentum slot.

    NOTE(review): unlike sibling implementations, the learning-rate and
    momentum tensors are passed without casting to `var.dtype.base_dtype`;
    confirm all variables share the hyperparameter dtype.
    """
    accumulator = self.get_slot(var, "momentum")
    result = training_ops.apply_momentum(
        var,
        accumulator,
        self._learning_rate_tensor,
        grad,
        self._momentum_tensor,
        use_locking=self._use_locking)
    return result.op
def _apply_dense(self, grad, var):
    """Dense momentum step: var -= lr * (grad + momentum * accum) via fused op."""
    base = var.dtype.base_dtype
    slot = self.get_slot(var, "momentum")
    # Cast hyperparameters once so the kernel receives tensors matching
    # the variable's dtype.
    return training_ops.apply_momentum(
        var,
        slot,
        math_ops.cast(self._learning_rate_tensor, base),
        grad,
        math_ops.cast(self._momentum_tensor, base),
        use_locking=self._use_locking).op
def _apply_dense(self, grad, var, state):
    """Dense momentum update reading slots and hypers from `state`."""
    dtype = var.dtype.base_dtype
    # `state` supplies both the per-variable slot and dtype-cast hypers.
    accumulator = state.get_slot(var, "momentum")
    lr = state.get_hyper("learning_rate", dtype)
    momentum = state.get_hyper("momentum", dtype)
    return training_ops.apply_momentum(
        var, accumulator, lr, grad, momentum,
        use_locking=self._use_locking,
        use_nesterov=self._use_nesterov).op
def _apply_dense(self, grad, var):
    """Fused momentum update using the accumulator stored in `self._zdic`.

    NOTE(review): the accumulator is looked up by `var.name` in `_zdic`
    rather than through the optimizer slot mechanism — presumably populated
    elsewhere in this class; verify it exists for every trainable variable.
    """
    base = var.dtype.base_dtype
    accumulator = self._zdic[var.name]
    op = training_ops.apply_momentum(
        var,
        accumulator,
        math_ops.cast(self._learning_rate_tensor, base),
        grad,
        math_ops.cast(self._momentum_tensor, base),
        use_locking=self._use_locking,
        use_nesterov=False)
    return op.op
def _apply_dense(self, grad, var):
    """Momentum update with a per-variable learning rate from `compute_lr`."""
    # compute_lr presumably applies layer-wise scaling (e.g. LARS-style);
    # its result replaces the global learning rate for this variable.
    effective_lr = self.compute_lr(grad, var)
    accumulator = self.get_slot(var, "momentum")
    return training_ops.apply_momentum(
        var,
        accumulator,
        effective_lr,
        grad,
        self._momentum,
        use_locking=False,
        use_nesterov=self._use_nesterov)
def _apply_dense(self, grad, var, state):
    """Apply a (possibly Nesterov) momentum step; hypers come from `state`."""
    slot = state.get_slot(var, "momentum")
    update = training_ops.apply_momentum(
        var,
        slot,
        state.get_hyper("learning_rate", var.dtype.base_dtype),
        grad,
        state.get_hyper("momentum", var.dtype.base_dtype),
        use_locking=self._use_locking,
        use_nesterov=self._use_nesterov)
    return update.op
def _apply_dense(self, grad, var):
    """Dense update: momentum step driven by a `compute_lr`-scaled rate."""
    lr_for_var = self.compute_lr(grad, var)
    momentum_slot = self.get_slot(var, "momentum")
    return training_ops.apply_momentum(
        var, momentum_slot,
        lr_for_var,
        grad,
        self._momentum,
        use_locking=False,
        use_nesterov=self._use_nesterov)
def _apply_dense(self, grad, var):
    """Momentum update with the scaled lr folded into the gradient.

    `compute_lr` returns both a scale factor and a (possibly transformed)
    gradient; the kernel is then invoked with lr == 1 and `grad * scale`,
    which is algebraically the same momentum step.
    """
    scale, adjusted_grad = self.compute_lr(grad, var)
    slot = self.get_slot(var, "momentum")
    unit_lr = math_ops.cast(1.0, var.dtype.base_dtype)
    return training_ops.apply_momentum(
        var,
        slot,
        unit_lr,
        adjusted_grad * scale,
        self._momentum,
        use_locking=False,
        use_nesterov=self._use_nesterov)
def momentum_apply_dense(self, grad, var):
    """Dense momentum branch of this optimizer (slot "m", beta1 as momentum).

    Args:
      grad: gradient tensor for `var`.
      var: variable to update.

    Returns:
      The `Operation` produced by the fused momentum kernel.
    """
    mom = self.get_slot(var, "m")
    # beta1 doubles as the momentum coefficient in this branch.
    return training_ops.apply_momentum(
        var, mom,
        math_ops.cast(self._lr_t, var.dtype.base_dtype),
        grad,
        math_ops.cast(self._beta1_t, var.dtype.base_dtype),
        use_locking=self._use_locking,
        use_nesterov=self._use_nesterov).op
def _apply_dense(self, grad, var):
    """Momentum update whose hyperparameters are resolved per-variable."""
    velocity = self.get_slot(var, "velocity")
    # _params_for_var presumably returns the (lr, momentum, use_locking,
    # use_nesterov) tuple configured for this specific variable.
    learning_rate, momentum_coef, locking, nesterov = self._params_for_var(var)
    dtype = var.dtype.base_dtype
    result = training_ops.apply_momentum(
        var,
        velocity,
        math_ops.cast(learning_rate, dtype),
        grad,
        math_ops.cast(momentum_coef, dtype),
        use_locking=locking,
        use_nesterov=nesterov)
    return result.op
def _apply_dense(self, grad, var):
    """Momentum update on `var` followed by a soft-thresholding pass.

    NOTE(review): `.op` extracts the `Operation`, not the updated-variable
    tensor, yet it is then passed to `self.soft_tfresholding` as data (the
    commented-out debug line even queried `get_shape()` on it) — confirm
    the helper really expects an Operation rather than the output tensor.
    `THRESHOLD` is presumably a module-level constant; verify it is defined.
    """
    mom = self.get_slot(var, "momentum")
    apply_momentum_foo = training_ops.apply_momentum(
        var, mom,
        math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype),
        grad,
        math_ops.cast(self._momentum_tensor, var.dtype.base_dtype),
        use_locking=self._use_locking,
        use_nesterov=self._use_nesterov).op
    # print('mom', apply_momentum_foo.get_shape())
    return self.soft_tfresholding(apply_momentum_foo, THRESHOLD)
def _apply_dense(self, grad, var):
    """Momentum step with the computed lr pre-multiplied into the gradient.

    NOTE(review): momentum and use_nesterov are fetched through
    `_serialize_hyperparameter`, which presumably yields plain Python
    values — confirm that is intentional versus using tensors directly.
    """
    lr_scale, transformed_grad = self.compute_lr(grad, var)
    momentum_slot = self.get_slot(var, "momentum")
    momentum_value = self._serialize_hyperparameter("momentum")
    nesterov_flag = bool(self._serialize_hyperparameter("use_nesterov"))
    # lr is fixed at 1.0 because the scale is already folded into the grad.
    return training_ops.apply_momentum(
        var,
        momentum_slot,
        math_ops.cast(1.0, var.dtype.base_dtype),
        transformed_grad * lr_scale,
        momentum_value,
        use_locking=False,
        use_nesterov=nesterov_flag,
    )
def _apply_dense_without_v(self, var, m, v, beta1_power, beta1, beta2, grad):
    """Adam-style dense update expressed through the fused momentum kernel.

    The second-moment slot `v` is updated in place toward grad^2 (standard
    Adam EMA: v <- beta2*v + (1-beta2)*grad^2, written as an assign_sub),
    but — per the method name — `v` is NOT used in this variable update.
    The first-moment step is mapped onto `apply_momentum` by rescaling:
    lr/(1-beta1_power) provides bias correction and grad*(1-beta1) with
    momentum=beta1 reproduces Adam's m accumulation.

    Args:
      var: variable to update.
      m: first-moment slot variable.
      v: second-moment slot variable (updated but not consumed here).
      beta1_power: beta1**t tensor for bias correction.
      beta1: first-moment decay rate.
      beta2: second-moment decay rate.
      grad: gradient tensor for `var`.

    Returns:
      A grouped op running both the variable update and the v update.
    """
    lr_t = math_ops.cast(self._lr_t, var.dtype.base_dtype)
    # v <- v - (1-beta2)*(v - grad^2)  ==  beta2*v + (1-beta2)*grad^2
    update_v = v.assign_sub((1 - beta2) * (v - tf.square(grad)), use_locking=self._use_locking)
    var_update = training_ops.apply_momentum(
        var, m,
        lr_t / (1 - beta1_power),  # to adapt adam formula
        grad * (1 - beta1),  # to adapt adam formula
        beta1,
        use_locking=self._use_locking,
    ).op
    # No control dependency is needed: var_update does not read v.
    return control_flow_ops.group(var_update, update_v)
def _apply_dense(self, grad, var, apply_state=None):
    """Keras-v2-style dense update: lr from compute_lr, folded into grad."""
    device = var.device
    dtype = var.dtype.base_dtype
    # Look up precomputed per-(device, dtype) coefficients; fall back to
    # building them on the fly when apply_state was not provided.
    coefficients = (apply_state or {}).get((device, dtype))
    if coefficients is None:
        coefficients = self._fallback_apply_state(device, dtype)
    scale, adjusted_grad = self.compute_lr(grad, var, coefficients)
    slot = self.get_slot(var, "momentum")
    # lr == 1.0 because the scale is already multiplied into the gradient.
    return training_ops.apply_momentum(
        var,
        slot,
        tf.cast(1.0, dtype),
        adjusted_grad * scale,
        self.momentum,
        use_locking=False,
        use_nesterov=self.use_nesterov)
def _apply_dense(self, grad, var):
    """Two-stage dense update blending a momentum step and a plain SGD step.

    The variable first receives a momentum update with an effective rate of
    nu*(1-momentum)*lr, then — strictly afterwards, via the control
    dependency — a gradient-descent step with rate (1-nu)*lr on the same
    gradient. NOTE(review): this looks like a quasi-hyperbolic-momentum-style
    interpolation controlled by `nu` — confirm against the optimizer's paper
    or class docstring; the op ordering is load-bearing, do not reorder.

    Args:
      grad: gradient tensor for `var`.
      var: variable to update.

    Returns:
      A grouped op containing both update operations.
    """
    momentum_buffer = self.get_slot(var, "momentum")
    # Cast all hyperparameter tensors to the variable's base dtype.
    learning_rate = math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype)
    momentum = math_ops.cast(self._momentum_tensor, var.dtype.base_dtype)
    nu = math_ops.cast(self._nu_tensor, var.dtype.base_dtype)
    momentum_op = training_ops.apply_momentum(
        var, momentum_buffer,
        nu * (1.0 - momentum) * learning_rate,
        grad,
        momentum,
        use_locking=self._use_locking,
        use_nesterov=False,
    ).op
    # The SGD step must observe the momentum-updated value of `var`.
    with ops.control_dependencies([momentum_op]):
        gd_op = training_ops.apply_gradient_descent(
            var, (1.0 - nu) * learning_rate, grad, use_locking=self._use_locking
        ).op
    return control_flow_ops.group(momentum_op, gd_op)
def _apply_dense(self, grad, var):
    """Plain dense momentum update through the fused kernel.

    NOTE(review): the learning-rate/momentum tensors are not cast to
    `var.dtype.base_dtype` here, unlike other variants of this method —
    verify variable dtypes match the hyperparameter tensors.
    """
    accumulator = self.get_slot(var, "momentum")
    update = training_ops.apply_momentum(
        var,
        accumulator,
        self._learning_rate_tensor,
        grad,
        self._momentum_tensor,
        use_locking=self._use_locking)
    return update.op