def _resource_apply_dense(self, grad, var):
    scaled_lr = self._get_hyper("learning_rate")
    decayed_grad = grad
    tf_logging.info("LARS: resource apply dense: %s", var.name)
    w_norm = tf.norm(var, ord=2)
    g_norm = tf.norm(grad, ord=2)
    # Skip layer-wise adaptation for batch-norm and bias variables.
    if 'batch_normalization' not in var.name and 'bias' not in var.name:
        tf_logging.info("LARS: apply dense, decay: %s", var.name)
        # LARS trust ratio: eeta * ||w|| / (||g|| + wd * ||w|| + eps),
        # defaulting to 1.0 when either norm is zero.
        trust_ratio = tf.where(
            tf.math.greater(w_norm, 0),
            tf.where(
                tf.math.greater(g_norm, 0),
                (self._eeta * w_norm /
                 (g_norm + self._weight_decay * w_norm + self._epsilon)),
                1.0),
            1.0)
        trust_ratio = tf.clip_by_value(trust_ratio, 0.0, 50.0)
        scaled_lr *= trust_ratio
        # Fold weight decay into the gradient and clip for stability.
        decayed_grad = grad + self._weight_decay * var
        decayed_grad = tf.clip_by_value(decayed_grad, -10.0, 10.0)
    mom = self.get_slot(var, "momentum")
    return training_ops.resource_apply_momentum(
        var.handle, mom.handle, scaled_lr, decayed_grad, self._momentum,
        use_locking=False, use_nesterov=False)
def _resource_apply_dense(self, grad, var, apply_state=None):
    coefficients = self.update_coefficients(var, apply_state)
    eeta = coefficients['eeta']
    lr_t = coefficients['lr_t']
    epsilon = coefficients['epsilon']
    momentum = coefficients['momentum']
    weight_decay = coefficients['weight_decay']
    momentum_var = self.get_slot(var, 'momentum_var')
    compute_dtype = self.opt_dtypes[0]
    trust_ratio, grad = self.compute_trust_ratio(
        grad, var, eeta, epsilon, weight_decay, compute_dtype)
    scaled_lr = lr_t * trust_ratio
    # The scaled learning rate is folded into the gradient, so the kernel's
    # lr argument is a constant 1.0.
    return training_ops.resource_apply_momentum(
        var=var.handle,
        accum=momentum_var.handle,
        lr=math_ops.cast(1.0, var.dtype.base_dtype),
        grad=math_ops.cast(grad * scaled_lr, var.dtype.base_dtype),
        momentum=math_ops.cast(momentum, var.dtype.base_dtype),
        use_locking=False,
        use_nesterov=self.use_nesterov)
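# The variant above delegates to self.compute_trust_ratio(...), which is not
# shown in this section. A minimal sketch, assuming it mirrors the inline
# LARS computation from the first snippet and returns the
# (trust_ratio, decayed_grad) pair the call site unpacks; the argument order
# matches the call site, but the body is an assumption, not the original
# helper.
def compute_trust_ratio(self, grad, var, eeta, epsilon, weight_decay,
                        compute_dtype):
    w = tf.cast(var, compute_dtype)
    g = tf.cast(grad, compute_dtype)
    w_norm = tf.norm(w, ord=2)
    g_norm = tf.norm(g, ord=2)
    one = tf.ones_like(w_norm)
    # eeta * ||w|| / (||g|| + wd * ||w|| + eps), defaulting to 1.0 when
    # either norm is zero.
    trust_ratio = tf.where(
        w_norm > 0,
        tf.where(g_norm > 0,
                 eeta * w_norm / (g_norm + weight_decay * w_norm + epsilon),
                 one),
        one)
    # Fold weight decay into the gradient, as the first snippet does.
    decayed_grad = g + weight_decay * w
    return trust_ratio, decayed_grad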
def _resource_apply_dense(self, grad, var):
    mom = self.get_slot(var, "momentum")
    return training_ops.resource_apply_momentum(
        var.handle, mom.handle,
        math_ops.cast(self._learning_rate_tensor, grad.dtype.base_dtype),
        grad,
        math_ops.cast(self._momentum_tensor, grad.dtype.base_dtype),
        use_locking=self._use_locking,
        use_nesterov=self._use_nesterov)
def _resource_apply_dense(self, grad, var, state):
    mom = state.get_slot(var, "momentum")
    return training_ops.resource_apply_momentum(
        var.handle, mom.handle,
        state.get_hyper("learning_rate", var.dtype.base_dtype),
        grad,
        state.get_hyper("momentum", var.dtype.base_dtype),
        use_locking=self._use_locking,
        use_nesterov=self._use_nesterov)
def _resource_apply_dense(self, grad, var):
    scaled_lr, grad = self.compute_lr(grad, var)
    mom = self.get_slot(var, "momentum")
    # lr is fixed at 1.0; the scaled learning rate is applied to the gradient.
    return training_ops.resource_apply_momentum(
        var.handle, mom.handle,
        math_ops.cast(1.0, var.dtype.base_dtype),
        grad * scaled_lr,
        self._momentum,
        use_locking=False,
        use_nesterov=self._use_nesterov)
def _resource_apply_dense(self, grad, var):
    scaled_lr = self.compute_lr(grad, var)
    mom = self.get_slot(var, "momentum")
    return training_ops.resource_apply_momentum(
        var.handle, mom.handle, scaled_lr, grad, self._momentum,
        use_locking=False, use_nesterov=self._use_nesterov)
def _resource_apply_dense(self, grad, var):
    dtype = var.dtype.base_dtype
    scaled_lr, grad = self.compute_lr(grad, var)
    mom = self.get_slot(var, 'momentum')
    momentum = self._get_hyper('momentum', dtype)
    return training_ops.resource_apply_momentum(
        var.handle, mom.handle,
        tf.cast(1, dtype),
        grad * scaled_lr,
        momentum,
        use_locking=self._use_locking,
        use_nesterov=self._use_nesterov)
def _resource_apply_dense(self, grad, var): scaled_lr, grad = self.compute_lr(grad, var) mom = self.get_slot(var, "momentum") momentum = self._serialize_hyperparameter("momentum") use_nesterov = bool(self._serialize_hyperparameter("use_nesterov")) return training_ops.resource_apply_momentum( var.handle, mom.handle, math_ops.cast(1.0, var.dtype.base_dtype), grad * scaled_lr, momentum, use_locking=False, use_nesterov=use_nesterov, )
def _resource_apply_sparse(self, grad, var, indices):
    logging.info('fallback to momentum optimizer for sparse tensors')
    dtype = var.dtype.base_dtype
    learning_rate = self._get_hyper('learning_rate', dtype)
    mom = self.get_slot(var, 'momentum')
    momentum = self._get_hyper('momentum', dtype)
    # Sparse updates need the sparse kernel; resource_apply_momentum does
    # not accept an indices argument.
    return training_ops.resource_sparse_apply_momentum(
        var.handle, mom.handle, learning_rate, grad, indices, momentum,
        use_locking=self._use_locking, use_nesterov=self._use_nesterov)
def _resource_apply_dense_without_v(self, var, m, v, beta1_power, beta1,
                                    beta2, grad):
    lr_t = math_ops.cast(self._lr_t, grad.dtype.base_dtype)
    update_v = v.assign_sub((1 - beta2) * (v - tf.square(grad)),
                            use_locking=self._use_locking)
    var_update = training_ops.resource_apply_momentum(
        var, m,
        lr_t / (1 - beta1_power),  # to adapt the Adam formula
        grad * (1 - beta1),  # to adapt the Adam formula
        beta1,
        use_locking=self._use_locking,
    )
    return control_flow_ops.group(var_update, update_v)
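# Why the rescaling in the variant above works (a sketch of the algebra,
# not taken from the source): the momentum kernel computes
#     m <- beta1 * m + g_in;    var <- var - lr_in * m
# Feeding g_in = grad * (1 - beta1) makes m track Adam's first moment
#     m_t = beta1 * m_{t-1} + (1 - beta1) * grad,
# and lr_in = lr_t / (1 - beta1_power) applies Adam's bias correction
# m_hat = m_t / (1 - beta1^t). The kernel therefore performs the Adam
# numerator update, while the second moment v is maintained separately by
# update_v and simply not applied here.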
def _resource_apply_dense(self, grad, var):
    var_dtype = var.dtype.base_dtype
    lr_t = self._decayed_lr(var_dtype)
    if self._momentum:
        momentum_var = self.get_slot(var, "momentum")
        return training_ops.resource_apply_momentum(
            var.handle, momentum_var.handle, lr_t, grad,
            self._get_hyper("momentum", var_dtype),
            use_locking=self._use_locking,
            use_nesterov=self._nesterov)
    else:
        return training_ops.resource_apply_gradient_descent(
            var.handle, lr_t, grad, use_locking=self._use_locking)
def _resource_apply_dense(self, grad, var, apply_state=None):
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))
    if self._momentum:
        momentum_var = self.get_slot(var, "momentum")
        return training_ops.resource_apply_momentum(
            var.handle, momentum_var.handle,
            coefficients["lr_t"],
            grad,
            coefficients["momentum"],
            use_locking=self._use_locking,
            use_nesterov=self.nesterov)
    else:
        return training_ops.resource_apply_gradient_descent(
            var.handle, coefficients["lr_t"], grad,
            use_locking=self._use_locking)
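# A note on the (apply_state or {}).get((device, dtype)) pattern above: in
# the Keras OptimizerV2 convention, apply_gradients precomputes a dict of
# per-(device, dtype) coefficients once per step and threads it through as
# apply_state, while _fallback_apply_state rebuilds the coefficients when
# the method is invoked directly without one. This avoids recomputing
# casts of the learning rate and momentum for every variable.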
def _resource_apply_dense(self, grad, var):
    learning_rate = self._get_hyper("learning_rate")
    if self._momentum:
        momentum_var = self.get_slot(var, "momentum")
        return training_ops.resource_apply_momentum(
            var.handle, momentum_var.handle,
            math_ops.cast(learning_rate, grad.dtype.base_dtype),
            grad,
            math_ops.cast(self._get_hyper("momentum"), grad.dtype.base_dtype),
            use_locking=self._use_locking,
            use_nesterov=self._nesterov)
    else:
        return training_ops.resource_apply_gradient_descent(
            var.handle,
            math_ops.cast(learning_rate, grad.dtype.base_dtype),
            grad,
            use_locking=self._use_locking)
def _resource_apply_dense(self, grad, var):
    momentum_buffer = self.get_slot(var, "momentum")
    learning_rate = math_ops.cast(self._learning_rate_tensor,
                                  var.dtype.base_dtype)
    momentum = math_ops.cast(self._momentum_tensor, var.dtype.base_dtype)
    nu = math_ops.cast(self._nu_tensor, var.dtype.base_dtype)
    momentum_op = training_ops.resource_apply_momentum(
        var.handle,
        momentum_buffer.handle,
        nu * (1.0 - momentum) * learning_rate,
        grad,
        momentum,
        use_locking=self._use_locking,
        use_nesterov=False,
    )
    with ops.control_dependencies([momentum_op]):
        gd_op = training_ops.resource_apply_gradient_descent(
            var.handle,
            (1.0 - nu) * learning_rate,
            grad,
            use_locking=self._use_locking,
        )
    return control_flow_ops.group(momentum_op, gd_op)
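# Combined effect of the two kernels in the variant above (an algebra
# sketch, not from the source): the momentum kernel applies
#     buf <- momentum * buf + grad;  var <- var - nu*(1-momentum)*lr * buf
# and the gradient-descent kernel then applies
#     var <- var - (1-nu)*lr * grad
# so the net step is
#     var <- var - lr * ((1-nu)*grad + nu*(1-momentum)*buf),
# i.e. a nu-controlled interpolation between plain SGD and the
# (1-momentum)-normalized momentum buffer, a quasi-hyperbolic-momentum-style
# update.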