def _resource_apply_dense(self, grad, var, apply_state=None):
    """SGD(+momentum) dense update that first clips ``var`` into [-1, 1].

    This is the only change from stock SGD-with-momentum: before this
    step's gradient is applied, the variable's current values are
    clipped elementwise into [-1, 1] via ``K.set_value``.
    NOTE(review): ``K.set_value`` is an eager/backend assignment —
    confirm this optimizer is only used where that is valid.

    Args:
        grad: dense gradient tensor for ``var``.
        var: the resource variable being updated.
        apply_state: optional dict of precomputed per-(device, dtype)
            coefficients; falls back to ``_fallback_apply_state``.

    Returns:
        The op returned by the underlying ``training_ops`` kernel.
    """
    # Clip the weights *before* applying this step's gradient.
    # (Removed commented-out debug prints that used to surround this.)
    K.set_value(var, K.clip(var, min_value=-1.0, max_value=1.0))
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))
    if self._momentum:
        momentum_var = self.get_slot(var, "momentum")
        return training_ops.resource_apply_keras_momentum(
            var.handle, momentum_var.handle, coefficients["lr_t"], grad,
            coefficients["momentum"],
            use_locking=self._use_locking,
            use_nesterov=self.nesterov)
    else:
        return training_ops.resource_apply_gradient_descent(
            var.handle, coefficients["lr_t"], grad,
            use_locking=self._use_locking)
def _resource_apply_dense(self, grad, var, apply_state=None):
    """Dense update honoring per-variable learning-rate multipliers.

    If ``var.name`` has an entry in ``self.lr_multipliers``, the base
    learning rate is scaled by it before the kernel is invoked.
    """
    # pylint: disable=no-name-in-module,import-error
    from tensorflow.python.training import training_ops

    device, dtype = var.device, var.dtype.base_dtype
    state = ((apply_state or {}).get((device, dtype))
             or self._fallback_apply_state(device, dtype))

    # Scale the base LR when a multiplier is registered for this variable.
    lr_t = state["lr_t"]
    if var.name in self.lr_multipliers:
        lr_t = lr_t * self.lr_multipliers[var.name]

    if not self._momentum:
        return training_ops.resource_apply_gradient_descent(
            var.handle, lr_t, grad, use_locking=self._use_locking)

    mom = self.get_slot(var, "momentum")
    return training_ops.resource_apply_keras_momentum(
        var.handle,
        mom.handle,
        lr_t,
        grad,
        state["momentum"],
        use_locking=self._use_locking,
        use_nesterov=self.nesterov,
    )
def _resource_apply_dense(self, grad, var):
    """Apply a dense gradient to ``var`` using the decayed learning rate.

    Dispatches to the Keras momentum kernel when momentum is enabled,
    otherwise to plain gradient descent.
    """
    dtype = var.dtype.base_dtype
    lr = self._decayed_lr(dtype)
    if not self._momentum:
        return training_ops.resource_apply_gradient_descent(
            var.handle, lr, grad, use_locking=self._use_locking)
    mom = self.get_slot(var, "momentum")
    return training_ops.resource_apply_keras_momentum(
        var.handle, mom.handle, lr, grad,
        self._get_hyper("momentum", dtype),
        use_locking=self._use_locking, use_nesterov=self.nesterov)
def _resource_apply_dense(self, grad, var):
    """Dense resource update: Keras momentum kernel, or plain descent."""
    base_dtype = var.dtype.base_dtype
    decayed_lr = self._decayed_lr(base_dtype)
    if self._momentum:
        slot = self.get_slot(var, "momentum")
        return training_ops.resource_apply_keras_momentum(
            var.handle,
            slot.handle,
            decayed_lr,
            grad,
            self._get_hyper("momentum", base_dtype),
            use_locking=self._use_locking,
            use_nesterov=self.nesterov,
        )
    return training_ops.resource_apply_gradient_descent(
        var.handle, decayed_lr, grad, use_locking=self._use_locking)
def _resource_apply_dense(self, grad, var, apply_state=None):
    """Momentum update with a layerwise-scaled (LARS-style) learning rate."""
    device, dtype = var.device, var.dtype.base_dtype
    coeffs = (apply_state or {}).get((device, dtype))
    if not coeffs:
        coeffs = self._fallback_apply_state(device, dtype)

    # compute_lr presumably returns the scaled LR and an adjusted
    # gradient -- verify against its definition elsewhere in the class.
    scaled_lr, grad = self.compute_lr(grad, var, coeffs)
    momentum_slot = self.get_slot(var, "momentum")

    return training_ops.resource_apply_keras_momentum(
        var.handle,
        momentum_slot.handle,
        scaled_lr,
        grad,
        self.momentum,
        use_locking=False,
        use_nesterov=self.use_nesterov,
    )
def _resource_apply_dense(self, grad, var, apply_state=None):
    """Standard dense SGD step; uses the momentum kernel when enabled."""
    device, dtype = var.device, var.dtype.base_dtype
    coeffs = ((apply_state or {}).get((device, dtype))
              or self._fallback_apply_state(device, dtype))
    if not self._momentum:
        return training_ops.resource_apply_gradient_descent(
            var.handle, coeffs["lr_t"], grad, use_locking=self._use_locking)
    slot = self.get_slot(var, "momentum")
    return training_ops.resource_apply_keras_momentum(
        var.handle, slot.handle, coeffs["lr_t"], grad, coeffs["momentum"],
        use_locking=self._use_locking, use_nesterov=self.nesterov)
def _resource_apply_dense(self, grad, var, apply_state=None):
    """Dense NovoGrad-style update for ``var``.

    Maintains a per-variable scalar second-moment slot ``v`` (sum of
    squared gradient elements), normalizes the gradient by its square
    root, optionally applies weight decay and gradient averaging, then
    folds the result into the momentum slot ``m`` via the Keras
    momentum kernel.
    """
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = (apply_state or {}).get(
        (var_device, var_dtype)
    ) or self._fallback_apply_state(var_device, var_dtype)
    weight_decay = self._get_hyper("weight_decay")
    grad_averaging = self._get_hyper("grad_averaging")
    v = self.get_slot(var, "v")
    # Scalar second moment: sum of squared gradient entries (in float32).
    g_2 = tf.reduce_sum(tf.square(tf.cast(grad, tf.float32)))
    # First step seeds v with g_2 directly; afterwards use the EMA.
    v_t = tf.cond(
        tf.equal(self.iterations, 0),
        lambda: g_2,
        lambda: v * coefficients["beta_2_t"]
        + g_2 * coefficients["one_minus_beta_2_t"],
    )
    # Persist the updated second moment before it is consumed below.
    v_t = v.assign(v_t, use_locking=self._use_locking)
    if self.amsgrad:
        # AMSGrad: normalize by the running max of v instead of v itself.
        vhat = self.get_slot(var, "vhat")
        vhat_t = vhat.assign(tf.maximum(vhat, v_t),
                             use_locking=self._use_locking)
        grad = grad / (tf.sqrt(vhat_t) + self.epsilon)
    else:
        grad = grad / (tf.sqrt(v_t) + self.epsilon)
    # Decoupled weight decay, applied only when the hyper is positive.
    grad = tf.cond(
        tf.greater(weight_decay, 0),
        lambda: grad + weight_decay * var,
        lambda: grad,
    )
    # Gradient averaging scales by (1 - beta_1) on all steps but the first.
    grad = tf.cond(
        tf.logical_and(grad_averaging, tf.not_equal(self.iterations, 0)),
        lambda: grad * coefficients["one_minus_beta_1_t"],
        lambda: grad,
    )
    m = self.get_slot(var, "m")
    # beta_1 plays the role of the momentum coefficient in this kernel.
    return training_ops.resource_apply_keras_momentum(
        var.handle,
        m.handle,
        coefficients["lr_t"],
        grad,
        coefficients["beta_1_t"],
        use_locking=self._use_locking,
        use_nesterov=False,
    )
def _resource_apply_dense(self, grad, var, apply_state=None):
    """Dense SGD update using a separate learning rate for deconv vars.

    Variables whose name contains ``"transpose"`` (e.g. transposed
    convolution kernels) are updated with the ``learning_rate_deconv``
    hyperparameter instead of the regular learning rate — but only on
    the momentum-free path, matching the original behavior.

    Args:
        grad: dense gradient tensor for ``var``.
        var: the resource variable being updated.
        apply_state: optional dict of precomputed per-(device, dtype)
            coefficients; falls back to ``_fallback_apply_state``.

    Returns:
        The op returned by the underlying ``training_ops`` kernel.
    """
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))
    if self._momentum:
        momentum_var = self.get_slot(var, "momentum")
        return training_ops.resource_apply_keras_momentum(
            var.handle,
            momentum_var.handle,
            coefficients["lr_t"],
            grad,
            coefficients["momentum"],
            use_locking=self._use_locking,
            use_nesterov=self.nesterov,
        )
    lr = coefficients["lr_t"]
    # Idiomatic membership test (was: str(var.name).find("transpose") != -1).
    if "transpose" in str(var.name):
        lr = constant(self._serialize_hyperparameter("learning_rate_deconv"))
    return training_ops.resource_apply_gradient_descent(
        var.handle, lr, grad, use_locking=self._use_locking)