def _resource_apply_dense(self, grad, var, apply_state=None):
    """Dense update with optional per-variable learning-rate multipliers.

    Looks up `var.name` in `self.lr_multipliers`; when present, the base
    learning rate is scaled by that factor before the update op is built.
    """
    # pylint: disable=no-name-in-module,import-error
    from tensorflow.python.training import training_ops

    device_dtype = (var.device, var.dtype.base_dtype)
    coefficients = ((apply_state or {}).get(device_dtype)
                    or self._fallback_apply_state(*device_dtype))

    lr_t = coefficients["lr_t"]
    multiplier = self.lr_multipliers.get(var.name)
    if multiplier is not None:
        lr_t = lr_t * multiplier

    if not self._momentum:
        # Plain SGD step.
        return training_ops.resource_apply_gradient_descent(
            var.handle, lr_t, grad, use_locking=self._use_locking)

    # Momentum (optionally Nesterov) step using the slot variable.
    momentum_slot = self.get_slot(var, "momentum")
    return training_ops.resource_apply_keras_momentum(
        var.handle,
        momentum_slot.handle,
        lr_t,
        grad,
        coefficients["momentum"],
        use_locking=self._use_locking,
        use_nesterov=self.nesterov,
    )
def _resource_apply_dense(self, grad, var, apply_state=None):
    """Dense update using a momentum-smoothed gradient and a clipped
    diagonal inverse-Hessian estimate (slot 'dh').

    The effective step is `alpha * H_inv * momentum`, with H_inv clamped to
    [epsilon/alpha, gamma/alpha] coordinate-wise, then posed as a unit-LR
    gradient-descent op.
    """
    var_device, var_dtype = var.device, var.dtype.base_dtype
    # NOTE(review): `coefficients` is never read below; retained in case
    # `_fallback_apply_state` has caching side effects — confirm and drop.
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))
    alpha = self._get_hyper('learning_rate', var_dtype)
    if self._is_first:
        # First step: no history yet — use the raw gradient and an identity
        # (all-ones) inverse-Hessian estimate.
        momentum = grad
        H_inv = tf.ones(var.shape, var_dtype)
        self.get_slot(var, 'dh').assign(H_inv)
        self._is_first = False
    else:
        momentum = self.get_slot(var, 'g_mom')
        beta = self._get_hyper('momentum', var_dtype)  # hoisted: was fetched twice
        momentum_new = beta * momentum + (1 - beta) * grad
        momentum.assign(momentum_new)
        dh = self.get_slot(var, 'dh')
        # BUG FIX: tf.ones previously used the default float32 dtype, which
        # mismatches non-float32 variables (the rest of this method builds
        # tensors with var_dtype).
        ones = tf.ones(var.shape, var_dtype)
        H_inv = tf.maximum(
            tf.math.reciprocal_no_nan(dh),
            tf.multiply(ones, self._get_hyper('epsilon', var_dtype) / alpha))
        H_inv = tf.minimum(
            H_inv,
            tf.multiply(ones, self._get_hyper('gamma', var_dtype) / alpha))
    update = alpha * H_inv * momentum
    # Stash the applied update for consumers of the 'update' slot.
    self.get_slot(var, 'update').assign(update)
    # Pose the precomputed step as a unit-learning-rate GD op.
    return training_ops.resource_apply_gradient_descent(
        var.handle, 1.0, update, use_locking=self._use_locking)
def _resource_apply_dense(self, grad, var, apply_state=None):
    """SGD(+momentum) dense update that first clips `var` into [-1, 1].

    This is the one deviation from stock SGD-with-momentum: the variable's
    current value is clamped before the gradient step is applied.
    """
    K.set_value(var, K.clip(var, min_value=-1.0, max_value=1.0))

    key = (var.device, var.dtype.base_dtype)
    coefficients = ((apply_state or {}).get(key)
                    or self._fallback_apply_state(*key))

    if not self._momentum:
        return training_ops.resource_apply_gradient_descent(
            var.handle, coefficients["lr_t"], grad,
            use_locking=self._use_locking)

    momentum_slot = self.get_slot(var, "momentum")
    return training_ops.resource_apply_keras_momentum(
        var.handle,
        momentum_slot.handle,
        coefficients["lr_t"],
        grad,
        coefficients["momentum"],
        use_locking=self._use_locking,
        use_nesterov=self.nesterov)
def _resource_apply_dense(self, grad, var):
    """Vanilla SGD dense update: var -= lr * grad."""
    lr = math_ops.cast(self._get_hyper("learning_rate"), var.dtype.base_dtype)
    return training_ops.resource_apply_gradient_descent(
        var.handle, lr, grad, use_locking=self._use_locking)
def _resource_apply_dense(self, grad, handle):
    """Apply a plain gradient-descent step to the resource variable `handle`."""
    lr = math_ops.cast(self._learning_rate_tensor, grad.dtype.base_dtype)
    return training_ops.resource_apply_gradient_descent(
        handle.handle, lr, grad, use_locking=self._use_locking)
def _resource_apply_dense(self, grad, var):
    """Noisy-gradient dense update: perturb the gradient, then do plain SGD."""
    rms_slot = self.get_slot(var, 'rms')
    # `_apply_noisy_update` produces the perturbed gradient from the rms slot.
    noisy_grad = self._apply_noisy_update(rms_slot, grad, var)
    lr = tf.cast(self._learning_rate_tensor, var.dtype.base_dtype)
    return training_ops.resource_apply_gradient_descent(
        var.handle, lr, noisy_grad, use_locking=self._use_locking)
def _resource_apply_dense(self, grad, var):
    """Dense update; batch-norm moving statistics get a fixed-step GD update."""
    if 'batch_norm/moving_' not in var.name:
        # Everything else follows the parent optimizer's rule.
        return super()._resource_apply_dense(grad, var)
    # Pose the moving-statistics update as a gradient-descent update so it
    # stays compatible with the gradient-accumulation v1 op.
    step = math_ops.cast(self._update_step, grad.dtype.base_dtype)
    return training_ops.resource_apply_gradient_descent(
        var.handle, step, grad, use_locking=self._use_locking)
def _resource_apply_dense(self, grad, var, state):
    """Dense update: momentum op when enabled, plain gradient descent otherwise."""
    if not self._use_momentum:
        lr = state.get_hyper("learning_rate", grad.dtype.base_dtype)
        return training_ops.resource_apply_gradient_descent(
            var.handle, lr, grad, use_locking=self._use_locking)
    momentum_slot = state.get_slot(var, "momentum")
    return training_ops.resource_apply_momentum(
        var.handle,
        momentum_slot.handle,
        state.get_hyper("learning_rate", var.dtype.base_dtype),
        grad,
        state.get_hyper("momentum", var.dtype.base_dtype),
        use_locking=self._use_locking,
        use_nesterov=self._use_nesterov)
def _resource_apply_dense(self, grad, var, state):
    """Dense update via the momentum op, or plain GD when momentum is off."""
    if self._use_momentum:
        mom_slot = state.get_slot(var, "momentum")
        lr = state.get_hyper("learning_rate", var.dtype.base_dtype)
        beta = state.get_hyper("momentum", var.dtype.base_dtype)
        return training_ops.resource_apply_momentum(
            var.handle, mom_slot.handle, lr, grad, beta,
            use_locking=self._use_locking,
            use_nesterov=self._use_nesterov)
    lr = state.get_hyper("learning_rate", grad.dtype.base_dtype)
    return training_ops.resource_apply_gradient_descent(
        var.handle, lr, grad, use_locking=self._use_locking)
def _resource_apply_dense(self, grad, var):
    """Dense update with per-coordinate learning rates clipped to a cap."""
    # During the burn-in phase a separate maximum learning rate applies.
    in_burnin = self.iterations < tf.cast(self._burnin, tf.int64)
    max_lr = tf.where(in_burnin,
                      self._burnin_max_learning_rate,
                      self._max_learning_rate)
    per_coord_lr = tf.clip_by_value(
        self._get_coordinatewise_learning_rate(grad, var),
        0.,
        tf.cast(max_lr, var.dtype.base_dtype))
    # Fold the per-coordinate rates into the gradient, then take a unit-LR step.
    scaled_grad = grad * per_coord_lr
    return training_ops.resource_apply_gradient_descent(
        var.handle, tf.cast(1., var.dtype), scaled_grad,
        use_locking=self._use_locking)
def _resource_apply_dense(self, grad, var):
    """Dense SGD update with the decayed learning rate and optional momentum."""
    dtype = var.dtype.base_dtype
    lr_t = self._decayed_lr(dtype)
    if not self._momentum:
        return training_ops.resource_apply_gradient_descent(
            var.handle, lr_t, grad, use_locking=self._use_locking)
    momentum_slot = self.get_slot(var, "momentum")
    return training_ops.resource_apply_keras_momentum(
        var.handle,
        momentum_slot.handle,
        lr_t,
        grad,
        self._get_hyper("momentum", dtype),
        use_locking=self._use_locking,
        use_nesterov=self.nesterov)
def _resource_apply_dense(self, grad, var):
    """Apply a dense step: Keras momentum op if enabled, else plain GD."""
    dtype = var.dtype.base_dtype
    lr_t = self._decayed_lr(dtype)
    if self._momentum:
        mom_slot = self.get_slot(var, "momentum")
        beta = self._get_hyper("momentum", dtype)
        return training_ops.resource_apply_keras_momentum(
            var.handle, mom_slot.handle, lr_t, grad, beta,
            use_locking=self._use_locking,
            use_nesterov=self.nesterov)
    return training_ops.resource_apply_gradient_descent(
        var.handle, lr_t, grad, use_locking=self._use_locking)
def _resource_apply_dense(self, grad, var, apply_state=None):
    """Dense SGD update using precomputed apply-state coefficients."""
    key = (var.device, var.dtype.base_dtype)
    coefficients = ((apply_state or {}).get(key)
                    or self._fallback_apply_state(*key))
    if not self._momentum:
        return training_ops.resource_apply_gradient_descent(
            var.handle, coefficients["lr_t"], grad,
            use_locking=self._use_locking)
    momentum_slot = self.get_slot(var, "momentum")
    return training_ops.resource_apply_keras_momentum(
        var.handle,
        momentum_slot.handle,
        coefficients["lr_t"],
        grad,
        coefficients["momentum"],
        use_locking=self._use_locking,
        use_nesterov=self.nesterov)
def _resource_apply_dense(self, grad, var):
    """Dense SGD update with optional momentum (v1-style hyperparameters)."""
    dtype = grad.dtype.base_dtype
    lr = math_ops.cast(self._get_hyper("learning_rate"), dtype)
    if not self._momentum:
        return training_ops.resource_apply_gradient_descent(
            var.handle, lr, grad, use_locking=self._use_locking)
    momentum_slot = self.get_slot(var, "momentum")
    return training_ops.resource_apply_momentum(
        var.handle,
        momentum_slot.handle,
        lr,
        grad,
        math_ops.cast(self._get_hyper("momentum"), dtype),
        use_locking=self._use_locking,
        use_nesterov=self._nesterov)
def _resource_apply_dense(self, grad, var):
    """Apply a dense step, dispatching on whether momentum is configured."""
    dtype = grad.dtype.base_dtype
    if self._momentum:
        mom_slot = self.get_slot(var, "momentum")
        return training_ops.resource_apply_momentum(
            var.handle,
            mom_slot.handle,
            math_ops.cast(self._get_hyper("learning_rate"), dtype),
            grad,
            math_ops.cast(self._get_hyper("momentum"), dtype),
            use_locking=self._use_locking,
            use_nesterov=self._nesterov)
    return training_ops.resource_apply_gradient_descent(
        var.handle,
        math_ops.cast(self._get_hyper("learning_rate"), dtype),
        grad,
        use_locking=self._use_locking)
def _resource_apply_dense(self, grad, var, apply_state=None):
    """Apply custom update rules: alpha/beta/sigma auxiliary states, then `var`.

    Each auxiliary update is posed as a unit-learning-rate gradient-descent
    op whose "gradient" is the value produced by the corresponding user
    function; the final variable update uses `self._grad_func` the same way.
    """
    variable_name = var.name
    var_device, var_dtype = var.device, var.dtype.base_dtype
    # NOTE(review): `coefficients` is never read below; retained in case
    # `_fallback_apply_state` has caching side effects — confirm and drop.
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))
    unit_lr = tf.constant(1.0)
    # NOTE(review): the auxiliary updates below are not wired into the
    # returned op with control dependencies, so in graph mode their ordering
    # relative to the final variable update is unspecified — confirm intended.
    if self._alpha_func is not None:  # idiom fix: was `!= None`
        training_ops.resource_apply_gradient_descent(
            self._alpha_dict[variable_name].handle, unit_lr,
            self._alpha_func(var.shape, self._alpha_dict[variable_name], grad),
            use_locking=self._use_locking)
    if self._beta_func is not None:
        training_ops.resource_apply_gradient_descent(
            self._beta_dict[variable_name].handle, unit_lr,
            self._beta_func(var.shape, self._alpha_dict[variable_name],
                            self._beta_dict[variable_name], grad),
            use_locking=self._use_locking)
    if self._sigma_func is not None:
        training_ops.resource_apply_gradient_descent(
            self._sigma_dict[variable_name].handle, unit_lr,
            self._sigma_func(var.shape, self._alpha_dict[variable_name],
                             self._beta_dict[variable_name],
                             self._sigma_dict[variable_name], grad),
            use_locking=self._use_locking)
    # Final variable update (was bound to the throwaway name `foo`).
    return training_ops.resource_apply_gradient_descent(
        var.handle, unit_lr,
        self._grad_func(var.shape, self._alpha_dict[variable_name],
                        self._beta_dict[variable_name],
                        self._sigma_dict[variable_name], grad),
        use_locking=self._use_locking)
def _resource_apply_dense(self, grad, var):
    """Quasi-hyperbolic-style dense update in two sequential parts.

    A momentum step scaled by nu*(1-momentum)*lr, followed (under a control
    dependency) by an immediate-gradient step scaled by (1-nu)*lr.
    """
    buf = self.get_slot(var, "momentum")
    dtype = var.dtype.base_dtype
    lr = math_ops.cast(self._learning_rate_tensor, dtype)
    beta = math_ops.cast(self._momentum_tensor, dtype)
    nu = math_ops.cast(self._nu_tensor, dtype)
    momentum_update = training_ops.resource_apply_momentum(
        var.handle,
        buf.handle,
        nu * (1.0 - beta) * lr,
        grad,
        beta,
        use_locking=self._use_locking,
        use_nesterov=False,
    )
    # The plain-gradient part must run after the momentum part has mutated var.
    with ops.control_dependencies([momentum_update]):
        gd_update = training_ops.resource_apply_gradient_descent(
            var.handle,
            (1.0 - nu) * lr,
            grad,
            use_locking=self._use_locking,
        )
    return control_flow_ops.group(momentum_update, gd_update)
def _resource_apply_dense(self, grad, var, apply_state=None):
    """Dense SGD(+momentum) update; transpose-conv variables get their own LR.

    In the non-momentum branch, variables whose name contains "transpose"
    use the `learning_rate_deconv` hyperparameter instead of the base rate.
    """
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))
    if self._momentum:
        momentum_var = self.get_slot(var, "momentum")
        return training_ops.resource_apply_keras_momentum(
            var.handle,
            momentum_var.handle,
            coefficients["lr_t"],
            grad,
            coefficients["momentum"],
            use_locking=self._use_locking,
            use_nesterov=self.nesterov,
        )
    lr = coefficients["lr_t"]
    # Idiom fix: was `str(var.name).find("transpose") != -1`.
    if "transpose" in str(var.name):
        lr = constant(self._serialize_hyperparameter("learning_rate_deconv"))
    return training_ops.resource_apply_gradient_descent(
        var.handle, lr, grad, use_locking=self._use_locking)
def apply_proxy_gradients(self, proxy_grads_and_vars):
    """Apply gradients through a proxy backward/forward transformation.

    All variables (and gradients) are concatenated along the last axis,
    transformed by `proxy_bw`, updated individually by the parent rule,
    transformed back by `proxy_fw`, and finally the difference between the
    old and new values is applied as a unit-LR gradient-descent step.

    NOTE(review): assumes all entries share compatible shapes on every axis
    except the last (required by the concat/split round-trips) — confirm
    against callers.
    """
    # Last-axis width of each variable, used to split the concatenations back.
    lengths = [
        v.get_shape().as_list()[-1] for g, v in proxy_grads_and_vars
    ]
    grads, vars = zip(*proxy_grads_and_vars)
    with ops.init_scope():
        self._create_slots(vars)
    all_grads = tf.concat(grads, axis=-1)
    all_vars = tf.concat(vars, axis=-1, name='Proxy_concat')
    # Shared randomness for the proxy transforms, broadcast over a 5-D shape.
    rand = tf.constant(get_rand(), dtype=self._dtype)
    rand = tf.reshape(rand, shape=(1, 1, 1, 1, -1))
    # Proxy backward pass maps gradients into the proxy space.
    all_grads = self.proxy_bw(all_vars, all_grads, rand)
    grads = tf.split(all_grads, lengths, axis=-1)
    updated_vars = []
    for grad, var in zip(grads, vars):
        # Parent class computes the per-variable weight update.
        updated_var = super()._apply_weight_update(grad, var)
        updated_vars.append(updated_var)
    updated_vars = tf.concat(updated_vars, axis=-1)
    # do proxy forward pass for the next step
    updated_vars = self.proxy_fw(updated_vars, rand)
    updated_vars = tf.split(updated_vars, lengths, axis=-1)
    # Pose `var - updated_var` as the "gradient" of a unit-LR GD step, so the
    # assignment goes through a standard resource-apply op.
    grads_and_vars = [(v - up_v, v) for v, up_v in zip(vars, updated_vars)]
    return tf.group([
        training_ops.resource_apply_gradient_descent(
            var.handle, tf.constant(1.0, grad.dtype.base_dtype), grad,
            use_locking=self._use_locking)
        for grad, var in grads_and_vars
    ])
def _resource_apply_dense(self, grad, var):
    """Apply a plain SGD step to `var` using the learning-rate hyper."""
    learning_rate = math_ops.cast(
        self._get_hyper("learning_rate"), var.dtype.base_dtype)
    return training_ops.resource_apply_gradient_descent(
        var.handle, learning_rate, grad, use_locking=self._use_locking)
def _resource_apply_dense(self, grad, handle, state):
    """Plain gradient-descent dense update using the state's learning rate."""
    learning_rate = state.get_hyper("learning_rate", grad.dtype.base_dtype)
    return training_ops.resource_apply_gradient_descent(
        handle.handle, learning_rate, grad, use_locking=self._use_locking)
def _wrapWOAccu(accuGrads, grad, var, apply_state):
    """No-op update used when accumulated gradients should not be applied.

    Poses a zero step (lr=0.0, gradient grad*0.0) as a gradient-descent op so
    the graph still contains an update op for `var`.

    NOTE(review): `self` is referenced but is not a parameter — this must be
    a closure defined inside a method that supplies `self`; confirm against
    the enclosing scope. `accuGrads` and `apply_state` are unused here.
    """
    return training_ops.resource_apply_gradient_descent(
        var.handle, 0.0, grad * 0.0, use_locking=self._use_locking)
def _resource_apply_dense(self, grad, handle):
    """Dense update: a single gradient-descent step on `handle`."""
    learning_rate = math_ops.cast(
        self._learning_rate_tensor, grad.dtype.base_dtype)
    return training_ops.resource_apply_gradient_descent(
        handle.handle, learning_rate, grad, use_locking=self._use_locking)
def _resource_apply_dense(self, grad, handle, state):
    """Apply one gradient-descent step using the learning rate from `state`."""
    lr = state.get_hyper("learning_rate", grad.dtype.base_dtype)
    return training_ops.resource_apply_gradient_descent(
        handle.handle,
        lr,
        grad,
        use_locking=self._use_locking)
def _resource_apply_dense(self, grad, var, apply_state=None):
    """Apply `grad` to `var` as a unit-learning-rate gradient-descent step.

    The gradient is assumed to already incorporate any scaling, hence lr=1.0;
    `apply_state` is accepted for interface compatibility but unused.
    """
    # Removed dead locals: `var_device`/`var_dtype` were computed from `var`
    # but never read (no coefficient lookup happens in this rule).
    return training_ops.resource_apply_gradient_descent(
        var.handle, 1.0, grad, use_locking=self._use_locking)