def _wrapUseAccu(accuGrads, grad, var, apply_state):
    # Nested helper: `self`, `tf`, and `training_ops` are captured from the
    # enclosing optimizer method's scope. The coefficient lookup mirrors the
    # other _resource_apply_dense variants in this collection.
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))
    m = self.get_slot(var, 'm')
    v = self.get_slot(var, 'v')
    if not self.amsgrad:
        result = training_ops.resource_apply_adam(
            var.handle, m.handle, v.handle,
            coefficients['beta_1_power'], coefficients['beta_2_power'],
            coefficients['lr_t'], coefficients['beta_1_t'],
            coefficients['beta_2_t'], coefficients['epsilon'], grad,
            use_locking=self._use_locking)
    else:
        vhat = self.get_slot(var, 'vhat')
        result = training_ops.resource_apply_adam_with_amsgrad(
            var.handle, m.handle, v.handle, vhat.handle,
            coefficients['beta_1_power'], coefficients['beta_2_power'],
            coefficients['lr_t'], coefficients['beta_1_t'],
            coefficients['beta_2_t'], coefficients['epsilon'], grad,
            use_locking=self._use_locking)
    # Reset the gradient accumulator to zeros once the update has been applied.
    accuGrads.assign(tf.broadcast_to(0.0, tf.shape(accuGrads)))
    return result

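# A minimal sketch (assumed, not from the source) of how a wrapper like
# _wrapUseAccu above could be driven in eager mode: gradients are summed into
# a per-variable 'accu' slot and the Adam kernel fires only every
# self.accum_steps iterations. The slot name and accum_steps are assumptions.
def _resource_apply_dense(self, grad, var, apply_state=None):
    accu_grads = self.get_slot(var, 'accu')
    accu_grads.assign_add(grad)
    if (self.iterations + 1) % self.accum_steps == 0:
        # Apply the accumulated gradient; _wrapUseAccu then resets the slot.
        return _wrapUseAccu(accu_grads, accu_grads, var, apply_state)
    return tf.no_op()
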
def _resource_apply_dense(self, grad, var):
    var_dtype = var.dtype.base_dtype
    lr_t = self._decayed_lr(var_dtype)
    m = self.get_slot(var, 'm')
    v = self.get_slot(var, 'v')
    beta_1_t = self._get_hyper('beta_1', var_dtype)
    beta_2_t = self._get_hyper('beta_2', var_dtype)
    epsilon = self._get_hyper('epsilon', var_dtype)
    local_step = math_ops.cast(self.iterations + 1, var_dtype)
    beta_1_power = math_ops.pow(beta_1_t, local_step)
    beta_2_power = math_ops.pow(beta_2_t, local_step)
    # Scale the learning rate for specific sub-modules, selected by name.
    if "reader" in var.name:
        lr_t = lr_t * 0.2
    elif "h_mean" in var.name:
        lr_t = lr_t * 0.1
    elif "h_var" in var.name:
        lr_t = lr_t * 0.1
    elif "box_vae" in var.name:
        lr_t = lr_t * 10
    elif "offset_vae" in var.name:
        lr_t = lr_t * 10
    if not self.amsgrad:
        return training_ops.resource_apply_adam(
            var.handle, m.handle, v.handle, beta_1_power, beta_2_power,
            lr_t, beta_1_t, beta_2_t, epsilon, grad,
            use_locking=self._use_locking)
    else:
        vhat = self.get_slot(var, 'vhat')
        return training_ops.resource_apply_adam_with_amsgrad(
            var.handle, m.handle, v.handle, vhat.handle, beta_1_power,
            beta_2_power, lr_t, beta_1_t, beta_2_t, epsilon, grad,
            use_locking=self._use_locking)

def _resource_apply_dense(self, grad, var, apply_state=None):
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))
    # Resolve a per-variable learning rate by matching var.name against the
    # regex patterns in self.pattern_lrs; falls back to the default lr_t
    # when no pattern matches.
    lr = self._find_lr(var.name, coefficients)
    m = self.get_slot(var, "m")
    v = self.get_slot(var, "v")
    if not self.amsgrad:
        return training_ops.resource_apply_adam(
            var.handle, m.handle, v.handle,
            coefficients["beta_1_power"], coefficients["beta_2_power"],
            lr, coefficients["beta_1_t"], coefficients["beta_2_t"],
            coefficients["epsilon"], grad,
            use_locking=self._use_locking)
    else:
        vhat = self.get_slot(var, "vhat")
        return training_ops.resource_apply_adam_with_amsgrad(
            var.handle, m.handle, v.handle, vhat.handle,
            coefficients["beta_1_power"], coefficients["beta_2_power"],
            lr, coefficients["beta_1_t"], coefficients["beta_2_t"],
            coefficients["epsilon"], grad,
            use_locking=self._use_locking)

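# A minimal sketch of a _find_lr helper consistent with the call above:
# self.pattern_lrs is a list of {"patterns": [...]} dicts whose index selects
# an "lr-{i}_t" coefficient (both names appear in the caller's original inline
# logic); `re` is assumed to be imported. This is an illustration, not the
# source implementation.
def _find_lr(self, var_name, coefficients):
    for i, pattern_lr in enumerate(self.pattern_lrs):
        if any(re.search(pattern, var_name)
               for pattern in pattern_lr["patterns"]):
            return coefficients[f"lr-{i}_t"]
    return coefficients["lr_t"]  # default lr when no pattern matches
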
def _resource_apply_dense(self, grad, var, apply_state=None):
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))
    m = self.get_slot(var, 'm')
    v = self.get_slot(var, 'v')
    # Scale this variable's learning rate exactly once: the flag in
    # initiation_dict is cleared after the cached coefficient is rewritten.
    if self.initiation_dict[var.name] == 1:
        coefficients['lr_t'] = coefficients['lr_t'] * self.param_lrs[var.name]
        self.initiation_dict[var.name] = 0
    if not self.amsgrad:
        return training_ops.resource_apply_adam(
            var.handle, m.handle, v.handle,
            coefficients['beta_1_power'], coefficients['beta_2_power'],
            coefficients['lr_t'], coefficients['beta_1_t'],
            coefficients['beta_2_t'], coefficients['epsilon'], grad,
            use_locking=self._use_locking)
    else:
        vhat = self.get_slot(var, 'vhat')
        return training_ops.resource_apply_adam_with_amsgrad(
            var.handle, m.handle, v.handle, vhat.handle,
            coefficients['beta_1_power'], coefficients['beta_2_power'],
            coefficients['lr_t'], coefficients['beta_1_t'],
            coefficients['beta_2_t'], coefficients['epsilon'], grad,
            use_locking=self._use_locking)

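# Sketch (an assumption, not from the source) of the bookkeeping the one-shot
# scaling above relies on: param_lrs maps variable names to multipliers, and
# initiation_dict flags each name until its cached coefficient is rewritten.
def set_param_lrs(self, param_lrs):
    self.param_lrs = dict(param_lrs)
    self.initiation_dict = {name: 1 for name in param_lrs}
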
def _resource_apply_dense(self, grad, var, apply_state=None):
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))
    m = self.get_slot(var, 'm')
    v = self.get_slot(var, 'v')
    # Apply a per-prefix learning-rate multiplier, stored as the hyper
    # 'lrm_<name>'. Each match recomputes from the base lr_t, so if several
    # registered prefixes match, the last one in self._lrm_names wins.
    lr_t = coefficients['lr_t']
    for k in self._lrm_names:
        if var.name.startswith(k):
            lr_t = coefficients['lr_t'] * self._get_hyper(f'lrm_{k}', var.dtype)
    if not self.amsgrad:
        return training_ops.resource_apply_adam(
            var.handle, m.handle, v.handle,
            coefficients['beta_1_power'], coefficients['beta_2_power'],
            lr_t, coefficients['beta_1_t'], coefficients['beta_2_t'],
            coefficients['epsilon'], grad,
            use_locking=self._use_locking)
    else:
        vhat = self.get_slot(var, 'vhat')
        return training_ops.resource_apply_adam_with_amsgrad(
            var.handle, m.handle, v.handle, vhat.handle,
            coefficients['beta_1_power'], coefficients['beta_2_power'],
            lr_t, coefficients['beta_1_t'], coefficients['beta_2_t'],
            coefficients['epsilon'], grad,
            use_locking=self._use_locking)

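# Sketch (assumed) of how the 'lrm_<name>' multipliers used above might be
# registered: lr_multipliers would map variable-name prefixes to scalar
# factors, each stored as a tracked hyperparameter.
def __init__(self, lr_multipliers=None, **kwargs):
    super().__init__(**kwargs)
    lr_multipliers = lr_multipliers or {}
    self._lrm_names = sorted(lr_multipliers)
    for name, mult in lr_multipliers.items():
        self._set_hyper(f'lrm_{name}', mult)
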
def _resource_apply_dense(self, grad, var, apply_state=None):
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))
    m = self.get_slot(var, "m")
    v = self.get_slot(var, "v")
    lr = coefficients["lr_t"]
    # Transpose-convolution (deconv) variables get their own learning rate;
    # _serialize_hyperparameter yields a plain Python value, so it is wrapped
    # back into a tensor here (`constant` is presumably tf.constant).
    if "transpose" in var.name:
        lr = constant(self._serialize_hyperparameter("learning_rate_deconv"))
    if not self.amsgrad:
        return training_ops.resource_apply_adam(
            var.handle, m.handle, v.handle,
            coefficients["beta_1_power"], coefficients["beta_2_power"],
            lr, coefficients["beta_1_t"], coefficients["beta_2_t"],
            coefficients["epsilon"], grad,
            use_locking=self._use_locking)
    else:
        vhat = self.get_slot(var, "vhat")
        return training_ops.resource_apply_adam_with_amsgrad(
            var.handle, m.handle, v.handle, vhat.handle,
            coefficients["beta_1_power"], coefficients["beta_2_power"],
            lr, coefficients["beta_1_t"], coefficients["beta_2_t"],
            coefficients["epsilon"], grad,
            use_locking=self._use_locking)

def _resource_apply_dense(self, grad, var):
    var_dtype = var.dtype.base_dtype
    lr_t = self._decayed_lr_t[var_dtype]
    m = self.get_slot(var, 'm')
    v = self.get_slot(var, 'v')
    beta_1_t = self._get_hyper('beta_1', var_dtype)
    beta_2_t = self._get_hyper('beta_2', var_dtype)
    epsilon_t = ops.convert_to_tensor(self.epsilon, var_dtype)
    local_step = math_ops.cast(self.iterations + 1, var_dtype)
    beta_1_power = math_ops.pow(beta_1_t, local_step)
    beta_2_power = math_ops.pow(beta_2_t, local_step)
    if not self.amsgrad:
        return training_ops.resource_apply_adam(
            var.handle, m.handle, v.handle, beta_1_power, beta_2_power,
            lr_t, beta_1_t, beta_2_t, epsilon_t, grad,
            use_locking=self._use_locking)
    else:
        vhat = self.get_slot(var, 'vhat')
        return training_ops.resource_apply_adam_with_amsgrad(
            var.handle, m.handle, v.handle, vhat.handle, beta_1_power,
            beta_2_power, lr_t, beta_1_t, beta_2_t, epsilon_t, grad,
            use_locking=self._use_locking)

def _resource_apply_dense(self, grad, var, constraint, apply_state=None):
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))
    m = self.get_slot(var, 'm')
    v = self.get_slot(var, 'v')
    if not self.amsgrad:
        var_update = training_ops.resource_apply_adam(
            var.handle, m.handle, v.handle,
            coefficients['beta_1_power'], coefficients['beta_2_power'],
            coefficients['lr_t'], coefficients['beta_1_t'],
            coefficients['beta_2_t'], coefficients['epsilon'], grad,
            use_locking=self._use_locking)
    else:
        vhat = self.get_slot(var, 'vhat')
        var_update = training_ops.resource_apply_adam_with_amsgrad(
            var.handle, m.handle, v.handle, vhat.handle,
            coefficients['beta_1_power'], coefficients['beta_2_power'],
            coefficients['lr_t'], coefficients['beta_1_t'],
            coefficients['beta_2_t'], coefficients['epsilon'], grad,
            use_locking=self._use_locking)
    # Project back onto the feasible set only after the Adam update has run;
    # without the control dependency the projection could read a stale value.
    # `was_projected` is unused here.
    with ops.control_dependencies([var_update]):
        project_var, was_projected = constraint.euclidean_project(var)
        return state_ops.assign(var, project_var)

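# Illustrative (hypothetical) constraint object matching the interface used
# above: euclidean_project returns the projected value plus a flag saying
# whether the projection changed anything. Example: a non-negativity
# constraint, i.e. projection onto the non-negative orthant.
class NonNegativeConstraint:
    def euclidean_project(self, var):
        projected = tf.maximum(var, 0.0)
        was_projected = tf.reduce_any(tf.not_equal(projected, var))
        return projected, was_projected
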
def _resource_apply_dense(self, grad, var):
    var_dtype = var.dtype.base_dtype
    lr_t = self._decayed_lr(var_dtype)
    m = self.get_slot(var, 'm')
    v = self.get_slot(var, 'v')
    beta_1_t = self._get_hyper('beta_1', var_dtype)
    beta_2_t = self._get_hyper('beta_2', var_dtype)
    epsilon = self._get_hyper('epsilon', var_dtype)
    local_step = math_ops.cast(self.iterations + 1, var_dtype)
    beta_1_power = math_ops.pow(beta_1_t, local_step)
    beta_2_power = math_ops.pow(beta_2_t, local_step)
    if not self._amsgrad:
        return training_ops.resource_apply_adam(
            var.handle, m.handle, v.handle, beta_1_power, beta_2_power,
            lr_t, beta_1_t, beta_2_t, epsilon, grad,
            use_locking=self._use_locking)
    else:
        vhat = self.get_slot(var, 'vhat')
        return training_ops.resource_apply_adam_with_amsgrad(
            var.handle, m.handle, v.handle, vhat.handle, beta_1_power,
            beta_2_power, lr_t, beta_1_t, beta_2_t, epsilon, grad,
            use_locking=self._use_locking)

def _resource_apply_dense(self, grad, var, apply_state=None):
    var_device, var_dtype = var.device, var.dtype.base_dtype
    coefficients = ((apply_state or {}).get((var_device, var_dtype))
                    or self._fallback_apply_state(var_device, var_dtype))
    m = self.get_slot(var, 'm')
    v = self.get_slot(var, 'v')
    if not self.amsgrad:
        return training_ops.resource_apply_adam(
            var.handle, m.handle, v.handle,
            coefficients['beta_1_power'], coefficients['beta_2_power'],
            # lr_t scaled by a per-layer multiplier instead of used directly.
            coefficients['lr_t'] * self.lr_with_layer(var),
            coefficients['beta_1_t'], coefficients['beta_2_t'],
            coefficients['epsilon'], grad,
            use_locking=self._use_locking)
    else:
        vhat = self.get_slot(var, 'vhat')
        return training_ops.resource_apply_adam_with_amsgrad(
            var.handle, m.handle, v.handle, vhat.handle,
            coefficients['beta_1_power'], coefficients['beta_2_power'],
            coefficients['lr_t'] * self.lr_with_layer(var),
            coefficients['beta_1_t'], coefficients['beta_2_t'],
            coefficients['epsilon'], grad,
            use_locking=self._use_locking)

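# Hypothetical sketch of the per-layer multiplier used above, in the style of
# layer-wise learning-rate decay: layers closer to the output get larger
# rates. The "layer_<n>" naming convention, self.layer_decay, and
# self.num_layers are all assumptions for illustration; `re` is assumed
# imported.
def lr_with_layer(self, var):
    match = re.search(r"layer_(\d+)", var.name)
    if match is None:
        return 1.0
    depth_from_top = self.num_layers - int(match.group(1))
    return self.layer_decay ** depth_from_top
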
def _resource_apply_dense(self, grad, var, apply_state=None):
    var_device, var_dtype = var.device, var.dtype.base_dtype
    # Coefficients are cached per (var.name, device, dtype) so each variable
    # can carry its own 'lr' entry; this class's _prepare_local takes the
    # variable name as an extra leading argument.
    coefficients = ((apply_state or {}).get((var.name, var_device, var_dtype))
                    or self._prepare_local(var.name, var_device, var_dtype,
                                           apply_state))
    m = self.get_slot(var, 'm')
    v = self.get_slot(var, 'v')
    if not self.amsgrad:
        return training_ops.resource_apply_adam(
            var.handle, m.handle, v.handle,
            coefficients['beta_1_power'], coefficients['beta_2_power'],
            coefficients['lr'], coefficients['beta_1_t'],
            coefficients['beta_2_t'], coefficients['epsilon'], grad,
            use_locking=self._use_locking)
    else:
        vhat = self.get_slot(var, 'vhat')
        return training_ops.resource_apply_adam_with_amsgrad(
            var.handle, m.handle, v.handle, vhat.handle,
            coefficients['beta_1_power'], coefficients['beta_2_power'],
            coefficients['lr'], coefficients['beta_1_t'],
            coefficients['beta_2_t'], coefficients['epsilon'], grad,
            use_locking=self._use_locking)

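# Sketch of the matching _prepare_local override implied by the lookup above.
# The per-variable scale table (self.lr_scales) is an assumption; the 'lr'
# key and the (var.name, device, dtype) cache key come from the caller.
def _prepare_local(self, var_name, var_device, var_dtype, apply_state):
    super()._prepare_local(var_device, var_dtype, apply_state)
    local = dict(apply_state[(var_device, var_dtype)])
    local['lr'] = local['lr_t'] * self.lr_scales.get(var_name, 1.0)
    apply_state[(var_name, var_device, var_dtype)] = local
    return local
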
def _resource_apply_dense(self, grad, var, apply_state=None):
    lr_t, _, coefficients, kwargs = self._get_lr(var, apply_state)
    # Decoupled (AdamW-style) weight decay: shrink the variable first, then
    # run the Adam update once the decay op has executed.
    decay = self._decay_weights_op(var, lr_t, apply_state)
    with tf.control_dependencies([decay]):
        m = self.get_slot(var, 'm')
        v = self.get_slot(var, 'v')
        if not self.amsgrad:
            return training_ops.resource_apply_adam(
                var.handle, m.handle, v.handle,
                coefficients['beta_1_power'], coefficients['beta_2_power'],
                lr_t, coefficients['beta_1_t'], coefficients['beta_2_t'],
                coefficients['epsilon'], grad,
                use_locking=self._use_locking)
        else:
            vhat = self.get_slot(var, 'vhat')
            return training_ops.resource_apply_adam_with_amsgrad(
                var.handle, m.handle, v.handle, vhat.handle,
                coefficients['beta_1_power'], coefficients['beta_2_power'],
                lr_t, coefficients['beta_1_t'], coefficients['beta_2_t'],
                coefficients['epsilon'], grad,
                use_locking=self._use_locking)

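# A minimal sketch of the decoupled weight-decay op used above, in the spirit
# of AdamW. The real _decay_weights_op may differ; self.weight_decay_rate and
# the _do_use_weight_decay predicate (e.g. skipping bias and LayerNorm
# variables) are assumptions here.
def _decay_weights_op(self, var, lr_t, apply_state):
    if self._do_use_weight_decay(var.name):
        return var.assign_sub(lr_t * self.weight_decay_rate * var,
                              use_locking=self._use_locking)
    return tf.no_op()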