def _apply_dense(self, grad, var, state): rms = state.get_slot(var, "rms") mom = state.get_slot(var, "momentum") if self._centered: mg = state.get_slot(var, "mg") return training_ops.apply_centered_rms_prop( var, mg, rms, mom, state.get_hyper("learning_rate", var.dtype.base_dtype), state.get_hyper("rho", var.dtype.base_dtype), state.get_hyper("momentum", var.dtype.base_dtype), # epsilon is now the rms initial value and is not added to the # denominator anymore, hence calling the kernel op with epsilon=0. 0, grad, use_locking=self._use_locking).op else: return training_ops.apply_rms_prop( var, rms, mom, state.get_hyper("learning_rate", var.dtype.base_dtype), state.get_hyper("rho", var.dtype.base_dtype), state.get_hyper("momentum", var.dtype.base_dtype), 0, grad, use_locking=self._use_locking).op
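# Illustrative NumPy sketch (not part of the snippet above; names are made up)
# of the epsilon change the comment describes: the classic kernel adds epsilon
# inside the denominator every step, while the variant above folds epsilon into
# the initial rms value and passes epsilon=0 to the kernel op.
import numpy as np

eps, rho, g = 1e-10, 0.9, np.array([0.1, -0.2])
ms_old, ms_new = 0.0, eps                      # old: zero-init slot; new: eps-init slot
ms_old = rho * ms_old + (1 - rho) * g * g
ms_new = rho * ms_new + (1 - rho) * g * g
denom_old = np.sqrt(ms_old + eps)              # eps added in the denominator
denom_new = np.sqrt(ms_new)                    # eps only via the slot's initial value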
def _apply_dense(self, grad, var): rms = self.get_slot(var, "rms") mom = self.get_slot(var, "momentum") if self._centered: mg = self.get_slot(var, "mg") return training_ops.apply_centered_rms_prop( var, mg, rms, mom, math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype), math_ops.cast(self._decay_tensor, var.dtype.base_dtype), math_ops.cast(self._momentum_tensor, var.dtype.base_dtype), math_ops.cast(self._epsilon_tensor, var.dtype.base_dtype), grad, use_locking=self._use_locking).op else: return training_ops.apply_rms_prop( var, rms, mom, math_ops.cast(self._learning_rate_tensor, var.dtype.base_dtype), math_ops.cast(self._decay_tensor, var.dtype.base_dtype), math_ops.cast(self._momentum_tensor, var.dtype.base_dtype), math_ops.cast(self._epsilon_tensor, var.dtype.base_dtype), grad, use_locking=self._use_locking).op
def _apply_dense(self, grad, var, state): rms = state.get_slot(var, "rms") mom = state.get_slot(var, "momentum") if self._centered: mg = state.get_slot(var, "mg") return training_ops.apply_centered_rms_prop( var, mg, rms, mom, state.get_hyper("learning_rate", var.dtype.base_dtype), state.get_hyper("decay", var.dtype.base_dtype), state.get_hyper("momentum", var.dtype.base_dtype), state.get_hyper("epsilon", var.dtype.base_dtype), grad, use_locking=self._use_locking).op else: return training_ops.apply_rms_prop( var, rms, mom, state.get_hyper("learning_rate", var.dtype.base_dtype), state.get_hyper("decay", var.dtype.base_dtype), state.get_hyper("momentum", var.dtype.base_dtype), state.get_hyper("epsilon", var.dtype.base_dtype), grad, use_locking=self._use_locking).op
def _apply_dense(self, variable, grad):
    rms = self._get_slot(variable, 'rms')
    momentum = self._get_slot(variable, 'momentum')
    return training_ops.apply_rms_prop(
        variable, rms, momentum,
        self._learning_rate_tensor,
        self._decay_tensor,
        self._momentum_tensor,
        self._epsilon_tensor,
        grad,
        use_locking=False).op
def _apply_dense(self, grad, var): rms = self.get_slot(var, "rms") mom = self.get_slot(var, "momentum") return training_ops.apply_rms_prop( var, rms, mom, self._learning_rate_tensor, self._decay_tensor, self._momentum_tensor, self._epsilon_tensor, grad, use_locking=self._use_locking).op
def applyDense(self, grad, var):
    rms = self.getSlot(var, "rms")
    mom = self.getSlot(var, "momentum")
    return training_ops.apply_rms_prop(
        var, rms, mom,
        self.learningRateTensor,
        self.decayTensor,
        self.momentumTensor,
        self.epsilonTensor,
        grad,
        use_locking=False).op
def _apply_dense(self, grad, var): rms = self.get_slot(var, "rms") #Ex - <tf.Variable 'navigation/W_fc/RMSPropApplier:0' shape=(8192, 512) dtype=float32_ref> mom = self.get_slot(var, "momentum") #get the momentrom slots #ms <- rho * ms_{t-1} + (1-rho) * grad * grad mom <- momentum * mom_{t-1} + lr * grad / sqrt(ms + epsilon) var <- var - mom return training_ops.apply_rms_prop( #we apply the gradiets directly to the master var, rms, mom, self._learning_rate_tensor, self._decay_tensor, self._momentum_tensor, self._epsilon_tensor, grad, use_locking=False).op
def _apply_dense(self, grad, var): """ 应用梯度以及动量, 注意没有考虑到 local AC 之间的同步问题. TODO: 后期需要加上多线程同步 :param grad: :param var: :return: """ rms = self.get_slot(var, "rms") mom = self.get_slot(var, "momentum") return training_ops.apply_rms_prop(var, rms, mom, self._learning_rate_tensor, self._decay_tensor, self._momentum_tensor, self._epsilon_tensor, grad, use_locking=False).op
def _apply_dense(self, grad, var): rms = self.get_slot(var, "rms") mom = self.get_slot(var, "momentum") eps = self.get_slot(var, 'eps') tf.summary.scalar('grad_norm', tf.norm(grad)) # debug_here() if 'orthogonal_stiefel' in var.name and 'bias' not in var.name: with tf.variable_scope("orthogonal_update"): print('Appling an orthogonality preserving step to', var.name) # apply the rms update rule. new_rms = self._decay_tensor * rms + (1. - self._decay_tensor) \ * tf.square(grad) rms_assign_op = tf.assign(rms, new_rms) # scale the gradient. if self._nat_grad_normalization: grad = grad / (tf.sqrt(rms) + eps) # the update should preserve orthogonality. grad_shape = tf.Tensor.get_shape(grad).as_list() # W_new_lst = [] eye = tf.eye(grad_shape[0], dtype=tf.float32) G = grad W = var # Reunitarize after n steps. if self._qr_steps is not None: W = tf.cond(tf.equal(tf.mod(self._global_step_tensor, self._qr_steps), 0), lambda: self.re_unitarize(W), lambda: W) # A = tf.matmul(tf.transpose(G), W) - tf.matmul(tf.transpose(W), G) A = tf.matmul(G, tf.transpose(W)) - tf.matmul(W, tf.transpose(G)) cayleyDenom = eye + (self._learning_rate_tensor/2.0) * A cayleyNumer = eye - (self._learning_rate_tensor/2.0) * A C = tf.matmul(tf.matrix_inverse(cayleyDenom), cayleyNumer) W_new = tf.matmul(C, W) if self._debug: # self._summary_A(A) self._summary_C(C) self._summary_W(W) var_update_op = tf.assign(var, W_new) return tf.group(*[var_update_op, rms_assign_op]) elif 'unitary_stiefel' in var.name and 'bias' not in var.name: with tf.variable_scope("unitary_update"): print('Appling an unitarity preserving step to', var.name) # apply the rms update rule. new_rms = self._decay_tensor * rms + (1. - self._decay_tensor) \ * tf.square(grad) rms_assign_op = tf.assign(rms, new_rms) # scale the gradient. if self._nat_grad_normalization: grad = grad / (tf.sqrt(new_rms) + eps) # do an update step, which preserves unitary structure. # checking shapes. grad_shape = tf.Tensor.get_shape(grad).as_list() assert grad_shape[0] == grad_shape[1] eye = tf.eye(grad_shape[0], dtype=tf.complex64) G = tf.complex(grad[:, :, 0], grad[:, :, 1]) W = tf.complex(var[:, :, 0], var[:, :, 1]) # Reunitarize after n steps. if self._qr_steps is not None: W = tf.cond(tf.equal(tf.mod(self._global_step_tensor, self._qr_steps), 0), lambda: self.re_unitarize(W), lambda: W) A = tf.matmul(G, tf.conj(tf.transpose(W))) \ - tf.matmul(W, tf.conj(tf.transpose(G))) # A must be skew symmetric. larning_rate_scale = tf.complex(self._learning_rate_tensor/2.0, tf.zeros_like(self._learning_rate_tensor)) cayleyDenom = eye + larning_rate_scale * A cayleyNumer = eye - larning_rate_scale * A C = tf.matmul(tf.matrix_inverse(cayleyDenom), cayleyNumer) W_new = tf.matmul(C, W) if self._debug: # self._summary_A(A) self._summary_C(C) self._summary_W(W) # debug_here() W_new_re = tf.real(W_new) W_new_img = tf.imag(W_new) W_array = tf.stack([W_new_re, W_new_img], -1) var_update_op = tf.assign(var, W_array) return tf.group(*[var_update_op, rms_assign_op]) else: # do the usual RMSprop update rms = False if rms: if 1: # tensorflow default. print('Appling standard rmsprop to', var.name) return training_ops.apply_rms_prop( var, rms, mom, tf.cast(self._learning_rate_tensor, var.dtype.base_dtype), tf.cast(self._decay_tensor, var.dtype.base_dtype), tf.cast(self._momentum_tensor, var.dtype.base_dtype), tf.cast(self._epsilon_tensor, var.dtype.base_dtype), grad, use_locking=False).op else: # My rmsprop implementation. new_rms = self._decay_tensor * rms \ + (1. 
- self._decay_tensor) * tf.square(grad) rms_assign_op = tf.assign(rms, new_rms) W_new = var - self._learning_rate_tensor * grad \ / (tf.sqrt(new_rms) + eps) var_update_op = tf.assign(var, W_new) return tf.group(*[var_update_op, rms_assign_op]) else: print('Appling default gradient descent to', var.name) return training_ops.apply_gradient_descent( var, tf.cast(self._learning_rate_tensor, var.dtype.base_dtype), grad, use_locking=False).op
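# Small NumPy check (illustrative only, not from the snippet above) of the
# Cayley step used there: for a skew-symmetric A, the transform
# C = (I + (lr/2) A)^{-1} (I - (lr/2) A) is orthogonal, so W_new = C @ W stays
# orthogonal whenever W is. Shapes and the learning rate here are made up.
import numpy as np

rng = np.random.default_rng(0)
n, lr = 4, 0.1
W, _ = np.linalg.qr(rng.standard_normal((n, n)))   # start from an orthogonal W
G = rng.standard_normal((n, n))                    # stand-in for a gradient
A = G @ W.T - W @ G.T                              # skew-symmetric by construction
I = np.eye(n)
C = np.linalg.inv(I + (lr / 2.0) * A) @ (I - (lr / 2.0) * A)
W_new = C @ W
print(np.allclose(W_new.T @ W_new, I))             # True up to float tolerance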