def get_updates(self, loss, params): grads = self.get_gradients(loss, params) self.updates = [K.update_add(self.iterations, 1)] self.weights = [self.iterations] lr = self.learning_rate for i, (p, g) in enumerate(zip(params, grads)): g2 = K.square(g) + self.epsilon1 shape, dtype = K.int_shape(p), K.dtype(p) factored_shape = self.factored_shape(shape) if factored_shape is None: # 定义参数 v = K.zeros(shape, dtype=dtype, name='v_' + str(i)) self.weights.append(v) # 定义更新 v_t = self.beta2 * v + (1.0 - self.beta2) * g2 self.updates.append(K.update(v, v_t)) else: # 定义参数 shape1, axis1, shape2, axis2 = factored_shape vr = K.zeros(shape1, dtype=dtype, name='vr_' + str(i)) vc = K.zeros(shape2, dtype=dtype, name='vc_' + str(i)) self.weights.extend([vr, vc]) # 定义更新 vr_t = self.beta2 * vr + K.mean(g2, axis=axis1, keepdims=True) vc_t = self.beta2 * vc + K.mean(g2, axis=axis2, keepdims=True) self.updates.extend([K.update(vr, vr_t), K.update(vc, vc_t)]) # 合成矩阵 v_t = vr_t * vc_t / K.mean(vr_t, axis=axis2, keepdims=True) # 增量主体 u = g / K.sqrt(v_t) # 增量裁剪 if self.clipping_threshold is not None: u_rms = K.mean(K.sum(K.square(u))) d = self.clipping_threshold u = u / K.maximum(1.0, u_rms / d) # 增量滑动 if self.beta1 > 0.0: # 定义参数 m = K.zeros(shape, dtype=dtype, name='m_' + str(i)) self.weights.append(m) # 定义更新 m_t = self.beta1 * m + (1.0 - self.beta1) * u self.updates.append(K.update(m, m_t)) u = m_t # 增量调整 if self.multiply_by_parameter_scale: u = u * K.maximum(K.mean(K.sum(K.square(p))), self.epsilon2) # 更新参数 self.updates.append(K.update(p, p - lr * u)) return self.updates
def call(self, inputs): """如果是条件Layer Norm,则默认以list为输入,第二个是condition """ if self.conditional: inputs, cond = inputs if self.hidden_units is not None: cond = self.hidden_dense(cond) for _ in range(K.ndim(inputs) - K.ndim(cond)): cond = K.expand_dims(cond, 1) if self.center: beta = self.beta_dense(cond) + self.beta if self.scale: gamma = self.gamma_dense(cond) + self.gamma else: if self.center: beta = self.beta if self.scale: gamma = self.gamma outputs = inputs if self.center: mean = K.mean(outputs, axis=-1, keepdims=True) outputs = outputs - mean if self.scale: variance = K.mean(K.square(outputs), axis=-1, keepdims=True) std = K.sqrt(variance + self.epsilon) outputs = outputs / std outputs = outputs * gamma if self.center: outputs = outputs + beta return outputs
def _resource_apply(self, grad, var, indices=None): lr = self.learning_rate g2 = K.square(grad) + self.epsilon1 shape = K.int_shape(var) factored_shape = self.factored_shape(shape) if factored_shape is None: v = self.get_slot(var, 'v') # 定义更新 v_t = self.beta2 * v + (1.0 - self.beta2) * g2 v_t = K.update(v, v_t) else: shape1, axis1, shape2, axis2 = factored_shape vr = self.get_slot(var, 'vr') vc = self.get_slot(var, 'vc') # 定义更新 vr_t = self.beta2 * vr + K.mean(g2, axis=axis1, keepdims=True) vc_t = self.beta2 * vc + K.mean(g2, axis=axis2, keepdims=True) vr_t, vc_t = K.update(vr, vr_t), K.update(vc, vc_t) # 合成矩阵 v_t = vr_t * vc_t / K.mean(vr_t, axis=axis2, keepdims=True) # 增量主体 u = grad / K.sqrt(v_t) # 增量裁剪 if self.clipping_threshold is not None: u_rms = K.mean(K.sum(K.square(u))) d = self.clipping_threshold u = u / K.maximum(1.0, u_rms / d) # 增量滑动 if self.beta1 > 0.0: m = self.get_slot(var, 'm') # 定义更新 m_t = self.beta1 * m + (1.0 - self.beta1) * u u = K.update(m, m_t) # 增量调整 if self.multiply_by_parameter_scale: u = u * K.maximum(K.mean(K.sum(K.square(var))), self.epsilon2) # 更新参数 return K.update(var, var - lr * u)