Example #1
    def _resource_apply(self, grad, var, indices=None):
        # Prepare variables
        var_dtype = var.dtype.base_dtype
        lr_t = self._decayed_lr(var_dtype)
        m = self.get_slot(var, 'm')
        v = self.get_slot(var, 'v')
        beta_1_t = self._get_hyper('beta_1', var_dtype)
        beta_2_t = self._get_hyper('beta_2', var_dtype)
        epsilon_t = K.cast(self.epsilon, var_dtype)
        local_step = K.cast(self.iterations + 1, var_dtype)
        beta_1_t_power = K.pow(beta_1_t, local_step)
        beta_2_t_power = K.pow(beta_2_t, local_step)

        # Update rules for the moment estimates
        if indices is None:
            m_t = K.update(m, beta_1_t * m + (1 - beta_1_t) * grad)
            v_t = K.update(v, beta_2_t * v + (1 - beta_2_t) * grad**2)
        else:
            mv_ops = [K.update(m, beta_1_t * m), K.update(v, beta_2_t * v)]
            with tf.control_dependencies(mv_ops):
                m_t = self._resource_scatter_add(m, indices,
                                                 (1 - beta_1_t) * grad)
                v_t = self._resource_scatter_add(v, indices,
                                                 (1 - beta_2_t) * grad**2)

        # Return the update op
        with tf.control_dependencies([m_t, v_t]):
            if self.bias_correction:
                m_t = m_t / (1.0 - beta_1_t_power)
                v_t = v_t / (1.0 - beta_2_t_power)
            var_t = var - lr_t * m_t / (K.sqrt(v_t) + epsilon_t)
            return K.update(var, var_t)
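
The dense branch above is the standard Adam update with optional bias correction; the sparse branch applies the same rule through scatter adds. A minimal NumPy sketch of one dense step follows (the helper name adam_step and the default hyperparameters are illustrative, not taken from the snippet):

import numpy as np

def adam_step(var, grad, m, v, t, lr=1e-3, beta_1=0.9, beta_2=0.999,
              epsilon=1e-7, bias_correction=True):
    # Exponential moving averages of the gradient and its square,
    # mirroring the `indices is None` branch above.
    m = beta_1 * m + (1 - beta_1) * grad
    v = beta_2 * v + (1 - beta_2) * grad ** 2
    m_hat, v_hat = m, v
    if bias_correction:
        # Correct the zero-initialization bias (t is the 1-based step),
        # as in the `bias_correction` block above.
        m_hat = m / (1 - beta_1 ** t)
        v_hat = v / (1 - beta_2 ** t)
    var = var - lr * m_hat / (np.sqrt(v_hat) + epsilon)
    return var, m, v

var, m, v = np.ones(3), np.zeros(3), np.zeros(3)
var, m, v = adam_step(var, grad=np.full(3, 0.5), m=m, v=v, t=1)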
Example #2
    def call(self, inputs):
        """如果是条件Layer Norm,则默认以list为输入,第二个是condition
        """
        if self.conditional:
            inputs, cond = inputs
            if self.hidden_units is not None:
                cond = self.hidden_dense(cond)
            for _ in range(K.ndim(inputs) - K.ndim(cond)):
                cond = K.expand_dims(cond, 1)
            if self.center:
                beta = self.beta_dense(cond) + self.beta
            if self.scale:
                gamma = self.gamma_dense(cond) + self.gamma
        else:
            if self.center:
                beta = self.beta
            if self.scale:
                gamma = self.gamma

        outputs = inputs
        if self.center:
            mean = K.mean(outputs, axis=-1, keepdims=True)
            outputs = outputs - mean
        if self.scale:
            variance = K.mean(K.square(outputs), axis=-1, keepdims=True)
            std = K.sqrt(variance + self.epsilon)
            outputs = outputs / std
            outputs = outputs * gamma
        if self.center:
            outputs = outputs + beta

        return outputs
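
Without a condition the layer reduces to ordinary Layer Normalization over the last axis; in conditional mode, beta and gamma are additionally shifted by dense projections of the condition. A standalone NumPy sketch of the unconditional path (layer_norm is an illustrative helper and the epsilon default is an assumption, not part of the original layer):

import numpy as np

def layer_norm(x, gamma, beta, epsilon=1e-12):
    # Center over the last axis, scale to unit variance, then apply
    # the learned gamma/beta, as in the `call` above.
    mean = x.mean(axis=-1, keepdims=True)
    x = x - mean
    variance = np.mean(np.square(x), axis=-1, keepdims=True)
    x = x / np.sqrt(variance + epsilon)
    return x * gamma + beta

x = np.random.randn(2, 4, 8)
out = layer_norm(x, gamma=np.ones(8), beta=np.zeros(8))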
Example #3
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]
        self.weights = [self.iterations]
        lr = self.learning_rate

        for i, (p, g) in enumerate(zip(params, grads)):
            g2 = K.square(g) + self.epsilon1
            shape, dtype = K.int_shape(p), K.dtype(p)
            factored_shape = self.factored_shape(shape)
            if factored_shape is None:
                # Define slot variable
                v = K.zeros(shape, dtype=dtype, name='v_' + str(i))
                self.weights.append(v)
                # Define update
                v_t = self.beta2 * v + (1.0 - self.beta2) * g2
                self.updates.append(K.update(v, v_t))
            else:
                # Define slot variables
                shape1, axis1, shape2, axis2 = factored_shape
                vr = K.zeros(shape1, dtype=dtype, name='vr_' + str(i))
                vc = K.zeros(shape2, dtype=dtype, name='vc_' + str(i))
                self.weights.extend([vr, vc])
                # Define updates
                vr_t = self.beta2 * vr + K.mean(g2, axis=axis1, keepdims=True)
                vc_t = self.beta2 * vc + K.mean(g2, axis=axis2, keepdims=True)
                self.updates.extend([K.update(vr, vr_t), K.update(vc, vc_t)])
                # Recompose the full second-moment matrix from the factors
                v_t = vr_t * vc_t / K.mean(vr_t, axis=axis2, keepdims=True)
            # Main update term
            u = g / K.sqrt(v_t)
            # Update clipping (by the RMS of the update)
            if self.clipping_threshold is not None:
                u_rms = K.sqrt(K.mean(K.square(u)))
                d = self.clipping_threshold
                u = u / K.maximum(1.0, u_rms / d)
            # Moving average of the update (momentum)
            if self.beta1 > 0.0:
                # Define slot variable
                m = K.zeros(shape, dtype=dtype, name='m_' + str(i))
                self.weights.append(m)
                # Define update
                m_t = self.beta1 * m + (1.0 - self.beta1) * u
                self.updates.append(K.update(m, m_t))
                u = m_t
            # Scale the update by the parameter scale
            if self.multiply_by_parameter_scale:
                u = u * K.maximum(K.sqrt(K.mean(K.square(p))), self.epsilon2)
            # Apply the parameter update
            self.updates.append(K.update(p, p - lr * u))

        return self.updates
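
The memory saving comes from the factored branch: for matrix parameters only row statistics vr and column statistics vc are stored, and the full second moment is reconstructed as their product divided by the overall mean of vr. A NumPy sketch of that reconstruction (reconstruct_v is an illustrative name, not from the source):

import numpy as np

def reconstruct_v(vr, vc):
    # vr: (rows, 1) row means of g**2, vc: (1, cols) column means of g**2.
    # Their product divided by the overall mean of vr approximates the full
    # (rows, cols) matrix, matching
    # v_t = vr_t * vc_t / K.mean(vr_t, axis=axis2, keepdims=True) above.
    return vr * vc / vr.mean(axis=0, keepdims=True)

g2 = np.random.rand(4, 6) ** 2
vr = g2.mean(axis=1, keepdims=True)   # shape (4, 1)
vc = g2.mean(axis=0, keepdims=True)   # shape (1, 6)
v = reconstruct_v(vr, vc)             # shape (4, 6); exact when g2 has rank 1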
Example #4
 def learning_rate(self):
     if self._learning_rate is None:
         # No explicit learning rate: fall back to a 1/sqrt(step) schedule
         # capped at 0.01, scaled by 0.05 unless updates are already
         # multiplied by the parameter scale.
         iterations = K.cast(self.iterations + 1, K.floatx())
         learning_rate = K.minimum(1.0 / K.sqrt(iterations), 0.01)
         if self.multiply_by_parameter_scale:
             return learning_rate
         else:
             return learning_rate * 0.05
     else:
         # Cache the fixed learning rate as a backend variable. A single
         # underscore avoids name mangling, which would make the hasattr()
         # check fail and recreate the variable on every access.
         if not hasattr(self, '_learning_rate_variable'):
             with K.name_scope(self.__class__.__name__):
                 self._learning_rate_variable = K.variable(
                     self._learning_rate, name='learning_rate')
         return self._learning_rate_variable
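
When no learning rate is passed, the property therefore yields min(1/sqrt(t), 0.01) at step t. A standalone sketch of that schedule (default_lr is an illustrative name, not part of the original class):

import numpy as np

def default_lr(iterations, multiply_by_parameter_scale=True):
    # Matches the `self._learning_rate is None` branch above.
    lr = min(1.0 / np.sqrt(iterations + 1.0), 0.01)
    return lr if multiply_by_parameter_scale else lr * 0.05

print(default_lr(0), default_lr(10**6))  # 0.01 at the start, ~0.001 after 1e6 steps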
Example #5
 def _resource_apply(self, grad, var, indices=None):
     lr = self.learning_rate
     g2 = K.square(grad) + self.epsilon1
     shape = K.int_shape(var)
     factored_shape = self.factored_shape(shape)
     if factored_shape is None:
         v = self.get_slot(var, 'v')
         # Define update
         v_t = self.beta2 * v + (1.0 - self.beta2) * g2
         v_t = K.update(v, v_t)
     else:
         shape1, axis1, shape2, axis2 = factored_shape
         vr = self.get_slot(var, 'vr')
         vc = self.get_slot(var, 'vc')
         # Define updates
         vr_t = self.beta2 * vr + K.mean(g2, axis=axis1, keepdims=True)
         vc_t = self.beta2 * vc + K.mean(g2, axis=axis2, keepdims=True)
         vr_t, vc_t = K.update(vr, vr_t), K.update(vc, vc_t)
         # Recompose the full second-moment matrix from the factors
         v_t = vr_t * vc_t / K.mean(vr_t, axis=axis2, keepdims=True)
     # Main update term
     u = grad / K.sqrt(v_t)
     # Update clipping (by the RMS of the update)
     if self.clipping_threshold is not None:
         u_rms = K.sqrt(K.mean(K.square(u)))
         d = self.clipping_threshold
         u = u / K.maximum(1.0, u_rms / d)
     # Moving average of the update (momentum)
     if self.beta1 > 0.0:
         m = self.get_slot(var, 'm')
         # Define update
         m_t = self.beta1 * m + (1.0 - self.beta1) * u
         u = K.update(m, m_t)
     # Scale the update by the parameter scale
     if self.multiply_by_parameter_scale:
         u = u * K.maximum(K.sqrt(K.mean(K.square(var))), self.epsilon2)
     # Apply the parameter update
     return K.update(var, var - lr * u)
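
Example #5 is the resource-variable counterpart of Example #3, reading v/vr/vc/m from optimizer slots instead of keeping its own weight list. Its distinctive step is the RMS-based update clipping; a NumPy sketch follows (clip_update is an illustrative name, and the threshold default of 1.0 is an assumption):

import numpy as np

def clip_update(u, clipping_threshold=1.0):
    # Shrink the update only when its root-mean-square exceeds the threshold.
    u_rms = np.sqrt(np.mean(np.square(u)))
    return u / max(1.0, u_rms / clipping_threshold)

u = np.array([0.5, -2.0, 3.0])
print(clip_update(u))  # RMS is about 2.1 > 1.0, so the update is scaled down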