Example #1
        def _resource_apply_op(self, grad, var, indices=None):
            # Update criterion: apply the accumulated gradient every grad_accum_steps steps
            cond = K.equal(self.iterations % self.grad_accum_steps, 0)
            # Fetch the accumulated-gradient slot
            ag = self.get_slot(var, 'ag')

            old_update = K.update

            def new_update(x, new_x):
                new_x = K.switch(cond, new_x, x)
                return old_update(x, new_x)

            K.update = new_update
            ag_t = ag / self.grad_accum_steps
            op = super(new_optimizer, self)._resource_apply_op(ag_t, var)
            K.update = old_update

            # Accumulate the gradient
            with tf.control_dependencies([op]):
                ag_t = K.switch(cond, K.zeros_like(ag), ag)
                with tf.control_dependencies([K.update(ag, ag_t)]):
                    if indices is None:
                        ag_t = K.update(ag, ag + grad)
                    else:
                        ag_t = self._resource_scatter_add(ag, indices, grad)

            return ag_t
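A minimal usage sketch for this gradient-accumulation wrapper, assuming it is exposed through a factory like the extend_with_gradient_accumulation used in Example #7 below; the effective batch size becomes batch_size × grad_accum_steps:

    # Hypothetical usage: wrap a base optimizer with gradient accumulation.
    # extend_with_gradient_accumulation and Adam are assumed to come from the
    # surrounding library, as in Example #7.
    AccumAdam = extend_with_gradient_accumulation(Adam)
    optimizer = AccumAdam(learning_rate=1e-5, grad_accum_steps=4)
    model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer)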
Example #2
    def _resource_apply_op(self, grad, var, indices=None):
        # Prepare variables
        var_dtype = var.dtype.base_dtype
        lr_t = self._decayed_lr(var_dtype)
        m = self.get_slot(var, 'm')
        v = self.get_slot(var, 'v')
        beta_1_t = self._get_hyper('beta_1', var_dtype)
        beta_2_t = self._get_hyper('beta_2', var_dtype)
        epsilon_t = K.cast(self.epsilon, var_dtype)
        local_step = K.cast(self.iterations + 1, var_dtype)
        beta_1_t_power = K.pow(beta_1_t, local_step)
        beta_2_t_power = K.pow(beta_2_t, local_step)

        # Moment update formulas
        if indices is None:
            m_t = K.update(m, beta_1_t * m + (1 - beta_1_t) * grad)
            v_t = K.update(v, beta_2_t * v + (1 - beta_2_t) * grad**2)
        else:
            mv_ops = [K.update(m, beta_1_t * m), K.update(v, beta_2_t * v)]
            with tf.control_dependencies(mv_ops):
                m_t = self._resource_scatter_add(m, indices,
                                                 (1 - beta_1_t) * grad)
                v_t = self._resource_scatter_add(v, indices,
                                                 (1 - beta_2_t) * grad**2)

        # Return the update op
        with tf.control_dependencies([m_t, v_t]):
            if self.bias_correction:
                m_t = m_t / (1. - beta_1_t_power)
                v_t = v_t / (1. - beta_2_t_power)
            var_t = var - lr_t * m_t / (K.sqrt(v_t) + epsilon_t)
            return K.update(var, var_t)
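For reference, a sketch in LaTeX notation of the update rule the code above realizes, i.e. the standard Adam rule (the hatted bias correction is applied only when self.bias_correction is set):

    m_t = \beta_1 m_{t-1} + (1-\beta_1)\,g_t, \qquad v_t = \beta_2 v_{t-1} + (1-\beta_2)\,g_t^2
    \hat{m}_t = \frac{m_t}{1-\beta_1^{\,t}}, \qquad \hat{v}_t = \frac{v_t}{1-\beta_2^{\,t}}
    \theta_t = \theta_{t-1} - \frac{\eta_t\,\hat{m}_t}{\sqrt{\hat{v}_t} + \epsilon}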
Example #3
 def sparse_loss(self, y_true, y_pred):
     """y_true需要是整数形式(非one hot)
     """
     # y_true需要重新明确一下shape和dtype
     y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
     y_true = K.cast(y_true, 'int32')
     # Convert to one-hot
     y_true = K.one_hot(y_true, K.shape(self.trans)[0])
     return self.dense_loss(y_true, y_pred)
Example #4
    def build(self, input_shape):
        output_dim = input_shape[-1]
        if not isinstance(output_dim, int):
            output_dim = output_dim.value

        if self.hidden_dim is None:
            self.trans = self.add_weight(name='trans',
                                         shape=(output_dim, output_dim),
                                         initializer='glorot_uniform',
                                         trainable=True)
            if self.lr_multiplier != 1:
                K.set_value(self.trans,
                            K.eval(self.trans) / self.lr_multiplier)
                self.trans = self.lr_multiplier * self.trans
        else:
            self.l_trans = self.add_weight(name='l_trans',
                                           shape=(output_dim, self.hidden_dim),
                                           initializer='glorot_uniform',
                                           trainable=True)
            self.r_trans = self.add_weight(name='r_trans',
                                           shape=(output_dim, self.hidden_dim),
                                           initializer='glorot_uniform',
                                           trainable=True)

            if self.lr_multiplier != 1:
                K.set_value(self.l_trans,
                            K.eval(self.l_trans) / self.lr_multiplier)
                self.l_trans = self.lr_multiplier * self.l_trans
                K.set_value(self.r_trans,
                            K.eval(self.r_trans) / self.lr_multiplier)
                self.r_trans = self.lr_multiplier * self.r_trans
Example #5
    def call(self, inputs):
        input_shape = K.shape(inputs)
        batch_size, seq_len = input_shape[0], input_shape[1]
        pos_embeddings = self.embeddings[:seq_len]
        pos_embeddings = K.expand_dims(pos_embeddings, 0)

        if self.merge_mode == 'add':
            return inputs + pos_embeddings
        else:
            pos_embeddings = K.tile(pos_embeddings, [batch_size, 1, 1])
            return K.concatenate([inputs, pos_embeddings])
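A small NumPy sketch of why the 'add' branch can rely on broadcasting while the concat branch must tile first (shapes are illustrative):

    import numpy as np

    x = np.zeros((2, 4, 8))           # inputs: (batch, seq_len, dim)
    pos = np.ones((1, 4, 8))          # embeddings after expand_dims(..., 0)
    added = x + pos                   # 'add': broadcasts over batch -> (2, 4, 8)
    tiled = np.tile(pos, (2, 1, 1))   # concat needs an explicit batch dimension
    concat = np.concatenate([x, tiled], axis=-1)  # -> (2, 4, 16)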
Example #6
 def new_update(x, new_x):
     if is_one_of(x, params) and self._do_layer_adaptation(x):
         dx = new_x - x
         lr_t = K.clip(self.learning_rate, K.epsilon(), 1e10)
         x_norm = tf.norm(x)
         g_norm = tf.norm(dx / lr_t)
         ratio = K.switch(
             x_norm > 0.,
             K.switch(g_norm > K.epsilon(), x_norm / g_norm, 1.),
             1.)
         new_x = x + dx * ratio
     return old_update(x, new_x)
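The clipping and nested switches above implement a LARS/LAMB-style layer-wise trust ratio; a sketch of the math the code realizes, writing Δx = new_x − x and η for the clipped learning rate:

    r = \begin{cases} \lVert x\rVert \,/\, \lVert \Delta x/\eta \rVert, & \lVert x\rVert > 0 \text{ and } \lVert \Delta x/\eta \rVert > \epsilon \\ 1, & \text{otherwise} \end{cases}
    \qquad x_{\text{new}} = x + r\,\Delta x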
Example #7
 def compile(self):
     # Cross-entropy loss, masking out predictions over the input segment
     y_true = self.model.input[0][:, 1:]  # target tokens
     y_mask = self.model.input[1][:, 1:]  # target mask
     y_mask = K.cast(y_mask, K.floatx())  # cast to float
     y_pred = self.model.output[:, :-1]  # predicted tokens, offset one step from the targets
     cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred)
     cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)
     self.model.add_loss(cross_entropy)
     opt = extend_with_gradient_accumulation(Adam)(learning_rate=0.000015,
                                                   grad_accum_steps=2)
     self.model.compile(optimizer=opt)
Example #8
 def new_update(x, new_x):
     if x is var and self._do_lazy_optimization(x):
         if indices is None:
             r = K.any(K.not_equal(grad, 0.),
                       axis=-1,
                       keepdims=True)
             new_x = x + (new_x - x) * K.cast(r, K.floatx())
             return old_update(x, new_x)
         else:
             return self._resource_scatter_add(
                 x, indices, K.gather(new_x - x, indices))
     return old_update(x, new_x)
Example #9
 def log_norm_step(self, inputs, states):
     """递归计算归一化因子
     要点:1、递归计算;2、用logsumexp避免溢出。
     """
     inputs, mask = inputs[:, :-1], inputs[:, -1:]
     states = K.expand_dims(states[0], 2)  # (batch_size, output_dim, 1)
     trans = K.expand_dims(self.trans, 0)  # (1, output_dim, output_dim)
     outputs = tf.reduce_logsumexp(states + trans,
                                   1)  # (batch_size, output_dim)
     outputs = outputs + inputs
     outputs = mask * outputs + (1 - mask) * states[:, :, 0]
     return outputs, [outputs]
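In formulas, writing h_t for the emission scores at step t (the inputs above) and T for self.trans, one step of the recursion computes, for each target label j:

    \log Z_t(j) = h_t(j) + \log\sum_i \exp\bigl(\log Z_{t-1}(i) + T_{ij}\bigr)

with the mask keeping log Z_{t-1} unchanged at padded positions.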
Example #10
 def new_update(x, new_x):
     if x is var and self._do_layer_adaptation(x):
         dx = new_x - x
         lr_t = self._decayed_lr(x.dtype.base_dtype)
         lr_t = K.clip(lr_t, K.epsilon(), 1e10)
         x_norm = tf.norm(x)
         g_norm = tf.norm(dx / lr_t)
         ratio = K.switch(
             x_norm > 0.,
             K.switch(g_norm > K.epsilon(), x_norm / g_norm, 1.),
             1.)
         new_x = x + dx * ratio
     return old_update(x, new_x)
Example #11
 def masked_cross_entropy(y_true, y_pred):
     """交叉熵作为loss,并mask掉padding部分的预测
     """
     y_true = K.reshape(y_true, [K.shape(y_true)[0], -1])
     y_mask = K.cast(K.not_equal(y_true, 0), K.floatx())
     cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred)
     cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)
     return cross_entropy
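A minimal usage sketch, assuming a model whose integer targets use 0 as the padding id (which is what the mask construction above relies on):

    # Hypothetical usage: pass the masked loss directly to compile.
    model.compile(loss=masked_cross_entropy, optimizer='adam')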
Example #12
 def __init__(self,
              center=True,
              scale=True,
              conditional=False,
              hidden_units=None,
              hidden_activation='linear',
              hidden_initializer='glorot_uniform',
              **kwargs):
     super(LayerNormalization, self).__init__(**kwargs)
     self.center = center
     self.scale = scale
     self.conditional = conditional
     self.hidden_units = hidden_units
     self.hidden_activation = activations.get(hidden_activation)
     self.hidden_initializer = initializers.get(hidden_initializer)
     self.epsilon = K.epsilon() * K.epsilon()
Example #13
        def _resource_apply_op(self, grad, var, indices=None):
            op = super(new_optimizer,
                       self)._resource_apply_op(grad, var, indices)

            k, alpha = self.steps_per_slow_update, self.slow_step_size
            cond = K.equal(self.iterations % k, 0)
            slow_var = self.get_slot(var, 'slow_var')
            slow_var_t = slow_var + alpha * (var - slow_var)

            with tf.control_dependencies([op]):
                slow_update = K.update(slow_var,
                                       K.switch(cond, slow_var_t, slow_var))
                with tf.control_dependencies([slow_update]):
                    copy_update = K.update(var, K.switch(cond, slow_var, var))

            return copy_update
Example #14
    def call(self, inputs):
        """如果是条件Layer Norm,则默认以list为输入,第二个是condition
        """
        if self.conditional:
            inputs, cond = inputs
            if self.hidden_units is not None:
                cond = self.hidden_dense(cond)
            for _ in range(K.ndim(inputs) - K.ndim(cond)):
                cond = K.expand_dims(cond, 1)
            if self.center:
                beta = self.beta_dense(cond) + self.beta
            if self.scale:
                gamma = self.gamma_dense(cond) + self.gamma
        else:
            if self.center:
                beta = self.beta
            if self.scale:
                gamma = self.gamma

        outputs = inputs
        if self.center:
            mean = K.mean(outputs, axis=-1, keepdims=True)
            outputs = outputs - mean
        if self.scale:
            variance = K.mean(K.square(outputs), axis=-1, keepdims=True)
            std = K.sqrt(variance + self.epsilon)
            outputs = outputs / std
            outputs = outputs * gamma
        if self.center:
            outputs = outputs + beta

        return outputs
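A minimal usage sketch of the conditional branch, assuming the class above is available as LayerNormalization and tf.keras is in scope; the input shapes and the hidden_units value are illustrative:

    from tensorflow import keras

    x = keras.layers.Input(shape=(128, 768))  # token features
    c = keras.layers.Input(shape=(64,))       # sentence-level condition vector
    # hidden_units first projects the condition before it produces beta/gamma.
    y = LayerNormalization(conditional=True, hidden_units=768)([x, c])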
Example #15
        def get_updates(self, loss, params):
            # Update criterion
            cond = K.equal(self.iterations % self.grad_accum_steps, 0)
            cond = K.cast(cond, K.floatx())
            # Fetch gradients and create the accumulators
            grads = self.get_gradients(loss, params)
            self.accum_grads = [
                K.zeros(K.int_shape(p),
                        dtype=K.dtype(p),
                        name='accum_grad_%s' % i) for i, p in enumerate(params)
            ]

            old_update = K.update

            def new_update(x, new_x):
                new_x = cond * new_x + (1 - cond) * x
                return old_update(x, new_x)

            K.update = new_update
            updates = super(new_optimizer, self).get_updates(loss, params)
            K.update = old_update

            # Accumulate gradients
            with tf.control_dependencies(updates):
                accum_updates = [
                    K.update(ag, g + (1 - cond) * ag)
                    for g, ag in zip(grads, self.accum_grads)
                ]

            return accum_updates
Example #16
    def call(self, inputs):
        if not hasattr(self, 'kernel'):
            embedding_layer = search_layer(inputs, self.embedding_name)
            if embedding_layer is None:
                raise Exception('Embedding layer not found')

            self.kernel = K.transpose(embedding_layer.embeddings)
        self.units = K.int_shape(self.kernel)[1]
        if not hasattr(self, 'bias') and self.use_bias:
            self.bias = self.add_weight(name='bias',
                                        shape=(self.units, ),
                                        initializer='zeros')

        outputs = K.dot(inputs, self.kernel)
        if self.use_bias:
            outputs = K.bias_add(outputs, self.bias)
        outputs = self.activation(outputs)
        return outputs
Example #17
 def train_function(inputs):  # redefine the training function
     grads = embedding_gradients(inputs)[0]  # embedding gradients
     delta = epsilon * grads / (np.sqrt((grads**2).sum()) + 1e-8)  # compute the perturbation
     K.set_value(embeddings, K.eval(embeddings) + delta)  # inject the perturbation
     outputs = old_train_function(inputs)  # gradient descent step
     K.set_value(embeddings, K.eval(embeddings) - delta)  # remove the perturbation
     return outputs
Example #18
def adversarial_training(model, embedding_name, epsilon=1):
    """给模型添加对抗训练
    其中model是需要添加对抗训练的keras模型,embedding_name
    则是model里边Embedding层的名字。要在模型compile之后使用。
    """
    if model.train_function is None:  # 如果还没有训练函数
        model._make_train_function()  # 手动make
    old_train_function = model.train_function  # 备份旧的训练函数

    # Locate the Embedding layer
    for output in model.outputs:
        embedding_layer = search_layer(output, embedding_name)
        if embedding_layer is not None:
            break
    if embedding_layer is None:
        raise Exception('Embedding layer not found')

    # Compute the gradient w.r.t. the embeddings
    embeddings = embedding_layer.embeddings  # embedding matrix
    gradients = K.gradients(model.total_loss, [embeddings])  # embedding gradients
    gradients = K.zeros_like(embeddings) + gradients[0]  # convert to a dense tensor

    # Wrap as a function
    inputs = (model._feed_inputs + model._feed_targets +
              model._feed_sample_weights)  # all feed tensors
    embedding_gradients = K.function(
        inputs=inputs,
        outputs=[gradients],
        name='embedding_gradients',
    )  # wrapped as a function

    def train_function(inputs):  # redefine the training function
        grads = embedding_gradients(inputs)[0]  # embedding gradients
        delta = epsilon * grads / (np.sqrt((grads**2).sum()) + 1e-8)  # compute the perturbation
        K.set_value(embeddings, K.eval(embeddings) + delta)  # inject the perturbation
        outputs = old_train_function(inputs)  # gradient descent step
        K.set_value(embeddings, K.eval(embeddings) - delta)  # remove the perturbation
        return outputs

    model.train_function = train_function  # override the original training function
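A minimal usage sketch, assuming a compiled Keras model whose embedding layer is named 'Embedding-Token' (the layer name is an assumption; use whatever name your model assigns):

    # Hypothetical usage: enable FGM-style adversarial training after compile.
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam')
    adversarial_training(model, embedding_name='Embedding-Token', epsilon=0.5)
    model.fit(x_train, y_train, epochs=3)  # now trains on perturbed embeddings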
Example #19
 def compute_position_ids(self, inputs):
     q, v = inputs
     # Compute position offsets
     q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
     q_idxs = K.expand_dims(q_idxs, 1)
     v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
     v_idxs = K.expand_dims(v_idxs, 0)
     pos_ids = v_idxs - q_idxs
     # Post-processing: clip and shift into [0, input_dim)
     max_position = (self.input_dim - 1) // 2
     pos_ids = K.clip(pos_ids, -max_position, max_position)
     pos_ids = pos_ids + max_position
     return pos_ids
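A small NumPy sketch of the clip-and-shift, assuming input_dim = 5 (so max_position = 2):

    import numpy as np

    q_idxs = np.arange(3)[:, None]   # query positions, column vector
    v_idxs = np.arange(3)[None, :]   # value positions, row vector
    pos_ids = v_idxs - q_idxs        # offsets in [-2, 2]
    max_position = (5 - 1) // 2      # = 2
    pos_ids = np.clip(pos_ids, -max_position, max_position) + max_position
    # pos_ids now lies in [0, 4] and indexes a table of 5 relative embeddings.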
Example #20
 def __init__(self,
              learning_rate=0.001,
              beta_1=0.9,
              beta_2=0.999,
              epsilon=1e-6,
              bias_correction=True,
              name='Adam',
              **kwargs):
     kwargs['name'] = name
     super(Adam, self).__init__(**kwargs)
     self._set_hyper('learning_rate', learning_rate)
     self._set_hyper('beta_1', beta_1)
     self._set_hyper('beta_2', beta_2)
     self.epsilon = epsilon or K.epsilon()
     self.bias_correction = bias_correction
Example #21
 def set_model(self, model):
     """绑定模型,并初始化参数
     """
     super(ExponentialMovingAverage, self).set_model(model)
     self.ema_weights = [K.zeros(K.shape(w)) for w in model.weights]
     self.old_weights = K.batch_get_value(model.weights)
     K.batch_set_value(zip(self.ema_weights, self.old_weights))
     self.updates = []
     for w1, w2 in zip(self.ema_weights, model.weights):
         op = K.moving_average_update(w1, w2, self.momentum)
         self.updates.append(op)
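A minimal usage sketch, assuming the surrounding class is a Keras callback named ExponentialMovingAverage whose constructor takes the momentum used above (constructor details are an assumption):

    # Hypothetical usage: keep EMA shadow weights during training.
    ema = ExponentialMovingAverage(momentum=0.999)
    model.fit(x_train, y_train, epochs=3, callbacks=[ema])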
Example #22
        def get_updates(self, loss, params):
            updates = super(new_optimizer, self).get_updates(loss, params)

            k, alpha = self.steps_per_slow_update, self.slow_step_size
            cond = K.equal(self.iterations % k, 0)
            slow_vars = [
                K.zeros(K.int_shape(p),
                        dtype=K.dtype(p),
                        name='slow_var_%s' % i) for i, p in enumerate(params)
            ]

            with tf.control_dependencies(updates):
                slow_updates = [
                    K.update(q, K.switch(cond, q + alpha * (p - q), q))
                    for p, q in zip(params, slow_vars)
                ]
                with tf.control_dependencies(slow_updates):
                    copy_updates = [
                        K.update(p, K.switch(cond, q, p))
                        for p, q in zip(params, slow_vars)
                    ]

            return copy_updates
Example #23
 def dense_loss(self, y_true, y_pred):
     """y_true需要是one hot形式
     """
     # 导出mask并转换数据类型
     if self.input_mask is None:
         mask = None
     else:
         mask = K.cast(self.input_mask, K.floatx())
     # Compute the target path score
     target_score = self.target_score(y_true, y_pred, mask)
     # Recursively compute log Z
     init_states = [y_pred[:, 0]]
     if mask is None:
         mask = K.ones_like(y_pred[:, :, :1])
     else:
         mask = K.expand_dims(mask, 2)
     y_pred = K.concatenate([y_pred, mask])
     log_norm, _, _ = K.rnn(self.log_norm_step, y_pred[:, 1:],
                            init_states)  # log Z vector at the final step
     log_norm = tf.reduce_logsumexp(log_norm, 1)  # logsumexp down to a scalar
     # The loss is -log p
     return log_norm - target_score
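In formulas, with s(y, x) the target path score and log Z(x) the partition function computed by the recursion above, the returned loss is the negative log-likelihood:

    -\log p(y \mid x) = \log Z(x) - s(y, x)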
Example #24
 def call(self, inputs, mask=None, a_mask=None, p_bias=None):
     """实现多头注意力
     q_mask: 对输入的query序列的mask。
             主要是将输出结果的padding部分置0。
     v_mask: 对输入的value序列的mask。
             主要是防止attention读取到padding信息。
     a_mask: 对attention矩阵的mask。
             不同的attention mask对应不同的应用。
     p_bias: 在attention里的位置偏置。
             一般用来指定相对位置编码的种类。
     """
     q, k, v = inputs[:3]
     q_mask, v_mask, n = None, None, 3
     if mask is not None:
         if mask[0] is not None:
             q_mask = K.cast(mask[0], K.floatx())
         if mask[2] is not None:
             v_mask = K.cast(mask[2], K.floatx())
     if a_mask:
         if len(inputs) == 3:
             a_mask = 'history_only'
         else:
             a_mask = inputs[n]
             n += 1
     # Linear transforms
     qw = self.q_dense(q)
     kw = self.k_dense(k)
     vw = self.v_dense(v)
     # Reshape to (batch, seq_len, heads, size)
     qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
     kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
     vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
     # Attention
     a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
     # Handle position encodings
     if p_bias == 'typical_relative':
         pos_embeddings = inputs[n]
         a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
     elif p_bias == 't5_relative':
         pos_embeddings = K.permute_dimensions(inputs[n], (2, 0, 1))
         a = a + K.expand_dims(pos_embeddings, 0)
     # Attention (continued)
     if p_bias != 't5_relative':  # T5 does not scale the logits
         a = a / self.key_size**0.5
     a = sequence_masking(a, v_mask, 1, -1)
     if a_mask is not None:
         if is_string(a_mask):
             ones = K.ones_like(a[:1, :1])
             a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12
             a = a - a_mask
         else:
             a = a - (1 - a_mask) * 1e12
     a = K.softmax(a)
     # Assemble the output
     o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
     if p_bias == 'typical_relative':
         o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
     o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
     o = self.o_dense(o)
     # Return the result
     o = sequence_masking(o, q_mask, 0)
     return o
Example #25
 def dense_accuracy(self, y_true, y_pred):
     """训练过程中显示逐帧准确率的函数,排除了mask的影响
     此处y_true需要是one hot形式
     """
     y_true = K.argmax(y_true, 2)
     return self.sparse_accuracy(y_true, y_pred)
Example #26
 def basic_accuracy(self, y_true, y_pred, go_backwards=False):
     """训练过程中显示逐帧准确率的函数,排除了mask的影响
     此处y_true需要是整数形式(非one hot)
     """
     # 导出mask并转换数据类型
     if self.input_mask is None:
         mask = None
     else:
         mask = K.cast(self.input_mask, K.floatx())
     # Re-assert the shape and dtype of y_true
     y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
     y_true = K.cast(y_true, 'int32')
     # Sequence-reversal handling
     if self.hidden_dim is None:
         if go_backwards:  # whether to reverse the sequence
             y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask)
             trans = K.transpose(self.trans)
         else:
             trans = self.trans
         history = K.gather(trans, y_true)
     else:
         if go_backwards:  # whether to reverse the sequence
             y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask)
             r_trans, l_trans = self.l_trans, self.r_trans
         else:
             l_trans, r_trans = self.l_trans, self.r_trans
         history = K.gather(l_trans, y_true)
         history = tf.einsum('bnd,kd->bnk', history, r_trans)
     # Compute per-label accuracy
     history = K.concatenate([y_pred[:, :1], history[:, :-1]], 1)
     y_pred = (y_pred + history) / 2
     y_pred = K.cast(K.argmax(y_pred, 2), 'int32')
     isequal = K.cast(K.equal(y_true, y_pred), K.floatx())
     if mask is None:
         return K.mean(isequal)
     else:
         return K.sum(isequal * mask) / K.sum(mask)
Example #27
 def dense_loss(self, y_true, y_pred):
     """y_true需要是one hot形式
     """
     y_true = K.argmax(y_true, 2)
     return self.sparse_loss(y_true, y_pred)
Example #28
 def basic_loss(self, y_true, y_pred, go_backwards=False):
     """y_true需要是整数形式(非one hot)
     """
     # 导出mask并转换数据类型
     if self.input_mask is None:
         mask = None
     else:
         mask = K.cast(self.input_mask, K.floatx())
     # Re-assert the shape and dtype of y_true
     y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
     y_true = K.cast(y_true, 'int32')
     # Sequence-reversal handling
     if self.hidden_dim is None:
         if go_backwards:  # whether to reverse the sequence
             y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask)
             trans = K.transpose(self.trans)
         else:
             trans = self.trans
         history = K.gather(trans, y_true)
     else:
         if go_backwards:  # whether to reverse the sequence
             y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask)
             r_trans, l_trans = self.l_trans, self.r_trans
         else:
             l_trans, r_trans = self.l_trans, self.r_trans
         history = K.gather(l_trans, y_true)
         history = tf.einsum('bnd,kd->bnk', history, r_trans)
     # Compute the loss
     history = K.concatenate([y_pred[:, :1], history[:, :-1]], 1)
     y_pred = (y_pred + history) / 2
     loss = K.sparse_categorical_crossentropy(y_true,
                                              y_pred,
                                              from_logits=True)
     if mask is None:
         return K.mean(loss)
     else:
         return K.sum(loss * mask) / K.sum(mask)
Example #29
 def reverse_sequence(self, inputs, mask=None):
     if mask is None:
         return [x[:, ::-1] for x in inputs]
     else:
         length = K.cast(K.sum(mask, 1), 'int32')
         return [tf.reverse_sequence(x, length, seq_axis=1) for x in inputs]
Example #30
 def sparse_accuracy(self, y_true, y_pred):
     """训练过程中显示逐帧准确率的函数,排除了mask的影响
     此处y_true需要是整数形式(非one hot)
     """
     # 导出mask并转换数据类型
     if self.input_mask is None:
         mask = None
     else:
         mask = K.cast(self.input_mask, K.floatx())
     # Re-assert the shape and dtype of y_true
     y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
     y_true = K.cast(y_true, 'int32')
     # Take the per-label argmax as a rough estimate of training quality
     y_pred = K.cast(K.argmax(y_pred, 2), 'int32')
     isequal = K.cast(K.equal(y_true, y_pred), K.floatx())
     if mask is None:
         return K.mean(isequal)
     else:
         return K.sum(isequal * mask) / K.sum(mask)