Example #1
 def compute_position_ids(self, inputs):
     """T5的相对位置分桶(直接翻译自官方T5源码)
     i-i:   0 1 2 3 4 5 6 7 8 9 10 11 12 13 14...
     f(i-j):0 1 2 3 4 5 6 7 8 8 8  8  9   9  9 ...
     """
     q, v = inputs
     # Compute pairwise position differences
     q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
     q_idxs = K.expand_dims(q_idxs, 1)
     v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
     v_idxs = K.expand_dims(v_idxs, 0)
     pos_ids = v_idxs - q_idxs
     # Post-processing: map the differences to buckets
     num_buckets, max_distance = self.input_dim, self.max_distance
     ret = 0
     n = -pos_ids
     if self.bidirectional:
         num_buckets //= 2
         ret += K.cast(K.less(n, 0), 'int32') * num_buckets
         n = K.abs(n)
     else:
         n = K.maximum(n, 0)
     # now n is in the range [0, inf)
     max_exact = num_buckets // 2
     is_small = K.less(n, max_exact)
     val_if_large = max_exact + K.cast(
         K.log(K.cast(n, K.floatx()) / max_exact) /
         np.log(max_distance / max_exact) * (num_buckets - max_exact),
         'int32',
     )
     val_if_large = K.minimum(val_if_large, num_buckets - 1)
     ret += K.switch(is_small, n, val_if_large)
     return ret
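To sanity-check the bucketing table in the docstring, here is a standalone NumPy sketch of the same logic (num_buckets=32 and max_distance=128 are the usual T5 defaults; both are assumptions, not values taken from this snippet):

import numpy as np

def t5_bucket(n, num_buckets=32, max_distance=128, bidirectional=True):
    # Mirror of the bucketing above for a single signed distance n
    ret = 0
    if bidirectional:
        num_buckets //= 2
        ret += int(n < 0) * num_buckets
        n = abs(n)
    else:
        n = max(n, 0)
    max_exact = num_buckets // 2
    if n < max_exact:  # small distances get exact buckets
        return ret + n
    val = max_exact + int(
        np.log(n / max_exact) / np.log(max_distance / max_exact)
        * (num_buckets - max_exact)
    )
    return ret + min(val, num_buckets - 1)

print([t5_bucket(n) for n in range(15)])
# [0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 9, 9, 9]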
Example #2
    def _resource_apply(self, grad, var, indices=None):
        # Prepare variables
        var_dtype = var.dtype.base_dtype
        lr_t = self._decayed_lr(var_dtype)
        m = self.get_slot(var, 'm')
        v = self.get_slot(var, 'v')
        beta_1_t = self._get_hyper('beta_1', var_dtype)
        beta_2_t = self._get_hyper('beta_2', var_dtype)
        epsilon_t = K.cast(self.epsilon, var_dtype)
        local_step = K.cast(self.iterations + 1, var_dtype)
        beta_1_t_power = K.pow(beta_1_t, local_step)
        beta_2_t_power = K.pow(beta_2_t, local_step)

        # Moment updates; note the second moment tracks (grad - m_t)**2,
        # the deviation of the gradient from its running mean
        m_t = K.update(m, beta_1_t * m + (1 - beta_1_t) * grad)
        v_t = K.update(v, beta_2_t * v + (1 - beta_2_t) * (grad - m_t)**2)

        # Return the update op
        with tf.control_dependencies([m_t, v_t]):
            if self.bias_correct:
                m_t = m_t / (1.0 - beta_1_t_power)
                v_t = v_t / (1.0 - beta_2_t_power)
            var_t = var - lr_t * m_t / (K.sqrt(v_t) + epsilon_t)
            return K.update(var, var_t)
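As a quick numeric trace of one bias-corrected step (standalone NumPy with arbitrary numbers; not the optimizer's actual code path):

import numpy as np

beta_1, beta_2, lr, eps = 0.9, 0.999, 1e-3, 1e-7
grad, m, v, step = 0.5, 0.0, 0.0, 1

m = beta_1 * m + (1 - beta_1) * grad
v = beta_2 * v + (1 - beta_2) * (grad - m) ** 2
m_hat = m / (1 - beta_1 ** step)  # bias correction
v_hat = v / (1 - beta_2 ** step)
print(lr * m_hat / (np.sqrt(v_hat) + eps))  # first-step decrement, ~1.1e-3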
Example #3
def sparse_accuracy(y_true, y_pred):
    # Re-assert the shape and dtype of y_true
    y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
    y_true = K.cast(y_true, 'int32')
    # Compute accuracy
    y_pred = K.cast(K.argmax(y_pred, axis=2), 'int32')
    return K.mean(K.cast(K.equal(y_true, y_pred), K.floatx()))
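A minimal usage sketch (assuming TF2 with tf.keras and eager execution):

import numpy as np
from tensorflow.keras import backend as K

y_true = K.constant([[1, 2], [0, 2]])           # (batch, seq_len) integer labels
y_pred = K.constant(np.random.rand(2, 2, 3))    # (batch, seq_len, num_classes)
print(K.eval(sparse_accuracy(y_true, y_pred)))  # scalar in [0, 1]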
Example #4
 def get_label_mask(self, y_true):
     """获取batch内相同label样本"""
     label = K.cast(y_true, 'int32')
     label_2 = K.reshape(label, (1, -1))
     mask = K.equal(label_2, label)
     mask = K.cast(mask, K.floatx())
     mask = mask * (1 - K.eye(K.shape(y_true)[0]))  # zero out the diagonal, i.e. i == j
     return mask
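A standalone NumPy rendering of the same mask, for a batch of four samples with labels [0, 1, 0, 1]:

import numpy as np

y_true = np.array([[0], [1], [0], [1]])
mask = (y_true.reshape(1, -1) == y_true).astype(float)
mask *= 1 - np.eye(len(y_true))  # exclude i == j
print(mask)
# [[0. 0. 1. 0.]
#  [0. 0. 0. 1.]
#  [1. 0. 0. 0.]
#  [0. 1. 0. 0.]]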
Example #5
    def call(self, inputs, mask=None, a_mask=None, position_bias=None):
        """
        多头注意力
        :param inputs: [q, k, v, a_mask, position_bias]
        :param mask: [q_mask, v_mask],
            q_mask 对query序列进行mask,针对padding;v_mask对value序列进行mask,防止看到某些位置value,如padding
        :param a_mask: Boolean,是否对attention进行mask
        :param position_bias: type of position bias, 使用指定类型的位置编码对attention里的位置进行偏移
        :return:
        """
        q, k, v = inputs[:3]
        q_mask, v_mask, idx = None, None, 3
        if mask is not None:
            if mask[0] is not None:
                q_mask = K.cast(mask[0], K.floatx())
            if mask[2] is not None:
                v_mask = K.cast(mask[2], K.floatx())
        if a_mask is not None:
            a_mask = inputs[idx]
            idx += 1

        # Linear projections
        qw = self.q_dense(q)
        kw = self.k_dense(k)
        vw = self.v_dense(v)

        # Reshape to (batch, seq_len, heads, size_per_head)
        qw = K.reshape(qw, [-1, K.shape(q)[1], self.head_nums, self.key_size])
        kw = K.reshape(kw, [-1, K.shape(k)[1], self.head_nums, self.key_size])
        vw = K.reshape(vw, [-1, K.shape(v)[1], self.head_nums, self.head_size])
        # Attention scores
        att = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
        # Relative position bias, added to the attention logits
        if position_bias == 'relative':
            position_embeddings = inputs[idx]
            att = att + tf.einsum('bjhd,jkd->bhjk', qw, position_embeddings)

        if self.attention_scale:
            att = att / self.key_size**0.5

        # value mask
        att = sequence_masking(att, v_mask, 'add', -1)
        # attention mask
        if a_mask is not None:
            att = att - (1 - a_mask) * 1e12

        att = K.softmax(att)
        output = tf.einsum('bhjk,bkhd->bjhd', att, vw)
        # Relative position bias, applied to the output as well
        if position_bias == 'relative':
            output = output + tf.einsum('bhjk,jkd->bjhd', att,
                                        position_embeddings)
        output = K.reshape(output, (-1, K.shape(output)[1], self.output_dim))
        output = self.combine_dense(output)
        # query mask
        output = sequence_masking(output, q_mask, 'mul')
        return output
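For reference, a quick shape check of the two einsum contractions (standalone TF2 sketch with arbitrary dimensions):

import tensorflow as tf

qw = tf.random.normal([2, 5, 8, 16])  # (batch, q_len, heads, key_size)
kw = tf.random.normal([2, 7, 8, 16])  # (batch, k_len, heads, key_size)
vw = tf.random.normal([2, 7, 8, 32])  # (batch, k_len, heads, head_size)

att = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
print(att.shape)  # (2, 8, 5, 7): one q_len x k_len map per head
out = tf.einsum('bhjk,bkhd->bjhd', tf.nn.softmax(att), vw)
print(out.shape)  # (2, 5, 8, 32), reshaped to (batch, q_len, output_dim) above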
Example #6
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.learning_rate
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay *
                             K.cast(self.iterations, K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [
            K.zeros(K.int_shape(p), dtype=K.dtype(p), name='m_' + str(i))
            for (i, p) in enumerate(params)
        ]
        vs = [
            K.zeros(K.int_shape(p), dtype=K.dtype(p), name='v_' + str(i))
            for (i, p) in enumerate(params)
        ]

        if self.amsgrad:
            vhats = [
                K.zeros(K.int_shape(p),
                        dtype=K.dtype(p),
                        name='vhat_' + str(i)) for (i, p) in enumerate(params)
            ]
        else:
            vhats = [
                K.zeros(1, name='vhat_' + str(i)) for i in range(len(params))
            ]
        self.weights = [self.iterations] + ms + vs + vhats

        for p, g, m, v, vhat in zip(params, grads, ms, vs, vhats):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g - m_t)
            if self.amsgrad:
                vhat_t = K.maximum(vhat, v_t)
                p_t = p - lr_t * m_t / (K.sqrt(vhat_t) + self.epsilon)
                self.updates.append(K.update(vhat, vhat_t))
            else:
                p_t = p - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates
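The lr_t expression folds Adam's bias correction into the learning rate; a standalone NumPy check that this matches the explicit m_hat/v_hat form on the first step:

import numpy as np

beta_1, beta_2, lr, eps, t, g = 0.9, 0.999, 1e-3, 1e-8, 1, 0.3
m = (1 - beta_1) * g
v = (1 - beta_2) * g ** 2

# Explicit bias correction
step_a = lr * (m / (1 - beta_1 ** t)) / (np.sqrt(v / (1 - beta_2 ** t)) + eps)
# Correction folded into lr_t, as in get_updates above
lr_t = lr * np.sqrt(1 - beta_2 ** t) / (1 - beta_1 ** t)
step_b = lr_t * m / (np.sqrt(v) + eps)
print(np.isclose(step_a, step_b, rtol=1e-4))  # True, up to epsilon placement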
Example #7
 def sparse_accuracy(self, y_true, y_pred):
     """训练过程中显示逐帧准确率的函数,排除了mask的影响
     此处y_true需要是整数形式(非one hot)
     """
     # Recover the mask (masked positions have all logits pushed below -1e6) and cast it
     mask = K.all(K.greater(y_pred, -1e6), axis=2)
     mask = K.cast(mask, K.floatx())
     # Re-assert the shape and dtype of y_true
     y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
     y_true = K.cast(y_true, 'int32')
     # Take the per-position argmax as a rough gauge of training progress
     y_pred = K.cast(K.argmax(y_pred, 2), 'int32')
     isequal = K.cast(K.equal(y_true, y_pred), K.floatx())
     return K.sum(isequal * mask) / K.sum(mask)
Example #8
        def get_updates(self, loss, params):
            # Whether this step applies an update (1.0) or only accumulates (0.0)
            cond = K.equal(self.iterations % self.grad_accum_steps, 0)
            cond = K.cast(cond, K.floatx())
            # Fetch gradients and create accumulators
            grads = self.get_gradients(loss, params)
            self.accum_grads = [
                K.zeros(shape=K.int_shape(p),
                        dtype=K.dtype(p),
                        name='accum_grad_{}'.format(i))
                for i, p in enumerate(params)
            ]

            old_update = K.update

            def new_update(x, new_x):
                # Gate every K.update: apply new_x only on boundary steps,
                # otherwise keep the old value unchanged
                new_x = cond * new_x + (1 - cond) * x
                return old_update(x, new_x)

            K.update = new_update
            updates = super(NewOptimizer, self).get_updates(loss, params)
            K.update = old_update

            # Accumulate gradients (reset on boundary steps)
            with tf.control_dependencies(updates):
                acc_updates = [
                    K.update(ag, g + (1 - cond) * ag)
                    for ag, g in zip(self.accum_grads, grads)
                ]

            return acc_updates
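A standalone NumPy trace of the intended gating behavior (in the full optimizer the gradients fed to the update are the accumulated ones; grad_accum_steps=2 and the gradients are arbitrary):

import numpy as np

grad_accum_steps, ag, param, lr = 2, 0.0, 1.0, 0.01
for step, g in enumerate([0.1, 0.2, 0.3, 0.4]):
    cond = 1.0 if step % grad_accum_steps == 0 else 0.0
    # On boundary steps the gated update fires with the accumulated gradient
    param = cond * (param - lr * ag) + (1 - cond) * param
    ag = g + (1 - cond) * ag  # reset-and-add on boundaries, else accumulate
    print(step, round(param, 5), round(ag, 2))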
Example #9
    def _resource_apply(self, grad, var, indices=None):
        var_dtype = var.dtype.base_dtype
        lr_t = self._decayed_lr(var_dtype)
        m = self.get_slot(var, 'm')
        v = self.get_slot(var, 'v')
        beta_1_t = self._get_hyper('beta_1', var_dtype)
        beta_2_t = self._get_hyper('beta_2', var_dtype)
        local_step = K.cast(self.iterations + 1, var_dtype)
        beta_1_power = K.pow(beta_1_t, local_step)
        beta_2_power = K.pow(beta_2_t, local_step)

        # Moment updates: dense path vs. sparse (indexed) path
        if indices is None:
            m_t = K.update(m, beta_1_t * m + (1 - beta_1_t) * grad)
            v_t = K.update(v, beta_2_t * v + (1 - beta_2_t) * grad**2)
        else:
            mv_ops = [K.update(m, beta_1_t * m), K.update(v, beta_2_t * v)]
            with tf.control_dependencies(mv_ops):
                m_t = self._resource_scatter_add(m, indices,
                                                 (1 - beta_1_t) * grad)
                v_t = self._resource_scatter_add(v, indices,
                                                 (1 - beta_2_t) * grad**2)

        # Return the update op
        with tf.control_dependencies([m_t, v_t]):
            if self.bias_correct:
                m_t = m_t / (1 - beta_1_power)
                v_t = v_t / (1 - beta_2_power)
            var_t = var - lr_t * m_t / (K.sqrt(v_t) + self.epsilon)
            return K.update(var, var_t)
Example #10
 def get_labels_of_similarity(self, y_pred):
     idxs = K.arange(0, K.shape(y_pred)[0])
     idxs_1 = idxs[None, :]
     idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]
     labels = K.equal(idxs_1, idxs_2)
     labels = K.cast(labels, K.floatx())
     return labels
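With samples arranged in consecutive pairs (0 with 1, 2 with 3, ...), the index arithmetic maps each row to its partner; a standalone NumPy illustration for a batch of four:

import numpy as np

idxs = np.arange(4)
idxs_1 = idxs[None, :]                       # column indices
idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]  # each row's partner: 1, 0, 3, 2
print((idxs_1 == idxs_2).astype(float))
# [[0. 1. 0. 0.]
#  [1. 0. 0. 0.]
#  [0. 0. 0. 1.]
#  [0. 0. 1. 0.]]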
Example #11
        def parse_func(serialized_record):
            feature_description = {
                'token_ids': tf.io.FixedLenFeature([seq_length], tf.int64),
                'mask_ids': tf.io.FixedLenFeature([seq_length], tf.int64)
            }
            features = tf.io.parse_single_example(serialized_record,
                                                  feature_description)
            token_ids = features['token_ids']
            mask_ids = features['mask_ids']
            segment_ids = K.zeros_like(token_ids, dtype='int64')
            is_masked = K.not_equal(mask_ids, 0)
            masked_token_ids = K.switch(is_masked, mask_ids - 1,
                                        token_ids)  # ids were shifted up by one to make room for unmask_id; subtract 1 to restore them

            x = {
                'Input-Token': masked_token_ids,
                'Input-Segment': segment_ids,
                'token_ids': token_ids,
                'is_masked': K.cast(is_masked, K.floatx())
            }
            y = {
                'mlm_loss': K.zeros_like([1], tf.float32),
                'mlm_acc': K.zeros_like([1], tf.float32)
            }
            return x, y
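A typical way to consume parse_func in an input pipeline (sketch; the TFRecord path and batch size are hypothetical):

import tensorflow as tf

dataset = (
    tf.data.TFRecordDataset('pretrain_data.tfrecord')  # hypothetical path
    .map(parse_func, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(32)
    .prefetch(tf.data.experimental.AUTOTUNE)
)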
Example #12
    def call(self, inputs):
        #     PE_2i(p) = sin(p/10000^(2i/d_pos))
        #     PE_2i+1(p) = cos(p/10000^(2i/d_pos))
        batch_size, seq_len, word_emb_dim = K.shape(inputs)[0], K.shape(
            inputs)[1], K.shape(inputs)[2]
        if not self.embedding_dim or self.method == 'add':
            self.embedding_dim = word_emb_dim
        t = 2 * K.arange(self.embedding_dim / 2, dtype='float32') / K.cast(
            self.embedding_dim, dtype='float32')
        embedding_wise_pos = 1. / K.pow(
            10000., t)  # 1/10000 ^(2i/d_pos) , shape = (p_dim/2, )
        embedding_wise_pos = K.expand_dims(embedding_wise_pos,
                                           0)  # (1, p_dim/2)
        word_wise_pos = K.cumsum(K.ones_like(inputs[:, :, 0]),
                                 axis=1)  # shape = [batch_size, seq_len]
        word_wise_pos = K.expand_dims(word_wise_pos,
                                      2)  # (batch_size, seq_len, 1)
        position_embedding = K.dot(
            word_wise_pos,
            embedding_wise_pos)  # (batch_size, seq_len, p_dim/2)

        position_embedding = K.expand_dims(position_embedding, 3)
        position_embedding = K.reshape(K.concatenate(
            [K.sin(position_embedding),
             K.cos(position_embedding)], axis=-1),
                                       shape=(batch_size, seq_len, -1))

        if self.method == 'add':
            return inputs + position_embedding

        return K.concatenate([inputs, position_embedding], axis=-1)
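A standalone NumPy version of the same sinusoidal table makes the sin/cos interleaving concrete (positions start at 1 because of the cumsum over ones):

import numpy as np

seq_len, emb_dim = 4, 6
pos = np.arange(1, seq_len + 1)[:, None]          # cumsum of ones -> 1..seq_len
inv_freq = 1.0 / 10000 ** (2 * np.arange(emb_dim // 2) / emb_dim)
angles = pos * inv_freq[None, :]                  # (seq_len, emb_dim/2)
pe = np.stack([np.sin(angles), np.cos(angles)], -1).reshape(seq_len, emb_dim)
print(pe.shape)  # (4, 6): sin/cos pairs interleaved per frequency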
Example #13
def sparse_categorical_crossentropy(y_true, y_pred):
    # Re-assert the shape and dtype of y_true
    y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
    y_true = K.cast(y_true, 'int32')
    y_true = K.one_hot(y_true, K.shape(y_pred)[2])
    # Compute the cross-entropy
    return K.mean(K.categorical_crossentropy(y_true, y_pred))
Example #14
 def get_labels_of_similarity(self, inputs):
     idx = K.arange(0, K.shape(inputs)[0])
     idx_1 = idx[None, :]
     idx_2 = (idx + 1 - idx % 2 * 2)[:, None]
     labels = K.equal(idx_1, idx_2)
     labels = K.cast(labels, K.floatx())
     return labels
Example #15
def mlm_acc(inputs):
    """计算准确率的函数,需要封装为一个层
    """
    y_true, y_pred, mask = inputs
    y_true = K.cast(y_true, K.floatx())
    acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
    acc = K.sum(acc * mask) / (K.sum(mask) + K.epsilon())
    return acc
Example #16
 def nsp_acc(inputs):
     """计算准确率的函数,需要封装为一个层
     """
     y_true, y_pred = inputs
     y_pred, _ = y_pred
     y_true = K.cast(y_true, K.floatx())
     acc = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
     acc = K.mean(acc)
     return acc
Example #17
 def sparse_loss(self, y_true, y_pred):
     """y_true需要是整数形式(非one hot)
     """
     # Re-assert the shape and dtype of y_true
     y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
     y_true = K.cast(y_true, 'int32')
     # Convert to one-hot
     y_true = K.one_hot(y_true, K.shape(self.trans)[0])
     return self.dense_loss(y_true, y_pred)
Example #18
 def compute_loss(self, inputs, mask=None):
     y_true, y_pred = inputs
     y_mask = K.cast(K.not_equal(y_true, 0), K.floatx())
     accuracy = keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
     accuracy = K.sum(accuracy * y_mask) / K.sum(y_mask)
     self.add_metric(accuracy, name='accuracy')
     loss = K.sparse_categorical_crossentropy(y_true, y_pred)
     loss = K.sum(loss * y_mask) / K.sum(y_mask)
     return loss
Example #19
 def call(self, x, mask=None):
     x0 = x
     x = self.k_dense(x0)
     x = self.o_dense(x)
     if mask is not None:
         mask = K.cast(mask, K.floatx())
         mask = K.expand_dims(mask, 2)
         x = x - (1 - mask) * 1e12  # push padded positions to -inf before softmax
     x = K.softmax(x, 1)  # attention weights over the sequence axis
     x = K.sum(x0 * x, 1)  # weighted pooling of the inputs
     return x
Example #20
    def compute_loss(self, inputs, mask=None):
        pred, ytrue = inputs
        acc = keras.metrics.sparse_categorical_accuracy(ytrue, pred)
        self.add_metric(acc, name='clf_acc')

        ytrue = K.cast(ytrue, 'int32')
        ytrue = K.one_hot(ytrue, num_classes=num_classes)
        ytrue = K.reshape(ytrue, (-1, num_classes))
        loss = ytrue * K.log(pred + K.epsilon()) \
               + (1 - ytrue) * K.log(1 - pred + K.epsilon())
        loss = -K.mean(loss)
        loss = loss * self.alpha
        self.add_metric(loss, name='clf_loss')

        return loss
Example #21
    def call(self, x, mask=None):
        x0 = x
        if mask is not None:
            mask = K.cast(mask, K.floatx())
            mask = K.expand_dims(mask, 2)
        # x = x0 * mask if mask is not None else x0
        x0 = Lambda(lambda x_: x_, output_shape=lambda s: s)(x0)  # drop the mask so it is not propagated into conv1d
        x = self.conv1d(x0)
        x, g = x[:, :, :self.o_dim], x[:, :, self.o_dim:]
        if self.dropout_rate is not None:
            g = K.in_train_phase(K.dropout(g, self.dropout_rate), g)
        g = K.sigmoid(g)
        # default to an all-ones mask when none is given
        mask = mask if mask is not None else K.ones_like(x)

        if self.skip_connection:
            if K.int_shape(x0)[-1] != self.o_dim:
                x0 = self.conv1d_1x1(x0)
            return (x0 * (1 - g) + x * g) * mask
        return x * g * mask
Example #22
 def dense_loss(self, y_true, y_pred):
     """y_true需要是one hot形式
     """
     # Recover the mask (masked positions have all logits pushed below -1e6) and cast it
     mask = K.all(K.greater(y_pred, -1e6), axis=2, keepdims=True)
     mask = K.cast(mask, K.floatx())
     # Score of the target path
     y_true, y_pred = y_true * mask, y_pred * mask
     target_score = self.path_score(y_pred, y_true)
     # Recursively compute log Z (the partition function)
     init_states = [y_pred[:, 0]]
     y_pred = K.concatenate([y_pred, mask], axis=2)
     input_length = K.int_shape(y_pred[:, 1:])[1]
     log_norm, _, _ = K.rnn(self.log_norm_step,
                            y_pred[:, 1:],
                            init_states,
                            input_length=input_length)  # log Z vector at the final step
     log_norm = K.logsumexp(log_norm, 1)  # logsumexp down to a scalar
     # Loss: -log p = log Z - target path score
     return log_norm - target_score
Example #23
    def call(self, inputs, **kwargs):
        logits, token_seq = inputs[:2]
        seq_shape = K.shape(token_seq)
        batch_size, seq_length = seq_shape[0], seq_shape[1]
        if self.pad_token_id is None:
            # tf.fill handles a dynamic batch size; a Python list would not
            sequence_lengths = tf.fill([batch_size], seq_length - 1)
        else:
            sequence_lengths = K.sum(
                K.cast(
                    K.not_equal(token_seq, self.pad_token_id),
                    dtype='int32',
                ),
                -1,
                keepdims=False,
            ) - 1
        # TF2-only alternative:
        # return tf.gather(logits, sequence_lengths, batch_dims=1, axis=1)

        indices = K.expand_dims(sequence_lengths, -1)
        return tf.gather_nd(logits, indices, batch_dims=1)
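A compact check of the gather step (standalone TF2 sketch): it picks each sequence's last non-padding logit.

import tensorflow as tf

logits = tf.reshape(tf.range(24, dtype=tf.float32), [2, 4, 3])  # (batch, seq, labels)
token_seq = tf.constant([[5, 6, 0, 0], [7, 8, 9, 0]])           # 0 = padding
lengths = tf.reduce_sum(tf.cast(tf.not_equal(token_seq, 0), tf.int32), -1) - 1
picked = tf.gather_nd(logits, tf.expand_dims(lengths, -1), batch_dims=1)
print(picked.shape)  # (2, 3): the logits at positions 1 and 2 respectively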
Example #24
 def call(self, inputs, mask=None):
     if mask is not None:
         mask = K.cast(mask, K.floatx())
         mask = K.expand_dims(mask, 2)
         inputs = inputs - (1.0 - mask) * 1e12
     return K.softmax(inputs, 1)
Example #25
def normal_shannon_entropy(p, labels_num=num_classes):
    # Entropy normalized by log(1/labels_num), so the result lies in [0, 1]
    p = K.cast(p, K.floatx())
    norm = K.log(1. / labels_num)
    s = K.sum(p * K.log(p + K.epsilon()), axis=-1, keepdims=True)  # epsilon guards log(0)
    return s / norm
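Sanity check (standalone NumPy): a uniform distribution has normalized entropy close to 1, a one-hot distribution close to 0.

import numpy as np

def norm_entropy(p, eps=1e-9):
    return np.sum(p * np.log(p + eps)) / np.log(1 / len(p))

print(norm_entropy(np.ones(4) / 4))              # ~1.0
print(norm_entropy(np.array([1., 0., 0., 0.])))  # ~0.0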
Example #26
 def compute_classification_acc(self, inputs, mask=None):
     _, _, y_pred, _, y_true = inputs
     equal = K.equal(K.cast(K.argmax(y_pred, axis=-1), 'int32'),
                     K.cast(y_true, 'int32'))
     # Sum correct predictions before dividing, so a scalar accuracy is returned
     return K.sum(K.cast(equal, K.floatx())) / K.cast(
         K.shape(y_true)[0], K.floatx())
Example #27
        def _decayed_lr(self, var_dtypes):
            """重写获取decayed learning rate 方法"""

            lr_t = super(NewOptimzer, self)._decayed_lr(var_dtypes)
            lr_rate = piecewise_linear(self.iterations, self.lr_schedule)
            return lr_t * K.cast(lr_rate, var_dtypes)
Example #28
    def call(self, inputs, mask=None):
        # Only computes the loss; the inputs themselves are not modified
        if mask is not None:
            mask = K.cast(mask, K.floatx())

        return sequence_masking(inputs, mask, 1, 1)