Example #1
 def compute_position_ids(self, inputs):
     """T5的相对位置分桶(直接翻译自官方T5源码)
     对所有模型使用 32 个嵌入,其数值范围的大小以对数方式增加,最大偏移量为128,超过此偏移量,所有相对位置使用同一嵌入。
     需要注意的是,某一给定层对超过 128 的相对位置不敏感,但是后续层可以通过组合来自先前层的局部信息来建立对更大偏移的敏感性。
     """
     q, v = inputs
     # compute the position differences
     q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
     q_idxs = K.expand_dims(q_idxs, 1)
     v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
     v_idxs = K.expand_dims(v_idxs, 0)
     pos_ids = v_idxs - q_idxs
     # post-processing: map differences to buckets
     num_buckets, max_distance = self.input_dim, self.max_distance
     ret = 0
     n = -pos_ids
     if self.bidirectional:
         num_buckets //= 2
         ret += K.cast(K.less(n, 0), 'int32') * num_buckets
         n = K.abs(n)
     else:
         n = K.maximum(n, 0)
     # now n is in the range [0, inf)
     max_exact = num_buckets // 2
     is_small = K.less(n, max_exact)
     val_if_large = max_exact + K.cast(
         K.log(K.cast(n, K.floatx()) / max_exact) /
         np.log(max_distance / max_exact) * (num_buckets - max_exact),
         'int32',
     )
     val_if_large = K.minimum(val_if_large, num_buckets - 1)
     ret += K.switch(is_small, n, val_if_large)
     return ret
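For reference, here is a minimal standalone NumPy sketch of the same bucketing rule (num_buckets=32, max_distance=128 and the bidirectional split mirror the defaults described in the docstring above), which can be run outside of a Keras graph to inspect the bucket assignments:

import numpy as np

def t5_relative_bucket(pos_ids, num_buckets=32, max_distance=128, bidirectional=True):
    # pos_ids: integer array of (key_position - query_position)
    ret = 0
    n = -pos_ids
    if bidirectional:
        num_buckets //= 2
        ret += (n < 0).astype('int32') * num_buckets
        n = np.abs(n)
    else:
        n = np.maximum(n, 0)
    max_exact = num_buckets // 2
    is_small = n < max_exact
    # np.maximum(n, 1) only avoids log(0); those entries are discarded by np.where below
    val_if_large = max_exact + (
        np.log(np.maximum(n, 1) / max_exact)
        / np.log(max_distance / max_exact) * (num_buckets - max_exact)
    ).astype('int32')
    val_if_large = np.minimum(val_if_large, num_buckets - 1)
    return ret + np.where(is_small, n, val_if_large)

# e.g. t5_relative_bucket(np.arange(-200, 201)) assigns exact buckets to small offsets
# and logarithmically spaced, shared buckets to larger ones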
Example #2
 def compute_position_ids(self, inputs):
     """T5的相对位置分桶(直接翻译自官方T5源码)
     """
     q, v = inputs
     # compute the position differences
     q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
     q_idxs = K.expand_dims(q_idxs, 1)
     v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
     v_idxs = K.expand_dims(v_idxs, 0)
     pos_ids = v_idxs - q_idxs
     # post-processing: map differences to buckets
     num_buckets, max_distance = self.input_dim, self.max_distance
     ret = 0
     n = -pos_ids
     if self.bidirectional:
         num_buckets //= 2
         ret += K.cast(K.less(n, 0), 'int32') * num_buckets
         n = K.abs(n)
     else:
         n = K.maximum(n, 0)
     # now n is in the range [0, inf)
     max_exact = num_buckets // 2
     is_small = K.less(n, max_exact)
     val_if_large = max_exact + K.cast(
         K.log(K.cast(n, K.floatx()) / max_exact) /
         np.log(max_distance / max_exact) * (num_buckets - max_exact),
         'int32',
     )
     val_if_large = K.minimum(val_if_large, num_buckets - 1)
     ret += K.switch(is_small, n, val_if_large)
     return ret
Example #3
    def call(self, inputs):
        """如果custom_position_ids,那么第二个输入为自定义的位置id
        """
        input_shape = K.shape(inputs)
        batch_size, seq_len = input_shape[0], input_shape[1]

        if self.custom_position_ids:
            inputs, position_ids = inputs
        else:
            position_ids = K.arange(0, seq_len, dtype=K.floatx())[None]

        indices = K.arange(0, self.output_dim // 2, dtype=K.floatx())
        indices = K.pow(10000.0, -2 * indices / self.output_dim)
        pos_embeddings = tf.einsum('bn,d->bnd', position_ids, indices)
        pos_embeddings = K.concatenate([
            K.sin(pos_embeddings)[..., None],
            K.cos(pos_embeddings)[..., None]
        ])
        pos_embeddings = K.reshape(
            pos_embeddings, (-1, seq_len, self.output_dim)
        )

        if self.merge_mode == 'add':
            return inputs + pos_embeddings
        elif self.merge_mode == 'mul':
            return inputs * pos_embeddings
        else:
            if not self.custom_position_ids:
                pos_embeddings = K.tile(pos_embeddings, [batch_size, 1, 1])
            return K.concatenate([inputs, pos_embeddings])
Example #4
    def call(self, inputs):
        """如果custom_position_ids,那么第二个输入为自定义的位置id
        """
        if self.custom_position_ids:
            seq_len = K.shape(inputs)[1]
            inputs, position_ids = inputs
            if 'float' not in K.dtype(position_ids):
                position_ids = K.cast(position_ids, K.floatx())
        else:
            input_shape = K.shape(inputs)
            batch_size, seq_len = input_shape[0], input_shape[1]
            position_ids = K.arange(0, seq_len, dtype=K.floatx())[None]

        indices = K.arange(0, self.output_dim // 2, dtype=K.floatx())
        indices = K.pow(10000.0, -2 * indices / self.output_dim)
        embeddings = tf.einsum('bn,d->bnd', position_ids, indices)
        embeddings = K.stack([K.sin(embeddings), K.cos(embeddings)], axis=-1)
        embeddings = K.reshape(embeddings, (-1, seq_len, self.output_dim))

        if self.merge_mode == 'add':
            return inputs + embeddings
        elif self.merge_mode == 'mul':
            return inputs * (embeddings + 1.0)
        else:
            if not self.custom_position_ids:
                embeddings = K.tile(embeddings, [batch_size, 1, 1])
            return K.concatenate([inputs, embeddings])
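As a cross-check, the sinusoidal table built above can be reproduced with plain NumPy; the following is a small sketch (seq_len and output_dim are arbitrary toy values, output_dim assumed even) showing how position ids and the inverse-frequency indices combine into interleaved sin/cos features:

import numpy as np

def sinusoidal_embeddings(seq_len, output_dim):
    position_ids = np.arange(seq_len, dtype='float32')[None]       # (1, seq_len)
    indices = np.arange(output_dim // 2, dtype='float32')
    indices = np.power(10000.0, -2 * indices / output_dim)         # (output_dim // 2,)
    angles = np.einsum('bn,d->bnd', position_ids, indices)         # (1, seq_len, output_dim // 2)
    # interleave sin and cos along the last axis, as K.stack(..., axis=-1) + reshape does above
    emb = np.stack([np.sin(angles), np.cos(angles)], axis=-1)
    return emb.reshape(1, seq_len, output_dim)

# sinusoidal_embeddings(128, 64).shape == (1, 128, 64)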
Example #5
 def compute_position_ids(self, inputs):
     q, v = inputs
     # compute the position differences
     q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
     q_idxs = K.expand_dims(q_idxs, 1)
     v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
     v_idxs = K.expand_dims(v_idxs, 0)
     pos_ids = v_idxs - q_idxs
     # post-processing: clip to [-max_position, max_position] and shift into [0, input_dim - 1]
     max_position = (self.input_dim - 1) // 2
     pos_ids = K.clip(pos_ids, -max_position, max_position)
     pos_ids = pos_ids + max_position
     return pos_ids
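This variant simply clips relative positions to a fixed window instead of bucketing them logarithmically; a short NumPy sketch of the same mapping (the odd table size input_dim=129 is an assumed toy value):

import numpy as np

def clipped_relative_ids(q_len, v_len, input_dim=129):
    pos_ids = np.arange(v_len)[None, :] - np.arange(q_len)[:, None]   # (q_len, v_len)
    max_position = (input_dim - 1) // 2
    # clip to [-max_position, max_position], then shift into [0, input_dim - 1]
    return np.clip(pos_ids, -max_position, max_position) + max_position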
Example #6
    def call(self, inputs):
        """如果custom_position_ids,那么第二个输入为自定义的位置id
        """
        input_shape = K.shape(inputs)
        batch_size, seq_len = input_shape[0], input_shape[1]

        if self.custom_position_ids:
            inputs, position_ids = inputs
            if K.dtype(position_ids) != 'int32':
                position_ids = K.cast(position_ids, 'int32')
        else:
            position_ids = K.arange(0, seq_len, dtype='int32')[None]

        if self.hierarchical:
            alpha = 0.4 if self.hierarchical is True else self.hierarchical
            embeddings = self.embeddings - alpha * self.embeddings[:1]
            embeddings = embeddings / (1 - alpha)
            embeddings_x = K.gather(embeddings, position_ids // self.input_dim)
            embeddings_y = K.gather(embeddings, position_ids % self.input_dim)
            pos_embeddings = alpha * embeddings_x + (1 - alpha) * embeddings_y
        else:
            if self.custom_position_ids:
                pos_embeddings = K.gather(self.embeddings, position_ids)
            else:
                pos_embeddings = self.embeddings[None, :seq_len]

        if self.merge_mode == 'add':
            return inputs + pos_embeddings
        elif self.merge_mode == 'mul':
            return inputs * pos_embeddings
        else:
            if not self.custom_position_ids:
                pos_embeddings = K.tile(pos_embeddings, [batch_size, 1, 1])
            return K.concatenate([inputs, pos_embeddings])
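The hierarchical branch extends a short learned table to positions beyond input_dim by decomposing each position into a quotient and a remainder part; a rough NumPy sketch of that combination (the random table and the toy sizes are assumptions, alpha=0.4 is the default from the code above):

import numpy as np

input_dim, output_dim, alpha = 512, 8, 0.4
embeddings = np.random.randn(input_dim, output_dim)       # stand-in for the learned table
position_ids = np.arange(4 * input_dim)                   # positions beyond the table size

base = (embeddings - alpha * embeddings[:1]) / (1 - alpha)
emb_x = base[position_ids // input_dim]                   # coarse (quotient) component
emb_y = base[position_ids % input_dim]                    # fine (remainder) component
pos_embeddings = alpha * emb_x + (1 - alpha) * emb_y      # (4 * input_dim, output_dim)
# for positions < input_dim this reduces exactly to the original table rows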
Example #7
File: simbert.py  Project: yyht/simbert
 def get_labels_of_similarity(self, y_pred):
     idxs = K.arange(0, K.shape(y_pred)[0])
     idxs_1 = idxs[None, :]
     idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]
     labels = K.equal(idxs_1, idxs_2)
     labels = K.cast(labels, K.floatx())
     return labels
Example #8
def batch_gather(params, indices):
    """params.shape=[b, n, d],indices.shape=[b]
    从params的第i个序列中选出第indices[i]个向量,返回shape=[b, d]。
    """
    indices = K.cast(indices, 'int32')
    batch_idxs = K.arange(0, K.shape(indices)[0])
    indices = K.stack([batch_idxs, indices], 1)
    return tf.gather_nd(params, indices)
Example #9
 def get_labels_of_similarity(self, y_pred):
     idxs = K.arange(0, K.shape(y_pred)[0])  # value=[0, ..., batch-1]
     idxs_1 = idxs[None, :]  # shape=(1, batch)
     idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]  # shape=(batch, 1)
     labels = K.equal(idxs_1,
                      idxs_2)  # eg: batch=2 [[False, True], [True, False]]
     labels = K.cast(labels, K.floatx())
     return labels
Example #10
 def get_labels_of_similarity(self, y_pred):
     idxs = K.arange(0, K.shape(y_pred)[0])  # 0 to batch_size - 1, shape (btz,)
     idxs_1 = idxs[None, :]  # (1, btz)
     idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]  # (btz, 1)
     labels = K.equal(idxs_1, idxs_2)  # (btz, btz); adjacent samples in the batch are the similar pairs (the generator emits each pair in swapped order)
     '''
     So within the batch, row [0] is True at position 1, row [1] at position 0, row [2] at position 3, row [3] at position 2, and so on.
     '''
     labels = K.cast(labels, K.floatx())  # convert True/False to 1/0
     '''
     [
         [0, 1, 0, 0, 0, 0],
         [1, 0, 0, 0, 0, 0],
         ...
         [0, 0, 0, 0, 0, 1],
         [0, 0, 0, 0, 1, 0]]
     '''
     return labels  # (btz, btz)
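To make the label construction concrete, here is a tiny NumPy sketch with an assumed batch of 4 (samples arranged in similar pairs, as the comments above describe):

import numpy as np

batch = 4
idxs = np.arange(batch)                       # [0, 1, 2, 3]
idxs_1 = idxs[None, :]                        # (1, batch)
idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]   # partner index of each sample: [1, 0, 3, 2]
labels = (idxs_1 == idxs_2).astype('float32')
# labels ==
# [[0. 1. 0. 0.]
#  [1. 0. 0. 0.]
#  [0. 0. 0. 1.]
#  [0. 0. 1. 0.]]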
Example #11
def seq_gather(x: list):
    """
    传入从传入的列表x中获取句子张量seq和下标idxs
    seq是[batch_size, seq_len, vector_size]的形状,
    idxs是[batch_size, 1]的形状
    在seq的第i个序列中选出第idxs[i]个向量,
    最终输出[batch_size, s_size]的向量。
    :param x: [seq, idxs] seq 原始序列的张量,idxs需要拆分的向量下标
    :return: 收集出来的字向量
    """
    # 获取句子张量以及字下标张量 idx = [[4],[9],[8],[11],[23],[45],[60],[30]]
    seq, idxs = x
    # 将下标数据类型转化为整型
    idxs = K.cast(idxs, 'int32')
    # 使用keras方法构造0-batch_size的张量[0,1,2,3,4,5,6,7]
    batch_idxs = K.arange(0, K.shape(seq)[0])
    # 在batch_idxs中扩充维度1,为的是与idx进行拼接后到seq中取切分向量[[0],[1],[2],[3],[4],[5],[6],[7]]
    batch_idxs = K.expand_dims(batch_idxs, 1)
    # 拼接idxs与batch_idx [[0,4],[1,9],[2,8],[3,11],[4,23],[5,45],[6,60],[7,30]]
    idxs = K.concatenate([batch_idxs, idxs], 1)
    # 对应idxs下标将seq中对应位置的向量收集出来
    return tf.gather_nd(seq, idxs)
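The same gather can be expressed with plain NumPy fancy indexing, which is handy for checking the expected output; a minimal sketch with assumed toy shapes:

import numpy as np

batch_size, seq_len, vector_size = 3, 5, 2
seq = np.random.randn(batch_size, seq_len, vector_size)
idxs = np.array([[4], [0], [2]])                    # one index per sequence, shape (batch_size, 1)

gathered = seq[np.arange(batch_size), idxs[:, 0]]   # (batch_size, vector_size)
# equivalent to tf.gather_nd(seq, concatenate([batch_idxs[:, None], idxs], axis=1))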
Example #12
 def call(self, inputs, q_mask=False, v_mask=False, a_mask=False):
     """实现多头注意力
     q_mask: 对输入的query序列的mask。
             主要是将输出结果的padding部分置0。
     v_mask: 对输入的value序列的mask。
             主要是防止attention读取到padding信息。
     a_mask: 对attention矩阵的mask。
             不同的attention mask对应不同的应用。
     """
     # 处理mask
     inputs = inputs[:]
     for i, mask in enumerate([q_mask, v_mask, a_mask]):
         if not mask:
             inputs.insert(3 + i, None)
     q, k, v, q_mask, v_mask = inputs[:5]
     if len(inputs) == 5:
         a_mask = 'history_only'
     elif len(inputs) == 6:
         a_mask = inputs[-1]
     else:
         raise ValueError('wrong inputs for MultiHeadAttention.')
     # linear projections
     qw = self.q_dense(q)
     kw = self.k_dense(k)
     vw = self.v_dense(v)
     # reshape to (batch, seq_len, heads, size_per_head)
     qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
     kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
     vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
     # Attention
     a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
     # relative position encoding
     if self.max_relative_position is not None:
         q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
         q_idxs = K.expand_dims(q_idxs, 1)
         v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
         v_idxs = K.expand_dims(v_idxs, 0)
         pos_ids = v_idxs - q_idxs
         pos_ids = K.clip(pos_ids, -self.max_relative_position,
                          self.max_relative_position)
         pos_ids = pos_ids + self.max_relative_position
         pos_embeddings = K.gather(self.relative_embeddings, pos_ids)
         a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
     # Attention (continued)
     a = a / self.key_size**0.5
     a = sequence_masking(a, v_mask, 1, -1)
     if a_mask is not None:
         if is_string(a_mask):
             ones = K.ones_like(a[:1, :1])
             a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12
             a = a - a_mask
         else:
             a = a - (1 - a_mask) * 1e12
     a = K.softmax(a)
     # produce the output
     o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
     if self.max_relative_position is not None:
         o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
     o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
     o = self.o_dense(o)
     o = sequence_masking(o, q_mask, 0)
     return o
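A shape-level NumPy sketch of the attention einsums used above (the toy dimensions are assumptions, with key_size equal to head_size as in the default configuration); it only demonstrates how the content and relative-position terms combine:

import numpy as np

batch, heads, q_len, k_len, size = 2, 2, 3, 3, 4
qw = np.random.randn(batch, q_len, heads, size)
kw = np.random.randn(batch, k_len, heads, size)
vw = np.random.randn(batch, k_len, heads, size)
pos = np.random.randn(q_len, k_len, size)              # relative position embeddings

a = np.einsum('bjhd,bkhd->bhjk', qw, kw)               # content-content scores
a = a + np.einsum('bjhd,jkd->bhjk', qw, pos)           # content-position scores
a = a / size ** 0.5
a = np.exp(a - a.max(-1, keepdims=True))
a = a / a.sum(-1, keepdims=True)                       # softmax over the key axis
o = np.einsum('bhjk,bkhd->bjhd', a, vw)                # content values
o = o + np.einsum('bhjk,jkd->bjhd', a, pos)            # position-weighted values
# o.shape == (batch, q_len, heads, size)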
Example #13
 def call(self, inputs, q_mask=None, v_mask=None, a_mask=None):
     """实现多头注意力
     q_mask: 对输入的query序列的mask。
             主要是将输出结果的padding部分置0。
     v_mask: 对输入的value序列的mask。
             主要是防止attention读取到padding信息。
     a_mask: 对attention矩阵的mask。
             不同的attention mask对应不同的应用。
     """
     q, k, v = inputs[:3]
     if a_mask:
         if len(inputs) == 3:
             a_mask = 'history_only'
         else:
             a_mask = inputs[3]
     if q_mask is not None:
         if not hasattr(self, 'q_mask_layer'):
             self.q_mask_layer = search_layer(q, q_mask)
         q_mask = self.q_mask_layer.output_mask
     if v_mask is not None:
         if not hasattr(self, 'v_mask_layer'):
             self.v_mask_layer = search_layer(v, v_mask)
         v_mask = self.v_mask_layer.output_mask
     # Pooling
     if self.pool_size > 1:
         is_self_attention = (q is k is v)
         q_in_len = K.shape(q)[1]
         q = sequence_masking(q, q_mask, 0)
         q = divisible_temporal_padding(q, self.pool_size)
         q = pool1d(q, self.pool_size, self.pool_size, pool_mode='avg')
         if is_self_attention:
             k = v = q
         else:
             k = sequence_masking(k, v_mask, 0)
             k = divisible_temporal_padding(k, self.pool_size)
             k = pool1d(k, self.pool_size, self.pool_size, pool_mode='avg')
             v = sequence_masking(v, v_mask, 0)
             v = divisible_temporal_padding(v, self.pool_size)
             v = pool1d(v, self.pool_size, self.pool_size, pool_mode='avg')
         if v_mask is not None:
             v_mask = v_mask[:, ::self.pool_size]
         if a_mask is not None and not is_string(a_mask):
             a_mask = a_mask[..., ::self.pool_size, ::self.pool_size]
     # linear projections
     qw = self.q_dense(q)
     kw = self.k_dense(k)
     vw = self.v_dense(v)
     # reshape to (batch, seq_len, heads, size_per_head)
     qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
     kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
     vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
     # Attention
     a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
     # relative position encoding
     if self.max_relative_position is not None:
         q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
         q_idxs = K.expand_dims(q_idxs, 1)
         v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
         v_idxs = K.expand_dims(v_idxs, 0)
         pos_ids = v_idxs - q_idxs
         pos_ids = K.clip(pos_ids, -self.max_relative_position,
                          self.max_relative_position)
         pos_ids = pos_ids + self.max_relative_position
         pos_embeddings = K.gather(self.relative_embeddings, pos_ids)
         a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
     # Attention (continued)
     a = a / self.key_size**0.5
     a = sequence_masking(a, v_mask, 1, -1)
     if a_mask is not None:
         if is_string(a_mask):
             ones = K.ones_like(a[:1, :1])
             a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12
             a = a - a_mask
         else:
             a = a - (1 - a_mask) * 1e12
     a = K.softmax(a)
     # produce the output
     o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
     if self.max_relative_position is not None:
         o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
     o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
     o = self.o_dense(o)
     # restore the original sequence length
     if self.pool_size > 1:
         o = K.repeat_elements(o, self.pool_size, 1)[:, :q_in_len]
     # return the result
     o = sequence_masking(o, q_mask, 0)
     return o
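The pooling and length-restoration steps in this variant can be illustrated with a short NumPy sketch (pool_size and the toy sequence are assumptions; the pattern is pad to a multiple of pool_size, average-pool, then repeat and truncate back to the original length):

import numpy as np

pool_size = 2
x = np.arange(9, dtype='float32').reshape(1, 9, 1)        # (batch, seq_len, dim)
q_in_len = x.shape[1]

pad = (-q_in_len) % pool_size                             # pad seq_len up to a multiple of pool_size
xp = np.pad(x, ((0, 0), (0, pad), (0, 0)))
pooled = xp.reshape(1, -1, pool_size, 1).mean(axis=2)     # average pool along the time axis

restored = np.repeat(pooled, pool_size, axis=1)[:, :q_in_len]   # repeat and cut back to q_in_len
# restored.shape == x.shape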