def compute_position_ids(self, inputs): """T5的相对位置分桶(直接翻译自官方T5源码) 对所有模型使用 32 个嵌入,其数值范围的大小以对数方式增加,最大偏移量为128,超过此偏移量,所有相对位置使用同一嵌入。 需要注意的是,某一给定层对超过 128 的相对位置不敏感,但是后续层可以通过组合来自先前层的局部信息来建立对更大偏移的敏感性。 """ q, v = inputs # 计算位置差 q_idxs = K.arange(0, K.shape(q)[1], dtype='int32') q_idxs = K.expand_dims(q_idxs, 1) v_idxs = K.arange(0, K.shape(v)[1], dtype='int32') v_idxs = K.expand_dims(v_idxs, 0) pos_ids = v_idxs - q_idxs # 后处理操作 num_buckets, max_distance = self.input_dim, self.max_distance ret = 0 n = -pos_ids if self.bidirectional: num_buckets //= 2 ret += K.cast(K.less(n, 0), 'int32') * num_buckets n = K.abs(n) else: n = K.maximum(n, 0) # now n is in the range [0, inf) max_exact = num_buckets // 2 is_small = K.less(n, max_exact) val_if_large = max_exact + K.cast( K.log(K.cast(n, K.floatx()) / max_exact) / np.log(max_distance / max_exact) * (num_buckets - max_exact), 'int32', ) val_if_large = K.minimum(val_if_large, num_buckets - 1) ret += K.switch(is_small, n, val_if_large) return ret
def compute_position_ids(self, inputs): """T5的相对位置分桶(直接翻译自官方T5源码) """ q, v = inputs # 计算位置差 q_idxs = K.arange(0, K.shape(q)[1], dtype='int32') q_idxs = K.expand_dims(q_idxs, 1) v_idxs = K.arange(0, K.shape(v)[1], dtype='int32') v_idxs = K.expand_dims(v_idxs, 0) pos_ids = v_idxs - q_idxs # 后处理操作 num_buckets, max_distance = self.input_dim, self.max_distance ret = 0 n = -pos_ids if self.bidirectional: num_buckets //= 2 ret += K.cast(K.less(n, 0), 'int32') * num_buckets n = K.abs(n) else: n = K.maximum(n, 0) # now n is in the range [0, inf) max_exact = num_buckets // 2 is_small = K.less(n, max_exact) val_if_large = max_exact + K.cast( K.log(K.cast(n, K.floatx()) / max_exact) / np.log(max_distance / max_exact) * (num_buckets - max_exact), 'int32', ) val_if_large = K.minimum(val_if_large, num_buckets - 1) ret += K.switch(is_small, n, val_if_large) return ret
def call(self, inputs): """如果custom_position_ids,那么第二个输入为自定义的位置id """ input_shape = K.shape(inputs) batch_size, seq_len = input_shape[0], input_shape[1] if self.custom_position_ids: inputs, position_ids = inputs else: position_ids = K.arange(0, seq_len, dtype=K.floatx())[None] indices = K.arange(0, self.output_dim // 2, dtype=K.floatx()) indices = K.pow(10000.0, -2 * indices / self.output_dim) pos_embeddings = tf.einsum('bn,d->bnd', position_ids, indices) pos_embeddings = K.concatenate([ K.sin(pos_embeddings)[..., None], K.cos(pos_embeddings)[..., None] ]) pos_embeddings = K.reshape( pos_embeddings, (-1, seq_len, self.output_dim) ) if self.merge_mode == 'add': return inputs + pos_embeddings elif self.merge_mode == 'mul': return inputs * pos_embeddings else: if not self.custom_position_ids: pos_embeddings = K.tile(pos_embeddings, [batch_size, 1, 1]) return K.concatenate([inputs, pos_embeddings])
def call(self, inputs): """如果custom_position_ids,那么第二个输入为自定义的位置id """ if self.custom_position_ids: seq_len = K.shape(inputs)[1] inputs, position_ids = inputs if 'float' not in K.dtype(position_ids): position_ids = K.cast(position_ids, K.floatx()) else: input_shape = K.shape(inputs) batch_size, seq_len = input_shape[0], input_shape[1] position_ids = K.arange(0, seq_len, dtype=K.floatx())[None] indices = K.arange(0, self.output_dim // 2, dtype=K.floatx()) indices = K.pow(10000.0, -2 * indices / self.output_dim) embeddings = tf.einsum('bn,d->bnd', position_ids, indices) embeddings = K.stack([K.sin(embeddings), K.cos(embeddings)], axis=-1) embeddings = K.reshape(embeddings, (-1, seq_len, self.output_dim)) if self.merge_mode == 'add': return inputs + embeddings elif self.merge_mode == 'mul': return inputs * (embeddings + 1.0) else: if not self.custom_position_ids: embeddings = K.tile(embeddings, [batch_size, 1, 1]) return K.concatenate([inputs, embeddings])
def compute_position_ids(self, inputs):
    q, v = inputs
    # Compute the position differences
    q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
    q_idxs = K.expand_dims(q_idxs, 1)
    v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
    v_idxs = K.expand_dims(v_idxs, 0)
    pos_ids = v_idxs - q_idxs
    # Post-processing: clip to the maximum relative distance,
    # then shift so the ids are non-negative
    max_position = (self.input_dim - 1) // 2
    pos_ids = K.clip(pos_ids, -max_position, max_position)
    pos_ids = pos_ids + max_position
    return pos_ids
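# For illustration, the same clip-and-shift in plain NumPy with input_dim=7,
# i.e. offsets clipped to [-3, 3] and mapped to embedding ids 0..6
# (a sketch, not the layer itself):
import numpy as np

q_idxs = np.arange(5)[:, None]   # query positions, shape (5, 1)
v_idxs = np.arange(5)[None, :]   # value positions, shape (1, 5)
pos_ids = v_idxs - q_idxs        # relative offsets, shape (5, 5)

max_position = (7 - 1) // 2
pos_ids = np.clip(pos_ids, -max_position, max_position) + max_position
print(pos_ids)
# [[3 4 5 6 6]
#  [2 3 4 5 6]
#  [1 2 3 4 5]
#  [0 1 2 3 4]
#  [0 0 1 2 3]]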
def call(self, inputs): """如果custom_position_ids,那么第二个输入为自定义的位置id """ input_shape = K.shape(inputs) batch_size, seq_len = input_shape[0], input_shape[1] if self.custom_position_ids: inputs, position_ids = inputs if K.dtype(position_ids) != 'int32': position_ids = K.cast(position_ids, 'int32') else: position_ids = K.arange(0, seq_len, dtype='int32')[None] if self.hierarchical: alpha = 0.4 if self.hierarchical is True else self.hierarchical embeddings = self.embeddings - alpha * self.embeddings[:1] embeddings = embeddings / (1 - alpha) embeddings_x = K.gather(embeddings, position_ids // self.input_dim) embeddings_y = K.gather(embeddings, position_ids % self.input_dim) pos_embeddings = alpha * embeddings_x + (1 - alpha) * embeddings_y else: if self.custom_position_ids: pos_embeddings = K.gather(self.embeddings, position_ids) else: pos_embeddings = self.embeddings[None, :seq_len] if self.merge_mode == 'add': return inputs + pos_embeddings elif self.merge_mode == 'mul': return inputs * pos_embeddings else: if not self.custom_position_ids: pos_embeddings = K.tile(pos_embeddings, [batch_size, 1, 1]) return K.concatenate([inputs, pos_embeddings])
def get_labels_of_similarity(self, y_pred):
    idxs = K.arange(0, K.shape(y_pred)[0])
    idxs_1 = idxs[None, :]
    idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]
    labels = K.equal(idxs_1, idxs_2)
    labels = K.cast(labels, K.floatx())
    return labels
def batch_gather(params, indices):
    """params.shape = [b, n, d], indices.shape = [b].

    Selects the indices[i]-th vector from the i-th sequence of params,
    returning a tensor of shape [b, d].
    """
    indices = K.cast(indices, 'int32')
    batch_idxs = K.arange(0, K.shape(indices)[0])
    indices = K.stack([batch_idxs, indices], 1)
    return tf.gather_nd(params, indices)
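# Usage sketch for batch_gather, assuming TF 2.x eager mode and the K/tf
# imports used above:
import tensorflow as tf

params = tf.reshape(tf.range(2 * 3 * 4, dtype=tf.float32), (2, 3, 4))  # [b=2, n=3, d=4]
indices = tf.constant([2, 0])  # vector 2 from sequence 0, vector 0 from sequence 1
print(batch_gather(params, indices))
# [[ 8.  9. 10. 11.]
#  [12. 13. 14. 15.]]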
def get_labels_of_similarity(self, y_pred):
    idxs = K.arange(0, K.shape(y_pred)[0])  # values = [0, ..., batch - 1]
    idxs_1 = idxs[None, :]  # shape = (1, batch)
    idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]  # shape = (batch, 1)
    labels = K.equal(idxs_1, idxs_2)  # e.g. batch=2: [[False, True], [True, False]]
    labels = K.cast(labels, K.floatx())
    return labels
def get_labels_of_similarity(self, y_pred):
    idxs = K.arange(0, K.shape(y_pred)[0])  # 0 to btz - 1, shape (btz,)
    idxs_1 = idxs[None, :]  # (1, btz)
    idxs_2 = (idxs + 1 - idxs % 2 * 2)[:, None]  # (btz, 1)
    # (btz, btz): adjacent pairs in the batch are the similar ones
    # (the generator emits each pair in swapped order)
    labels = K.equal(idxs_1, idxs_2)
    # So within the batch, sample [0] is True at position 1, [1] at position 0,
    # [2] at position 3, [3] at position 2, and so on.
    labels = K.cast(labels, K.floatx())  # convert True/False to 1/0
    # [[0, 1, 0, 0, 0, 0],
    #  [1, 0, 0, 0, 0, 0],
    #  ...
    #  [0, 0, 0, 0, 0, 1],
    #  [0, 0, 0, 0, 1, 0]]
    return labels  # (btz, btz)
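# The index arithmetic used by the get_labels_of_similarity variants above,
# run in plain NumPy for a batch of 4 (a standalone check, outside the layer):
import numpy as np

idxs = np.arange(4)
labels = (idxs[None, :] == (idxs + 1 - idxs % 2 * 2)[:, None]).astype('float32')
print(labels)
# [[0. 1. 0. 0.]
#  [1. 0. 0. 0.]
#  [0. 0. 0. 1.]
#  [0. 0. 1. 0.]]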
def seq_gather(x: list):
    """Gather one vector per sequence from the inputs in x.

    seq has shape [batch_size, seq_len, vector_size] and idxs has shape
    [batch_size, 1]. Selects the idxs[i]-th vector from the i-th sequence
    of seq, producing an output of shape [batch_size, vector_size].

    :param x: [seq, idxs], where seq is the original sequence tensor and
        idxs holds the index of the vector to extract from each sequence
    :return: the gathered token vectors
    """
    # Unpack the sequence tensor and the index tensor,
    # e.g. idxs = [[4], [9], [8], [11], [23], [45], [60], [30]]
    seq, idxs = x
    # Cast the indices to integers
    idxs = K.cast(idxs, 'int32')
    # Build the batch indices with Keras ops: [0, 1, 2, 3, 4, 5, 6, 7]
    batch_idxs = K.arange(0, K.shape(seq)[0])
    # Expand dim 1 so it can be concatenated with idxs:
    # [[0], [1], [2], [3], [4], [5], [6], [7]]
    batch_idxs = K.expand_dims(batch_idxs, 1)
    # Concatenate batch_idxs with idxs:
    # [[0, 4], [1, 9], [2, 8], [3, 11], [4, 23], [5, 45], [6, 60], [7, 30]]
    idxs = K.concatenate([batch_idxs, idxs], 1)
    # Gather the vector at each (batch, position) pair from seq
    return tf.gather_nd(seq, idxs)
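# seq_gather does the same job as batch_gather above, but takes idxs of shape
# [batch_size, 1] and a single list argument (the Lambda-layer calling
# convention). A usage sketch, assuming TF 2.x eager mode:
import tensorflow as tf

seq = tf.reshape(tf.range(2 * 3 * 4, dtype=tf.float32), (2, 3, 4))
idxs = tf.constant([[2], [0]])
print(seq_gather([seq, idxs]))
# [[ 8.  9. 10. 11.]
#  [12. 13. 14. 15.]]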
def call(self, inputs, q_mask=False, v_mask=False, a_mask=False):
    """Multi-head attention.

    q_mask: mask for the input query sequence;
            used mainly to zero out the padding part of the output.
    v_mask: mask for the input value sequence;
            used mainly to keep attention from reading padding information.
    a_mask: mask for the attention matrix;
            different attention masks serve different applications.
    """
    # Handle the masks: insert None placeholders for masks not passed in
    inputs = inputs[:]
    for i, mask in enumerate([q_mask, v_mask, a_mask]):
        if not mask:
            inputs.insert(3 + i, None)
    q, k, v, q_mask, v_mask = inputs[:5]
    if len(inputs) == 5:
        a_mask = 'history_only'
    elif len(inputs) == 6:
        a_mask = inputs[-1]
    else:
        raise ValueError('wrong inputs for MultiHeadAttention.')
    # Linear projections
    qw = self.q_dense(q)
    kw = self.k_dense(k)
    vw = self.v_dense(v)
    # Reshape to (batch, seq_len, heads, size_per_head)
    qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
    kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
    vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
    # Attention
    a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
    # Relative position encoding
    if self.max_relative_position is not None:
        q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
        q_idxs = K.expand_dims(q_idxs, 1)
        v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
        v_idxs = K.expand_dims(v_idxs, 0)
        pos_ids = v_idxs - q_idxs
        pos_ids = K.clip(
            pos_ids, -self.max_relative_position, self.max_relative_position
        )
        pos_ids = pos_ids + self.max_relative_position
        pos_embeddings = K.gather(self.relative_embeddings, pos_ids)
        a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
    # Attention (continued): scale, mask, softmax
    a = a / self.key_size**0.5
    a = sequence_masking(a, v_mask, 1, -1)
    if a_mask is not None:
        if is_string(a_mask):
            ones = K.ones_like(a[:1, :1])
            a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12
            a = a - a_mask
        else:
            a = a - (1 - a_mask) * 1e12
    a = K.softmax(a)
    # Produce the output
    o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
    if self.max_relative_position is not None:
        o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
    o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
    o = self.o_dense(o)
    o = sequence_masking(o, q_mask, 0)
    return o
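# The 'history_only' branch above builds a causal mask from band_part.
# A standalone sketch of just that mask for seq_len=4 (shapes reduced to
# (j, k) for readability):
import tensorflow as tf

ones = tf.ones((4, 4))
a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12
print(a_mask.numpy())
# [[0.e+00 1.e+12 1.e+12 1.e+12]
#  [0.e+00 0.e+00 1.e+12 1.e+12]
#  [0.e+00 0.e+00 0.e+00 1.e+12]
#  [0.e+00 0.e+00 0.e+00 0.e+00]]
# Subtracting this before the softmax drives attention to future positions
# toward zero.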
def call(self, inputs, q_mask=None, v_mask=None, a_mask=None):
    """Multi-head attention.

    q_mask: mask for the input query sequence;
            used mainly to zero out the padding part of the output.
    v_mask: mask for the input value sequence;
            used mainly to keep attention from reading padding information.
    a_mask: mask for the attention matrix;
            different attention masks serve different applications.
    """
    q, k, v = inputs[:3]
    if a_mask:
        if len(inputs) == 3:
            a_mask = 'history_only'
        else:
            a_mask = inputs[3]
    if q_mask is not None:
        if not hasattr(self, 'q_mask_layer'):
            self.q_mask_layer = search_layer(q, q_mask)
        q_mask = self.q_mask_layer.output_mask
    if v_mask is not None:
        if not hasattr(self, 'v_mask_layer'):
            self.v_mask_layer = search_layer(v, v_mask)
        v_mask = self.v_mask_layer.output_mask
    # Pooling: shorten the sequences before attention
    if self.pool_size > 1:
        is_self_attention = (q is k is v)
        q_in_len = K.shape(q)[1]
        q = sequence_masking(q, q_mask, 0)
        q = divisible_temporal_padding(q, self.pool_size)
        q = pool1d(q, self.pool_size, self.pool_size, pool_mode='avg')
        if is_self_attention:
            k = v = q
        else:
            k = sequence_masking(k, v_mask, 0)
            k = divisible_temporal_padding(k, self.pool_size)
            k = pool1d(k, self.pool_size, self.pool_size, pool_mode='avg')
            v = sequence_masking(v, v_mask, 0)
            v = divisible_temporal_padding(v, self.pool_size)
            v = pool1d(v, self.pool_size, self.pool_size, pool_mode='avg')
        if v_mask is not None:
            v_mask = v_mask[:, ::self.pool_size]
        if a_mask is not None and not is_string(a_mask):
            a_mask = a_mask[..., ::self.pool_size, ::self.pool_size]
    # Linear projections
    qw = self.q_dense(q)
    kw = self.k_dense(k)
    vw = self.v_dense(v)
    # Reshape to (batch, seq_len, heads, size_per_head)
    qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
    kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
    vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
    # Attention
    a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
    # Relative position encoding
    if self.max_relative_position is not None:
        q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
        q_idxs = K.expand_dims(q_idxs, 1)
        v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
        v_idxs = K.expand_dims(v_idxs, 0)
        pos_ids = v_idxs - q_idxs
        pos_ids = K.clip(
            pos_ids, -self.max_relative_position, self.max_relative_position
        )
        pos_ids = pos_ids + self.max_relative_position
        pos_embeddings = K.gather(self.relative_embeddings, pos_ids)
        a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
    # Attention (continued): scale, mask, softmax
    a = a / self.key_size**0.5
    a = sequence_masking(a, v_mask, 1, -1)
    if a_mask is not None:
        if is_string(a_mask):
            ones = K.ones_like(a[:1, :1])
            a_mask = (ones - tf.linalg.band_part(ones, -1, 0)) * 1e12
            a = a - a_mask
        else:
            a = a - (1 - a_mask) * 1e12
    a = K.softmax(a)
    # Produce the output
    o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
    if self.max_relative_position is not None:
        o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
    o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
    o = self.o_dense(o)
    # Restore the original sequence length
    if self.pool_size > 1:
        o = K.repeat_elements(o, self.pool_size, 1)[:, :q_in_len]
    # Return the result
    o = sequence_masking(o, q_mask, 0)
    return o
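# After pooling, the output is stretched back to the input length by the
# repeat-then-truncate step above. A NumPy sketch with pool_size=2 and an
# original (odd) length of 5; np.repeat mirrors K.repeat_elements here:
import numpy as np

pool_size, q_in_len = 2, 5
o = np.array([[[1.], [2.], [3.]]])  # pooled output, shape (1, ceil(5/2), d=1)
o = np.repeat(o, pool_size, axis=1)[:, :q_in_len]
print(o[..., 0])  # [[1. 1. 2. 2. 3.]] -- each pooled step covers pool_size positions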