def forward(self, query, key=None, value=None, attn_mask=None, cache=None):
    """
    Applies multi-head attention to map queries and a set of key-value pairs
    to outputs.

    Parameters:
        query (Tensor): The queries for multi-head attention. It is a tensor
            with shape `[batch_size, query_length, embed_dim]`. The data type
            should be float32 or float64.
        key (Tensor, optional): The keys for multi-head attention. It is a
            tensor with shape `[batch_size, key_length, kdim]`. The data type
            should be float32 or float64. If None, use `query` as `key`.
            Default None.
        value (Tensor, optional): The values for multi-head attention. It is
            a tensor with shape `[batch_size, value_length, vdim]`. The data
            type should be float32 or float64. If None, use `query` as
            `value`. Default None.
        attn_mask (Tensor, optional): A tensor used in multi-head attention
            to prevent attention to some unwanted positions, usually the
            paddings or the subsequent positions. It is a tensor with shape
            broadcasted to `[batch_size, n_head, sequence_length,
            sequence_length]`. When the data type is bool, the unwanted
            positions have `False` values and the others have `True` values.
            When the data type is int, the unwanted positions have 0 values
            and the others have 1 values. When the data type is float, the
            unwanted positions have `-INF` values and the others have 0
            values. It can be None when nothing needs to be prevented from
            being attended to. Default None.
        cache (MultiHeadAttention.Cache|MultiHeadAttention.StaticCache, optional):
            Now, only None is supported. Default None.

    Returns:
        Tensor|tuple: It is a tensor that has the same shape and data type \
            as `query`, representing attention output.
    """
    if attn_mask is not None:
        # Support bool or int mask
        attn_mask = _convert_attention_mask(attn_mask, query.dtype)
    assert cache is None, "Only support cache is None now."

    out = incubate_f.fused_multi_head_attention(
        x=query,
        qkv_weight=self.qkv_weight,
        linear_weight=self.linear_weight,
        pre_layer_norm=self.normalize_before,
        pre_ln_scale=self.pre_ln_scale,
        pre_ln_bias=self.pre_ln_bias,
        ln_scale=self.ln_scale,
        ln_bias=self.ln_bias,
        pre_ln_epsilon=self._epsilon,
        qkv_bias=self.qkv_bias,
        linear_bias=self.linear_bias,
        attn_mask=attn_mask,
        dropout_rate=self.dropout_rate,
        attn_dropout_rate=self.attn_dropout_rate,
        ln_epsilon=self._epsilon,
        training=self.training,
        name=self.name)
    return out
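# Usage sketch for the forward method above (not part of the original file):
# it assumes the public paddle.incubate.nn.FusedMultiHeadAttention layer and a
# CUDA device, since the fused op only has a GPU kernel. Shapes follow the
# docstring; constructor arguments are embed_dim and num_heads.
#
# import paddle
# from paddle.incubate.nn import FusedMultiHeadAttention
#
# paddle.device.set_device('gpu')                      # fused op is CUDA-only
# query = paddle.rand((2, 4, 128))                     # [batch, seq_len, embed_dim]
# attn_mask = paddle.ones((2, 2, 4, 4), dtype='bool')  # True = attend, False = masked
# fused_attn = FusedMultiHeadAttention(128, 2)         # embed_dim=128, num_heads=2
# out = fused_attn(query, attn_mask=attn_mask)         # same shape as `query`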
def GetBaselineOut(self):
    paddle.disable_static(place=paddle.CUDAPlace(0))
    tensor_query = paddle.to_tensor(self.query, stop_gradient=False)

    if self.has_attn_mask:
        attn_mask = paddle.to_tensor(self.attn_mask, stop_gradient=False)
    else:
        attn_mask = None

    residual = tensor_query
    ln1_out = tensor_query
    if self.pre_layer_norm:
        ln1_out = self.norm1(tensor_query)

    q = self.q_proj(ln1_out)
    q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
    q_out = tensor.transpose(x=q, perm=[0, 2, 1, 3])
    k = self.k_proj(ln1_out)
    v = self.v_proj(ln1_out)
    k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
    k_out = tensor.transpose(x=k, perm=[0, 2, 1, 3])
    v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
    v_out = tensor.transpose(x=v, perm=[0, 2, 1, 3])

    qk_out = layers.matmul(x=q_out,
                           y=k_out,
                           transpose_y=True,
                           alpha=self.head_dim**-0.5)

    if attn_mask is not None:
        attn_mask = _convert_attention_mask(attn_mask, qk_out.dtype)
        attn_mask_out = qk_out + attn_mask
        softmax_out = F.softmax(attn_mask_out)
    else:
        softmax_out = F.softmax(qk_out)

    if self.dropout_prob:
        dropout_out = F.dropout(softmax_out,
                                self.dropout_prob,
                                training=self.training,
                                mode="upscale_in_train")
        qktv_out = tensor.matmul(dropout_out, v_out)
    else:
        qktv_out = tensor.matmul(softmax_out, v_out)

    fmha_out = tensor.transpose(qktv_out, perm=[0, 2, 1, 3])
    out_linear_in = tensor.reshape(
        x=fmha_out, shape=[0, 0, fmha_out.shape[2] * fmha_out.shape[3]])
    out = self.out_proj(out_linear_in)

    residual_out = residual + self.dropout(out)
    if not self.pre_layer_norm:
        final_out = self.norm1(residual_out)
    else:
        final_out = residual_out

    paddle.autograd.backward([final_out], [paddle.to_tensor(self.dout)],
                             retain_graph=True)
    return final_out, tensor_query.grad
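# Illustrative sketch (an assumption, not the library's implementation of
# _convert_attention_mask): the additive-mask convention used by the baseline
# above. A bool mask with False at padded positions behaves like a float mask
# holding a very large negative value (standing in for -INF) at those
# positions, so adding it to qk_out before softmax drives their attention
# weights to ~0.
#
# import paddle
# import paddle.nn.functional as F
#
# bool_mask = paddle.to_tensor([[True, True, False]])
# float_mask = paddle.where(bool_mask,
#                           paddle.zeros(bool_mask.shape, dtype='float32'),
#                           paddle.full(bool_mask.shape, -1e9, dtype='float32'))
# scores = paddle.zeros([1, 3], dtype='float32')
# probs = F.softmax(scores + float_mask)   # ~[0.5, 0.5, 0.0]: masked position ignored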
def forward(self, tgt, memory, tgt_mask=None, memory_mask=None, cache=None):
    """
    Please refer to :class:`~paddlenlp.nn.TransformerDecoder` for more
    information regarding arguments and methods.
    """
    tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)
    if memory is not None:
        memory_mask = _convert_attention_mask(memory_mask, memory.dtype)

    output = tgt
    new_caches = []
    for i, mod in enumerate(self.layers):
        if cache is None:
            output = mod(output,
                         memory,
                         tgt_mask=tgt_mask,
                         memory_mask=memory_mask,
                         cache=None)
        else:
            output, new_cache = mod(output,
                                    memory,
                                    tgt_mask=tgt_mask,
                                    memory_mask=memory_mask,
                                    cache=cache[i])
            new_caches.append(new_cache)

    if self.norm is not None:
        output = self.norm(output)

    return output if cache is None else (output, new_caches)
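# Sketch of how the per-layer `cache[i]` list consumed above is typically
# produced. This uses the standard paddle.nn decoder (an assumption: the
# decoder here mirrors that API via gen_cache); it is illustrative, not the
# file's own test code.
#
# import paddle
# from paddle.nn import TransformerDecoder, TransformerDecoderLayer
#
# layer = TransformerDecoderLayer(d_model=128, nhead=2, dim_feedforward=512)
# decoder = TransformerDecoder(layer, num_layers=2)
# tgt = paddle.rand((2, 4, 128))
# memory = paddle.rand((2, 6, 128))
# cache = decoder.gen_cache(memory)   # one (incremental, static) pair per layer
# output, new_caches = decoder(tgt, memory, cache=cache)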
def forward(self, src, src_mask=None, cache=None):
    """
    Applies a Transformer encoder layer on the input.

    Parameters:
        src (Tensor): The input of Transformer encoder layer. It is a tensor
            with shape `[batch_size, sequence_length, d_model]`. The data
            type should be float32 or float64.
        src_mask (Tensor, optional): A tensor used in multi-head attention
            to prevent attention to some unwanted positions, usually the
            paddings or the subsequent positions. It is a tensor with shape
            broadcasted to `[batch_size, n_head, sequence_length,
            sequence_length]`. When the data type is bool, the unwanted
            positions have `False` values and the others have `True` values.
            When the data type is int, the unwanted positions have 0 values
            and the others have 1 values. When the data type is float, the
            unwanted positions have `-INF` values and the others have 0
            values. It can be None when nothing needs to be prevented from
            being attended to. Default None.
        cache (Tensor, optional): It is an instance of
            `MultiHeadAttention.Cache`. See `TransformerEncoderLayer.gen_cache`
            for more details. It is only used for inference and should be
            None for training. Default None.

    Returns:
        Tensor|tuple: It is a tensor that has the same shape and data type \
            as `src`, representing the output of Transformer encoder layer. \
            Or a tuple if `cache` is not None, except for encoder layer \
            output, the tuple includes the new cache which is the same as \
            the input `cache` argument but `incremental_cache` has an \
            incremental length. See `MultiHeadAttention.gen_cache` and \
            `MultiHeadAttention.forward` for more details.
    """
    src_mask = _convert_attention_mask(src_mask, src.dtype)
    if cache is None:
        attn_out = self.fused_attn(src, attn_mask=src_mask)
    else:
        attn_out, incremental_cache = self.fused_attn(src,
                                                      attn_mask=src_mask,
                                                      cache=cache)

    ffn_out = self.ffn(attn_out)

    return ffn_out if cache is None else (ffn_out, incremental_cache)
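# Usage sketch for the encoder layer above (not from the original file): it
# assumes the public paddle.incubate.nn.FusedTransformerEncoderLayer, which
# wraps the fused attention and FFN blocks used here, and a CUDA device.
#
# import paddle
# from paddle.incubate.nn import FusedTransformerEncoderLayer
#
# paddle.device.set_device('gpu')                        # fused kernels are CUDA-only
# enc_layer = FusedTransformerEncoderLayer(128, 2, 512)  # d_model, nhead, dim_feedforward
# src = paddle.rand((2, 4, 128))                         # [batch, seq_len, d_model]
# src_mask = paddle.ones((2, 2, 4, 4), dtype='bool')     # True = attend, False = masked
# enc_out = enc_layer(src, src_mask=src_mask)            # same shape as `src`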
def GetFusedMultiTransformerOut(self):
    paddle.disable_static(place=paddle.CUDAPlace(0))
    q_proj_weight = paddle.to_tensor(self.q_proj.weight, stop_gradient=False)
    k_proj_weight = paddle.to_tensor(self.k_proj.weight, stop_gradient=False)
    v_proj_weight = paddle.to_tensor(self.v_proj.weight, stop_gradient=False)
    out_linear_weight = paddle.to_tensor(self.out_proj.weight,
                                         stop_gradient=False)
    ffn1_weight = paddle.to_tensor(self.ffn1_proj.weight, stop_gradient=False)
    ffn2_weight = paddle.to_tensor(self.ffn2_proj.weight, stop_gradient=False)

    if self.bias_attr is False:
        qkv_bias_tensor = None
        out_linear_bias = None
    else:
        q_proj_bias = paddle.to_tensor(self.q_proj.bias, stop_gradient=False)
        k_proj_bias = paddle.to_tensor(self.k_proj.bias, stop_gradient=False)
        v_proj_bias = paddle.to_tensor(self.v_proj.bias, stop_gradient=False)
        qkv_bias = np.concatenate(
            (q_proj_bias.numpy(), k_proj_bias.numpy(), v_proj_bias.numpy()))
        qkv_bias = qkv_bias.reshape((3, self.num_heads, self.head_dim))
        qkv_bias_tensor = paddle.to_tensor(qkv_bias, stop_gradient=False)
        out_linear_bias = paddle.to_tensor(self.out_proj.bias,
                                           stop_gradient=False)

    ffn1_bias = paddle.to_tensor(self.ffn1_proj.bias, stop_gradient=False)
    ffn2_bias = paddle.to_tensor(self.ffn2_proj.bias, stop_gradient=False)

    ln_scale = paddle.to_tensor(self.norm.weight, stop_gradient=False)
    ln_bias = paddle.to_tensor(self.norm.bias, stop_gradient=False)
    ffn_ln_scale = paddle.to_tensor(self.ffn_norm.weight, stop_gradient=False)
    ffn_ln_bias = paddle.to_tensor(self.ffn_norm.bias, stop_gradient=False)

    q_proj_weight = q_proj_weight.numpy().transpose((1, 0))
    k_proj_weight = k_proj_weight.numpy().transpose((1, 0))
    v_proj_weight = v_proj_weight.numpy().transpose((1, 0))
    qkv_weight = np.concatenate(
        (q_proj_weight, k_proj_weight, v_proj_weight))
    qkv_weight = qkv_weight.reshape(
        (3, self.num_heads, self.head_dim, self.embed_dim))

    x = paddle.to_tensor(self.query, stop_gradient=False)
    cache_kvs, cache_kv = None, None
    time_step = None
    if self.has_cache_kv:
        cache_kvs = []

        max_seq_length = (self.cache_length + 128) // 128 * 128
        cache_kv = np.zeros(
            [
                2, self.batch_size, self.num_heads, max_seq_length,
                self.head_dim
            ],
            dtype=self.x_type)

        elems = 4
        if self.x_type is np.float16:
            elems = 8

        assert self.head_dim % elems == 0
        v_elems = self.head_dim // elems

        # [B, num_head, 128, head_dim]
        # cache_k_tmp = self.cache_kv[0, :]
        # [B, num_head, 128, head_dim / 4, 4]
        cache_k_tmp = self.cache_kv[0].reshape([
            self.batch_size, self.num_heads, self.cache_length, v_elems,
            elems
        ])
        # [B, num_head, head_dim / 4, 128, 4]
        cache_k_tmp = cache_k_tmp.transpose([0, 1, 3, 2, 4])

        cache_kv[0, :].reshape([
            self.batch_size, self.num_heads, v_elems, max_seq_length, elems
        ])[:, :, :, :self.cache_length, :] = cache_k_tmp

        cache_kv[1, :, :, :self.cache_length, :] = self.cache_kv[1]

        if self.gen_cache_kv:
            assert self.query_length == self.cache_length
            cache_kv[:] = 0
        else:
            time_step = paddle.to_tensor([self.cache_length],
                                         dtype='int32',
                                         place=paddle.CPUPlace())

    if self.has_attn_mask:
        attn_mask = paddle.to_tensor(self.attn_mask, stop_gradient=False)
    else:
        attn_mask = None

    qkv_weight_tensor = paddle.to_tensor(qkv_weight, stop_gradient=False)
    epsilon = 1e-05
    ln2_epsilon = 1e-05

    # np.bool was removed in recent NumPy releases; use np.bool_ instead.
    if attn_mask is not None and self.attn_mask_type != np.bool_:
        attn_mask = _convert_attention_mask(attn_mask, x.dtype)

    qkv_weights, qkv_biases = [], []
    out_weights, out_biases = [], []
    ln_scales, ln_biases = [], []
    ffn1_weights, ffn1_biases = [], []
    ffn2_weights, ffn2_biases = [], []
    ffn_ln_scales, ffn_ln_biases = [], []
    for i in range(self.layers):
        qkv_weights.append(qkv_weight_tensor)
        qkv_biases.append(qkv_bias_tensor)
        out_weights.append(out_linear_weight)
        out_biases.append(out_linear_bias)
        ln_scales.append(ln_scale)
        ln_biases.append(ln_bias)
        ffn1_weights.append(ffn1_weight)
        ffn1_biases.append(ffn1_bias)
        ffn2_weights.append(ffn2_weight)
        ffn2_biases.append(ffn2_bias)
        ffn_ln_scales.append(ffn_ln_scale)
        ffn_ln_biases.append(ffn_ln_bias)
        if self.has_cache_kv:
            cache_kvs.append(paddle.to_tensor(cache_kv, stop_gradient=False))

    final_out = fused_multi_transformer(x,
                                        ln_scales,
                                        ln_biases,
                                        qkv_weights,
                                        qkv_biases,
                                        out_weights,
                                        out_biases,
                                        ffn_ln_scales,
                                        ffn_ln_biases,
                                        ffn1_weights,
                                        ffn1_biases,
                                        ffn2_weights,
                                        ffn2_biases,
                                        pre_layer_norm=self.pre_layer_norm,
                                        epsilon=epsilon,
                                        cache_kvs=cache_kvs,
                                        time_step=time_step,
                                        attn_mask=attn_mask,
                                        dropout_rate=self.dropout_prob,
                                        training=self.training)

    if self.has_cache_kv:
        return final_out[0], final_out[1]

    return final_out
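# Illustrative NumPy sketch (small, made-up dimensions) of the packed
# key-cache layout prepared in GetFusedMultiTransformerOut above: keys are
# stored as [B, num_head, head_dim/elems, max_seq_len, elems] so the fused
# kernel can read `elems` contiguous values per access, while values keep the
# plain [B, num_head, max_seq_len, head_dim] layout.
#
# import numpy as np
#
# B, H, L, D, elems = 1, 2, 4, 8, 4
# max_L = 128
# v_elems = D // elems
#
# k = np.random.rand(B, H, L, D).astype('float32')
# packed = np.zeros((B, H, v_elems, max_L, elems), dtype='float32')
# packed[:, :, :, :L, :] = k.reshape(B, H, L, v_elems,
#                                    elems).transpose(0, 1, 3, 2, 4)
#
# # Round-trip check: the first L positions recover the original keys.
# restored = packed[:, :, :, :L, :].transpose(0, 1, 3, 2, 4).reshape(B, H, L, D)
# assert np.array_equal(restored, k)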
def GetBaselineOut(self):
    paddle.disable_static(place=paddle.CUDAPlace(0))
    tensor_query = paddle.to_tensor(self.query, stop_gradient=False)

    cache_kvs = []
    cache_kv = None
    if self.has_cache_kv:
        cache_kv = paddle.to_tensor(self.cache_kv, stop_gradient=False)

    if self.has_attn_mask:
        attn_mask = paddle.to_tensor(self.attn_mask, stop_gradient=False)
    else:
        attn_mask = None

    for i in range(self.layers):
        residual = tensor_query
        ln1_out = tensor_query
        if self.pre_layer_norm:
            ln1_out = self.norm(tensor_query)

        q = self.q_proj(ln1_out)
        q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
        q_out = tensor.transpose(x=q, perm=[0, 2, 1, 3])
        k = self.k_proj(ln1_out)
        v = self.v_proj(ln1_out)
        k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
        k_out = tensor.transpose(x=k, perm=[0, 2, 1, 3])
        v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
        v_out = tensor.transpose(x=v, perm=[0, 2, 1, 3])

        if self.has_cache_kv:
            # [1, B, n_head, cache_seq_len, head_dim]
            cache_k, cache_v = paddle.split(cache_kv, 2)
            cache_k = paddle.squeeze(cache_k, axis=0)
            cache_v = paddle.squeeze(cache_v, axis=0)
            # [B, n_head, cache_seq_len + seq_len, head_dim]
            # out_seq_len = cache_seq_len + seq_len
            if self.debug:
                print('q out is')
                print(q_out[0, 0, :, :])
                print('cache k out seq=128')
                print(k_out[0, 0, :, :])
            if self.gen_cache_kv:
                cache_kvs.append((k_out, v_out))
            else:
                k_out = paddle.concat([cache_k, k_out], axis=-2)
                v_out = paddle.concat([cache_v, v_out], axis=-2)

        # [B, n_head, seq_len, head_dim] * [B, n_head, out_seq_len, head_dim]
        # --> [B, n_head, seq_len, out_seq_len]
        qk_out = layers.matmul(x=q_out,
                               y=k_out,
                               transpose_y=True,
                               alpha=self.head_dim**-0.5)

        if self.debug:
            print('qk out is')
            print(qk_out[0][0][0])

        if attn_mask is not None:
            attn_mask = _convert_attention_mask(attn_mask, qk_out.dtype)
            attn_mask_out = qk_out + attn_mask
            if self.debug:
                print('attn mask out is')
                print(attn_mask_out[0][0][0])
            softmax_out = F.softmax(attn_mask_out)
        else:
            softmax_out = F.softmax(qk_out)

        if self.debug:
            print('softmax out is')
            print(softmax_out[0][0][0])

        if self.dropout_prob:
            dropout_out = F.dropout(softmax_out,
                                    self.dropout_prob,
                                    training=self.training,
                                    mode="upscale_in_train")
            # [B, n_head, seq_len, out_seq_len] * [B, n_head, out_seq_len, head_dim]
            # --> [B, n_head, seq_len, head_dim]
            qktv_out = tensor.matmul(dropout_out, v_out)
        else:
            qktv_out = tensor.matmul(softmax_out, v_out)

        fmha_out = tensor.transpose(qktv_out, perm=[0, 2, 1, 3])
        if self.debug:
            print('fmha out is')
            print(fmha_out[0][0][0])

        out_linear_in = tensor.reshape(
            x=fmha_out,
            shape=[0, 0, fmha_out.shape[2] * fmha_out.shape[3]])
        out = self.out_proj(out_linear_in)

        residual_out = residual + self.dropout(out)
        if not self.pre_layer_norm:
            attn_out = self.norm(residual_out)
        else:
            attn_out = residual_out

        ffn_ln_out = attn_out
        if self.pre_layer_norm:
            ffn_ln_out = self.ffn_norm(attn_out)

        ffn1_out = self.ffn1_proj(ffn_ln_out)
        ffn1_out = self.dropout(self.activation(ffn1_out))
        ffn2_out = self.ffn2_proj(ffn1_out)

        residual_out = attn_out + self.dropout(ffn2_out)
        final_out = residual_out
        if not self.pre_layer_norm:
            final_out = self.ffn_norm(residual_out)

        tensor_query = final_out

    if self.has_cache_kv and self.gen_cache_kv:
        return final_out, cache_kvs

    return final_out
def forward(self, tgt, memory=None, tgt_mask=None, memory_mask=None,
            cache=None):
    """
    Please refer to :class:`~paddlenlp.nn.TransformerDecoderLayer` for more
    information regarding arguments.
    """
    tgt_mask = _convert_attention_mask(tgt_mask, tgt.dtype)

    residual = tgt
    if self.normalize_before:
        tgt = self.norm1(tgt)
    if cache is None:
        tgt = self.self_attn(query=tgt,
                             key=tgt,
                             value=tgt,
                             attn_mask=tgt_mask,
                             cache=None)
    else:
        tgt, incremental_cache = self.self_attn(query=tgt,
                                                key=tgt,
                                                value=tgt,
                                                attn_mask=tgt_mask,
                                                cache=cache[0])
    tgt = residual + self.dropout1(tgt)
    if not self.normalize_before:
        tgt = self.norm1(tgt)

    # Cross-attention will not be applied for BlenderbotSmallForCausalLM
    if memory is not None:
        residual = tgt
        if self.normalize_before:
            tgt = self.norm2(tgt)
        memory_mask = _convert_attention_mask(memory_mask, memory.dtype)
        if cache is None:
            tgt = self.cross_attn(query=tgt,
                                  key=memory,
                                  value=memory,
                                  attn_mask=memory_mask,
                                  cache=None)
        else:
            tgt, static_cache = self.cross_attn(query=tgt,
                                                key=memory,
                                                value=memory,
                                                attn_mask=memory_mask,
                                                cache=cache[1])
        tgt = residual + self.dropout2(tgt)
        if not self.normalize_before:
            tgt = self.norm2(tgt)
    else:
        static_cache = cache[1] if cache is not None else None

    residual = tgt
    if self.normalize_before:
        tgt = self.norm3(tgt)
    tgt = self.linear2(self.dropout(self.activation(self.linear1(tgt))))
    tgt = residual + self.dropout3(tgt)
    if not self.normalize_before:
        tgt = self.norm3(tgt)

    return tgt if cache is None else (tgt, (incremental_cache, static_cache))
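# Sketch of the cache pair consumed above, i.e. cache[0] as the self-attention
# incremental cache and cache[1] as the cross-attention static cache. It uses
# the standard paddle.nn layer as a stand-in (an assumption: the variant above
# exposes a matching gen_cache); purely illustrative.
#
# import paddle
# from paddle.nn import TransformerDecoderLayer
#
# layer = TransformerDecoderLayer(d_model=128, nhead=2, dim_feedforward=512)
# memory = paddle.rand((2, 6, 128))
# cache = layer.gen_cache(memory)       # (incremental_cache, static_cache)
# tgt_step = paddle.rand((2, 1, 128))   # a single decoding step
# out, cache = layer(tgt_step, memory, cache=cache)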
def GetFusedAttentionOut(self):
    paddle.disable_static(place=paddle.CUDAPlace(0))
    q_proj_weight = paddle.to_tensor(self.q_proj.weight, stop_gradient=False)
    k_proj_weight = paddle.to_tensor(self.k_proj.weight, stop_gradient=False)
    v_proj_weight = paddle.to_tensor(self.v_proj.weight, stop_gradient=False)
    out_linear_weight = paddle.to_tensor(self.out_proj.weight,
                                         stop_gradient=False)

    if self.bias_attr is False:
        qkv_bias_tensor = None
        out_linear_bias = None
    else:
        q_proj_bias = paddle.to_tensor(self.q_proj.bias, stop_gradient=False)
        k_proj_bias = paddle.to_tensor(self.k_proj.bias, stop_gradient=False)
        v_proj_bias = paddle.to_tensor(self.v_proj.bias, stop_gradient=False)
        qkv_bias = np.concatenate(
            (q_proj_bias.numpy(), k_proj_bias.numpy(), v_proj_bias.numpy()))
        qkv_bias = qkv_bias.reshape((3, self.num_heads, self.head_dim))
        qkv_bias_tensor = paddle.to_tensor(qkv_bias, stop_gradient=False)
        out_linear_bias = paddle.to_tensor(self.out_proj.bias,
                                           stop_gradient=False)

    ln1_scale = paddle.to_tensor(self.norm1.weight, stop_gradient=False)
    ln1_bias = paddle.to_tensor(self.norm1.bias, stop_gradient=False)
    ln2_scale = paddle.to_tensor(self.norm2.weight, stop_gradient=False)
    ln2_bias = paddle.to_tensor(self.norm2.bias, stop_gradient=False)

    q_proj_weight = q_proj_weight.numpy().transpose((1, 0))
    k_proj_weight = k_proj_weight.numpy().transpose((1, 0))
    v_proj_weight = v_proj_weight.numpy().transpose((1, 0))
    qkv_weight = np.concatenate(
        (q_proj_weight, k_proj_weight, v_proj_weight))
    qkv_weight = qkv_weight.reshape(
        (3, self.num_heads, self.head_dim, self.embed_dim))

    x = paddle.to_tensor(self.query, stop_gradient=False)
    if self.has_attn_mask:
        attn_mask = paddle.to_tensor(self.attn_mask, stop_gradient=False)
    else:
        attn_mask = None
    qkv_weight_tensor = paddle.to_tensor(qkv_weight, stop_gradient=False)
    epsilon = 1e-05
    ln2_epsilon = 1e-05

    if attn_mask is not None:
        attn_mask = _convert_attention_mask(attn_mask, x.dtype)

    final_out = incubate_f.fused_multi_head_attention(
        x, qkv_weight_tensor, out_linear_weight, self.pre_layer_norm,
        ln1_scale, ln1_bias, ln2_scale, ln2_bias, epsilon, qkv_bias_tensor,
        out_linear_bias, attn_mask, self.dropout_prob,
        self.attn_dropout_prob, ln2_epsilon)

    paddle.autograd.backward([final_out], [paddle.to_tensor(self.dout)],
                             retain_graph=True)
    return final_out, x.grad
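# Sketch of how the baseline and fused paths above would typically be compared
# in the surrounding unittest (the method names come from this file; the test
# name and tolerances are assumptions, and numpy is assumed imported as np).
#
# def test_fused_attention_op(self):
#     final_out_ref, x_grad_ref = self.GetBaselineOut()
#     final_out, x_grad = self.GetFusedAttentionOut()
#     np.testing.assert_allclose(final_out_ref.numpy(), final_out.numpy(),
#                                rtol=1e-5, atol=1e-4)
#     np.testing.assert_allclose(x_grad_ref.numpy(), x_grad.numpy(),
#                                rtol=1e-5, atol=1e-4)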