def _forward_unpadded(self, x):
    """Faster encoding that ignores any padding."""
    # Transpose batch and sequence dims
    x = x.transpose(perm=[1, 0, 2])

    # Encode all layers
    outputs = [x]
    for i in range(self.num_layers):
        rnn_input = outputs[-1]

        # Apply dropout to hidden input
        if self.dropout_rate > 0:
            rnn_input = F.dropout(
                rnn_input, p=self.dropout_rate, training=self.training)

        # Forward
        rnn_output = self.rnns[i](rnn_input)[0]
        outputs.append(rnn_output)

    # Concat hidden layers
    if self.concat_layers:
        output = paddle.concat(outputs[1:], axis=2)
    else:
        output = outputs[-1]

    # Transpose back
    output = output.transpose(perm=[1, 0, 2])

    # Dropout on output layer
    if self.dropout_output and self.dropout_rate > 0:
        output = F.dropout(
            output, p=self.dropout_rate, training=self.training)
    return output
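# --- Standalone sketch (not part of the module above): every call in this
# section passes training=self.training because paddle.nn.functional.dropout
# is stateless. With its default "upscale_in_train" mode it zeroes elements
# and rescales the survivors by 1/(1-p) when training=True, and is the
# identity when training=False.
import paddle
import paddle.nn.functional as F

x = paddle.ones([2, 4])
y_train = F.dropout(x, p=0.5, training=True)   # some entries 0, the rest 2.0
y_eval = F.dropout(x, p=0.5, training=False)   # identical to x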
def forward(self, x):
    x0 = self.linear0(x[0])
    x1 = self.linear1(x[1])
    bs = x1.shape[0]
    if self.dropout_input > 0:
        x0 = F.dropout(x0, p=self.dropout_input, training=self.training)
        x1 = F.dropout(x1, p=self.dropout_input, training=self.training)
    x0_chunks = paddle.split(x0, self.chunks, -1)
    x1_chunks = paddle.split(x1, self.chunks, -1)
    zs = []
    for x0_c, x1_c, m0, m1 in zip(x0_chunks, x1_chunks, self.merge_linears0,
                                  self.merge_linears1):
        m = m0(x0_c) * m1(x1_c)  # bs x split_size*rank
        m = m.reshape([bs, self.rank, -1])
        z = paddle.sum(m, 1)
        if self.pos_norm == 'before_cat':
            z = paddle.sqrt(F.relu(z)) - paddle.sqrt(F.relu(-z))
            z = F.normalize(z)
        zs.append(z)
    z = paddle.concat(zs, 1)
    if self.pos_norm == 'after_cat':
        z = paddle.sqrt(F.relu(z)) - paddle.sqrt(F.relu(-z))
        z = F.normalize(z)

    if self.dropout_pre_lin > 0:
        z = F.dropout(z, p=self.dropout_pre_lin, training=self.training)
    z = self.linear_out(z)
    if self.dropout_output > 0:
        z = F.dropout(z, p=self.dropout_output, training=self.training)
    return z
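# --- Standalone sketch (assumed values): the "signed square root" used
# before/after the concat above, paddle.sqrt(F.relu(z)) - paddle.sqrt(F.relu(-z)),
# is equivalent to sign(z) * sqrt(|z|), and is followed by L2 normalization.
import paddle
import paddle.nn.functional as F

z = paddle.to_tensor([-4.0, 0.0, 9.0])
signed_sqrt = paddle.sqrt(F.relu(z)) - paddle.sqrt(F.relu(-z))   # [-2., 0., 3.]
same_thing = paddle.sign(z) * paddle.sqrt(paddle.abs(z))         # [-2., 0., 3.]
normalized = F.normalize(signed_sqrt, axis=0)                    # unit L2 norm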
def forward(self, x, dropout):
    l1 = F.dropout(
        F.relu(self.lin1(x)), self.dropout, training=self.training)
    l2 = F.dropout(
        F.relu(self.lin2(l1)), self.dropout, training=self.training)
    l3 = self.lin3(l2)
    return l3
def forward(self, x, mask):
    """Forward pass of TransformerEncoderLayer.

    Parameters
    ----------
    x : Tensor [shape=(batch_size, time_steps, d_model)]
        The input.
    mask : Tensor
        The padding mask. The shape is (batch_size, time_steps, time_steps)
        or broadcastable shape.

    Returns
    -------
    x : Tensor [shape=(batch_size, time_steps, d_model)]
        The encoded output.
    attn_weights : Tensor [shape=(batch_size, n_heads, time_steps, time_steps)]
        The attention weights of the self attention.
    """
    context_vector, attn_weights = self.self_mha(x, x, x, mask)
    x = self.layer_norm1(
        F.dropout(x + context_vector, self.dropout, training=self.training))

    x = self.layer_norm2(
        F.dropout(x + self.ffn(x), self.dropout, training=self.training))
    return x, attn_weights
def forward(self, g):
    x = g.node_feat["feat"]
    edge_feat = g.edge_feat["feat"]
    h_list = [self.atom_encoder(x)]

    ### virtual node embeddings for graphs
    virtualnode_embedding = self.virtualnode_embedding.expand(
        [g.num_graph, self.virtualnode_embedding.shape[-1]])

    for layer in range(self.num_layers):
        ### add message from virtual nodes to graph nodes
        h_list[layer] = h_list[layer] + paddle.gather(
            virtualnode_embedding, g.graph_node_id)

        ### Message passing among graph nodes
        h = self.convs[layer](g, h_list[layer], edge_feat)
        h = self.batch_norms[layer](h)
        if layer == self.num_layers - 1:
            # remove relu for the last layer
            h = F.dropout(h, self.drop_ratio, training=self.training)
        else:
            h = F.dropout(F.relu(h), self.drop_ratio, training=self.training)

        if self.residual:
            h = h + h_list[layer]
        h_list.append(h)

        ### update the virtual nodes
        if layer < self.num_layers - 1:
            ### add message from graph nodes to virtual nodes
            virtualnode_embedding_temp = self.pool(
                g, h_list[layer]) + virtualnode_embedding
            ### transform virtual nodes using MLP
            if self.residual:
                virtualnode_embedding = virtualnode_embedding + F.dropout(
                    self.mlp_virtualnode_list[layer](virtualnode_embedding_temp),
                    self.drop_ratio,
                    training=self.training)
            else:
                virtualnode_embedding = F.dropout(
                    self.mlp_virtualnode_list[layer](virtualnode_embedding_temp),
                    self.drop_ratio,
                    training=self.training)

    ### Different implementations of Jk-concat
    if self.JK == "last":
        node_representation = h_list[-1]
    elif self.JK == "sum":
        node_representation = 0
        for layer in range(self.num_layers):
            node_representation += h_list[layer]

    return node_representation
def forward(self, g):
    x = g.node_feat["feat"]
    edge_feat = g.edge_feat["feat"]

    ### computing input node embedding
    h_list = [self.atom_encoder(x)]
    for layer in range(self.num_layers):
        h = self.convs[layer](g, h_list[layer], edge_feat)
        h = self.batch_norms[layer](h)
        if layer == self.num_layers - 1:
            # remove relu for the last layer
            h = F.dropout(h, self.drop_ratio, training=self.training)
        else:
            h = F.dropout(F.relu(h), self.drop_ratio, training=self.training)

        if self.residual:
            h += h_list[layer]
        h_list.append(h)

    ### Different implementations of Jk-concat
    if self.JK == "last":
        node_representation = h_list[-1]
    elif self.JK == "sum":
        node_representation = 0
        for layer in range(self.num_layers):
            node_representation += h_list[layer]

    return node_representation
def forward(self, src_word, trg_word):
    src_max_len = paddle.shape(src_word)[-1]
    trg_max_len = paddle.shape(trg_word)[-1]
    base_attn_bias = paddle.cast(
        src_word == self.bos_id,
        dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9
    src_slf_attn_bias = base_attn_bias
    src_slf_attn_bias.stop_gradient = True
    trg_slf_attn_bias = paddle.tensor.triu(
        (paddle.ones(
            (trg_max_len, trg_max_len),
            dtype=paddle.get_default_dtype()) * -np.inf), 1)
    trg_slf_attn_bias.stop_gradient = True
    trg_src_attn_bias = paddle.tile(base_attn_bias, [1, 1, trg_max_len, 1])
    src_pos = paddle.cast(
        src_word != self.bos_id, dtype="int64") * paddle.arange(
            start=0, end=src_max_len)
    trg_pos = paddle.cast(
        trg_word != self.bos_id, dtype="int64") * paddle.arange(
            start=0, end=trg_max_len)

    src_emb = self.src_word_embedding(src_word)
    src_pos_emb = self.src_pos_embedding(src_pos)
    src_emb = src_emb + src_pos_emb
    enc_input = F.dropout(
        src_emb, p=self.dropout,
        training=self.training) if self.dropout else src_emb

    with paddle.static.amp.fp16_guard():
        if self.waitk >= src_max_len or self.waitk == -1:
            # Full sentence
            enc_outputs = [
                self.encoder(
                    enc_input, src_mask=src_slf_attn_bias)
            ]
        else:
            # Wait-k policy
            enc_outputs = []
            for i in range(self.waitk, src_max_len + 1):
                enc_output = self.encoder(
                    enc_input[:, :i, :],
                    src_mask=src_slf_attn_bias[:, :, :, :i])
                enc_outputs.append(enc_output)

        trg_emb = self.trg_word_embedding(trg_word)
        trg_pos_emb = self.trg_pos_embedding(trg_pos)
        trg_emb = trg_emb + trg_pos_emb
        dec_input = F.dropout(
            trg_emb, p=self.dropout,
            training=self.training) if self.dropout else trg_emb
        dec_output = self.decoder(
            dec_input,
            enc_outputs,
            tgt_mask=trg_slf_attn_bias,
            memory_mask=trg_src_attn_bias)

        predict = self.linear(dec_output)

    return predict
def forward(self, feed_dict):
    g = feed_dict["graph"]
    x = g.node_feat["feat"]
    edge_feat = g.edge_feat["feat"]

    h = self.atom_encoder(x)
    if self.config.exfeat:
        h += self.atom_encoder_float(g.node_feat["feat_float"])
    # print("atom_encoder: ", np.sum(h.numpy()))

    if self.virtual_node:
        virtualnode_embedding = self.virtualnode_embedding.expand(
            [g.num_graph, self.virtualnode_embedding.shape[-1]])
        h = h + paddle.gather(virtualnode_embedding, g.graph_node_id)
        # print("virt0: ", np.sum(h.numpy()))

    if self.with_efeat:
        edge_emb = self.bond_encoder(edge_feat)
    else:
        edge_emb = edge_feat

    h = self.gnns[0](g, h, edge_emb)
    if self.config.graphnorm:
        h = self.gn(g, h)
    # print("h0: ", np.sum(h.numpy()))

    for layer in range(1, self.num_layers):
        h1 = self.norms[layer - 1](h)
        h2 = F.swish(h1)
        h2 = F.dropout(h2, p=self.drop_ratio, training=self.training)

        if self.virtual_node:
            virtualnode_embedding_temp = self.pool(g, h2) + virtualnode_embedding
            virtualnode_embedding = self.mlp_virtualnode_list[layer - 1](
                virtualnode_embedding_temp)
            virtualnode_embedding = F.dropout(
                virtualnode_embedding,
                self.drop_ratio,
                training=self.training)
            h2 = h2 + paddle.gather(virtualnode_embedding, g.graph_node_id)
            # print("virt_h%s: " % (layer), np.sum(h2.numpy()))

        h = self.gnns[layer](g, h2, edge_emb) + h
        if self.config.graphnorm:
            h = self.gn(g, h)
        # print("h%s: " % (layer), np.sum(h.numpy()))

    h = self.norms[self.num_layers - 1](h)
    h = F.dropout(h, p=self.drop_ratio, training=self.training)
    if self.config.appnp_k is not None:
        h = self.appnp(g, h)
    # print("node_repr: ", np.sum(h.numpy()))

    node_representation = h
    return node_representation
def forward(self, x):
    x.stop_gradient = False
    x = x.transpose([0, 3, 2, 1])
    x = self.bn0(x)
    x = x.transpose([0, 3, 2, 1])

    x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
    x = F.dropout(x, p=0.2, training=self.training)
    x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
    x = F.dropout(x, p=0.2, training=self.training)
    x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
    x = F.dropout(x, p=0.2, training=self.training)
    x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
    x = F.dropout(x, p=0.2, training=self.training)
    x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg')
    x = F.dropout(x, p=0.2, training=self.training)
    x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
    x = F.dropout(x, p=0.2, training=self.training)

    x = x.mean(axis=3)
    x = x.max(axis=2) + x.mean(axis=2)
    x = F.dropout(x, p=0.5, training=self.training)
    x = F.relu(self.fc1(x))
    if self.extract_embedding:
        output = F.dropout(x, p=0.5, training=self.training)
    else:
        output = F.sigmoid(self.fc_audioset(x))
    return output
def forward(self, g):
    """Forward pass returning graph, node and edge representations."""
    h = self.atom_embedding(g.node_feat)
    h += self.atom_float_embedding(g.node_feat)

    if self.virtual_node:
        virtualnode_embedding = self.virtualnode_embedding.expand(
            [g.num_graph, self.virtualnode_embedding.shape[-1]])
        h = h + paddle.gather(virtualnode_embedding, g.graph_node_id)
        # print("virt0: ", np.sum(h.numpy()))

    if self.with_efeat:
        edge_emb = self.init_bond_embedding(g.edge_feat)
    else:
        edge_emb = g.edge_feat

    h = self.gnns[0](g, h, edge_emb)
    if self.config["graphnorm"]:
        h = self.gn(g, h)
    # print("h0: ", np.sum(h.numpy()))

    for layer in range(1, self.num_layers):
        h1 = self.norms[layer - 1](h)
        h2 = F.swish(h1)
        h2 = F.dropout(h2, p=self.drop_ratio, training=self.training)

        if self.virtual_node:
            virtualnode_embedding_temp = self.pool(g, h2) + virtualnode_embedding
            virtualnode_embedding = self.mlp_virtualnode_list[layer - 1](
                virtualnode_embedding_temp)
            virtualnode_embedding = F.dropout(
                virtualnode_embedding,
                self.drop_ratio,
                training=self.training)
            h2 = h2 + paddle.gather(virtualnode_embedding, g.graph_node_id)
            # print("virt_h%s: " % (layer), np.sum(h2.numpy()))

        h = self.gnns[layer](g, h2, edge_emb) + h
        if self.config["graphnorm"]:
            h = self.gn(g, h)
        # print("h%s: " % (layer), np.sum(h.numpy()))

    h = self.norms[self.num_layers - 1](h)
    h = F.dropout(h, p=self.drop_ratio, training=self.training)

    h_graph = self.pool(g, h)
    # return graph, node, edge representation
    return h_graph, h, edge_emb
def forward(self, input, label=None):
    _, feat_list = self.backbone(input)
    x = feat_list[self.backbone_indices[1]]
    x = self.psp_module(x)
    x = F.dropout(x, dropout_prob=0.1)
    logit = self.conv(x)
    logit = fluid.layers.resize_bilinear(logit, input.shape[2:])

    if self.enable_auxiliary_loss:
        auxiliary_feat = feat_list[self.backbone_indices[0]]
        auxiliary_logit = self.fcn_head(auxiliary_feat)
        auxiliary_logit = fluid.layers.resize_bilinear(
            auxiliary_logit, input.shape[2:])

    if self.training:
        loss = model_utils.get_loss(logit, label)
        if self.enable_auxiliary_loss:
            auxiliary_loss = model_utils.get_loss(auxiliary_logit, label)
            loss += (0.4 * auxiliary_loss)
        return loss
    else:
        pred, score_map = model_utils.get_pred_score_map(logit)
        return pred, score_map
def decode(self, encoder_output, input, encoder_padding_mask):
    batch_size, T_dec, mel_dim = input.shape

    x = self.decoder_prenet(input, self.decoder_prenet_dropout)
    # extend the positional encoding table (to at least twice its length) if needed
    if x.shape[1] * self.r > self.decoder_pe.shape[0]:
        new_T = max(x.shape[1] * self.r, self.decoder_pe.shape[0] * 2)
        self.decoder_pe = pe.positional_encoding(0, new_T, self.d_decoder)
    pos_enc = self.decoder_pe[:T_dec * self.r:self.r, :]
    x = x.scale(math.sqrt(
        self.d_decoder)) + pos_enc * self.decoder_pe_scalar
    x = F.dropout(x, self.dropout, training=self.training)

    no_future_mask = masking.future_mask(T_dec, dtype=input.dtype)
    decoder_padding_mask = masking.feature_mask(
        input, axis=-1, dtype=input.dtype)
    decoder_mask = masking.combine_mask(
        decoder_padding_mask.unsqueeze(-1), no_future_mask)
    decoder_output, _, cross_attention_weights = self.decoder(
        x, encoder_output, encoder_output, encoder_padding_mask,
        decoder_mask, self.drop_n_heads)

    # use only parts of it
    output_proj = self.final_proj(decoder_output)[:, :, :self.r * mel_dim]
    mel_intermediate = paddle.reshape(output_proj, [batch_size, -1, mel_dim])
    stop_logits = self.stop_conditioner(mel_intermediate)

    # cnn postnet
    mel_channel_first = paddle.transpose(mel_intermediate, [0, 2, 1])
    mel_output = self.decoder_postnet(mel_channel_first)
    mel_output = paddle.transpose(mel_output, [0, 2, 1])

    return mel_output, mel_intermediate, cross_attention_weights, stop_logits
def _forward_ffn(self, x):
    # PreLN scheme: Norm -> SubLayer -> Dropout -> Residual
    x_in = x
    x = self.layer_norm3(x)
    x = self.ffn(x)
    out = x_in + F.dropout(x, self.dropout, training=self.training)
    return out
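# --- Standalone sketch (assumed shapes, a Linear as a stand-in for MHA/FFN)
# contrasting the two residual schemes used in this section: the PostLN
# encoder/decoder layers apply dropout to (x + sublayer(x)) and then normalize,
# while the PreLN helpers normalize first and add the dropped-out sublayer
# output back to the raw input.
import paddle
import paddle.nn.functional as F

d_model, p = 8, 0.1
norm = paddle.nn.LayerNorm(d_model)
sublayer = paddle.nn.Linear(d_model, d_model)  # stand-in for MHA / FFN
x = paddle.randn([2, 5, d_model])

post_ln = norm(F.dropout(x + sublayer(x), p, training=True))  # PostLN
pre_ln = x + F.dropout(sublayer(norm(x)), p, training=True)   # PreLN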
def forward_one_multilayer(rnns, lstm_input, layer_states, dropout_amount=0.):
    """Goes forward for one multilayer RNN cell step.

    Args:
        rnns (`list`): The stacked RNN layers.
        lstm_input (`Tensor`): Some input to the step.
        layer_states (`list`): The states of each layer in the cell.
        dropout_amount (`float`, optional): The amount of dropout to apply,
            in between the layers.

    Returns:
        (`list`, `list`), `Tensor`, (`list`): Representing (each layer's cell
        memory, each layer's cell hidden state), the final hidden state, and
        (each layer's updated RNNState).
    """
    num_layers = len(layer_states)
    new_states = []
    cell_states = []
    hidden_states = []
    state = lstm_input
    for i in range(num_layers):
        layer_h, new_state = rnns[i](paddle.unsqueeze(state, 0), layer_states[i])
        new_states.append(new_state)

        layer_h = layer_h.squeeze()
        layer_c = new_state[1].squeeze()

        state = layer_h
        if i < num_layers - 1:
            # p is the probability of an element being zeroed (p=1 drops all
            # activations). F.dropout defaults to training mode, so callers
            # should pass dropout_amount=0. at inference time.
            state = F.dropout(state, p=dropout_amount)

        cell_states.append(layer_c)
        hidden_states.append(layer_h)

    return (cell_states, hidden_states), state, new_states
def forward(self,
            query_matrix,
            key_matrix,
            value_matrix,
            d_head,
            attn_mask=None,
            rand_mask_idx=None,
            query_mask=None,
            key_mask=None,
            dropout=None):
    # scaled dot-product attention
    product = paddle.matmul(x=query_matrix, y=key_matrix, transpose_y=True)
    product = product * (d_head**-0.5)
    product += (1 - paddle.matmul(query_mask, key_mask)) * -1e6
    if attn_mask is not None:
        product = product + attn_mask

    weights = F.softmax(product)
    if dropout:
        weights = F.dropout(
            weights, dropout, training=self.training, mode="upscale_in_train")

    out = paddle.matmul(weights, value_matrix)
    return out
def forward(self, d, t, d_masking, t_masking):
    """MolTrans pipeline."""
    tempd_masking = d_masking.unsqueeze(1).unsqueeze(2)
    tempt_masking = t_masking.unsqueeze(1).unsqueeze(2)
    tempd_masking = (1.0 - tempd_masking) * -10000.0
    tempt_masking = (1.0 - tempt_masking) * -10000.0

    d_embedding = self.drug_emb(d)
    t_embedding = self.target_emb(t)
    d_encoder = self.encoder(d_embedding.astype('float32'),
                             tempd_masking.astype('float32'))
    t_encoder = self.encoder(t_embedding.astype('float32'),
                             tempt_masking.astype('float32'))

    drug_res = paddle.tile(
        paddle.unsqueeze(d_encoder, 2), [1, 1, self.target_max_seq, 1])
    target_res = paddle.tile(
        paddle.unsqueeze(t_encoder, 1), [1, self.drug_max_seq, 1, 1])
    i_score = drug_res * target_res
    i_scoreT = paddle.reshape(
        i_score,
        [int(i_score.shape[0] / self.gpus), -1,
         self.drug_max_seq, self.target_max_seq])
    i_scoreT = paddle.sum(i_scoreT, axis=1)
    i_scoreT = paddle.unsqueeze(i_scoreT, 1)
    i_scoreT = F.dropout(i_scoreT, p=self.dropout_ratio)
    i_scoreT = self.interaction_cnn(i_scoreT)

    i_res = paddle.reshape(i_scoreT, [int(i_scoreT.shape[0] / self.gpus), -1])
    res = self.decoder(i_res)
    return res
def forward(self, hidden_states, attention_mask=None):
    x = self.i_dense(hidden_states)
    u, v, qk = paddle.split(
        self.activation(x),
        [
            self.intermediate_size, self.intermediate_size,
            self.attention_key_size
        ],
        axis=-1,
    )
    q, k = self.q_scaleoffset(qk), self.k_scaleoffset(qk)

    # apply rotary position embeddings
    q, k = self.rotary(q), self.rotary(k)

    # attention
    a = paddle.matmul(q, k, transpose_y=True)
    if self.attention_scale:
        a = a / self.attention_key_size**0.5
    if attention_mask is not None:
        a = a * attention_mask + (attention_mask - 1) * INF
    A = attention_normalize(a, attention_mask, axis=-1, method=self.normalization)
    A = F.dropout(A, p=self.attention_dropout, training=self.training)

    o = self.o_dense(u * paddle.matmul(A, v))
    return o
def forward(self, src_word):
    src_max_len = paddle.shape(src_word)[-1]
    src_slf_attn_bias = paddle.cast(
        src_word == self.bos_id,
        dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9
    src_pos = paddle.cast(
        src_word != self.bos_id, dtype="int64") * paddle.arange(
            start=0, end=src_max_len)

    # Run encoder
    src_emb = self.src_word_embedding(src_word)
    src_pos_emb = self.src_pos_embedding(src_pos)
    src_emb = src_emb + src_pos_emb
    enc_input = F.dropout(
        src_emb, p=self.dropout, training=False) if self.dropout else src_emb
    enc_output = self.transformer.encoder(enc_input, src_slf_attn_bias)

    if self.use_fp16_decoding:
        enc_output = paddle.cast(enc_output, dtype="float16")

    mem_seq_lens = paddle.sum(
        paddle.cast(src_word != self.bos_id, dtype="int32"), axis=1)
    ids = self.decoding(enc_output, mem_seq_lens)

    return ids
def forward(self, queries, keys, values, attn_bias, cache=None):
    # compute q, k, v
    keys = queries if keys is None else keys
    values = keys if values is None else values
    q, k, v = self._prepare_qkv(queries, keys, values, cache)

    # scaled dot-product attention
    product = paddle.matmul(x=q, y=k, transpose_y=True)
    product = product * self.d_model**-0.5
    if attn_bias is not None:
        product += attn_bias
    weights = F.softmax(product)
    if self.dropout_rate:
        weights = F.dropout(
            weights, p=self.dropout_rate, mode="downscale_in_infer")
    out = paddle.matmul(weights, v)

    # combine heads
    out = paddle.transpose(out, perm=[0, 2, 1, 3])
    out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])

    # project to output
    out = self.proj_fc(out)
    return out
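# --- Standalone sketch (assumed values) of the two dropout modes that appear
# in this section. With "upscale_in_train" (the default), kept units are
# rescaled by 1/(1-p) during training and the op is the identity at inference.
# With "downscale_in_infer", units are kept unscaled during training and the
# input is multiplied by (1-p) at inference.
import paddle
import paddle.nn.functional as F

x = paddle.ones([4])
up = F.dropout(x, p=0.5, training=False, mode="upscale_in_train")      # == x
down = F.dropout(x, p=0.5, training=False, mode="downscale_in_infer")  # == 0.5 * x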
def forward(self, src_word):
    src_max_len = paddle.shape(src_word)[-1]
    src_slf_attn_bias = paddle.cast(
        src_word == self.bos_id,
        dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9
    trg_src_attn_bias = src_slf_attn_bias
    src_pos = paddle.cast(
        src_word != self.bos_id, dtype="int64") * paddle.arange(
            start=0, end=src_max_len)

    # Run encoder
    src_emb = self.src_word_embedding(src_word)
    src_pos_emb = self.src_pos_embedding(src_pos)
    src_emb = src_emb + src_pos_emb
    enc_input = F.dropout(
        src_emb, p=self.dropout, training=False) if self.dropout else src_emb
    enc_output = self.transformer.encoder(enc_input, src_slf_attn_bias)

    # Init states (caches) for transformer, need to be updated according to
    # selected beam
    incremental_cache, static_cache = self.transformer.decoder.gen_cache(
        enc_output, do_zip=True)

    static_cache, enc_output, trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
        (static_cache, enc_output, trg_src_attn_bias), self.beam_size)

    rs, _ = nn.decode.dynamic_decode(
        decoder=self.decode,
        inits=incremental_cache,
        max_step_num=self.max_out_len,
        memory=enc_output,
        trg_src_attn_bias=trg_src_attn_bias,
        static_cache=static_cache,
        is_test=True)

    return rs
def forward(self, inputs, states, static_cache, trg_src_attn_bias, memory):
    if states and static_cache:
        states = list(zip(states, static_cache))

    if self.word_embedding:
        if not isinstance(inputs, (list, tuple)):
            inputs = (inputs, )

        word_emb = self.word_embedding(inputs[0])
        pos_emb = self.pos_embedding(inputs[1])
        word_emb = word_emb + pos_emb
        inputs = F.dropout(
            word_emb, p=self.dropout,
            training=False) if self.dropout else word_emb

        cell_outputs, new_states = self.decoder(inputs, memory, None,
                                                trg_src_attn_bias, states)
    else:
        cell_outputs, new_states = self.decoder(inputs, memory, None,
                                                trg_src_attn_bias, states)

    if self.linear:
        cell_outputs = self.linear(cell_outputs)

    new_states = [cache[0] for cache in new_states]
    return cell_outputs, new_states
def forward(self, x):
    hidden = self.fc1(x)
    hidden = F.relu(hidden)
    if self.dropout_rate:
        hidden = F.dropout(
            hidden, p=self.dropout_rate, mode="downscale_in_infer")
    out = self.fc2(hidden)
    return out
def forward(self, inputs):
    out_res = F.pad2d(inputs, [1, 1, 1, 1], mode="reflect")
    out_res = self.conv0(out_res)
    if self.dropout:
        out_res = F.dropout(out_res, p=0.5, mode='downscale_in_infer')
    out_res = F.pad2d(out_res, [1, 1, 1, 1], mode="reflect")
    out_res = self.conv1(out_res)
    return out_res + inputs
def forward(self, feats):
    for i in range(self.n_layers):
        feats = self.mlp[i](feats)
        feats = F.dropout(feats, p=self.drop, training=self.training)
        feats = F.relu(feats)
    out = self.out_layer(feats)
    return out
def forward(self, q, k, v, encoder_mask, decoder_mask):
    """Forward pass of TransformerDecoderLayer.

    Parameters
    ----------
    q : Tensor [shape=(batch_size, time_steps_q, d_model)]
        The decoder input.
    k : Tensor [shape=(batch_size, time_steps_k, d_model)]
        The keys.
    v : Tensor [shape=(batch_size, time_steps_k, d_model)]
        The values.
    encoder_mask : Tensor
        Encoder padding mask, shape is ``(batch_size, time_steps_k,
        time_steps_k)`` or broadcastable shape.
    decoder_mask : Tensor
        Decoder mask, shape is ``(batch_size, time_steps_q, time_steps_k)``
        or broadcastable shape.

    Returns
    -------
    q : Tensor [shape=(batch_size, time_steps_q, d_model)]
        The decoder output.
    self_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_q)]
        Decoder self attention.
    cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)]
        Decoder-encoder cross attention.
    """
    context_vector, self_attn_weights = self.self_mha(q, q, q, decoder_mask)
    q = self.layer_norm1(
        F.dropout(q + context_vector, self.dropout, training=self.training))

    context_vector, cross_attn_weights = self.cross_mha(q, k, v, encoder_mask)
    q = self.layer_norm2(
        F.dropout(q + context_vector, self.dropout, training=self.training))

    q = self.layer_norm3(
        F.dropout(q + self.ffn(q), self.dropout, training=self.training))
    return q, self_attn_weights, cross_attn_weights
def forward(self,
            query,
            key,
            value,
            attn_mask=None,
            use_cache=False,
            cache=None):
    r"""
    Applies multi-head attention to map queries and a set of key-value pairs
    to outputs.
    """
    key = query if key is None else key
    value = query if value is None else value
    # compute q, k, v
    if use_cache is False:
        if self.fuse:
            q, k, v = self._fuse_prepare_qkv(query)
        else:
            q, k, v = self._prepare_qkv(query, key, value, use_cache, cache)
    else:
        q, k, v, cache = self._prepare_qkv(query, key, value, use_cache, cache)

    # scaled dot-product attention
    product = layers.matmul(
        x=q, y=k, transpose_y=True, alpha=self.head_dim**-0.5)

    # The fused kernel applies the causal (upper-triangle) mask and softmax in
    # one op; the explicit mask-add + softmax is kept only as a reference:
    # if attn_mask is not None:
    #     product = product + attn_mask
    # weights = F.softmax(product)
    weights = incubate.softmax_mask_fuse_upper_triangle(product)

    if self.dropout:
        with get_rng_state_tracker().rng_state('local_seed'):
            weights = F.dropout(
                weights,
                self.dropout,
                training=self.training,
                mode="upscale_in_train")

    out = tensor.matmul(weights, v)

    # combine heads
    out = tensor.transpose(out, perm=[0, 2, 1, 3])
    out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])

    # project to output
    out = self.out_proj(out)

    outs = [out]
    if self.need_weights:
        outs.append(weights)
    if use_cache:
        outs.append(cache)
    return out if len(outs) == 1 else tuple(outs)
def _forward_self_mha(self, x, mask, drop_n_heads):
    # PreLN scheme: Norm -> SubLayer -> Dropout -> Residual
    x_in = x
    x = self.layer_norm1(x)
    context_vector, attn_weights = self.self_mha(x, x, x, mask, drop_n_heads)
    context_vector = x_in + F.dropout(
        context_vector, self.dropout, training=self.training)
    return context_vector, attn_weights
def _forward_cross_mha(self, q, k, v, mask, drop_n_heads):
    # PreLN scheme: Norm -> SubLayer -> Dropout -> Residual
    q_in = q
    q = self.layer_norm2(q)
    context_vector, attn_weights = self.cross_mha(q, k, v, mask, drop_n_heads)
    context_vector = q_in + F.dropout(
        context_vector, self.dropout, training=self.training)
    return context_vector, attn_weights
def GetBaselineOut(self):
    paddle.disable_static(place=paddle.CUDAPlace(0))
    tensor_query = paddle.to_tensor(self.query, stop_gradient=False)

    if self.has_attn_mask:
        attn_mask = paddle.to_tensor(self.attn_mask, stop_gradient=False)
    else:
        attn_mask = None
    residual = tensor_query

    ln1_out = tensor_query
    if self.pre_layer_norm:
        ln1_out = self.norm1(tensor_query)

    q = self.q_proj(ln1_out)
    q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
    q_out = tensor.transpose(x=q, perm=[0, 2, 1, 3])
    k = self.k_proj(ln1_out)
    v = self.v_proj(ln1_out)
    k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
    k_out = tensor.transpose(x=k, perm=[0, 2, 1, 3])
    v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
    v_out = tensor.transpose(x=v, perm=[0, 2, 1, 3])

    qk_out = layers.matmul(
        x=q_out, y=k_out, transpose_y=True, alpha=self.head_dim**-0.5)

    if attn_mask is not None:
        attn_mask = _convert_attention_mask(attn_mask, qk_out.dtype)
        attn_mask_out = qk_out + attn_mask
        softmax_out = F.softmax(attn_mask_out)
    else:
        softmax_out = F.softmax(qk_out)

    if self.dropout_prob:
        dropout_out = F.dropout(
            softmax_out,
            self.dropout_prob,
            training=self.training,
            mode="upscale_in_train")
        qktv_out = tensor.matmul(dropout_out, v_out)
    else:
        qktv_out = tensor.matmul(softmax_out, v_out)

    fmha_out = tensor.transpose(qktv_out, perm=[0, 2, 1, 3])
    out_linear_in = tensor.reshape(
        x=fmha_out, shape=[0, 0, fmha_out.shape[2] * fmha_out.shape[3]])
    out = self.out_proj(out_linear_in)

    residual_out = residual + self.dropout(out)
    if not self.pre_layer_norm:
        final_out = self.norm1(residual_out)
    else:
        final_out = residual_out

    paddle.autograd.backward(
        [final_out], [paddle.to_tensor(self.dout)], retain_graph=True)
    return final_out, tensor_query.grad
def network():
    img = static.data(name='image', shape=[None, 784])
    hidden = static.nn.fc(input=img, size=200, act='relu')
    hidden = F.dropout(hidden, p=0.5)
    loss = F.cross_entropy(
        input=static.nn.fc(hidden, size=10, act='softmax'),
        label=static.data(name='label', shape=[1], dtype='int64'))
    avg_loss = paddle.mean(loss)
    return avg_loss