Example #1
    def _forward_unpadded(self,x):
        """Faster encoding that ignores any padding."""
        # Transpose batch and sequence dims
        x = x.transpose(perm=[1, 0, 2])

        # Encode all layers
        outputs = [x]
        for i in range(self.num_layers):
            rnn_input = outputs[-1]

            # Apply dropout to hidden input
            if self.dropout_rate > 0:
                rnn_input = F.dropout(rnn_input,
                                      p=self.dropout_rate,
                                      training=self.training)
            # Forward
            rnn_output = self.rnns[i](rnn_input)[0]
            outputs.append(rnn_output)

        # Concat hidden layers
        if self.concat_layers:
            output = paddle.concat(outputs[1:], axis=2)
        else:
            output = outputs[-1]

        # Transpose back
        output = output.transpose(perm=[1,0,2])

        # Dropout on output layer
        if self.dropout_output and self.dropout_rate > 0:
            output = F.dropout(output,
                               p=self.dropout_rate,
                               training=self.training)
        return output
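Most snippets in this listing gate dropout on self.training, so the op is active during training and becomes an identity at inference. A minimal self-contained sketch of that behaviour (the tensor shape is arbitrary, chosen only for illustration):

import paddle
import paddle.nn.functional as F

x = paddle.ones([2, 4])

# Training: roughly a fraction p of the elements are zeroed and the survivors
# are rescaled by 1 / (1 - p) (the default "upscale_in_train" mode).
y_train = F.dropout(x, p=0.5, training=True)

# Inference: with training=False the call returns the input unchanged.
y_eval = F.dropout(x, p=0.5, training=False)

print(y_train)  # zeros mixed with values of 2.0
print(y_eval)   # identical to x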
Example #2
    def forward(self, x):
        x0 = self.linear0(x[0])
        x1 = self.linear1(x[1])
        bs = x1.shape[0]
        if self.dropout_input > 0:
            x0 = F.dropout(x0, p=self.dropout_input, training=self.training)
            x1 = F.dropout(x1, p=self.dropout_input, training=self.training)
        x0_chunks = paddle.split(x0, self.chunks, -1)
        x1_chunks = paddle.split(x1, self.chunks, -1)
        zs = []
        for x0_c, x1_c, m0, m1 in zip(x0_chunks, x1_chunks, self.merge_linears0,
                                      self.merge_linears1):
            m = m0(x0_c) * m1(x1_c)  # bs x split_size*rank
            m = m.reshape([bs, self.rank, -1])
            z = paddle.sum(m, 1)
            if self.pos_norm == 'before_cat':
                z = paddle.sqrt(F.relu(z)) - paddle.sqrt(F.relu(-z))
                z = F.normalize(z)
            zs.append(z)
        z = paddle.concat(zs, 1)
        if self.pos_norm == 'after_cat':
            z = paddle.sqrt(F.relu(z)) - paddle.sqrt(F.relu(-z))
            z = F.normalize(z)

        if self.dropout_pre_lin > 0:
            z = F.dropout(z, p=self.dropout_pre_lin, training=self.training)
        z = self.linear_out(z)
        if self.dropout_output > 0:
            z = F.dropout(z, p=self.dropout_output, training=self.training)
        return z
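The pos_norm branches above implement power normalization: a signed square root followed by L2 normalization. A small standalone sketch of just that step (the helper name is mine, not from the original module):

import paddle
import paddle.nn.functional as F

def signed_sqrt_l2(z):
    # Signed square root: sqrt(|z|) with the original sign preserved.
    z = paddle.sqrt(F.relu(z)) - paddle.sqrt(F.relu(-z))
    # L2-normalize along axis 1 (F.normalize's default), the feature axis here.
    return F.normalize(z)

z = paddle.to_tensor([[4.0, -9.0, 1.0]])
print(signed_sqrt_l2(z))  # signs kept, magnitudes compressed, unit L2 norm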
Example #3
 def forward(self, x):
     l1 = F.dropout(
         F.relu(self.lin1(x)), self.dropout, training=self.training)
     l2 = F.dropout(
         F.relu(self.lin2(l1)), self.dropout, training=self.training)
     l3 = self.lin3(l2)
     return l3
Example #4
    def forward(self, x, mask):
        """Forward pass of TransformerEncoderLayer.
        
        Parameters
        ----------
        x : Tensor [shape=(batch_size, time_steps, d_model)]
            The input.
            
        mask : Tensor
            The padding mask. The shape is (batch_size, time_steps, 
            time_steps) or broadcastable shape.
        
        Returns
        -------
        x : Tensor [shape=(batch_size, time_steps, d_model)]
            The encoded output.
            
        attn_weights : Tensor [shape=(batch_size, n_heads, time_steps, time_steps)]
            The attention weights of the self attention.
        """
        context_vector, attn_weights = self.self_mha(x, x, x, mask)
        x = self.layer_norm1(
            F.dropout(
                x + context_vector, self.dropout, training=self.training))

        x = self.layer_norm2(
            F.dropout(
                x + self.ffn(x), self.dropout, training=self.training))
        return x, attn_weights
Example #5
    def forward(self, g):
        x = g.node_feat["feat"]
        edge_feat = g.edge_feat["feat"]
        h_list = [self.atom_encoder(x)]

        ### virtual node embeddings for graphs
        virtualnode_embedding = self.virtualnode_embedding.expand(
            [g.num_graph, self.virtualnode_embedding.shape[-1]])

        for layer in range(self.num_layers):
            ### add message from virtual nodes to graph nodes
            h_list[layer] = h_list[layer] + paddle.gather(
                virtualnode_embedding, g.graph_node_id)

            ### Message passing among graph nodes
            h = self.convs[layer](g, h_list[layer], edge_feat)

            h = self.batch_norms[layer](h)
            if layer == self.num_layers - 1:
                #remove relu for the last layer
                h = F.dropout(h, self.drop_ratio, training=self.training)
            else:
                h = F.dropout(F.relu(h),
                              self.drop_ratio,
                              training=self.training)

            if self.residual:
                h = h + h_list[layer]

            h_list.append(h)

            ### update the virtual nodes
            if layer < self.num_layers - 1:
                ### add message from graph nodes to virtual nodes
                virtualnode_embedding_temp = self.pool(
                    g, h_list[layer]) + virtualnode_embedding
                ### transform virtual nodes using MLP

                if self.residual:
                    virtualnode_embedding = virtualnode_embedding + F.dropout(
                        self.mlp_virtualnode_list[layer]
                        (virtualnode_embedding_temp),
                        self.drop_ratio,
                        training=self.training)
                else:
                    virtualnode_embedding = F.dropout(
                        self.mlp_virtualnode_list[layer](
                            virtualnode_embedding_temp),
                        self.drop_ratio,
                        training=self.training)

        ### Different implementations of Jk-concat
        if self.JK == "last":
            node_representation = h_list[-1]
        elif self.JK == "sum":
            node_representation = 0
            for layer in range(self.num_layers):
                node_representation += h_list[layer]

        return node_representation
Example #6
    def forward(self, g):
        x = g.node_feat["feat"]
        edge_feat = g.edge_feat["feat"]

        ### computing input node embedding
        h_list = [self.atom_encoder(x)]
        for layer in range(self.num_layers):
            h = self.convs[layer](g, h_list[layer], edge_feat)
            h = self.batch_norms[layer](h)

            if layer == self.num_layers - 1:
                #remove relu for the last layer
                h = F.dropout(h, self.drop_ratio, training=self.training)
            else:
                h = F.dropout(F.relu(h),
                              self.drop_ratio,
                              training=self.training)

            if self.residual:
                h += h_list[layer]

            h_list.append(h)

        ### Different implementations of Jk-concat
        if self.JK == "last":
            node_representation = h_list[-1]
        elif self.JK == "sum":
            node_representation = 0
            for layer in range(self.num_layers):
                node_representation += h_list[layer]

        return node_representation
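Examples #5 and #6 both end with a jumping-knowledge (JK) switch that either keeps the last layer's node features or sums the features of all layers. A hedged standalone version of that aggregation (the helper name is illustrative, not from the source):

import paddle

def jk_aggregate(h_list, jk="last"):
    # h_list: per-layer node feature tensors, all with the same shape.
    if jk == "last":
        return h_list[-1]
    if jk == "sum":
        return paddle.add_n(h_list)
    raise ValueError("unsupported JK mode: %s" % jk)

h_list = [paddle.ones([5, 8]) for _ in range(3)]
print(jk_aggregate(h_list, jk="sum").shape)  # [5, 8]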
Example #7
    def forward(self, src_word, trg_word):
        src_max_len = paddle.shape(src_word)[-1]
        trg_max_len = paddle.shape(trg_word)[-1]
        base_attn_bias = paddle.cast(
            src_word == self.bos_id,
            dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9
        src_slf_attn_bias = base_attn_bias
        src_slf_attn_bias.stop_gradient = True
        trg_slf_attn_bias = paddle.tensor.triu(
            (paddle.ones(
                (trg_max_len, trg_max_len),
                dtype=paddle.get_default_dtype()) * -np.inf),
            1)
        trg_slf_attn_bias.stop_gradient = True
        trg_src_attn_bias = paddle.tile(base_attn_bias, [1, 1, trg_max_len, 1])
        src_pos = paddle.cast(
            src_word != self.bos_id, dtype="int64") * paddle.arange(
                start=0, end=src_max_len)
        trg_pos = paddle.cast(
            trg_word != self.bos_id, dtype="int64") * paddle.arange(
                start=0, end=trg_max_len)
        src_emb = self.src_word_embedding(src_word)
        src_pos_emb = self.src_pos_embedding(src_pos)
        src_emb = src_emb + src_pos_emb
        enc_input = F.dropout(
            src_emb, p=self.dropout,
            training=self.training) if self.dropout else src_emb
        with paddle.static.amp.fp16_guard():
            if self.waitk >= src_max_len or self.waitk == -1:
                # Full sentence
                enc_outputs = [
                    self.encoder(
                        enc_input, src_mask=src_slf_attn_bias)
                ]
            else:
                # Wait-k policy
                enc_outputs = []
                for i in range(self.waitk, src_max_len + 1):
                    enc_output = self.encoder(
                        enc_input[:, :i, :],
                        src_mask=src_slf_attn_bias[:, :, :, :i])
                    enc_outputs.append(enc_output)

            trg_emb = self.trg_word_embedding(trg_word)
            trg_pos_emb = self.trg_pos_embedding(trg_pos)
            trg_emb = trg_emb + trg_pos_emb
            dec_input = F.dropout(
                trg_emb, p=self.dropout,
                training=self.training) if self.dropout else trg_emb
            dec_output = self.decoder(
                dec_input,
                enc_outputs,
                tgt_mask=trg_slf_attn_bias,
                memory_mask=trg_src_attn_bias)

            predict = self.linear(dec_output)

        return predict
Example #8
    def forward(self, feed_dict):
        g = feed_dict["graph"]
        x = g.node_feat["feat"]
        edge_feat = g.edge_feat["feat"]

        h = self.atom_encoder(x)
        if self.config.exfeat:
            h += self.atom_encoder_float(g.node_feat["feat_float"])
        #  print("atom_encoder: ", np.sum(h.numpy()))

        if self.virtual_node:
            virtualnode_embedding = self.virtualnode_embedding.expand(
                    [g.num_graph, self.virtualnode_embedding.shape[-1]])
            h = h + paddle.gather(virtualnode_embedding, g.graph_node_id)
            #  print("virt0: ", np.sum(h.numpy()))

        if self.with_efeat:
            edge_emb = self.bond_encoder(edge_feat)
        else:
            edge_emb = edge_feat

        h = self.gnns[0](g, h, edge_emb)
        if self.config.graphnorm:
            h = self.gn(g, h)

        #  print("h0: ", np.sum(h.numpy()))
        for layer in range(1, self.num_layers):
            h1 = self.norms[layer-1](h)
            h2 = F.swish(h1)
            h2 = F.dropout(h2, p=self.drop_ratio, training=self.training)

            if self.virtual_node:
                virtualnode_embedding_temp = self.pool(g, h2) + virtualnode_embedding
                virtualnode_embedding = self.mlp_virtualnode_list[layer-1](virtualnode_embedding_temp)
                virtualnode_embedding  = F.dropout(
                        virtualnode_embedding,
                        self.drop_ratio,
                        training=self.training)

                h2 = h2 + paddle.gather(virtualnode_embedding, g.graph_node_id)
                #  print("virt_h%s: " % (layer), np.sum(h2.numpy()))

            h = self.gnns[layer](g, h2, edge_emb) + h
            if self.config.graphnorm:
                h = self.gn(g, h)
            #  print("h%s: " % (layer), np.sum(h.numpy()))

        h = self.norms[self.num_layers-1](h)
        h = F.dropout(h, p=self.drop_ratio, training=self.training)

        if self.config.appnp_k is not None:
            h = self.appnp(g, h)
        #  print("node_repr: ", np.sum(h.numpy()))
        node_representation = h
        return node_representation
Example #9
    def forward(self, x):
        x.stop_gradient = False
        x = x.transpose([0, 3, 2, 1])
        x = self.bn0(x)
        x = x.transpose([0, 3, 2, 1])

        x = self.conv_block1(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)

        x = self.conv_block2(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)

        x = self.conv_block3(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)

        x = self.conv_block4(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)

        x = self.conv_block5(x, pool_size=(2, 2), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)

        x = self.conv_block6(x, pool_size=(1, 1), pool_type='avg')
        x = F.dropout(x, p=0.2, training=self.training)

        x = x.mean(axis=3)
        x = x.max(axis=2) + x.mean(axis=2)

        x = F.dropout(x, p=0.5, training=self.training)
        x = F.relu(self.fc1(x))

        if self.extract_embedding:
            output = F.dropout(x, p=0.5, training=self.training)
        else:
            output = F.sigmoid(self.fc_audioset(x))
        return output
Example #10
    def forward(self, g):
        """tbd"""
        h = self.atom_embedding(g.node_feat)
        h += self.atom_float_embedding(g.node_feat)

        if self.virtual_node:
            virtualnode_embedding = self.virtualnode_embedding.expand(
                [g.num_graph, self.virtualnode_embedding.shape[-1]])
            h = h + paddle.gather(virtualnode_embedding, g.graph_node_id)
            #  print("virt0: ", np.sum(h.numpy()))

        if self.with_efeat:
            edge_emb = self.init_bond_embedding(g.edge_feat)
        else:
            edge_emb = g.edge_feat

        h = self.gnns[0](g, h, edge_emb)
        if self.config["graphnorm"]:
            h = self.gn(g, h)

        #  print("h0: ", np.sum(h.numpy()))
        for layer in range(1, self.num_layers):
            h1 = self.norms[layer - 1](h)
            h2 = F.swish(h1)
            h2 = F.dropout(h2, p=self.drop_ratio, training=self.training)

            if self.virtual_node:
                virtualnode_embedding_temp = self.pool(
                    g, h2) + virtualnode_embedding
                virtualnode_embedding = self.mlp_virtualnode_list[layer - 1](
                    virtualnode_embedding_temp)
                virtualnode_embedding = F.dropout(virtualnode_embedding,
                                                  self.drop_ratio,
                                                  training=self.training)

                h2 = h2 + paddle.gather(virtualnode_embedding, g.graph_node_id)
                #  print("virt_h%s: " % (layer), np.sum(h2.numpy()))

            h = self.gnns[layer](g, h2, edge_emb) + h
            if self.config["graphnorm"]:
                h = self.gn(g, h)
            #  print("h%s: " % (layer), np.sum(h.numpy()))

        h = self.norms[self.num_layers - 1](h)
        h = F.dropout(h, p=self.drop_ratio, training=self.training)

        h_graph = self.pool(g, h)
        # return graph, node, edge representation
        return h_graph, h, edge_emb
Example #11
    def forward(self, input, label=None):

        _, feat_list = self.backbone(input)

        x = feat_list[self.backbone_indices[1]]
        x = self.psp_module(x)
        x = F.dropout(x, dropout_prob=0.1)
        logit = self.conv(x)
        logit = fluid.layers.resize_bilinear(logit, input.shape[2:])

        if self.enable_auxiliary_loss:
            auxiliary_feat = feat_list[self.backbone_indices[0]]
            auxiliary_logit = self.fcn_head(auxiliary_feat)
            auxiliary_logit = fluid.layers.resize_bilinear(
                auxiliary_logit, input.shape[2:])

        if self.training:
            loss = model_utils.get_loss(logit, label)
            if self.enable_auxiliary_loss:
                auxiliary_loss = model_utils.get_loss(auxiliary_logit, label)
                loss += (0.4 * auxiliary_loss)
            return loss

        else:
            pred, score_map = model_utils.get_pred_score_map(logit)
            return pred, score_map
Example #12
    def decode(self, encoder_output, input, encoder_padding_mask):
        batch_size, T_dec, mel_dim = input.shape

        x = self.decoder_prenet(input, self.decoder_prenet_dropout)
        # extend the positional encoding table if it is too short
        if x.shape[1] * self.r > self.decoder_pe.shape[0]:
            new_T = max(x.shape[1] * self.r, self.decoder_pe.shape[0] * 2)
            self.decoder_pe = pe.positional_encoding(0, new_T, self.d_decoder)
        pos_enc = self.decoder_pe[:T_dec * self.r:self.r, :]
        x = x.scale(math.sqrt(
            self.d_decoder)) + pos_enc * self.decoder_pe_scalar
        x = F.dropout(x, self.dropout, training=self.training)

        no_future_mask = masking.future_mask(T_dec, dtype=input.dtype)
        decoder_padding_mask = masking.feature_mask(
            input, axis=-1, dtype=input.dtype)
        decoder_mask = masking.combine_mask(
            decoder_padding_mask.unsqueeze(-1), no_future_mask)
        decoder_output, _, cross_attention_weights = self.decoder(
            x, encoder_output, encoder_output, encoder_padding_mask,
            decoder_mask, self.drop_n_heads)

        # use only parts of it
        output_proj = self.final_proj(decoder_output)[:, :, :self.r * mel_dim]
        mel_intermediate = paddle.reshape(output_proj,
                                          [batch_size, -1, mel_dim])
        stop_logits = self.stop_conditioner(mel_intermediate)

        # cnn postnet
        mel_channel_first = paddle.transpose(mel_intermediate, [0, 2, 1])
        mel_output = self.decoder_postnet(mel_channel_first)
        mel_output = paddle.transpose(mel_output, [0, 2, 1])

        return mel_output, mel_intermediate, cross_attention_weights, stop_logits
Example #13
 def _forward_ffn(self, x):
     # PreLN scheme: Norm -> SubLayer -> Dropout -> Residual
     x_in = x
     x = self.layer_norm3(x)
     x = self.ffn(x)
     out = x_in + F.dropout(x, self.dropout, training=self.training)
     return out
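Examples #13, #27 and #28 all follow the pre-LN ordering their comments spell out (Norm -> SubLayer -> Dropout -> Residual). Below is a minimal generic wrapper in that style, assuming the sublayer maps a tensor to one of the same shape; the class and its names are mine, not from the original code:

import paddle
import paddle.nn as nn
import paddle.nn.functional as F

class PreLNResidual(nn.Layer):
    def __init__(self, d_model, sublayer, dropout=0.1):
        super().__init__()
        self.norm = nn.LayerNorm(d_model)
        self.sublayer = sublayer
        self.dropout = dropout

    def forward(self, x):
        # Norm -> SubLayer -> Dropout -> Residual
        y = self.sublayer(self.norm(x))
        return x + F.dropout(y, self.dropout, training=self.training)

block = PreLNResidual(16, nn.Linear(16, 16))
print(block(paddle.randn([2, 4, 16])).shape)  # [2, 4, 16]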
Example #14
def forward_one_multilayer(rnns, lstm_input, layer_states, dropout_amount=0.):
    """ Goes forward for one multilayer RNN cell step.

    Args:
        rnns (`list`): The stacked RNN (LSTM) layers making up the cell.
        lstm_input (`Tensor`): The input to the step.
        layer_states (`list`): The states of each layer in the cell.
        dropout_amount (`float`, optional): The amount of dropout to apply
            between the layers.

    Returns:
        (`list`, `list`), `Tensor`, `list`: Representing (each layer's cell memory,
        each layer's hidden state), the final hidden state, and each layer's
        updated RNN state.
    """
    num_layers = len(layer_states)
    new_states = []
    cell_states = []
    hidden_states = []
    state = lstm_input
    for i in range(num_layers):
        layer_h, new_state = rnns[i](paddle.unsqueeze(state, 0),
                                     layer_states[i])
        new_states.append(new_state)

        layer_h = layer_h.squeeze()
        layer_c = new_state[1].squeeze()

        state = layer_h
        if i < num_layers - 1:
            # p is the probability of zeroing an element, i.e. p=1 switches off all activations.
            state = F.dropout(state, p=dropout_amount)

        cell_states.append(layer_c)
        hidden_states.append(layer_h)

    return (cell_states, hidden_states), state, new_states
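A hedged usage sketch for forward_one_multilayer, assuming rnns is a list of time-major paddle.nn.LSTM layers and layer_states holds each layer's (h, c) pair; the sizes below are arbitrary:

import paddle
import paddle.nn as nn
import paddle.nn.functional as F  # used inside forward_one_multilayer

batch, input_size, hidden_size = 3, 8, 16
rnns = [
    nn.LSTM(input_size, hidden_size, time_major=True),
    nn.LSTM(hidden_size, hidden_size, time_major=True),
]
# One (h, c) pair per layer, each of shape [1, batch, hidden_size].
layer_states = [(paddle.zeros([1, batch, hidden_size]),
                 paddle.zeros([1, batch, hidden_size])) for _ in rnns]
lstm_input = paddle.randn([batch, input_size])

(cells, hiddens), final_h, new_states = forward_one_multilayer(
    rnns, lstm_input, layer_states, dropout_amount=0.1)
print(final_h.shape)  # [3, 16]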
Example #15
    def forward(self,
                query_matrix,
                key_matrix,
                value_matrix,
                d_head,
                attn_mask=None,
                rand_mask_idx=None,
                query_mask=None,
                key_mask=None,
                dropout=None):
        # scale dot product attention
        product = paddle.matmul(x=query_matrix, y=key_matrix, transpose_y=True)
        product = product * (d_head**-0.5)
        product += (1 - paddle.matmul(query_mask, key_mask)) * -1e6
        if attn_mask is not None:
            product = product + attn_mask
        weights = F.softmax(product)
        if dropout:
            weights = F.dropout(weights,
                                dropout,
                                training=self.training,
                                mode="upscale_in_train")

        out = paddle.matmul(weights, value_matrix)
        return out
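The term (1 - paddle.matmul(query_mask, key_mask)) * -1e6 above is the usual additive padding bias: positions to be ignored receive a large negative score so that softmax assigns them near-zero weight. A toy illustration with made-up values:

import paddle
import paddle.nn.functional as F

scores = paddle.to_tensor([[1.0, 2.0, 3.0]])
mask = paddle.to_tensor([[1.0, 1.0, 0.0]])   # 1 = attend, 0 = padding

masked_scores = scores + (1.0 - mask) * -1e6
print(F.softmax(masked_scores))  # last position gets ~0 attention weight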
Example #16
    def forward(self, d, t, d_masking, t_masking):
        """MolTrans pipeline."""
        tempd_masking = d_masking.unsqueeze(1).unsqueeze(2)
        tempt_masking = t_masking.unsqueeze(1).unsqueeze(2)

        tempd_masking = (1.0 - tempd_masking) * -10000.0
        tempt_masking = (1.0 - tempt_masking) * -10000.0

        d_embedding = self.drug_emb(d)
        t_embedding = self.target_emb(t)

        d_encoder = self.encoder(d_embedding.astype('float32'),
                                 tempd_masking.astype('float32'))
        t_encoder = self.encoder(t_embedding.astype('float32'),
                                 tempt_masking.astype('float32'))

        drug_res = paddle.tile(paddle.unsqueeze(d_encoder, 2),
                               [1, 1, self.target_max_seq, 1])
        target_res = paddle.tile(paddle.unsqueeze(t_encoder, 1),
                                 [1, self.drug_max_seq, 1, 1])

        i_score = drug_res * target_res

        i_scoreT = paddle.reshape(
            i_score, [int(i_score.shape[0] / self.gpus), -1,
                      self.drug_max_seq, self.target_max_seq])
        i_scoreT = paddle.sum(i_scoreT, axis=1)
        i_scoreT = paddle.unsqueeze(i_scoreT, 1)
        i_scoreT = F.dropout(i_scoreT, p=self.dropout_ratio)

        i_scoreT = self.interaction_cnn(i_scoreT)
        i_res = paddle.reshape(i_scoreT,
                               [int(i_scoreT.shape[0] / self.gpus), -1])
        res = self.decoder(i_res)

        return res
Example #17
    def forward(self, hidden_states, attention_mask=None):
        x = self.i_dense(hidden_states)
        u, v, qk = paddle.split(
            self.activation(x),
            [
                self.intermediate_size, self.intermediate_size,
                self.attention_key_size
            ],
            axis=-1,
        )
        q, k = self.q_scaleoffset(qk), self.k_scaleoffset(qk)

        # apply_rotary
        q, k = self.rotary(q), self.rotary(k)

        # Attention
        a = paddle.matmul(q, k, transpose_y=True)

        if self.attention_scale:
            a = a / self.attention_key_size**0.5

        if attention_mask is not None:
            a = a * attention_mask + (attention_mask - 1) * INF

        A = attention_normalize(a,
                                attention_mask,
                                axis=-1,
                                method=self.normalization)

        A = F.dropout(A, p=self.attention_dropout, training=self.training)

        o = self.o_dense(u * paddle.matmul(A, v))

        return o
Example #18
    def forward(self, src_word):
        src_max_len = paddle.shape(src_word)[-1]
        src_slf_attn_bias = paddle.cast(
            src_word == self.bos_id,
            dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9
        src_pos = paddle.cast(src_word != self.bos_id,
                              dtype="int64") * paddle.arange(start=0,
                                                             end=src_max_len)

        # Run encoder
        src_emb = self.src_word_embedding(src_word)
        src_pos_emb = self.src_pos_embedding(src_pos)
        src_emb = src_emb + src_pos_emb
        enc_input = F.dropout(src_emb, p=self.dropout,
                              training=False) if self.dropout else src_emb
        enc_output = self.transformer.encoder(enc_input, src_slf_attn_bias)

        if self.use_fp16_decoding:
            enc_output = paddle.cast(enc_output, dtype="float16")

        mem_seq_lens = paddle.sum(paddle.cast(src_word != self.bos_id,
                                              dtype="int32"),
                                  axis=1)
        ids = self.decoding(enc_output, mem_seq_lens)

        return ids
Example #19
    def forward(self, queries, keys, values, attn_bias, cache=None):
        # compute q ,k ,v
        keys = queries if keys is None else keys
        values = keys if values is None else values
        q, k, v = self._prepare_qkv(queries, keys, values, cache)

        # scale dot product attention
        product = paddle.matmul(x=q, y=k, transpose_y=True)
        product = product * self.d_model**-0.5
        if attn_bias is not None:
            product += attn_bias
        weights = F.softmax(product)
        if self.dropout_rate:
            weights = F.dropout(
                weights, p=self.dropout_rate, mode="downscale_in_infer")
        out = paddle.matmul(weights, v)

        # combine heads
        out = paddle.transpose(out, perm=[0, 2, 1, 3])
        out = paddle.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])

        # project to output
        out = self.proj_fc(out)

        return out
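Unlike most snippets here, the attention layer above uses mode="downscale_in_infer". Both dropout modes keep the expected activation scale consistent between training and inference; they differ only in which side carries the rescaling. A short sketch of the difference:

import paddle
import paddle.nn.functional as F

x = paddle.ones([2, 4])

# upscale_in_train (default): surviving activations are divided by (1 - p)
# during training; at inference the op is an identity.
up_infer = F.dropout(x, p=0.5, training=False, mode="upscale_in_train")

# downscale_in_infer: activations are only masked during training; at
# inference the whole tensor is multiplied by (1 - p) instead.
down_infer = F.dropout(x, p=0.5, training=False, mode="downscale_in_infer")

print(up_infer)    # all ones
print(down_infer)  # all 0.5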
Example #20
    def forward(self, src_word):
        src_max_len = paddle.shape(src_word)[-1]
        src_slf_attn_bias = paddle.cast(
            src_word == self.bos_id,
            dtype=paddle.get_default_dtype()).unsqueeze([1, 2]) * -1e9
        trg_src_attn_bias = src_slf_attn_bias
        src_pos = paddle.cast(src_word != self.bos_id,
                              dtype="int64") * paddle.arange(start=0,
                                                             end=src_max_len)

        # Run encoder
        src_emb = self.src_word_embedding(src_word)
        src_pos_emb = self.src_pos_embedding(src_pos)
        src_emb = src_emb + src_pos_emb
        enc_input = F.dropout(src_emb, p=self.dropout,
                              training=False) if self.dropout else src_emb
        enc_output = self.transformer.encoder(enc_input, src_slf_attn_bias)

        # Init states (caches) for transformer, need to be updated according to selected beam
        incremental_cache, static_cache = self.transformer.decoder.gen_cache(
            enc_output, do_zip=True)

        static_cache, enc_output, trg_src_attn_bias = TransformerBeamSearchDecoder.tile_beam_merge_with_batch(
            (static_cache, enc_output, trg_src_attn_bias), self.beam_size)

        rs, _ = nn.decode.dynamic_decode(decoder=self.decode,
                                         inits=incremental_cache,
                                         max_step_num=self.max_out_len,
                                         memory=enc_output,
                                         trg_src_attn_bias=trg_src_attn_bias,
                                         static_cache=static_cache,
                                         is_test=True)

        return rs
Example #21
    def forward(self, inputs, states, static_cache, trg_src_attn_bias, memory):
        if states and static_cache:
            states = list(zip(states, static_cache))

        if self.word_embedding:
            if not isinstance(inputs, (list, tuple)):
                inputs = (inputs, )

            word_emb = self.word_embedding(inputs[0])
            pos_emb = self.pos_embedding(inputs[1])
            word_emb = word_emb + pos_emb
            inputs = F.dropout(word_emb, p=self.dropout,
                               training=False) if self.dropout else word_emb

            cell_outputs, new_states = self.decoder(inputs, memory, None,
                                                    trg_src_attn_bias, states)
        else:
            cell_outputs, new_states = self.decoder(inputs, memory, None,
                                                    trg_src_attn_bias, states)

        if self.linear:
            cell_outputs = self.linear(cell_outputs)

        new_states = [cache[0] for cache in new_states]

        return cell_outputs, new_states
Example #22
 def forward(self, x):
     hidden = self.fc1(x)
     hidden = F.relu(hidden)
     if self.dropout_rate:
         hidden = F.dropout(
             hidden, p=self.dropout_rate, mode="downscale_in_infer")
     out = self.fc2(hidden)
     return out
Example #23
 def forward(self, inputs):
     out_res = F.pad2d(inputs, [1, 1, 1, 1], mode="reflect")
     out_res = self.conv0(out_res)
     if self.dropout:
         out_res = F.dropout(out_res, p=0.5, mode='downscale_in_infer')
     out_res = F.pad2d(out_res, [1, 1, 1, 1], mode="reflect")
     out_res = self.conv1(out_res)
     return out_res + inputs
Example #24
    def forward(self, feats):
        for i in range(self.n_layers):
            feats = self.mlp[i](feats)
            feats = F.dropout(feats, p=self.drop, training=self.training)
            feats = F.relu(feats)
        out = self.out_layer(feats)

        return out
Example #25
    def forward(self, q, k, v, encoder_mask, decoder_mask):
        """Forward pass of TransformerEncoderLayer.
        
        Parameters
        ----------
        q : Tensor [shape=(batch_size, time_steps_q, d_model)] 
            The decoder input.
        k : Tensor [shape=(batch_size, time_steps_k, d_model)] 
            The keys.
        v : Tensor [shape=(batch_size, time_steps_k, d_model)]
            The values.
        encoder_mask : Tensor
            Encoder padding mask, shape is ``(batch_size, time_steps_k, 
            time_steps_k)`` or broadcastable shape.
        decoder_mask : Tensor
            Decoder mask, shape is ``(batch_size, time_steps_q, time_steps_k)``
            or broadcastable shape. 
        
        Returns
        --------
        q : Tensor [shape=(batch_size, time_steps_q, d_model)]
            The decoder output.
            
        self_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_q)]
            Decoder self attention.
            
        cross_attn_weights : Tensor [shape=(batch_size, n_heads, time_steps_q, time_steps_k)] 
            Decoder-encoder cross attention.
        """
        context_vector, self_attn_weights = self.self_mha(q, q, q,
                                                          decoder_mask)
        q = self.layer_norm1(
            F.dropout(
                q + context_vector, self.dropout, training=self.training))

        context_vector, cross_attn_weights = self.cross_mha(q, k, v,
                                                            encoder_mask)
        q = self.layer_norm2(
            F.dropout(
                q + context_vector, self.dropout, training=self.training))

        q = self.layer_norm3(
            F.dropout(
                q + self.ffn(q), self.dropout, training=self.training))
        return q, self_attn_weights, cross_attn_weights
Example #26
    def forward(self,
                query,
                key,
                value,
                attn_mask=None,
                use_cache=False,
                cache=None):
        r"""
        Applies multi-head attention to map queries and a set of key-value pairs
        to outputs.
        """
        key = query if key is None else key
        value = query if value is None else value
        # compute q ,k ,v
        if use_cache is False:
            if self.fuse:
                q, k, v = self._fuse_prepare_qkv(query)
            else:
                q, k, v = self._prepare_qkv(query, key, value, use_cache,
                                            cache)
        else:
            q, k, v, cache = self._prepare_qkv(query, key, value, use_cache,
                                               cache)
        # scale dot product attention
        product = layers.matmul(x=q,
                                y=k,
                                transpose_y=True,
                                alpha=self.head_dim**-0.5)

        # if attn_mask is not None:
        # product = product + attn_mask
        # weights = F.softmax(product)

        weights = incubate.softmax_mask_fuse_upper_triangle(product)

        if self.dropout:
            with get_rng_state_tracker().rng_state('local_seed'):
                weights = F.dropout(weights,
                                    self.dropout,
                                    training=self.training,
                                    mode="upscale_in_train")

        out = tensor.matmul(weights, v)

        # combine heads
        out = tensor.transpose(out, perm=[0, 2, 1, 3])
        out = tensor.reshape(x=out, shape=[0, 0, out.shape[2] * out.shape[3]])

        # project to output
        out = self.out_proj(out)

        outs = [out]
        if self.need_weights:
            outs.append(weights)
        if use_cache:
            outs.append(cache)
        return out if len(outs) == 1 else tuple(outs)
Example #27
 def _forward_self_mha(self, x, mask, drop_n_heads):
     # PreLN scheme: Norm -> SubLayer -> Dropout -> Residual
     x_in = x
     x = self.layer_norm1(x)
     context_vector, attn_weights = self.self_mha(x, x, x, mask,
                                                  drop_n_heads)
     context_vector = x_in + F.dropout(
         context_vector, self.dropout, training=self.training)
     return context_vector, attn_weights
Example #28
 def _forward_cross_mha(self, q, k, v, mask, drop_n_heads):
     # PreLN scheme: Norm -> SubLayer -> Dropout -> Residual
     q_in = q
     q = self.layer_norm2(q)
     context_vector, attn_weights = self.cross_mha(q, k, v, mask,
                                                   drop_n_heads)
     context_vector = q_in + F.dropout(
         context_vector, self.dropout, training=self.training)
     return context_vector, attn_weights
Example #29
    def GetBaselineOut(self):
        paddle.disable_static(place=paddle.CUDAPlace(0))
        tensor_query = paddle.to_tensor(self.query, stop_gradient=False)
        if self.has_attn_mask:
            attn_mask = paddle.to_tensor(self.attn_mask, stop_gradient=False)
        else:
            attn_mask = None
        residual = tensor_query

        ln1_out = tensor_query
        if self.pre_layer_norm:
            ln1_out = self.norm1(tensor_query)

        q = self.q_proj(ln1_out)
        q = tensor.reshape(x=q, shape=[0, 0, self.num_heads, self.head_dim])
        q_out = tensor.transpose(x=q, perm=[0, 2, 1, 3])
        k = self.k_proj(ln1_out)
        v = self.v_proj(ln1_out)
        k = tensor.reshape(x=k, shape=[0, 0, self.num_heads, self.head_dim])
        k_out = tensor.transpose(x=k, perm=[0, 2, 1, 3])
        v = tensor.reshape(x=v, shape=[0, 0, self.num_heads, self.head_dim])
        v_out = tensor.transpose(x=v, perm=[0, 2, 1, 3])

        qk_out = layers.matmul(x=q_out,
                               y=k_out,
                               transpose_y=True,
                               alpha=self.head_dim**-0.5)

        if attn_mask is not None:
            attn_mask = _convert_attention_mask(attn_mask, qk_out.dtype)
            attn_mask_out = qk_out + attn_mask
            softmax_out = F.softmax(attn_mask_out)
        else:
            softmax_out = F.softmax(qk_out)

        if self.dropout_prob:
            dropout_out = F.dropout(softmax_out,
                                    self.dropout_prob,
                                    training=self.training,
                                    mode="upscale_in_train")
            qktv_out = tensor.matmul(dropout_out, v_out)
        else:
            qktv_out = tensor.matmul(softmax_out, v_out)

        fmha_out = tensor.transpose(qktv_out, perm=[0, 2, 1, 3])
        out_linear_in = tensor.reshape(
            x=fmha_out, shape=[0, 0, fmha_out.shape[2] * fmha_out.shape[3]])
        out = self.out_proj(out_linear_in)

        residual_out = residual + self.dropout(out)
        if not self.pre_layer_norm:
            final_out = self.norm1(residual_out)
        else:
            final_out = residual_out
        paddle.autograd.backward([final_out], [paddle.to_tensor(self.dout)],
                                 retain_graph=True)
        return final_out, tensor_query.grad
Example #30
def network():
    img = static.data(name='image', shape=[None, 784])
    hidden = static.nn.fc(x=img, size=200, activation='relu')
    hidden = F.dropout(hidden, p=0.5)
    loss = F.cross_entropy(input=static.nn.fc(x=hidden, size=10, activation='softmax'),
                           label=static.data(name='label',
                                             shape=[1],
                                             dtype='int64'))
    avg_loss = paddle.mean(loss)
    return avg_loss