Example #1
    def forward(self, x, encoder_padding_mask):
        """
        Args:
            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
            encoder_padding_mask (ByteTensor): binary ByteTensor of shape
                `(batch, src_len)` where padding elements are indicated by ``1``.

        Returns:
            encoded output of shape `(seq_len, batch, embed_dim)`
        """
        residual = x
        x = self.maybe_layer_norm(0, x, before=True)
        x, _ = self.self_attn(query=x, key=x, value=x, key_padding_mask=encoder_padding_mask)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.maybe_layer_norm(0, x, after=True)

        residual = x
        x = self.maybe_layer_norm(1, x, before=True)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, p=self.relu_dropout, training=self.training)
        x = self.fc2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.maybe_layer_norm(1, x, after=True)
        return x
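The `maybe_layer_norm` calls switch a single layer norm between pre- and post-sublayer placement. A minimal sketch of how such a helper is commonly written in fairseq-style layers, assuming a `normalize_before` flag and a `ModuleList` named `layer_norms` (not necessarily this repository's exact code):

    def maybe_layer_norm(self, i, x, before=False, after=False):
        # exactly one of `before`/`after` must be set by the caller
        assert before ^ after
        # apply the i-th layer norm either before the sublayer (pre-norm) or after it (post-norm)
        if after ^ self.normalize_before:
            return self.layer_norms[i](x)
        return x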
Example #2
    def forward(self, input, format='index'):
        if format == 'onehot':
            out = F.dropout(self.Linear(input), self.d, training=self.training)
        elif format == 'index':
            out = F.dropout(self.word_embed(input), self.d, training=self.training)
        else:
            # guard against an unknown format, which would otherwise leave `out` undefined
            raise ValueError("format must be 'onehot' or 'index'")

        return out
Example #3
 def forward(self, x):
     x = F.relu(self.linear1(x))
     x = F.dropout(x, 0.8)
     x = F.relu(self.linear2(x))
     x = F.dropout(x, 0.8)
     x = F.log_softmax(self.linear3(x))
     return x
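The two dropout calls above pass only the probability and omit `training=self.training`, so they stay active in eval mode as well: `F.dropout` defaults to `training=True`. A quick standalone check of that default:

import torch
import torch.nn.functional as F

x = torch.ones(4)
print(F.dropout(x, p=0.8))                   # default training=True: elements zeroed, survivors scaled by 1/(1-p)
print(F.dropout(x, p=0.8, training=False))   # identity: the input is returned unchanged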
Example #4
    def forward(self, h_out, fake_region, conv_feat, conv_feat_embed):

        # View into three dimensions
        att_size = conv_feat.numel() // conv_feat.size(0) // self.rnn_size
        conv_feat = conv_feat.view(-1, att_size, self.rnn_size)
        conv_feat_embed = conv_feat_embed.view(-1, att_size, self.att_hid_size)

        # view neighbor from batch_size * neighbor_num x rnn_size to batch_size x rnn_size * neighbor_num
        fake_region = self.fr_linear(fake_region)
        fake_region_embed = self.fr_embed(fake_region)

        h_out_linear = self.ho_linear(h_out)
        h_out_embed = self.ho_embed(h_out_linear)

        txt_replicate = h_out_embed.unsqueeze(1).expand(h_out_embed.size(0), att_size + 1, h_out_embed.size(1))

        img_all = torch.cat([fake_region.view(-1,1,self.input_encoding_size), conv_feat], 1)
        img_all_embed = torch.cat([fake_region_embed.view(-1,1,self.input_encoding_size), conv_feat_embed], 1)

        hA = F.tanh(img_all_embed + txt_replicate)
        hA = F.dropout(hA,self.drop_prob_lm, self.training)
        
        hAflat = self.alpha_net(hA.view(-1, self.att_hid_size))
        PI = F.softmax(hAflat.view(-1, att_size + 1))

        visAtt = torch.bmm(PI.unsqueeze(1), img_all)
        visAttdim = visAtt.squeeze(1)

        atten_out = visAttdim + h_out_linear

        h = F.tanh(self.att2h(atten_out))
        h = F.dropout(h, self.drop_prob_lm, self.training)
        return h
Example #5
    def _forward_unpadded(self, x, x_mask):
        """Faster encoding that ignores any padding."""
        # Transpose batch and sequence dims
        x = x.transpose(0, 1)

        # Encode all layers
        outputs = [x]
        for i in range(self.num_layers):
            rnn_input = outputs[-1]

            # Apply dropout to hidden input
            if self.dropout_rate > 0:
                rnn_input = F.dropout(rnn_input,
                                      p=self.dropout_rate,
                                      training=self.training)
            # Forward
            rnn_output = self.rnns[i](rnn_input)[0]
            outputs.append(rnn_output)

        # Concat hidden layers
        if self.concat_layers:
            output = torch.cat(outputs[1:], 2)
        else:
            output = outputs[-1]

        # Transpose back
        output = output.transpose(0, 1)

        # Dropout on output layer
        if self.dropout_output and self.dropout_rate > 0:
            output = F.dropout(output,
                               p=self.dropout_rate,
                               training=self.training)
        return output
Example #6
 def forward(self, x):
     y = F.dropout(F.relu(self.linears[0](x)), training=self.training)
     for layer in self.linears[1:-1]:
         y = F.relu(layer(y))
         y = F.dropout(y, training=self.training)
     y = F.log_softmax(self.linears[-1](y))
     return y
Example #7
    def forward(self, src_tokens):
        bsz, seqlen = src_tokens.size()
        num_layers = len(self.layers)

        # embed tokens
        x = self.embed_tokens(src_tokens)
        x = F.dropout(x, p=self.dropout_in, training=self.training)
        embed_dim = x.size(2)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        final_hiddens, final_cells = [], []
        outs = [x[j] for j in range(seqlen)]
        for i, rnn in enumerate(self.layers):
            hidden = Variable(x.data.new(bsz, embed_dim).zero_())
            cell = Variable(x.data.new(bsz, embed_dim).zero_())
            for j in range(seqlen):
                # recurrent cell
                hidden, cell = rnn(outs[j], (hidden, cell))

                # store the most recent hidden state in outs, either to be used
                # as the input for the next layer, or as the final output
                outs[j] = F.dropout(hidden, p=self.dropout_out, training=self.training)

            # save the final hidden and cell states for every layer
            final_hiddens.append(hidden)
            final_cells.append(cell)

        # collect outputs across time steps
        x = torch.cat(outs, dim=0).view(seqlen, bsz, embed_dim)
        final_hiddens = torch.cat(final_hiddens, dim=0).view(num_layers, bsz, embed_dim)
        final_cells = torch.cat(final_cells, dim=0).view(num_layers, bsz, embed_dim)

        return x, final_hiddens, final_cells
Example #8
    def forward(self, inp, hidden=None, schedule=None, **kwargs):
        """
        Parameters:
        -----------
        inp: torch.Tensor (seq_len x batch_size)

        Returns:
        --------
        outs: torch.Tensor (seq_len * batch_size x vocab)
        hidden: see output of RNN, GRU, LSTM in torch.nn
        weights: None or list of weights (batch_size x seq_len),
            It will only be not None if attention is provided.
        """
        inp = word_dropout(
            inp, self.target_code, p=self.word_dropout,
            reserved_codes=self.reserved_codes, training=self.training)
        emb = self.embeddings(inp)
        if self.has_dropout:
            emb = F.dropout(emb, p=self.dropout, training=self.training)
        outs, hidden = self.rnn(emb, hidden or self.init_hidden_for(emb))
        if self.has_dropout:
            outs = F.dropout(outs, p=self.dropout, training=self.training)
        weights = None
        if self.add_attn:
            outs, weights = self.attn(outs, emb)
        seq_len, batch, hid_dim = outs.size()
        outs = outs.view(seq_len * batch, hid_dim)
        if self.add_deepout:
            outs = self.deepout(outs)
        outs = F.log_softmax(self.project(outs))
        return outs, hidden, weights
Example #9
 def forward(self, x):
     x = x.view(-1, 28 * 28)
     x = F.relu(self.fc1(x))
     x = F.dropout(x, p=0.8, training=self.training)
     x = F.relu(self.fc2(x))
     x = F.dropout(x, p=0.8, training=self.training)
     x = self.fc3(x)
     return x
Example #10
    def forward(self, xt, img_fc, state):

        hs = []
        cs = []
        for L in range(self.num_layers):
            # c,h from previous timesteps
            prev_h = state[0][L]
            prev_c = state[1][L]
            # the input to this layer
            if L == 0:
                x = xt
                i2h = self.w2h(x) + self.v2h(img_fc)
            else:
                x = hs[-1]
                x = F.dropout(x, self.drop_prob_lm, self.training)
                i2h = self.i2h[L-1](x)

            all_input_sums = i2h+self.h2h[L](prev_h)

            sigmoid_chunk = all_input_sums.narrow(1, 0, 3 * self.rnn_size)
            sigmoid_chunk = F.sigmoid(sigmoid_chunk)
            # decode the gates
            in_gate = sigmoid_chunk.narrow(1, 0, self.rnn_size)
            forget_gate = sigmoid_chunk.narrow(1, self.rnn_size, self.rnn_size)
            out_gate = sigmoid_chunk.narrow(1, self.rnn_size * 2, self.rnn_size)
            # decode the write inputs
            if not self.use_maxout:
                in_transform = F.tanh(all_input_sums.narrow(1, 3 * self.rnn_size, self.rnn_size))
            else:
                in_transform = all_input_sums.narrow(1, 3 * self.rnn_size, 2 * self.rnn_size)
                in_transform = torch.max(\
                    in_transform.narrow(1, 0, self.rnn_size),
                    in_transform.narrow(1, self.rnn_size, self.rnn_size))
            # perform the LSTM update
            next_c = forget_gate * prev_c + in_gate * in_transform
            # gated cells form the output
            tanh_nex_c = F.tanh(next_c)
            next_h = out_gate * tanh_nex_c
            if L == self.num_layers-1:
                if L == 0:
                    i2h = self.r_w2h(x) + self.r_v2h(img_fc)
                else:
                    i2h = self.r_i2h(x)
                n5 = i2h+self.r_h2h(prev_h)
                fake_region = F.sigmoid(n5) * tanh_nex_c

            cs.append(next_c)
            hs.append(next_h)

        # set up the decoder
        top_h = hs[-1]
        top_h = F.dropout(top_h, self.drop_prob_lm, self.training)
        fake_region = F.dropout(fake_region, self.drop_prob_lm, self.training)

        state = (torch.cat([_.unsqueeze(0) for _ in hs], 0), 
                torch.cat([_.unsqueeze(0) for _ in cs], 0))
        return top_h, fake_region, state
Example #11
 def forward(self, inputs): # inputs (batch size, "sentence" length) bs,n
     embeds = self.embeddings(inputs) # bs,n,300
     embeds = embeds.view(embeds.size(0), -1) # bs,n*300
     out = F.tanh(self.h(embeds)) # bs,hidden_size
     out = self.u(F.dropout(out,p=dropout_rate)) # bs,|V|
     embeds = F.dropout(embeds,p=dropout_rate)
     out += self.w(embeds) # bs,|V|
     #out = F.softmax(out,dim=1)
     return out
Example #12
 def hidden_to_idx(self, hidden, is_training=False):
     """Convert hidden state vectors into indices into the dictionary."""
     # dropout at each step
     e = F.dropout(self.h2e(hidden), p=self.dropout, training=is_training)
     scores = F.dropout(self.e2o(e), p=self.dropout, training=is_training)
     # skip zero (null_idx) when selecting a score
     _max_score, idx = scores.narrow(2, 1, scores.size(2) - 1).max(2)
     # add one back to index since we removed first option
     return idx.add_(1), scores
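The `narrow`/`max`/`add_` sequence above selects the best non-null token: index 0 is dropped from the scores, the argmax is taken over the remainder, and the index is shifted back by one. A small illustration with made-up numbers:

import torch

scores = torch.tensor([[[0.9, 0.1, 0.5]]])            # (batch=1, seq=1, vocab=3); index 0 is the null symbol
_max_score, idx = scores.narrow(2, 1, scores.size(2) - 1).max(2)
print(idx.add_(1))                                    # tensor([[2]]): argmax over indices 1..2, shifted back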
Example #13
    def forward(self, input):
        x = F.leaky_relu(self.fc1(input), 0.2)
        x = F.dropout(x, 0.3)
        x = F.leaky_relu(self.fc2(x), 0.2)
        x = F.dropout(x, 0.3)
        x = F.leaky_relu(self.fc3(x), 0.2)
        x = F.dropout(x, 0.3)
        x = F.sigmoid(self.fc4(x))

        return x
Example #14
 def forward(self, x):
     x = F.relu(self.conv1(x))      # 28x28x32 -> 26x26x32
     x = F.relu(self.conv2(x))      # 26x26x32 -> 24x24x64
     x = F.max_pool2d(x, 2) # 24x24x64 -> 12x12x64
     x = F.dropout(x, p=0.25, training=self.training)
     x = x.view(-1, 12*12*64)       # flatten 12x12x64 = 9216
     x = F.relu(self.fc1(x))        # fc 9216 -> 128
     x = F.dropout(x, p=0.5, training=self.training)
     x = self.fc2(x)                # fc 128 -> 10
     return F.log_softmax(x, dim=1) # to 10 logits
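For reference, a module definition consistent with the shape comments in this forward; layer names and sizes are read off the comments and may differ from the original repository:

import torch.nn as nn

class SmallConvNet(nn.Module):  # hypothetical name
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, kernel_size=3)   # 28x28x1  -> 26x26x32
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3)  # 26x26x32 -> 24x24x64
        self.fc1 = nn.Linear(12 * 12 * 64, 128)        # 9216 -> 128
        self.fc2 = nn.Linear(128, 10)                  # 128 -> 10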
Example #15
    def _forward_padded(self, x, x_mask):
        """Slower (significantly), but more precise,
        encoding that handles padding."""
        # Compute sorted sequence lengths
        lengths = x_mask.data.eq(0).long().sum(1).squeeze()
        _, idx_sort = torch.sort(lengths, dim=0, descending=True)
        _, idx_unsort = torch.sort(idx_sort, dim=0)

        lengths = list(lengths[idx_sort])
        idx_sort = Variable(idx_sort)
        idx_unsort = Variable(idx_unsort)

        # Sort x
        x = x.index_select(0, idx_sort)

        # Transpose batch and sequence dims
        x = x.transpose(0, 1)

        # Pack it up
        rnn_input = nn.utils.rnn.pack_padded_sequence(x, lengths)

        # Encode all layers
        outputs = [rnn_input]
        for i in range(self.num_layers):
            rnn_input = outputs[-1]

            # Apply dropout to input
            if self.dropout_rate > 0:
                dropout_input = F.dropout(rnn_input.data,
                                          p=self.dropout_rate,
                                          training=self.training)
                rnn_input = nn.utils.rnn.PackedSequence(dropout_input,
                                                        rnn_input.batch_sizes)
            outputs.append(self.rnns[i](rnn_input)[0])

        # Unpack everything
        for i, o in enumerate(outputs[1:], 1):
            outputs[i] = nn.utils.rnn.pad_packed_sequence(o)[0]

        # Concat hidden layers or take final
        if self.concat_layers:
            output = torch.cat(outputs[1:], 2)
        else:
            output = outputs[-1]

        # Transpose and unsort
        output = output.transpose(0, 1)
        output = output.index_select(0, idx_unsort)

        # Dropout on output layer
        if self.dropout_output and self.dropout_rate > 0:
            output = F.dropout(output,
                               p=self.dropout_rate,
                               training=self.training)
        return output
Example #16
 def forward(self, input, hidden): 
     # input is (sentence length, batch size) n,bs
     # hidden is ((n_layers,bs,hidden_size),(n_layers,bs,hidden_size))
     embeds = self.embedding(input) # n,bs,300
     # batch goes along the second dimension
     out = F.dropout(embeds,p=dropout_rate)
     out, hidden = self.lstm(out, hidden)
     out = F.dropout(out,p=dropout_rate)
     # apply the linear and the softmax
     out = self.linear(out) # n,bs,|V|
     #out = self.softmax(out)    # This was originally the output. (SG: I see this is LogSoftmax)
     return out, hidden
Example #17
    def _forward(self, input_tokens, positions, encoder_out):
        # split and transpose encoder outputs
        encoder_a, encoder_b = self._split_encoder_out(encoder_out)

        # embed tokens and positions
        x = self.embed_tokens(input_tokens) + self.embed_positions(positions)
        x = F.dropout(x, p=self.dropout, training=self.training)
        target_embedding = x

        # project to size of convolution
        x = self.fc1(x)

        # B x T x C -> T x B x C
        x = self._transpose_unless_incremental_eval(x)

        # temporal convolutions
        avg_attn_scores = None
        num_attn_layers = len(self.attention)
        for proj, conv, attention in zip(self.projections, self.convolutions, self.attention):
            residual = x if proj is None else proj(x)

            x = F.dropout(x, p=self.dropout, training=self.training)
            x = conv(x)
            x = conv.remove_future_timesteps(x)
            x = F.glu(x)

            # attention
            if attention is not None:
                x = self._transpose_unless_incremental_eval(x)

                x, attn_scores = attention(x, target_embedding, (encoder_a, encoder_b))
                attn_scores = attn_scores / num_attn_layers
                if avg_attn_scores is None:
                    avg_attn_scores = attn_scores
                else:
                    avg_attn_scores.add_(attn_scores)

                x = self._transpose_unless_incremental_eval(x)

            # residual
            x = (x + residual) * math.sqrt(0.5)

        # T x B x C -> B x T x C
        x = self._transpose_unless_incremental_eval(x)

        # project back to size of vocabulary
        x = self.fc2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = self.fc3(x)

        return x, avg_attn_scores
Example #18
    def forward(self, xs, hidden, encoder_output, attn_mask=None):
        xes = F.dropout(self.lt(xs), p=self.dropout, training=self.training)
        xes = self.attention(xes, hidden, encoder_output, attn_mask)
        output, new_hidden = self.rnn(xes, hidden)
        # TODO: add post-attention?
        # output = self.attention(output, new_hidden, encoder_output, attn_mask)

        e = F.dropout(self.o2e(output), p=self.dropout, training=self.training)
        scores = F.dropout(self.e2s(e), p=self.dropout, training=self.training)
        # select top scoring index, excluding the padding symbol (at idx zero)
        _max_score, idx = scores.narrow(2, 1, scores.size(2) - 1).max(2)
        preds = idx.add_(1)

        return preds, scores, new_hidden
Example #19
    def forward(self, x1, x2):
        x1 = F.dropout(F.relu(self.layer1_1(x1.view(-1, 784))), self.drop)
        x2 = F.dropout(F.relu(self.layer1_2(x2.view(-1, 784))), self.drop)

        x = F.dropout(F.relu(self.layer2(torch.cat((x1, x2), 1))), self.drop)
        x = F.dropout(F.relu(self.layer3(x)), self.drop)
        x = F.dropout(F.relu(self.layer4(x)), self.drop)

        out1 = F.relu(self.layer5_1(x))
        out1 = F.sigmoid(self.layer6_1(out1))
        out2 = F.relu(self.layer5_2(x))
        out2 = F.sigmoid(self.layer6_2(out2))

        return out1, out2
Example #20
    def forward(self, x, encoder_out, encoder_padding_mask, incremental_state):
        """
        Args:
            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
            encoder_padding_mask (ByteTensor): binary ByteTensor of shape
                `(batch, src_len)` where padding elements are indicated by ``1``.

        Returns:
            encoded output of shape `(seq_len, batch, embed_dim)`
        """
        residual = x
        x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True)
        x, _ = self.self_attn(
            query=x,
            key=x,
            value=x,
            mask_future_timesteps=True,
            incremental_state=incremental_state,
            need_weights=False,
        )
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True)

        attn = None
        if self.encoder_attn is not None:
            residual = x
            x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, before=True)
            x, attn = self.encoder_attn(
                query=x,
                key=encoder_out,
                value=encoder_out,
                key_padding_mask=encoder_padding_mask,
                incremental_state=incremental_state,
                static_kv=True,
                need_weights=(not self.training and self.need_attn),
            )
            x = F.dropout(x, p=self.dropout, training=self.training)
            x = residual + x
            x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, after=True)

        residual = x
        x = self.maybe_layer_norm(self.final_layer_norm, x, before=True)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, p=self.relu_dropout, training=self.training)
        x = self.fc2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.maybe_layer_norm(self.final_layer_norm, x, after=True)
        return x, attn
Example #21
    def forward(self, text_sequences, text_positions=None, lengths=None,
                speaker_embed=None):
        assert self.n_speakers == 1 or speaker_embed is not None

        # embed text_sequences
        x = self.embed_tokens(text_sequences.long())
        x = F.dropout(x, p=self.dropout, training=self.training)

        # expand speaker embedding for all time steps
        speaker_embed_btc = None

        input_embedding = x

        # B x T x C -> B x C x T
        x = x.transpose(1, 2)

        # 1D conv blocks
        for f in self.convolutions:
            x = f(x, speaker_embed_btc) if isinstance(f, Conv1dGLU) else f(x)

        # Back to B x T x C
        keys = x.transpose(1, 2)

        # scale gradients (this only affects backward, not forward)
        # add output to input embedding for attention
        values = (keys + input_embedding) * math.sqrt(0.5)

        return keys, values
Example #22
    def forward(self, emb, hidden):

        ques_feat, hidden = self.ques_rnn(emb, hidden)
        concat_feat = F.dropout(ques_feat[-1], self.d, training=self.training)
        encoder_feat = F.tanh(self.fc1(concat_feat))

        return encoder_feat, hidden
Example #23
    def forward(self, x):
        x = self.conv1(x)
        x = F.max_pool2d(x, 2) + F.avg_pool2d(x, 2)

        x = self.block1(x)
        x = self.group1(x)
        x = F.max_pool2d(x, 2) + F.avg_pool2d(x, 2)

        x = self.block2(x)
        x = self.group2(x)
        x = F.max_pool2d(x, 2) + F.avg_pool2d(x, 2)

        x = self.block3(x)
        x = self.group3(x)
        x = self.block4(x)
        x = self.group4(x)
        x = F.max_pool2d(x, 2) + F.avg_pool2d(x, 2)

        x = x.view(x.size(0), -1)
        fc = self.fc(x)
        x = F.dropout(fc, training=self.training)
        
        output = list()
        for name, fun in self.fc_dict.items():
            out = fun(x)
            output.append(out)

        return output, fc
Example #24
    def forward(self, X, posterior_mean = False):
        """
        Funciton call to generate the output, every time we call it, the dynamic graph is created.
        There can be difference between forward in training and test:
            - In dropout we do not zero neurons in test
            - In Variational Inference we dont randombly sample from the posterior
        
        We create the forward pass by performing operations between the input X (Nsam_batch, Ndim)
        and the parameters of the model that we should have initialized in the __init__
        """
        
        ## We need to sample from the posterior !! 
        self.sample_posterior(posterior_mean)
        
        o1 = self.linear1(X)
#        o1 = torch.mm(X, self.W1) + self.b1
#        print ("x shape: ", X.shape, "W1 shape: ", self.W1.shape, "b1 shape: ", self.b1.shape)
#        print ("o1 shape: ", o1.shape)
#        print ("W2 shape: ", self.W2.shape, "b2 shape: ", self.b2.shape)
        
        ## Apply non-linearity
        o1 = self.cf_a.activation_func(o1)
        o1 = F.dropout(o1,p = self.cf_a.dop, training = self.training)
        o2 = torch.mm(o1, self.W2) + self.b2
#        print ("o2 shape: ", o2.shape)
        return o2
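The docstring's remark that dropout does not zero neurons at test time is handled entirely by the `training` flag: calling `nn.Module.eval()` flips `self.training` to False, which makes `F.dropout(..., training=self.training)` a no-op. A minimal check:

import torch.nn as nn

m = nn.Module()
print(m.training)   # True by default
m.eval()
print(m.training)   # False: F.dropout(..., training=self.training) now passes inputs through unchanged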
Example #25
    def forward(self, xs):
        bsz = len(xs)

        # embed input tokens
        xes = F.dropout(self.lt(xs), p=self.dropout, training=self.training)
        x_lens = [x for x in torch.sum((xs > 0).int(), dim=1).data]
        xes_packed = pack_padded_sequence(xes, x_lens, batch_first=True)

        zeros = self.zeros(xs)
        if zeros.size(1) != bsz:
            zeros.resize_(self.layers * self.dirs, bsz, self.hsz).fill_(0)
        h0 = Variable(zeros, requires_grad=False)

        if type(self.rnn) == nn.LSTM:
            encoder_output_packed, hidden = self.rnn(xes_packed, (h0, h0))
            # take elementwise max between forward and backward hidden states
            hidden = (hidden[0].view(-1, self.dirs, bsz, self.hsz).max(1)[0],
                      hidden[1].view(-1, self.dirs, bsz, self.hsz).max(1)[0])
        else:
            encoder_output_packed, hidden = self.rnn(xes_packed, h0)

            # take elementwise max between forward and backward hidden states
            hidden = hidden.view(-1, self.dirs, bsz, self.hsz).max(1)[0]
        encoder_output, _ = pad_packed_sequence(encoder_output_packed,
                                                batch_first=True)
        return encoder_output, hidden
Example #26
 def forward(self, input_):
     emb_input = self._embedding(input_)
     conv_in = F.dropout(emb_input.transpose(1, 2),
                         self._dropout, training=self.training)
     output = torch.cat([F.relu(conv(conv_in)).max(dim=2)[0]
                         for conv in self._convs], dim=1)
     return output
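The list comprehension above is max-over-time pooling: each 1-D convolution is followed by a max over the sequence dimension and the per-filter maxima are concatenated. A small shape check under assumed sizes (300-dim embeddings, 100 filters of width 3):

import torch
import torch.nn as nn
import torch.nn.functional as F

conv_in = torch.randn(2, 300, 50)               # (batch, emb_dim, seq_len), as produced by transpose(1, 2) above
conv = nn.Conv1d(300, 100, kernel_size=3)
feat = F.relu(conv(conv_in)).max(dim=2)[0]      # max over time -> (batch, num_filters)
print(feat.shape)                               # torch.Size([2, 100])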
Example #27
    def forward(self, prev, hidden, enc_outs,
                out=None, enc_att=None, mask=None):
        """
        Parameters:
        -----------

        prev: torch.Tensor (batch x emb_dim),
            Previously decoded output.
        hidden: Used to seed the initial hidden state of the decoder.
            h_t: (enc_num_layers x batch x hid_dim)
            c_t: (enc_num_layers x batch x hid_dim)
        enc_outs: torch.Tensor (seq_len x batch x enc_hid_dim),
            Output of the encoder at the last layer for all encoding steps.
        """
        if self.add_prev:
            # include last out as input for the prediction of the next item
            inp = torch.cat([prev, out or self.init_output_for(hidden)], 1)
        else:
            inp = prev
        out, hidden = self.rnn_step(inp, hidden)
        # out (batch x hid_dim), att_weight (batch x seq_len)
        out, att_weight = self.attn(out, enc_outs, enc_att=enc_att, mask=mask)
        out = F.dropout(out, p=self.dropout, training=self.training)
        if self.has_maxout:
            out = self.maxout(torch.cat([out, prev], 1))
        return out, hidden, att_weight
Example #28
 def forward(self, x):
     x = self.features(x)
     x = x.view(x.size(0), -1)
     x = self.fc1(x)
     x = F.dropout(x, training=self.training)
     out = self.fc2(x)
     return out, x
Example #29
    def forward(self, s):
        #                                                           s: batch_size x board_x x board_y
        s = s.view(-1, 1, self.board_x, self.board_y)                # batch_size x 1 x board_x x board_y
        s = F.relu(self.bn1(self.conv1(s)))                          # batch_size x num_channels x board_x x board_y
        s = F.relu(self.bn2(self.conv2(s)))                          # batch_size x num_channels x board_x x board_y
        s = F.relu(self.bn3(self.conv3(s)))                          # batch_size x num_channels x (board_x-2) x (board_y-2)
        s = F.relu(self.bn4(self.conv4(s)))                          # batch_size x num_channels x (board_x-4) x (board_y-4)
        s = s.view(-1, self.args.num_channels*(self.board_x-4)*(self.board_y-4))

        s = F.dropout(F.relu(self.fc_bn1(self.fc1(s))), p=self.args.dropout, training=self.training)  # batch_size x 1024
        s = F.dropout(F.relu(self.fc_bn2(self.fc2(s))), p=self.args.dropout, training=self.training)  # batch_size x 512

        pi = self.fc3(s)                                                                         # batch_size x action_size
        v = self.fc4(s)                                                                          # batch_size x 1

        return F.log_softmax(pi, dim=1), F.tanh(v)
Example #30
    def forward(input, hidden, weight):
        assert(len(weight) == total_layers)
        next_hidden = []

        if lstm:
            hidden = list(zip(*hidden))

        for i in range(num_layers):
            all_output = []
            for j, inner in enumerate(inners):
                l = i * num_directions + j

                hy, output = inner(input, hidden[l], weight[l])
                next_hidden.append(hy)
                all_output.append(output)

            input = torch.cat(all_output, input.dim() - 1)

            if dropout != 0 and i < num_layers - 1:
                input = F.dropout(input, p=dropout, training=train, inplace=False)

        if lstm:
            next_h, next_c = zip(*next_hidden)
            next_hidden = (
                torch.cat(next_h, 0).view(total_layers, *next_h[0].size()),
                torch.cat(next_c, 0).view(total_layers, *next_c[0].size())
            )
        else:
            next_hidden = torch.cat(next_hidden, 0).view(
                total_layers, *next_hidden[0].size())

        return next_hidden, input
Example #31
 def logits(self, x):
     x = self.global_pool(x)
     if self.drop_rate > 0.:
         x = F.dropout(x, p=self.drop_rate, training=self.training)
     x = self.last_linear(x)
     return x
Example #32
 def forward(self, x, edge_index):
     x = F.relu(self.conv1(x, edge_index, None))
     x = F.dropout(x, training=self.training)
     x = self.conv2(x, edge_index, None)
     return F.log_softmax(x, dim=1)
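The `(x, edge_index, None)` call signature matches a graph convolution that takes optional edge weights, e.g. PyTorch Geometric's `GCNConv`. A sketch of a module this forward could belong to; the class name and channel sizes are assumptions:

import torch.nn as nn
from torch_geometric.nn import GCNConv  # assumed backend; any conv with forward(x, edge_index, edge_weight) fits

class TwoLayerGCN(nn.Module):  # hypothetical name
    def __init__(self, in_channels, hidden_channels, num_classes):
        super().__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, num_classes)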
Example #33
    def forward(self, emb_question, question_length, emb_support,
                support_length, unique_word_chars, unique_word_char_length,
                question_words2unique, support_words2unique, word_in_question,
                correct_start, answer2support, is_eval):
        """fast_qa model

        Args:
            emb_question: [Q, L_q, N]
            question_length: [Q]
            emb_support: [Q, L_s, N]
            support_length: [Q]
            unique_word_chars
            unique_word_char_length
            question_words2unique
            support_words2unique
            word_in_question: [Q, L_s]
            correct_start: [A], only during training, i.e., is_eval=False
            answer2support: [A], only during training, i.e., is_eval=False
            is_eval: []

        Returns:
            start_scores [B, L_s, N], end_scores [B, L_s, N], span_prediction [B, 2]
        """
        # Some helpers
        float_tensor = torch.cuda.FloatTensor if emb_question.is_cuda else torch.FloatTensor
        long_tensor = torch.cuda.LongTensor if emb_question.is_cuda else torch.LongTensor
        batch_size = question_length.data.shape[0]
        max_question_length = question_length.max().data[0]
        support_mask = misc.mask_for_lengths(support_length)
        question_binary_mask = misc.mask_for_lengths(question_length,
                                                     mask_right=False,
                                                     value=1.0)

        if self._with_char_embeddings:
            # compute combined embeddings
            [char_emb_question, char_emb_support] = self._conv_char_embedding(
                unique_word_chars, unique_word_char_length,
                [question_words2unique, support_words2unique])

            emb_question = torch.cat([emb_question, char_emb_question], 2)
            emb_support = torch.cat([emb_support, char_emb_support], 2)

        # compute encoder features
        question_features = torch.autograd.Variable(
            torch.ones(batch_size, max_question_length, 2, out=float_tensor()))
        question_features = question_features.type_as(emb_question)

        v_wiqw = self._v_wiq_w
        # [B, L_q, L_s]
        wiq_w = torch.matmul(emb_question * v_wiqw,
                             emb_support.transpose(1, 2))
        # [B, L_q, L_s]
        wiq_w = wiq_w + support_mask.unsqueeze(1)
        wiq_w = F.softmax(wiq_w.view(batch_size * max_question_length,
                                     -1)).view(batch_size, max_question_length,
                                               -1)
        # [B, L_s]
        wiq_w = torch.matmul(question_binary_mask.unsqueeze(1),
                             wiq_w).squeeze(1)

        # [B, L , 2]
        support_features = torch.stack([word_in_question, wiq_w], dim=2)

        if self._with_char_embeddings:
            # highway layer to allow for interaction between concatenated embeddings
            emb_question = self._embedding_projection(emb_question)
            emb_support = self._embedding_projection(emb_support)
            emb_question = self._embedding_highway(emb_question)
            emb_support = self._embedding_highway(emb_support)

        # dropout
        dropout = self._shared_resources.config.get("dropout", 0.0)
        emb_question = F.dropout(emb_question, dropout, training=not is_eval)
        emb_support = F.dropout(emb_support, dropout, training=not is_eval)

        # extend embeddings with features
        emb_question_ext = torch.cat([emb_question, question_features], 2)
        emb_support_ext = torch.cat([emb_support, support_features], 2)

        # encode question and support
        # [B, L, 2 * size]
        encoded_question = self._bilstm(emb_question_ext)[0]
        encoded_support = self._bilstm(emb_support_ext)[0]

        # [B, L, size]
        encoded_support = F.tanh(
            F.linear(encoded_support, self._support_projection))
        encoded_question = F.tanh(
            F.linear(encoded_question, self._question_projection))

        start_scores, end_scores, predicted_start_pointer, predicted_end_pointer = \
            self._answer_layer(encoded_question, question_length, encoded_support, support_length,
                               correct_start, answer2support, is_eval)

        # no multi paragraph support yet
        doc_idx = torch.autograd.Variable(
            torch.zeros(predicted_start_pointer.data.shape[0],
                        out=long_tensor()))
        span = torch.stack(
            [doc_idx, predicted_start_pointer, predicted_end_pointer], 1)

        return start_scores, end_scores, span
Example #34
 def forward(self, x):
     x = super(DropoutConv2d, self).forward(x)
     x = F.dropout(x, p=self.drop, training=True)
     return x
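Passing `training=True` unconditionally keeps dropout active even after `model.eval()`, the usual trick for Monte-Carlo dropout at inference time. A sketch of a wrapper class this forward would fit into; the `drop` constructor argument is an assumption:

import torch.nn as nn
import torch.nn.functional as F

class DropoutConv2d(nn.Conv2d):
    def __init__(self, *args, drop=0.5, **kwargs):
        super().__init__(*args, **kwargs)
        self.drop = drop

    def forward(self, x):
        x = super().forward(x)
        # training=True: dropout is applied in eval mode too (MC-dropout style)
        return F.dropout(x, p=self.drop, training=True)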
Example #35
 def test_dropout(self):
     x = torch.randn(3, 4, requires_grad=True)
     self.assertONNX(lambda x: torch.max(functional.dropout(x, training=False)), x)
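With `training=False` the dropout is an identity, so the exported graph reduces to the `max`; the same property can be checked directly:

import torch
import torch.nn.functional as F

x = torch.randn(3, 4)
assert torch.equal(F.dropout(x, training=False), x)  # eval-mode dropout returns the input values unchanged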
Example #36
 def forward(self, x):
     x = self.fc2(F.dropout(self.bn(self.fc1(x))))
     return x
Example #37
    def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None):
        """
        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for input feeding/teacher forcing
            encoder_out (Tensor, optional): output from the encoder, used for
                encoder-side attention
            incremental_state (dict): dictionary used for storing state during
                :ref:`Incremental decoding`

        Returns:
            tuple:
                - the last decoder layer's output of shape `(batch, tgt_len,
                  vocab)`
                - the last decoder layer's attention weights of shape `(batch,
                  tgt_len, src_len)`
        """
        # embed positions
        positions = self.embed_positions(
            prev_output_tokens,
            incremental_state=incremental_state,
        ) if self.embed_positions is not None else None

        if incremental_state is not None:
            prev_output_tokens = prev_output_tokens[:, -1:]
            if positions is not None:
                positions = positions[:, -1:]

        # map oov tokens to unk tokens
        prev_output_tokens = prev_output_tokens.masked_fill(
            prev_output_tokens >= self.embed_tokens.num_embeddings, self.dictionary.unk_index)

        # embed tokens and positions
        x = self.embed_scale * self.embed_tokens(prev_output_tokens)

        if self.project_in_dim is not None:
            x = self.project_in_dim(x)

        if positions is not None:
            x += positions
        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)
        attn = None

        inner_states = [x]

        # decoder layers
        for layer in self.layers:
            x, attn = layer(
                x,
                encoder_out['encoder_out'] if encoder_out is not None else None,
                encoder_out['encoder_padding_mask'] if encoder_out is not None else None,
                incremental_state,
                self_attn_mask=self.buffered_future_mask(x) if incremental_state is None else None,
            )
            inner_states.append(x)
        
        copy_attn, copy_alpha = None, None
        if self.copy_attention:
            assert encoder_out is not None, \
                "--copy-attn can't be used with decoder only architecture"
            x_copy, copy_attn = self.copy_attn_layer(
                query=x,
                key=encoder_out['encoder_out'],
                value=encoder_out['encoder_out'],
                key_padding_mask=encoder_out['encoder_padding_mask'],
                incremental_state=incremental_state,
                static_kv=True,
                need_weights=True,
            )
            x_copy = x_copy.transpose(0, 1)
            copy_alpha = torch.sigmoid(self.copy_alpha_linear(x_copy))
            attn = copy_attn # use copy attn for alignment


        if self.normalize:
            x = self.layer_norm(x) # todo: layer norm

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        if self.project_out_dim is not None:
            x = self.project_out_dim(x)

        if self.adaptive_softmax is None:
            # project back to size of vocabulary
            if self.share_input_output_embed:
                x = F.linear(x, self.embed_tokens.weight)
            else:
                x = F.linear(x, self.embed_out)
        
        return x, {'attn': attn, 'inner_states': inner_states, 
            'copy_attn': copy_attn, 'copy_alpha': copy_alpha, 'src_tokens': encoder_out['src_tokens']}
Example #38
    def forward(self, input_v, input_q):
        # input_v and input_q are both 2-D, (batchsize, d_v); unify the image and question input dimensions
        if input_v.dim() != input_q.dim() and input_v.dim() != 2:
            raise ValueError
        batch_size = input_v.size(0)

        # process the image and question embeddings separately
        # dropout --> linear (d_v 2048 / d_q 2400 -> 310) --> tanh
        if self.visual_embedding:
            x_v = F.dropout(input_v,
                            p=self.opt['dropout_v'],
                            training=self.training)
            x_v = self.linear_v(x_v)
            if 'activation_v' in self.opt:
                x_v = getattr(F, self.opt['activation_v'])(x_v)
        else:
            x_v = input_v

        if self.question_embedding:
            x_q = F.dropout(input_q,
                            p=self.opt['dropout_q'],
                            training=self.training)
            x_q = self.linear_q(x_q)
            if 'activation_q' in self.opt:
                x_q = getattr(F, self.opt['activation_q'])(x_q)
        else:
            x_q = input_q

        # Rank-R constraint: (in the paper) Z is expressed as the sum of R terms Z_r
        # (Z is then projected onto the prediction space y). The processed image and
        # question features are fused element-wise, then stacked and summed; the
        # resulting x_mm corresponds to Z in the paper.
        x_mm = []
        for i in range(self.opt['R']):  # compute the R projections independently and collect them in x_mm

            # process the image and question embeddings separately
            # dropout --> linear (310 -> 510) --> tanh

            x_hv = F.dropout(x_v,
                             p=self.opt['dropout_hv'],
                             training=self.training)
            x_hv = self.list_linear_hv[i](x_hv)  # size becomes 510 after the linear layer
            if 'activation_hv' in self.opt:  # tanh
                x_hv = getattr(F, self.opt['activation_hv'])(x_hv)

            x_hq = F.dropout(x_q,
                             p=self.opt['dropout_hq'],
                             training=self.training)
            x_hq = self.list_linear_hq[i](x_hq)
            if 'activation_hq' in self.opt:
                x_hq = getattr(F, self.opt['activation_hq'])(x_hq)

            # fuse with element-wise mul(); the size is unchanged, but there are R such terms
            x_mm.append(torch.mul(x_hq, x_hv))

        # x_mm is a list of R tensors, each of size (batchsize, 510)
        x_mm = torch.stack(x_mm, dim=1)  # stack the R terms along dim 1
        x_mm = x_mm.sum(1).view(
            batch_size, self.opt['dim_mm'])  # sum over dim 1, back to (batchsize, 510)

        if 'activation_mm' in self.opt:
            x_mm = getattr(F, self.opt['activation_mm'])(
                x_mm)  # activation_mm = softmax

        # This is the model output, used to predict the answer.
        return x_mm
Example #39
    def forward(self, src_tokens, src_lengths):
        """
        Args:
            src_tokens (LongTensor): tokens in the source language of shape
                `(batch, src_len)`
            src_lengths (LongTensor): lengths of each source sentence of shape
                `(batch)`

        Returns:
            dict:
                - **encoder_out** (tuple): a tuple with two elements, where the
                  first element is the last encoder layer's output and the
                  second element is the same quantity summed with the input
                  embedding (used for attention). The shape of both tensors is
                  `(batch, src_len, embed_dim)`.
                - **encoder_padding_mask** (ByteTensor): the positions of
                  padding elements of shape `(batch, src_len)`
        """
        # embed tokens and positions
        x = self.embed_tokens(src_tokens) + self.embed_positions(src_tokens)
        x = F.dropout(x, p=self.dropout, training=self.training)
        input_embedding = x

        # project to size of convolution
        x = self.fc1(x)

        # used to mask padding in input
        encoder_padding_mask = src_tokens.eq(self.padding_idx).t()  # -> T x B
        if not encoder_padding_mask.any():
            encoder_padding_mask = None

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        residuals = [x]
        # temporal convolutions
        for proj, conv, res_layer in zip(self.projections, self.convolutions,
                                         self.residuals):
            if res_layer > 0:
                residual = residuals[-res_layer]
                residual = residual if proj is None else proj(residual)
            else:
                residual = None

            if encoder_padding_mask is not None:
                x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

            x = F.dropout(x, p=self.dropout, training=self.training)
            if conv.kernel_size[0] % 2 == 1:
                # padding is implicit in the conv
                x = conv(x)
            else:
                padding_l = (conv.kernel_size[0] - 1) // 2
                padding_r = conv.kernel_size[0] // 2
                x = F.pad(x, (0, 0, 0, 0, padding_l, padding_r))
                x = conv(x)
            x = F.glu(x, dim=2)

            if residual is not None:
                x = (x + residual) * math.sqrt(0.5)
            residuals.append(x)

        # T x B x C -> B x T x C
        x = x.transpose(1, 0)

        # project back to size of embedding
        x = self.fc2(x)

        if encoder_padding_mask is not None:
            encoder_padding_mask = encoder_padding_mask.t()  # -> B x T
            x = x.masked_fill(encoder_padding_mask.unsqueeze(-1), 0)

        # scale gradients (this only affects backward, not forward)
        x = GradMultiply.apply(x, 1.0 / (2.0 * self.num_attention_layers))

        # add output to input embedding for attention
        y = (x + input_embedding) * math.sqrt(0.5)

        return {
            'encoder_out': (x, y),
            'encoder_padding_mask': encoder_padding_mask,  # B x T
        }
Example #40
 def forward(self, x):
     x = self.fc2(x)
     x = F.relu(x)
     x = F.dropout(x)
     x = self.fc3(x)
     return F.log_softmax(x)
Example #41
    def forward(self,
                query,
                key,
                value,
                key_padding_mask=None,
                incremental_state=None,
                need_weights=True,
                static_kv=False,
                self_attn_mask=None,
                ngram_mask_matrix=None,
                i_buckets_main_stream=None,
                i_bucket_relative_stream=None,
                real_positions=None):
        """Input shape: Time x Batch x Channel

        Timesteps can be masked by supplying a T x T mask in the
        `attn_mask` argument. Padding elements can be excluded from
        the key by passing a binary ByteTensor (`key_padding_mask`) with shape:
        batch x src_len, where padding elements are indicated by 1s.
        """

        tgt_len, bsz, embed_dim = query.size()
        assert embed_dim == self.embed_dim
        assert list(query.size()) == [tgt_len, bsz, embed_dim]

        if incremental_state is not None:
            saved_state = self._get_input_buffer(incremental_state)
            if 'prev_key' in saved_state:
                # previous time steps are cached - no need to recompute
                # key and value if they are static
                if static_kv:
                    assert (self.encoder_decoder_attention
                            and not self.self_attention)
                    key = value = None
        else:
            saved_state = None

        q, k, v = self.in_proj_qkv(query)
        q *= self.scaling

        if self.bias_k is not None:
            assert self.bias_v is not None
            k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
            v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
        q = q.contiguous().view(tgt_len, bsz * self.num_heads,
                                self.head_dim).transpose(0, 1)
        if k is not None:
            k = k.contiguous().view(-1, bsz * self.num_heads,
                                    self.head_dim).transpose(0, 1)
        if v is not None:
            v = v.contiguous().view(-1, bsz * self.num_heads,
                                    self.head_dim).transpose(0, 1)

        # input hidden states
        h_list = query.chunk(1 + self.ngram, dim=0)

        q_list = q.chunk(1 + self.ngram, dim=1)
        k_list = k.chunk(1 + self.ngram, dim=1)
        v_list = v.chunk(1 + self.ngram, dim=1)

        h_main, h_predict_list = h_list[0], h_list[1:]
        q_main, q_predict_list = q_list[0], q_list[1:]
        k_main, k_predict_list = k_list[0], k_list[1:]
        v_main, v_predict_list = v_list[0], v_list[1:]

        if saved_state is not None:
            # saved states are stored with shape (bsz, num_heads, seq_len,
            # head_dim)
            if 'prev_key' in saved_state:
                prev_key = saved_state['prev_key'].view(
                    bsz * self.num_heads, -1, self.head_dim)
                if static_kv:
                    assert False, 'static_kv not supported in ngram decoder'
                    k = prev_key
                else:
                    k_main = torch.cat((prev_key, k_main), dim=1)
            if 'prev_value' in saved_state:
                prev_value = saved_state['prev_value'].view(
                    bsz * self.num_heads, -1, self.head_dim)
                if static_kv:
                    v = prev_value
                else:
                    v_main = torch.cat((prev_value, v_main), dim=1)
            saved_state['prev_key'] = k_main.view(bsz, self.num_heads, -1,
                                                  self.head_dim)
            saved_state['prev_value'] = v_main.view(bsz, self.num_heads, -1,
                                                    self.head_dim)

            self._set_input_buffer(incremental_state, saved_state)

        real_tgt_len = tgt_len // (1 + self.ngram)

        attn_weights_main = torch.bmm(q_main, k_main.transpose(1, 2))

        main_relative_logits = self.main_stream_relative_logits(
            h_main, attn_weights_main, real_positions, i_buckets_main_stream)
        attn_weights_main = attn_weights_main + main_relative_logits

        if self_attn_mask is not None:
            self_attn_mask = self_attn_mask.unsqueeze(0)
            attn_weights_main = attn_weights_main + self_attn_mask

        attn_weights_main = utils.softmax(
            attn_weights_main,
            dim=-1,
            onnx_trace=self.onnx_trace,
        ).type_as(attn_weights_main)
        attn_weights_main = F.dropout(attn_weights_main,
                                      p=self.dropout,
                                      training=self.training)

        attn_main = torch.bmm(attn_weights_main, v_main)
        attn_main = attn_main.transpose(0, 1).contiguous().view(
            1, real_tgt_len, bsz, embed_dim)
        attn_main = self.out_proj(attn_main)

        # [ngram, B*head, T, c]
        q_ngram = torch.cat(q_predict_list,
                            0).view(self.ngram, -1, real_tgt_len,
                                    self.head_dim)
        # [ngram, B*head, 2*T, c]
        k_ngram = torch.cat([
            torch.cat([k_main, k_p], 1).unsqueeze(0) for k_p in k_predict_list
        ], 0)

        # below code slower than above for loop
        # k_ngram = torch.cat([k_main.unsqueeze(0).repeat(self.ngram, 1, 1, 1),
        #                      torch.cat(k_predict_list).view(
        #                          self.ngram, -1, real_tgt_len, self.head_dim)
        #                     ], 2)

        # [ngram, T, B, C]
        h_ngram = torch.cat(h_predict_list, 0).view(self.ngram, real_tgt_len,
                                                    bsz, embed_dim)

        # [ngram, B*head, 2*T, c]
        v_ngram = torch.cat([
            torch.cat([v_main, v_p], 1).unsqueeze(0) for v_p in v_predict_list
        ], 0)
        # below code slower than above for loop
        # v_ngram = torch.cat([v_main.unsqueeze(0).repeat(self.ngram, 1, 1, 1),
        #                      torch.cat(v_predict_list).view(self.ngram, -1,
        #                      real_tgt_len, self.head_dim)], 2)

        # [ngram, B*head, T, 2*T]
        attn_weights_ngram = torch.einsum('nbtc,nbsc->nbts',
                                          (q_ngram, k_ngram))

        # [ngram, B*head, T, S]
        predict_relative_logits = self.ngram_relative_logits(
            h_ngram, attn_weights_ngram, real_positions,
            i_bucket_relative_stream)
        # [ngram, B*head, T, 2*T]
        attn_weights_ngram = attn_weights_ngram + predict_relative_logits

        if ngram_mask_matrix is not None:
            ngram_mask_matrix = ngram_mask_matrix.unsqueeze(1)
            attn_weights_ngram = attn_weights_ngram + ngram_mask_matrix

        attn_weights_ngram = utils.softmax(
            attn_weights_ngram,
            dim=-1,
            onnx_trace=self.onnx_trace,
        ).type_as(attn_weights_ngram)
        attn_weights_ngram = F.dropout(attn_weights_ngram,
                                       p=self.dropout,
                                       training=self.training)

        # [ngram, B*head, T, c]
        attn_ngram = torch.einsum('nbts,nbsc->nbtc',
                                  (attn_weights_ngram, v_ngram))
        # [ngram, T, B, C]
        attn_ngram = attn_ngram.transpose(1, 2).contiguous().view(
            self.ngram, real_tgt_len, bsz, embed_dim)
        attn_ngram = self.out_proj(attn_ngram)

        attn_result = []
        attn_result.append(attn_main)
        attn_result.append(attn_ngram)

        # [1+ngram*T, B, C]
        attn = torch.cat(attn_result, 0).view(-1, bsz, embed_dim)
        return attn, None
Example #42
    def fwd(self, x, lengths, causal, src_enc=None, src_len=None, positions=None, langs=None, cache=None):
        """
        Inputs:
            `x` LongTensor(slen, bs), containing word indices
            `lengths` LongTensor(bs), containing the length of each sentence
            `causal` Boolean, if True, the attention is only done over previous hidden states
            `positions` LongTensor(slen, bs), containing word positions
            `langs` LongTensor(slen, bs), containing language IDs
        """
        # lengths = (x != self.pad_index).float().sum(dim=1)
        # mask = x != self.pad_index

        # check inputs
        slen, bs = x.size()
        assert lengths.size(0) == bs
        assert lengths.max().item() <= slen
        x = x.transpose(0, 1)  # batch size as dimension 0
        assert (src_enc is None) == (src_len is None)
        if src_enc is not None:
            assert self.is_decoder
            assert src_enc.size(0) == bs

        # generate masks
        mask, attn_mask = get_masks(slen, lengths, causal)
        if self.is_decoder and src_enc is not None:
            src_mask = torch.arange(src_len.max(), dtype=torch.long, device=lengths.device) < src_len[:, None]

        # positions
        if positions is None:
            positions = x.new(slen).long()
            positions = torch.arange(slen, out=positions).unsqueeze(0)
        else:
            assert positions.size() == (slen, bs)
            positions = positions.transpose(0, 1)

        # langs
        if langs is not None:
            assert langs.size() == (slen, bs)
            langs = langs.transpose(0, 1)

        # do not recompute cached elements
        if cache is not None:
            _slen = slen - cache['slen']
            x = x[:, -_slen:]
            positions = positions[:, -_slen:]
            if langs is not None:
                langs = langs[:, -_slen:]
            mask = mask[:, -_slen:]
            attn_mask = attn_mask[:, -_slen:]

        # embeddings
        tensor = self.embeddings(x)
        tensor = tensor + self.position_embeddings(positions).expand_as(tensor)
        if langs is not None and self.use_lang_emb:
            tensor = tensor + self.lang_embeddings(langs)
        tensor = self.layer_norm_emb(tensor)
        tensor = F.dropout(tensor, p=self.dropout, training=self.training)
        tensor *= mask.unsqueeze(-1).to(tensor.dtype)

        # transformer layers
        for i in range(self.n_layers):

            # self attention
            attn = self.attentions[i](tensor, attn_mask, cache=cache)
            attn = F.dropout(attn, p=self.dropout, training=self.training)
            tensor = tensor + attn
            tensor = self.layer_norm1[i](tensor)

            # encoder attention (for decoder only)
            if self.is_decoder and src_enc is not None:
                attn = self.encoder_attn[i](tensor, src_mask, kv=src_enc, cache=cache)
                attn = F.dropout(attn, p=self.dropout, training=self.training)
                tensor = tensor + attn
                tensor = self.layer_norm15[i](tensor)

            # FFN
            if ('%i_in' % i) in self.memories:
                tensor = tensor + self.memories['%i_in' % i](tensor)
            else:
                tensor = tensor + self.ffns[i](tensor)
            tensor = self.layer_norm2[i](tensor)

            # memory
            if ('%i_after' % i) in self.memories:
                tensor = tensor + self.memories['%i_after' % i](tensor)
            # TODO: add extra layer norm here?

            tensor *= mask.unsqueeze(-1).to(tensor.dtype)

        # update cache length
        if cache is not None:
            cache['slen'] += tensor.size(1)

        # move back sequence length to dimension 0
        tensor = tensor.transpose(0, 1)

        return tensor
Example #43
 def forward(self, x):
     x = F.dropout(F.relu(self.bn1(self.fc1(x))), training=self.training)
     x = F.dropout(F.relu(self.bn2(self.fc2(x))), training=self.training)
     x = self.fc3(x)
     return x
Example #44
    def forward(self, x):
        N=self.N
        conv1=self.conv1_1(x)
        conv1=F.relu(conv1,inplace=True)
        conv1=self.conv1_2(conv1)
        conv1=F.relu(conv1,inplace=True)
        pool1=F.max_pool2d(conv1,(2,2))

        conv2=self.conv2_1(pool1)
        conv2=F.relu(conv2,inplace=True)
        conv2=self.conv2_2(conv2)
        conv2=F.relu(conv2,inplace=True)
        pool2=F.max_pool2d(conv2,(2,2))

        conv3=self.conv3_1(pool2)
        conv3=F.relu(conv3,inplace=True)
        conv3=self.conv3_2(conv3)
        conv3=F.relu(conv3,inplace=True)
        conv3=F.dropout(conv3,p=0.5, training=self.training)
        pool3=F.max_pool2d(conv3,(2,2))

        #D1
        conv4_1=self.conv4_1_1(pool3)
        conv4_1=F.relu(conv4_1,inplace=True)
        conv4_1=self.conv4_1_2(conv4_1)
        conv4_1=F.relu(conv4_1,inplace=True)
        conv4_1=F.dropout(conv4_1,p=0.5,training=self.training)

        #D2
        conv4_2=self.conv4_2_1(conv4_1)
        conv4_2=F.relu(conv4_2,inplace=True)
        conv4_2=self.conv4_2_2(conv4_2)
        conv4_2=F.relu(conv4_2,inplace=True)
        conv4_2=F.dropout(conv4_2,p=0.5,training=self.training)

        #D3
        merge_dense=torch.cat([conv4_2,conv4_1],dim=1)
        conv4_3=self.conv4_3_1(merge_dense)
        conv4_3=F.relu(conv4_3,inplace=True)
        conv4_3=self.conv4_3_2(conv4_3)
        conv4_3=F.relu(conv4_3,inplace=True)
        conv4_3=F.dropout(conv4_3,p=0.5,training=self.training)

        up6=self.upconv1(conv4_3)
        up6=self.upconv1_conv(up6)
        up6=self.BN1(up6)
        up6=F.relu(up6,inplace=True)
        x1=torch.reshape(conv3,(1,-1,256,int(N/4),int(N/4)))
        x2 = torch.reshape(up6, (1,-1, 256, int(N/4), int(N/4)))
        merge6=torch.cat([x2,x1],dim=0)
        merge6 = self.convlstm1(merge6)
        conv6=self.conv6_1(merge6)
        conv6=F.relu(conv6,inplace=True)
        conv6=self.conv6_2(conv6)
        conv6=F.relu(conv6,inplace=True)

        up7=self.upconv2(conv6)
        up7=self.upconv2_conv(up7)
        up7=self.BN2(up7)
        up7=F.relu(up7,inplace=True)

        x1 = torch.reshape(conv2, (1, -1, 128, int(N / 2), int(N / 2)))
        x2 = torch.reshape(up7, (1, -1, 128, int(N / 2), int(N / 2)))
        merge7 = torch.cat([x2, x1], dim=0)
        merge7 =self.convlstm2(merge7)

        conv7 = self.conv7_1(merge7)
        conv7 = F.relu(conv7,inplace=True)
        conv7 = self.conv7_2(conv7)
        conv7 = F.relu(conv7,inplace=True)

        up8 = self.upconv3(conv7)
        up8=self.upconv3_conv(up8)
        up8 = self.BN3(up8)
        up8 = F.relu(up8,inplace=True)

        x1 = torch.reshape(conv1, (1, -1, 64, N, N))
        x2 = torch.reshape(up8, (1, -1, 64, N, N))
        merge8 = torch.cat([x2, x1], dim=0)
        merge8 = self.convlstm3(merge8)

        conv8=self.conv8_1(merge8)
        conv8=F.relu(conv8,inplace=True)
        conv8 = self.conv8_2(conv8)
        conv8 = F.relu(conv8,inplace=True)
        conv8 = self.conv8_3(conv8)
        conv8 = F.relu(conv8,inplace=True)
        conv9=self.conv9(conv8)
        conv9=torch.sigmoid(conv9)

        return conv9
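
The encoder path above applies plain F.dropout to 4-D conv feature maps, which zeroes individual activations. An alternative (not used here) is channel-wise F.dropout2d, which drops entire feature maps and is often preferred for spatially correlated activations; a small illustrative comparison:

import torch
import torch.nn.functional as F

feat = torch.randn(2, 64, 32, 32)  # (batch, channels, H, W)

# element-wise dropout: each activation is zeroed independently
elem = F.dropout(feat, p=0.5, training=True)

# channel-wise dropout: whole (H, W) feature maps are zeroed together
chan = F.dropout2d(feat, p=0.5, training=True)

print((elem == 0).float().mean())                        # roughly half of all elements
print((chan.abs().sum(dim=(2, 3)) == 0).float().mean())  # roughly half of the channels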
    def forward(self, x, encoder_out, encoder_padding_mask, incremental_state,
                prev_self_attn_state=None, prev_attn_state=None, self_attn_mask=None,
                self_attn_padding_mask=None):
        """
        Args:
            x (Tensor): input to the layer of shape `(seq_len, batch, embed_dim)`
            encoder_padding_mask (ByteTensor): binary ByteTensor of shape
                `(batch, src_len)` where padding elements are indicated by ``1``.

        Returns:
            encoded output of shape `(seq_len, batch, embed_dim)`
        """
        residual = x
        x = self.maybe_layer_norm(self.self_attn_layer_norm, x, before=True)
        if prev_self_attn_state is not None:
            if incremental_state is None:
                incremental_state = {}
            prev_key, prev_value = prev_self_attn_state
            saved_state = {"prev_key": prev_key, "prev_value": prev_value}
            self.self_attn._set_input_buffer(incremental_state, saved_state)
        x, _ = self.self_attn(
            query=x,
            key=x,
            value=x,
            key_padding_mask=self_attn_padding_mask,
            incremental_state=incremental_state,
            need_weights=False,
            attn_mask=self_attn_mask,
        )
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.maybe_layer_norm(self.self_attn_layer_norm, x, after=True)

        attn = None
        if self.encoder_attn is not None:
            residual = x
            x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, before=True)
            if prev_attn_state is not None:
                if incremental_state is None:
                    incremental_state = {}
                prev_key, prev_value = prev_attn_state
                saved_state = {"prev_key": prev_key, "prev_value": prev_value}
                self.encoder_attn._set_input_buffer(incremental_state, saved_state)
            x, attn = self.encoder_attn(
                query=x,
                key=encoder_out,
                value=encoder_out,
                key_padding_mask=encoder_padding_mask,
                incremental_state=incremental_state,
                static_kv=True,
                need_weights=(not self.training and self.need_attn),
            )
            x = F.dropout(x, p=self.dropout, training=self.training)
            x = residual + x
            x = self.maybe_layer_norm(self.encoder_attn_layer_norm, x, after=True)

        residual = x
        x = self.maybe_layer_norm(self.final_layer_norm, x, before=True)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, p=self.relu_dropout, training=self.training)
        x = self.fc2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        x = residual + x
        x = self.maybe_layer_norm(self.final_layer_norm, x, after=True)
        if self.onnx_trace:
            saved_state = self.self_attn._get_input_buffer(incremental_state)
            self_attn_state = saved_state["prev_key"], saved_state["prev_value"]
            return x, attn, self_attn_state
        return x, attn
Example #46
0
 def forward(self, input):
     x = self.lin1(input)
     x = self.act(x)
     x = self.lin2(x)
     x = F.dropout(x, p=self.dropout, training=self.training)
     return x
Example #47
0
 def forward(self, x):
     for linear in self.layers:
         x = F.dropout(F.relu(linear(x)), p=0.5, training=True)
     return x
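
Note that Example #47 passes training=True unconditionally, so dropout stays active even when the module is in eval() mode. That is exactly the ingredient needed for Monte Carlo dropout, where predictions are averaged over several stochastic forward passes; a hedged sketch of how such a model could be used (the model handle and sample count are assumptions):

import torch

@torch.no_grad()
def mc_dropout_predict(model, x, num_samples=20):
    """Average predictions over stochastic passes; dropout must stay active in the model."""
    preds = torch.stack([model(x) for _ in range(num_samples)], dim=0)
    return preds.mean(dim=0), preds.std(dim=0)  # mean prediction and a rough uncertainty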
    def forward(self,
                query,
                key,
                value,
                mask_future_timesteps=False,
                key_padding_mask=None,
                incremental_state=None,
                need_weights=True,
                static_kv=False):
        """Input shape: Time x Batch x Channel

        Self-attention can be implemented by passing in the same arguments for
        query, key and value. Future timesteps can be masked with the
        `mask_future_timesteps` argument. Padding elements can be excluded from
        the key by passing a binary ByteTensor (`key_padding_mask`) with shape:
        batch x src_len, where padding elements are indicated by 1s.
        """

        qkv_same = query.data_ptr() == key.data_ptr() == value.data_ptr()
        kv_same = key.data_ptr() == value.data_ptr()

        tgt_len, bsz, embed_dim = query.size()
        if embed_dim != self.embed_dim:
            print("| x: {}, multi_head: {}".format(embed_dim, self.embed_dim))
        assert embed_dim == self.embed_dim
        assert list(query.size()) == [tgt_len, bsz, embed_dim]
        assert key.size() == value.size()

        if incremental_state is not None:
            saved_state = self._get_input_buffer(incremental_state)
            if 'prev_key' in saved_state:
                # previous time steps are cached - no need to recompute
                # key and value if they are static
                if static_kv:
                    assert kv_same and not qkv_same
                    key = value = None
        else:
            saved_state = None

        if qkv_same:
            # self-attention
            q, k, v = self.in_proj_qkv(query)
        elif kv_same:
            # encoder-decoder attention
            q = self.in_proj_q(query)
            if key is None:
                assert value is None
                # this will allow us to concatenate it with the previous
                # value and just get the previous value
                k = v = q.new(0)
            else:
                k, v = self.in_proj_kv(key)
        else:
            q = self.in_proj_q(query)
            k = self.in_proj_k(key)
            v = self.in_proj_v(value)
        q *= self.scaling

        if saved_state is not None:
            if 'prev_key' in saved_state:
                k = torch.cat((saved_state['prev_key'], k), dim=0)
            if 'prev_value' in saved_state:
                v = torch.cat((saved_state['prev_value'], v), dim=0)
            saved_state['prev_key'] = k
            saved_state['prev_value'] = v
            self._set_input_buffer(incremental_state, saved_state)

        src_len = k.size(0)

        if key_padding_mask is not None:
            assert key_padding_mask.size(0) == bsz
            assert key_padding_mask.size(1) == src_len

        q = q.contiguous().view(tgt_len, bsz * self.num_heads,
                                self.head_dim).transpose(0, 1)
        k = k.contiguous().view(src_len, bsz * self.num_heads,
                                self.head_dim).transpose(0, 1)
        v = v.contiguous().view(src_len, bsz * self.num_heads,
                                self.head_dim).transpose(0, 1)

        attn_weights = torch.bmm(q, k.transpose(1, 2))
        assert list(
            attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

        # only apply masking at training time (when incremental state is None)
        if mask_future_timesteps and incremental_state is None:
            assert query.size() == key.size(), \
                'mask_future_timesteps only applies to self-attention'
            attn_weights += self.buffered_mask(attn_weights).unsqueeze(0)
        if key_padding_mask is not None:
            # don't attend to padding symbols
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len,
                                             src_len)
            attn_weights = attn_weights.float().masked_fill(
                key_padding_mask.unsqueeze(1).unsqueeze(2),
                float('-inf'),
            ).type_as(attn_weights)  # FP16 support: cast to float and back
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len,
                                             src_len)

        # Local attention
        pos_diff = torch.abs(
            torch.arange(tgt_len).unsqueeze(1) -
            torch.arange(src_len).unsqueeze(0)).float().cuda()
        variance = self.vars if hasattr(self, 'vars') else None
        local_mask = self.penalty(pos_diff, bsz, variance)
        attn_weights = attn_weights - local_mask

        attn_weights = F.softmax(attn_weights.float(),
                                 dim=-1).type_as(attn_weights)
        attn_weights = F.dropout(attn_weights,
                                 p=self.dropout,
                                 training=self.training)

        attn = torch.bmm(attn_weights, v)
        assert list(
            attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]
        attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim)
        attn = self.out_proj(attn)

        if need_weights:
            # average attention weights over heads
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len,
                                             src_len)
            attn_weights = attn_weights.sum(dim=1) / self.num_heads
        else:
            attn_weights = None

        return attn, attn_weights
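
The self.penalty call above is not shown in this snippet; one plausible form of such a locality penalty (a guess at the intent, not the actual implementation) is a Gaussian-style term that grows with the query-key distance and is subtracted from the attention logits before the softmax:

import torch

def gaussian_locality_penalty(pos_diff, bsz, num_heads, variance=4.0):
    """Hypothetical distance penalty: larger |i - j| means a larger subtraction from the logits."""
    penalty = pos_diff.pow(2) / (2.0 * variance)                 # (tgt_len, src_len)
    return penalty.unsqueeze(0).expand(bsz * num_heads, -1, -1)  # match attn_weights shape

Subtracting such a term before F.softmax down-weights distant positions, which matches how local_mask is applied to attn_weights above.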
Example #49
0
 def forward(self, x):
     x = self.conv1(x)
     # add_scalar
     x = x + 3
     # mul_scalar
     x = x * 3
     # add_scalar_out
     x += 3
     # mul_scalar_out
     x *= 3
     # add_scalar_relu
     x = x + 3
     x = F.relu(x)
     # add_scalar_relu_out
     x += 3
     x = F.relu(x)
     # mul_scalar_relu
     x = x * 3
     x = F.relu(x)
     # mul_scalar_relu_out
     x *= 3
     x = F.relu(x)
     x = self.maxpool1d(x)
     x = self.maxpool2d(x)
     x = self.maxpool3d(x)
     x = torch.flatten(x)
     x = torch.max(x)
     x = torch.min(x)
     x = x.reshape([-1])
     x = x.resize_(1, 1, x.numel())
     x = x.view(-1)
     # prim::ListConstruct
     xs = [x, x]
     # prim::ListUnpack
     x, y = xs
     # prim::TupleConstruct
     xs = (x, x)
     # prim::TupleUnpack
     x, y = xs
     x = x.transpose(1, 2)
     x = x.contiguous()
     x, y = torch.chunk(x, 2)
     x = F.dropout(x)
     x = self.dropout(x)
     x, _ = torch.sort(x)
     x = x.permute(0, 2, 3, 1)
     x = x.repeat_interleave(3, 1)
     x = torch.repeat_interleave(x, 3, 1)
     x = self.relu(x)
     x = F.relu(x)
     x = F.relu(x, inplace=True)
     x = x.relu()
     x.relu_()
     x = x.squeeze(0)
     x.squeeze_(0)
     x = torch.squeeze(x, 0)
     x = x.unsqueeze(0)
     x.unsqueeze_(0)
     x = torch.unsqueeze(x, 0)
     x = x.detach()
     x.detach_()
     x = x.repeat(4, 2)
     y = []
     y.append(x)
     z = torch.stack(y, 0)
     z = [z, z]
     x, _ = z
     x = self.conv2(x)
     return x
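
Example #49 calls F.dropout(x) with no keyword arguments. The functional form defaults to p=0.5 and, importantly, training=True, so it keeps dropping activations even when the surrounding module is in eval() mode; the module form nn.Dropout (presumably what self.dropout is above) respects eval() automatically. A quick check:

import torch
import torch.nn as nn
import torch.nn.functional as F

x = torch.ones(1000)

print((F.dropout(x) == 0).float().mean())  # ~0.5: functional default is always-on dropout

drop = nn.Dropout(p=0.5)
drop.eval()
print((drop(x) == 0).float().mean())       # 0.0: the module form is a no-op in eval mode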
Example #50
0
 def forward(self, x):
     if self.transform_input:
         x_ch0 = torch.unsqueeze(x[:, 0],
                                 1) * (0.229 / 0.5) + (0.485 - 0.5) / 0.5
         x_ch1 = torch.unsqueeze(x[:, 1],
                                 1) * (0.224 / 0.5) + (0.456 - 0.5) / 0.5
         x_ch2 = torch.unsqueeze(x[:, 2],
                                 1) * (0.225 / 0.5) + (0.406 - 0.5) / 0.5
         x = torch.cat((x_ch0, x_ch1, x_ch2), 1)
     # N x 3 x 299 x 299
     x = self.Conv2d_1a_3x3(x)
     # N x 32 x 149 x 149
     x = self.Conv2d_2a_3x3(x)
     # N x 32 x 147 x 147
     x = self.Conv2d_2b_3x3(x)
     # N x 64 x 147 x 147
     x = self.MaxPool2b(x)
     # N x 64 x 73 x 73
     x = self.Conv2d_3b_1x1(x)
     # N x 80 x 73 x 73
     x = self.Conv2d_4a_3x3(x)
     # N x 192 x 71 x 71
     x = self.MaxPool4a(x)
     # N x 192 x 35 x 35
     x = self.Mixed_5b(x)
     # N x 256 x 35 x 35
     x = self.Mixed_5c(x)
     # N x 288 x 35 x 35
     x = self.Mixed_5d(x)
     # N x 288 x 35 x 35
     x = self.Mixed_6a(x)
     # N x 768 x 17 x 17
     x = self.Mixed_6b(x)
     # N x 768 x 17 x 17
     x = self.Mixed_6c(x)
     # N x 768 x 17 x 17
     x = self.Mixed_6d(x)
     # N x 768 x 17 x 17
     x = self.Mixed_6e(x)
     # N x 768 x 17 x 17
     aux_defined = self.training and self.aux_logits
     if aux_defined:
         aux = self.AuxLogits(x)
     else:
         aux = None
     # N x 768 x 17 x 17
     x = self.Mixed_7a(x)
     # N x 1280 x 8 x 8
     x = self.Mixed_7b(x)
     # N x 2048 x 8 x 8
     x = self.Mixed_7c(x)
     # N x 2048 x 8 x 8
     # Adaptive average pooling
     x = self.AvgPool(x)
     # N x 2048 x 1 x 1
     x = F.dropout(x, training=self.training)
     # N x 2048 x 1 x 1
     x = torch.flatten(x, 1)
     # N x 2048
     x = self.fc(x)
     # N x 1000 (num_classes)
     return self.eager_outputs(x, aux)
Example #51
0
 def forward(self, x):
     out = F.relu(self.fc1(x))
     out = F.dropout(out, self.dropout)
     out = self.fc2(out)
     return out
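
Example #51 passes the probability positionally (F.dropout(out, self.dropout)) but omits training=self.training, so dropout also fires at inference time. Unless that is intentional (as in the Monte Carlo case above), the usual fix is to forward the module's training flag; a corrected sketch under that assumption:

    def forward(self, x):
        out = F.relu(self.fc1(x))
        out = F.dropout(out, p=self.dropout, training=self.training)
        out = self.fc2(out)
        return out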
    def forward(self,
                query,
                key,
                value,
                key_padding_mask=None,
                incremental_state=None,
                need_weights=True,
                static_kv=False,
                attn_mask=None):
        """Input shape: Time x Batch x Channel

        Self-attention can be implemented by passing in the same arguments for
        query, key and value. Timesteps can be masked by supplying a T x T mask in the
        `attn_mask` argument. Padding elements can be excluded from
        the key by passing a binary ByteTensor (`key_padding_mask`) with shape:
        batch x src_len, where padding elements are indicated by 1s.
        """

        qkv_same = query.data_ptr() == key.data_ptr() == value.data_ptr()
        kv_same = key.data_ptr() == value.data_ptr()

        tgt_len, bsz, embed_dim = query.size()
        assert embed_dim == self.embed_dim
        assert list(query.size()) == [tgt_len, bsz, embed_dim]
        assert key.size() == value.size()

        if incremental_state is not None:
            saved_state = self._get_input_buffer(incremental_state)
            if 'prev_key' in saved_state:
                # previous time steps are cached - no need to recompute
                # key and value if they are static
                if static_kv:
                    assert kv_same and not qkv_same
                    key = value = None
        else:
            saved_state = None

        if qkv_same:
            # self-attention
            q, k, v = self.in_proj_qkv(query)
        elif kv_same:
            # encoder-decoder attention
            q = self.in_proj_q(query)
            if key is None:
                assert value is None
                k = v = None
            else:
                k, v = self.in_proj_kv(key)
        else:
            q = self.in_proj_q(query)
            k = self.in_proj_k(key)
            v = self.in_proj_v(value)
        q *= self.scaling

        if self.bias_k is not None:
            assert self.bias_v is not None
            k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)])
            v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)])
            if attn_mask is not None:
                attn_mask = torch.cat(
                    [attn_mask,
                     attn_mask.new_zeros(attn_mask.size(0), 1)],
                    dim=1)
            if key_padding_mask is not None:
                key_padding_mask = torch.cat([
                    key_padding_mask,
                    key_padding_mask.new_zeros(key_padding_mask.size(0), 1)
                ],
                                             dim=1)

        q = q.contiguous().view(tgt_len, bsz * self.num_heads,
                                self.head_dim).transpose(0, 1)
        if k is not None:
            k = k.contiguous().view(-1, bsz * self.num_heads,
                                    self.head_dim).transpose(0, 1)
        if v is not None:
            v = v.contiguous().view(-1, bsz * self.num_heads,
                                    self.head_dim).transpose(0, 1)

        if saved_state is not None:
            # saved states are stored with shape (bsz, num_heads, seq_len, head_dim)
            if 'prev_key' in saved_state:
                prev_key = saved_state['prev_key'].view(
                    bsz * self.num_heads, -1, self.head_dim)
                if static_kv:
                    k = prev_key
                else:
                    k = torch.cat((prev_key, k), dim=1)
            if 'prev_value' in saved_state:
                prev_value = saved_state['prev_value'].view(
                    bsz * self.num_heads, -1, self.head_dim)
                if static_kv:
                    v = prev_value
                else:
                    v = torch.cat((prev_value, v), dim=1)
            saved_state['prev_key'] = k.view(bsz, self.num_heads, -1,
                                             self.head_dim)
            saved_state['prev_value'] = v.view(bsz, self.num_heads, -1,
                                               self.head_dim)

            self._set_input_buffer(incremental_state, saved_state)

        src_len = k.size(1)

        if key_padding_mask is not None:
            assert key_padding_mask.size(0) == bsz
            assert key_padding_mask.size(1) == src_len

        if self.add_zero_attn:
            src_len += 1
            k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])],
                          dim=1)
            v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])],
                          dim=1)
            if attn_mask is not None:
                attn_mask = torch.cat(
                    [attn_mask,
                     attn_mask.new_zeros(attn_mask.size(0), 1)],
                    dim=1)
            if key_padding_mask is not None:
                key_padding_mask = torch.cat([
                    key_padding_mask,
                    torch.zeros(key_padding_mask.size(0),
                                1).type_as(key_padding_mask)
                ],
                                             dim=1)

        attn_weights = torch.bmm(q, k.transpose(1, 2))
        assert list(
            attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len]

        if attn_mask is not None:
            attn_mask = attn_mask.unsqueeze(0)
            if self.onnx_trace:
                attn_mask = attn_mask.repeat(attn_weights.size(0), 1, 1)
            attn_weights += attn_mask

        if key_padding_mask is not None:
            # don't attend to padding symbols
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len,
                                             src_len)
            if self.onnx_trace:
                attn_weights = torch.where(
                    key_padding_mask.unsqueeze(1).unsqueeze(2),
                    torch.Tensor([float("-Inf")]),
                    attn_weights.float()).type_as(attn_weights)
            else:
                attn_weights = attn_weights.float().masked_fill(
                    key_padding_mask.unsqueeze(1).unsqueeze(2),
                    float('-inf'),
                ).type_as(attn_weights)  # FP16 support: cast to float and back
            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len,
                                             src_len)

        attn_weights = F.softmax(attn_weights.float(),
                                 dim=-1).type_as(attn_weights)
        attn_weights = F.dropout(attn_weights,
                                 p=self.dropout,
                                 training=self.training)

        attn = torch.bmm(attn_weights, v)
        assert list(
            attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim]

        if (self.onnx_trace and attn.size(1) == 1):
            # when ONNX tracing a single decoder step (sequence length == 1)
            # the transpose is a no-op copy before view, thus unnecessary
            attn = attn.contiguous().view(tgt_len, bsz, embed_dim)
        else:
            attn = attn.transpose(0,
                                  1).contiguous().view(tgt_len, bsz, embed_dim)
        attn = self.out_proj(attn)

        if need_weights:
            # average attention weights over heads
            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len,
                                             src_len)
            attn_weights = attn_weights.sum(dim=1) / self.num_heads
        else:
            attn_weights = None

        return attn, attn_weights
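
Both multi-head attention implementations above expect key_padding_mask of shape (batch, src_len) with True/1 at padded positions. A small helper for building such a mask from per-example lengths (the helper itself is illustrative and not part of the code above):

import torch

def lengths_to_key_padding_mask(lengths, max_len=None):
    """Return a (batch, src_len) bool mask that is True where the position is padding."""
    max_len = max_len if max_len is not None else int(lengths.max())
    positions = torch.arange(max_len, device=lengths.device).unsqueeze(0)  # (1, src_len)
    return positions >= lengths.unsqueeze(1)                               # (batch, src_len)

# lengths_to_key_padding_mask(torch.tensor([3, 5])) ->
# [[False, False, False,  True,  True],
#  [False, False, False, False, False]]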
    def forward(self, data, fold_id, mode):
        x, adj = data[:2]
        x_ori = x
        random_walks = data[6]
        random_walks = random_walks.reshape(
            random_walks.size()[0] * random_walks.size()[1],
            random_walks.size()[2],
            random_walks.size()[3])

        # Node random walk model feature extractor
        node_random_walk_model = self.node_random_walk_models[fold_id]
        if mode == 'train':
            node_random_walk_model.train()
        else:
            node_random_walk_model.eval()
        LSTM_feature_extractor = torch.nn.Sequential(
            *list(node_random_walk_model.children())[:-2])
        LSTM_feature_extractor_output = LSTM_feature_extractor(random_walks)

        x = torch.mean(LSTM_feature_extractor_output[0], dim=1).squeeze()
        x = x.reshape(adj.size()[0], adj.size()[1], x.size()[-1])

        x = readout_function(x, self.readout)

        node_random_walk_model_feature_extractor = torch.nn.Sequential(
            *list(node_random_walk_model.children())[1:-1])
        node_random_walk_model_feature_extractor_output = F.relu(
            node_random_walk_model_feature_extractor(x))

        # Spatial graph embedding model feature extractor
        if self.spatial_graph_embedding_model_name == 'GCN':
            spatial_graph_embedding_model = self.spatial_graph_embedding_models[
                fold_id]
            if mode == 'train':
                spatial_graph_embedding_model.train()
            else:
                spatial_graph_embedding_model.eval()
            for i in range(self.n_spatial_graph_embedding_model_layer):
                layers_feature_extractor = spatial_graph_embedding_model.graph_convolution_layers[
                    i]
                if i == 0:
                    layers_feature_extractor_output = F.relu(
                        layers_feature_extractor((x_ori), (adj)))
                else:
                    layers_feature_extractor_output = F.relu(
                        layers_feature_extractor(
                            (layers_feature_extractor_output), (adj)))

                if i != self.n_spatial_graph_embedding_model_layer - 1:
                    layers_feature_extractor_output = F.dropout(
                        layers_feature_extractor_output,
                        p=self.dropout,
                        training=self.training)

            x = readout_function(layers_feature_extractor_output, self.readout)

            spatial_graph_embedding_model_feature_extractor = torch.nn.Sequential(
                *list(spatial_graph_embedding_model.children())[:-1])
            spatial_graph_embedding_model_feature_extractor_output = F.relu(
                spatial_graph_embedding_model_feature_extractor(x))

        # Concat layer
        concat = torch.cat(
            (node_random_walk_model_feature_extractor_output,
             spatial_graph_embedding_model_feature_extractor_output), 1)

        # Linear combination ensemble layer
        x = F.dropout(concat, p=self.concat_dropout, training=self.training)

        if self.fc_layer_type == 'A':
            x = self.fc1(x)
            x = self.fc2(x)
        elif self.fc_layer_type == 'B':
            x = self.fc(x)

        return x
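
readout_function is not defined in this snippet; a typical graph readout of this kind (an assumption about its behaviour, not the actual helper) pools node features over the node dimension according to the chosen mode:

import torch

def readout_function(x, mode='mean'):
    """Pool node features (batch, num_nodes, dim) into graph features (batch, dim)."""
    if mode == 'mean':
        return x.mean(dim=1)
    if mode == 'max':
        return x.max(dim=1).values
    return x.sum(dim=1)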
    # monitor training loss
    torch.cuda.empty_cache()
    train_loss = 0.0
    ###################
    # train the model #
    ###################

    for data, target in train_loader:

        if switch_ensemble is True:
            # print("Ensemble:", switch_ensemble)
            weight_fc1 = model.fc1.weight
            weight_fc2 = model.fc2.weight
#            print(weight)
#            print(F.dropout(weight, p=probability)*(1-probability))
            model.fc1.weight = torch.nn.Parameter(F.dropout(weight_fc1, p=probability)*(1-probability))
            model.fc2.weight = torch.nn.Parameter(F.dropout(weight_fc2, p=probability) * (1 - probability))
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        if switch_ensemble is True:
            model.fc1.weight = torch.nn.Parameter(weight_fc1)
            model.fc2.weight = torch.nn.Parameter(weight_fc2)
        # calculate the loss
        loss = criterion(output, target)
        # backward pass: compute gradient of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step (parameter update)
        optimizer.step()
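
The * (1 - probability) factor above undoes the inverted-dropout scaling: F.dropout multiplies surviving elements by 1 / (1 - p), so the extra factor restores kept weights to their original values while dropped weights stay at zero, which is what this DropConnect-style weight masking appears to want. A quick numeric check:

import torch
import torch.nn.functional as F

p = 0.5
w = torch.ones(8)
masked = F.dropout(w, p=p) * (1 - p)
print(masked)  # surviving entries are exactly 1.0, dropped entries are 0.0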
    def forward(self, prev_output_tokens, encoder_out_dict):
        encoder_out = encoder_out_dict['encoder']['encoder_out']
        trained_encoder_out = encoder_out_dict['pretrained'] if self.pretrained else None

        encoder_a, encoder_b = self._split_encoder_out(encoder_out)

        # embed positions
        positions = self.embed_positions(prev_output_tokens)

        # embed tokens and positions
        x = self.embed_tokens(prev_output_tokens) + positions
        x = F.dropout(x, p=self.dropout, training=self.training)
        target_embedding = x.transpose(0, 1)

        # project to size of convolution
        x = self.fc1(x)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # temporal convolutions
        avg_attn_scores = None
        for proj, conv, attention, selfattention, attproj in zip(
            self.projections, self.convolutions, self.attention, self.selfattention, self.attproj
        ):
            residual = x if proj is None else proj(x)

            x = F.dropout(x, p=self.dropout, training=self.training)
            x = conv(x)
            x = F.glu(x, dim=2)

            # attention
            if attention is not None:
                r = x
                x, attn_scores = attention(attproj(x) + target_embedding, encoder_a, encoder_b)
                x = x + r
                if not self.training and self.need_attn:
                    if avg_attn_scores is None:
                        avg_attn_scores = attn_scores
                    else:
                        avg_attn_scores.add_(attn_scores)

            if selfattention is not None:
                x = selfattention(x)

            x = (x + residual) * math.sqrt(0.5)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        # project back to size of vocabulary
        x = self.fc2(x)
        x = F.dropout(x, p=self.dropout, training=self.training)
        if not self.pretrained:
            x = self.fc3(x)

        # fusion gating
        if self.pretrained:
            trained_x, _ = self.pretrained_decoder.forward(prev_output_tokens, trained_encoder_out)
            y = torch.cat([x, self.pretrained_outputs["out"]], dim=-1)
            gate1 = self.gate1(y)
            gate2 = self.gate2(y)
            gated_x1 = gate1 * x
            gated_x2 = gate2 * self.pretrained_outputs["out"]
            fusion = torch.cat([gated_x1, gated_x2], dim=-1)
            fusion = self.joining(fusion)
            fusion_output = self.fc3(fusion)
            return fusion_output, avg_attn_scores
        else:
            return x, avg_attn_scores
Example #56
0
    def extract_features(
        self,
        prev_output_tokens,
        encoder_out: Optional[EncoderOut] = None,
        incremental_state: Optional[Dict[str, Dict[str,
                                                   Optional[Tensor]]]] = None,
        full_context_alignment: bool = False,
        alignment_layer: Optional[int] = None,
        alignment_heads: Optional[int] = None,
    ):
        """
        Similar to *forward* but only return features.

        Includes several features from "Jointly Learning to Align and
        Translate with Transformer Models" (Garg et al., EMNLP 2019).

        Args:
            full_context_alignment (bool, optional): don't apply
                auto-regressive mask to self-attention (default: False).
            alignment_layer (int, optional): return mean alignment over
                heads at this layer (default: last layer).
            alignment_heads (int, optional): only average alignment over
                this many heads (default: all heads).

        Returns:
            tuple:
                - the decoder's features of shape `(batch, tgt_len, embed_dim)`
                - a dictionary with any model-specific outputs
        """
        if alignment_layer is None:
            alignment_layer = self.num_layers - 1

        # embed positions
        positions = (self.embed_positions(prev_output_tokens,
                                          incremental_state=incremental_state)
                     if self.embed_positions is not None else None)

        if incremental_state is not None:
            prev_output_tokens = prev_output_tokens[:, -1:]
            if positions is not None:
                positions = positions[:, -1:]

        # embed tokens and positions
        x = self.embed_scale * self.embed_tokens(prev_output_tokens)

        if self.project_in_dim is not None:
            x = self.project_in_dim(x)

        if positions is not None:
            x += positions

        if self.layernorm_embedding is not None:
            x = self.layernorm_embedding(x)

        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        self_attn_padding_mask: Optional[Tensor] = None
        if self.cross_self_attention or prev_output_tokens.eq(
                self.padding_idx).any():
            self_attn_padding_mask = prev_output_tokens.eq(self.padding_idx)

        # decoder layers
        attn: Optional[Tensor] = None
        inner_states: List[Optional[Tensor]] = [x]
        for idx, layer in enumerate(self.layers):
            encoder_state: Optional[Tensor] = None
            if encoder_out is not None:
                if self.layer_wise_attention:
                    encoder_states = encoder_out.encoder_states
                    assert encoder_states is not None
                    encoder_state = encoder_states[idx]
                else:
                    encoder_state = encoder_out.encoder_out

            if incremental_state is None and not full_context_alignment:
                self_attn_mask = self.buffered_future_mask(x)
            else:
                self_attn_mask = None

            # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
            dropout_probability = torch.empty(1).uniform_()
            if not self.training or (dropout_probability >
                                     self.decoder_layerdrop):
                x, layer_attn, _ = layer(
                    x,
                    encoder_state,
                    encoder_out.encoder_padding_mask
                    if encoder_out is not None else None,
                    incremental_state,
                    self_attn_mask=self_attn_mask,
                    self_attn_padding_mask=self_attn_padding_mask,
                    need_attn=bool((idx == alignment_layer)),
                    need_head_weights=bool((idx == alignment_layer)),
                )
                inner_states.append(x)
                if layer_attn is not None and idx == alignment_layer:
                    attn = layer_attn.float()

        if attn is not None:
            if alignment_heads is not None:
                attn = attn[:alignment_heads]

            # average probabilities over heads
            attn = attn.mean(dim=0)

        if self.layer_norm is not None:
            x = self.layer_norm(x)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        if self.project_out_dim is not None:
            x = self.project_out_dim(x)

        return x, {"attn": [attn], "inner_states": inner_states}
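
The per-layer coin flip above implements LayerDrop: during training a decoder layer is executed only when the sampled value exceeds decoder_layerdrop, while at inference every layer runs. A stripped-down sketch of the same control flow (names are placeholders):

import torch

def run_with_layerdrop(layers, x, layerdrop_p, training):
    for layer in layers:
        keep = (not training) or (torch.empty(1).uniform_().item() > layerdrop_p)
        if keep:
            x = layer(x)  # a skipped layer contributes nothing for this batch
    return x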
 def forward(self, x):
     out = self.linear(x)
     out = F.dropout(out, training=self.training)
     return out
Example #58
0
    def forward(self, prev_output_tokens, encoder_out=None, incremental_state=None):
        """
        Args:
            prev_output_tokens (LongTensor): previous decoder outputs of shape
                `(batch, tgt_len)`, for teacher forcing
            encoder_out (Tensor, optional): output from the encoder, used for
                encoder-side attention
            incremental_state (dict): dictionary used for storing state during
                :ref:`Incremental decoding`

        Returns:
            tuple:
                - the last decoder layer's output of shape `(batch, tgt_len,
                  vocab)`
                - the last decoder layer's attention weights of shape `(batch,
                  tgt_len, src_len)`
        """
        # embed positions
        positions = self.embed_positions(
            prev_output_tokens,
            incremental_state=incremental_state,
        ) if self.embed_positions is not None else None

        if incremental_state is not None:
            prev_output_tokens = prev_output_tokens[:, -1:]
            if positions is not None:
                positions = positions[:, -1:]

        # embed tokens and positions
        x = self.embed_scale * self.embed_tokens(prev_output_tokens)

        if self.project_in_dim is not None:
            x = self.project_in_dim(x)

        if positions is not None:
            x += positions
        x = F.dropout(x, p=self.dropout, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)
        attn = None

        inner_states = [x]

        # decoder layers
        for layer in self.layers:
            x, attn = layer(
                x,
                encoder_out['encoder_out'] if encoder_out is not None else None,
                encoder_out['encoder_padding_mask'] if encoder_out is not None else None,
                incremental_state,
            )
            inner_states.append(x)

        if self.normalize:
            x = self.layer_norm(x)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        if self.project_out_dim is not None:
            x = self.project_out_dim(x)

        if self.adaptive_softmax is None:
            # project back to size of vocabulary
            if self.share_input_output_embed:
                x = F.linear(x, self.embed_tokens.weight)
            else:
                x = F.linear(x, self.embed_out)

        return x, {'attn': attn, 'inner_states': inner_states}
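
The share_input_output_embed branch ties the output projection to the input embedding: F.linear(x, self.embed_tokens.weight) reuses the (vocab, embed_dim) embedding table as the output projection, saving a separate vocabulary-sized matrix. A minimal sketch of the same tying, with hypothetical sizes:

import torch
import torch.nn as nn
import torch.nn.functional as F

vocab, dim = 1000, 64                 # illustrative sizes
embed = nn.Embedding(vocab, dim)

x = torch.randn(2, 5, dim)            # (batch, tgt_len, embed_dim) decoder features
logits = F.linear(x, embed.weight)    # (batch, tgt_len, vocab): no separate output matrix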
 def forward(self, x):
     new_features = super(_DenseLayer, self).forward(x)
     if self.drop_rate > 0:
         new_features = F.dropout(new_features, p=self.drop_rate, training=self.training)
     return torch.cat([x, new_features], 1)
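
This _DenseLayer concatenates its new_features onto the input along the channel dimension, so the channel count grows by the layer's output width at every step, and dropout touches only the newly produced features, never the carried-forward input. Channel arithmetic for a hypothetical block (the sizes are assumptions, not taken from the code):

# hypothetical dense block: 64 input channels, growth rate 32, 4 layers
channels = 64
growth_rate = 32
for _ in range(4):
    channels += growth_rate  # each torch.cat([x, new_features], 1) adds growth_rate channels
print(channels)              # 192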
Example #60
0
    def forward(self,
                prev_output_tokens,
                encoder_out=None,
                incremental_state=None,
                **unused):
        if encoder_out is not None:
            encoder_padding_mask = encoder_out['encoder_padding_mask']
            encoder_out = encoder_out['encoder_out']

            # split and transpose encoder outputs
            encoder_a, encoder_b = self._split_encoder_out(
                encoder_out, incremental_state)

        if self.embed_positions is not None:
            pos_embed = self.embed_positions(prev_output_tokens,
                                             incremental_state)
        else:
            pos_embed = 0

        if incremental_state is not None:
            prev_output_tokens = prev_output_tokens[:, -1:]
        x = self._embed_tokens(prev_output_tokens, incremental_state)

        # embed tokens and combine with positional embeddings
        x += pos_embed
        x = F.dropout(x, p=self.dropout, training=self.training)
        target_embedding = x

        # project to size of convolution
        x = self.fc1(x)

        # B x T x C -> T x B x C
        x = self._transpose_if_training(x, incremental_state)

        # temporal convolutions
        avg_attn_scores = None
        num_attn_layers = len(self.attention)
        residuals = [x]
        for proj, conv, attention, res_layer in zip(self.projections,
                                                    self.convolutions,
                                                    self.attention,
                                                    self.residuals):
            if res_layer > 0:
                residual = residuals[-res_layer]
                residual = residual if proj is None else proj(residual)
            else:
                residual = None

            x = F.dropout(x, p=self.dropout, training=self.training)
            x = conv(x, incremental_state)
            x = F.glu(x, dim=2)

            # attention
            if attention is not None:
                x = self._transpose_if_training(x, incremental_state)

                x, attn_scores = attention(x, target_embedding,
                                           (encoder_a, encoder_b),
                                           encoder_padding_mask)

                if not self.training and self.need_attn:
                    attn_scores = attn_scores / num_attn_layers
                    if avg_attn_scores is None:
                        avg_attn_scores = attn_scores
                    else:
                        avg_attn_scores.add_(attn_scores)

                x = self._transpose_if_training(x, incremental_state)

            # residual
            if residual is not None:
                x = (x + residual) * math.sqrt(0.5)
            residuals.append(x)

        # T x B x C -> B x T x C
        x = self._transpose_if_training(x, incremental_state)

        # project back to size of vocabulary if not using adaptive softmax
        if self.fc2 is not None and self.fc3 is not None:
            x = self.fc2(x)
            x = F.dropout(x, p=self.dropout, training=self.training)
            x = self.fc3(x)

        return x, avg_attn_scores
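
The (x + residual) * math.sqrt(0.5) step in both convolutional decoders above is the variance-preserving residual used in convolutional seq2seq models: if x and residual are roughly independent with unit variance, their sum has variance 2, and scaling by sqrt(0.5) brings it back to 1. A quick empirical check:

import math
import torch

x = torch.randn(100_000)
residual = torch.randn(100_000)
print((x + residual).var())                     # ~2.0
print(((x + residual) * math.sqrt(0.5)).var())  # ~1.0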