Example #1
 def _get_input_buffer(self, incremental_state):
     return utils.get_incremental_state(self, incremental_state,
                                        'attn_state') or {}
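A matching setter is not shown above; here is a minimal sketch, assuming the same utils API used by the decoder examples below (the method name _set_input_buffer is illustrative):

 def _set_input_buffer(self, incremental_state, buffer):
     # Store the attention state under the same key that _get_input_buffer reads
     utils.set_incremental_state(self, incremental_state, 'attn_state', buffer)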
Example #2
    def forward(self, tgt_inputs, encoder_out, incremental_state=None):
        """ Performs the forward pass through the instantiated model. """
        # Optionally, feed decoder input token-by-token
        if incremental_state is not None:
            tgt_inputs = tgt_inputs[:, -1:]

        # Following code is to assist with the LEXICAL MODEL implementation
        # Recover encoder input
        src_embeddings = encoder_out['src_embeddings']

        src_out, src_hidden_states, src_cell_states = encoder_out['src_out']
        src_mask = encoder_out['src_mask']
        src_time_steps = src_out.size(0)

        # Embed target tokens and apply dropout
        batch_size, tgt_time_steps = tgt_inputs.size()
        tgt_embeddings = self.embedding(tgt_inputs)
        tgt_embeddings = F.dropout(tgt_embeddings,
                                   p=self.dropout_in,
                                   training=self.training)

        # Transpose batch: [batch_size, tgt_time_steps, num_features] -> [tgt_time_steps, batch_size, num_features]
        tgt_embeddings = tgt_embeddings.transpose(0, 1)

        # Initialize previous states (or retrieve from cache during incremental generation)

        cached_state = utils.get_incremental_state(self, incremental_state,
                                                   'cached_state')
        if cached_state is not None:
            tgt_hidden_states, tgt_cell_states, input_feed = cached_state
        else:
            tgt_hidden_states = [
                torch.zeros(tgt_inputs.size()[0], self.hidden_size).cuda()
                for i in range(len(self.layers))
            ]
            tgt_cell_states = [
                torch.zeros(tgt_inputs.size()[0], self.hidden_size).cuda()
                for i in range(len(self.layers))
            ]
            input_feed = tgt_embeddings.data.new(batch_size,
                                                 self.hidden_size).zero_()

        # Initialize attention output node
        attn_weights = tgt_embeddings.data.new(batch_size, tgt_time_steps,
                                               src_time_steps).zero_()
        rnn_outputs = []

        # Cache lexical context vectors per translation time-step
        lexical_contexts = []

        for j in range(tgt_time_steps):
            # Concatenate the current token embedding with output from previous time step (i.e. 'input feeding')
            lstm_input = torch.cat([tgt_embeddings[j, :, :], input_feed],
                                   dim=1)

            for layer_id, rnn_layer in enumerate(self.layers):
                # Pass target input through the recurrent layer(s)
                tgt_hidden_states[layer_id], tgt_cell_states[layer_id] = \
                    rnn_layer(lstm_input, (tgt_hidden_states[layer_id], tgt_cell_states[layer_id]))

                # Current hidden state becomes input to the subsequent layer; apply dropout
                lstm_input = F.dropout(tgt_hidden_states[layer_id],
                                       p=self.dropout_out,
                                       training=self.training)

            if self.attention is None:
                input_feed = tgt_hidden_states[-1]
            else:
                input_feed, step_attn_weights = self.attention(
                    tgt_hidden_states[-1], src_out, src_mask)
                attn_weights[:, j, :] = step_attn_weights

                if self.use_lexical_model:
                    # Compute and collect LEXICAL MODEL context vectors here
                    lexical_context = torch.tanh(
                        torch.bmm(step_attn_weights.unsqueeze(dim=1),
                                  src_embeddings.transpose(0,
                                                           1)).squeeze(dim=1))
                    lexical_contexts.append(
                        torch.tanh(
                            self.lexical_context_projection(lexical_context)) +
                        lexical_context)

            input_feed = F.dropout(input_feed,
                                   p=self.dropout_out,
                                   training=self.training)
            rnn_outputs.append(input_feed)

        # Cache previous states (only used during incremental, auto-regressive generation)
        utils.set_incremental_state(
            self, incremental_state, 'cached_state',
            (tgt_hidden_states, tgt_cell_states, input_feed))

        # Collect outputs across time steps
        decoder_output = torch.cat(rnn_outputs,
                                   dim=0).view(tgt_time_steps, batch_size,
                                               self.hidden_size)

        # Transpose batch back: [tgt_time_steps, batch_size, num_features] -> [batch_size, tgt_time_steps, num_features]
        decoder_output = decoder_output.transpose(0, 1)

        # Final projection
        decoder_output = self.final_projection(decoder_output)

        if self.use_lexical_model:
            # Incorporate the LEXICAL MODEL into the prediction of target tokens
            lexical_contexts = torch.cat(lexical_contexts,
                                         dim=0).view(tgt_time_steps,
                                                     batch_size,
                                                     self.embed_dim)
            lexical_contexts = lexical_contexts.transpose(0, 1)
            decoder_output += self.final_lexical_projection(lexical_contexts)

        return decoder_output, attn_weights
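For reference, a minimal sketch of how a forward pass like the one above might be driven during incremental, token-by-token generation. The loop below is illustrative only: decoder, encoder_out, bos_idx and eos_idx are assumed names that do not appear in these examples.

    import torch

    def greedy_decode(decoder, encoder_out, bos_idx, eos_idx, max_len=100):
        # Shared cache; forward() reads and writes 'cached_state' through it
        incremental_state = {}
        tgt_tokens = torch.full((1, 1), bos_idx, dtype=torch.long)
        for _ in range(max_len):
            # With incremental_state set, only the last target token is consumed
            decoder_output, _ = decoder(tgt_tokens, encoder_out, incremental_state)
            next_token = decoder_output[:, -1, :].argmax(dim=-1, keepdim=True)
            tgt_tokens = torch.cat([tgt_tokens, next_token], dim=1)
            if next_token.item() == eos_idx:
                break
        return tgt_tokens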
Example #3
    def forward(self, tgt_inputs, encoder_out, incremental_state=None):
        """ Performs the forward pass through the instantiated model. """
        # Optionally, feed decoder input token-by-token
        if incremental_state is not None:
            tgt_inputs = tgt_inputs[:, -1:]

        # __QUESTION : Following code is to assist with the LEXICAL MODEL implementation
        # Recover encoder input
        src_embeddings = encoder_out['src_embeddings']

        src_out, src_hidden_states, src_cell_states = encoder_out['src_out']
        src_mask = encoder_out['src_mask']
        src_time_steps = src_out.size(0)

        # Embed target tokens and apply dropout
        batch_size, tgt_time_steps = tgt_inputs.size()
        tgt_embeddings = self.embedding(tgt_inputs)
        tgt_embeddings = F.dropout(tgt_embeddings,
                                   p=self.dropout_in,
                                   training=self.training)

        # Transpose batch: [batch_size, tgt_time_steps, num_features] -> [tgt_time_steps, batch_size, num_features]
        tgt_embeddings = tgt_embeddings.transpose(0, 1)

        # Initialize previous states (or retrieve from cache during incremental generation)
        '''
        ___QUESTION-1-DESCRIBE-D-START___
        Describe how the decoder state is initialized. When is cached_state == None? What role does input_feed play?
        '''
        cached_state = utils.get_incremental_state(self, incremental_state,
                                                   'cached_state')
        if cached_state is not None:
            tgt_hidden_states, tgt_cell_states, input_feed = cached_state
        else:
            tgt_hidden_states = [
                torch.zeros(tgt_inputs.size()[0], self.hidden_size)
                for i in range(len(self.layers))
            ]
            tgt_cell_states = [
                torch.zeros(tgt_inputs.size()[0], self.hidden_size)
                for i in range(len(self.layers))
            ]
            input_feed = tgt_embeddings.data.new(batch_size,
                                                 self.hidden_size).zero_()
        '''___QUESTION-1-DESCRIBE-D-END___'''

        # Initialize attention output node
        attn_weights = tgt_embeddings.data.new(batch_size, tgt_time_steps,
                                               src_time_steps).zero_()
        rnn_outputs = []

        # __QUESTION : Following code is to assist with the LEXICAL MODEL implementation
        # Cache lexical context vectors per translation time-step
        lexical_contexts = []

        for j in range(tgt_time_steps):
            # Concatenate the current token embedding with output from previous time step (i.e. 'input feeding')
            lstm_input = torch.cat([tgt_embeddings[j, :, :], input_feed],
                                   dim=1)

            for layer_id, rnn_layer in enumerate(self.layers):
                # Pass target input through the recurrent layer(s)
                tgt_hidden_states[layer_id], tgt_cell_states[layer_id] = \
                    rnn_layer(lstm_input, (tgt_hidden_states[layer_id], tgt_cell_states[layer_id]))

                # Current hidden state becomes input to the subsequent layer; apply dropout
                lstm_input = F.dropout(tgt_hidden_states[layer_id],
                                       p=self.dropout_out,
                                       training=self.training)
            '''
            ___QUESTION-1-DESCRIBE-E-START___
            How is attention integrated into the decoder? Why is the attention function given the previous 
            target state as one of its inputs? What is the purpose of the dropout layer?
            '''
            if self.attention is None:
                input_feed = tgt_hidden_states[-1]
            else:
                input_feed, step_attn_weights = self.attention(
                    tgt_hidden_states[-1], src_out, src_mask)
                attn_weights[:, j, :] = step_attn_weights

                if self.use_lexical_model:
                    # __QUESTION: Compute and collect LEXICAL MODEL context vectors here
                    # TODO: --------------------------------------------------------------------- CUT
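                    # Sketch of the equations implemented below (notation assumed from
                    # the lexical-model formulation, not defined anywhere in this code):
                    #   f_t = tanh(sum_s a_{t,s} * e_s)   weighted mean of source embeddings
                    #   h_t = tanh(W f_t) + f_t           one-layer FFNN with a skip connection
                    # where a_{t,s} are this step's attention weights and e_s the source embeddings.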
                    step_attn_weights = step_attn_weights.permute(
                        1, 0).unsqueeze(dim=2)
                    # Compute lexical context
                    weighted_lexical_mean = torch.tanh(
                        step_attn_weights * src_embeddings).sum(dim=0)
                    # Feed lexical context through a one-layer FNN with skip connections
                    lexical_contexts.append(
                        torch.tanh(
                            self.stepwise_lexical_projection(
                                weighted_lexical_mean)) +
                        weighted_lexical_mean)
                    # TODO: --------------------------------------------------------------------- /CUT

            input_feed = F.dropout(input_feed,
                                   p=self.dropout_out,
                                   training=self.training)
            rnn_outputs.append(input_feed)
            '''___QUESTION-1-DESCRIBE-E-END___'''

        # Cache previous states (only used during incremental, auto-regressive generation)
        utils.set_incremental_state(
            self, incremental_state, 'cached_state',
            (tgt_hidden_states, tgt_cell_states, input_feed))

        # Collect outputs across time steps
        decoder_output = torch.cat(rnn_outputs,
                                   dim=0).view(tgt_time_steps, batch_size,
                                               self.hidden_size)

        # Transpose batch back: [tgt_time_steps, batch_size, num_features] -> [batch_size, tgt_time_steps, num_features]
        decoder_output = decoder_output.transpose(0, 1)

        # Final projection
        decoder_output = self.final_projection(decoder_output)

        if self.use_lexical_model:
            # __QUESTION: Incorporate the LEXICAL MODEL into the prediction of target tokens here
            # TODO: --------------------------------------------------------------------- CUT
            # Collect outputs across time steps
            lexical_module_output = \
                torch.cat(lexical_contexts, dim=0).view(tgt_time_steps, batch_size, self.embed_dim)
            lexical_module_output = lexical_module_output.transpose(0, 1)
            # Combine decoder output with the lexical module output
            lexical_module_output = self.final_lexical_projection(
                lexical_module_output)
            decoder_output += lexical_module_output
            # TODO: --------------------------------------------------------------------- /CUT

        return decoder_output, attn_weights
Example #4
    def forward(self, tgt_inputs, encoder_out, incremental_state=None):
        """ Performs the forward pass through the instantiated model. """
        # Optionally, feed decoder input token-by-token
        if incremental_state is not None:
            tgt_inputs = tgt_inputs[:, -1:]

        # __QUESTION : Following code is to assist with the LEXICAL MODEL implementation
        # Recover encoder input
        src_embeddings = encoder_out['src_embeddings']

        src_out, src_hidden_states, src_cell_states = encoder_out['src_out']
        src_mask = encoder_out['src_mask']
        src_time_steps = src_out.size(0)

        # Embed target tokens and apply dropout
        batch_size, tgt_time_steps = tgt_inputs.size()
        tgt_embeddings = self.embedding(tgt_inputs)
        tgt_embeddings = F.dropout(tgt_embeddings,
                                   p=self.dropout_in,
                                   training=self.training)

        # Transpose batch: [batch_size, tgt_time_steps, num_features] -> [tgt_time_steps, batch_size, num_features]
        tgt_embeddings = tgt_embeddings.transpose(0, 1)

        # Initialize previous states (or retrieve from cache during incremental generation)
        '''
        ___QUESTION-1-DESCRIBE-D-START___
        Describe how the decoder state is initialized. When is cached_state == None? What role does input_feed play?
        If cached_state is not None, the target hidden states (tgt_hidden_states), the target cell states (tgt_cell_states)
        and the input feed (input_feed) are retrieved from the cache.
        If cached_state is None, the decoder state is initialized from scratch:
        the target hidden and cell states are initialized as zero tensors.
        cached_state is None whenever incremental, auto-regressive generation is not used.
        The input feed is initialized as a zero tensor of shape [batch_size, hidden_size], created from the target
        input embeddings so that it shares their dtype and device.

        '''
        cached_state = utils.get_incremental_state(self, incremental_state,
                                                   'cached_state')
        if cached_state is not None:
            tgt_hidden_states, tgt_cell_states, input_feed = cached_state
        else:
            tgt_hidden_states = [
                torch.zeros(tgt_inputs.size()[0], self.hidden_size)
                for i in range(len(self.layers))
            ]
            tgt_cell_states = [
                torch.zeros(tgt_inputs.size()[0], self.hidden_size)
                for i in range(len(self.layers))
            ]
            input_feed = tgt_embeddings.data.new(batch_size,
                                                 self.hidden_size).zero_()
        '''___QUESTION-1-DESCRIBE-D-END___'''

        # Initialize attention output node
        attn_weights = tgt_embeddings.data.new(batch_size, tgt_time_steps,
                                               src_time_steps).zero_()
        rnn_outputs = []

        # __QUESTION : Following code is to assist with the LEXICAL MODEL implementation
        # Cache lexical context vectors per translation time-step
        lexical_contexts = []

        for j in range(tgt_time_steps):
            # Concatenate the current token embedding with output from previous time step (i.e. 'input feeding')
            lstm_input = torch.cat([tgt_embeddings[j, :, :], input_feed],
                                   dim=1)

            for layer_id, rnn_layer in enumerate(self.layers):
                # Pass target input through the recurrent layer(s)
                tgt_hidden_states[layer_id], tgt_cell_states[layer_id] = \
                    rnn_layer(lstm_input, (tgt_hidden_states[layer_id], tgt_cell_states[layer_id]))

                # Current hidden state becomes input to the subsequent layer; apply dropout
                lstm_input = F.dropout(tgt_hidden_states[layer_id],
                                       p=self.dropout_out,
                                       training=self.training)
            '''
            ___QUESTION-1-DESCRIBE-E-START___
            How is attention integrated into the decoder? Why is the attention function given the previous 
            target state as one of its inputs? What is the purpose of the dropout layer?
            
            The context vector is computed from all of the source states, weighted by the global alignment weights.
            The concatenation of the context vector and the current target state is multiplied by a weight matrix,
            and the result is passed through a tanh activation, yielding the attentional vector (see the formula sketch below).
            In this attentional NMT model, previous alignment information is also taken into account:
            if the previous target-state information were not given as input, the model would make each attentional
            decision independently, which would be suboptimal.
            The purpose of the dropout layer is to mitigate overfitting; dropout is an effective regularization method.
            '''
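            # Sketch of the global-attention computation described above (standard
            # Luong-style equations; the exact score function depends on the
            # AttentionLayer, which is not shown in this code):
            #   a_t(s) = softmax_s( score(h_t, h_s) )    alignment weights over source positions
            #   c_t    = sum_s a_t(s) * h_s              context vector
            #   h~_t   = tanh(W_c [c_t ; h_t])           attentional vector, reused as input_feed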
            if self.attention is None:
                input_feed = tgt_hidden_states[-1]
            else:
                input_feed, step_attn_weights = self.attention(
                    tgt_hidden_states[-1], src_out, src_mask)
                attn_weights[:, j, :] = step_attn_weights

                if self.use_lexical_model:
                    # __QUESTION: Compute and collect LEXICAL MODEL context vectors here
                    # TODO: --------------------------------------------------------------------- CUT
                    src_embeddings = src_embeddings.transpose(0, 1)
                    current_context = torch.tanh(
                        torch.bmm(step_attn_weights.unsqueeze(dim=1),
                                  src_embeddings))
                    src_embeddings = src_embeddings.transpose(0, 1)
                    lexical_contexts.append(current_context)
                    # TODO: --------------------------------------------------------------------- /CUT

            input_feed = F.dropout(input_feed,
                                   p=self.dropout_out,
                                   training=self.training)
            rnn_outputs.append(input_feed)
            '''___QUESTION-1-DESCRIBE-E-END___'''

        # Cache previous states (only used during incremental, auto-regressive generation)
        utils.set_incremental_state(
            self, incremental_state, 'cached_state',
            (tgt_hidden_states, tgt_cell_states, input_feed))

        # Collect outputs across time steps
        decoder_output = torch.cat(rnn_outputs,
                                   dim=0).view(tgt_time_steps, batch_size,
                                               self.hidden_size)

        # Transpose batch back: [tgt_time_steps, batch_size, num_features] -> [batch_size, tgt_time_steps, num_features]
        decoder_output = decoder_output.transpose(0, 1)

        # Final projection
        decoder_output = self.final_projection(decoder_output)

        if self.use_lexical_model:
            # __QUESTION: Incorporate the LEXICAL MODEL into the prediction of target tokens here
            lexical_model_output = torch.cat(lexical_contexts, dim=0).view(
                tgt_time_steps, batch_size, self.embed_dim)
            lexical_model_layer_output = torch.tanh(
                self.lexical_model_layer(
                    lexical_model_output)) + lexical_model_output
            combing_lexical_model_layer = self.combine_lexical_and_hidden_layer(
                lexical_model_layer_output)
            decoder_output = decoder_output + combing_lexical_model_layer.transpose(
                0, 1)
            # TODO: --------------------------------------------------------------------- /CUT

        return decoder_output, attn_weights
Example #5
    def forward(self, tgt_inputs, encoder_out, incremental_state=None):
        if incremental_state is not None:
            tgt_inputs = tgt_inputs[:, -1:]

        src_out, src_hiddens, src_cells = encoder_out['src_out']
        src_mask = encoder_out['src_mask']
        srclen = src_out.size(0)

        # Embed tokens and apply dropout
        bsz, seqlen = tgt_inputs.size()
        x = self.embedding(tgt_inputs)
        x = F.dropout(x, p=self.dropout_in, training=self.training)

        # B x T x C -> T x B x C
        x = x.transpose(0, 1)

        # Initialize previous states (or get from cache during incremental generation)
        cached_state = utils.get_incremental_state(self, incremental_state,
                                                   'cached_state')
        if cached_state is not None:
            rnn_hiddens, rnn_cells, input_feed = cached_state
        else:
            # Initialize RNN cells with those from encoder
            rnn_hiddens = [src_hiddens[i] for i in range(len(self.layers))]
            rnn_cells = [src_cells[i] for i in range(len(self.layers))]
            input_feed = x.data.new(bsz, self.hidden_size).zero_()

        attn_scores = x.data.new(srclen, seqlen, bsz).zero_()
        rnn_outputs = []

        for j in range(seqlen):
            # Concatenate token embedding with output from previous time step
            input = torch.cat([x[j, :, :], input_feed], dim=1)

            for i, rnn in enumerate(self.layers):
                # Apply recurrent cell
                rnn_hiddens[i], rnn_cells[i] = rnn(
                    input, (rnn_hiddens[i], rnn_cells[i]))

                # Hidden state becomes the input to the next layer
                input = F.dropout(rnn_hiddens[i],
                                  p=self.dropout_out,
                                  training=self.training)

            # Prepare input feed for next time step
            if self.attention is None:
                input_feed = rnn_hiddens[-1]
            else:
                input_feed, attn_scores[:, j, :] = self.attention(
                    rnn_hiddens[-1], src_out, src_mask)
            input_feed = F.dropout(input_feed,
                                   p=self.dropout_out,
                                   training=self.training)
            rnn_outputs.append(input_feed)

        # cache previous states (no-op except during incremental generation)
        utils.set_incremental_state(self, incremental_state, 'cached_state',
                                    (rnn_hiddens, rnn_cells, input_feed))

        # Collect outputs across time steps
        x = torch.cat(rnn_outputs, dim=0).view(seqlen, bsz, self.hidden_size)

        # T x B x C -> B x T x C
        x = x.transpose(0, 1)

        # srclen x seqlen x bsz -> bsz x tgtlen x srclen
        attn_scores = attn_scores.transpose(0, 2)

        # Final projection
        x = self.final_proj(x)
        return x, attn_scores
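The AttentionLayer referenced by self.attention is not shown in any of these examples. Below is a minimal sketch of a compatible layer (Luong-style general attention), assuming src_mask is a boolean padding mask of shape [batch_size, src_time_steps]; the class and parameter names are illustrative, not the actual implementation:

    import torch
    import torch.nn as nn
    import torch.nn.functional as F

    class AttentionLayer(nn.Module):
        def __init__(self, input_dim, output_dim):
            super().__init__()
            self.src_projection = nn.Linear(input_dim, output_dim, bias=False)
            self.context_plus_hidden_projection = nn.Linear(input_dim + output_dim, output_dim, bias=False)

        def forward(self, tgt_input, encoder_out, src_mask):
            # tgt_input: [batch_size, input_dim], encoder_out: [src_time_steps, batch_size, output_dim]
            encoder_out = encoder_out.transpose(0, 1)

            # General (bilinear) score between the decoder state and each encoder state
            attn_scores = torch.bmm(self.src_projection(tgt_input).unsqueeze(1),
                                    encoder_out.transpose(1, 2)).squeeze(1)
            if src_mask is not None:
                attn_scores.masked_fill_(src_mask, float('-inf'))
            attn_weights = F.softmax(attn_scores, dim=-1)

            # Context vector: attention-weighted sum of encoder states
            context = torch.bmm(attn_weights.unsqueeze(1), encoder_out).squeeze(1)

            # Attentional vector: tanh over the concatenation [context ; decoder state]
            attn_out = torch.tanh(self.context_plus_hidden_projection(
                torch.cat([context, tgt_input], dim=1)))
            return attn_out, attn_weights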
Example #6
    def forward(self, tgt_inputs, encoder_out, incremental_state=None):
        """ Performs the forward pass through the instantiated model. """
        # Optionally, feed decoder input token-by-token
        if incremental_state is not None:
            tgt_inputs = tgt_inputs[:, -1:]

        # __QUESTION : Following code is to assist with the LEXICAL MODEL implementation
        # Recover encoder input
        src_embeddings = encoder_out[
            'src_embeddings']  # dimension: [src_time_steps, batch_size, num_features]
        src_embeddings = src_embeddings.transpose(
            1, 0)  # dimension: [batch_size, src_time_steps, num_features]

        src_out, src_hidden_states, src_cell_states = encoder_out['src_out']
        src_mask = encoder_out['src_mask']
        src_time_steps = src_out.size(0)

        # Embed target tokens and apply dropout
        batch_size, tgt_time_steps = tgt_inputs.size()
        tgt_embeddings = self.embedding(tgt_inputs)
        tgt_embeddings = F.dropout(tgt_embeddings,
                                   p=self.dropout_in,
                                   training=self.training)

        # Transpose batch: [batch_size, tgt_time_steps, num_features] -> [tgt_time_steps, batch_size, num_features]
        tgt_embeddings = tgt_embeddings.transpose(0, 1)

        # Initialize previous states (or retrieve from cache during incremental generation)
        '''
        To initialize the decoder state, the decoder constructs zero matrices of dimension [batch_size, hidden_size].
        cached_state is None when the model is in training mode, and during evaluation when incremental generation
        is not used. In those cases the model does not cache any previous state, so the decoder state is initialized
        from scratch.
        input_feed is the output of the attention layer: a non-linear transformation of the concatenation of the
        previous context vector and the previous decoder hidden state. It lets the current decoder step attend to
        information about the source inputs from the previous time step.
        '''
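        # cached_state, when present, is the tuple stored by utils.set_incremental_state
        # at the end of this forward pass: (per-layer hidden states, per-layer cell states,
        # input_feed), each entry of shape [batch_size, hidden_size].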
        cached_state = utils.get_incremental_state(self, incremental_state,
                                                   'cached_state')
        if cached_state is not None:
            tgt_hidden_states, tgt_cell_states, input_feed = cached_state
        else:
            tgt_hidden_states = [
                torch.zeros(tgt_inputs.size()[0], self.hidden_size)
                for i in range(len(self.layers))
            ]
            tgt_cell_states = [
                torch.zeros(tgt_inputs.size()[0], self.hidden_size)
                for i in range(len(self.layers))
            ]
            input_feed = tgt_embeddings.data.new(batch_size,
                                                 self.hidden_size).zero_()

        # Initialize attention output node
        attn_weights = tgt_embeddings.data.new(batch_size, tgt_time_steps,
                                               src_time_steps).zero_()
        rnn_outputs = []

        # __QUESTION : Following code is to assist with the LEXICAL MODEL implementation
        # Cache lexical context vectors per translation time-step
        lexical_contexts = []

        for j in range(tgt_time_steps):
            # Concatenate the current token embedding with output from previous time step (i.e. 'input feeding')
            lstm_input = torch.cat([tgt_embeddings[j, :, :], input_feed],
                                   dim=1)

            for layer_id, rnn_layer in enumerate(self.layers):
                # Pass target input through the recurrent layer(s)
                tgt_hidden_states[layer_id], tgt_cell_states[layer_id] = \
                    rnn_layer(lstm_input, (tgt_hidden_states[layer_id], tgt_cell_states[layer_id]))

                # Current hidden state becomes input to the subsequent layer; apply dropout
                lstm_input = F.dropout(tgt_hidden_states[layer_id],
                                       p=self.dropout_out,
                                       training=self.training)
            '''
            Once the context vector is calculated, it is concatenated with the decoder hidden state
            'tgt_hidden_states[-1]'; their combination then goes through a non-linear layer with tanh activation,
            producing the attentional output. This output also feeds back into the decoder: it is concatenated with
            the next target embedding ('input feeding'), so attention information flows into the next decoder step.

            The attention function takes the previous target state as input because, at different steps, the model
            should attend to different parts of the source-side information; the target state is needed to pick out
            the features relevant to each target output.
            The dropout layer randomly zeroes units during training. This reduces the model's reliance on any
            particular parameter, helps prevent overfitting, and thus improves the model's generalization ability.
            
            '''
            if self.attention is None:
                input_feed = tgt_hidden_states[-1]
            else:
                input_feed, step_attn_weights = self.attention(
                    tgt_hidden_states[-1], src_out, src_mask)
                attn_weights[:, j, :] = step_attn_weights

                if self.use_lexical_model:
                    # Compute and collect LEXICAL MODEL context vectors here

                    # Padding positions have already been masked inside self.attention,
                    # so src_mask must not be modified here (re-unsqueezing it each step
                    # would corrupt the mask for subsequent attention calls).
                    attn_context = torch.tanh(
                        torch.bmm(attn_weights, src_embeddings).squeeze(dim=1))
                    attn_out = torch.tanh(
                        self.ffnn_projection(attn_context)) + attn_context
                    # print(attn_out.size())
            input_feed = F.dropout(input_feed,
                                   p=self.dropout_out,
                                   training=self.training)
            rnn_outputs.append(input_feed)

        # Cache previous states (only used during incremental, auto-regressive generation)
        utils.set_incremental_state(
            self, incremental_state, 'cached_state',
            (tgt_hidden_states, tgt_cell_states, input_feed))

        # Collect outputs across time steps
        decoder_output = torch.cat(rnn_outputs,
                                   dim=0).view(tgt_time_steps, batch_size,
                                               self.hidden_size)

        # Transpose batch back: [tgt_time_steps, batch_size, num_features] -> [batch_size, tgt_time_steps, num_features]
        decoder_output = decoder_output.transpose(0, 1)

        # Final projection
        decoder_output = self.final_projection(decoder_output)

        if self.use_lexical_model:
            # Incorporate the LEXICAL MODEL into the prediction of target tokens here
            # decoder_output = torch.cat(rnn_outputs, dim=0).view(tgt_time_steps, batch_size, self.hidden_size)
            decoder_output += self.lexical_projection(attn_out)

        return decoder_output, attn_weights
Example #7
    def forward(self, tgt_inputs, encoder_out, incremental_state=None):
        """ Performs the forward pass through the instantiated model. """
        # Optionally, feed decoder input token-by-token
        if incremental_state is not None:
            tgt_inputs = tgt_inputs[:, -1:]

        # __QUESTION : Following code is to assist with the LEXICAL MODEL implementation
        # Recover encoder input
        src_embeddings = encoder_out['src_embeddings']

        src_out, src_hidden_states, src_cell_states = encoder_out['src_out']
        src_mask = encoder_out['src_mask']
        src_time_steps = src_out.size(0)

        # Embed target tokens and apply dropout
        batch_size, tgt_time_steps = tgt_inputs.size()
        tgt_embeddings = self.embedding(tgt_inputs)
        tgt_embeddings = F.dropout(tgt_embeddings,
                                   p=self.dropout_in,
                                   training=self.training)

        # Transpose batch: [batch_size, tgt_time_steps, num_features] -> [tgt_time_steps, batch_size, num_features]
        tgt_embeddings = tgt_embeddings.transpose(0, 1)

        # Initialize previous states (or retrieve from cache during incremental generation)
        '''
       ___QUESTION-1-DESCRIBE-D-START___
       Describe how the decoder state is initialized. When is cached_state == None? What role does input_feed play?
       '''
        '''
           The decoder state is initialized by checking for cached_state. If a cached_state exists, the decoder state
        is restored from it. If there is no cached_state, the decoder creates zero vectors for the
        hidden states, cell states and input_feed.
           cached_state is None when incremental, auto-regressive generation is not used, or when the decoder is at
        the first word of a sentence.
           input_feed is the output vector of the attention layer; it is fed back into the LSTM, passing context
        information from the previous time step.
       '''
        cached_state = utils.get_incremental_state(self, incremental_state,
                                                   'cached_state')
        if cached_state is not None:
            tgt_hidden_states, tgt_cell_states, input_feed = cached_state
        else:
            tgt_hidden_states = [
                torch.zeros(tgt_inputs.size()[0], self.hidden_size).cuda()
                for i in range(len(self.layers))
            ]
            tgt_cell_states = [
                torch.zeros(tgt_inputs.size()[0], self.hidden_size).cuda()
                for i in range(len(self.layers))
            ]
            #tgt_hidden_states = [torch.zeros(tgt_inputs.size()[0], self.hidden_size) for i in range(len(self.layers))]
            #tgt_cell_states = [torch.zeros(tgt_inputs.size()[0], self.hidden_size) for i in range(len(self.layers))]
            input_feed = tgt_embeddings.data.new(batch_size,
                                                 self.hidden_size).zero_()
        '''___QUESTION-1-DESCRIBE-D-END___'''

        # Initialize attention output node
        attn_weights = tgt_embeddings.data.new(batch_size, tgt_time_steps,
                                               src_time_steps).zero_()
        rnn_outputs = []

        # __QUESTION : Following code is to assist with the LEXICAL MODEL implementation
        # Cache lexical context vectors per translation time-step
        lexical_contexts = []

        for j in range(tgt_time_steps):
            # Concatenate the current token embedding with output from previous time step (i.e. 'input feeding')
            lstm_input = torch.cat([tgt_embeddings[j, :, :], input_feed],
                                   dim=1)

            for layer_id, rnn_layer in enumerate(self.layers):
                # Pass target input through the recurrent layer(s)
                tgt_hidden_states[layer_id], tgt_cell_states[layer_id] = \
                    rnn_layer(lstm_input, (tgt_hidden_states[layer_id], tgt_cell_states[layer_id]))

                # Current hidden state becomes input to the subsequent layer; apply dropout
                lstm_input = F.dropout(tgt_hidden_states[layer_id],
                                       p=self.dropout_out,
                                       training=self.training)
            '''
           ___QUESTION-1-DESCRIBE-E-START___
           How is attention integrated into the decoder? Why is the attention function given the previous
           target state as one of its inputs? What is the purpose of the dropout layer?
           '''
            '''
               Attention is integrated into the decoder through self.attention, which holds the AttentionLayer when use_attention is set to true.
           The attention weights are updated at each time step and stored in step_attn_weights.
               The attention function is given the previous target state as one of its inputs because it lets information
           from the previous alignment pass through; this is then used to obtain the correct alignment for the current word.
               The dropout layer prevents overfitting and helps improve the performance of the model, because inputs
           and recurrent connections to the LSTM units are excluded probabilistically during training.
           '''
            if self.attention is None:
                input_feed = tgt_hidden_states[-1]
            else:
                input_feed, step_attn_weights = self.attention(
                    tgt_hidden_states[-1], src_out, src_mask)
                attn_weights[:, j, :] = step_attn_weights

                if self.use_lexical_model:
                    # __QUESTION: Compute and collect LEXICAL MODEL context vectors here
                    # TODO: --------------------------------------------------------------------- CUT
                    batch_size, step_length = step_attn_weights.size()
                    src_embeddings_transpose = src_embeddings.transpose(0, 1)
                    step_attention_reshaped = step_attn_weights.view(
                        batch_size, 1, step_length)
                    f_t = torch.tanh(
                        torch.bmm(step_attention_reshaped,
                                  src_embeddings_transpose))
                    h_t = torch.tanh(self.layer_1(f_t)) + f_t
                    y_t = self.transform_1(h_t) + self.transform_2(
                        input_feed.view(batch_size, 1, -1))
                    lexical_contexts.append(y_t)
                    # TODO: --------------------------------------------------------------------- /CUT
            input_feed = F.dropout(input_feed,
                                   p=self.dropout_out,
                                   training=self.training)
            rnn_outputs.append(input_feed)
            '''___QUESTION-1-DESCRIBE-E-END___'''

        # Cache previous states (only used during incremental, auto-regressive generation)
        utils.set_incremental_state(
            self, incremental_state, 'cached_state',
            (tgt_hidden_states, tgt_cell_states, input_feed))

        # Collect outputs across time steps
        decoder_output = torch.cat(rnn_outputs,
                                   dim=0).view(tgt_time_steps, batch_size,
                                               self.hidden_size)

        # Transpose batch back: [tgt_time_steps, batch_size, num_features] -> [batch_size, tgt_time_steps, num_features]
        decoder_output = decoder_output.transpose(0, 1)

        # Final projection
        decoder_output = self.final_projection(decoder_output)

        if self.use_lexical_model:
            # __QUESTION: Incorporate the LEXICAL MODEL into the prediction of target tokens here
            decoder_output = torch.cat(lexical_contexts,
                                       dim=0).view(batch_size, tgt_time_steps,
                                                   -1)
            # TODO: --------------------------------------------------------------------- /CUT

        return decoder_output, attn_weights
Example #8
    def forward(self, tgt_inputs, encoder_out, incremental_state=None):
        """ Performs the forward pass through the instantiated model. """
        # Optionally, feed decoder input token-by-token
        if incremental_state is not None:
            tgt_inputs = tgt_inputs[:, -1:]

        # __QUESTION : Following code is to assist with the LEXICAL MODEL implementation
        # Recover encoder input
        src_embeddings = encoder_out['src_embeddings']

        src_out, src_hidden_states, src_cell_states = encoder_out['src_out']
        src_mask = encoder_out['src_mask']
        src_time_steps = src_out.size(0)

        # Embed target tokens and apply dropout
        batch_size, tgt_time_steps = tgt_inputs.size()
        tgt_embeddings = self.embedding(tgt_inputs)
        tgt_embeddings = F.dropout(tgt_embeddings, p=self.dropout_in, training=self.training)

        # Transpose batch: [batch_size, tgt_time_steps, num_features] -> [tgt_time_steps, batch_size, num_features]
        tgt_embeddings = tgt_embeddings.transpose(0, 1)

        # Initialize previous states (or retrieve from cache during incremental generation)
        '''
        ___QUESTION-1-DESCRIBE-D-START___
        Describe how the decoder state is initialized. When is cached_state == None? What role does input_feed play?
        '''
        '''
        The decoder state is initialized as a set of zero vectors if there is no cached state; if there is a cached state, the decoder state is simply restored from it.
        cached_state == None when incremental, auto-regressive generation is not being used, or when there is no previously cached state to pick up from.
        input_feed is the attentional output vector from the previous time step; it is fed into the LSTM to pass along information about the previous attention.
        '''
        cached_state = utils.get_incremental_state(self, incremental_state, 'cached_state')
        if cached_state is not None:
            tgt_hidden_states, tgt_cell_states, input_feed = cached_state
        else:
            tgt_hidden_states = [torch.zeros(tgt_inputs.size()[0], self.hidden_size) for i in range(len(self.layers))]
            tgt_cell_states = [torch.zeros(tgt_inputs.size()[0], self.hidden_size) for i in range(len(self.layers))]
            input_feed = tgt_embeddings.data.new(batch_size, self.hidden_size).zero_()
        '''___QUESTION-1-DESCRIBE-D-END___'''

        # Initialize attention output node
        attn_weights = tgt_embeddings.data.new(batch_size, tgt_time_steps, src_time_steps).zero_()
        rnn_outputs = []

        # __QUESTION : Following code is to assist with the LEXICAL MODEL implementation
        # Cache lexical context vectors per translation time-step
        lexical_contexts = []


        for j in range(tgt_time_steps):
            # Concatenate the current token embedding with output from previous time step (i.e. 'input feeding')
            lstm_input = torch.cat([tgt_embeddings[j, :, :], input_feed], dim=1)

            for layer_id, rnn_layer in enumerate(self.layers):
                # Pass target input through the recurrent layer(s)
                tgt_hidden_states[layer_id], tgt_cell_states[layer_id] = \
                    rnn_layer(lstm_input, (tgt_hidden_states[layer_id], tgt_cell_states[layer_id]))

                # Current hidden state becomes input to the subsequent layer; apply dropout
                lstm_input = F.dropout(tgt_hidden_states[layer_id], p=self.dropout_out, training=self.training)

            '''
            ___QUESTION-1-DESCRIBE-E-START___
            How is attention integrated into the decoder? Why is the attention function given the previous
            target state as one of its inputs? What is the purpose of the dropout layer?
            '''
            '''
            Attention is integrated into the decoder as an input to the next time step; this is represented by input_feed in the code.
            The attention function is given the previous target state as an input so that the attention weights can be computed with respect to the current decoding step; combining information from the target states allows the proper attention to be calculated for each output word.
            The dropout layer prevents strong correlations between different hidden units (co-adaptation). If not corrected for, this would lead to computational redundancy and overfitting.
            '''
            if self.attention is None:
                input_feed = tgt_hidden_states[-1]
            else:
                input_feed, step_attn_weights = self.attention(tgt_hidden_states[-1], src_out, src_mask)
                attn_weights[:, j, :] = step_attn_weights

                if self.use_lexical_model:
                    # __QUESTION: Compute and collect LEXICAL MODEL context vectors here
                    # TODO: --------------------------------------------------------------------- CUT
                    pass
                    # TODO: --------------------------------------------------------------------- /CUT

                    # Unsqueeze to [batch_size, 1, src_time_steps] so the batched matmul is per-sentence
                    f_t = torch.tanh(torch.bmm(torch.unsqueeze(step_attn_weights, 1), torch.transpose(src_embeddings, 0, 1)))
                    lexical_contexts.append(f_t)


            input_feed = F.dropout(input_feed, p=self.dropout_out, training=self.training)
            rnn_outputs.append(input_feed)
            '''___QUESTION-1-DESCRIBE-E-END___'''

        # Cache previous states (only used during incremental, auto-regressive generation)
        utils.set_incremental_state(
            self, incremental_state, 'cached_state', (tgt_hidden_states, tgt_cell_states, input_feed))

        # print("\n**************\nRNN output size: ", len(rnn_outputs))

        # Collect outputs across time steps
        decoder_output = torch.cat(rnn_outputs, dim=0).view(tgt_time_steps, batch_size, self.hidden_size)

        # print("\n**************\nDecoder 1 output size: ", decoder_output.size())

        # Transpose batch back: [tgt_time_steps, batch_size, num_features] -> [batch_size, tgt_time_steps, num_features]
        decoder_output = decoder_output.transpose(0, 1)

        # print("\n**************\nDecoder 2 output size: ", decoder_output.size())

        # Final projection
        decoder_output = self.final_projection(decoder_output)

        # print("\n**************\nFinal decoder output size: ", decoder_output.size())

        if self.use_lexical_model:
            # __QUESTION: Incorporate the LEXICAL MODEL into the prediction of target tokens here

            # TODO: --------------------------------------------------------------------- /CUT

            lc = torch.cat(lexical_contexts, dim=1)
            h_t = torch.tanh(self.W_f(lc)) + lc
            decoder_output = decoder_output + self.W_l(h_t)

        return decoder_output, attn_weights
Example #9
    def forward(self, tgt_inputs, encoder_out, incremental_state=None):
        """ Performs the forward pass through the instantiated model. """
        # Optionally, feed decoder input token-by-token
        if incremental_state is not None:
            tgt_inputs = tgt_inputs[:, -1:]

        # __QUESTION-5 : Following code is to assist with the LEXICAL MODEL implementation
        # Recover encoder input
        src_embeddings = encoder_out['src_embeddings']

        src_out, src_hidden_states, src_cell_states = encoder_out['src_out']
        src_mask = encoder_out['src_mask']
        src_time_steps = src_out.size(0)

        # Embed target tokens and apply dropout
        batch_size, tgt_time_steps = tgt_inputs.size()
        tgt_embeddings = self.embedding(tgt_inputs)
        tgt_embeddings = F.dropout(tgt_embeddings, p=self.dropout_in, training=self.training)

        # Transpose batch: [batch_size, tgt_time_steps, num_features] -> [tgt_time_steps, batch_size, num_features]
        tgt_embeddings = tgt_embeddings.transpose(0, 1)

        # Initialize previous states (or retrieve from cache during incremental generation)
        '''
        ___QUESTION-1-DESCRIBE-D-START___
        Describe how the decoder state is initialized. When is cached_state == None? What role does input_feed play?
        
        The decoder state is initialized either by creating fresh (zero) previous states, or by retrieving a cached state.
        
        cached_state == None when there is no previous state stored for this instance of the module. This usually only
        happens when the decoder is first run, since that is when the first forward pass occurs and, prior to that, no
        state could have been cached.
        
        input_feed holds the attentional output from the previous time step, which is fed back into the LSTM input.
        '''
        cached_state = utils.get_incremental_state(self, incremental_state, 'cached_state')
        if cached_state is not None:
            tgt_hidden_states, tgt_cell_states, input_feed = cached_state
        else:
            tgt_hidden_states = [torch.zeros(tgt_inputs.size()[0], self.hidden_size) for i in range(len(self.layers))]
            tgt_cell_states = [torch.zeros(tgt_inputs.size()[0], self.hidden_size) for i in range(len(self.layers))]
            input_feed = tgt_embeddings.data.new(batch_size, self.hidden_size).zero_()
        '''___QUESTION-1-DESCRIBE-D-END___'''

        # Initialize attention output node
        attn_weights = tgt_embeddings.data.new(batch_size, tgt_time_steps, src_time_steps).zero_()
        rnn_outputs = []

        # __QUESTION-5 : Following code is to assist with the LEXICAL MODEL implementation
        # Cache lexical context vectors per translation time-step
        lexical_contexts = []

        for j in range(tgt_time_steps):
            # Concatenate the current token embedding with output from previous time step (i.e. 'input feeding')
            lstm_input = torch.cat([tgt_embeddings[j, :, :], input_feed], dim=1)

            for layer_id, rnn_layer in enumerate(self.layers):
                # Pass target input through the recurrent layer(s)
                tgt_hidden_states[layer_id], tgt_cell_states[layer_id] = \
                    rnn_layer(lstm_input, (tgt_hidden_states[layer_id], tgt_cell_states[layer_id]))

                # Current hidden state becomes input to the subsequent layer; apply dropout
                lstm_input = F.dropout(tgt_hidden_states[layer_id], p=self.dropout_out, training=self.training)

            '''
            ___QUESTION-1-DESCRIBE-E-START___
            How is attention integrated into the decoder? Why is the attention function given the previous 
            target state as one of its inputs? What is the purpose of the dropout layer?
            
            Attention is integrated as an additional layer applied at each step of the decoder's forward pass.
            
            The attention function is given the previous target state alongside the encoder output so that it can
            calculate the alignment scores.
            
            The dropout layer is added to reduce the chance of the model over-fitting during training.
            '''
            if self.attention is None:
                input_feed = tgt_hidden_states[-1]
            else:
                input_feed, step_attn_weights = self.attention(tgt_hidden_states[-1], src_out, src_mask)
                attn_weights[:, j, :] = step_attn_weights

                if self.use_lexical_model:
                    # __QUESTION-5: Compute and collect LEXICAL MODEL context vectors here
                    # TODO: --------------------------------------------------------------------- CUT
                    curr_context = torch.tanh(torch.bmm(step_attn_weights.unsqueeze(dim=1),
                                                        src_embeddings.transpose(0, 1)).squeeze(dim=1))
                    lexical_contexts.append(torch.tanh(self.lex_context_projection_layer(curr_context)) + curr_context)
                    # TODO: --------------------------------------------------------------------- /CUT

            input_feed = F.dropout(input_feed, p=self.dropout_out, training=self.training)
            rnn_outputs.append(input_feed)
            '''___QUESTION-1-DESCRIBE-E-END___'''

        # Cache previous states (only used during incremental, auto-regressive generation)
        utils.set_incremental_state(
            self, incremental_state, 'cached_state', (tgt_hidden_states, tgt_cell_states, input_feed))

        # Collect outputs across time steps
        decoder_output = torch.cat(rnn_outputs, dim=0).view(tgt_time_steps, batch_size, self.hidden_size)

        # Transpose batch back: [tgt_time_steps, batch_size, num_features] -> [batch_size, tgt_time_steps, num_features]
        decoder_output = decoder_output.transpose(0, 1)

        # Final projection
        decoder_output = self.final_projection(decoder_output)

        if self.use_lexical_model:
            # __QUESTION-5: Incorporate the LEXICAL MODEL into the prediction of target tokens here
            # TODO: --------------------------------------------------------------------- CUT
            lexical_contexts = torch.cat(lexical_contexts, dim=0).view(tgt_time_steps, batch_size, self.embed_dim)
            lexical_contexts = lexical_contexts.transpose(0, 1)
            decoder_output += self.final_lex_projection(lexical_contexts)
            # TODO: --------------------------------------------------------------------- /CUT

        return decoder_output, attn_weights
Example #10
    def forward(self, tgt_inputs, encoder_out, incremental_state=None):
        """ Performs the forward pass through the instantiated model. """
        # Optionally, feed decoder input token-by-token
        if incremental_state is not None:
            tgt_inputs = tgt_inputs[:, -1:]

        # __LEXICAL: Following code is to assist with the LEXICAL MODEL implementation
        # Recover encoder input
        src_embeddings = encoder_out['src_embeddings']

        src_out, src_hidden_states, src_cell_states = encoder_out['src_out']
        src_mask = encoder_out['src_mask']
        src_time_steps = src_out.size(0)

        # Embed target tokens and apply dropout
        batch_size, tgt_time_steps = tgt_inputs.size()
        if self.is_cuda:
            tgt_inputs = utils.move_to_cuda(tgt_inputs)
        tgt_embeddings = self.embedding(tgt_inputs)
        tgt_embeddings = F.dropout(tgt_embeddings,
                                   p=self.dropout_in,
                                   training=self.training)

        # Transpose batch: [batch_size, tgt_time_steps, num_features] -> [tgt_time_steps, batch_size, num_features]
        tgt_embeddings = tgt_embeddings.transpose(0, 1)

        # Initialize previous states (or retrieve from cache during incremental generation)
        cached_state = utils.get_incremental_state(self, incremental_state,
                                                   'cached_state')
        if cached_state is not None:
            tgt_hidden_states, tgt_cell_states, input_feed = cached_state
        else:
            tgt_hidden_states = [
                torch.zeros(tgt_inputs.size()[0], self.hidden_size)
                for i in range(len(self.layers))
            ]
            tgt_cell_states = [
                torch.zeros(tgt_inputs.size()[0], self.hidden_size)
                for i in range(len(self.layers))
            ]
            if self.is_cuda:
                tgt_hidden_states = utils.move_to_cuda(tgt_hidden_states)
                tgt_cell_states = utils.move_to_cuda(tgt_cell_states)
            input_feed = tgt_embeddings.data.new(batch_size,
                                                 self.hidden_size).zero_()

        # Initialize attention output node
        attn_weights = tgt_embeddings.data.new(batch_size, tgt_time_steps,
                                               src_time_steps).zero_()
        rnn_outputs = []

        # __LEXICAL: Following code is to assist with the LEXICAL MODEL implementation
        # Cache lexical context vectors per translation time-step
        lexical_contexts = torch.empty(0)
        lexical_tensor = torch.empty(0)

        for j in range(tgt_time_steps):
            # Concatenate the current token embedding with output from previous time step (i.e. 'input feeding')
            lstm_input = torch.cat([tgt_embeddings[j, :, :], input_feed],
                                   dim=1)

            for layer_id, rnn_layer in enumerate(self.layers):
                # Pass target input through the recurrent layer(s)
                tgt_hidden_states[layer_id], tgt_cell_states[layer_id] = \
                    rnn_layer(lstm_input, (tgt_hidden_states[layer_id], tgt_cell_states[layer_id]))

                # Current hidden state becomes input to the subsequent layer; apply dropout
                lstm_input = F.dropout(tgt_hidden_states[layer_id],
                                       p=self.dropout_out,
                                       training=self.training)

            if self.attention is None:
                input_feed = tgt_hidden_states[-1]
            else:
                input_feed, step_attn_weights = self.attention(
                    tgt_hidden_states[-1], src_out, src_mask)
                attn_weights[:, j, :] = step_attn_weights

                if self.use_lexical_model:
                    # __LEXICAL: Compute and collect LEXICAL MODEL context vectors here
                    # TODO: --------------------------------------------------------------------- CUT

                    for i in range(src_time_steps):
                        # Use each sentence's own attention weight for source position i
                        attn_weights_sub = attn_weights[:, j, i].unsqueeze(1).clone()
                        src_embeddings_sub = src_embeddings[i, :, :].clone()

                        lexical_feed = torch.mul(src_embeddings_sub,
                                                 attn_weights_sub)

                        if i == 0:
                            lexical_contexts = lexical_feed
                        else:
                            lexical_contexts_clone = lexical_contexts
                            lexical_contexts = lexical_feed + lexical_contexts_clone

                    lexical_vec = self.trans_tanh(lexical_contexts)
                    print('lexical_contexts', j, lexical_vec)
                    lexical_transed = self.lexical_projection(lexical_vec)
                    print('lexical_transed', lexical_transed,
                          lexical_transed.size())
                    # TODO: --------------------------------------------------------------------- /CUT

            input_feed = F.dropout(input_feed,
                                   p=self.dropout_out,
                                   training=self.training)
            rnn_outputs.append(input_feed)

            lexical_tensor = torch.cat(
                (lexical_tensor.clone(), lexical_transed), 0)
            print('lexical_tensor', lexical_tensor, lexical_tensor.size())

        # Cache previous states (only used during incremental, auto-regressive generation)
        utils.set_incremental_state(
            self, incremental_state, 'cached_state',
            (tgt_hidden_states, tgt_cell_states, input_feed))

        # Collect outputs across time steps
        decoder_output = torch.cat(rnn_outputs,
                                   dim=0).view(tgt_time_steps, batch_size,
                                               self.hidden_size)

        # Transpose batch back: [tgt_time_steps, batch_size, num_features] -> [batch_size, tgt_time_steps, num_features]
        decoder_output = decoder_output.transpose(0, 1)

        # Final projection
        decoder_output = self.final_projection(decoder_output)

        if self.use_lexical_model:
            # __LEXICAL: Incorporate the LEXICAL MODEL into the prediction of target tokens here
            # Reshape the collected lexical vectors to [batch_size, tgt_time_steps, -1]
            # and add their projection to the decoder logits
            lexical_output = lexical_tensor.view(tgt_time_steps, batch_size,
                                                 -1).transpose(0, 1)
            decoder_output = self.final_projection(lexical_output) + decoder_output
            # TODO: --------------------------------------------------------------------- /CUT

        return decoder_output, attn_weights
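For reference, the per-source-position accumulation used in the lexical block above can be collapsed into a single batched matrix product. The following is a minimal standalone sketch, not part of the example; all names and sizes are invented for illustration.

import torch

batch_size, src_time_steps, embed_dim = 2, 5, 8

# Attention distribution for one target step: [batch_size, src_time_steps]
step_attn_weights = torch.softmax(torch.randn(batch_size, src_time_steps), dim=1)
# Source embeddings in time-major layout: [src_time_steps, batch_size, embed_dim]
src_embeddings = torch.randn(src_time_steps, batch_size, embed_dim)

# [batch_size, 1, src_time_steps] x [batch_size, src_time_steps, embed_dim]
# -> [batch_size, 1, embed_dim] -> [batch_size, embed_dim]
lexical_context = torch.bmm(step_attn_weights.unsqueeze(1),
                            src_embeddings.transpose(0, 1)).squeeze(1)

# Same result as the explicit sum over source positions used in the example
manual = sum(step_attn_weights[:, i].unsqueeze(1) * src_embeddings[i]
             for i in range(src_time_steps))
assert torch.allclose(lexical_context, manual, atol=1e-5)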
Example #11
0
    def forward(self, tgt_inputs, encoder_out, incremental_state=None):
        """ Performs the forward pass through the instantiated model. """
        # Optionally, feed decoder input token-by-token
        if incremental_state is not None:
            tgt_inputs = tgt_inputs[:, -1:]

        # __QUESTION-5 : Following code is to assist with the LEXICAL MODEL implementation
        # Recover encoder input
        src_embeddings = encoder_out['src_embeddings']

        src_out, src_hidden_states, src_cell_states = encoder_out['src_out']
        src_mask = encoder_out['src_mask']
        src_time_steps = src_out.size(0)

        # Embed target tokens and apply dropout
        batch_size, tgt_time_steps = tgt_inputs.size()
        tgt_embeddings = self.embedding(tgt_inputs)
        tgt_embeddings = F.dropout(tgt_embeddings,
                                   p=self.dropout_in,
                                   training=self.training)

        # Transpose batch: [batch_size, tgt_time_steps, num_features] -> [tgt_time_steps, batch_size, num_features]
        tgt_embeddings = tgt_embeddings.transpose(0, 1)

        # Initialize previous states (or retrieve from cache during incremental generation)
        '''
        ___QUESTION-1-DESCRIBE-D-START___
        Describe how the decoder state is initialized. When is cached_state == None? What role does input_feed play?

        The decoder state is initialized as tensors of zeros of size
        batch_size x hidden_size (one hidden state and one cell state per layer).

        cached_state is None when incremental_state is None, or when no cached state
        for the current module exists yet in incremental_state.

        input_feed stores either the previous decoder hidden state or, when attention
        is used, the attention context vector; it is initialized to zeros at the first
        decoding time step. (A standalone sketch of this caching pattern follows this
        example.)
        '''
        cached_state = utils.get_incremental_state(self, incremental_state,
                                                   'cached_state')
        if cached_state is not None:
            tgt_hidden_states, tgt_cell_states, input_feed = cached_state
        else:
            # Create the initial decoder states on the model's device
            tgt_hidden_states = [
                torch.zeros(tgt_inputs.size()[0],
                            self.hidden_size).to(self.device)
                for i in range(len(self.layers))
            ]
            tgt_cell_states = [
                torch.zeros(tgt_inputs.size()[0],
                            self.hidden_size).to(self.device)
                for i in range(len(self.layers))
            ]
            input_feed = tgt_embeddings.data.new(
                batch_size, self.hidden_size).zero_().to(self.device)
        '''___QUESTION-1-DESCRIBE-D-END___'''

        # Initialize attention output node
        attn_weights = tgt_embeddings.data.new(batch_size, tgt_time_steps,
                                               src_time_steps).zero_()
        rnn_outputs = []

        # __QUESTION-5 : Following code is to assist with the LEXICAL MODEL implementation
        # Cache lexical context vectors per translation time-step
        lexical_contexts = []
        # this will make the src_embeddings shape (batch_size, src_time_steps, src_embedding_size)
        src_embeddings = src_embeddings.transpose(0, 1)
        for j in range(tgt_time_steps):
            # Concatenate the current token embedding with output from previous time step (i.e. 'input feeding')
            lstm_input = torch.cat([tgt_embeddings[j, :, :], input_feed],
                                   dim=1)

            for layer_id, rnn_layer in enumerate(self.layers):
                # Pass target input through the recurrent layer(s)
                tgt_hidden_states[layer_id], tgt_cell_states[layer_id] = \
                    rnn_layer(lstm_input, (tgt_hidden_states[layer_id], tgt_cell_states[layer_id]))

                # Current hidden state becomes input to the subsequent layer; apply dropout
                lstm_input = F.dropout(tgt_hidden_states[layer_id],
                                       p=self.dropout_out,
                                       training=self.training)
            '''
            ___QUESTION-1-DESCRIBE-E-START___
            How is attention integrated into the decoder? Why is the attention function given the previous
            target state as one of its inputs? What is the purpose of the dropout layer?

            Attention is integrated into the decoder whenever self.attention is not None:
            the attention context vector is computed by the AttentionLayer module defined
            above, which takes the last decoder hidden state, all encoder outputs, and the
            encoder input mask as inputs (as described in Comment B), and the result is
            used as the next step's input feed.

            The previous target state is passed in because it is compared against the
            encoder outputs to compute the attention scores.

            Dropout is used for regularization and to prevent co-adaptation of the weights.
            '''
            if self.attention is None:
                input_feed = tgt_hidden_states[-1]
            else:
                input_feed, step_attn_weights = self.attention(
                    tgt_hidden_states[-1], src_out, src_mask)
                attn_weights[:, j, :] = step_attn_weights

                if self.use_lexical_model:
                    # __QUESTION-5: Compute and collect LEXICAL MODEL context vectors here
                    # TODO: --------------------------------------------------------------------- CUT
                    # this will make the step_attn_weights of shape (batch_size, 1, src_time_steps)
                    step_attn_weights = step_attn_weights.unsqueeze(dim=1)
                    # the resulting lexical scores will have the shape (batch_size, embedding_size)
                    # since they are dependent on a single time step
                    lexical_scores = torch.bmm(step_attn_weights,
                                               src_embeddings)
                    lexical_scores = torch.tanh(lexical_scores).squeeze(dim=1)
                    # do first projection
                    lexical_contexts.append(
                        torch.tanh(
                            self.lexical_intermediate_projection(
                                lexical_scores)) + lexical_scores)
                    # TODO: --------------------------------------------------------------------- /CUT

            input_feed = F.dropout(input_feed,
                                   p=self.dropout_out,
                                   training=self.training)
            rnn_outputs.append(input_feed)
            '''___QUESTION-1-DESCRIBE-E-END___'''

        # Cache previous states (only used during incremental, auto-regressive generation)
        utils.set_incremental_state(
            self, incremental_state, 'cached_state',
            (tgt_hidden_states, tgt_cell_states, input_feed))

        # Collect outputs across time steps
        decoder_output = torch.cat(rnn_outputs,
                                   dim=0).view(tgt_time_steps, batch_size,
                                               self.hidden_size)

        # Transpose batch back: [tgt_time_steps, batch_size, num_features] -> [batch_size, tgt_time_steps, num_features]
        decoder_output = decoder_output.transpose(0, 1)

        # Final projection
        decoder_output = self.final_projection(decoder_output)

        if self.use_lexical_model:
            # __QUESTION-5: Incorporate the LEXICAL MODEL into the prediction of target tokens here
            # TODO: --------------------------------------------------------------------- CUT
            #
            lexical_contexts = torch.cat(lexical_contexts,
                                         dim=0).view(tgt_time_steps,
                                                     batch_size, -1)
            lexical_contexts = lexical_contexts.transpose(0, 1)
            lexical_contexts = self.lexical_final_projection(lexical_contexts)
            # the shape of lexical contexts will be (batch_size, tgt_time_steps, vocabulary_size)
            decoder_output += lexical_contexts
            # TODO: --------------------------------------------------------------------- /CUT

        return decoder_output, attn_weights
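The caching behaviour described in the D answer above can be illustrated outside the decoder. This sketch mimics fairseq-style get/set helpers with a plain dictionary; the helper names and sizes are illustrative only, not the utils functions used by these examples.

import torch

def get_cached(incremental_state, key):
    # Returns None when incremental decoding is off or nothing was stored yet
    if incremental_state is None:
        return None
    return incremental_state.get(key)

def set_cached(incremental_state, key, value):
    if incremental_state is not None:
        incremental_state[key] = value

batch_size, hidden_size = 2, 4
incremental_state = {}  # would be None during ordinary (non-incremental) training

cached = get_cached(incremental_state, 'cached_state')
if cached is not None:
    hidden, cell, input_feed = cached
else:
    # First decoding step: everything starts from zeros
    hidden = torch.zeros(batch_size, hidden_size)
    cell = torch.zeros(batch_size, hidden_size)
    input_feed = torch.zeros(batch_size, hidden_size)

# ... run one decoding step, producing new hidden / cell / input_feed ...
set_cached(incremental_state, 'cached_state', (hidden, cell, input_feed))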
Example #12
0
    def forward(self, tgt_inputs, encoder_out, incremental_state=None):
        """ Performs the forward pass through the instantiated model. """
        # Optionally, feed decoder input token-by-token
        if incremental_state is not None:
            tgt_inputs = tgt_inputs[:, -1:]

        # __QUESTION : Following code is to assist with the LEXICAL MODEL implementation
        # Recover encoder input
        src_embeddings = encoder_out['src_embeddings']

        src_out, src_hidden_states, src_cell_states = encoder_out['src_out']
        src_mask = encoder_out['src_mask']
        src_time_steps = src_out.size(0)

        # Embed target tokens and apply dropout
        batch_size, tgt_time_steps = tgt_inputs.size()
        tgt_embeddings = self.embedding(tgt_inputs)
        tgt_embeddings = F.dropout(tgt_embeddings,
                                   p=self.dropout_in,
                                   training=self.training)

        # Transpose batch: [batch_size, tgt_time_steps, num_features] -> [tgt_time_steps, batch_size, num_features]
        tgt_embeddings = tgt_embeddings.transpose(0, 1)

        # Initialize previous states (or retrieve from cache during incremental generation)
        '''
        ___QUESTION-1-DESCRIBE-D-START___
        Describe how the decoder state is initialized. When is cached_state == None? What role does input_feed play?
        ANSWER D:
        The decoder is initialized incrementally, meaning that it receives the previous output and produces the next one.
        The decoder states, i.e. the target hidden states, the target cell states and the input feed, are initialized
        to zeros when no cache exists. If the cache exists, the target hidden and cell states are initialized to the
        previous hidden and cell states, and the input feed is initialized to the input feed from the previous step.
        The previous step's input feed corresponds to the dropped-out version of the final target hidden state if
        attention is not used, or the dropped-out version of the attention output if attention is applied (assuming
        dropout is used). The model caches any long-term history needed for translating the sequence. However,
        cached_state only becomes available after the first decoding step, as only then does the model have a previous
        hidden state to store. Input feeding refers to passing the attention output (or the last hidden state) from one
        time step to the next, in order to inform the model about past alignment decisions.
        '''
        cached_state = utils.get_incremental_state(self, incremental_state,
                                                   'cached_state')
        if cached_state is not None:
            tgt_hidden_states, tgt_cell_states, input_feed = cached_state
        else:
            tgt_hidden_states = [
                torch.zeros(tgt_inputs.size()[0], self.hidden_size)
                for i in range(len(self.layers))
            ]
            tgt_cell_states = [
                torch.zeros(tgt_inputs.size()[0], self.hidden_size)
                for i in range(len(self.layers))
            ]
            input_feed = tgt_embeddings.data.new(batch_size,
                                                 self.hidden_size).zero_()
        '''___QUESTION-1-DESCRIBE-D-END___'''

        # Initialize attention output node
        attn_weights = tgt_embeddings.data.new(batch_size, tgt_time_steps,
                                               src_time_steps).zero_()
        rnn_outputs = []

        # __QUESTION : Following code is to assist with the LEXICAL MODEL implementation
        # Cache lexical context vectors per translation time-step
        lexical_contexts = []

        for j in range(tgt_time_steps):
            # Concatenate the current token embedding with output from previous time step (i.e. 'input feeding')
            lstm_input = torch.cat([tgt_embeddings[j, :, :], input_feed],
                                   dim=1)

            for layer_id, rnn_layer in enumerate(self.layers):
                # Pass target input through the recurrent layer(s)
                tgt_hidden_states[layer_id], tgt_cell_states[layer_id] = \
                    rnn_layer(lstm_input, (tgt_hidden_states[layer_id], tgt_cell_states[layer_id]))

                # Current hidden state becomes input to the subsequent layer; apply dropout
                lstm_input = F.dropout(tgt_hidden_states[layer_id],
                                       p=self.dropout_out,
                                       training=self.training)
            '''
            ___QUESTION-1-DESCRIBE-E-START___
            How is attention integrated into the decoder? Why is the attention function given the previous
            target state as one of its inputs? What is the purpose of the dropout layer?

            ANSWER E:
            When attention is enabled, the decoder uses the attentional hidden state computed by the attention
            layer to predict the next word in the translation sequence. When there is no attention, the decoder
            instead carries its own last hidden state forward as the input feed.
            The current target hidden state is one of the inputs to the attention function because it is compared
            against the source hidden states to compute the attention scores. Once the context vector is generated,
            the current target hidden state is also concatenated with the context vector to produce the attentional
            hidden state. (A minimal attention-scoring sketch follows this example.)
            Dropout:
            Large neural networks, especially those trained on small data sets, suffer from over-fitting, which
            increases the generalization error. The dropout layer is added as a form of regularization to avoid
            over-fitting.
            '''
            if self.attention is None:
                input_feed = tgt_hidden_states[-1]

            else:
                input_feed, step_attn_weights = self.attention(
                    tgt_hidden_states[-1], src_out, src_mask)

                attn_weights[:, j, :] = step_attn_weights

                if self.use_lexical_model:
                    # __QUESTION: Compute and collect LEXICAL MODEL context vectors here

                    # Attention-weighted average of the source embeddings for the
                    # current target step: [batch_size, 1, src_time_steps] x
                    # [batch_size, src_time_steps, embed_dim] -> [batch_size, embed_dim]
                    step_lex_context = torch.tanh(
                        torch.bmm(step_attn_weights.unsqueeze(dim=1),
                                  src_embeddings.transpose(0, 1)).squeeze(dim=1))

                    lexical_contexts.append(step_lex_context)

            input_feed = F.dropout(input_feed,
                                   p=self.dropout_out,
                                   training=self.training)
            rnn_outputs.append(input_feed)
            '''___QUESTION-1-DESCRIBE-E-END___'''

        # Cache previous states (only used during incremental, auto-regressive generation)
        utils.set_incremental_state(
            self, incremental_state, 'cached_state',
            (tgt_hidden_states, tgt_cell_states, input_feed))

        # Collect outputs across time steps
        decoder_output = torch.cat(rnn_outputs,
                                   dim=0).view(tgt_time_steps, batch_size,
                                               self.hidden_size)

        # Transpose batch back: [tgt_time_steps, batch_size, num_features] -> [batch_size, tgt_time_steps, num_features]
        decoder_output = decoder_output.transpose(0, 1)

        # Final projection
        decoder_output = self.final_projection(decoder_output)

        if self.use_lexical_model:
            # __QUESTION: Incorporate the LEXICAL MODEL into the prediction of target tokens here
            # Stack per-step lexical contexts: [batch_size, tgt_time_steps, embed_dim]
            lex_hid_out = torch.cat(lexical_contexts,
                                    dim=0).view(tgt_time_steps, batch_size,
                                                self.embed_dim).transpose(0, 1)

            # Feed-forward layer with a skip connection, then project to the vocabulary
            lex_out = torch.tanh(self.lexical_hidden(lex_hid_out)) + lex_hid_out

            decoder_output = decoder_output + self.lex_final(lex_out)

        return decoder_output, attn_weights
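To make the E answer above concrete, here is a minimal sketch of dot-product (Luong-style) attention scoring. It is a generic illustration, not the AttentionLayer used by these examples; all names and sizes are arbitrary.

import torch
import torch.nn.functional as F

src_time_steps, batch_size, hidden_size = 5, 2, 4

tgt_hidden = torch.randn(batch_size, hidden_size)               # current decoder state
src_out = torch.randn(src_time_steps, batch_size, hidden_size)  # encoder outputs

# Score every source position against the decoder state: [batch_size, src_time_steps]
scores = torch.bmm(src_out.transpose(0, 1),            # [batch, src_time, hidden]
                   tgt_hidden.unsqueeze(2)).squeeze(2)  # x [batch, hidden, 1] -> [batch, src_time]
attn_weights = F.softmax(scores, dim=1)

# Context vector: attention-weighted sum of encoder outputs -> [batch_size, hidden_size]
context = torch.bmm(attn_weights.unsqueeze(1),
                    src_out.transpose(0, 1)).squeeze(1)

# The context is typically combined with tgt_hidden to form the attentional hidden state
combine = torch.nn.Linear(2 * hidden_size, hidden_size)
attn_state = torch.tanh(combine(torch.cat([context, tgt_hidden], dim=1)))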
Example #13
0
    def forward(self, tgt_inputs, encoder_out, incremental_state=None):
        """ Performs the forward pass through the instantiated model. """
        # Optionally, feed decoder input token-by-token
        if incremental_state is not None:
            tgt_inputs = tgt_inputs[:, -1:]

        # __QUESTION : Following code is to assist with the LEXICAL MODEL implementation
        # Recover encoder input
        src_embeddings = encoder_out['src_embeddings']

        src_out, src_hidden_states, src_cell_states = encoder_out['src_out']
        src_mask = encoder_out['src_mask']
        src_time_steps = src_out.size(0)

        # Embed target tokens and apply dropout
        batch_size, tgt_time_steps = tgt_inputs.size()
        tgt_embeddings = self.embedding(tgt_inputs)
        tgt_embeddings = F.dropout(tgt_embeddings,
                                   p=self.dropout_in,
                                   training=self.training)

        # Transpose batch: [batch_size, tgt_time_steps, num_features] -> [tgt_time_steps, batch_size, num_features]
        tgt_embeddings = tgt_embeddings.transpose(0, 1)

        # Initialize previous states (or retrieve from cache during incremental generation)
        '''
        ___QUESTION-1-DESCRIBE-D-START___
        Describe how the decoder state is initialized. When is cached_state == None? What role does input_feed play?

        The hidden and cell states are initialized with zeros of size (batch_size, hidden_size).
        cached_state is None when no incremental state has been set. Incremental state is used during
        auto-regressive generation, when the decoder is fed one target token at a time instead of the
        entire sentence at once. Input feed's role is to take previous alignment decisions into account.
        '''
        cached_state = utils.get_incremental_state(self, incremental_state,
                                                   'cached_state')
        if cached_state is not None:
            tgt_hidden_states, tgt_cell_states, input_feed = cached_state
        else:
            tgt_hidden_states = [
                torch.zeros(tgt_inputs.size()[0], self.hidden_size)
                for i in range(len(self.layers))
            ]
            tgt_cell_states = [
                torch.zeros(tgt_inputs.size()[0], self.hidden_size)
                for i in range(len(self.layers))
            ]
            input_feed = tgt_embeddings.data.new(batch_size,
                                                 self.hidden_size).zero_()
        '''___QUESTION-1-DESCRIBE-D-END___'''

        # Initialize attention output node
        attn_weights = tgt_embeddings.data.new(batch_size, tgt_time_steps,
                                               src_time_steps).zero_()
        rnn_outputs = []

        # __QUESTION : Following code is to assist with the LEXICAL MODEL implementation
        # Cache lexical context vectors per translation time-step
        lexical_contexts = []

        for j in range(tgt_time_steps):
            # Concatenate the current token embedding with output from previous time step (i.e. 'input feeding')
            lstm_input = torch.cat([tgt_embeddings[j, :, :], input_feed],
                                   dim=1)

            for layer_id, rnn_layer in enumerate(self.layers):
                # Pass target input through the recurrent layer(s)
                tgt_hidden_states[layer_id], tgt_cell_states[layer_id] = \
                    rnn_layer(lstm_input, (tgt_hidden_states[layer_id], tgt_cell_states[layer_id]))

                # Current hidden state becomes input to the subsequent layer; apply dropout
                lstm_input = F.dropout(tgt_hidden_states[layer_id],
                                       p=self.dropout_out,
                                       training=self.training)
            '''
            ___QUESTION-1-DESCRIBE-E-START___
            How is attention integrated into the decoder? Why is the attention function given the previous
            target state as one of its inputs? What is the purpose of the dropout layer?

            The attention output becomes the input feed that the decoder combines with the collected attention
            weights to compute its final output. Since decoding is sequential, the previous target hidden state
            is needed to score the source positions and compute the current context vector. Dropout randomly
            disables neurons during training to avoid over-fitting, by preventing the network from relying on
            any fixed subset of neurons.
            '''
            if self.attention is None:
                input_feed = tgt_hidden_states[-1]
            else:
                input_feed, step_attn_weights = self.attention(
                    tgt_hidden_states[-1], src_out, src_mask)
                attn_weights[:, j, :] = step_attn_weights

                if self.use_lexical_model:
                    # __QUESTION: Compute and collect LEXICAL MODEL context vectors here

                    attn_transpose = step_attn_weights.transpose(1,
                                                                 0)  # [15,1]
                    attn_transpose = attn_transpose.unsqueeze(2)  # [15,1,1]
                    # src_embeddings [15, 1, 64]
                    af = torch.bmm(attn_transpose,
                                   src_embeddings)  # [15, 1, 64]
                    sumAf = torch.sum(af, dim=0)  # [1, 64]
                    f_t = torch.tanh(sumAf)  # [1, 64]
                    h_t = torch.tanh(self.lexical_ffnn(f_t)) + f_t  # [1,64]

                    lexical_contexts.append(h_t)

            input_feed = F.dropout(input_feed,
                                   p=self.dropout_out,
                                   training=self.training)
            rnn_outputs.append(input_feed)
            '''___QUESTION-1-DESCRIBE-E-END___'''

        # Cache previous states (only used during incremental, auto-regressive generation)
        utils.set_incremental_state(
            self, incremental_state, 'cached_state',
            (tgt_hidden_states, tgt_cell_states, input_feed))

        # rnn_outputs[0].shape  [1,128]
        # Collect outputs across time steps
        decoder_output = torch.cat(rnn_outputs, dim=0).view(
            tgt_time_steps, batch_size, self.hidden_size)  # [x++, 1, 128]

        # Transpose batch back: [tgt_time_steps, batch_size, num_features] -> [batch_size, tgt_time_steps, num_features]
        decoder_output = decoder_output.transpose(0, 1)  # [1, x++, 128]

        # Final projection
        decoder_output = self.final_projection(
            decoder_output)  #[ 1, x++, 3712]
        if self.use_lexical_model:
            # __QUESTION: Incorporate the LEXICAL MODEL into the prediction of target tokens here

            lexical_output_cat = torch.cat(lexical_contexts, dim=0).view(
                tgt_time_steps, batch_size, self.embed_dim)  # [x++, 1, 64]
            lexical_output = lexical_output_cat.transpose(0, 1)  # [1, x++, 64]
            lexical_proj = self.lexical_projection(lexical_output)

            decoder_output = decoder_output + lexical_proj
        return decoder_output, attn_weights
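The per-step lexical vector in the example above follows the residual pattern h_t = tanh(W_l f_t) + f_t, where f_t is the attention-weighted average of the source embeddings. A toy sketch of just that transformation, with the module name and sizes invented for illustration (the nn.Linear here merely stands in for the example's self.lexical_ffnn):

import torch
import torch.nn as nn

embed_dim = 64
lexical_ffnn = nn.Linear(embed_dim, embed_dim, bias=False)  # stand-in for self.lexical_ffnn

f_t = torch.tanh(torch.randn(1, embed_dim))   # weighted-average source context for one step
h_t = torch.tanh(lexical_ffnn(f_t)) + f_t     # feed-forward output plus skip connection

print(h_t.shape)  # torch.Size([1, 64])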
Example #14
0
    def forward(self, tgt_inputs, encoder_out, incremental_state=None):
        """ Performs the forward pass through the instantiated model. """
        # Optionally, feed decoder input token-by-token
        if incremental_state is not None:
            tgt_inputs = tgt_inputs[:, -1:]

        # __QUESTION-5 : Following code is to assist with the LEXICAL MODEL implementation
        # Recover encoder input
        src_embeddings = encoder_out['src_embeddings']

        src_out, src_hidden_states, src_cell_states = encoder_out['src_out']
        src_mask = encoder_out['src_mask']
        src_time_steps = src_out.size(0)

        # Embed target tokens and apply dropout
        batch_size, tgt_time_steps = tgt_inputs.size()
        tgt_embeddings = self.embedding(tgt_inputs)
        tgt_embeddings = F.dropout(tgt_embeddings,
                                   p=self.dropout_in,
                                   training=self.training)

        # Transpose batch: [batch_size, tgt_time_steps, num_features] -> [tgt_time_steps, batch_size, num_features]
        tgt_embeddings = tgt_embeddings.transpose(0, 1)

        # Initialize previous states (or retrieve from cache during incremental generation)
        '''
        ___QUESTION-1-DESCRIBE-D-START___
        Describe how the decoder state is initialized. When is cached_state == None? What role does input_feed play?
        '''
        cached_state = utils.get_incremental_state(self, incremental_state,
                                                   'cached_state')
        if cached_state is not None:
            tgt_hidden_states, tgt_cell_states, input_feed = cached_state
        else:
            tgt_hidden_states = [
                torch.zeros(tgt_inputs.size()[0], self.hidden_size)
                for i in range(len(self.layers))
            ]
            tgt_cell_states = [
                torch.zeros(tgt_inputs.size()[0], self.hidden_size)
                for i in range(len(self.layers))
            ]
            input_feed = tgt_embeddings.data.new(batch_size,
                                                 self.hidden_size).zero_()
        '''___QUESTION-1-DESCRIBE-D-END___'''

        # Initialize attention output node
        attn_weights = tgt_embeddings.data.new(batch_size, tgt_time_steps,
                                               src_time_steps).zero_()
        rnn_outputs = []

        # __QUESTION-5 : Following code is to assist with the LEXICAL MODEL implementation
        # Cache lexical context vectors per translation time-step
        lexical_contexts = []

        for j in range(tgt_time_steps):
            # Concatenate the current token embedding with output from previous time step (i.e. 'input feeding')
            lstm_input = torch.cat([tgt_embeddings[j, :, :], input_feed],
                                   dim=1)

            for layer_id, rnn_layer in enumerate(self.layers):
                # Pass target input through the recurrent layer(s)
                tgt_hidden_states[layer_id], tgt_cell_states[layer_id] = \
                    rnn_layer(lstm_input, (tgt_hidden_states[layer_id], tgt_cell_states[layer_id]))

                # Current hidden state becomes input to the subsequent layer; apply dropout
                lstm_input = F.dropout(tgt_hidden_states[layer_id],
                                       p=self.dropout_out,
                                       training=self.training)
            '''
            ___QUESTION-1-DESCRIBE-E-START___
            How is attention integrated into the decoder? Why is the attention function given the previous
            target state as one of its inputs? What is the purpose of the dropout layer?
            '''
            if self.attention is None:
                input_feed = tgt_hidden_states[-1]
            else:
                input_feed, step_attn_weights = self.attention(
                    tgt_hidden_states[-1], src_out, src_mask)
                attn_weights[:, j, :] = step_attn_weights

                if self.use_lexical_model:
                    #Step attention weights of dimension (Batch size, Src time steps) -> (src time steps, batch size, 1)
                    #Src embedding of dimension (Src time steps, batch size, embedding dim)

                    #Weighted average = sum(attention_at_time(t) * word_at_time(t))
                    #so weighted average has dim (batchsize, embedding dimension)
                    weighted_average = torch.sum(
                        step_attn_weights.transpose(0, 1).unsqueeze(2) *
                        src_embeddings,
                        axis=0)
                    lexical_contexts.append(weighted_average)

            input_feed = F.dropout(input_feed,
                                   p=self.dropout_out,
                                   training=self.training)
            rnn_outputs.append(input_feed)
            '''___QUESTION-1-DESCRIBE-E-END___'''

        # Cache previous states (only used during incremental, auto-regressive generation)
        utils.set_incremental_state(
            self, incremental_state, 'cached_state',
            (tgt_hidden_states, tgt_cell_states, input_feed))

        # Collect outputs across time steps
        decoder_output = torch.cat(rnn_outputs,
                                   dim=0).view(tgt_time_steps, batch_size,
                                               self.hidden_size)

        # Transpose batch back: [tgt_time_steps, batch_size, num_features] -> [batch_size, tgt_time_steps, num_features]
        decoder_output = decoder_output.transpose(0, 1)

        # Final projection
        decoder_output = self.final_projection(decoder_output)

        if self.use_lexical_model:
            #List of tensors of dimension (batchsize, embedding) with output_length elements
            #Stacking gives a tensor with (output_length elements, batchsize, embedding) -> (batchsize, output_length, embedding)
            lexical_contexts = torch.stack(lexical_contexts).transpose(0, 1)
            proj_contexts = torch.tanh(
                self.lexical_projection_1(lexical_contexts)) + lexical_contexts
            proj_contexts = self.lexical_projection_2(proj_contexts)
            decoder_output += proj_contexts

        return decoder_output, attn_weights
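Example #14 combines the lexical path with the main decoder path purely at the logit level: both are projected to vocabulary size and summed before the softmax. Below is a compact standalone sketch of that combination; all module names and sizes are illustrative, not the modules defined by the example.

import torch
import torch.nn as nn
import torch.nn.functional as F

batch_size, tgt_time_steps, hidden_size, embed_dim, vocab_size = 2, 3, 8, 6, 10

final_projection = nn.Linear(hidden_size, vocab_size)    # main decoder output layer
lexical_projection_1 = nn.Linear(embed_dim, embed_dim)   # hidden lexical layer
lexical_projection_2 = nn.Linear(embed_dim, vocab_size)  # lexical output layer

decoder_states = torch.randn(batch_size, tgt_time_steps, hidden_size)
lexical_contexts = torch.randn(batch_size, tgt_time_steps, embed_dim)

decoder_logits = final_projection(decoder_states)
lexical_hidden = torch.tanh(lexical_projection_1(lexical_contexts)) + lexical_contexts
combined_logits = decoder_logits + lexical_projection_2(lexical_hidden)

# Token probabilities come from the summed logits
probs = F.softmax(combined_logits, dim=-1)
print(probs.shape)  # torch.Size([2, 3, 10])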