class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidirectional LSTM Encoder
        - Unidirectional LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # default values
        self.encoder = None
        self.decoder = None
        self.h_projection = None
        self.c_projection = None
        self.att_projection = None
        self.combined_output_projection = None
        self.target_vocab_projection = None
        self.dropout = None

        ### YOUR CODE HERE (~8 Lines)
        ### TODO - Initialize the following variables:
        ###     self.encoder (Bidirectional LSTM with bias)
        ###     self.decoder (LSTM Cell with bias)
        ###     self.h_projection (Linear Layer with no bias), called W_{h} in the PDF.
        ###     self.c_projection (Linear Layer with no bias), called W_{c} in the PDF.
        ###     self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF.
        ###     self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF.
        ###     self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF.
        ###     self.dropout (Dropout Layer)
        ###
        ### Use the following docs to properly initialize these variables:
        ###     LSTM:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
        ###     LSTM Cell:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell
        ###     Linear Layer:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
        ###     Dropout Layer:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout

        self.encoder = nn.LSTM(embed_size,
                               self.hidden_size,
                               bidirectional=True)
        self.decoder = nn.LSTMCell(embed_size + self.hidden_size,
                                   self.hidden_size)
        self.h_projection = nn.Linear(2 * self.hidden_size,
                                      self.hidden_size,
                                      bias=False)  # W_h: projects [h_fwd; h_bwd] (2h) down to h
        self.c_projection = nn.Linear(2 * self.hidden_size,
                                      self.hidden_size,
                                      bias=False)  # W_c: same projection for the cell state
        self.att_projection = nn.Linear(self.hidden_size * 2,
                                        self.hidden_size,
                                        bias=False)
        self.combined_output_projection = nn.Linear(self.hidden_size * 3,
                                                    self.hidden_size,
                                                    bias=False)
        self.target_vocab_projection = nn.Linear(self.hidden_size,
                                                 len(self.vocab.tgt),
                                                 bias=False)
        self.dropout = nn.Dropout(p=dropout_rate)
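        # Shape summary: the encoder consumes embeddings of size e and emits 2h-dimensional
        # states (forward + backward), while the decoder LSTMCell consumes [Y_t; o_{t-1}]
        # of size e + h and maintains hidden/cell states of size h.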
        ### END YOUR CODE

    def forward(self, source: List[List[str]],
                target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`

        @returns scores (Tensor): a variable/tensor of shape (b, ) representing the
                                    log-likelihood of generating the gold-standard target sentence for
                                    each example in the input batch. Here b = batch size.
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Convert list of lists into tensors
        source_padded = self.vocab.src.to_input_tensor(
            source, device=self.device)  # Tensor: (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(
            target, device=self.device)  # Tensor: (tgt_len, b)

        ###     Run the network forward:
        ###     1. Apply the encoder to `source_padded` by calling `self.encode()`
        ###     2. Generate sentence masks for `source_padded` by calling `self.generate_sent_masks()`
        ###     3. Apply the decoder to compute combined-output by calling `self.decode()`
        ###     4. Compute log probability distribution over the target vocabulary using the
        ###        combined_outputs returned by the `self.decode()` function.

        enc_hiddens, dec_init_state = self.encode(source_padded,
                                                  source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state,
                                       target_padded)
        P = F.log_softmax(self.target_vocab_projection(combined_outputs),
                          dim=-1)
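        # P holds, at every decoding step, the model's log-distribution over the target
        # vocabulary; it is aligned with target_padded[1:] in the gather below.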

        # Zero out probabilities for which we have nothing in the target text
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Compute log probability of generating true target words
        target_gold_words_log_prob = torch.gather(
            P, index=target_padded[1:].unsqueeze(-1),
            dim=-1).squeeze(-1) * target_masks[1:]
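        # gather picks out, at each step, the log-probability assigned to the gold next
        # word; multiplying by the mask zeroes the contribution of padding positions.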
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores

    def encode(
        self, source_padded: torch.Tensor, source_lengths: List[int]
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
            Additionally, take the final states of the encoder and project them to obtain initial states for decoder.

        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where
                                        b = batch_size, src_len = maximum source sentence length. Note that 
                                       these have already been sorted in order of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch
        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
                                        b = batch size, src_len = maximum source sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
                                                hidden state and cell.
        """
        enc_hiddens, dec_init_state = None, None

        ### YOUR CODE HERE (~ 8 Lines)
        ### TODO:
        ###     1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings.
        ###         src_len = maximum source sentence length, b = batch size, e = embedding size. Note
        ###         that there is no initial hidden state or cell for the decoder.
        ###     2. Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`.
        ###         - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X.
        ###         - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens.
        ###         - Note that the shape of the tensor returned by the encoder is (src_len, b, h*2) and we want to
        ###           return a tensor of shape (b, src_len, h*2) as `enc_hiddens`.
        ###     3. Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell):
        ###         - `init_decoder_hidden`:
        ###             `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
        ###             Apply the h_projection layer to this in order to compute init_decoder_hidden.
        ###             This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size
        ###         - `init_decoder_cell`:
        ###             `last_cell` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
        ###             Apply the c_projection layer to this in order to compute init_decoder_cell.
        ###             This is c_0^{dec} in the PDF. Here b = batch size, h = hidden size
        ###
        ### See the following docs, as you may need to use some of the following functions in your implementation:
        ###     Pack the padded sequence X before passing to the encoder:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pack_padded_sequence
        ###     Pad the packed sequence, enc_hiddens, returned by the encoder:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pad_packed_sequence
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tensor Permute:
        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.permute

        X = self.model_embeddings.source(source_padded)
        packed = nn.utils.rnn.pack_padded_sequence(X, source_lengths)
        enc_hiddens, (last_hidden, last_cell) = self.encoder(packed)
        enc_hiddens, _ = nn.utils.rnn.pad_packed_sequence(enc_hiddens,
                                                          batch_first=True)
        # Concatenate the forward (index 0) and backward (index 1) final states, then project 2h -> h.
        init_decoder_hidden = self.h_projection(
            torch.cat((last_hidden[0], last_hidden[1]), 1))
        init_decoder_cell = self.c_projection(
            torch.cat((last_cell[0], last_cell[1]), 1))
        dec_init_state = (init_decoder_hidden, init_decoder_cell)
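        # At this point enc_hiddens is (b, src_len, 2h) and dec_init_state is a pair of
        # (b, h) tensors for the decoder's initial hidden and cell states.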

        ### END YOUR CODE

        return enc_hiddens, dec_init_state

    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
               dec_init_state: Tuple[torch.Tensor, torch.Tensor],
               target_padded: torch.Tensor) -> torch.Tensor:
        """Compute combined output vectors for a batch.

        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
                                     b = batch size, src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where
                                       tgt_len = maximum target sentence length, b = batch size. 

        @returns combined_outputs (Tensor): combined output tensor  (tgt_len, b,  h), where
                                        tgt_len = maximum target sentence length, b = batch_size,  h = hidden size
        """
        # Chop off the <END> token for max length sentences.
        target_padded = target_padded[:-1]

        # Initialize the decoder state (hidden and cell)
        dec_state = dec_init_state

        # Initialize previous combined output vector o_{t-1} as zero
        batch_size = enc_hiddens.size(0)
        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

        # Initialize a list we will use to collect the combined output o_t on each step
        combined_outputs = []

        ### YOUR CODE HERE (~9 Lines)
        ### TODO:
        ###     1. Apply the attention projection layer to `enc_hiddens` to obtain `enc_hiddens_proj`,
        ###         which should be shape (b, src_len, h),
        ###         where b = batch size, src_len = maximum source length, h = hidden size.
        ###         This is applying W_{attProj} to h^enc, as described in the PDF.
        ###     2. Construct tensor `Y` of target sentences with shape (tgt_len, b, e) using the target model embeddings.
        ###         where tgt_len = maximum target sentence length, b = batch size, e = embedding size.
        ###     3. Use the torch.split function to iterate over the time dimension of Y.
        ###         Within the loop, this will give you Y_t of shape (1, b, e) where b = batch size, e = embedding size.
        ###             - Squeeze Y_t into a tensor of dimension (b, e).
        ###             - Construct Ybar_t by concatenating Y_t with o_prev.
        ###             - Use the step function to compute the Decoder's next (cell, state) values
        ###               as well as the new combined output o_t.
        ###             - Append o_t to combined_outputs
        ###             - Update o_prev to the new o_t.
        ###     4. Use torch.stack to convert combined_outputs from a list length tgt_len of
        ###         tensors shape (b, h), to a single tensor shape (tgt_len, b, h)
        ###         where tgt_len = maximum target sentence length, b = batch size, h = hidden size.
        ###
        ### Note:
        ###    - When using the squeeze() function make sure to specify the dimension you want to squeeze
        ###      over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
        ###
        ### Use the following docs to implement this functionality:
        ###     Zeros Tensor:
        ###         https://pytorch.org/docs/stable/torch.html#torch.zeros
        ###     Tensor Splitting (iteration):
        ###         https://pytorch.org/docs/stable/torch.html#torch.split
        ###     Tensor Dimension Squeezing:
        ###         https://pytorch.org/docs/stable/torch.html#torch.squeeze
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tensor Stacking:
        ###         https://pytorch.org/docs/stable/torch.html#torch.stack

        enc_hiddens_proj = self.att_projection(enc_hiddens)   # (b, src_len, h)
        Y = self.model_embeddings.target(target_padded)       # (tgt_len, b, e)
        Y_split = torch.split(Y, 1, dim=0)
        for Y_t in Y_split:
            Y_t = torch.squeeze(Y_t, 0)                        # (b, e)
            Ybar_t = torch.cat((Y_t, o_prev), 1)               # (b, e + h)
            dec_state, o_t, _ = self.step(Ybar_t, dec_state, enc_hiddens,
                                          enc_hiddens_proj, enc_masks)
            combined_outputs.append(o_t)
            o_prev = o_t
        combined_outputs = torch.stack(combined_outputs)       # (tgt_len, b, h)
        ### END YOUR CODE

        return combined_outputs

    def step(
            self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor,
                                                         torch.Tensor],
            enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor
    ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length. 

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution.
                                Note: You will not use this outside of this function.
                                      We are simply returning this value so that we can sanity check
                                      your implementation.
        """

        combined_output = None

        ### YOUR CODE HERE (~3 Lines)
        ### TODO:
        ###     1. Apply the decoder to `Ybar_t` and `dec_state` to obtain the new dec_state.
        ###     2. Split dec_state into its two parts (dec_hidden, dec_cell)
        ###     3. Compute the attention scores e_t, a Tensor shape (b, src_len).
        ###        Note: b = batch_size, src_len = maximum source length, h = hidden size.
        ###
        ###       Hints:
        ###         - dec_hidden is shape (b, h) and corresponds to h^dec_t in the PDF (batched)
        ###         - enc_hiddens_proj is shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched).
        ###         - Use batched matrix multiplication (torch.bmm) to compute e_t.
        ###         - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing.
        ###         - When using the squeeze() function make sure to specify the dimension you want to squeeze
        ###             over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
        ###
        ### Use the following docs to implement this functionality:
        ###     Batch Multiplication:
        ###        https://pytorch.org/docs/stable/torch.html#torch.bmm
        ###     Tensor Unsqueeze:
        ###         https://pytorch.org/docs/stable/torch.html#torch.unsqueeze
        ###     Tensor Squeeze:
        ###         https://pytorch.org/docs/stable/torch.html#torch.squeeze
        dec_state = self.decoder(Ybar_t, dec_state)
        (dec_hidden, dec_cell) = dec_state
        # Attention scores: (b, src_len, h) x (b, h, 1) -> (b, src_len, 1) -> (b, src_len)
        e_t = torch.squeeze(
            torch.bmm(enc_hiddens_proj, torch.unsqueeze(dec_hidden, 2)), 2)
        ### END YOUR CODE

        # Set e_t to -inf where enc_masks has 1
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))
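        # (After the softmax below, those -inf scores become zero attention weights,
        #  so padded source positions contribute nothing to the attention output a_t.)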

        ### YOUR CODE HERE (~6 Lines)
        ### TODO:
        ###     1. Apply softmax to e_t to yield alpha_t
        ###     2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the
        ###         attention output vector, a_t.
        ###       Hints:
        ###           - alpha_t is shape (b, src_len)
        ###           - enc_hiddens is shape (b, src_len, 2h)
        ###           - a_t should be shape (b, 2h)
        ###           - You will need to do some squeezing and unsqueezing.
        ###     Note: b = batch size, src_len = maximum source length, h = hidden size.
        ###
        ###     3. Concatenate dec_hidden with a_t to compute tensor U_t
        ###     4. Apply the combined output projection layer to U_t to compute tensor V_t
        ###     5. Compute tensor O_t by first applying the Tanh function and then the dropout layer.
        ###
        ### Use the following docs to implement this functionality:
        ###     Softmax:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.functional.softmax
        ###     Batch Multiplication:
        ###        https://pytorch.org/docs/stable/torch.html#torch.bmm
        ###     Tensor View:
        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tanh:
        ###         https://pytorch.org/docs/stable/torch.html#torch.tanh
        alpha_t = F.softmax(e_t, dim=1)                               # (b, src_len)
        a_t = torch.squeeze(
            torch.bmm(torch.unsqueeze(alpha_t, 1), enc_hiddens), 1)   # (b, 2h)
        U_t = torch.cat((a_t, dec_hidden), 1)                         # (b, 3h)
        V_t = self.combined_output_projection(U_t)                    # (b, h)
        O_t = self.dropout(torch.tanh(V_t))
        ### END YOUR CODE

        combined_output = O_t
        return dec_state, combined_output, e_t

    def generate_sent_masks(self, enc_hiddens: torch.Tensor,
                            source_lengths: List[int]) -> torch.Tensor:
        """ Generate sentence masks for encoder hidden states.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size. 
        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.
        
        @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len),
                                    where b = batch size, src_len = max source length.
        """
        enc_masks = torch.zeros(enc_hiddens.size(0),
                                enc_hiddens.size(1),
                                dtype=torch.float)
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1
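        # e.g. source_lengths = [3, 2] with src_len = 3 yields
        #   [[0., 0., 0.],
        #    [0., 0., 1.]]
        # marking the pad positions that step() later fills with -inf before the softmax.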
        return enc_masks.to(self.device)

    def beam_search(self,
                    src_sent: List[str],
                    beam_size: int = 5,
                    max_decoding_time_step: int = 70) -> List[Hypothesis]:
        """ Given a single source sentence, perform beam search, yielding translations in the target language.
        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
        @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var,
                                                  [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        eos_id = self.vocab.tgt['</s>']

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses),
                                 dtype=torch.float,
                                 device=self.device)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses
                  ) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            exp_src_encodings = src_encodings.expand(hyp_num,
                                                     src_encodings.size(1),
                                                     src_encodings.size(2))

            exp_src_encodings_att_linear = src_encodings_att_linear.expand(
                hyp_num, src_encodings_att_linear.size(1),
                src_encodings_att_linear.size(2))

            y_tm1 = torch.tensor(
                [self.vocab.tgt[hyp[-1]] for hyp in hypotheses],
                dtype=torch.long,
                device=self.device)
            y_t_embed = self.model_embeddings.target(y_tm1)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            (h_t, cell_t), att_t, _ = self.step(x,
                                                h_tm1,
                                                exp_src_encodings,
                                                exp_src_encodings_att_linear,
                                                enc_masks=None)

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t),
                                    dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            continuing_hyp_scores = (
                hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(
                continuing_hyp_scores, k=live_hyp_num)

            # Floor division keeps the indices integral on newer PyTorch versions
            prev_hyp_ids = top_cand_hyp_pos // len(self.vocab.tgt)
            hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt)
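            # Each flattened candidate index encodes (previous hypothesis, next word) as
            # prev_hyp_id * |V_tgt| + hyp_word_id; the division/modulo above recover both parts.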

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(
                    prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    completed_hypotheses.append(
                        Hypothesis(value=new_hyp_sent[1:-1],
                                   score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            live_hyp_ids = torch.tensor(live_hyp_ids,
                                        dtype=torch.long,
                                        device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores,
                                      dtype=torch.float,
                                      device=self.device)

        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(
                Hypothesis(value=hypotheses[0][1:],
                           score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

        return completed_hypotheses

    @property
    def device(self) -> torch.device:
        """ Determine which device to place the Tensors upon, CPU or GPU.
        """
        return self.model_embeddings.source.weight.device

    @staticmethod
    def load(model_path: str):
        """ Load the model from a file.
        @param model_path (str): path to model
        """
        params = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
        args = params['args']
        model = NMT(vocab=params['vocab'], **args)
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ Save the odel to a file.
        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args':
            dict(embed_size=self.model_embeddings.embed_size,
                 hidden_size=self.hidden_size,
                 dropout_rate=self.dropout_rate),
            'vocab':
            self.vocab,
            'state_dict':
            self.state_dict()
        }

        torch.save(params, path)
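

# A minimal training-loop sketch (illustrative only, not part of the assignment skeleton).
# It assumes `vocab` is a Vocab object loaded via vocab.py and that `train_batches` yields
# (src_sents, tgt_sents) pairs of token lists with src_sents sorted longest-to-shortest
# (as encode() requires); these names are placeholders.
def train_sketch(vocab, train_batches, num_epochs=1, save_path='model.bin'):
    model = NMT(embed_size=256, hidden_size=256, vocab=vocab, dropout_rate=0.2)
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    model.train()
    for _ in range(num_epochs):
        for src_sents, tgt_sents in train_batches:
            optimizer.zero_grad()
            scores = model(src_sents, tgt_sents)   # (b,) per-sentence log-likelihoods
            loss = -scores.sum() / len(src_sents)  # average negative log-likelihood
            loss.backward()
            optimizer.step()
    model.save(save_path)
    return model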


# Example 2

class QGModel(nn.Module):
    def __init__(self,
                 vocab,
                 embed_size,
                 hidden_size,
                 enc_bidir,
                 attn_size,
                 dropout=0.2):
        super(QGModel, self).__init__()
        self.vocab = vocab
        self.args = {
            'embed_size': embed_size,
            'hidden_size': hidden_size,
            'dropout': dropout,
            'enc_bidir': enc_bidir,
            'attn_size': attn_size
        }
        self.embeddings = ModelEmbeddings(embed_size, vocab)
        self.encoder = Encoder(embed_size, hidden_size, dropout, enc_bidir)
        self.decoder_init_hidden_proj = nn.Linear(self.encoder.hidden_size,
                                                  hidden_size)
        self.decoder = Decoder(embed_size, hidden_size, attn_size,
                               len(vocab.tgt), dropout)

    def batch_to_tensor(self, source, target):
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Convert list of lists into tensors
        source_padded = self.vocab.src.to_input_tensor(
            source, device=self.device)  # Tensor: (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(
            target, device=self.device)  # Tensor: (tgt_len, b)
        source_mask = self.generate_mask(source_lengths,
                                         source_padded.shape[0])
        return source_padded, target_padded, source_lengths, source_mask

    def forward(self, source: List[List[str]], target: List[List[str]]):
        source_padded, target_padded, source_lengths, source_mask = self.batch_to_tensor(
            source, target)

        source_embedding = self.embeddings.source(
            source_padded)  # (src_len, b, embed_size)
        target_embedding = self.embeddings.target(
            target_padded)  # (tgt_len, B, embed_size)
        memory, last_hidden = self.encoder(source_embedding, source_lengths)
        # last_hidden: (B, hidden)
        memory = memory.transpose(0, 1)  # memory: (B, src_len, hidden)
        dec_init_hidden = torch.tanh(
            self.decoder_init_hidden_proj(last_hidden))
        gen_output = self.decoder(memory, source_mask, target_embedding,
                                  dec_init_hidden)
        # (tgt_len - 1, B, word_vocab_size), not probability
        P = F.log_softmax(gen_output, dim=-1)

        # Zero out probabilities for which we have nothing in the target text
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Compute log probability of generating true target words
        target_gold_words_log_prob = torch.gather(
            P, index=target_padded[1:].unsqueeze(-1),
            dim=-1).squeeze(-1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores

    def generate_mask(self, length, max_length):
        mask = torch.zeros(len(length),
                           max_length,
                           dtype=torch.int,
                           device=self.device)
        for i, x in enumerate(length):
            mask[i, x:] = 1
        return mask

    def beam_search(self,
                    src_sent: List[str],
                    beam_size: int = 5,
                    max_decoding_time_step: int = 70):
        """
        :param batch: batch size is 1
        :param beam_size:
        :return:
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)
        src_len = torch.tensor([len(src_sent)],
                               dtype=torch.int,
                               device=self.device)
        source_embedding = self.embeddings.source(
            src_sents_var)  # (src_len, b, embed_size)

        memory, last_hidden = self.encoder(source_embedding, src_len)
        # last_hidden: (B, hidden)
        memory = memory.transpose(0, 1)  # memory: (B, src_len, hidden)
        dec_init_hidden = torch.tanh(
            self.decoder_init_hidden_proj(last_hidden))  # (B, hidden)
        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses),
                                 dtype=torch.float,
                                 device=self.device)
        completed_hypotheses = []
        t = 0
        ctxt_tm1 = torch.zeros(len(hypotheses),
                               self.args['hidden_size'],
                               device=self.device)
        dec_hidden_tm1 = dec_init_hidden
        while len(completed_hypotheses
                  ) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)
            prev_word = torch.tensor(
                [self.vocab.tgt[x[-1]] for x in hypotheses],
                dtype=torch.long,
                device=self.device)
            tgt_tm1 = self.embeddings.target(prev_word)  # (B, word_embed_size)

            memory_tm1 = memory.expand((hyp_num, *memory.shape[1:]))
            gen_t, dec_hidden_t, ctxt_t = self.decoder.decode_step(
                tgt_tm1, ctxt_tm1, dec_hidden_tm1, memory_tm1)
            gen_t = torch.log_softmax(gen_t, dim=-1)  # (B, vocab)
            live_hyp_num = beam_size - len(completed_hypotheses)
            continuing_hyp_scores = (
                hyp_scores.unsqueeze(1).expand_as(gen_t) + gen_t).view(
                    -1)  # (hyp_num * V)
            top_candi_scores, top_candi_position = torch.topk(
                continuing_hyp_scores, k=live_hyp_num)
            # Floor division keeps the indices integral on newer PyTorch versions
            prev_hyp_indexes = top_candi_position // len(self.vocab.tgt)
            hyp_word_indexes = top_candi_position % len(self.vocab.tgt)

            new_hypothesis = []
            live_hyp_index = []
            new_hyp_scores = []
            num_unk = 0
            for prev_hyp_index, hyp_word_index, new_hyp_score in zip(
                    prev_hyp_indexes, hyp_word_indexes, top_candi_scores):
                prev_hyp_index = prev_hyp_index.item()
                hyp_word_index = hyp_word_index.item()
                new_hyp_score = new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_index]
                new_hypo = hypotheses[prev_hyp_index] + [hyp_word]
                if hyp_word == '</s>':
                    completed_hypotheses.append(
                        Hypothesis(value=new_hypo[1:-1], score=new_hyp_score))
                else:
                    new_hypothesis.append(new_hypo)
                    live_hyp_index.append(prev_hyp_index)
                    new_hyp_scores.append(new_hyp_score)
            if len(completed_hypotheses) == beam_size:
                break
            live_hyp_index = torch.tensor(live_hyp_index,
                                          dtype=torch.long,
                                          device=self.device)
            dec_hidden_tm1 = dec_hidden_t[live_hyp_index]  # carry the *new* decoder hidden state forward for surviving hypotheses
            ctxt_tm1 = ctxt_t[live_hyp_index]

            hypotheses = new_hypothesis
            hyp_scores = torch.tensor(new_hyp_scores,
                                      dtype=torch.float,
                                      device=self.device)

        has_comp = True
        if len(completed_hypotheses) == 0:
            has_comp = False
            completed_hypotheses.append(
                Hypothesis(value=hypotheses[0][1:],
                           score=hyp_scores[0].item()))
        completed_hypotheses.sort(key=lambda x: x.score, reverse=True)
        return completed_hypotheses, has_comp

    @property
    def device(self):
        return self.decoder_init_hidden_proj.weight.device

    def save(self, path):
        path = path + ".qg"
        dir = Path(path).parent
        dir.mkdir(parents=True, exist_ok=True)
        state_dict = {}
        state_dict['vocab'] = self.vocab
        state_dict['args'] = self.args
        state_dict['model_state'] = self.state_dict()
        torch.save(state_dict, path)

    @staticmethod
    def load(path, device):
        params = torch.load(path, map_location=device)

        model = QGModel(vocab=params['vocab'],
                        **params['args'])  # type:nn.Module
        model.load_state_dict(params['model_state'])
        return model.to(device)
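

# A minimal inference sketch (illustrative only). It assumes a checkpoint written by
# QGModel.save() exists at `path` (note that save() appends ".qg") and that `src_sent`
# is a tokenized source sentence (list of words); the names below are placeholders.
def generate_question_sketch(path, src_sent, device=torch.device('cpu')):
    model = QGModel.load(path, device)
    model.eval()
    with torch.no_grad():
        hypotheses, has_complete = model.beam_search(src_sent, beam_size=5,
                                                     max_decoding_time_step=70)
    # hypotheses are sorted by score, so the best decoded question comes first
    return hypotheses[0].value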


# Example 3

class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidirectional LSTM Encoder
        - Unidirectional LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """

    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # default values
        self.encoder = None
        self.decoder = None
        self.h_projection = None
        self.c_projection = None
        self.att_projection = None
        self.combined_output_projection = None
        self.target_vocab_projection = None
        self.dropout = None

        ###
        ### YOUR CODE HERE (~8 Lines)
        ###
        self.embed_size = embed_size
        self.encoder = nn.LSTM(self.embed_size, self.hidden_size, bias=True, bidirectional=True)  # bidirectional=True already doubles the output to 2*hidden_size, so hidden_size stays as-is
        self.decoder = nn.LSTMCell(self.hidden_size + embed_size, self.hidden_size, bias=True)  # input is the current word embedding concatenated with the previous combined output o_{t-1}
        self.h_projection = nn.Linear(2*self.hidden_size, self.hidden_size, bias=False)  # W_h
        self.c_projection = nn.Linear(2*self.hidden_size, self.hidden_size, bias=False)  # W_c
        self.att_projection = nn.Linear(2*self.hidden_size, self.hidden_size, bias=False)  # W_attProj, applied to each h^enc_i; the product with h^dec_t happens later in step()
        self.combined_output_projection = nn.Linear(3*self.hidden_size, self.hidden_size, bias=False)  # W_u
        self.target_vocab_projection = nn.Linear(self.hidden_size, len(self.vocab.tgt), bias=False)  # W_vocab; len(self.vocab.tgt) is the target vocabulary size
        self.dropout = nn.Dropout(self.dropout_rate)  # dropout layer
        ###
        ### END YOUR CODE
        ###
        '''TODO - Initialize the following variables:
             self.encoder (Bidirectional LSTM with bias)
             self.decoder (LSTM Cell with bias)
             self.h_projection (Linear Layer with no bias), called W_{h} in the PDF.
             self.c_projection (Linear Layer with no bias), called W_{c} in the PDF.
             self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF.
             self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF.
             self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF.
             self.dropout (Dropout Layer)
        
         Use the following docs to properly initialize these variables:
             LSTM:
                 https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
             LSTM Cell:
                 https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell
             Linear Layer:
                 https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
             Dropout Layer:
                 https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout '''





    def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`

        @returns scores (Tensor): a variable/tensor of shape (b, ) representing the
                                    log-likelihood of generating the gold-standard target sentence for
                                    each example in the input batch. Here b = batch size.
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Convert list of lists into tensors
        source_padded = self.vocab.src.to_input_tensor(source, device=self.device)  # Tensor: (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(target, device=self.device)  # Tensor: (tgt_len, b)

        ###     Run the network forward:
        ###     1. Apply the encoder to `source_padded` by calling `self.encode()`
        ###     2. Generate sentence masks for `source_padded` by calling `self.generate_sent_masks()`
        ###     3. Apply the decoder to compute combined-output by calling `self.decode()`
        ###     4. Compute log probability distribution over the target vocabulary using the
        ###        combined_outputs returned by the `self.decode()` function.

        enc_hiddens, dec_init_state = self.encode(source_padded, source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded)
        P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1)

        # Zero out probabilities for which we have nothing in the target text
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Compute log probability of generating true target words
        target_gold_words_log_prob = torch.gather(P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(
            -1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores





    def encode(self, source_padded: torch.Tensor, source_lengths: List[int]) -> Tuple[
        torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
            Additionally, take the final states of the encoder and project them to obtain initial states for decoder.

        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where
                                        b = batch_size, src_len = maximum source sentence length. Note that 
                                       these have already been sorted in order of longest to shortest sentence.

        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch

        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
                                        b = batch size, src_len = maximum source sentence length, h = hidden size.

        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
                                                hidden state and cell.
        """
        enc_hiddens, dec_init_state = None, None
        ###
        ### YOUR CODE HERE (~ 8 Lines)
        ###
        X = self.model_embeddings.source(source_padded)  # the embedding module looks up embeddings: (src_len, b, e)
        Xpacked = nn.utils.rnn.pack_padded_sequence(X, source_lengths)  # pack using the true sequence lengths
        enc_hiddens, (last_hidden, last_cell) = self.encoder(Xpacked)
        enc_hiddens = nn.utils.rnn.pad_packed_sequence(enc_hiddens)  # returns (padded tensor, lengths); keep only the tensor
        enc_hiddens = enc_hiddens[0].permute(1, 0, 2)  # (src_len, b, 2h) -> (b, src_len, 2h)
        last_hidden = last_hidden.split(1, 0)  # split the forward/backward direction dimension into a tuple
        last_hidden = torch.cat(last_hidden, 2)  # concatenate along the hidden dimension: (1, b, 2h)
        last_hidden = last_hidden.squeeze(0)  # specify dim 0 so the batch dimension survives when b = 1
        init_decoder_hidden = self.h_projection(last_hidden)
        last_cell = last_cell.split(1, 0)  # same treatment as last_hidden; torch.cat((last_cell[0], last_cell[1]), 1) would also work
        last_cell = torch.cat(last_cell, 2)
        last_cell = last_cell.squeeze(0)
        init_decoder_cell = self.c_projection(last_cell)
        dec_init_state = (init_decoder_hidden, init_decoder_cell)
        ###
        ### END YOUR CODE
        ###
        ''' TODO:
             1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings.
                 src_len = maximum source sentence length, b = batch size, e = embedding size. 
                 Note that there is no initial hidden state or cell for the decoder.

             2. Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`.
                 - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X.
                 - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens.
                 - Note that the shape of the tensor returned by the encoder is (src_len, b, h*2) and we want to return a tensor of shape (b, src_len, h*2) as `enc_hiddens`.

             3. Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell):
                 - `init_decoder_hidden`:
                     `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
                     Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
                     Apply the h_projection layer to this in order to compute init_decoder_hidden.
                     This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size
                 - `init_decoder_cell`:
                     `last_cell` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
                     Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
                     Apply the c_projection layer to this in order to compute init_decoder_cell.
                     This is c_0^{dec} in the PDF. Here b = batch size, h = hidden size
        
         See the following docs, as you may need to use some of the following functions in your implementation:
             Pack the padded sequence X before passing to the encoder:
                 https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pack_padded_sequence
             Pad the packed sequence, enc_hiddens, returned by the encoder:
                 https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pad_packed_sequence
             Tensor Concatenation:
                 https://pytorch.org/docs/stable/torch.html#torch.cat
             Tensor Permute:
                 https://pytorch.org/docs/stable/tensors.html#torch.Tensor.permute '''
        return enc_hiddens, dec_init_state





    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
               dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor:
        """Compute combined output vectors for a batch.

        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
                                     b = batch size, src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where
                                       tgt_len = maximum target sentence length, b = batch size. 

        @returns combined_outputs (Tensor): combined output tensor  (tgt_len, b,  h), where
                                        tgt_len = maximum target sentence length, b = batch_size,  h = hidden size
        """
        # Chop off the <END> token for max length sentences.
        target_padded = target_padded[:-1]

        # Initialize the decoder state (hidden and cell)
        dec_state = dec_init_state

        # Initialize previous combined output vector o_{t-1} as zero
        batch_size = enc_hiddens.size(0)
        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

        # Initialize a list we will use to collect the combined output o_t on each step
        combined_outputs = []

        ###
        ### YOUR CODE HERE (~9 Lines)
        ###
        enc_hiddens_proj = self.att_projection(enc_hiddens)  # (b, src_len, h)
        Y = self.model_embeddings.target(target_padded)      # (tgt_len, b, e)
        for Y_t in Y.split(1, 0):
            # iterate over the time dimension (tgt_len), one step of shape (1, b, e) at a time
            Y_t = Y_t.squeeze(0)  # (b, e)
            Ybar_t = torch.cat((Y_t, o_prev), 1)  # concatenate along the feature dimension: (b, e + h)
            dec_state, combined_output, e_t = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks)  # next decoder state and combined output
            combined_outputs.append(combined_output)  # collect o_t
            o_prev = combined_output                  # o_{t-1} for the next step
        combined_outputs = torch.stack(combined_outputs)  # list of tgt_len tensors (b, h) -> (tgt_len, b, h)
        ###
        ### END YOUR CODE
        ###
        ''' TODO:
             1. Apply the attention projection layer to `enc_hiddens` to obtain `enc_hiddens_proj`,
                 which should be shape (b, src_len, h),
                 where b = batch size, src_len = maximum source length, h = hidden size.
                 This is applying W_{attProj} to h^enc, as described in the PDF.
             2. Construct tensor `Y` of target sentences with shape (tgt_len, b, e) using the target model embeddings.
                 where tgt_len = maximum target sentence length, b = batch size, e = embedding size.
             3. Use the torch.split function to iterate over the time dimension of Y.
                 Within the loop, this will give you Y_t of shape (1, b, e) where b = batch size, e = embedding size.
                     - Squeeze Y_t into a tensor of dimension (b, e). 
                     - Construct Ybar_t by concatenating Y_t with o_prev.
                     - Use the step function to compute the Decoder's next (cell, state) values
                       as well as the new combined output o_t.
                     - Append o_t to combined_outputs
                     - Update o_prev to the new o_t.
             4. Use torch.stack to convert combined_outputs from a list length tgt_len of
                 tensors shape (b, h), to a single tensor shape (tgt_len, b, h)
                 where tgt_len = maximum target sentence length, b = batch size, h = hidden size.
        
         Note:
            - When using the squeeze() function make sure to specify the dimension you want to squeeze
              over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
           
         Use the following docs to implement this functionality:
             Zeros Tensor:
                 https://pytorch.org/docs/stable/torch.html#torch.zeros
             Tensor Splitting (iteration):
                 https://pytorch.org/docs/stable/torch.html#torch.split
             Tensor Dimension Squeezing:
                 https://pytorch.org/docs/stable/torch.html#torch.squeeze
             Tensor Concatenation:
                 https://pytorch.org/docs/stable/torch.html#torch.cat
             Tensor Stacking:
                 https://pytorch.org/docs/stable/torch.html#torch.stack '''
        return combined_outputs





    def step(self, Ybar_t: torch.Tensor,
             dec_state: Tuple[torch.Tensor, torch.Tensor],
             enc_hiddens: torch.Tensor,
             enc_hiddens_proj: torch.Tensor,
             enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length. 

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution.
                                Note: You will not use this outside of this function.
                                      We are simply returning this value so that we can sanity check
                                      your implementation.
        """
        combined_output = None
        ###
        ### YOUR CODE HERE (~3 Lines)
        ###
        # Apply the decoder cell (not the decode() method) to Ybar_t and the previous
        # dec_state; the returned dec_state is a (hidden, cell) tuple.
        dec_state = self.decoder(Ybar_t, dec_state)
        # Unpack dec_state; plain tuple unpacking is sufficient here.
        dec_hidden, dec_cell = dec_state
        # bmm maps (b, n, m) x (b, m, p) -> (b, n, p). With dec_hidden unsqueezed to
        # (b, 1, h) and enc_hiddens_proj permuted to (b, h, src_len), the product is
        # (b, 1, src_len); squeezing dim 1 gives e_t of shape (b, src_len).
        enc_hiddens_proj = enc_hiddens_proj.permute(0, 2, 1)
        e_t = torch.bmm(torch.unsqueeze(dec_hidden, 1), enc_hiddens_proj)  # attention scores
        e_t = e_t.squeeze(1)  # remove the added dimension
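        # A standalone shape check for the score computation (hypothetical sizes, run outside this class):
        #     import torch
        #     b, src_len, h = 4, 7, 8
        #     dec_hidden = torch.randn(b, h)
        #     proj = torch.randn(b, src_len, h).permute(0, 2, 1)          # (b, h, src_len)
        #     e = torch.bmm(dec_hidden.unsqueeze(1), proj).squeeze(1)
        #     assert e.shape == (b, src_len)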
        ###
        ### END YOUR CODE
        ###
        ''' TODO:
             1. Apply the decoder to `Ybar_t` and `dec_state`to obtain the new dec_state.
             2. Split dec_state into its two parts (dec_hidden, dec_cell)
             3. Compute the attention scores e_t, a Tensor shape (b, src_len). 
                Note: b = batch_size, src_len = maximum source length, h = hidden size.
        
               Hints:
                 - dec_hidden is shape (b, h) and corresponds to h^dec_t in the PDF (batched)
                 - enc_hiddens_proj is shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched).
                 - Use batched matrix multiplication (torch.bmm) to compute e_t.
                 - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing.
                 - When using the squeeze() function make sure to specify the dimension you want to squeeze
                     over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
        
         Use the following docs to implement this functionality:
             Batch Multiplication:
                https://pytorch.org/docs/stable/torch.html#torch.bmm
             Tensor Unsqueeze:
                 https://pytorch.org/docs/stable/torch.html#torch.unsqueeze
             Tensor Squeeze:
                 https://pytorch.org/docs/stable/torch.html#torch.squeeze '''
        # Set e_t to -inf where enc_masks has 1
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))
        ###
        ### YOUR CODE HERE (~6 Lines)
        ###
        alpha_t = F.softmax(e_t, dim=1)  # attention weights over source positions
        # bmm maps (b, n, m) x (b, m, p) -> (b, n, p). With alpha_t unsqueezed to
        # (b, 1, src_len) and enc_hiddens of shape (b, src_len, 2h), the product is
        # (b, 1, 2h); squeezing dim 1 gives the attention output a_t of shape (b, 2h).
        a_t = torch.bmm(alpha_t.unsqueeze(1), enc_hiddens)
        a_t = a_t.squeeze(1)
        # dec_hidden is (b, h) and a_t is (b, 2h), so concatenate along dim 1 to get U_t of shape (b, 3h).
        U_t = torch.cat((dec_hidden, a_t), dim=1)
        V_t = self.combined_output_projection(U_t)
        V_t = torch.tanh(V_t)
        O_t = self.dropout(V_t)
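        # A standalone sketch of the remaining attention math (hypothetical sizes, run outside this class):
        #     import torch
        #     b, src_len, h = 4, 7, 8
        #     e_t = torch.randn(b, src_len)
        #     enc_hiddens = torch.randn(b, src_len, 2 * h)
        #     dec_hidden = torch.randn(b, h)
        #     alpha_t = torch.softmax(e_t, dim=1)                                  # (b, src_len)
        #     a_t = torch.bmm(alpha_t.unsqueeze(1), enc_hiddens).squeeze(1)        # (b, 2h)
        #     U_t = torch.cat((dec_hidden, a_t), dim=1)                            # (b, 3h)
        #     assert U_t.shape == (b, 3 * h)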
        ###
        ### END YOUR CODE
        ###
        ''' TODO:
             1. Apply softmax to e_t to yield alpha_t
             2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the
                 attention output vector, a_t.
             Hints:
                   - alpha_t is shape (b, src_len)
                   - enc_hiddens is shape (b, src_len, 2h)
                   - a_t should be shape (b, 2h)
                   - You will need to do some squeezing and unsqueezing.
             Note: b = batch size, src_len = maximum source length, h = hidden size.
        
             3. Concatenate dec_hidden with a_t to compute tensor U_t
             4. Apply the combined output projection layer to U_t to compute tensor V_t
             5. Compute tensor O_t by first applying the Tanh function and then the dropout layer.
        
         Use the following docs to implement this functionality:
             Softmax:
                 https://pytorch.org/docs/stable/nn.html#torch.nn.functional.softmax
             Batch Multiplication:
                https://pytorch.org/docs/stable/torch.html#torch.bmm
             Tensor View:
                 https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view
             Tensor Concatenation:
                 https://pytorch.org/docs/stable/torch.html#torch.cat
             Tanh:
                 https://pytorch.org/docs/stable/torch.html#torch.tanh '''
        combined_output = O_t
        return dec_state, combined_output, e_t





    def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor:
        """ Generate sentence masks for encoder hidden states.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size. 
        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.
        
        @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len),
                                    where src_len = max source length, h = hidden size.
        """
        enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float)
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1
        return enc_masks.to(self.device)
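
    # A standalone sketch of the mask layout (hypothetical lengths, run outside this class):
    #     import torch
    #     lengths = [3, 5]                                  # two sentences, max length 5
    #     masks = torch.zeros(2, 5)
    #     for i, n in enumerate(lengths):
    #         masks[i, n:] = 1
    #     # masks[0] -> tensor([0., 0., 0., 1., 1.])        # 1 marks padding positions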





    def beam_search(self, src_sent: List[str], beam_size: int = 5,
                    max_decoding_time_step: int = 70) -> List[Hypothesis]:
        """ Given a single source sentence, perform beam search, yielding translations in the target language.
        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
        @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        eos_id = self.vocab.tgt['</s>']

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            exp_src_encodings = src_encodings.expand(hyp_num,
                                                     src_encodings.size(1),
                                                     src_encodings.size(2))

            exp_src_encodings_att_linear = src_encodings_att_linear.expand(hyp_num,
                                                                           src_encodings_att_linear.size(1),
                                                                           src_encodings_att_linear.size(2))

            y_tm1 = torch.tensor([self.vocab.tgt[hyp[-1]] for hyp in hypotheses], dtype=torch.long, device=self.device)
            y_t_embed = self.model_embeddings.target(y_tm1)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            (h_t, cell_t), att_t, _ = self.step(x, h_tm1,
                                                exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None)

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            contiuating_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(contiuating_hyp_scores, k=live_hyp_num)

            prev_hyp_ids = top_cand_hyp_pos // len(self.vocab.tgt)  # integer division to recover the hypothesis index
            hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt)
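            # Sketch of how the flat top-k index decomposes (hypothetical numbers):
            # with vocab size V, a flat index k into the (hyp_num * V) score vector gives
            #   prev_hyp_id = k // V   (which live hypothesis is extended)
            #   hyp_word_id = k %  V   (which target word is appended)
            # e.g. V = 50000, k = 123456  ->  prev_hyp_id = 2, hyp_word_id = 23456.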

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1],
                                                           score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device)

        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:],
                                                   score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

        return completed_hypotheses





    @property
    def device(self) -> torch.device:
        """ Determine which device to place the Tensors upon, CPU or GPU.
        """
        return self.model_embeddings.source.weight.device





    @staticmethod
    def load(model_path: str):
        """ Load the model from a file.
        @param model_path (str): path to model
        """
        params = torch.load(model_path, map_location=lambda storage, loc: storage)
        args = params['args']
        model = NMT(vocab=params['vocab'], **args)
        model.load_state_dict(params['state_dict'])

        return model





    def save(self, path: str):
        """ Save the odel to a file.
        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args': dict(embed_size=self.model_embeddings.embed_size, hidden_size=self.hidden_size,
                         dropout_rate=self.dropout_rate),
            'vocab': self.vocab,
            'state_dict': self.state_dict()
        }

        torch.save(params, path)
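
    # Hypothetical usage sketch (the path is illustrative, not part of the assignment):
    #     model.save('model.bin')
    #     reloaded = NMT.load('model.bin')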
Example #4
class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidrectional LSTM Encoder
        - Unidirection LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # default values
        self.encoder = None
        self.decoder = None
        self.h_projection = None
        self.c_projection = None
        self.att_projection = None
        self.combined_output_projection = None
        self.target_vocab_projection = None
        self.dropout = None

        ### YOUR CODE HERE (~8 Lines)
        ### TODO - Initialize the following variables:
        ###     self.encoder (Bidirectional LSTM with bias)
        ###     self.decoder (LSTM Cell with bias)
        ###     self.h_projection (Linear Layer with no bias), called W_{h} in the PDF.
        ###     self.c_projection (Linear Layer with no bias), called W_{c} in the PDF.
        ###     self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF.
        ###     self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF.
        ###     self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF.
        ###     self.dropout (Dropout Layer)
        ###
        ### Use the following docs to properly initialize these variables:
        ###     LSTM:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
        ###     LSTM Cell:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell
        ###     Linear Layer:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
        ###     Dropout Layer:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout
        # This variant uses batch-first nn.LSTM modules for both the encoder and the
        # decoder (rather than an nn.LSTMCell), so step() feeds 3-D (batch, 1, feature)
        # inputs. LSTM arguments are (embedding_size, hidden_size); the time dimension
        # is not part of the module's architecture.
        self.encoder = nn.LSTM(
            embed_size, self.hidden_size, bidirectional=True, batch_first=True
        )
        self.decoder = nn.LSTM(embed_size + self.hidden_size,
                               self.hidden_size,
                               batch_first=True)
        self.h_projection = nn.Linear(2 * self.hidden_size,
                                      self.hidden_size,
                                      bias=False)
        self.c_projection = nn.Linear(2 * self.hidden_size,
                                      self.hidden_size,
                                      bias=False)
        self.att_projection = nn.Linear(2 * self.hidden_size,
                                        self.hidden_size,
                                        bias=False)
        self.combined_output_projection = nn.Linear(3 * self.hidden_size,
                                                    self.hidden_size,
                                                    bias=False)
        self.target_vocab_projection = nn.Linear(self.hidden_size,
                                                 len(self.vocab.tgt))
        self.dropout = nn.Dropout(p=self.dropout_rate)

        ### END YOUR CODE

    def forward(self, source: List[List[str]],
                target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`

        @returns scores (Tensor): a variable/tensor of shape (b, ) representing the
                                    log-likelihood of generating the gold-standard target sentence for
                                    each example in the input batch. Here b = batch size.
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Convert list of lists into tensors
        source_padded = self.vocab.src.to_input_tensor(
            source, device=self.device)  # Tensor: (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(
            target, device=self.device)  # Tensor: (tgt_len, b)

        ###     Run the network forward:
        ###     1. Apply the encoder to `source_padded` by calling `self.encode()`
        ###     2. Generate sentence masks for `source_padded` by calling `self.generate_sent_masks()`
        ###     3. Apply the decoder to compute combined-output by calling `self.decode()`
        ###     4. Compute log probability distribution over the target vocabulary using the
        ###        combined_outputs returned by the `self.decode()` function.

        enc_hiddens, dec_init_state = self.encode(source_padded,
                                                  source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state,
                                       target_padded)
        P = F.log_softmax(self.target_vocab_projection(combined_outputs),
                          dim=-1)  # (tgt_len - 1, b, 1, len(tgt_vocab))

        # Zero out, probabilities for which we have nothing in the target text
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Compute log probability of generating true target words
        target_gold_words_log_prob = torch.gather(
            P.squeeze(dim=2), index=target_padded[1:].unsqueeze(-1), dim=-1
        ).squeeze(-1) * target_masks[1:]  # (tgt_len - 1, b)
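        # A standalone sketch of the gather step (hypothetical sizes, run outside this class):
        #     import torch
        #     P = torch.log_softmax(torch.randn(3, 2, 5), dim=-1)      # (tgt_len - 1, b, vocab)
        #     gold = torch.randint(0, 5, (3, 2))                       # gold word ids
        #     picked = torch.gather(P, index=gold.unsqueeze(-1), dim=-1).squeeze(-1)
        #     assert picked.shape == (3, 2)                            # log-prob of each gold word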
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores

    def encode(
        self, source_padded: torch.Tensor, source_lengths: List[int]
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
            Additionally, take the final states of the encoder and project them to obtain initial states for decoder.

        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where
                                        b = batch_size, src_len = maximum source sentence length. Note that 
                                       these have already been sorted in order of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch
        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
                                        b = batch size, src_len = maximum source sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
                                                hidden state and cell.
        """
        enc_hiddens, dec_init_state = None, None

        ### YOUR CODE HERE (~ 8 Lines)
        ### TODO:
        ###     1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings.
        ###         src_len = maximum source sentence length, b = batch size, e = embedding size. Note
        ###         that there is no initial hidden state or cell for the decoder.
        ###     2. Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`.
        ###         - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X.
        ###         - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens.
        ###         - Note that the shape of the tensor returned by the encoder is (src_len b, h*2) and we want to
        ###           return a tensor of shape (b, src_len, h*2) as `enc_hiddens`.
        ###     3. Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell):
        ###         - `init_decoder_hidden`:
        ###             `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
        ###             Apply the h_projection layer to this in order to compute init_decoder_hidden.
        ###             This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size
        ###         - `init_decoder_cell`:
        ###             `last_cell` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
        ###             Apply the c_projection layer to this in order to compute init_decoder_cell.
        ###             This is c_0^{dec} in the PDF. Here b = batch size, h = hidden size
        ###
        ### See the following docs, as you may need to use some of the following functions in your implementation:
        ###     Pack the padded sequence X before passing to the encoder:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pack_padded_sequence
        ###     Pad the packed sequence, enc_hiddens, returned by the encoder:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pad_packed_sequence
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tensor Permute:
        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.permute

        x = self.model_embeddings.source(source_padded).permute(
            1, 0, 2)  # before the permute: (src_len, batch_size, embedding_size)

        x = pack_padded_sequence(
            x, source_lengths, batch_first=True)  # pack the (batch, src_len, embedding_size) tensor
        enc_hiddens, (last_hidden, last_cell) = self.encoder(
            x)  # output is (batch, seq, feature); the final hidden and cell states are returned alongside it
        enc_hiddens = pad_packed_sequence(
            enc_hiddens, batch_first=True)[0]  # all per-position encoder hidden states

        last_hidden = torch.cat((last_hidden[0, :, :], last_hidden[1, :, :]),
                                1)
        init_decoder_hidden = self.h_projection(last_hidden).unsqueeze(0)

        last_cell = torch.cat((last_cell[0, :, :], last_cell[1, :, :]), 1)
        init_decoder_cell = self.c_projection(last_cell).unsqueeze(0)

        dec_init_state = [init_decoder_hidden, init_decoder_cell]
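        # A standalone sketch of the pack/pad round trip (hypothetical sizes, run outside this class):
        #     import torch
        #     from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
        #     x = torch.randn(2, 5, 8)                                   # (batch, src_len, emb), lengths sorted descending
        #     packed = pack_padded_sequence(x, [5, 3], batch_first=True)
        #     out, (h_n, c_n) = torch.nn.LSTM(8, 4, bidirectional=True, batch_first=True)(packed)
        #     out, _ = pad_packed_sequence(out, batch_first=True)
        #     assert out.shape == (2, 5, 8) and h_n.shape == (2, 2, 4)   # (batch, src_len, 2h), (dirs, batch, h)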

        ### END YOUR CODE

        return enc_hiddens, dec_init_state

    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
               dec_init_state: Tuple[torch.Tensor, torch.Tensor],
               target_padded: torch.Tensor) -> torch.Tensor:
        """Compute combined output vectors for a batch.

        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
                                     b = batch size, src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where
                                       tgt_len = maximum target sentence length, b = batch size. 

        @returns combined_outputs (Tensor): combined output tensor  (tgt_len, b,  h), where
                                        tgt_len = maximum target sentence length, b = batch_size,  h = hidden size
        """
        # Chop of the <END> token for max length sentences.
        target_padded = target_padded[:-1]

        # Initialize the decoder state (hidden and cell)
        dec_state = dec_init_state

        # Initialize previous combined output vector o_{t-1} as zero
        batch_size = enc_hiddens.size(0)
        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

        # Initialize a list we will use to collect the combined output o_t on each step
        combined_outputs = []

        ### YOUR CODE HERE (~9 Lines)
        ### TODO:
        ###     1. Apply the attention projection layer to `enc_hiddens` to obtain `enc_hiddens_proj`,
        ###         which should be shape (b, src_len, h),
        ###         where b = batch size, src_len = maximum source length, h = hidden size.
        ###         This is applying W_{attProj} to h^enc, as described in the PDF.
        ###     2. Construct tensor `Y` of target sentences with shape (tgt_len, b, e) using the target model embeddings.
        ###         where tgt_len = maximum target sentence length, b = batch size, e = embedding size.
        ###     3. Use the torch.split function to iterate over the time dimension of Y.
        ###         Within the loop, this will give you Y_t of shape (1, b, e) where b = batch size, e = embedding size.
        ###             - Squeeze Y_t into a tensor of dimension (b, e).
        ###             - Construct Ybar_t by concatenating Y_t with o_prev.
        ###             - Use the step function to compute the the Decoder's next (cell, state) values
        ###               as well as the new combined output o_t.
        ###             - Append o_t to combined_outputs
        ###             - Update o_prev to the new o_t.
        ###     4. Use torch.stack to convert combined_outputs from a list length tgt_len of
        ###         tensors shape (b, h), to a single tensor shape (tgt_len, b, h)
        ###         where tgt_len = maximum target sentence length, b = batch size, h = hidden size.
        ###
        ### Note:
        ###    - When using the squeeze() function make sure to specify the dimension you want to squeeze
        ###      over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
        ###
        ### Use the following docs to implement this functionality:
        ###     Zeros Tensor:
        ###         https://pytorch.org/docs/stable/torch.html#torch.zeros
        ###     Tensor Splitting (iteration):
        ###         https://pytorch.org/docs/stable/torch.html#torch.split
        ###     Tensor Dimension Squeezing:
        ###         https://pytorch.org/docs/stable/torch.html#torch.squeeze
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tensor Stacking:
        ###         https://pytorch.org/docs/stable/torch.html#torch.stack
        enc_hiddens_proj = self.att_projection(
            enc_hiddens)  # later multiplied with dec_hidden to get the attention scores

        y = self.model_embeddings.target(target_padded).permute(
            1, 0, 2)  # (batch, tgt_len, embedding_size)

        for t in range(y.size(1)):  # iterate over the time dimension
            y_t = torch.split(y, 1,
                              dim=1)[t]  # feed one word at a time: (batch, 1, embedding_size)
            y_t = torch.squeeze(y_t, dim=1)  # (batch, embedding_size)

            ybar_t = torch.cat((y_t, o_prev),
                               1)  # (batch_size, embedding_size + hidden_size)

            dec_state, combined_output, e_t = self.step(
                ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks)

            combined_outputs.append(combined_output)
            o_prev = combined_output

        # combined_outputs starts as a list of equally sized tensors, one per target word;
        # stack them along dim 0 to get (tgt_len, batch, hidden_size), then unsqueeze dim 2.
        combined_outputs = torch.stack(combined_outputs,
                                       dim=0).unsqueeze(dim=2)
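        # A standalone sketch of stepping over the time dimension with torch.split (hypothetical sizes):
        #     import torch
        #     y = torch.randn(2, 4, 8)                       # (batch, tgt_len, emb)
        #     steps = torch.split(y, 1, dim=1)               # tgt_len tensors of shape (batch, 1, emb)
        #     assert len(steps) == 4 and steps[0].squeeze(1).shape == (2, 8)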

        ### END YOUR CODE

        return combined_outputs

    def step(
            self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor,
                                                         torch.Tensor],
            enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor
    ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length. 

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution.
                                Note: You will not use this outside of this function.
                                      We are simply returning this value so that we can sanity check
                                      your implementation.
        """

        combined_output = None

        ### YOUR CODE HERE (~3 Lines)
        ### TODO:
        ###     1. Apply the decoder to `Ybar_t` and `dec_state`to obtain the new dec_state.
        ###     2. Split dec_state into its two parts (dec_hidden, dec_cell)
        ###     3. Compute the attention scores e_t, a Tensor shape (b, src_len).
        ###        Note: b = batch_size, src_len = maximum source length, h = hidden size.
        ###
        ###       Hints:
        ###         - dec_hidden is shape (b, h) and corresponds to h^dec_t in the PDF (batched)
        ###         - enc_hiddens_proj is shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched).
        ###         - Use batched matrix multiplication (torch.bmm) to compute e_t.
        ###         - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing.
        ###         - When using the squeeze() function make sure to specify the dimension you want to squeeze
        ###             over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
        ###
        ### Use the following docs to implement this functionality:
        ###     Batch Multiplication:
        ###        https://pytorch.org/docs/stable/torch.html#torch.bmm
        ###     Tensor Unsqueeze:
        ###         https://pytorch.org/docs/stable/torch.html#torch.unsqueeze
        ###     Tensor Squeeze:
        ###         https://pytorch.org/docs/stable/torch.html#torch.squeeze
        # Ybar_t: (batch, feature)
        output, dec_state = self.decoder(
            Ybar_t.unsqueeze(dim=1),
            dec_state)  # the nn.LSTM decoder expects a 3-D input (batch, seq, feature)
        dec_hidden, dec_cell = dec_state[0], dec_state[
            1]  # each has shape (num_layers = 1, batch, hidden_size)
        dec_state = (dec_state[0].permute(1, 0,
                                          2), dec_state[1].permute(1, 0, 2))

        # enc_hiddens_proj: (batch, src_len, hidden_size). Reshape dec_hidden to
        # (batch, hidden_size, 1) by dropping the leading dim and adding a trailing one;
        # bmm then yields (batch, src_len, 1), squeezed to e_t of shape (batch, src_len).
        e_t = torch.bmm(
            enc_hiddens_proj,
            dec_hidden.squeeze(dim=0).unsqueeze(dim=2)).squeeze(2)

        ### END YOUR CODE

        # Set e_t to -inf where enc_masks has 1, so the attention scores at pad tokens are
        # as small as possible and no attention falls on those positions. Setting them to
        # zero instead would still give pads nonzero weight after the softmax, biasing the
        # attention output between short and long sentences.
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.to(torch.bool), -float('inf'))
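        # A standalone sketch: -inf scores get exactly zero weight after the softmax (hypothetical values):
        #     import torch
        #     e = torch.tensor([[1.0, 2.0, float('-inf')]])
        #     alpha = torch.softmax(e, dim=1)
        #     assert alpha[0, 2].item() == 0.0 and abs(alpha.sum().item() - 1.0) < 1e-6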

        ### YOUR CODE HERE (~6 Lines)
        ### TODO:
        ###     1. Apply softmax to e_t to yield alpha_t
        ###     2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the
        ###         attention output vector, a_t.
        ###     Hints:
        ###           - alpha_t is shape (b, src_len)
        ###           - enc_hiddens is shape (b, src_len, 2h)
        ###           - a_t should be shape (b, 2h)
        ###           - You will need to do some squeezing and unsqueezing.
        ###     Note: b = batch size, src_len = maximum source length, h = hidden size.
        ###
        ###     3. Concatenate dec_hidden with a_t to compute tensor U_t
        ###     4. Apply the combined output projection layer to U_t to compute tensor V_t
        ###     5. Compute tensor O_t by first applying the Tanh function and then the dropout layer.
        ###
        ### Use the following docs to implement this functionality:
        ###     Softmax:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.functional.softmax
        ###     Batch Multiplication:
        ###        https://pytorch.org/docs/stable/torch.html#torch.bmm
        ###     Tensor View:
        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tanh:
        ###         https://pytorch.org/docs/stable/torch.html#torch.tanh
        alpha_t = F.softmax(e_t, dim=1).unsqueeze(
            1)  # (batch, decoder_sequence = 1, encoder_sequence)

        a_t = torch.bmm(
            alpha_t, enc_hiddens
        )  # (batch, 1, src_len) x (batch, src_len, hidden * 2) = (batch, 1, hidden * 2)

        u_t = torch.cat((a_t, dec_hidden.permute((1, 0, 2))), dim=2)
        v_t = self.combined_output_projection(u_t)
        O_t = torch.tanh(v_t)
        O_t = self.dropout(O_t)  # (batch, sequence = 1, hidden)

        ### END YOUR CODE

        combined_output = O_t.squeeze(dim=1)  # the expected output shape is (batch, hidden_size)
        return dec_state, combined_output, e_t  # e_t is returned only for the sanity check

    def generate_sent_masks(self, enc_hiddens: torch.Tensor,
                            source_lengths: List[int]) -> torch.Tensor:
        """ Generate sentence masks for encoder hidden states.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size. 
        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.
        
        @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len),
                                    where src_len = max source length, h = hidden size.
        """
        enc_masks = torch.zeros(enc_hiddens.size(0),
                                enc_hiddens.size(1),
                                dtype=torch.float)
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1
        return enc_masks.to(self.device)

    def beam_search(self,
                    src_sent: List[str],
                    beam_size: int = 5,
                    max_decoding_time_step: int = 70) -> List[Hypothesis]:
        """ Given a single source sentence, perform beam search, yielding translations in the target language.
        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
        @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)
        # convert the source sentence into an input tensor

        src_encodings, dec_init_vec = self.encode(src_sents_var,
                                                  [len(src_sent)])  #encoding
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec  # decoder hidden and cell states
        # an earlier attempt permuted the hidden/cell dimensions here but raised an error:
        # h_tm1 = (h_tm1[0].permute((1, 0, 2)), h_tm1[1].permute((1, 0, 2)))

        att_tm1 = torch.zeros(1, self.hidden_size,
                              device=self.device)  # previous attention output, shape (1, h)

        eos_id = self.vocab.tgt['</s>']  # end-of-sentence token id

        hypotheses = [['<s>']]  # the words predicted so far for each hypothesis
        hyp_scores = torch.zeros(len(hypotheses),
                                 dtype=torch.float,
                                 device=self.device)  # initialize every hypothesis score to 0
        # y_tm1 = torch.zeros([1, len(hypotheses)], dtype=torch.long, device=self.device)
        completed_hypotheses = []  # finished hypotheses

        t = 0
        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
            # keep decoding while fewer than beam_size hypotheses are finished
            t += 1
            hyp_num = len(hypotheses)  # number of live hypotheses

            exp_src_encodings = src_encodings.expand(
                hyp_num, src_encodings.size(1), src_encodings.size(2)
            )  # expand to (hyp_num, src_len, 2 * hidden_size)
            # replicate the encoder outputs once per live hypothesis

            exp_src_encodings_att_linear = src_encodings_att_linear.expand(
                hyp_num, src_encodings_att_linear.size(1),
                src_encodings_att_linear.size(2))
            # replicate the projected encoder outputs once per live hypothesis

            y_tm1 = torch.tensor(
                [self.vocab.tgt[hyp[-1]] for hyp in hypotheses],
                dtype=torch.long,
                device=self.device)  # ids of the last predicted target word of each hypothesis
            y_t_embed = self.model_embeddings.target(y_tm1)  # word embeddings

            x = torch.cat([y_t_embed, att_tm1],
                          dim=-1)  # concatenate the target-word embedding with the previous attention output

            (h_t, cell_t), att_t, _ = self.step(
                x,
                h_tm1,
                exp_src_encodings,
                exp_src_encodings_att_linear,
                enc_masks=None
            )  # run one decoder step on x to get the attention output and the new decoder state

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t),
                                    dim=-1)

            live_hyp_num = beam_size - len(
                completed_hypotheses)  # how many hypothesis slots are still open
            contiuating_hyp_scores = (
                hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(
                    -1)  # broadcast each hypothesis score over the vocabulary, add the log-probs, then flatten
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(
                contiuating_hyp_scores, k=live_hyp_num)
            # topk returns the k largest elements along a dimension, as (values, indices)

            prev_hyp_ids = top_cand_hyp_pos // len(self.vocab.tgt)  # flat index // vocab size -> hypothesis index
            hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt)

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            # loop over each candidate word
            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(
                    prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]  # look up the predicted word
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]  # append the word to its hypothesis
                if hyp_word == '</s>':
                    completed_hypotheses.append(
                        Hypothesis(value=new_hyp_sent[1:-1],
                                   score=cand_new_hyp_score))
                # if the word is the end-of-sentence token, the hypothesis is finished,
                # so its words and score are added to the completed hypotheses

                else:  # otherwise keep the hypothesis alive
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            live_hyp_ids = torch.tensor(
                live_hyp_ids, dtype=torch.long,
                device=self.device)  # indices of the hypotheses that stay alive, as a tensor

            # gather the surviving hidden/cell states and restore the (layers, batch, hidden) layout
            h_tm1_1 = h_t[live_hyp_ids].permute(1, 0, 2)
            cell_t_1 = cell_t[live_hyp_ids].permute(1, 0, 2)
            h_tm1 = (h_tm1_1, cell_t_1)

            att_tm1 = att_t[live_hyp_ids]  # carry over the attention outputs of the live hypotheses

            hypotheses = new_hypotheses  # update the live hypotheses

            hyp_scores = torch.tensor(new_hyp_scores,
                                      dtype=torch.float,
                                      device=self.device)  # store the hypothesis scores as a tensor

        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(
                Hypothesis(value=hypotheses[0][1:],
                           score=hyp_scores[0].item()))  # fall back to the best live hypothesis, dropping the leading <s>

        completed_hypotheses.sort(key=lambda hyp: hyp.score,
                                  reverse=True)  # sort all results by score, best first

        return completed_hypotheses

    @property
    def device(self) -> torch.device:
        """ Determine which device to place the Tensors upon, CPU or GPU.
        """
        return self.model_embeddings.source.weight.device

    @staticmethod
    def load(model_path: str):
        """ Load the model from a file.
        @param model_path (str): path to model
        """
        params = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
        args = params['args']
        model = NMT(vocab=params['vocab'], **args)
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ Save the odel to a file.
        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args':
            dict(embed_size=self.model_embeddings.embed_size,
                 hidden_size=self.hidden_size,
                 dropout_rate=self.dropout_rate),
            'vocab':
            self.vocab,
            'state_dict':
            self.state_dict()
        }

        torch.save(params, path)
Example #5
class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidrectional LSTM Encoder
        - Unidirection LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for  documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=hidden_size,
                               num_layers=1,
                               bias=True,
                               bidirectional=True)
        self.decoder = nn.LSTMCell(input_size=embed_size + hidden_size,
                                   hidden_size=hidden_size,
                                   bias=True)
        self.h_projection = nn.Linear(in_features=2 * hidden_size,
                                      out_features=hidden_size,
                                      bias=False)
        self.c_projection = nn.Linear(in_features=2 * hidden_size,
                                      out_features=hidden_size,
                                      bias=False)
        self.att_projection = nn.Linear(in_features=2 * hidden_size,
                                        out_features=hidden_size,
                                        bias=False)
        self.combined_output_projection = \
            nn.Linear(in_features=3 * hidden_size,
                      out_features=hidden_size,
                      bias=False)
        self.target_vocab_projection = \
            nn.Linear(in_features=hidden_size,
                      out_features=len(vocab.tgt),
                      bias=False)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, source: List[List[str]],
                target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the
        log-likelihood of
        target sentences under the language models learned by the NMT system.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens,
        wrapped by `<s>` and `</s>`

        @returns scores (Tensor): a variable/tensor of shape (b,
        ) representing the
                                    log-likelihood of generating the
                                    gold-standard target sentence for
                                    each example in the input batch. Here b =
                                    batch size.
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Convert list of lists into tensors
        source_padded = self.vocab.src.to_input_tensor(
            source, device=self.device)  # Tensor: (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(
            target, device=self.device)  # Tensor: (tgt_len, b)

        ###     Run the network forward:
        ###     1. Apply the encoder to `source_padded` by calling `self.encode()`
        ###     2. Generate sentence masks for `source_padded` by calling `self.generate_sent_masks()`
        ###     3. Apply the decoder to compute combined-output by calling `self.decode()`
        ###     4. Compute log probability distribution over the target vocabulary using the
        ###        combined_outputs returned by the `self.decode()` function.

        enc_hiddens, dec_init_state = self.encode(source_padded,
                                                  source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state,
                                       target_padded)
        P = F.log_softmax(self.target_vocab_projection(combined_outputs),
                          dim=-1)

        # Zero out, probabilities for which we have nothing in the target text
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Compute log probability of generating true target words
        target_gold_words_log_prob = torch.gather(
            P, index=target_padded[1:].unsqueeze(-1),
            dim=-1).squeeze(-1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores

    def encode(self, source_padded: torch.Tensor, source_lengths: List[int]) \
            -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden
        states.
            Additionally, take the final states of the encoder and project
            them to obtain initial states for decoder.

        @param source_padded (Tensor): Tensor of padded source sentences with
        shape (src_len, b), where
                                        b = batch_size, src_len = maximum
                                        source sentence length. Note that
                                       these have already been sorted in
                                       order of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of
        the source sentences in the batch
        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b,
        src_len, h*2), where
                                        b = batch size, src_len = maximum
                                        source sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors
        representing the decoder's initial
                                                hidden state and cell.
        """
        X = self.model_embeddings.source(source_padded)
        packedX = pack_padded_sequence(X, source_lengths)
        enc_hiddens, (last_hidden, last_cell) = self.encoder(packedX)
        enc_hiddens, _ = pad_packed_sequence(enc_hiddens, batch_first=True)
        last_hidden_bx2h = torch.cat(torch.unbind(last_hidden, dim=0), dim=1)
        init_decoder_hidden = self.h_projection(last_hidden_bx2h)
        last_cell_bx2h = torch.cat(torch.unbind(last_cell, dim=0), dim=1)
        init_cell_hidden = self.c_projection(last_cell_bx2h)
        dec_init_state = (init_decoder_hidden, init_cell_hidden)
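        # A standalone sketch of concatenating the forward/backward final states (hypothetical sizes):
        #     import torch
        #     last_hidden = torch.randn(2, 3, 4)             # (directions, batch, hidden)
        #     bx2h = torch.cat(torch.unbind(last_hidden, dim=0), dim=1)
        #     assert bx2h.shape == (3, 8)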
        return enc_hiddens, dec_init_state

    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
               dec_init_state: Tuple[torch.Tensor, torch.Tensor],
               target_padded: torch.Tensor) -> torch.Tensor:
        """Compute combined output vectors for a batch.

        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
                                     b = batch size, src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where
                                       tgt_len = maximum target sentence length, b = batch size.

        @returns combined_outputs (Tensor): combined output tensor (tgt_len, b, h), where
                                        tgt_len = maximum target sentence length, b = batch_size, h = hidden size
        """
        # Chop of the <END> token for max length sentences.
        target_padded = target_padded[:-1]

        # Initialize the decoder state (hidden and cell)
        dec_state = dec_init_state

        # Initialize previous combined output vector o_{t-1} as zero
        batch_size = enc_hiddens.size(0)
        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

        combined_outputs = []
        enc_hiddens_proj = self.att_projection(enc_hiddens)
        Y = self.model_embeddings.target(target_padded)
        for Y_t in torch.split(Y, 1, dim=0):
            Y_t = torch.squeeze(Y_t, dim=0)  # squeeze only the time dim so a batch of size 1 survives
            Ybar_t = torch.cat((Y_t, o_prev), dim=1)
            dec_state, o_t, _ = self.step(Ybar_t, dec_state, enc_hiddens,
                                          enc_hiddens_proj, enc_masks)
            combined_outputs.append(o_t)
            o_prev = o_t
        return torch.stack(combined_outputs)

    def step(
            self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor,
                                                         torch.Tensor],
            enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor
    ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the
        attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev],
        with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size,
                                h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with
        shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is
                decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape
        (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length,
                                    h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor,
        projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum
                                    source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum
                                    source length.

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both
        shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is
                decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep
        t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention
        scores distribution.
                                Note: You will not use this outside of this
                                function.
                                      We are simply returning this value so
                                      that we can sanity check
                                      your implementation.
        """

        dec_state = self.decoder(Ybar_t, dec_state)
        dec_hidden, dec_cell = dec_state
        dec_hidden = torch.unsqueeze(dec_hidden, dim=1)
        enc_hiddens_proj = enc_hiddens_proj.permute(0, 2, 1)
        e_t = torch.bmm(dec_hidden, enc_hiddens_proj)
        e_t = torch.squeeze(e_t, dim=1)
        dec_hidden = torch.squeeze(dec_hidden, dim=1)

        # Set e_t to -inf where enc_masks has 1
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

        alpha_t = F.softmax(e_t, dim=1)  # normalize over source positions (dim 1), not over the batch
        alpha_t = torch.unsqueeze(alpha_t, dim=1)
        a_t = torch.bmm(alpha_t, enc_hiddens)
        a_t = torch.squeeze(a_t, dim=1)
        U_t = torch.cat((dec_hidden, a_t), dim=1)
        V_t = self.combined_output_projection(U_t)
        O_t = self.dropout(torch.tanh(V_t))

        combined_output = O_t
        return dec_state, combined_output, e_t

    def generate_sent_masks(self, enc_hiddens: torch.Tensor,
                            source_lengths: List[int]) -> torch.Tensor:
        """ Generate sentence masks for encoder hidden states.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size.
        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.

        @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len),
                                    where src_len = max source length, h = hidden size.
        """
        enc_masks = torch.zeros(enc_hiddens.size(0),
                                enc_hiddens.size(1),
                                dtype=torch.float)
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1
        return enc_masks.to(self.device)

    def beam_search(self,
                    src_sent: List[str],
                    beam_size: int = 5,
                    max_decoding_time_step: int = 70) -> List[Hypothesis]:
        """ Given a single source sentence, perform beam search, yielding
        translations in the target language.
        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to
        unroll the decoding RNN
        @returns hypotheses (List[Hypothesis]): a list of hypothesis,
        each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as
                a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var,
                                                  [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)
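        # att_tm1 is o_{t-1}, the previous combined output; all zeros at the first decoding step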

        eos_id = self.vocab.tgt['</s>']

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses),
                                 dtype=torch.float,
                                 device=self.device)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            exp_src_encodings = src_encodings.expand(hyp_num,
                                                     src_encodings.size(1),
                                                     src_encodings.size(2))

            exp_src_encodings_att_linear = src_encodings_att_linear.expand(
                hyp_num, src_encodings_att_linear.size(1),
                src_encodings_att_linear.size(2))

            y_tm1 = torch.tensor(
                [self.vocab.tgt[hyp[-1]] for hyp in hypotheses],
                dtype=torch.long,
                device=self.device)
            y_t_embed = self.model_embeddings.target(y_tm1)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            (h_t, cell_t), att_t, _ = self.step(x,
                                                h_tm1,
                                                exp_src_encodings,
                                                exp_src_encodings_att_linear,
                                                enc_masks=None)

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t),
                                    dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            contiuating_hyp_scores = (
                hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(
                contiuating_hyp_scores, k=live_hyp_num)

            # Use floor division so the indices stay integer-valued
            prev_hyp_ids = top_cand_hyp_pos // len(self.vocab.tgt)
            hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt)
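            # Each flattened index encodes (parent hypothesis, target word): row = hypothesis id, column = word id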

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(
                    prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    completed_hypotheses.append(
                        Hypothesis(value=new_hyp_sent[1:-1],
                                   score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            live_hyp_ids = torch.tensor(live_hyp_ids,
                                        dtype=torch.long,
                                        device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores,
                                      dtype=torch.float,
                                      device=self.device)

        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(
                Hypothesis(value=hypotheses[0][1:],
                           score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

        return completed_hypotheses

    @property
    def device(self) -> torch.device:
        """ Determine which device to place the Tensors upon, CPU or GPU.
        """
        return self.model_embeddings.source.weight.device

    @staticmethod
    def load(model_path: str):
        """ Load the model from a file.
        @param model_path (str): path to model
        """
        params = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
        args = params['args']
        model = NMT(vocab=params['vocab'], **args)
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ Save the odel to a file.
        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args':
            dict(embed_size=self.model_embeddings.embed_size,
                 hidden_size=self.hidden_size,
                 dropout_rate=self.dropout_rate),
            'vocab':
            self.vocab,
            'state_dict':
            self.state_dict()
        }

        torch.save(params, path)
Example #6
class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidirectional LSTM Encoder
        - Unidirectional LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # default values
        self.encoder = None
        self.decoder = None
        self.h_projection = None
        self.c_projection = None
        self.att_projection = None
        self.combined_output_projection = None
        self.target_vocab_projection = None
        self.dropout = None

        ### YOUR CODE HERE (~8 Lines)
        ### TODO - Initialize the following variables:
        ###     self.encoder (Bidirectional LSTM with bias)
        ###     self.decoder (LSTM Cell with bias)
        ###     self.h_projection (Linear Layer with no bias), called W_{h} in the PDF.
        ###     self.c_projection (Linear Layer with no bias), called W_{c} in the PDF.
        ###     self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF.
        ###     self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF.
        ###     self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF.
        ###     self.dropout (Dropout Layer)
        ###
        ### Use the following docs to properly initialize these variables:
        ###     LSTM:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
        ###     LSTM Cell:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell
        ###     Linear Layer:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
        ###     Dropout Layer:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout

        ###################################################################################################################################################################
        ############
        ### Step 1: Initialize the entire Encoder network ###
        ############

        # nn.LSTM is a multi-layer long short-term memory (LSTM) RNN
        # input_size: The number of expected features in the input
        # hidden_size: The number of features in the hidden state
        # no need to specify number of time-steps
        # For the Encoder network, we use nn.LSTM because we only want the output of the entire network

        # Encoder network
        # Input of each bidirectional LSTM is the word embedding vector (shape 1 x e)
        # Output of each bidirectional LSTM is the hidden state and cell state of each LSTM (shape 1 x h)
        # The forward and backward LSTM outputs will be further concatenated resulting in shape (1 x 2h). The concatenation will be done separately.

        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=hidden_size,
                               bias=True,
                               bidirectional=True)

        ############
        ### Step 2: Initialize an LSTMCell for the Decoder network ###
        ############

        # nn.LSTMCell is a single long short-term memory (LSTM) cell (single time-step)
        # input_size: The number of expected features in the input
        # hidden_size: The number of features in the hidden state
        # For the Decoder network, we use nn.LSTMCell because we need to compute the output prediction at each time-step (LSTM cell)

        # A LSTMCell for the Decoder network
        # The input will be the concatenation of the output vector from the previous LSTM time-step (shape 1 x h) and the input word embedding vector (shape 1 x e) at current step
        # The output will be the hidden state and cell state (both shape 1 x h in PDF)

        self.decoder = nn.LSTMCell(input_size=hidden_size + embed_size,
                                   hidden_size=hidden_size,
                                   bias=True)

        ############
        ### Step 3: Initialize the Decoder network's first hidden state and cell state ###
        ############

        # We initialize the Decoder network's first hidden state and cell state with a linear projection (no activation) of the Encoder's final hidden state and final cell state
        # Linear projection means no activation, just a multiplication by the weight matrix

        # Layer input: Concatenated bidirectional hidden/cell state vector of the last layer of the Encoder network (1 x 2h)
        # W_{h/c}: Linear layer below (shape 2h x h)
        # Layer output: First hidden/cell state of Decoder network (shape 1 x h)

        self.h_projection = nn.Linear(in_features=2 * hidden_size,
                                      out_features=hidden_size,
                                      bias=False)
        self.c_projection = nn.Linear(in_features=2 * hidden_size,
                                      out_features=hidden_size,
                                      bias=False)

        ############
        ### Step 4: Initialize the linear Attention Projection Layer ###
        ############

        # We implement multiplicative attention (lecture slide# 78) through a linear layer

        # Layer input: the hidden state vector of Encoder network h_{enc} (shape src_len x 2h)
        # Layer output will be further multiplied by the hidden state of one Decoder time-step h_{dec}.T (shape 1 x h)
        # The multiplication result e is the attention score vector (size m x 1 in PDF)

        # Layer input h_{enc}: shape (src_len x 2h)
        # W_{att_projection}: Linear layer below (shape 2h x h)
        # Layer output: shape (src_len x h), which is a “linear projection” of the hidden state vector of the entire Encoding network

        self.att_projection = nn.Linear(in_features=2 * hidden_size,
                                        out_features=hidden_size,
                                        bias=False)

        ############
        ### Step 5: Initialize the Attention Output Layer ###
        ############

        ## A softmax activation is not initialized here; it is applied later (in step()) to normalize the attention distribution for the current Decoder LSTM step. The output of the softmax function is denoted as alpha_t in the PDF (shape 1 x src_len) ##

        ## The output of the softmax function (shape 1 x src_len) will be multiplied by all the hidden states of the Encoder LSTM steps (shape src_len x 2h) and the result is the attention output for the current Decoder LSTM step (shape 1 x 2h)##

        # We concatenate the attention output for the current Decoder LSTM step (shape 1 x 2h) and hidden state output of the current LSTM step in Decoder network (shape 1 x h) and run the result (shape 1 x 3h) through a linear layer to get the output vector of the current Decoder LSTM step

        # Layer input: concatenated attention output vector and hidden state of current Decoder time-step (shape 1 x 3h)
        # W_{u}: Linear layer below (shape 3h x h)
        # Layer output: V_t (shape 1 x h), which still needs to pass through tanh and dropout to become the final output of one Decoder time-step

        self.combined_output_projection = nn.Linear(3 * hidden_size,
                                                    hidden_size,
                                                    bias=False)

        ############
        ### Step 6: Initialize Dropout ###
        ############

        self.dropout = nn.Dropout(p=self.dropout_rate)

        ############
        ### Step 7: Initialize a linear layer before the final softmax function ###
        ############

        # Then, we produce a probability distribution over target words at the current Decoder LSTM step through a softmax function
        # The softmax function is not initialized here; it is applied later (as log_softmax in forward())
        # The output vector should go through a linear layer below before the softmax activation

        # Layer input: o_t, almost the output of one Decoder time-step (shape 1 x h)
        # W_{vocab}: Linear layer below (shape h x len(vocab.tgt))
        # Layer output: scores over the target vocabulary for one Decoder time-step (shape 1 x len(vocab.tgt)); the softmax turns these into a probability distribution over predicted words

        self.target_vocab_projection = nn.Linear(hidden_size,
                                                 len(vocab.tgt),
                                                 bias=False)

        ###################################################################################################################################################################

        ### END YOUR CODE

    def forward(self, source: List[List[str]],
                target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`

        @returns scores (Tensor): a variable/tensor of shape (b, ) representing the
                                    log-likelihood of generating the gold-standard target sentence for
                                    each example in the input batch. Here b = batch size.
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Convert list of lists into tensors
        source_padded = self.vocab.src.to_input_tensor(
            source, device=self.device)  # Tensor: (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(
            target, device=self.device)  # Tensor: (tgt_len, b)

        ###     Run the network forward:
        ###     1. Apply the encoder to `source_padded` by calling `self.encode()`
        ###     2. Generate sentence masks for `source_padded` by calling `self.generate_sent_masks()`
        ###     3. Apply the decoder to compute combined-output by calling `self.decode()`
        ###     4. Compute log probability distribution over the target vocabulary using the
        ###        combined_outputs returned by the `self.decode()` function.

        enc_hiddens, dec_init_state = self.encode(source_padded,
                                                  source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state,
                                       target_padded)
        P = F.log_softmax(self.target_vocab_projection(combined_outputs),
                          dim=-1)

        # Zero out probabilities for which we have nothing in the target text
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Compute log probability of generating true target words
        target_gold_words_log_prob = torch.gather(
            P, index=target_padded[1:].unsqueeze(-1),
            dim=-1).squeeze(-1) * target_masks[1:]
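        # P: (tgt_len - 1, b, |V_tgt|); gather picks each gold word's log-probability and the mask zeroes out <pad> positions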
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores

    def encode(
        self, source_padded: torch.Tensor, source_lengths: List[int]
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
            Additionally, take the final states of the encoder and project them to obtain initial states for decoder.

        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where
                                        b = batch_size, src_len = maximum source sentence length. Note that 
                                       these have already been sorted in order of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch
        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
                                        b = batch size, src_len = maximum source sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
                                                hidden state and cell.
        """
        enc_hiddens, dec_init_state = None, None

        ### YOUR CODE HERE (~ 8 Lines)
        ### TODO:
        ###     1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings.
        ###         src_len = maximum source sentence length, b = batch size, e = embedding size. Note
        ###         that there is no initial hidden state or cell for the decoder.
        ###     2. Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`.
        ###         - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X.
        ###         - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens.
        ###         - Note that the shape of the tensor returned by the encoder is (src_len, b, h*2) and we want to
        ###           return a tensor of shape (b, src_len, h*2) as `enc_hiddens`.
        ###     3. Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell):
        ###         - `init_decoder_hidden`:
        ###             `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
        ###             Apply the h_projection layer to this in order to compute init_decoder_hidden.
        ###             This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size
        ###         - `init_decoder_cell`:
        ###             `last_cell` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
        ###             Apply the c_projection layer to this in order to compute init_decoder_cell.
        ###             This is c_0^{dec} in the PDF. Here b = batch size, h = hidden size
        ###
        ### See the following docs, as you may need to use some of the following functions in your implementation:
        ###     Pack the padded sequence X before passing to the encoder:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pack_padded_sequence
        ###     Pad the packed sequence, enc_hiddens, returned by the encoder:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pad_packed_sequence
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tensor Permute:
        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.permute

        ###################################################################################################################################################################
        ############
        ### Step 1: Get word embeddings for the source sentences ###
        ############

        # nn.Embedding:
        # Input: torch.LongTensor (number of indices to extract per mini-batch, mini-batch size)
        # Output: (number of indices to extract per mini-batch, mini-batch size, embedding_dim)

        # source_padded: shape (max_seq_len, batch_size) = (src_len, b)

        # X: shape (max_seq_len, batch_size, embedding_dim) = (src_len, b, e)
        # Notice here the change between input and output of nn.Embedding is just the additional word embedding in the last dimension
        # X is essentially the word embedding vectors of all Encoder time-steps for one mini-batch
        # X will act as the input to the Encoder network, self.encoder = nn.LSTM() defined in __init__()

        X = self.model_embeddings.source(source_padded)

        ############
        ### Step 2: Feed word embeddings into Encoding network to get output, last hidden state, and last cell state of Encoder network ###
        ############

        # pack_padded_sequence: https://github.com/HarshTrivedi/packing-unpacking-pytorch-minimal-tutorial

        # Remove padding from X so that when we later feed it into RNN, the paddings will not be computed as input to LSTM steps
        # packed_input.data.shape: (unpadded_sum_seq_len, embedding_dim)

        packed_input = pack_padded_sequence(X, torch.Tensor(source_lengths))

        # LSTM: https://stackoverflow.com/questions/48302810/whats-the-difference-between-hidden-and-output-in-pytorch-lstm
        # Inputs: input, (h_0, c_0)
        # If (h_0, c_0) is not provided, both h_0 and c_0 default to zero.
        # Outputs: output, (h_n, c_n)
        # h_n: shape (num_layers * num_directions, batch, hidden_dim) = (1 * 2, b, h)
        # c_n: shape (num_layers * num_directions, batch, hidden_dim) = (1 * 2, b, h)

        # last_hidden: shape (2, b, h)
        # last_cell: shape (2, b, h)
        # packed_output.data.shape : (unpadded_sum_seq_len, hidden_dim)

        packed_output, (last_hidden, last_cell) = self.encoder(packed_input)

        ############
        ### Step 3: Post dimensionality processing of output of Encoder network: enc_hiddens ###
        ############

        # Unpack output to gain padding
        # enc_hiddens.shape : (max_seq_len, batch_size, hidden_dim) = (src_len, b, 2h)
        enc_hiddens, _ = pad_packed_sequence(packed_output)

        # convert it to shape (batch_size, max_seq_len, hidden_dim) = (b, src_len, 2h)
        enc_hiddens = enc_hiddens.transpose(0, 1)

        ############
        ### Step 4: Post dimensionality processing of last hidden state, and last cell state of Encoder network: last_hidden, last_cell ###
        ############

        # last_hidden and last_cell have shape (2, b, h). The 0-dim has size 2, meaning it contains the hidden state or cell state of both directions. We need to concatenate them manually

        # convert it to shape (batch_size, hidden_dim)
        # torch.cat(tensors, dim=0, out=None) → Tensor
        # dim: the dimension over which the tensors are concatenated
        # last_hidden: shape (2, b, h)
        # last_hidden[0], last_hidden[1]: shape (b, h)
        # output: shape (b, 2h)

        # Essentially, we are concatenating the two directional hidden states of the Encoder network and concatenating the two directional cell states of the Encoder network

        last_hidden = torch.cat((last_hidden[0], last_hidden[1]), 1)
        last_cell = torch.cat((last_cell[0], last_cell[1]), 1)

        ############
        ### Step 5: Compute the Decoder network's first hidden state and cell state with a linear projection of the Encoder's final hidden state and final cell state ###
        ############

        # last_hidden: shape (b, 2h)
        # init_decoder_hidden: shape (b, h)
        init_decoder_hidden = self.h_projection(last_hidden)
        init_decoder_cell = self.c_projection(last_cell)

        dec_init_state = (init_decoder_hidden, init_decoder_cell)

        ###################################################################################################################################################################

        ### END YOUR CODE

        return enc_hiddens, dec_init_state

    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
               dec_init_state: Tuple[torch.Tensor, torch.Tensor],
               target_padded: torch.Tensor) -> torch.Tensor:
        """Compute combined output vectors for a batch.

        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
                                     b = batch size, src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where
                                       tgt_len = maximum target sentence length, b = batch size. 

        @returns combined_outputs (Tensor): combined output tensor  (tgt_len, b,  h), where
                                        tgt_len = maximum target sentence length, b = batch_size,  h = hidden size
        """
        # Chop off the <END> token for max length sentences.
        target_padded = target_padded[:-1]

        # Initialize the decoder state (hidden and cell)
        dec_state = dec_init_state

        # Initialize previous combined output vector o_{t-1} as zero
        batch_size = enc_hiddens.size(0)
        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

        # Initialize a list we will use to collect the combined output o_t on each step
        combined_outputs = []

        ### YOUR CODE HERE (~9 Lines)
        ### TODO:
        ###     1. Apply the attention projection layer to `enc_hiddens` to obtain `enc_hiddens_proj`,
        ###         which should be shape (b, src_len, h),
        ###         where b = batch size, src_len = maximum source length, h = hidden size.
        ###         This is applying W_{attProj} to h^enc, as described in the PDF.
        ###     2. Construct tensor `Y` of target sentences with shape (tgt_len, b, e) using the target model embeddings.
        ###         where tgt_len = maximum target sentence length, b = batch size, e = embedding size.
        ###     3. Use the torch.split function to iterate over the time dimension of Y.
        ###         Within the loop, this will give you Y_t of shape (1, b, e) where b = batch size, e = embedding size.
        ###             - Squeeze Y_t into a tensor of dimension (b, e).
        ###             - Construct Ybar_t by concatenating Y_t with o_prev.
        ###             - Use the step function to compute the Decoder's next (cell, state) values
        ###               as well as the new combined output o_t.
        ###             - Append o_t to combined_outputs
        ###             - Update o_prev to the new o_t.
        ###     4. Use torch.stack to convert combined_outputs from a list length tgt_len of
        ###         tensors shape (b, h), to a single tensor shape (tgt_len, b, h)
        ###         where tgt_len = maximum target sentence length, b = batch size, h = hidden size.
        ###
        ### Note:
        ###    - When using the squeeze() function make sure to specify the dimension you want to squeeze
        ###      over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
        ###
        ### Use the following docs to implement this functionality:
        ###     Zeros Tensor:
        ###         https://pytorch.org/docs/stable/torch.html#torch.zeros
        ###     Tensor Splitting (iteration):
        ###         https://pytorch.org/docs/stable/torch.html#torch.split
        ###     Tensor Dimension Squeezing:
        ###         https://pytorch.org/docs/stable/torch.html#torch.squeeze
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tensor Stacking:
        ###         https://pytorch.org/docs/stable/torch.html#torch.stack
        ###################################################################################################################################################################
        ############
        ### Step 1: Feed Encoder hidden state into the attention projection layer to obtain the attention output ###
        ############

        # enc_hiddens_proj will be used for computing attention scores in later steps

        # enc_hiddens: shape (b, src_len, 2h)
        # W_{attProj}: shape (2h, h)
        # enc_hiddens_proj = enc_hiddens · W_{attProj}: shape (b, src_len, h)
        enc_hiddens_proj = self.att_projection(enc_hiddens)

        ############
        ### Step 2: Get word embeddings for the target sentences for the Decoder network ###
        ############

        # Y: shape (tgt_len, b, e)
        Y = self.model_embeddings.target(target_padded)

        ############
        ### Step 3: Step through the time steps in the Decoder network ###
        ############

        # tensor.split() example: https://blog.csdn.net/weixin_44613063/article/details/89576810
        # tensor.split(size of each piece, dimension to be split)
        # here we split Y's 0-dim into pieces of size 1
        for Y_t in Y.split(1, dim=0):
            ############
            ### Step 3.1: Remove redundant dimension from Y_t ###
            ############

            # torch.squeeze() example: https://jamesmccaffrey.wordpress.com/2019/07/02/the-pytorch-view-reshape-squeeze-and-flatten-functions/
            # In some sense a dimension with size 1 is redundant. The squeeze() function eliminates any dimension that has size 1
            # We can also pass an argument to specify which dimension to squeeze
            # Y_t is the word embedding vectors of a mini-batch at one time-step
            # Y_t: shape (1, b, e) → Y_t_squeezed: shape (b, e)
            Y_t_squeezed = Y_t.squeeze(dim=0)

            ############
            ### Step 3.2: Concatenate the word embedding input at current time-step and the predicted output at previous time-step to be the LSTM input of current time-step ###
            ############

            # torch.cat() example: https://blog.csdn.net/weixin_44613063/article/details/89576810
            # torch.cat(tensors to concatenate, dimension to concatenate) → Tensor
            # o_prev: the predicted output at previous Decoder time-step
            # o_prev was initialized as shape (b, h) with torch.zeros()
            # Y_t_squeezed: the word embedding input at current time-step, shape (b, e)
            # Ybar_t: concatenated LSTM input of current time-step, shape (b, e+h)
            Ybar_t = torch.cat((Y_t_squeezed, o_prev), dim=1)

            ############
            ### Step 3.3: Compute one forward step of the LSTM decoder, including the attention computation to get output and new hidden state at current time-step ###
            ############

            dec_state, o_t, _ = self.step(Ybar_t, dec_state, enc_hiddens,
                                          enc_hiddens_proj, enc_masks)

            ############
            ### Step 3.4: Update the Decoder output vector (that contains output of all Decoder time-steps) to include the output of the current time-step ###
            ############

            # combined_outputs is a list that stores Decoder outputs. When we finish each Decoder time-step, we append the output to it
            # o_t: output at current Decoder time-step, shape (b, h)
            combined_outputs.append(o_t)

            ############
            ### Step 3.5: Update variable that stores the output of previous time-step ###
            ############

            o_prev = o_t

        ############
        ### Step 4: Reshape the Decoder output vector combined_outputs to (tgt_len, b, h) ###
        ############

        # before stacking, the length of 0-dim of combined_outputs is the number of time-steps in Decoder network
        combined_outputs = torch.stack(combined_outputs, dim=0)
        ###################################################################################################################################################################

        ### END YOUR CODE

        return combined_outputs

    def step(
            self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor,
                                                         torch.Tensor],
            enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor
    ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length. 

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is the attention scores distribution.
                                Note: You will not use this outside of this function.
                                      We are simply returning this value so that we can sanity check
                                      your implementation.
        """

        combined_output = None

        ### YOUR CODE HERE (~3 Lines)
        ### TODO:
        ###     1. Apply the decoder to `Ybar_t` and `dec_state`to obtain the new dec_state.
        ###     2. Split dec_state into its two parts (dec_hidden, dec_cell)
        ###     3. Compute the attention scores e_t, a Tensor shape (b, src_len).
        ###        Note: b = batch_size, src_len = maximum source length, h = hidden size.
        ###
        ###       Hints:
        ###         - dec_hidden is shape (b, h) and corresponds to h^dec_t in the PDF (batched)
        ###         - enc_hiddens_proj is shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched).
        ###         - Use batched matrix multiplication (torch.bmm) to compute e_t.
        ###         - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing.
        ###         - When using the squeeze() function make sure to specify the dimension you want to squeeze
        ###             over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
        ###
        ### Use the following docs to implement this functionality:
        ###     Batch Multiplication:
        ###        https://pytorch.org/docs/stable/torch.html#torch.bmm
        ###     Tensor Unsqueeze:
        ###         https://pytorch.org/docs/stable/torch.html#torch.unsqueeze
        ###     Tensor Squeeze:
        ###         https://pytorch.org/docs/stable/torch.html#torch.squeeze
        ###################################################################################################################################################################
        ############
        ### Step 1: Apply the input (concatenation of word embedding input at current time-step and output at previous time-step) into Decoder LSTMCell to get new output at current time-step ###
        ############

        # LSTMCell
        # Inputs: input, (h_0, c_0)
        # Outputs: (h_1, c_1)

        # Ybar_t: concatenated LSTM input of current time-step, shape (b, e+h)
        # dec_state as input contains both hidden state and cell state, hidden state and cell state both are shape (b, h)
        # dec_state as output: a tuple (dec_hidden, dec_cell), each of shape (b, h)

        dec_state = self.decoder(Ybar_t, dec_state)

        ############
        ### Step 2: Split dec_state into its two parts (dec_hidden, dec_cell) ###
        ############

        # dec_hidden, dec_cell: shape (b, h)
        (dec_hidden, dec_cell) = dec_state

        ############
        ### Step 3: Compute attention score vector for the current time-step ###
        ############

        # We multiply the hidden state vector “projection” of the entire Encoding network by the hidden state of the current time-step in Decoder network to get the attention score vector for the current time-step

        # enc_hiddens_proj: shape (b, src_len, h)
        # dec_hidden: shape (b, h)
        # torch.unsqueeze(dec_hidden, 2): shape (b, h, 1)
        # torch.bmm(input, mat2, out=None) → Tensor
        # If input is a (b×n×m) tensor, mat2 is a (b×m×p) tensor, out will be a (b×n×p) tensor.
        # enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)): shape (b, src_len, 1)
        # e_t: shape (b, src_len)
        # e_t contains the attentions score of each time-step in Encoding network on the current one time-step in Decoder network
        e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2)

        ###################################################################################################################################################################

        ### END YOUR CODE

        # Set e_t to -inf where enc_masks has 1
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

        ### YOUR CODE HERE (~6 Lines)
        ### TODO:
        ###     1. Apply softmax to e_t to yield alpha_t
        ###     2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the
        ###         attention output vector, a_t.
        ###     Hints:
        ###           - alpha_t is shape (b, src_len)
        ###           - enc_hiddens is shape (b, src_len, 2h)
        ###           - a_t should be shape (b, 2h)
        ###           - You will need to do some squeezing and unsqueezing.
        ###     Note: b = batch size, src_len = maximum source length, h = hidden size.
        ###
        ###     3. Concatenate dec_hidden with a_t to compute tensor U_t
        ###     4. Apply the combined output projection layer to U_t to compute tensor V_t
        ###     5. Compute tensor O_t by first applying the Tanh function and then the dropout layer.
        ###
        ### Use the following docs to implement this functionality:
        ###     Softmax:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.functional.softmax
        ###     Batch Multiplication:
        ###        https://pytorch.org/docs/stable/torch.html#torch.bmm
        ###     Tensor View:
        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tanh:
        ###         https://pytorch.org/docs/stable/torch.html#torch.tanh
        ###################################################################################################################################################################
        ############
        ### Step 1: Compute attention distribution alpha_t for the current time-step ###
        ############

        # Softmax converts all attention scores into values in [0, 1] that sum to 1

        # e_t: shape (b, src_len)
        # alpha_t: shape (b, src_len)
        alpha_t = F.softmax(e_t, dim=1)

        ############
        ### Step 2: Compute attention output a_t for the current time-step ###
        ############

        # We multiply the attention distribution vector by the hidden state vector of the entire Encoding network to get the attention output for the current time-step in Decoder network

        # torch.bmm(input, mat2, out=None) → Tensor
        # If input is a (b×n×m) tensor, mat2 is a (b×m×p) tensor, out will be a (b×n×p) tensor.
        # alpha_t: shape (b, src_len)
        # alpha_t.unsqueeze(1): shape (b, 1, src_len)
        # enc_hiddens: shape (b, src_len, 2h)
        # alpha_t.unsqueeze(1).bmm(enc_hiddens): shape (b, 1, 2h)
        # a_t: shape (b, 2h)
        a_t = alpha_t.unsqueeze(1).bmm(enc_hiddens).squeeze(1)

        ############
        ### Step 3: Concatenate attention output a_t with the hidden state of current Decoder time-step ###
        ############

        # U_t contains information from both the hidden state of current Decoder time-step and the attention from the Encoder network

        # dec_hidden: shape (b, h)
        # U_t: shape (b, 3h)
        U_t = torch.cat((a_t, dec_hidden), dim=1)

        ############
        ### Step 4: We pass the concatenated result through a linear layer ###
        ############

        V_t = self.combined_output_projection(U_t)

        ############
        ### Step 5: We apply tanh activation for the linear layer output and apply dropout to obtain the combined output vector O_t ###
        ############

        O_t = self.dropout(torch.tanh(V_t))

        ###################################################################################################################################################################

        ### END YOUR CODE

        combined_output = O_t
        return dec_state, combined_output, e_t

    def generate_sent_masks(self, enc_hiddens: torch.Tensor,
                            source_lengths: List[int]) -> torch.Tensor:
        """ Generate sentence masks for encoder hidden states.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size. 
        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.
        
        @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len),
                                    where src_len = max source length, h = hidden size.
        """
        enc_masks = torch.zeros(enc_hiddens.size(0),
                                enc_hiddens.size(1),
                                dtype=torch.float)
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1
        return enc_masks.to(self.device)

    def beam_search(self,
                    src_sent: List[str],
                    beam_size: int = 5,
                    max_decoding_time_step: int = 70) -> List[Hypothesis]:
        """ Given a single source sentence, perform beam search, yielding translations in the target language.
        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
        @returns hypotheses (List[Hypothesis]): a list of hypotheses, each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var,
                                                  [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        eos_id = self.vocab.tgt['</s>']

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses),
                                 dtype=torch.float,
                                 device=self.device)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses
                  ) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            exp_src_encodings = src_encodings.expand(hyp_num,
                                                     src_encodings.size(1),
                                                     src_encodings.size(2))

            exp_src_encodings_att_linear = src_encodings_att_linear.expand(
                hyp_num, src_encodings_att_linear.size(1),
                src_encodings_att_linear.size(2))

            y_tm1 = torch.tensor(
                [self.vocab.tgt[hyp[-1]] for hyp in hypotheses],
                dtype=torch.long,
                device=self.device)
            y_t_embed = self.model_embeddings.target(y_tm1)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            (h_t, cell_t), att_t, _ = self.step(x,
                                                h_tm1,
                                                exp_src_encodings,
                                                exp_src_encodings_att_linear,
                                                enc_masks=None)
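            # enc_masks=None: beam search decodes a single (replicated) source sentence, so there is no padding to mask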

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t),
                                    dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            contiuating_hyp_scores = (
                hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(
                contiuating_hyp_scores, k=live_hyp_num)

            prev_hyp_ids = top_cand_hyp_pos // len(self.vocab.tgt)  # floor division: parent hypothesis index
            hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt)

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(
                    prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    completed_hypotheses.append(
                        Hypothesis(value=new_hyp_sent[1:-1],
                                   score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            live_hyp_ids = torch.tensor(live_hyp_ids,
                                        dtype=torch.long,
                                        device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores,
                                      dtype=torch.float,
                                      device=self.device)

        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(
                Hypothesis(value=hypotheses[0][1:],
                           score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

        return completed_hypotheses

    @property
    def device(self) -> torch.device:
        """ Determine which device to place the Tensors upon, CPU or GPU.
        """
        return self.model_embeddings.source.weight.device

    @staticmethod
    def load(model_path: str):
        """ Load the model from a file.
        @param model_path (str): path to model
        """
        params = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
        args = params['args']
        model = NMT(vocab=params['vocab'], **args)
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ Save the odel to a file.
        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args':
            dict(embed_size=self.model_embeddings.embed_size,
                 hidden_size=self.hidden_size,
                 dropout_rate=self.dropout_rate),
            'vocab':
            self.vocab,
            'state_dict':
            self.state_dict()
        }

        torch.save(params, path)
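
# Illustrative usage sketch (not part of the original code): assumes vocab.py provides a Vocab
# object with a load() helper and that source/target batches are lists of token lists, as in
# forward() above; paths and hyperparameters below are hypothetical.
#
#   vocab = Vocab.load('vocab.json')
#   model = NMT(embed_size=256, hidden_size=256, vocab=vocab, dropout_rate=0.2)
#   scores = model(source_batch, target_batch)   # (b,) log-likelihood of each gold target sentence
#   model.save('model.bin')
#   model = NMT.load('model.bin')
#   hyps = model.beam_search(['we', 'love', 'nmt'], beam_size=5, max_decoding_time_step=70)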
Example #7
class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidirectional LSTM Encoder
        - Unidirectional LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """

    def __init__(self, src_embed_size, dst_embed_size, hidden_size, vocab, dropout_rate=0.2, use_pos_embed=False,
                 use_copy=False, max_src_len=48):
        """ Init NMT Model.

        @param src_embed_size (int): encoder Embedding size (dimensionality)
        @param dst_embed_size (int): decoder Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for  documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(src_embed_size, dst_embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab
        self.use_pos_embed = use_pos_embed
        self.use_copy = use_copy
        self.max_src_len = max_src_len

        # default values
        self.encoder = None
        self.decoder = None
        self.h_projection = None
        self.c_projection = None
        self.att_projection = None
        self.combined_output_projection = None
        self.target_vocab_projection = None
        self.dropout = None

        ### TODO - Initialize the following variables:
        ###     self.encoder (Bidirectional LSTM with bias)
        ###     self.decoder (LSTM Cell with bias)
        ###     self.h_projection (Linear Layer with no bias), called W_{h} in the PDF.
        ###     self.c_projection (Linear Layer with no bias), called W_{c} in the PDF.
        ###     self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF.
        ###     self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF.
        ###     self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF.
        ###     self.dropout (Dropout Layer)
        ###

        self.encoder = nn.LSTM(src_embed_size, hidden_size, num_layers=1, bias=True, bidirectional=True)
        self.decoder = nn.LSTMCell(dst_embed_size+hidden_size, hidden_size, bias=True)
        self.h_projection = nn.Linear(2*hidden_size, hidden_size, bias=False)
        self.c_projection = nn.Linear(2*hidden_size, hidden_size, bias=False)
        self.att_projection = nn.Linear(2*hidden_size, hidden_size, bias=False) if not use_pos_embed else \
            nn.Linear(2 * hidden_size+dst_embed_size, hidden_size, bias=False)
        if use_pos_embed:
            self.combined_output_projection = nn.Linear(3 * hidden_size+dst_embed_size, hidden_size, bias=False)
        elif use_copy:
            self.combined_output_projection = nn.Linear(3 * hidden_size + max_src_len, hidden_size, bias=False)
        else:
            self.combined_output_projection = nn.Linear(3*hidden_size, hidden_size, bias=False)
        self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False)
        self.dropout = nn.Dropout(p=dropout_rate)
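        # Note: combined_output_projection takes dst_embed_size extra input features in the
        # positional-embedding variant and max_src_len extra features in the copy variant,
        # presumably matching whatever extra features are concatenated into U_t in this variant.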
        # self.device = torch.device('cpu')



    def forward(self, source: torch.Tensor, source_lengths: List[int], target: torch.Tensor) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        @param source (Tensor): padded source sentences with shape (src_len, b)
        @param source_lengths (List[int]): list of source sentence lengths
        @param target (Tensor): padded target sentences with shape (tgt_len, b)

        @returns scores (Tensor): a variable/tensor of shape (b, ) representing the
                                    log-likelihood of generating the gold-standard target sentence for
                                    each example in the input batch. Here b = batch size.
        """
        source_padded = source
        target_padded = target
        src_len = source.size(0)
        batch_size = source.size(1)


        ###     Run the network forward:
        ###     1. Apply the encoder to `source_padded` by calling `self.encode()`
        ###     2. Generate sentence masks for `source_padded` by calling `self.generate_sent_masks()`
        ###     3. Apply the decoder to compute combined-output by calling `self.decode()`
        ###     4. Compute log probability distribution over the target vocabulary using the
        ###        combined_outputs returned by the `self.decode()` function.

        enc_hiddens, dec_init_state = self.encode(source_padded, source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded)
        P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1)

        # Zero out probabilities for which we have nothing in the target text
        # target_masks = (target_padded != self.vocab.tgt['<pad>']).float() #dst_pad_token_idx
        target_masks = (target_padded != self.vocab.dst_pad_token_idx).float()

        # Compute log probability of generating true target words
        target_gold_words_log_prob = torch.gather(P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(
            -1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores


    def encode(self, source_padded: torch.Tensor, source_lengths: List[int]) -> Tuple[
        torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
            Additionally, take the final states of the encoder and project them to obtain initial states for decoder.

        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where
                                        b = batch_size, src_len = maximum source sentence length. Note that 
                                       these have already been sorted in order of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch
        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
                                        b = batch size, src_len = maximum source sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
                                                hidden state and cell.
        """
        enc_hiddens, dec_init_state = None, None

        ### TODO:
        ###     1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings.
        ###         src_len = maximum source sentence length, b = batch size, e = embedding size. Note
        ###         that there is no initial hidden state or cell for the decoder.
        ###     2. Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`.
        ###         - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X.
        ###         - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens.
        ###         - Note that the shape of the tensor returned by the encoder is (src_len, b, h*2) and we want to
        ###           return a tensor of shape (b, src_len, h*2) as `enc_hiddens`.
        ###     3. Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell):
        ###         - `init_decoder_hidden`:
        ###             `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
        ###             Apply the h_projection layer to this in order to compute init_decoder_hidden.
        ###             This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size
        ###         - `init_decoder_cell`:
        ###             `last_cell` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
        ###             Apply the c_projection layer to this in order to compute init_decoder_cell.
        ###             This is c_0^{dec} in the PDF. Here b = batch size, h = hidden size
        ###
        # look up embeddings for the padded source token indices
        X = self.model_embeddings.source(source_padded)
        input = torch.nn.utils.rnn.pack_padded_sequence(X, source_lengths)
        enc_hiddens, (last_hidden, last_cell) = self.encoder(input)
        enc_hiddens, _ = torch.nn.utils.rnn.pad_packed_sequence(enc_hiddens, batch_first=True)

        init_decoder_hidden = self.h_projection(torch.cat([last_hidden[0], last_hidden[1]], 1))
        init_decoder_cell = self.c_projection(torch.cat([last_cell[0], last_cell[1]], 1))
        dec_init_state = (init_decoder_hidden, init_decoder_cell)


        return enc_hiddens, dec_init_state
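
        # Illustrative sketch (not part of the original example): how pack/pad round-trip
        # the shapes used above, for a toy batch of 2 sentences with lengths [5, 3],
        # embed_size=4 and hidden_size=6:
        #   X = torch.randn(5, 2, 4)                                    # (src_len, b, e)
        #   packed = torch.nn.utils.rnn.pack_padded_sequence(X, [5, 3])
        #   out, (h_n, c_n) = nn.LSTM(4, 6, bidirectional=True)(packed)
        #   out, _ = torch.nn.utils.rnn.pad_packed_sequence(out, batch_first=True)
        #   out.shape  -> (2, 5, 12)   # (b, src_len, 2*h)
        #   h_n.shape  -> (2, 2, 6)    # (directions, b, h); torch.cat([h_n[0], h_n[1]], 1) -> (b, 2*h)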

    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor, dec_init_state: Tuple[torch.Tensor, torch.Tensor],
               target_padded: torch.Tensor) -> torch.Tensor:
        """Compute combined output vectors for a batch.

        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
                                     b = batch size, src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where
                                       tgt_len = maximum target sentence length, b = batch size.

        @returns combined_outputs (Tensor): combined output tensor  (tgt_len, b,  h), where
                                        tgt_len = maximum target sentence length, b = batch_size,  h = hidden size
        """
        # Chop off the <END> token for max length sentences.
        target_padded = target_padded[:-1]

        # Initialize the decoder state (hidden and cell)
        dec_state = dec_init_state

        # Initialize previous combined output vector o_{t-1} as zero
        batch_size = enc_hiddens.size(0)
        src_len = enc_hiddens.size(1)
        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

        # Initialize a list we will use to collect the combined output o_t on each step
        combined_outputs = []

        ### TODO:
        ###     1. Apply the attention projection layer to `enc_hiddens` to obtain `enc_hiddens_proj`,
        ###         which should be shape (b, src_len, h),
        ###         where b = batch size, src_len = maximum source length, h = hidden size.
        ###         This is applying W_{attProj} to h^enc, as described in the PDF.
        ###     2. Construct tensor `Y` of target sentences with shape (tgt_len, b, e) using the target model embeddings.
        ###         where tgt_len = maximum target sentence length, b = batch size, e = embedding size.
        ###     3. Use the torch.split function to iterate over the time dimension of Y.
        ###         Within the loop, this will give you Y_t of shape (1, b, e) where b = batch size, e = embedding size.
        ###             - Squeeze Y_t into a tensor of dimension (b, e). 
        ###             - Construct Ybar_t by concatenating Y_t with o_prev.
        ###             - Use the step function to compute the Decoder's next (cell, state) values
        ###               as well as the new combined output o_t.
        ###             - Append o_t to combined_outputs
        ###             - Update o_prev to the new o_t.
        ###     4. Use torch.stack to convert combined_outputs from a list length tgt_len of
        ###         tensors shape (b, h), to a single tensor shape (tgt_len, b, h)
        ###         where tgt_len = maximum target sentence length, b = batch size, h = hidden size.
        ###

        combined_inputs = enc_hiddens
        if self.use_pos_embed:
            input_pos_embed = self.create_input_pos_embed(src_len, batch_size)
            combined_inputs = torch.cat((enc_hiddens, input_pos_embed), dim=2)
            enc_hiddens_proj = self.att_projection(combined_inputs)
        else:
            enc_hiddens_proj = self.att_projection(enc_hiddens)

        # print("Device is: {}".format(self.device))
        # target_indexes = None
        # if torch.cuda.is_available():
        #     target_indexes = torch.cuda.LongTensor(self.vocab.tgt.words2indices(target_padded.tolist())).to(self.device)
        # else:
        #     target_indexes = torch.LongTensor(self.vocab.tgt.words2indices(target_padded.tolist())).to(self.device)
        # target_indexes = torch.LongTensor(self.vocab.tgt.words2indices(target_padded.tolist()))
        # Y = self.model_embeddings.target(target_indexes)
        Y = self.model_embeddings.target(target_padded)

        for Y_t in torch.split(Y, 1, dim=0):
            Y_t = torch.squeeze(Y_t, dim=0)
            Ybar_t = torch.cat([Y_t, o_prev], dim=1)
            dec_state, o_t, _ = self.step(Ybar_t, dec_state, combined_inputs, enc_hiddens_proj, enc_masks)
            combined_outputs.append(o_t)
            o_prev = o_t

        combined_outputs = torch.stack(combined_outputs, dim=0)

        return combined_outputs
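
        # Illustrative sketch (not part of the original example): shape flow of the loop
        # above for a toy tgt_len=3, b=2, e=4, h=6:
        #   Y = torch.randn(3, 2, 4)                          # (tgt_len, b, e)
        #   for Y_t in torch.split(Y, 1, dim=0):              # each Y_t: (1, b, e)
        #       Y_t = Y_t.squeeze(0)                          # (b, e)
        #       Ybar_t = torch.cat([Y_t, torch.zeros(2, 6)], dim=1)   # (b, e + h)
        #   torch.stack([torch.zeros(2, 6)] * 3, dim=0)       # (tgt_len, b, h)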

    def step(self, Ybar_t: torch.Tensor,
             dec_state: Tuple[torch.Tensor, torch.Tensor],
             enc_hiddens: torch.Tensor,
             enc_hiddens_proj: torch.Tensor,
             enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor (+ input_pos_embed if use_pos_embed), with shape (b, src_len, h * 2 (+ dst_embed_size)), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length. 

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution.
                                Note: You will not use this outside of this function.
                                      We are simply returning this value so that we can sanity check
                                      your implementation.
        """

        combined_output = None


        ### TODO:
        ###     1. Apply the decoder to `Ybar_t` and `dec_state`to obtain the new dec_state.
        ###     2. Split dec_state into its two parts (dec_hidden, dec_cell)
        ###     3. Compute the attention scores e_t, a Tensor shape (b, src_len). 
        ###        Note: b = batch_size, src_len = maximum source length, h = hidden size.
        ###

        dec_hidden, dec_cell = self.decoder(Ybar_t, dec_state)
        dec_state = (dec_hidden, dec_cell)
        e_t = torch.bmm(enc_hiddens_proj, torch.unsqueeze(dec_hidden, 2))
        e_t = torch.squeeze(e_t, 2)

        # Set e_t to -inf where enc_masks has 1
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

        ### TODO:
        ###     1. Apply softmax to e_t to yield alpha_t
        ###     2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the
        ###         attention output vector, a_t.

        alpha_t = F.softmax(e_t, dim=1)

        a_t = torch.bmm(torch.unsqueeze(alpha_t, 1), enc_hiddens)
        a_t = torch.squeeze(a_t, 1)

        # U_t = torch.cat([a_t, dec_hidden], 1)
        if self.use_copy:
            src_len = alpha_t.size(1)
            if src_len == self.max_src_len:
                alpha_t_ext = alpha_t
            else:
                batch_size = alpha_t.size(0)
                ext = torch.zeros(batch_size, (self.max_src_len - src_len)).to(self.device)
                alpha_t_ext = torch.cat([alpha_t, ext], 1)
            U_t = torch.cat([dec_hidden, a_t, alpha_t_ext], 1)  # attention-based copy: append the attention distribution
        else:
            U_t = torch.cat([dec_hidden, a_t], 1)

        V_t = self.combined_output_projection(U_t)

        O_t = self.dropout(torch.tanh(V_t))

        combined_output = O_t
        return dec_state, combined_output, e_t
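
        # Illustrative sketch (not part of the original example): padding the attention
        # distribution out to max_src_len for the attention-based copy above, for a toy
        # batch with b=2, h=8, src_len=4 and max_src_len=6:
        #   alpha_t = torch.softmax(torch.randn(2, 4), dim=1)                    # (b, src_len)
        #   alpha_t_ext = torch.cat([alpha_t, torch.zeros(2, 6 - 4)], dim=1)     # (b, max_src_len)
        #   U_t = torch.cat([torch.randn(2, 8), torch.randn(2, 16), alpha_t_ext], dim=1)
        #   U_t.shape  -> (2, 30)   # h + 2*h + max_src_len, matching combined_output_projection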

    def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor:
        """ Generate sentence masks for encoder hidden states.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size. 
        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.
        
        @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len),
                                    where b = batch size, src_len = max source length.
        """
        enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float)
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1
        return enc_masks.to(self.device)
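
        # Illustrative sketch: for source_lengths = [3, 1] and src_len = 4 the mask is
        #   [[0., 0., 0., 1.],
        #    [0., 1., 1., 1.]]
        # i.e. 1 marks padding positions, which step() later fills with -inf before the softmax.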

    def beam_search(self, src_sent: torch.Tensor, src_sent_len: int, beam_size: int = 5, max_decoding_time_step: int = 30) -> List[
        Hypothesis]:
        """ Given a single source sentence, perform beam search, yielding translations in the target language.
        @param src_sent (Tensor): a single source sentence (words) with shape (sentence len, )
        @param src_sent_len (int): length of the source sentence
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
        @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        # src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)
        src_sents_var = torch.unsqueeze(src_sent, 1)

        # src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)])
        src_encodings, dec_init_vec = self.encode(src_sents_var, [src_sent_len])

        # src_encodings_att_linear = None
        combined_inputs = src_encodings
        if self.use_pos_embed:
            input_pos_embed = self.create_input_pos_embed(src_sent_len, 1)
            combined_inputs = torch.cat((src_encodings, input_pos_embed), dim=2)
            src_encodings_att_linear = self.att_projection(combined_inputs)
        else:
            src_encodings_att_linear = self.att_projection(src_encodings)


        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        # eos_id = self.vocab.tgt['</s>']
        eos_id = self.vocab.dst_eos_token_idx

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            exp_src_encodings = combined_inputs.expand(hyp_num,
                                                     combined_inputs.size(1),
                                                     combined_inputs.size(2))

            exp_src_encodings_att_linear = src_encodings_att_linear.expand(hyp_num,
                                                                           src_encodings_att_linear.size(1),
                                                                           src_encodings_att_linear.size(2))

            y_tm1 = torch.tensor([self.vocab.tgt.stoi[hyp[-1]] for hyp in hypotheses], dtype=torch.long, device=self.device)
            y_t_embed = self.model_embeddings.target(y_tm1)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            (h_t, cell_t), att_t, _ = self.step(x, h_tm1,
                                                exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None)

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            continuing_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(continuing_hyp_scores, k=live_hyp_num)

            prev_hyp_ids = top_cand_hyp_pos // len(self.vocab.tgt)  # floor division so the result stays an integer index
            hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt)

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.itos[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1],
                                                           score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device)

        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:],
                                                   score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

        return completed_hypotheses
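
        # Illustrative sketch (not part of the original example): how a flattened top-k
        # index is split back into (hypothesis id, word id) above, assuming |V_tgt| = 1000:
        #   top_cand_hyp_pos = torch.tensor([2503, 17])
        #   top_cand_hyp_pos // 1000  -> tensor([2, 0])     # prev_hyp_ids
        #   top_cand_hyp_pos % 1000   -> tensor([503, 17])  # hyp_word_ids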

    @property
    def device(self) -> torch.device:
        """ Determine which device to place the Tensors upon, CPU or GPU.
        """
        return self.model_embeddings.source.weight.device

    @staticmethod
    def load(model_path: str, use_pos_embed: bool, use_copy: bool):
        """ Load the model from a file.
        @param model_path (str): path to model
        """
        params = torch.load(model_path, map_location=lambda storage, loc: storage)
        args = params['args']
        model = NMT(vocab=params['vocab'], use_pos_embed=use_pos_embed, use_copy=use_copy, **args)
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ Save the odel to a file.
        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args': dict(src_embed_size=self.model_embeddings.src_embed_size, dst_embed_size=self.model_embeddings.dst_embed_size,
                         hidden_size=self.hidden_size,
                         dropout_rate=self.dropout_rate),
            'vocab': self.vocab,
            'state_dict': self.state_dict()
        }

        torch.save(params, path)

    def load_pretrained_embeddings(self, vocab: Vocab):
        self.model_embeddings.source.weight.data.copy_(vocab.src.vectors)

    def create_input_pos_embed(self, src_len, batch_size):
        pos_list = [[self.vocab.tgt.stoi[str(i)] for i in range(0, src_len)]]
        input_positions = torch.LongTensor(pos_list).to(self.device)  # [1, src_len]
        input_positions = input_positions.repeat(batch_size, 1)  # [ batch_size, src_len]
        input_embed = self.model_embeddings.target(input_positions)
        return input_embed # [ batch_size, src_len, dst_embed_size]
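

# Minimal standalone sketch (not from the original example) of the global-attention step
# used by NMT.step above, written with plain tensors so the shapes are explicit. The
# function name and the toy sizes in the docstring are illustrative assumptions only.
def global_attention_step_sketch(dec_hidden, enc_hiddens, enc_hiddens_proj, enc_masks=None):
    """dec_hidden: (b, h); enc_hiddens: (b, src_len, 2h); enc_hiddens_proj: (b, src_len, h)."""
    # multiplicative attention scores over the source positions
    e_t = torch.bmm(enc_hiddens_proj, dec_hidden.unsqueeze(2)).squeeze(2)    # (b, src_len)
    if enc_masks is not None:
        e_t = e_t.masked_fill(enc_masks.bool(), -float('inf'))               # ignore padding
    alpha_t = F.softmax(e_t, dim=1)                                          # (b, src_len)
    a_t = torch.bmm(alpha_t.unsqueeze(1), enc_hiddens).squeeze(1)            # (b, 2h)
    return a_t, e_t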
class TransformerNMT(nn.Module):
    def __init__(self,
        vocab, embed_size=512,
        num_hidden_layers = 6,
        num_attention_heads = 8,
        fc_size = 2048,
        dropout_rate = 0.1):
        super(TransformerNMT, self).__init__()

        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.pos_encoder = PositionalEncoding(embed_size, dropout_rate)
        self.vocab = vocab
        self.device = None
        self.d_model = embed_size
        self.encoder = torch.nn.TransformerEncoder(
            torch.nn.TransformerEncoderLayer(
                d_model = embed_size,
                nhead = num_attention_heads,
                dim_feedforward=fc_size,
                dropout=dropout_rate
            ), num_hidden_layers
        )
        self.decoder = torch.nn.TransformerDecoder(
            torch.nn.TransformerDecoderLayer(
                d_model = embed_size,
                nhead = num_attention_heads,
                dim_feedforward=fc_size,
                dropout=dropout_rate
            ), num_hidden_layers
        )
        self.tgt_mask = None
        self.target_vocab_projection = nn.Linear(embed_size, len(vocab.tgt))

    def _generate_square_subsequent_mask(self, sz):
        mask = (torch.triu(torch.ones(sz, sz)) == 1).transpose(0, 1)
        mask = mask.float().masked_fill(mask == 0, float('-inf')).masked_fill(mask == 1, float(0.0))
        return mask
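
        # Illustrative sketch: _generate_square_subsequent_mask(3) returns
        #   [[0., -inf, -inf],
        #    [0.,   0., -inf],
        #    [0.,   0.,   0.]]
        # so decoder position t may only attend to positions <= t.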

    def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor:
        source_lengths = [len(s) for s in source]
        source_padded = self.vocab.src.to_input_tensor(source, device=self.device)   # Tensor: (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(target, device=self.device)  # Tensor: (tgt_len, b)

        # X: [seq_length, batch, embed_size]
        X = self.model_embeddings.source(source_padded) * math.sqrt(self.d_model)
        X = self.pos_encoder(X)
        Y = self.model_embeddings.target(target_padded) * math.sqrt(self.d_model)
        Y = self.pos_encoder(Y)

        tgt_key_padding_masks = (target_padded == self.vocab.tgt['<pad>']).T
        
        if self.tgt_mask is None or self.tgt_mask.size(0) != len(Y):
            self.tgt_mask = self._generate_square_subsequent_mask(len(Y)).to(self.device)

        src_padding_masks = (source_padded == self.vocab.src['<pad>']).T
        # memory: [src_len, batch_size, embed_size]
        memory = self.encoder(X)
        output = self.decoder(
            tgt=Y,
            memory=memory,
            memory_key_padding_mask=src_padding_masks,
            tgt_mask=self.tgt_mask,
            tgt_key_padding_mask=tgt_key_padding_masks)

        # Zero out probabilities for positions that are padding in the target text
        tgt_padding_masks = (target_padded != self.vocab.tgt['<pad>']).float()
        out_probs = F.log_softmax(self.target_vocab_projection(output), dim=-1)
        # Compute log probability of generating the true target words
        target_gold_words_log_prob = torch.gather(out_probs[:-1], index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * tgt_padding_masks[1:]
        
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores
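
        # Illustrative note (not part of the original example): with teacher forcing,
        # out_probs[t] is the prediction made after reading target token t, so
        # out_probs[:-1] lines up with the gold tokens target_padded[1:]; e.g. for a
        # target "<s> w1 w2 </s>", positions 0..2 are scored against w1, w2, </s>.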


    def save(self, path: str):
        """ Save the odel to a file.
        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'vocab': self.vocab,
            'state_dict': self.state_dict()
        }

        torch.save(params, path)

    def to(self, device):
        super().to(device)
        self.device = device
        return self
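

# The PositionalEncoding module referenced above is not defined in this example. A common
# sinusoidal implementation with the same constructor signature (embed_size, dropout_rate)
# and the same call convention (input of shape (seq_len, batch, embed_size)) looks roughly
# like the sketch below; treat it as an assumption, not the original author's code. It
# relies on torch, nn and math already being imported, as in the example above.
class PositionalEncodingSketch(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)   # even indices get sine
        pe[:, 1::2] = torch.cos(position * div_term)   # odd indices get cosine
        self.register_buffer('pe', pe.unsqueeze(1))    # (max_len, 1, d_model)

    def forward(self, x):
        # x: (seq_len, batch, d_model); add the positional signal, then apply dropout
        x = x + self.pe[:x.size(0)]
        return self.dropout(x)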
class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidirectional LSTM Encoder
        - Unidirectional LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # default values
        self.encoder = None
        self.decoder = None
        self.h_projection = None
        self.c_projection = None
        self.att_projection = None
        self.combined_output_projection = None
        self.target_vocab_projection = None
        self.dropout = None

        ### YOUR CODE HERE (~8 Lines)
        ### TODO - Initialize the following variables:

        ###     self.encoder (Bidirectional LSTM with bias);
        # our input size is based around the number of features * embeddings usually, but slightly different here
        # for LSTM we are inputting a sentence of size m
        # each word in the sentence is converted to an embedding of size e
        # this embedding of size e is then fed in sequentially, so input size is just size of embedding.

        # I think we just want one layer for the encoder, which is default
        # also need to add dropout
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=self.hidden_size,
                               num_layers=1,
                               bias=True,
                               bidirectional=True)

        ###     self.decoder (LSTM Cell with bias)

        # decoder is initialized with a linear projection of hidden state & cell state from encoder...
        # from what I am reading the output of the encoder should be a single vector rather than a series of outputs
        # specifically, our input to decoder is just the final hidden state of encoder
        # we are also going to be feeding in the associated word embedding....which is of size e.
        # so we concatenate the final hidden state of encoder + word embedding e....

        # I am guessing the decoder's hidden dimension is still the hidden_size passed to the class....
        # helpful reading: https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
        self.decoder = nn.LSTMCell(input_size=self.hidden_size + embed_size,
                                   hidden_size=self.hidden_size,
                                   bias=True)

        ### What are these linear layers actually doing?
        # These are building a fully connected layer
        # finding a way to convert encoder's final hidden state to proper dimensions for decoder hidden state?
        # Same with the cell states.

        ###     self.h_projection (Linear Layer with no bias), called W_{h} in the PDF.
        # Applies a linear transformation to the incoming data: y = xA^T + b
        # Based on equation 3 in assignment I think input is 2 * hidden_size, output is hidden_size
        self.h_projection = nn.Linear(in_features=2 * self.hidden_size,
                                      out_features=self.hidden_size,
                                      bias=False)

        ###     self.c_projection (Linear Layer with no bias), called W_{c} in the PDF.
        ## same as above: equation 4 shows that this is reducing the size
        self.c_projection = nn.Linear(in_features=2 * self.hidden_size,
                                      out_features=self.hidden_size,
                                      bias=False)

        #### Attention is happening after decoder input
        # input to attention is hidden_encoding, which is 2 * hidden size. output is m * 1....

        ###     self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF.
        # equations 7 - 9 show what's happening here
        # equation 7 shows the input to the attention projection is h_enc, which is of size 2 * hidden_size
        # equation 8 shows W_attProj is h x 2h, so its output has size hidden_size
        self.att_projection = nn.Linear(in_features=2 * self.hidden_size,
                                        out_features=self.hidden_size,
                                        bias=False)

        ###     self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF.
        # equation 10 - 11 show us the size here:
        # we concatenate attention output + decoder hidden state; output is size hidden state
        self.combined_output_projection = nn.Linear(
            in_features=3 * self.hidden_size,
            out_features=self.hidden_size,
            bias=False)

        ###     self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF.
        # W_vocab is V_t x h, so it takes a hidden-size input and produces a V_t-length output
        # len(vocab.tgt) gives the target-language vocabulary size
        # this should match one dimension of the target model_embeddings weight matrix
        self.target_vocab_projection = nn.Linear(in_features=self.hidden_size,
                                                 out_features=len(
                                                     self.vocab.tgt),
                                                 bias=False)

        ###     self.dropout (Dropout Layer)
        # equation 12: dropout is applied to the h-dimensional combined output; it does not change the shape
        self.dropout = nn.Dropout(dropout_rate)

        ###
        ### Use the following docs to properly initialize these variables:
        ###     LSTM:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
        ###     LSTM Cell:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell
        ###     Linear Layer:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
        ###     Dropout Layer:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout

        ### END YOUR CODE
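
        # Shape summary of the layers initialized above (b = batch size, e = embed_size,
        # h = hidden_size, V_tgt = len(vocab.tgt)):
        #   encoder (bi-LSTM):          (src_len, b, e)   -> (src_len, b, 2h)
        #   decoder (LSTMCell):         (b, e + h)        -> (b, h)
        #   h_projection, c_projection: (b, 2h)           -> (b, h)
        #   att_projection:             (b, src_len, 2h)  -> (b, src_len, h)
        #   combined_output_projection: (b, 3h)           -> (b, h)
        #   target_vocab_projection:    (b, h)            -> (b, V_tgt)
        #   dropout:                    shape-preserving, applied to the combined output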

    def forward(self, source: List[List[str]],
                target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`

        @returns scores (Tensor): a variable/tensor of shape (b, ) representing the
                                    log-likelihood of generating the gold-standard target sentence for
                                    each example in the input batch. Here b = batch size.
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Convert list of lists into tensors
        source_padded = self.vocab.src.to_input_tensor(
            source, device=self.device)  # Tensor: (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(
            target, device=self.device)  # Tensor: (tgt_len, b)

        ###     Run the network forward:
        ###     1. Apply the encoder to `source_padded` by calling `self.encode()`
        ###     2. Generate sentence masks for `source_padded` by calling `self.generate_sent_masks()`
        ###     3. Apply the decoder to compute combined-output by calling `self.decode()`
        ###     4. Compute log probability distribution over the target vocabulary using the
        ###        combined_outputs returned by the `self.decode()` function.

        enc_hiddens, dec_init_state = self.encode(source_padded,
                                                  source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state,
                                       target_padded)
        P = F.log_softmax(self.target_vocab_projection(combined_outputs),
                          dim=-1)

        # Zero out probabilities for which we have nothing in the target text
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Compute log probability of generating true target words
        target_gold_words_log_prob = torch.gather(
            P, index=target_padded[1:].unsqueeze(-1),
            dim=-1).squeeze(-1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores

    def encode(
        self, source_padded: torch.Tensor, source_lengths: List[int]
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
            Additionally, take the final states of the encoder and project them to obtain initial states for decoder.

        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where
                                        b = batch_size, src_len = maximum source sentence length. Note that 
                                       these have already been sorted in order of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch
        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
                                        b = batch size, src_len = maximum source sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
                                                hidden state and cell.
        """
        enc_hiddens, dec_init_state = None, None

        ### YOUR CODE HERE (~ 8 Lines)
        ### TODO:
        ###     1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings.
        ###         src_len = maximum source sentence length, b = batch size, e = embedding size. Note
        ###         that there is no initial hidden state or cell for the decoder.

        # model embeddings were initialized as 'self.model_embeddings', just need to read in & specify source
        # looking at other solution it looks like I need to pass in the source_padded....I am not super clear on this
        # Here is my interpretation:
        #     - We create an instance of ModelEmbeddings during __init__ based on our vocab & expected embedding size
        #     - However, we have not yet passed any of the info in yet
        #     - We limit to our instance.source, which is an instance of nn.embedding: https://pytorch.org/docs/stable/nn.html#torch.nn.Embedding
        #     - We then need to pass in our tensor of padded source sentences, which are (src_len, b)
        #     - The embedding size (e) is already built into our nn.embedding instance, so output will account for this
        #     - Final shape will be (src_len, b, e)
        X = self.model_embeddings.source(source_padded)

        ###     2. Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`.
        ###         - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X.
        ###         - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens.
        ###         - Note that the shape of the tensor returned by the encoder is (src_len, b, h*2) and we want to
        ###           return a tensor of shape (b, src_len, h*2) as `enc_hiddens`.

        # start by packing X: https://pytorch.org/cppdocs/api/function_namespacetorch_1_1nn_1_1utils_1_1rnn_1a6c14a90e57eb631f51f06e52a600f7f7.html
        # expects an input matrix along with lengths of each sentence
        X = pack_padded_sequence(X, source_lengths)

        # now we can apply the encoder: self.encoder
        # this is our instance of the LSTM for encoder, defined during __init__
        # returns: output, (hidden, cell)
        #  (last_hidden, last_cell); unpacking for now since step 3 seems to need this
        enc_hiddens, (last_hidden, last_cell) = self.encoder(X)

        # finally, we need to apply the pad_packed_sequence function to enc_hiddens
        # https://pytorch.org/cppdocs/api/function_namespacetorch_1_1nn_1_1utils_1_1rnn_1aea13e54d273ec16fa8288e2262c28f30.html
        # https://stackoverflow.com/questions/51030782/why-do-we-pack-the-sequences-in-pytorch
        # great overview above - in short its going to help us make more computationally efficient moves
        # The returned Tensor's data will be of size T x B x *, where T is the length of the longest sequence and B is the batch size.
        # we don't need to keep the tensor of lengths, only the actual tensor data
        enc_hiddens, _ = pad_packed_sequence(enc_hiddens)

        ###         - Note that the shape of the tensor returned by the encoder is (src_len, b, h*2) and we want to
        ###           return a tensor of shape (b, src_len, h*2) as `enc_hiddens`.
        ### permute makes it easy to just rearrange the order of data:
        ## we have: (src_len, b, h*2) and we want: (b, src_len, h*2)....
        ## x.permute(1, 0, 2)
        enc_hiddens = enc_hiddens.permute(1, 0, 2)

        ###     3. Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell):
        ###         - `init_decoder_hidden`:
        ###             `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
        ###             Apply the h_projection layer to this in order to compute init_decoder_hidden.
        ###             This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size
        ###         - `init_decoder_cell`:
        ###             `last_cell` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
        ###             Apply the c_projection layer to this in order to compute init_decoder_cell.
        ###             This is c_0^{dec} in the PDF. Here b = batch size, h = hidden size
        ###

        # we want to go from (2,b,h) -> (b, 2*h); can do this with concatenation
        # I believe we want dim = 1 to stack cols
        cat_hidden = torch.cat((last_hidden[0], last_hidden[1]), 1)

        # now we pass this value into our h_projection layer
        init_decoder_hidden = self.h_projection(cat_hidden)

        # compute last cell, again: we want to go from (2,b,h) -> (b, 2*h)
        cat_cell = torch.cat((last_cell[0], last_cell[1]), 1)

        # Apply the c_projection layer to this in order to compute init_decoder_cell.
        init_decoder_cell = self.c_projection(cat_cell)

        # make a tuple
        dec_init_state = (init_decoder_hidden, init_decoder_cell)

        ### See the following docs, as you may need to use some of the following functions in your implementation:
        ###     Pack the padded sequence X before passing to the encoder:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pack_padded_sequence
        ###     Pad the packed sequence, enc_hiddens, returned by the encoder:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pad_packed_sequence
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tensor Permute:
        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.permute

        ### END YOUR CODE

        return enc_hiddens, dec_init_state

    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
               dec_init_state: Tuple[torch.Tensor, torch.Tensor],
               target_padded: torch.Tensor) -> torch.Tensor:
        """Compute combined output vectors for a batch.

        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
                                     b = batch size, src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where
                                       tgt_len= maximum target sentence length, b = batch size. 

        @returns combined_outputs (Tensor): combined output tensor  (tgt_len, b,  h), where
                                        tgt_len = maximum target sentence length, b = batch_size,  h = hidden size
        """

        #### David's interpretation of this step:
        # We first remove the <END> token (nothing needs to be predicted after it)
        # initialize the decoder state as the init state, which is a projection of the encoder's final hidden / cell state
        # the o_prev is just going to represent the combined-output vector, which is built from the attention output + decoder hidden state
        # Note: in the next step we will use decoder hidden state to calculate the attention -- happens in `.step()`
        # for a target sentence in our batch we will iterate through each target word, going through and:
        # - passing in the word embeddings for target
        # - getting updated decoder hidden state & attention vectors
        # - outputting the o_prev, which will eventually be passed through to build out the probability distribution over target words

        # Chop off the <END> token for max length sentences.
        target_padded = target_padded[:-1]

        # Initialize the decoder state (hidden and cell)
        dec_state = dec_init_state

        # Initialize previous combined output vector o_{t-1} as zero --> we start with 0s
        batch_size = enc_hiddens.size(0)
        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

        # Initialize a list we will use to collect the combined output o_t on each step
        combined_outputs = []

        ### YOUR CODE HERE (~9 Lines)
        ### TODO:
        ###     1. Apply the attention projection layer to `enc_hiddens` to obtain `enc_hiddens_proj`,
        ###         which should be shape (b, src_len, h),
        ###         where b = batch size, src_len = maximum source length, h = hidden size.
        ###         This is applying W_{attProj} to h^enc, as described in the PDF.

        enc_hiddens_proj = self.att_projection(enc_hiddens)

        ###     2. Construct tensor `Y` of target sentences with shape (tgt_len, b, e) using the target model embeddings.
        ###         where tgt_len = maximum target sentence length, b = batch size, e = embedding size.

        # we need to convert the target language sentences into embeddings.
        Y = self.model_embeddings.target(target_padded)

        ###     3. Use the torch.split function to iterate over the time dimension of Y.

        ###         Within the loop, this will give you Y_t of shape (1, b, e) where b = batch size, e = embedding size.
        ###             - Squeeze Y_t into a tensor of dimension (b, e).
        ###             - Construct Ybar_t by concatenating Y_t with o_prev.
        ###             - Use the step function to compute the Decoder's next (cell, state) values
        ###               as well as the new combined output o_t.
        ###             - Append o_t to combined_outputs
        ###             - Update o_prev to the new o_t.

        # going to split the (tgt_len, b, e) tensor into slices of size (1, b, e), one per word in the sentence (example in my testing notebook)
        for Y_t in torch.split(Y, 1):

            # squeeze Y_t into a tensor of dimension (b,e) --> this just removes the 1 in shape
            # For example, if the input is of shape (A, 1, B, C, 1, D), then the output tensor will be of shape (A, B, C, D).
            Y_t = torch.squeeze(Y_t, dim=0)  # specify the dim so the batch dimension survives when batch_size == 1

            # Construct Ybar_t by concatenating Y_t with o_prev.
            # assuming we use dimension 1 again - will need to confirm
            Ybar_t = torch.cat((Y_t, o_prev), 1)

            # Use the step function to compute the Decoder's next (cell, state) values
            # as well as the new combined output o_t.
            # e_t looks like it won't actually be used? just added for debugging
            # we will get the updated dec_state (used in next iteration) as well as the updated o_t, which is stored
            dec_state, o_t, e_t = self.step(Ybar_t, dec_state, enc_hiddens,
                                            enc_hiddens_proj, enc_masks)

            # Append o_t to combined_outputs
            combined_outputs.append(o_t)

            # Update o_prev to the new o_t.
            o_prev = o_t

        ###     4. Use torch.stack to convert combined_outputs from a list length tgt_len of
        ###         tensors shape (b, h), to a single tensor shape (tgt_len, b, h)
        ###         where tgt_len = maximum target sentence length, b = batch size, h = hidden size.

        # onto the torch.stack process
        # this can take a list as input and output a stacked tensor; the default dim=0 stacks along time, giving (tgt_len, b, h)
        combined_outputs = torch.stack(combined_outputs)

        ### Note:
        ###    - When using the squeeze() function make sure to specify the dimension you want to squeeze
        ###      over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
        ###
        ### Use the following docs to implement this functionality:
        ###     Zeros Tensor:
        ###         https://pytorch.org/docs/stable/torch.html#torch.zeros
        ###     Tensor Splitting (iteration):
        ###         https://pytorch.org/docs/stable/torch.html#torch.split
        ###     Tensor Dimension Squeezing:
        ###         https://pytorch.org/docs/stable/torch.html#torch.squeeze
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tensor Stacking:
        ###         https://pytorch.org/docs/stable/torch.html#torch.stack

        ### END YOUR CODE

        return combined_outputs

    def step(
            self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor,
                                                         torch.Tensor],
            enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor
    ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length. 

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution.
                                Note: You will not use this outside of this function.
                                      We are simply returning this value so that we can sanity check
                                      your implementation.
        """

        # The goal of this section is to build out the decoder hidden state as well as the attention vector

        combined_output = None

        ### YOUR CODE HERE (~3 Lines)
        ### TODO:
        ###     1. Apply the decoder to `Ybar_t` and `dec_state`to obtain the new dec_state.

        # the decoder is an instance of 'LSTMCell' -> outputs h_1 (hidden state) , c_1 (cell state)
        # i think I can just stored in `dec_state`, which will be (hidden_state, cell_state)
        dec_state = self.decoder(Ybar_t, dec_state)

        ###     2. Split dec_state into its two parts (dec_hidden, dec_cell)
        dec_hidden, dec_cell = dec_state

        ###     3. Compute the attention scores e_t, a Tensor shape (b, src_len).
        ###        Note: b = batch_size, src_len = maximum source length, h = hidden size.
        ###

        # Attention equation is broken down in assignment via equations 7 - 9

        # Equation 7:
        # we start by taking dec_hidden to compute multiplicative attention over each hidden unit from the encoder
        ###       Hints:
        ###         - dec_hidden is shape (b, h) and corresponds to h^dec_t in the PDF (batched)
        ###         - enc_hiddens_proj is shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched).
        ###         - Use batched matrix multiplication (torch.bmm) to compute e_t.

        # If input is a (b, n, m) tensor and mat2 is a (b, m, p) tensor, out will be a (b, n, p) tensor.
        # dec_hidden: (b * h)
        # enc_hiddens_proj: (b, src_len, h)

        ###         - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing.
        ###         - When using the squeeze() function make sure to specify the dimension you want to squeeze
        ###             over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
        ###

        # we want dec_hidden to be (b, h, 1) ---> so we must unsqueeze at the third dimension (index 2)
        dec_hidden_un = torch.unsqueeze(
            dec_hidden, 2)  # should move us from (b, h) -> (b, h, 1)

        # enc_hiddens_proj is already (b, src_len, h)

        # we want the output e_t to be (b, src_len), meaning we first get (b, src_len, 1)
        # via a bmm of (b, src_len, h) with (b, h, 1)
        # we can then squeeze the output
        e_t = torch.bmm(enc_hiddens_proj, dec_hidden_un)

        # e_t shape is (b, src_len, 1), so we squeeze out the third dimension (index 2)
        e_t = torch.squeeze(e_t, 2)

        ### Use the following docs to implement this functionality:
        ###     Batch Multiplication:
        ###        https://pytorch.org/docs/stable/torch.html#torch.bmm
        ###     Tensor Unsqueeze:
        ###         https://pytorch.org/docs/stable/torch.html#torch.unsqueeze
        ###     Tensor Squeeze:
        ###         https://pytorch.org/docs/stable/torch.html#torch.squeeze

        ### END YOUR CODE

        # Set e_t to -inf where enc_masks has 1
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

        ### YOUR CODE HERE (~6 Lines)

        ### This is picking back up with the Equation 8

        ### TODO:
        ###     1. Apply softmax to e_t to yield alpha_t: Equation 8 in pdf
        alpha_t = F.softmax(e_t, dim=1)

        ###     2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the
        ###         attention output vector, a_t. - Equation 9 in the pdf
        ###     Hints:
        ###           - alpha_t is shape (b, src_len)
        ###           - enc_hiddens is shape (b, src_len, 2h)
        ###           - a_t should be shape (b, 2h)
        ###           - You will need to do some squeezing and unsqueezing.
        ###     Note: b = batch size, src_len = maximum source length, h = hidden size.
        ###

        # we need to get a_t as our output into (b,2h)
        # need alpha to go from (b,src_len) -> (b, 1, src_len)
        alpha_t_u = torch.unsqueeze(alpha_t, 1)

        # bmm
        a_t = torch.bmm(alpha_t_u, enc_hiddens)

        # we then need to squeeze the output to go from (b, 1, 2h) -> (b, 2h)
        a_t = torch.squeeze(a_t, 1)

        ###     3. Concatenate dec_hidden with a_t to compute tensor U_t: Equation 10
        U_t = torch.cat((a_t, dec_hidden), 1)

        ###     4. Apply the combined output projection layer to U_t to compute tensor V_t: Equation 11
        V_t = self.combined_output_projection(U_t)

        ###     5. Compute tensor O_t by first applying the Tanh function and then the dropout layer: equation 12
        O_t = self.dropout(torch.tanh(V_t))

        ###
        ### Use the following docs to implement this functionality:
        ###     Softmax:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.functional.softmax
        ###     Batch Multiplication:
        ###        https://pytorch.org/docs/stable/torch.html#torch.bmm
        ###     Tensor View:
        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tanh:
        ###         https://pytorch.org/docs/stable/torch.html#torch.tanh

        ### END YOUR CODE

        combined_output = O_t
        return dec_state, combined_output, e_t

    def generate_sent_masks(self, enc_hiddens: torch.Tensor,
                            source_lengths: List[int]) -> torch.Tensor:
        """ Generate sentence masks for encoder hidden states.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size. 
        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.
        
        @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len),
                                    where b = batch size, src_len = max source length.
        """
        enc_masks = torch.zeros(enc_hiddens.size(0),
                                enc_hiddens.size(1),
                                dtype=torch.float)
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1
        return enc_masks.to(self.device)

    def beam_search(self,
                    src_sent: List[str],
                    beam_size: int = 5,
                    max_decoding_time_step: int = 70) -> List[Hypothesis]:
        """ Given a single source sentence, perform beam search, yielding translations in the target language.
        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
        @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var,
                                                  [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
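        # att_tm1 is the previous combined output o_{t-1}; it starts as zeros at t = 0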
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        eos_id = self.vocab.tgt['</s>']

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses),
                                 dtype=torch.float,
                                 device=self.device)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses
                  ) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            exp_src_encodings = src_encodings.expand(hyp_num,
                                                     src_encodings.size(1),
                                                     src_encodings.size(2))

            exp_src_encodings_att_linear = src_encodings_att_linear.expand(
                hyp_num, src_encodings_att_linear.size(1),
                src_encodings_att_linear.size(2))

            y_tm1 = torch.tensor(
                [self.vocab.tgt[hyp[-1]] for hyp in hypotheses],
                dtype=torch.long,
                device=self.device)
            y_t_embed = self.model_embeddings.target(y_tm1)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)
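            # x = [y_embed ; o_{t-1}] has shape (hyp_num, e + h), matching Ybar_t in step()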

            (h_t, cell_t), att_t, _ = self.step(x,
                                                h_tm1,
                                                exp_src_encodings,
                                                exp_src_encodings_att_linear,
                                                enc_masks=None)

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t),
                                    dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            # Add each live hypothesis's running score to the log-probabilities of every
            # possible next word, then flatten to (hyp_num * |V_tgt|,) for a single top-k
            continuing_hyp_scores = (
                hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(
                continuing_hyp_scores, k=live_hyp_num)

            # Recover which hypothesis and which target word each flat index refers to
            prev_hyp_ids = top_cand_hyp_pos // len(self.vocab.tgt)
            hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt)

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(
                    prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    completed_hypotheses.append(
                        Hypothesis(value=new_hyp_sent[1:-1],
                                   score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            live_hyp_ids = torch.tensor(live_hyp_ids,
                                        dtype=torch.long,
                                        device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores,
                                      dtype=torch.float,
                                      device=self.device)

        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(
                Hypothesis(value=hypotheses[0][1:],
                           score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

        return completed_hypotheses

    @property
    def device(self) -> torch.device:
        """ Determine which device to place the Tensors upon, CPU or GPU.
        """
        return self.model_embeddings.source.weight.device

    @staticmethod
    def load(model_path: str):
        """ Load the model from a file.
        @param model_path (str): path to model
        """
        params = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
        args = params['args']
        model = NMT(vocab=params['vocab'], **args)
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ Save the odel to a file.
        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args':
            dict(embed_size=self.model_embeddings.embed_size,
                 hidden_size=self.hidden_size,
                 dropout_rate=self.dropout_rate),
            'vocab':
            self.vocab,
            'state_dict':
            self.state_dict()
        }

        torch.save(params, path)
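
A minimal usage sketch of the class above. Everything in it is illustrative: the `Vocab.load` helper (assumed to live in the companion vocab.py), the vocabulary file path, and the toy sentences are assumptions, not part of this file.

def _demo_nmt_usage():
    """Illustrative sketch only; none of the names or data below come from this file."""
    from vocab import Vocab                       # assumed helper module shipped with the assignment
    vocab = Vocab.load('vocab.json')              # assumed vocabulary file
    model = NMT(embed_size=256, hidden_size=256, vocab=vocab, dropout_rate=0.2)
    # Source sentences must be sorted from longest to shortest (see encode()).
    src = [['we', 'like', 'tea'], ['hello']]
    tgt = [['<s>', 'wir', 'moegen', 'Tee', '</s>'], ['<s>', 'hallo', '</s>']]
    scores = model(src, tgt)                      # (b,) log-likelihood of each gold target sentence
    loss = -scores.sum()                          # the usual training objective
    hyps = model.beam_search(src[0], beam_size=5, max_decoding_time_step=70)
    return loss, hyps
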
class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidirectional LSTM Encoder
        - Unidirectional LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # Bidirectional LSTM with bias
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=self.hidden_size,
                               bidirectional=True,
                               bias=True)
        # LSTM Cell with bias
        self.decoder = nn.LSTMCell(input_size=embed_size + self.hidden_size,
                                   hidden_size=self.hidden_size,
                                   bias=True)
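        # Decoder input is Ybar_t = [y_t ; o_{t-1}], hence input size embed_size + hidden_size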
        # Linear Layer with no bias, W_{h}
        self.h_projection = nn.Linear(in_features=self.hidden_size * 2,
                                      out_features=self.hidden_size,
                                      bias=False)
        # Linear Layer with no bias, W_{c}
        self.c_projection = nn.Linear(in_features=self.hidden_size * 2,
                                      out_features=self.hidden_size,
                                      bias=False)
        # Linear Layer with no bias, W_{attProj}
        self.att_projection = nn.Linear(in_features=self.hidden_size * 2,
                                        out_features=self.hidden_size,
                                        bias=False)
        # Linear Layer with no bias, W_{u}
        self.combined_output_projection = nn.Linear(
            in_features=self.hidden_size * 3,
            out_features=self.hidden_size,
            bias=False)
        # Linear Layer with no bias, W_{vocab}
        self.target_vocab_projection = nn.Linear(in_features=self.hidden_size,
                                                 out_features=len(vocab.tgt),
                                                 bias=False)
        # Dropout Layer
        self.dropout = nn.Dropout(p=self.dropout_rate)

    def forward(self, source: List[List[str]],
                target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`

        @returns scores (Tensor): a variable/tensor of shape (b, ) representing the
                                    log-likelihood of generating the gold-standard target sentence for
                                    each example in the input batch. Here b = batch size.
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Convert list of lists into tensors
        source_padded = self.vocab.src.to_input_tensor(
            source, device=self.device)  # Tensor: (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(
            target, device=self.device)  # Tensor: (tgt_len, b)

        ###     Run the network forward:
        ###     1. Apply the encoder to `source_padded` by calling `self.encode()`
        ###     2. Generate sentence masks for `source_padded` by calling `self.generate_sent_masks()`
        ###     3. Apply the decoder to compute combined-output by calling `self.decode()`
        ###     4. Compute log probability distribution over the target vocabulary using the
        ###        combined_outputs returned by the `self.decode()` function.

        enc_hiddens, dec_init_state = self.encode(source_padded,
                                                  source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state,
                                       target_padded)
        P = F.log_softmax(self.target_vocab_projection(combined_outputs),
                          dim=-1)
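        # P: log-probabilities over the target vocabulary at every decoding step, shape (tgt_len - 1, b, |V_tgt|)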

        # Zero out probabilities for positions where there is nothing (padding) in the target text
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Compute log probability of generating true target words
        target_gold_words_log_prob = torch.gather(
            P, index=target_padded[1:].unsqueeze(-1),
            dim=-1).squeeze(-1) * target_masks[1:]
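        # Sum the per-word log-probabilities over time to get one score per sentence: (tgt_len - 1, b) -> (b,)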
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores

    def encode(
        self, source_padded: torch.Tensor, source_lengths: List[int]
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
            Additionally, take the final states of the encoder and project them to obtain initial states for decoder.

        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where
                                        b = batch_size, src_len = maximum source sentence length. Note that 
                                       these have already been sorted in order of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch
        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
                                        b = batch size, src_len = maximum source sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
                                                hidden state and cell.
        """
        enc_hiddens, dec_init_state = None, None

        ### step 1
        ### Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings.
        # src_len = maximum source sentence length, b = batch size, e = embedding size.
        # torch.nn.Embedding is often used to store word embeddings and retrieve them using indices.
        # The input to the module is a list of indices, and the output is the corresponding word embeddings.
        # Input: (*), LongTensor of arbitrary shape containing the indices to extract
        # Output: (*, H), where * is the input shape and H=embedding_dim
        X = self.model_embeddings.source(
            source_padded)  # (src_len, b) -> (src_len, b, e)

        ### step 2
        ### Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`.
        ###         - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X.
        ###         - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens.
        ###         - Note that the shape of the tensor returned by the encoder is (src_len, b, h*2) and we want to
        ###           return a tensor of shape (b, src_len, h*2) as `enc_hiddens`.
        # Remove pad and merge short sequences into one long sequence
        # https://www.cnblogs.com/sbj123456789/p/9834018.html
        # returns a PackedSequence object, which has two attributes : data & batch_size
        X = pack_padded_sequence(X, lengths=source_lengths)
        # Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`.
        # encoder: LSTM
        # Inputs: input, (h_0, c_0); input of shape (seq_len, batch, input_size);
        # The input can also be a packed variable length sequence.
        # h_0 and c_0 are of shape (num_layers * num_directions, batch, hidden_size)
        # If the LSTM is bidirectional, num_directions should be 2, else it should be 1.
        # If (h_0, c_0) is not provided, both h_0 and c_0 default to zero.
        # Outputs: output, (h_n, c_n); output of shape (seq_len, batch, num_directions * hidden_size)
        # If a PackedSequence has been given as the input, the output will also be a packed sequence.
        # h_n and c_n are of shape (num_layers * num_directions, batch, hidden_size)
        enc_hiddens, (last_hidden, last_cell) = self.encoder(X)
        # Pads a packed batch of variable length sequences. Inverse operation to pack_padded_sequence().
        enc_hiddens, _ = pad_packed_sequence(
            enc_hiddens)  # (src_len, b, h * 2)
        # Returns a tensor that is a transposed version of input. The given dimensions dim0 and dim1 are swapped.
        enc_hiddens = enc_hiddens.transpose(0, 1)  # (b, src_len, h * 2)

        ### step 3
        ### Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell):
        ###         - `init_decoder_hidden`:
        ###             `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
        ###             Apply the h_projection layer to this in order to compute init_decoder_hidden.
        ###             This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size
        ###         - `init_decoder_cell`:
        ###             `last_cell` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
        ###             Apply the c_projection layer to this in order to compute init_decoder_cell.
        ###             This is c_0^{dec} in the PDF. Here b = batch size, h = hidden size
        # concatenates the given sequence of seq tensors in the given dimension.
        # All tensors must either have the same shape (except in the concatenating dimension) or be empty.
        last_hidden = torch.cat((last_hidden[0], last_hidden[1]),
                                1)  # (2, b, h) -> (b, h * 2)
        # h_0^{dec} = W_h[\mathop{h_1^{enc}}^{\leftarrow}, \mathop{h_m^{enc}}^{\rightarrow}]
        init_decoder_hidden = self.h_projection(last_hidden)
        last_cell = torch.cat((last_cell[0], last_cell[1]), 1)
        # c_0^{dec} = W_c[\mathop{c_1^{enc}}^{\leftarrow}, \mathop{c_m^{enc}}^{\rightarrow}]
        init_decoder_cell = self.c_projection(last_cell)
        dec_init_state = (init_decoder_hidden, init_decoder_cell)

        return enc_hiddens, dec_init_state

    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
               dec_init_state: Tuple[torch.Tensor, torch.Tensor],
               target_padded: torch.Tensor) -> torch.Tensor:
        """Compute combined output vectors for a batch.

        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
                                     b = batch size, src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where
                                       tgt_len = maximum target sentence length, b = batch size. 

        @returns combined_outputs (Tensor): combined output tensor  (tgt_len, b,  h), where
                                        tgt_len = maximum target sentence length, b = batch_size,  h = hidden size
        """
        # Chop off the <END> token for the maximum-length sentences.
        target_padded = target_padded[:-1]

        # Initialize the decoder state (hidden and cell)
        dec_state = dec_init_state

        # Initialize previous combined output vector o_{t-1} as zero
        batch_size = enc_hiddens.size(0)
        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

        # Initialize a list we will use to collect the combined output o_t on each step
        combined_outputs = []

        ### step 1
        ### Apply the attention projection layer to `enc_hiddens` to obtain `enc_hiddens_proj`.
        # W_{attProj}h_i^{enc}
        enc_hiddens_proj = self.att_projection(
            enc_hiddens
        )  # (b, src_len, h * 2) dot (h * 2, h) -> (b, src_len, h)

        ### step 2
        ### Construct tensor `Y` of target sentences with shape (tgt_len, b, e) using the target model embeddings.
        Y = self.model_embeddings.target(
            target_padded)  # (tgt_len, b) -> (tgt_len, b, e)

        ### step 3
        # torch.split(tensor, split_size_or_sections, dim=0)
        # Splits the tensor into chunks. Each chunk is a view of the original tensor.
        # If split_size_or_sections is an integer type, then tensor will be split into equally sized chunks (if possible).
        # Last chunk will be smaller if the tensor size along the given dimension dim is not divisible by split_size.
        for Y_t in torch.split(Y, 1, dim=0):  # (tgt_len, b, e) -> (1, b, e)
            # Returns a tensor with all the dimensions of input of size 1 removed.
            squeezed = torch.squeeze(Y_t, dim=0)  # (1, b, e) -> (b, e); give dim so a batch of size 1 is not squeezed away
            Ybar_t = torch.cat((squeezed, o_prev),
                               dim=1)  # (b, e) + (b, h) -> (b, e + h)
            dec_state, o_t, e_t = self.step(Ybar_t, dec_state, enc_hiddens,
                                            enc_hiddens_proj, enc_masks)
            combined_outputs.append(o_t)
            o_prev = o_t

        ### step 4
        # Concatenates a sequence of tensors along a new dimension.
        combined_outputs = torch.stack(
            combined_outputs, dim=0)  # list of (b, h) -> (tgt_len, b, h)

        return combined_outputs

    def step(
            self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor,
                                                         torch.Tensor],
            enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor
    ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length. 

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution.
                                Note: You will not use this outside of this function.
                                      We are simply returning this value so that we can sanity check
                                      your implementation.
        """

        combined_output = None

        # Apply the decoder to `Ybar_t` and `dec_state`to obtain the new dec_state.
        # h_t^{dec}, c_t^{dec} = decoder(\overline(y_t),h_{t-1}^{dec},c_{t-1}^{dec})
        dec_state = self.decoder(Ybar_t, dec_state)
        # Split dec_state into its two parts
        (dec_hidden, dec_cell) = dec_state  # unpack the tuple: hidden and cell, each of shape (b, h)
        # batched matrix multiplication
        # (b, src_len, h) .dot(b, h, 1) -> (b, src_len, 1) -> (b, src_len)
        # unsqueeze - Returns a new tensor with a dimension of size one inserted at the specified position.
        # e_{t, i} = (h_t^{dec})^{\top}W_{attProj}h_i^{enc}
        e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2)

        # Set e_t to -inf where enc_masks has 1
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

        # \alpha_t = Softmax(e_t)
        alpha_t = torch.unsqueeze(F.softmax(e_t, dim=1),
                                  dim=1)  # (b, src_len) -> (b, 1, src_len)
        # (b, 1, src_len) * (b, src_len, 2*h) -> (b, 1, 2*h) -> (b, 2*h)
        # a_t = \sum_i^m\alpha_{t, i}h_i^{enc}
        a_t = torch.squeeze(torch.bmm(alpha_t, enc_hiddens), dim=1)
        # u_t = [a_t;h_t^{dec}]
        U_t = torch.cat((a_t, dec_hidden), dim=1)
        # v_t = W_uu_t
        V_t = self.combined_output_projection(U_t)
        # o_t = Dropout(Tanh(v_t))
        O_t = self.dropout(torch.tanh(V_t))

        combined_output = O_t
        return dec_state, combined_output, e_t

    def generate_sent_masks(self, enc_hiddens: torch.Tensor,
                            source_lengths: List[int]) -> torch.Tensor:
        """ Generate sentence masks for encoder hidden states.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size. 
        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.
        
        @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len),
                                    where b = batch size and src_len = max source length.
        """
        enc_masks = torch.zeros(enc_hiddens.size(0),
                                enc_hiddens.size(1),
                                dtype=torch.float)
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1
        return enc_masks.to(self.device)

    def beam_search(self,
                    src_sent: List[str],
                    beam_size: int = 5,
                    max_decoding_time_step: int = 70) -> List[Hypothesis]:
        """ Given a single source sentence, perform beam search, yielding translations in the target language.
        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
        @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var,
                                                  [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        eos_id = self.vocab.tgt['</s>']

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses),
                                 dtype=torch.float,
                                 device=self.device)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses
                  ) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            exp_src_encodings = src_encodings.expand(hyp_num,
                                                     src_encodings.size(1),
                                                     src_encodings.size(2))

            exp_src_encodings_att_linear = src_encodings_att_linear.expand(
                hyp_num, src_encodings_att_linear.size(1),
                src_encodings_att_linear.size(2))

            y_tm1 = torch.tensor(
                [self.vocab.tgt[hyp[-1]] for hyp in hypotheses],
                dtype=torch.long,
                device=self.device)
            y_t_embed = self.model_embeddings.target(y_tm1)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            (h_t, cell_t), att_t, _ = self.step(x,
                                                h_tm1,
                                                exp_src_encodings,
                                                exp_src_encodings_att_linear,
                                                enc_masks=None)

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t),
                                    dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            continuing_hyp_scores = (
                hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(
                continuing_hyp_scores, k=live_hyp_num)

            prev_hyp_ids = top_cand_hyp_pos // len(self.vocab.tgt)
            hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt)

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(
                    prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    completed_hypotheses.append(
                        Hypothesis(value=new_hyp_sent[1:-1],
                                   score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            live_hyp_ids = torch.tensor(live_hyp_ids,
                                        dtype=torch.long,
                                        device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores,
                                      dtype=torch.float,
                                      device=self.device)

        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(
                Hypothesis(value=hypotheses[0][1:],
                           score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

        return completed_hypotheses

    @property
    def device(self) -> torch.device:
        """ Determine which device to place the Tensors upon, CPU or GPU.
        """
        return self.model_embeddings.source.weight.device

    @staticmethod
    def load(model_path: str):
        """ Load the model from a file.
        @param model_path (str): path to model
        """
        params = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
        args = params['args']
        model = NMT(vocab=params['vocab'], **args)
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ Save the odel to a file.
        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args':
            dict(embed_size=self.model_embeddings.embed_size,
                 hidden_size=self.hidden_size,
                 dropout_rate=self.dropout_rate),
            'vocab':
            self.vocab,
            'state_dict':
            self.state_dict()
        }

        torch.save(params, path)
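
A small sketch of the save/load round trip defined above; the function name and checkpoint path are illustrative, not part of this file.

def _demo_checkpoint_roundtrip(model, path='model.bin'):
    """Illustrative sketch only: persist the model with save() and rebuild it with NMT.load()."""
    model.save(path)             # writes {'args', 'vocab', 'state_dict'} via torch.save
    restored = NMT.load(path)    # re-creates NMT(**args, vocab=vocab) and loads the saved weights
    restored.eval()              # disable dropout for evaluation / decoding
    return restored
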
Example #11
class NMT(nn.Module):
    """ 基于注意力机制的seq2seq神经机器转换模型:
        - 双向 LSTM Encoder
        - 单向 LSTM Decoder
        - 全局注意力模型
    """
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ 初始化 NMT 模型.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): 词总述,包括 src 和 tgt
        @param dropout_rate (float): 对注意力的dropout概率
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # Initialize the layers
        # LSTM layer: takes word embeddings as input and outputs hidden states
        self.encoder = nn.LSTM(embed_size,
                               self.hidden_size,
                               dropout=self.dropout_rate,
                               bidirectional=True)  # bidirectional; note nn.LSTM dropout only acts between stacked layers, so it has no effect on this single-layer encoder
        # LSTMCell: input is the word embedding concatenated with the previous combined output, output is the hidden state
        self.decoder = nn.LSTMCell(embed_size + self.hidden_size,
                                   self.hidden_size)  # gives control over each individual time step
        self.h_projection = nn.Linear(self.hidden_size * 2,
                                      self.hidden_size,
                                      bias=False)  # project 2h -> h
        self.c_projection = nn.Linear(self.hidden_size * 2,
                                      self.hidden_size,
                                      bias=False)  # project 2h -> h
        self.att_projection = nn.Linear(self.hidden_size * 2,
                                        self.hidden_size,
                                        bias=False)  # project 2h -> h
        self.combined_output_projection = nn.Linear(self.hidden_size * 3,
                                                    self.hidden_size,
                                                    bias=False)  # project 3h -> h
        self.target_vocab_projection = nn.Linear(self.hidden_size,
                                                 len(self.vocab.tgt),
                                                 bias=False)  # project the output onto the target vocabulary
        self.dropout = nn.Dropout(p=self.dropout_rate)

    def forward(self, source: List[List[str]],
                target: List[List[str]]) -> torch.Tensor:
        """ 取一个mini-batch的源句子和目标句子, 在NMT系统下学习的语言模型,计算目标句子的似然对数

        @param source (List[List[str]]): 源句子列表
        @param target (List[List[str]]): 目标句子列表, 被 `<s>` 和 `</s>` 包裹

        @returns scores (Tensor): 形状 (b, ) 的变量或张量,表示对输入的batch的每个例子,标准目标句子的似然对数
                                  这里 b = batch size.
        """

        # Compute the length of each source sentence
        source_lengths = [len(s) for s in source]

        # Convert the lists of tokens into tensors
        source_padded = self.vocab.src.to_input_tensor(
            source, device=self.device)  # Tensor: (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(
            target, device=self.device)  # Tensor: (tgt_len, b)

        # Run the network forward
        enc_hiddens, dec_init_state = self.encode(source_padded,
                                                  source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state,
                                       target_padded)
        P = F.log_softmax(self.target_vocab_projection(combined_outputs),
                          dim=-1)

        # Generate a mask that zeroes out the probabilities at positions where the target text is <pad>
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Compute the log probability of generating the true target words
        target_gold_words_log_prob = torch.gather(
            P, index=target_padded[1:].unsqueeze(-1),
            dim=-1).squeeze(-1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum(dim=0)  # sum over time steps
        return scores

    def encode(
        self, source_padded: torch.Tensor, source_lengths: List[int]
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ 在源句子上应用encoder来得到encoder隐藏状态
            然后,取出encoder最后的状态,把他们投影成decoder初始状态
        @param source_padded (Tensor): 形状 (src_len, b) 的填充好的源句子的张量,
                                        b = batch_size, src_len = 源句子的最大长度.
                                        已按照最长到最短长度排序
        @param source_lengths (List[int]): batch 中每个源句子的实际长度列表

        @returns enc_hiddens (Tensor): 形状 (b, src_len, h*2) 的隐藏单元张量,
                                        b = batch size, src_len = 源句子的最大长度, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): 表示 decoder 的初始隐藏状态和细胞状态的张量元组
        """
        enc_hiddens, dec_init_state = None, None

        X = self.model_embeddings.source(source_padded)  # run the input through the embedding layer
        X = pack_padded_sequence(X, source_lengths)  # pack the padded sequences
        enc_hiddens, (last_hidden, last_cell) = self.encoder(
            X)  # run the encoder: hidden states at every step plus the final hidden and cell states
        enc_hiddens = pad_packed_sequence(
            enc_hiddens, batch_first=True)[0]  # (b, src_len, h*2)
        # Concatenate the encoder's forward and backward final hidden/cell states,
        # then project them with linear layers to get the decoder's initial hidden/cell state
        init_decoder_hidden = self.h_projection(
            torch.cat((last_hidden[0], last_hidden[1]), dim=1))
        init_decoder_cell = self.c_projection(
            torch.cat((last_cell[0], last_cell[1]), dim=1))
        dec_init_state = (init_decoder_hidden, init_decoder_cell)  # pack into a tuple

        return enc_hiddens, dec_init_state

    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
               dec_init_state: Tuple[torch.Tensor, torch.Tensor],
               target_padded: torch.Tensor) -> torch.Tensor:
        """对每个 batch 计算连接的输出向量
        @param enc_hiddens (Tensor): 隐藏状态 (b, src_len, h*2), b = batch size, src_len = 源句子的最大长度, h = hidden size.
        @param enc_masks (Tensor): 句子掩码张量 (b, src_len), b = batch size, src_len = 源句子的最大长度.
        @param dec_init_state (tuple(Tensor, Tensor)): deocder 初始的隐藏状态和细胞状态
        @param target_padded (Tensor): 标准填充好的目标句子 (tgt_len, b), tgt_len = 目标句子的最大长度, b = batch size.

        @returns combined_outputs (Tensor): 连接输出的张量  (tgt_len, b,  h), tgt_len = 目标句子的最大长度, b = batch_size,  h = hidden size
        """
        # 在最大长度的句子中去掉<END>标识
        target_padded = target_padded[:-1]

        # Initialize the decoder state (hidden and cell)
        dec_state = dec_init_state

        # Initialize the previous combined output vector o_0 as a zero vector
        batch_size = enc_hiddens.size(0)
        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

        # Initialize a list used to collect the combined output o_t at every time step
        combined_outputs = []

        enc_hiddens_proj = self.att_projection(
            enc_hiddens)  # (b, src_len, h)   compute W_attProj·h_i^enc
        Y = self.model_embeddings.target(target_padded)  # (tgt_len, b, e)
        for Y_t in torch.split(Y, 1):  # (1, b, e): split Y along dim 0 into chunks of size 1
            Y_t = torch.squeeze(Y_t, dim=0)  # (b, e)
            Ybar_t = torch.cat((Y_t, o_prev), dim=1)  # (b, e+h)
            dec_state, o_t, e_t = self.step(Ybar_t, dec_state, enc_hiddens,
                                            enc_hiddens_proj, enc_masks)  # one decoder step
            combined_outputs.append(o_t)
            o_prev = o_t  # the previous output now refers to this step's o_t
        combined_outputs = torch.stack(
            combined_outputs, dim=0)  # list of (b, h) -> (tgt_len, b, h): stack into the target shape

        return combined_outputs

    def step(
            self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor,
                                                         torch.Tensor],
            enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor
    ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ 计算 LSTM decoder 的每个前向步, 包括注意力计算.

        @param Ybar_t (Tensor): 连接好的张量  [Y_t o_prev], 形状 (b, e + h). decoder 的输入
                                b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): 张量元组 形状都为 (b, h), b = batch size, h = hidden size.
                第一个张量是 decoder 的先前的隐藏状态, 第二个张量是 decoder 的先前的细胞状态.
        @param enc_hiddens (Tensor): Encoder 隐藏状态张量, 形状 (b, src_len, h * 2),
                                    b = batch size, src_len = 源的最大长度, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder 隐藏状态张量, 从 (h * 2) 投影成 h. 张量形状 (b, src_len, h),
                                        b = batch size, src_len = 源的最大长度, h = hidden size.
        @param enc_masks (Tensor): 句子掩码张量,形状 (b, src_len),
                                    b = batch size, src_len = 源的最大长度.

        @returns dec_state (tuple (Tensor, Tensor)): 张量元组 两个张量形状都为 (b, h), b = batch size, h = hidden size.
                第一个张量是 decoder 的新隐藏状态, 第二个张量是 decoder 的新细胞状态.
        @returns combined_output (Tensor): 第t步连接的输出张量, 形状 (b, h), b = batch size, h = hidden size.
        @returns e_t (Tensor): 张量,形状 (b, src_len). 注意力分数分布.
                                这个函数之外不会使用到。
        """

        combined_output = None

        dec_state = self.decoder(Ybar_t, dec_state)
        dec_hidden, dec_cell = dec_state  # split into its two parts
        # (b, src_len) = (b, src_len, h)·(b, h) -> (b, src_len, 1) = (b, src_len, h)·(b, h, 1)
        e_t = torch.bmm(enc_hiddens_proj,
                        torch.unsqueeze(dec_hidden, dim=2))  # (b, src_len, 1)
        e_t = torch.squeeze(e_t, dim=2)  # (b, src_len)

        # Set e_t to -inf where the mask is 1
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

        # Compute the attention weights and the combined output (Equations 8-12)
        alpha_t = F.softmax(e_t, dim=1)  # (b, src_len)
        alpha_t = torch.unsqueeze(alpha_t, dim=1)  # (b, 1, src_len)
        a_t = torch.bmm(
            alpha_t,
            enc_hiddens)  # (b, 1, h * 2) = (b, 1, src_len)·(b, src_len, h * 2)
        a_t = torch.squeeze(a_t, dim=1)  # (b, 2h)
        U_t = torch.cat((a_t, dec_hidden), dim=1)  # (b, 3h)
        V_t = self.combined_output_projection(U_t)  # (b, h)
        O_t = self.dropout(torch.tanh(V_t))  # (b, h)

        combined_output = O_t
        return dec_state, combined_output, e_t

    def generate_sent_masks(self, enc_hiddens: torch.Tensor,
                            source_lengths: List[int]) -> torch.Tensor:
        """ 对 encoder 隐藏状态生成句子掩码

        @param enc_hiddens (Tensor): 需要编码的张量,形状 (b, src_len, 2*h), b = batch size,
                                     src_len = 源的最大长度, h = hidden size.
        @param source_lengths (List[int]): batch 中每个句子的实际长度.

        @returns enc_masks (Tensor): 句子掩码的张量,形状 (b, src_len),
                                    src_len = 源的最大长度, h = hidden size.
        """
        enc_masks = torch.zeros(enc_hiddens.size(0),
                                enc_hiddens.size(1),
                                dtype=torch.float)  # 掩码矩阵
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1  # 超过真实句子长度的地方置为1,如<pad>标记
        return enc_masks.to(self.device)

    def beam_search(self,
                    src_sent: List[str],
                    beam_size: int = 5,
                    max_decoding_time_step: int = 70) -> List[Hypothesis]:
        """ 给定单个源句子, 运行 beam search, 生成目标形式的结果.
        @param src_sent (List[str]): 一个源句子(词)
        @param beam_size (int): beam size,候选数
        @param max_decoding_time_step (int): 展开解码 RNN 的最大时间步
        @returns hypotheses (List[Hypothesis]): 假设列表, 每个假设有两个域:
                value: List[str]: 解码的目标句子, 用词序列表示
                score: float: 目标句子的对数似然
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var,
                                                  [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        eos_id = self.vocab.tgt['</s>']

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses),
                                 dtype=torch.float,
                                 device=self.device)  # (1,)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses
                  ) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)  # number of live candidate hypotheses

            exp_src_encodings = src_encodings.expand(hyp_num,
                                                     src_encodings.size(1),
                                                     src_encodings.size(2))

            exp_src_encodings_att_linear = src_encodings_att_linear.expand(
                hyp_num, src_encodings_att_linear.size(1),
                src_encodings_att_linear.size(2))

            y_tm1 = torch.tensor(
                [self.vocab.tgt[hyp[-1]] for hyp in hypotheses],
                dtype=torch.long,
                device=self.device)
            y_t_embed = self.model_embeddings.target(y_tm1)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)
            # att_t has shape (b, h)
            (h_t, cell_t), att_t, _ = self.step(x,
                                                h_tm1,
                                                exp_src_encodings,
                                                exp_src_encodings_att_linear,
                                                enc_masks=None)

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t),
                                    dim=-1)  # (hyp_num, |V_tgt|)

            live_hyp_num = beam_size - len(
                completed_hypotheses)  # beam_size minus the number of finished hypotheses
            continuing_hyp_scores = (
                hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(
                    -1)  # flattened to (hyp_num * |V_tgt|,)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(
                continuing_hyp_scores, k=live_hyp_num)  # (k,)

            prev_hyp_ids = top_cand_hyp_pos // len(
                self.vocab.tgt)  # id of the originating hypothesis for each of the k candidates
            hyp_word_ids = top_cand_hyp_pos % len(
                self.vocab.tgt)  # target-word id for each of the k candidates

            new_hypotheses = []  # new candidate sentences
            live_hyp_ids = []  # ids of the hypotheses still alive
            new_hyp_scores = []  # scores of the new candidate sentences
            # iterate over the k candidates and extend the hypotheses
            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(
                    prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':  # this hypothesis is complete
                    completed_hypotheses.append(
                        Hypothesis(value=new_hyp_sent[1:-1],
                                   score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:  # reached the beam size
                break

            # Update the state for the next time step
            live_hyp_ids = torch.tensor(live_hyp_ids,
                                        dtype=torch.long,
                                        device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores,
                                      dtype=torch.float,
                                      device=self.device)

        if len(completed_hypotheses) == 0:  # no hypothesis completed
            completed_hypotheses.append(
                Hypothesis(value=hypotheses[0][1:],
                           score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score,
                                  reverse=True)  # sort by score, descending

        return completed_hypotheses

    @property
    def device(self) -> torch.device:
        """ 决定使用CPU或GPU去放置张量.
        """
        return self.model_embeddings.source.weight.device

    @staticmethod
    def load(model_path: str):
        """ 从文件中加载模型.
        @param model_path (str): 模型路径
        """
        params = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
        args = params['args']
        model = NMT(vocab=params['vocab'], **args)
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ 保存模型到文件.
        @param path (str): 模型路径
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args':
            dict(embed_size=self.model_embeddings.embed_size,
                 hidden_size=self.hidden_size,
                 dropout_rate=self.dropout_rate),
            'vocab':
            self.vocab,
            'state_dict':
            self.state_dict()  # the model parameters
        }

        torch.save(params, path)
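
A standalone sketch of the per-step global attention used in step() above, with the shapes written out; the function name and arguments are illustrative, not part of this file.

def _demo_global_attention(dec_hidden, enc_hiddens, enc_hiddens_proj, enc_masks=None):
    """Illustrative sketch only.
    dec_hidden: (b, h); enc_hiddens: (b, src_len, 2h); enc_hiddens_proj: (b, src_len, h)."""
    e_t = torch.bmm(enc_hiddens_proj, dec_hidden.unsqueeze(2)).squeeze(2)  # attention scores, (b, src_len)
    if enc_masks is not None:
        e_t = e_t.masked_fill(enc_masks.bool(), -float('inf'))             # ignore padded positions
    alpha_t = F.softmax(e_t, dim=1)                                        # attention weights, (b, src_len)
    a_t = torch.bmm(alpha_t.unsqueeze(1), enc_hiddens).squeeze(1)          # attention readout, (b, 2h)
    return a_t, alpha_t
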
Example #12
class DPPNMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidirectional LSTM Encoder
        - Unidirectional LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """
    def __init__(self,
                 embed_size=256,
                 hidden_size=256,
                 vocab=None,
                 dropout_rate=0.2,
                 nmt_model=None):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(DPPNMT, self).__init__()
        if nmt_model is not None:
            self.model_embeddings = nmt_model.model_embeddings
            self.hidden_size = nmt_model.hidden_size
            self.dropout_rate = nmt_model.dropout_rate
            self.vocab = nmt_model.vocab

            self.encoder = nmt_model.encoder
            self.decoder = nmt_model.decoder
            self.h_projection = nmt_model.h_projection
            self.c_projection = nmt_model.c_projection
            self.att_projection = nmt_model.att_projection
            self.combined_output_projection = nmt_model.combined_output_projection
            self.target_vocab_projection = nmt_model.target_vocab_projection
            self.dropout = nmt_model.dropout

        else:
            self.model_embeddings = ModelEmbeddings(embed_size, vocab)
            self.hidden_size = hidden_size
            self.dropout_rate = dropout_rate
            self.vocab = vocab

            # default values
            self.encoder = None
            self.decoder = None
            self.h_projection = None
            self.c_projection = None
            self.att_projection = None
            self.combined_output_projection = None
            self.target_vocab_projection = None
            self.dropout = None

            ### YOUR CODE HERE (~8 Lines)
            ### TODO - Initialize the following variables:
            ###     self.encoder (Bidirectional LSTM with bias)
            ###     self.decoder (LSTM Cell with bias)
            ###     self.h_projection (Linear Layer with no bias), called W_{h} in the PDF.
            ###     self.c_projection (Linear Layer with no bias), called W_{c} in the PDF.
            ###     self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF.
            ###     self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF.
            ###     self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF.
            ###     self.dropout (Dropout Layer)
            ###
            ### Use the following docs to properly initialize these variables:
            ###     LSTM:
            ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
            ###     LSTM Cell:
            ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell
            ###     Linear Layer:
            ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
            ###     Dropout Layer:
            ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout
            self.encoder = nn.LSTM(input_size=embed_size,
                                   hidden_size=hidden_size,
                                   bias=True,
                                   bidirectional=True)
            self.decoder = nn.LSTMCell(input_size=embed_size + hidden_size,
                                       hidden_size=hidden_size,
                                       bias=True)
            self.h_projection = nn.Linear(in_features=2 * hidden_size,
                                          out_features=hidden_size,
                                          bias=False)
            self.c_projection = nn.Linear(in_features=2 * hidden_size,
                                          out_features=hidden_size,
                                          bias=False)
            self.att_projection = nn.Linear(in_features=2 * hidden_size,
                                            out_features=hidden_size,
                                            bias=False)
            self.combined_output_projection = nn.Linear(
                in_features=3 * hidden_size,
                out_features=hidden_size,
                bias=False)
            self.target_vocab_projection = nn.Linear(in_features=hidden_size,
                                                     out_features=len(
                                                         vocab.tgt),
                                                     bias=False)
            self.dropout = nn.Dropout(p=dropout_rate)
            ### END YOUR CODE

    def forward(self, source: List[List[str]],
                target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`

        @returns scores (Tensor): a variable/tensor of shape (b, ) representing the
                                    log-likelihood of generating the gold-standard target sentence for
                                    each example in the input batch. Here b = batch size.
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Convert list of lists into tensors
        source_padded = self.vocab.src.to_input_tensor(
            source, device=self.device)  # Tensor: (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(
            target, device=self.device)  # Tensor: (tgt_len, b)

        ###     Run the network forward:
        ###     1. Apply the encoder to `source_padded` by calling `self.encode()`
        ###     2. Generate sentence masks for `source_padded` by calling `self.generate_sent_masks()`
        ###     3. Apply the decoder to compute combined-output by calling `self.decode()`
        ###     4. Compute log probability distribution over the target vocabulary using the
        ###        combined_outputs returned by the `self.decode()` function.

        enc_hiddens, dec_init_state = self.encode(source_padded,
                                                  source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state,
                                       target_padded)
        P = F.log_softmax(self.target_vocab_projection(combined_outputs),
                          dim=-1)

        # Zero out probabilities for positions where there is nothing (padding) in the target text
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Compute log probability of generating true target words
        target_gold_words_log_prob = torch.gather(
            P, index=target_padded[1:].unsqueeze(-1),
            dim=-1).squeeze(-1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores

    def encode(
        self, source_padded: torch.Tensor, source_lengths: List[int]
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
            Additionally, take the final states of the encoder and project them to obtain initial states for decoder.

        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where
                                        b = batch_size, src_len = maximum source sentence length. Note that 
                                       these have already been sorted in order of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch
        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
                                        b = batch size, src_len = maximum source sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
                                                hidden state and cell.
        """
        enc_hiddens, dec_init_state = None, None

        ### YOUR CODE HERE (~ 8 Lines)
        ### TODO:
        ###     1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings.
        ###         src_len = maximum source sentence length, b = batch size, e = embedding size. Note
        ###         that there is no initial hidden state or cell for the decoder.
        ###     2. Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`.
        ###         - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X.
        ###         - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens.
        ###         - Note that the shape of the tensor returned by the encoder is (src_len, b, h*2) and we want to
        ###           return a tensor of shape (b, src_len, h*2) as `enc_hiddens`.
        ###     3. Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell):
        ###         - `init_decoder_hidden`:
        ###             `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
        ###             Apply the h_projection layer to this in order to compute init_decoder_hidden.
        ###             This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size
        ###         - `init_decoder_cell`:
        ###             `last_cell` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
        ###             Apply the c_projection layer to this in order to compute init_decoder_cell.
        ###             This is c_0^{dec} in the PDF. Here b = batch size, h = hidden size
        ###
        ### See the following docs, as you may need to use some of the following functions in your implementation:
        ###     Pack the padded sequence X before passing to the encoder:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pack_padded_sequence
        ###     Pad the packed sequence, enc_hiddens, returned by the encoder:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pad_packed_sequence
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tensor Permute:
        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.permute
        X = self.model_embeddings.source(source_padded)

        enc_hiddens, (last_hidden, last_cell) = self.encoder(
            torch.nn.utils.rnn.pack_padded_sequence(X, source_lengths))
        enc_hiddens, _ = torch.nn.utils.rnn.pad_packed_sequence(enc_hiddens)
        enc_hiddens = enc_hiddens.permute(1, 0, 2)
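        # enc_hiddens: (b, src_len, 2h) after the permute; last_hidden and last_cell are each
        # (2, b, h), holding the final forward and backward encoder states.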

        last_hidden = torch.cat((last_hidden[0], last_hidden[1]), dim=1)
        init_decoder_hidden = self.h_projection(last_hidden)
        last_cell = torch.cat((last_cell[0], last_cell[1]), dim=1)
        init_decoder_cell = self.c_projection(last_cell)

        dec_init_state = (init_decoder_hidden, init_decoder_cell)
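        # In the notation of the PDF: h_0^{dec} = W_{h}[h_fwd; h_bwd] and
        # c_0^{dec} = W_{c}[c_fwd; c_bwd], each of shape (b, h).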
        ### END YOUR CODE

        return enc_hiddens, dec_init_state

    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
               dec_init_state: Tuple[torch.Tensor, torch.Tensor],
               target_padded: torch.Tensor) -> torch.Tensor:
        """Compute combined output vectors for a batch.

        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
                                     b = batch size, src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where
                                       tgt_len = maximum target sentence length, b = batch size. 

        @returns combined_outputs (Tensor): combined output tensor  (tgt_len, b,  h), where
                                        tgt_len = maximum target sentence length, b = batch_size,  h = hidden size
        """
        # Chop off the <END> token for max length sentences.
        target_padded = target_padded[:-1]

        # Initialize the decoder state (hidden and cell)
        dec_state = dec_init_state

        # Initialize previous combined output vector o_{t-1} as zero
        batch_size = enc_hiddens.size(0)
        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

        # Initialize a list we will use to collect the combined output o_t on each step
        combined_outputs = []

        ### YOUR CODE HERE (~9 Lines)
        ### TODO:
        ###     1. Apply the attention projection layer to `enc_hiddens` to obtain `enc_hiddens_proj`,
        ###         which should be shape (b, src_len, h),
        ###         where b = batch size, src_len = maximum source length, h = hidden size.
        ###         This is applying W_{attProj} to h^enc, as described in the PDF.
        ###     2. Construct tensor `Y` of target sentences with shape (tgt_len, b, e) using the target model embeddings.
        ###         where tgt_len = maximum target sentence length, b = batch size, e = embedding size.
        ###     3. Use the torch.split function to iterate over the time dimension of Y.
        ###         Within the loop, this will give you Y_t of shape (1, b, e) where b = batch size, e = embedding size.
        ###             - Squeeze Y_t into a tensor of dimension (b, e).
        ###             - Construct Ybar_t by concatenating Y_t with o_prev.
        ###             - Use the step function to compute the Decoder's next (cell, state) values
        ###               as well as the new combined output o_t.
        ###             - Append o_t to combined_outputs
        ###             - Update o_prev to the new o_t.
        ###     4. Use torch.stack to convert combined_outputs from a list of length tgt_len of
        ###         tensors of shape (b, h) into a single tensor of shape (tgt_len, b, h)
        ###         where tgt_len = maximum target sentence length, b = batch size, h = hidden size.
        ###
        ### Note:
        ###    - When using the squeeze() function make sure to specify the dimension you want to squeeze
        ###      over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
        ###
        ### Use the following docs to implement this functionality:
        ###     Zeros Tensor:
        ###         https://pytorch.org/docs/stable/torch.html#torch.zeros
        ###     Tensor Splitting (iteration):
        ###         https://pytorch.org/docs/stable/torch.html#torch.split
        ###     Tensor Dimension Squeezing:
        ###         https://pytorch.org/docs/stable/torch.html#torch.squeeze
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tensor Stacking:
        ###         https://pytorch.org/docs/stable/torch.html#torch.stack
        enc_hiddens_proj = self.att_projection(enc_hiddens)
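        # enc_hiddens_proj = W_{attProj} h^enc, shape (b, src_len, h); computed once and
        # reused by every call to step() below.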
        Y = self.model_embeddings.target(target_padded)
        for Y_t in torch.split(Y, 1):
            Y_t = torch.squeeze(Y_t, dim=0)
            Ybar_t = torch.cat((Y_t, o_prev), dim=1)
            dec_state, o_t, _ = self.step(Ybar_t, dec_state, enc_hiddens,
                                          enc_hiddens_proj, enc_masks)
            combined_outputs.append(o_t)
            o_prev = o_t
        combined_outputs = torch.stack(combined_outputs)
        ### END YOUR CODE

        return combined_outputs

    def step(
            self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor,
                                                         torch.Tensor],
            enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor
    ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length. 

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution.
                                Note: You will not use this outside of this function.
                                      We are simply returning this value so that we can sanity check
                                      your implementation.
        """

        combined_output = None

        ### YOUR CODE HERE (~3 Lines)
        ### TODO:
        ###     1. Apply the decoder to `Ybar_t` and `dec_state` to obtain the new dec_state.
        ###     2. Split dec_state into its two parts (dec_hidden, dec_cell)
        ###     3. Compute the attention scores e_t, a Tensor shape (b, src_len).
        ###        Note: b = batch_size, src_len = maximum source length, h = hidden size.
        ###
        ###       Hints:
        ###         - dec_hidden is shape (b, h) and corresponds to h^dec_t in the PDF (batched)
        ###         - enc_hiddens_proj is shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched).
        ###         - Use batched matrix multiplication (torch.bmm) to compute e_t.
        ###         - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing.
        ###         - When using the squeeze() function make sure to specify the dimension you want to squeeze
        ###             over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
        ###
        ### Use the following docs to implement this functionality:
        ###     Batch Multiplication:
        ###        https://pytorch.org/docs/stable/torch.html#torch.bmm
        ###     Tensor Unsqueeze:
        ###         https://pytorch.org/docs/stable/torch.html#torch.unsqueeze
        ###     Tensor Squeeze:
        ###         https://pytorch.org/docs/stable/torch.html#torch.squeeze
        dec_state = self.decoder(Ybar_t, dec_state)
        dec_hidden, dec_cell = dec_state
        e_t = torch.squeeze(torch.bmm(enc_hiddens_proj,
                                      torch.unsqueeze(dec_hidden, dim=2)),
                            dim=2)
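        # e_{t,i} = (h_t^{dec})^T (W_{attProj} h_i^{enc}), computed as a batched
        # matrix-vector product: (b, src_len, h) x (b, h, 1) -> (b, src_len, 1) -> (b, src_len).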
        ### END YOUR CODE

        # Set e_t to -inf where enc_masks has 1
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.byte(), -float('inf'))

        ### YOUR CODE HERE (~6 Lines)
        ### TODO:
        ###     1. Apply softmax to e_t to yield alpha_t
        ###     2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the
        ###         attention output vector, a_t.
        ###     Hints:
        ###           - alpha_t is shape (b, src_len)
        ###           - enc_hiddens is shape (b, src_len, 2h)
        ###           - a_t should be shape (b, 2h)
        ###           - You will need to do some squeezing and unsqueezing.
        ###     Note: b = batch size, src_len = maximum source length, h = hidden size.
        ###
        ###     3. Concatenate dec_hidden with a_t to compute tensor U_t
        ###     4. Apply the combined output projection layer to U_t to compute tensor V_t
        ###     5. Compute tensor O_t by first applying the Tanh function and then the dropout layer.
        ###
        ### Use the following docs to implement this functionality:
        ###     Softmax:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.functional.softmax
        ###     Batch Multiplication:
        ###        https://pytorch.org/docs/stable/torch.html#torch.bmm
        ###     Tensor View:
        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tanh:
        ###         https://pytorch.org/docs/stable/torch.html#torch.tanh

        alpha_t = nn.functional.softmax(e_t, dim=1)
        a_t = torch.squeeze(torch.bmm(torch.unsqueeze(alpha_t, dim=1),
                                      enc_hiddens),
                            dim=1)
        U_t = torch.cat((a_t, dec_hidden), dim=1)
        V_t = self.combined_output_projection(U_t)
        O_t = self.dropout(torch.tanh(V_t))
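        # In equations: alpha_t = softmax(e_t)                   # (b, src_len)
        #               a_t     = sum_i alpha_{t,i} h_i^{enc}    # (b, 2h), via bmm
        #               U_t     = [a_t ; h_t^{dec}]              # (b, 3h)
        #               V_t     = W_{u} U_t                      # (b, h)
        #               O_t     = dropout(tanh(V_t))             # (b, h)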
        ### END YOUR CODE

        combined_output = O_t
        return dec_state, combined_output, e_t

    def generate_sent_masks(self, enc_hiddens: torch.Tensor,
                            source_lengths: List[int]) -> torch.Tensor:
        """ Generate sentence masks for encoder hidden states.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size. 
        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.
        
        @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len),
                                    where b = batch size, src_len = max source length.
        """
        enc_masks = torch.zeros(enc_hiddens.size(0),
                                enc_hiddens.size(1),
                                dtype=torch.float)
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1
        return enc_masks.to(self.device)
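
    # Illustrative example (not executed): with source_lengths = [3, 1] and src_len = 4,
    # generate_sent_masks returns
    #     [[0., 0., 0., 1.],
    #      [0., 1., 1., 1.]]
    # where 1 marks padded positions, which step() masks out of the attention scores.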

    def beam_search(self,
                    src_sent: List[str],
                    beam_size: int = 5,
                    max_decoding_time_step: int = 70) -> List[Hypothesis]:
        """ Given a single source sentence, perform beam search, yielding translations in the target language.
        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
        @returns hypotheses (List[Hypothesis]): a list of hypotheses, each of which has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var,
                                                  [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        eos_id = self.vocab.tgt['</s>']

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses),
                                 dtype=torch.float,
                                 device=self.device)
        completed_hypotheses = []

        t = 0

        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            exp_src_encodings = src_encodings.expand(hyp_num,
                                                     src_encodings.size(1),
                                                     src_encodings.size(2))

            exp_src_encodings_att_linear = src_encodings_att_linear.expand(
                hyp_num, src_encodings_att_linear.size(1),
                src_encodings_att_linear.size(2))

            y_tm1 = torch.tensor(
                [self.vocab.tgt[hyp[-1]] for hyp in hypotheses],
                dtype=torch.long,
                device=self.device)
            y_t_embed = self.model_embeddings.target(y_tm1)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            # x: (num_hyps, embed_size + hidden_size), the previous word embedding concatenated with att_tm1
            (h_t, cell_t), att_t, _ = self.step(x,
                                                h_tm1,
                                                exp_src_encodings,
                                                exp_src_encodings_att_linear,
                                                enc_masks=None)

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t),
                                    dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            contiuating_hyp_scores = (
                hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)

            ###### START TOP K HERE #######
            # top_cand_hyp_scores, top_cand_hyp_pos = self.topk(contiuating_hyp_scores, live_hyp_num)
            ###### END TOP K HERE #######

            ###### START DPP HERE #######
            top_cand_hyp_scores, top_cand_hyp_pos = self.kdpp(
                att_t,
                src_encodings,
                src_encodings_att_linear,
                h_t,
                cell_t,
                contiuating_hyp_scores,
                live_hyp_num,
                beam_size,
            )
            #### END DPP HERE ####

            prev_hyp_ids = top_cand_hyp_pos // len(self.vocab.tgt)  # floor division keeps the ids integer-valued
            hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt)

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(
                    prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    completed_hypotheses.append(
                        Hypothesis(value=new_hyp_sent[1:-1],
                                   score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            live_hyp_ids = torch.tensor(live_hyp_ids,
                                        dtype=torch.long,
                                        device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores,
                                      dtype=torch.float,
                                      device=self.device)

        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(
                Hypothesis(value=hypotheses[0][1:],
                           score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

        if PRINT_HYPOTHESES:
            print(completed_hypotheses)

        return completed_hypotheses
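
    # Illustrative usage sketch (not executed). `model` and `src_tokens` are hypothetical
    # names; src_tokens is a single tokenized source sentence (List[str]).
    #
    #     hyps = model.beam_search(src_tokens, beam_size=5, max_decoding_time_step=70)
    #     best = hyps[0]                      # hypotheses are sorted best-first
    #     print(' '.join(best.value), best.score)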

    @property
    def device(self) -> torch.device:
        """ Determine which device to place the Tensors upon, CPU or GPU.
        """
        return self.model_embeddings.source.weight.device

    @staticmethod
    def load(model_path: str):
        """ Load the model from a file.
        @param model_path (str): path to model
        """
        params = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
        args = params['args']
        nmt_model = NMT(vocab=params['vocab'], **args)
        nmt_model.load_state_dict(params['state_dict'])
        model = DPPNMT(nmt_model=nmt_model)

        return model

    def save(self, path: str):
        """ Save the odel to a file.
        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args':
            dict(embed_size=self.model_embeddings.embed_size,
                 hidden_size=self.hidden_size,
                 dropout_rate=self.dropout_rate),
            'vocab':
            self.vocab,
            'state_dict':
            self.state_dict()
        }

        torch.save(params, path)

    def timer(self, message=None):
        if PRINT_TIMER:
            if message is None or not hasattr(
                    self, "last_time") or self.last_time is None:
                self.last_time = time.time()
            else:
                new_time = time.time()
                print("%s: %f" % (message, new_time - self.last_time))
                self.last_time = new_time

    def topk(self, contiuating_hyp_scores, live_hyp_num):
        top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(
            contiuating_hyp_scores, k=live_hyp_num)
        return top_cand_hyp_scores, top_cand_hyp_pos

    def word_embeddings(self):
        if not hasattr(self, "word_embeddings_cached"):
            self.timer()
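            # Build a tensor containing every target-vocabulary id (0 .. |V_tgt| - 1); the
            # id2word -> word2id round-trip below is effectively an identity over the ids.
            # The resulting embedding matrix is cached on first use.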
            word_ids = torch.tensor([
                self.vocab.tgt[self.vocab.tgt.id2word[id]]
                for id in range(len(self.vocab.tgt))
            ],
                                    dtype=torch.long,
                                    device=self.device)
            self.word_embeddings_cached = embeddings = self.model_embeddings.target(
                word_ids)
            if TOGGLE_PRINT:
                print("embeddings", embeddings.shape)
            self.timer("Embeddings")
        return self.word_embeddings_cached

    def kdpp(self, att_t, src_encodings, src_encodings_att_linear, h_t, cell_t,
             contiuating_hyp_scores, live_hyp_num, beam_size):
        # For every candidate in contiuating_hyp_scores: look up its target word embedding,
        # take one more decoder step, normalize the resulting hidden state, and weight it by
        # the candidate's (exponentiated) score; these features define the DPP kernel below.
        # TODO: need to duplicate each num_hyps times
        self.timer()
        top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(
            contiuating_hyp_scores, k=INITIAL_SAMPLE_SIZE_RATIO * beam_size)
        self.timer("topk")
        vocab_size = len(self.vocab.tgt.word2id)
        num_hyps, embed_size = att_t.shape
        # TODO: minimize data movement
        # print("x", x.shape)
        # att_t_repeated = att_t.repeat(1, vocab_size).view(-1, embed_size)
        # embeddings_repeated = embeddings.repeat(1, vocab_size).view(-1, embed_size)
        # x = torch.cat([embeddings_repeated, att_t_repeated], dim=-1)
        # x = x[top_cand_hyp_pos]
        embeddings = self.word_embeddings()
        # print(top_cand_hyp_pos)
        x_list = []
        for hyp_pos in top_cand_hyp_pos:
            emb_hyp = embeddings[hyp_pos % vocab_size]
            att_hyp = att_t[hyp_pos // vocab_size]
            x_partial = torch.cat([emb_hyp, att_hyp])
            x_list.append(x_partial.unsqueeze(0))
        x = torch.cat(x_list, dim=0)
        self.timer("newx")

        batch_size = x.shape[0]
        new_exp_src_encodings = src_encodings.expand(batch_size,
                                                     src_encodings.size(1),
                                                     src_encodings.size(2))

        new_exp_src_encodings_att_linear = src_encodings_att_linear.expand(
            batch_size, src_encodings_att_linear.size(1),
            src_encodings_att_linear.size(2))

        # Might have to stretch h_t, and cell_t
        # new_h_t = h_t.repeat(1, vocab_size).view(-1, embed_size)
        # new_cell_t = cell_t.repeat(1, vocab_size).view(-1, embed_size)
        # new_h_t = new_h_t[top_cand_hyp_pos]
        # new_cell_t = new_cell_t[top_cand_hyp_pos]

        self.timer()
        new_h_t_list = []
        new_cell_t_list = []
        for hyp_pos in top_cand_hyp_pos:
            h_t_hyp = h_t[hyp_pos // vocab_size]
            cell_t_hyp = cell_t[hyp_pos // vocab_size]
            new_h_t_list.append(h_t_hyp.unsqueeze(0))
            new_cell_t_list.append(cell_t_hyp.unsqueeze(0))
        new_h_t = torch.cat(new_h_t_list, dim=0)
        new_cell_t = torch.cat(new_cell_t_list, dim=0)
        self.timer("new_h_t/cell_t")

        (h_t_dpp, _), _, _ = self.step(x, (new_h_t, new_cell_t),
                                       new_exp_src_encodings,
                                       new_exp_src_encodings_att_linear,
                                       enc_masks=None)
        self.timer("step")
        # num_hyps = len(contiuating_hyp_scores.shape[0])/len(self.vocab.tgt)

        norms = torch.norm(h_t_dpp, p=2, dim=1, keepdim=True)
        # Keep norms on the same device as h_t_dpp; a cross-device division would fail.
        unit_vectors = h_t_dpp.div(norms.expand_as(h_t_dpp))
        # new_p_t = log_p_t.repeat(1, vocab_size).view(-1, vocab_size)
        # print("new_p_t", log_p_t.shape)
        # TODO: this returns e^{scores}... correct?
        quality_scores = torch.exp(
            top_cand_hyp_scores.unsqueeze(1)).expand_as(unit_vectors)
        # TODO: maybe normalize the quality_scores?
        quality_scores = torch.pow(quality_scores, 1 / 2)
        features = unit_vectors * quality_scores
        self.timer("scores")
        L = torch.mm(features, features.t())
        self.timer("L")

        try:
            new_top_cand_hyp_pos = sample_k_dpp(L, k=live_hyp_num)
        except Exception as e:
            print("Error sampling from L, falling back to top k: %s" % e)
            return self.topk(contiuating_hyp_scores, live_hyp_num)

        if ADD_TOP_N > 0:
            new_top_cand_hyp_pos = np.unique(
                np.append(new_top_cand_hyp_pos, range(ADD_TOP_N)))

        self.timer("sample_k_dpp")
        top_cand_hyp_pos = top_cand_hyp_pos[new_top_cand_hyp_pos]
        # top_cand_hyp_scores = contiuating_hyp_scores[top_cand_hyp_pos].squeeze(0)
        top_cand_hyp_scores = contiuating_hyp_scores[top_cand_hyp_pos]

        scores1, pos1 = self.topk(contiuating_hyp_scores, live_hyp_num)
        # print('topk pos', pos1)
        # print('top_cand_hyp_pos', top_cand_hyp_pos)
        # print('topk scores', scores1)
        # print('top_cand_hyp_pos', top_cand_hyp_scores)

        if TOGGLE_PRINT:
            print("vocab size", vocab_size)
            print("att_t_repeated", att_t_repeated.shape)
            print("top_cand_hyp_pos", top_cand_hyp_pos.shape)
            print("new_x", x.shape)
            print("src_encodings", new_exp_src_encodings.shape)
            print("src_encodings_att", new_exp_src_encodings_att_linear.shape)
            print("new_h_t", new_h_t.shape)
            print("new_cell_t", new_cell_t.shape)
            print("hidden", h_t_dpp.shape)
            print("norms", norms.shape)
            print("unit_vectors", unit_vectors.shape)
            print("L", L.shape)
            print("L", L)
            print("new_top_cand_hyp_pos", new_top_cand_hyp_pos)
            print(top_cand_hyp_pos)
            print("new_top_hyp_pos", top_cand_hyp_pos.shape)
            print("new_top_hyp_scores", top_cand_hyp_scores.shape)
            print('top chosen: ', new_top_cand_hyp_pos)

        return top_cand_hyp_scores, top_cand_hyp_pos