class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidirectional LSTM Encoder
        - Unidirectional LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """

    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # Bidirectional encoder: every encoder output is 2 * hidden_size wide.
        self.encoder = nn.LSTM(embed_size, self.hidden_size, bidirectional=True)
        # The decoder input at each step is [target embedding ; previous combined output].
        self.decoder = nn.LSTMCell(embed_size + self.hidden_size, self.hidden_size)
        # W_h / W_c: project the concatenated final encoder states (2h) to the
        # decoder's initial hidden / cell state (h).
        self.h_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
        self.c_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
        # W_attProj: project encoder states (2h) to h so they can be dotted with
        # the decoder hidden state.
        self.att_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
        # W_u: maps [attention output (2h) ; decoder hidden (h)] to h.
        self.combined_output_projection = nn.Linear(3 * self.hidden_size, self.hidden_size, bias=False)
        # W_vocab: maps the combined output to target-vocabulary logits.
        self.target_vocab_projection = nn.Linear(self.hidden_size, len(self.vocab.tgt), bias=False)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`

        @returns scores (Tensor): a variable/tensor of shape (b, ) representing the
                                  log-likelihood of generating the gold-standard target sentence for
                                  each example in the input batch. Here b = batch size.
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Convert list of lists into tensors
        source_padded = self.vocab.src.to_input_tensor(source, device=self.device)  # Tensor: (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(target, device=self.device)  # Tensor: (tgt_len, b)

        # Encode, build the padding masks, decode, then turn the combined
        # outputs into a log-distribution over the target vocabulary.
        enc_hiddens, dec_init_state = self.encode(source_padded, source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded)
        P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1)

        # Zero out, probabilities for which we have nothing in the target text
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Compute log probability of generating true target words
        target_gold_words_log_prob = torch.gather(
            P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores

    def encode(self, source_padded: torch.Tensor,
               source_lengths: List[int]) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
            Additionally, take the final states of the encoder and project them to obtain initial states for decoder.

        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where
                                       b = batch_size, src_len = maximum source sentence length. Note that
                                       these have already been sorted in order of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch
        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
                                       b = batch size, src_len = maximum source sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
                                                         hidden state and cell.
        """
        X = self.model_embeddings.source(source_padded)  # (src_len, b, e)

        # Packing lets the LSTM skip padded positions; it relies on the batch
        # being sorted longest-first (see the note on `source_padded`).
        packed = nn.utils.rnn.pack_padded_sequence(X, source_lengths)
        enc_hiddens, (last_hidden, last_cell) = self.encoder(packed)
        # batch_first=True yields (b, src_len, 2h) directly.
        enc_hiddens, _ = nn.utils.rnn.pad_packed_sequence(enc_hiddens, batch_first=True)

        # last_hidden / last_cell are (2, b, h): index 0 is the forward direction,
        # index 1 the backward one. Concatenate to (b, 2h) and project to (b, h).
        init_decoder_hidden = self.h_projection(torch.cat((last_hidden[0], last_hidden[1]), 1))
        init_decoder_cell = self.c_projection(torch.cat((last_cell[0], last_cell[1]), 1))
        dec_init_state = (init_decoder_hidden, init_decoder_cell)

        return enc_hiddens, dec_init_state

    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
               dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor:
        """Compute combined output vectors for a batch.

        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
                                     b = batch size, src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where
                                       tgt_len = maximum target sentence length, b = batch size.

        @returns combined_outputs (Tensor): combined output tensor (tgt_len, b, h), where
                                        tgt_len = maximum target sentence length, b = batch_size, h = hidden size
        """
        # Chop off the <END> token for max length sentences.
        target_padded = target_padded[:-1]

        # Initialize the decoder state (hidden and cell)
        dec_state = dec_init_state

        # Initialize previous combined output vector o_{t-1} as zero
        batch_size = enc_hiddens.size(0)
        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

        # Collects the combined output o_t of every step
        combined_outputs = []

        # The attention projection does not depend on the time step, so apply it once.
        enc_hiddens_proj = self.att_projection(enc_hiddens)  # (b, src_len, h)
        Y = self.model_embeddings.target(target_padded)  # (tgt_len, b, e)

        for Y_t in torch.split(Y, 1, dim=0):
            # Squeeze only dim 0 so a batch of size 1 keeps its batch dimension.
            Y_t = torch.squeeze(Y_t, 0)  # (b, e)
            Ybar_t = torch.cat((Y_t, o_prev), 1)  # (b, e + h)
            dec_state, o_t, _ = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks)
            combined_outputs.append(o_t)
            o_prev = o_t

        combined_outputs = torch.stack(combined_outputs)  # (tgt_len, b, h)
        return combined_outputs

    def step(self, Ybar_t: torch.Tensor,
             dec_state: Tuple[torch.Tensor, torch.Tensor],
             enc_hiddens: torch.Tensor,
             enc_hiddens_proj: torch.Tensor,
             enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length. May be None (no masking).

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution
                                (already masked with -inf at padded positions when enc_masks is given).
        """
        dec_state = self.decoder(Ybar_t, dec_state)
        dec_hidden, _dec_cell = dec_state

        # Multiplicative attention scores: (b, src_len, h) x (b, h, 1) -> (b, src_len)
        e_t = torch.squeeze(torch.bmm(enc_hiddens_proj, torch.unsqueeze(dec_hidden, 2)), 2)

        # Set e_t to -inf where enc_masks has 1, so padding gets zero attention weight.
        # Out-of-place masked_fill keeps the operation visible to autograd.
        if enc_masks is not None:
            e_t = e_t.masked_fill(enc_masks.bool(), -float('inf'))

        alpha_t = F.softmax(e_t, dim=1)  # (b, src_len)
        # Attention output: (b, 1, src_len) x (b, src_len, 2h) -> (b, 2h)
        a_t = torch.squeeze(torch.bmm(torch.unsqueeze(alpha_t, 1), enc_hiddens), 1)

        # Concatenation order is part of the learned weights' layout; keep [a_t ; dec_hidden].
        U_t = torch.cat((a_t, dec_hidden), 1)  # (b, 3h)
        V_t = self.combined_output_projection(U_t)  # (b, h)
        O_t = self.dropout(torch.tanh(V_t))

        combined_output = O_t
        return dec_state, combined_output, e_t

    def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor:
        """ Generate sentence masks for encoder hidden states.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size.
        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.

        @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len),
                                    where src_len = max source length, h = hidden size.
                                    Entries are 1.0 at padded positions and 0.0 elsewhere.
        """
        enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float)
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1
        return enc_masks.to(self.device)

    def beam_search(self, src_sent: List[str], beam_size: int = 5,
                    max_decoding_time_step: int = 70) -> "List[Hypothesis]":
        """ Given a single source sentence, perform beam search, yielding translations in the target language.

        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
        @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        vocab_size = len(self.vocab.tgt)

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            # Share the single source encoding across all live hypotheses (no copy).
            exp_src_encodings = src_encodings.expand(hyp_num,
                                                     src_encodings.size(1),
                                                     src_encodings.size(2))
            exp_src_encodings_att_linear = src_encodings_att_linear.expand(hyp_num,
                                                                           src_encodings_att_linear.size(1),
                                                                           src_encodings_att_linear.size(2))

            y_tm1 = torch.tensor([self.vocab.tgt[hyp[-1]] for hyp in hypotheses],
                                 dtype=torch.long, device=self.device)
            y_t_embed = self.model_embeddings.target(y_tm1)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            (h_t, cell_t), att_t, _ = self.step(x, h_tm1,
                                                exp_src_encodings, exp_src_encodings_att_linear,
                                                enc_masks=None)

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            # Flatten (hyp_num, V) candidate scores so topk ranks all continuations jointly.
            contiuating_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(contiuating_hyp_scores, k=live_hyp_num)

            # Recover (hypothesis index, word index) from the flat position.
            # Must be FLOOR division: `/` on integer tensors is true division in
            # current PyTorch and would produce float (non-indexable) ids.
            prev_hyp_ids = top_cand_hyp_pos // vocab_size
            hyp_word_ids = top_cand_hyp_pos % vocab_size

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    # Strip the leading <s> and trailing </s> from finished hypotheses.
                    completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1],
                                                           score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            # Carry forward only the decoder states of hypotheses that are still alive.
            live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device)

        # Nothing finished within the step budget: fall back to the best live hypothesis.
        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:],
                                                   score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

        return completed_hypotheses

    @property
    def device(self) -> torch.device:
        """ Determine which device to place the Tensors upon, CPU or GPU.
        """
        return self.model_embeddings.source.weight.device

    @staticmethod
    def load(model_path: str):
        """ Load the model from a file.

        @param model_path (str): path to model
        @returns model (NMT): model rebuilt from the saved args/vocab with weights restored
        """
        # NOTE(security): torch.load unpickles arbitrary objects (the vocab is
        # stored as a pickled object); only load checkpoints from trusted sources.
        params = torch.load(model_path, map_location=lambda storage, loc: storage)
        args = params['args']
        model = NMT(vocab=params['vocab'], **args)
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ Save the model to a file.

        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args': dict(embed_size=self.model_embeddings.embed_size,
                         hidden_size=self.hidden_size,
                         dropout_rate=self.dropout_rate),
            'vocab': self.vocab,
            'state_dict': self.state_dict()
        }

        torch.save(params, path)
class QGModel(nn.Module):
    """Sequence-to-sequence model assembled from the project's `ModelEmbeddings`,
    `Encoder` and `Decoder` components.

    NOTE(review): the "QG" prefix presumably stands for question generation;
    confirm against the training script.
    """

    def __init__(self, vocab, embed_size, hidden_size, enc_bidir, attn_size, dropout=0.2):
        """Build the embeddings, encoder, decoder-init projection and decoder.

        @param vocab: vocabulary object exposing `src` and `tgt` sub-vocabularies
        @param embed_size (int): embedding size
        @param hidden_size (int): decoder hidden size
        @param enc_bidir: forwarded to `Encoder` (presumably a bidirectional flag)
        @param attn_size: forwarded to `Decoder` (presumably the attention size)
        @param dropout (float): dropout value forwarded to `Encoder` and `Decoder`
        """
        super(QGModel, self).__init__()
        self.vocab = vocab
        # Constructor arguments are kept so `save`/`load` can rebuild the model.
        self.args = {
            'embed_size': embed_size,
            'hidden_size': hidden_size,
            'dropout': dropout,
            'enc_bidir': enc_bidir,
            'attn_size': attn_size
        }
        self.embeddings = ModelEmbeddings(embed_size, vocab)
        self.encoder = Encoder(embed_size, hidden_size, dropout, enc_bidir)
        # Maps the encoder's final state to the decoder's initial hidden state.
        self.decoder_init_hidden_proj = nn.Linear(self.encoder.hidden_size, hidden_size)
        self.decoder = Decoder(embed_size, hidden_size, attn_size, len(vocab.tgt), dropout)

    def batch_to_tensor(self, source, target):
        """Convert a batch of token lists into padded tensors plus lengths and mask.

        @param source (List[List[str]]): source sentences
        @param target (List[List[str]]): target sentences
        @returns (source_padded, target_padded, source_lengths, source_mask)
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Convert list of lists into tensors
        source_padded = self.vocab.src.to_input_tensor(source, device=self.device)  # Tensor: (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(target, device=self.device)  # Tensor: (tgt_len, b)
        source_mask = self.generate_mask(source_lengths, source_padded.shape[0])
        return source_padded, target_padded, source_lengths, source_mask

    def forward(self, source: List[List[str]], target: List[List[str]]):
        """Compute the per-example log-likelihood of the gold target sentences.

        @param source (List[List[str]]): source sentences
        @param target (List[List[str]]): target sentences
        @returns scores (Tensor): summed gold-word log-probabilities, one per example
        """
        source_padded, target_padded, source_lengths, source_mask = self.batch_to_tensor(source, target)

        source_embedding = self.embeddings.source(source_padded)  # (src_len, b, embed_size)
        target_embedding = self.embeddings.target(target_padded)  # (tgt_len, B, embed_size)

        memory, last_hidden = self.encoder(source_embedding, source_lengths)  # last_hidden: (B, hidden)
        memory = memory.transpose(0, 1)  # memory: (B, src_len, hidden)

        dec_init_hidden = torch.tanh(self.decoder_init_hidden_proj(last_hidden))
        # (tgt_len - 1, B, word_vocab_size), not probability
        gen_output = self.decoder(memory, source_mask, target_embedding, dec_init_hidden)

        P = F.log_softmax(gen_output, dim=-1)

        # Zero out, probabilities for which we have nothing in the target text
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Compute log probability of generating true target words
        target_gold_words_log_prob = torch.gather(
            P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores

    def generate_mask(self, length, max_length):
        """Build an int mask of shape (len(length), max_length) with 1 at padded positions.

        @param length: iterable of true sentence lengths
        @param max_length (int): padded sequence length
        """
        mask = torch.zeros(len(length), max_length, dtype=torch.int, device=self.device)
        for i, x in enumerate(length):
            mask[i, x:] = 1
        return mask

    def beam_search(self, src_sent: List[str], beam_size: int = 5, max_decoding_time_step: int = 70):
        """Beam-search decode a single source sentence.

        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of decoding steps
        @returns (completed_hypotheses, has_comp): hypotheses sorted by descending
                 score, and a flag that is False when no hypothesis produced `</s>`
                 within the step budget (the best live hypothesis is returned instead).
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)
        src_len = torch.tensor([len(src_sent)], dtype=torch.int, device=self.device)

        source_embedding = self.embeddings.source(src_sents_var)  # (src_len, b, embed_size)
        memory, last_hidden = self.encoder(source_embedding, src_len)  # last_hidden: (B, hidden)
        memory = memory.transpose(0, 1)  # memory: (B, src_len, hidden)

        dec_init_hidden = torch.tanh(self.decoder_init_hidden_proj(last_hidden))  # (B, hidden)

        vocab_size = len(self.vocab.tgt)

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device)
        completed_hypotheses = []

        t = 0
        ctxt_tm1 = torch.zeros(len(hypotheses), self.args['hidden_size'], device=self.device)
        dec_hidden_tm1 = dec_init_hidden
        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            prev_word = torch.tensor([self.vocab.tgt[x[-1]] for x in hypotheses],
                                     dtype=torch.long, device=self.device)
            tgt_tm1 = self.embeddings.target(prev_word)  # (B, word_embed_size)

            # Share the single encoded source across all live hypotheses (no copy).
            memory_tm1 = memory.expand((hyp_num, *memory.shape[1:]))

            gen_t, dec_hidden_t, ctxt_t = self.decoder.decode_step(tgt_tm1, ctxt_tm1, dec_hidden_tm1, memory_tm1)
            gen_t = torch.log_softmax(gen_t, dim=-1)  # (B, vocab)

            live_hyp_num = beam_size - len(completed_hypotheses)
            # Flatten so topk ranks every (hypothesis, word) continuation jointly.
            continuating_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(gen_t) + gen_t).view(-1)  # (hyp_num * V)
            top_candi_scores, top_candi_position = torch.topk(continuating_hyp_scores, k=live_hyp_num)

            # Recover (hypothesis index, word index) from the flat position.
            # Must be FLOOR division: `/` on integer tensors is true division in
            # current PyTorch and would produce float (non-indexable) ids.
            prev_hyp_indexes = top_candi_position // vocab_size
            hyp_word_indexes = top_candi_position % vocab_size

            new_hypothesis = []
            live_hyp_index = []
            new_hyp_scores = []

            for prev_hyp_index, hyp_word_index, new_hyp_score in zip(
                    prev_hyp_indexes, hyp_word_indexes, top_candi_scores):
                prev_hyp_index = prev_hyp_index.item()
                hyp_word_index = hyp_word_index.item()
                new_hyp_score = new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_index]
                new_hypo = hypotheses[prev_hyp_index] + [hyp_word]
                if hyp_word == '</s>':
                    # Strip the leading <s> and trailing </s> from finished hypotheses.
                    completed_hypotheses.append(Hypothesis(value=new_hypo[1:-1], score=new_hyp_score))
                else:
                    new_hypothesis.append(new_hypo)
                    live_hyp_index.append(prev_hyp_index)
                    new_hyp_scores.append(new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            live_hyp_index = torch.tensor(live_hyp_index, dtype=torch.long, device=self.device)
            # Advance the decoder state: select the NEW hidden state of each
            # surviving hypothesis (re-indexing the previous state would freeze
            # the decoder at its initial hidden state).
            dec_hidden_tm1 = dec_hidden_t[live_hyp_index]
            ctxt_tm1 = ctxt_t[live_hyp_index]

            hypotheses = new_hypothesis
            hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device)

        has_comp = True
        if len(completed_hypotheses) == 0:
            # Nothing finished within the step budget: fall back to the best live hypothesis.
            has_comp = False
            completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item()))
        completed_hypotheses.sort(key=lambda x: x.score, reverse=True)
        return completed_hypotheses, has_comp

    @property
    def device(self):
        """Device on which the model's parameters (and hence its input tensors) live."""
        return self.decoder_init_hidden_proj.weight.device

    def save(self, path):
        """Save vocab, constructor args and weights to `path + ".qg"`.

        Parent directories are created if they do not exist.

        @param path (str): destination path without the ".qg" suffix
        """
        path = path + ".qg"
        parent_dir = Path(path).parent
        parent_dir.mkdir(parents=True, exist_ok=True)
        state_dict = {
            'vocab': self.vocab,
            'args': self.args,
            'model_state': self.state_dict(),
        }
        torch.save(state_dict, path)

    @staticmethod
    def load(path, device):
        """Rebuild a model from a checkpoint written by `save` and move it to `device`.

        @param path (str): checkpoint path (including the ".qg" suffix)
        @param device: target device, also used as `map_location`
        """
        # NOTE(security): torch.load unpickles arbitrary objects (the vocab is
        # stored as a pickled object); only load checkpoints from trusted sources.
        params = torch.load(path, map_location=device)
        model = QGModel(vocab=params['vocab'], **params['args'])  # type:nn.Module
        model.load_state_dict(params['model_state'])
        return model.to(device)
class NMT(nn.Module): """ Simple Neural Machine Translation Model: - Bidrectional LSTM Encoder - Unidirection LSTM Decoder - Global Attention Model (Luong, et al. 2015) """ def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2): """ Init NMT Model. @param embed_size (int): Embedding size (dimensionality) @param hidden_size (int): Hidden Size (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention """ super(NMT, self).__init__() self.model_embeddings = ModelEmbeddings(embed_size, vocab) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab # default values self.encoder = None self.decoder = None self.h_projection = None self.c_projection = None self.att_projection = None self.combined_output_projection = None self.target_vocab_projection = None self.dropout = None ### ### YOUR CODE HERE (~8 Lines) ### self.embed_size = embed_size self.encoder = nn.LSTM(self.embed_size, self.hidden_size, bias=True, bidirectional=True) # do i have to do make self.embed_size? Also I think since bidirectional is specified I don't need to use 2*self.hidden_size self.decoder = nn.LSTMCell(self.hidden_size + embed_size, self.hidden_size, bias=True) # need input size, hidden size. I think they are the same, except that for the input you concatenate the embedding for the current word. self.h_projection = nn.Linear(2*self.hidden_size, self.hidden_size, bias=False) # W_h self.c_projection = nn.Linear(2*self.hidden_size, self.hidden_size, bias=False) # W_c self.att_projection = nn.Linear(2*self.hidden_size, self.hidden_size, bias=False) # W_attProj Not sure about this one; it seems to actually take two inputs, h^dec_t to the left and h^enc_i to the right. 
self.combined_output_projection = nn.Linear(3*self.hidden_size, self.hidden_size, bias=False) # W_u self.target_vocab_projection = nn.Linear(self.hidden_size, len(self.vocab.tgt), bias=False) # W_vocab. Is len(self.vocab.tgt) the length of the target vocab? that is what we want. self.dropout = nn.Dropout(self.dropout_rate) #Dropout layer. ### ### END YOUR CODE ### '''TODO - Initialize the following variables: self.encoder (Bidirectional LSTM with bias) self.decoder (LSTM Cell with bias) self.h_projection (Linear Layer with no bias), called W_{h} in the PDF. self.c_projection (Linear Layer with no bias), called W_{c} in the PDF. self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF. self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF. self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF. self.dropout (Dropout Layer) Use the following docs to properly initialize these variables: LSTM: https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM LSTM Cell: https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell Linear Layer: https://pytorch.org/docs/stable/nn.html#torch.nn.Linear Dropout Layer: https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout ''' def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor: """ Take a mini-batch of source and target sentences, compute the log-likelihood of target sentences under the language models learned by the NMT system. @param source (List[List[str]]): list of source sentence tokens @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>` @returns scores (Tensor): a variable/tensor of shape (b, ) representing the log-likelihood of generating the gold-standard target sentence for each example in the input batch. Here b = batch size. 
""" # Compute sentence lengths source_lengths = [len(s) for s in source] # Convert list of lists into tensors source_padded = self.vocab.src.to_input_tensor(source, device=self.device) # Tensor: (src_len, b) target_padded = self.vocab.tgt.to_input_tensor(target, device=self.device) # Tensor: (tgt_len, b) ### Run the network forward: ### 1. Apply the encoder to `source_padded` by calling `self.encode()` ### 2. Generate sentence masks for `source_padded` by calling `self.generate_sent_masks()` ### 3. Apply the decoder to compute combined-output by calling `self.decode()` ### 4. Compute log probability distribution over the target vocabulary using the ### combined_outputs returned by the `self.decode()` function. enc_hiddens, dec_init_state = self.encode(source_padded, source_lengths) enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths) combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded) P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1) # Zero out, probabilities for which we have nothing in the target text target_masks = (target_padded != self.vocab.tgt['<pad>']).float() # Compute log probability of generating true target words target_gold_words_log_prob = torch.gather(P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze( -1) * target_masks[1:] scores = target_gold_words_log_prob.sum(dim=0) return scores def encode(self, source_padded: torch.Tensor, source_lengths: List[int]) -> Tuple[ torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """ Apply the encoder to source sentences to obtain encoder hidden states. Additionally, take the final states of the encoder and project them to obtain initial states for decoder. @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where b = batch_size, src_len = maximum source sentence length. Note that these have already been sorted in order of longest to shortest sentence. 
@param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial hidden state and cell. """ enc_hiddens, dec_init_state = None, None ### ### YOUR CODE HERE (~ 8 Lines) ### X = self.model_embeddings.source(source_padded) #I think that's all it is: the embedding object looks up the embeddings for you (see Hwk3_parser_model) Xpacked = nn.utils.rnn.pack_padded_sequence(X, source_lengths) # needs sequence lengths enc_hiddens, (last_hidden, last_cell) = self.encoder(Xpacked) # three birds with one stone enc_hiddens = nn.utils.rnn.pad_packed_sequence(enc_hiddens) # output is a tuple containing the desired tensor at 0 and source_lengths at 1. We only want the former enc_hiddens = enc_hiddens[0].permute(1, 0, 2) # we have to swap the first 2 indices. use permute() last_hidden = last_hidden.split(1, 0) #split along the first dimension to yield a tuple last_hidden = torch.cat(last_hidden, 2) #concatenate along the third dimension last_hidden = last_hidden.squeeze() #squeeze out the singleton dimension init_decoder_hidden = self.h_projection(last_hidden) #apply h_projection layer last_cell = last_cell.split(1, 0) # same drill as with last_hidden. Can I just do last_cell = torch.cat((last_cell[0, :, :], last_cell[1, :, :]), 1)? last_cell = torch.cat(last_cell, 2) last_cell = last_cell.squeeze() init_decoder_cell = self.c_projection(last_cell) dec_init_state = (init_decoder_hidden, init_decoder_cell) ### ### END YOUR CODE ### ''' TODO: 1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings. src_len = maximum source sentence length, b = batch size, e = embedding size. 
Note that there is no initial hidden state or cell for the decoder. 2. Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`. - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X. - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens. - Note that the shape of the tensor returned by the encoder is (src_len b, h*2) and we want to return a tensor of shape (b, src_len, h*2) as `enc_hiddens`. 3. Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell): - `init_decoder_hidden`: `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards. Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h). Apply the h_projection layer to this in order to compute init_decoder_hidden. This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size - `init_decoder_cell`: `last_cell` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards. Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h). Apply the c_projection layer to this in order to compute init_decoder_cell. This is c_0^{dec} in the PDF. 
Here b = batch size, h = hidden size See the following docs, as you may need to use some of the following functions in your implementation: Pack the padded sequence X before passing to the encoder: https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pack_padded_sequence Pad the packed sequence, enc_hiddens, returned by the encoder: https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pad_packed_sequence Tensor Concatenation: https://pytorch.org/docs/stable/torch.html#torch.cat Tensor Permute: https://pytorch.org/docs/stable/tensors.html#torch.Tensor.permute ''' return enc_hiddens, dec_init_state def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor, dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor: """Compute combined output vectors for a batch. @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where b = batch size, src_len = maximum source sentence length. @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where tgt_len = maximum target sentence length, b = batch size. @returns combined_outputs (Tensor): combined output tensor (tgt_len, b, h), where tgt_len = maximum target sentence length, b = batch_size, h = hidden size """ # Chop off the <END> token for max length sentences. 
target_padded = target_padded[:-1] # Initialize the decoder state (hidden and cell) dec_state = dec_init_state # Initialize previous combined output vector o_{t-1} as zero batch_size = enc_hiddens.size(0) o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device) # Initialize a list we will use to collect the combined output o_t on each step combined_outputs = [] ### ### YOUR CODE HERE (~9 Lines) ### enc_hiddens_proj = self.att_projection(enc_hiddens) #output should be shape (batch size, max source length, hidden size) Y = self.model_embeddings.target(target_padded) for Y_t in Y.split(1, 0): #'iterate over the time dimension of Y'? which one is time? must be tgt_len since that's the one the sentences unfold through Y_t = Y_t.squeeze(0) #squeeze the first dimension Ybar_t = torch.cat((Y_t, o_prev), 1) #concatenate Y_t with o_prev (has to be perpendicular to batch_size axis, the only one they have same length) dec_state, combined_output, e_t = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks) #use 'step' function to get next decoder cell, state and output combined_outputs.append(combined_output) #append o_t (combined_output) to combined_outputs o_prev = combined_output #update o_prev to the new o_t combined_outputs = torch.stack(combined_outputs) #Use torch.stack to convert combined_outputs from a list of length tgt_len of (batch_size, hidden_size) to single (tgt_len, batch_size, hidden_size) tensor ### ### END YOUR CODE ### ''' TODO: 1. Apply the attention projection layer to `enc_hiddens` to obtain `enc_hiddens_proj`, which should be shape (b, src_len, h), where b = batch size, src_len = maximum source length, h = hidden size. This is applying W_{attProj} to h^enc, as described in the PDF. 2. Construct tensor `Y` of target sentences with shape (tgt_len, b, e) using the target model embeddings. where tgt_len = maximum target sentence length, b = batch size, e = embedding size. 3. 
Use the torch.split function to iterate over the time dimension of Y. Within the loop, this will give you Y_t of shape (1, b, e) where b = batch size, e = embedding size. - Squeeze Y_t into a tensor of dimension (b, e). - Construct Ybar_t by concatenating Y_t with o_prev. - Use the step function to compute the the Decoder's next (cell, state) values as well as the new combined output o_t. - Append o_t to combined_outputs - Update o_prev to the new o_t. 4. Use torch.stack to convert combined_outputs from a list length tgt_len of tensors shape (b, h), to a single tensor shape (tgt_len, b, h) where tgt_len = maximum target sentence length, b = batch size, h = hidden size. Note: - When using the squeeze() function make sure to specify the dimension you want to squeeze over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1. Use the following docs to implement this functionality: Zeros Tensor: https://pytorch.org/docs/stable/torch.html#torch.zeros Tensor Splitting (iteration): https://pytorch.org/docs/stable/torch.html#torch.split Tensor Dimension Squeezing: https://pytorch.org/docs/stable/torch.html#torch.squeeze Tensor Concatenation: https://pytorch.org/docs/stable/torch.html#torch.cat Tensor Stacking: https://pytorch.org/docs/stable/torch.html#torch.stack ''' return combined_outputs def step(self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor, torch.Tensor], enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor, enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]: """ Compute one forward step of the LSTM decoder, including the attention computation. @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder, where b = batch size, e = embedding size, h = hidden size. @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size. 
First tensor is decoder's prev hidden state, second tensor is decoder's prev cell. @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size, src_len = maximum source length, h = hidden size. @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h), where b = batch size, src_len = maximum source length, h = hidden size. @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len), where b = batch size, src_len is maximum source length. @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size. First tensor is decoder's new hidden state, second tensor is decoder's new cell. @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size. @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution. Note: You will not use this outside of this function. We are simply returning this value so that we can sanity check your implementation. """ combined_output = None ### ### YOUR CODE HERE (~3 Lines) ### dec_state = self.decoder(Ybar_t, dec_state) #apply decoder unit (not decode() function) to Ybar_t and dec_state, yielding new dec_state (dec_state contains both hidden and cell states) #print(combined_output) #cols not same dec_cell = dec_state[1]; dec_hidden = dec_state[0] #split dec_state into dec_hidden and dec_cell. Should use torch.split()? Should not matter as dec_state is just a tuple #print(dec_state) #ALL COLUMNS SAME ''' Form of BMM operation: (b, n, m)*(b, m, p) --> (b, n, p). With unsqueeze, we have input sizes dec_hidden: (b, 1, h), enc_hiddens_proj: (b, src_len, h). and output size e_t: (b, src_len) e_t must come out (b, 1, src_len). So let b = b, n = 1, m = h, and p = src_len. So permute last 2 elements of enc_hiddens_proj before squeeze. 
''' enc_hiddens_proj = enc_hiddens_proj.permute(0, 2, 1) e_t = torch.bmm(torch.unsqueeze(dec_hidden, 1), enc_hiddens_proj) #get attention scores e_t using BMM e_t = e_t.squeeze(1) #remove added dimension #print(e_t) #cols not same ### ### END YOUR CODE ### ''' TODO: 1. Apply the decoder to `Ybar_t` and `dec_state`to obtain the new dec_state. 2. Split dec_state into its two parts (dec_hidden, dec_cell) 3. Compute the attention scores e_t, a Tensor shape (b, src_len). Note: b = batch_size, src_len = maximum source length, h = hidden size. Hints: - dec_hidden is shape (b, h) and corresponds to h^dec_t in the PDF (batched) - enc_hiddens_proj is shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched). - Use batched matrix multiplication (torch.bmm) to compute e_t. - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing. - When using the squeeze() function make sure to specify the dimension you want to squeeze over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1. Use the following docs to implement this functionality: Batch Multiplication: https://pytorch.org/docs/stable/torch.html#torch.bmm Tensor Unsqueeze: https://pytorch.org/docs/stable/torch.html#torch.unsqueeze Tensor Squeeze: https://pytorch.org/docs/stable/torch.html#torch.squeeze ''' # Set e_t to -inf where enc_masks has 1 if enc_masks is not None: e_t.data.masked_fill_(enc_masks.bool(), -float('inf')) ### ### YOUR CODE HERE (~6 Lines) ### alpha_t = F.softmax(e_t)#, dim=0) a_t = torch.bmm(alpha_t.unsqueeze(1), enc_hiddens) ''' Form of BMM operation: (b, n, m)*(b, m, p) --> (b, n, p). We have input sizes alpha_t: (b, 1, src_len), enc_hiddens: (b, src_len, 2h). and output size a_t: (b, 2h) a_t must come out (b, 1, 2h). So let b = b, n = 1, m = h, and p = src_len. No permute needed, just squeeze second dimension. ''' a_t = a_t.squeeze(1) U_t = torch.cat((dec_hidden, a_t), 1) # for autograde they say use [dec_hidden, a_t]. 
dec_hidden is shape (b, h), a_t is (b, 2h), so the cat must be along the '1' dimension V_t = self.combined_output_projection(U_t) tanh = nn.Tanh() V_t = tanh(V_t) O_t = self.dropout(V_t) #ALL SANITY CHECKS PASSED... though i feel insane now ### ### END YOUR CODE ### ''' TODO: 1. Apply softmax to e_t to yield alpha_t 2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the attention output vector, a_t. Hints: - alpha_t is shape (b, src_len) - enc_hiddens is shape (b, src_len, 2h) - a_t should be shape (b, 2h) - You will need to do some squeezing and unsqueezing. Note: b = batch size, src_len = maximum source length, h = hidden size. 3. Concatenate dec_hidden with a_t to compute tensor U_t 4. Apply the combined output projection layer to U_t to compute tensor V_t 5. Compute tensor O_t by first applying the Tanh function and then the dropout layer. Use the following docs to implement this functionality: Softmax: https://pytorch.org/docs/stable/nn.html#torch.nn.functional.softmax Batch Multiplication: https://pytorch.org/docs/stable/torch.html#torch.bmm Tensor View: https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view Tensor Concatenation: https://pytorch.org/docs/stable/torch.html#torch.cat Tanh: https://pytorch.org/docs/stable/torch.html#torch.tanh ''' combined_output = O_t return dec_state, combined_output, e_t def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor: """ Generate sentence masks for encoder hidden states. @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size, src_len = max source length, h = hidden size. @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch. @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len), where src_len = max source length, h = hidden size. 
""" enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float) for e_id, src_len in enumerate(source_lengths): enc_masks[e_id, src_len:] = 1 return enc_masks.to(self.device) def beam_search(self, src_sent: List[str], beam_size: int = 5, max_decoding_time_step: int = 70) -> List[ Hypothesis]: """ Given a single source sentence, perform beam search, yielding translations in the target language. @param src_sent (List[str]): a single source sentence (words) @param beam_size (int): beam size @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields: value: List[str]: the decoded target sentence, represented as a list of words score: float: the log-likelihood of the target sentence """ src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device) src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)]) src_encodings_att_linear = self.att_projection(src_encodings) h_tm1 = dec_init_vec att_tm1 = torch.zeros(1, self.hidden_size, device=self.device) eos_id = self.vocab.tgt['</s>'] hypotheses = [['<s>']] hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device) completed_hypotheses = [] t = 0 while len(completed_hypotheses) < beam_size and t < max_decoding_time_step: t += 1 hyp_num = len(hypotheses) exp_src_encodings = src_encodings.expand(hyp_num, src_encodings.size(1), src_encodings.size(2)) exp_src_encodings_att_linear = src_encodings_att_linear.expand(hyp_num, src_encodings_att_linear.size(1), src_encodings_att_linear.size(2)) y_tm1 = torch.tensor([self.vocab.tgt[hyp[-1]] for hyp in hypotheses], dtype=torch.long, device=self.device) y_t_embed = self.model_embeddings.target(y_tm1) x = torch.cat([y_t_embed, att_tm1], dim=-1) (h_t, cell_t), att_t, _ = self.step(x, h_tm1, exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None) # log probabilities over target words 
log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1) live_hyp_num = beam_size - len(completed_hypotheses) contiuating_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1) top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(contiuating_hyp_scores, k=live_hyp_num) prev_hyp_ids = top_cand_hyp_pos / len(self.vocab.tgt) hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt) new_hypotheses = [] live_hyp_ids = [] new_hyp_scores = [] for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores): prev_hyp_id = prev_hyp_id.item() hyp_word_id = hyp_word_id.item() cand_new_hyp_score = cand_new_hyp_score.item() hyp_word = self.vocab.tgt.id2word[hyp_word_id] new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word] if hyp_word == '</s>': completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1], score=cand_new_hyp_score)) else: new_hypotheses.append(new_hyp_sent) live_hyp_ids.append(prev_hyp_id) new_hyp_scores.append(cand_new_hyp_score) if len(completed_hypotheses) == beam_size: break live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device) h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids]) att_tm1 = att_t[live_hyp_ids] hypotheses = new_hypotheses hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device) if len(completed_hypotheses) == 0: completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item())) completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True) return completed_hypotheses @property def device(self) -> torch.device: """ Determine which device to place the Tensors upon, CPU or GPU. """ return self.model_embeddings.source.weight.device @staticmethod def load(model_path: str): """ Load the model from a file. 
@param model_path (str): path to model """ params = torch.load(model_path, map_location=lambda storage, loc: storage) args = params['args'] model = NMT(vocab=params['vocab'], **args) model.load_state_dict(params['state_dict']) return model def save(self, path: str): """ Save the odel to a file. @param path (str): path to the model """ print('save model parameters to [%s]' % path, file=sys.stderr) params = { 'args': dict(embed_size=self.model_embeddings.embed_size, hidden_size=self.hidden_size, dropout_rate=self.dropout_rate), 'vocab': self.vocab, 'state_dict': self.state_dict() } torch.save(params, path)
class NMT(nn.Module): """ Simple Neural Machine Translation Model: - Bidrectional LSTM Encoder - Unidirection LSTM Decoder - Global Attention Model (Luong, et al. 2015) """ def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2): """ Init NMT Model. @param embed_size (int): Embedding size (dimensionality) @param hidden_size (int): Hidden Size (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention """ super(NMT, self).__init__() self.model_embeddings = ModelEmbeddings(embed_size, vocab) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab # default values self.encoder = None self.decoder = None self.h_projection = None self.c_projection = None self.att_projection = None self.combined_output_projection = None self.target_vocab_projection = None self.dropout = None ### YOUR CODE HERE (~8 Lines) ### TODO - Initialize the following variables: ### self.encoder (Bidirectional LSTM with bias) ### self.decoder (LSTM Cell with bias) ### self.h_projection (Linear Layer with no bias), called W_{h} in the PDF. ### self.c_projection (Linear Layer with no bias), called W_{c} in the PDF. ### self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF. ### self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF. ### self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF. 
### self.dropout (Dropout Layer) ### ### Use the following docs to properly initialize these variables: ### LSTM: ### https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM ### LSTM Cell: ### https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell ### Linear Layer: ### https://pytorch.org/docs/stable/nn.html#torch.nn.Linear ### Dropout Layer: ### https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout self.encoder = nn.LSTM( embed_size, self.hidden_size, bidirectional=True, batch_first=True ) #lstm的参数(emdding_size,hidden_size,) 注意,时间序列不属于网络架构 self.decoder = nn.LSTM(embed_size + self.hidden_size, self.hidden_size, batch_first=True) self.h_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False) self.c_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False) self.att_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False) self.combined_output_projection = nn.Linear(3 * self.hidden_size, self.hidden_size, bias=False) self.target_vocab_projection = nn.Linear(self.hidden_size, len(self.vocab.tgt)) self.dropout = nn.Dropout(p=self.dropout_rate) ### END YOUR CODE def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor: """ Take a mini-batch of source and target sentences, compute the log-likelihood of target sentences under the language models learned by the NMT system. @param source (List[List[str]]): list of source sentence tokens @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>` @returns scores (Tensor): a variable/tensor of shape (b, ) representing the log-likelihood of generating the gold-standard target sentence for each example in the input batch. Here b = batch size. 
""" # Compute sentence lengths source_lengths = [len(s) for s in source] # Convert list of lists into tensors source_padded = self.vocab.src.to_input_tensor( source, device=self.device) # Tensor: (src_len, b) target_padded = self.vocab.tgt.to_input_tensor( target, device=self.device) # Tensor: (tgt_len, b) ### Run the network forward: ### 1. Apply the encoder to `source_padded` by calling `self.encode()` ### 2. Generate sentence masks for `source_padded` by calling `self.generate_sent_masks()` ### 3. Apply the decoder to compute combined-output by calling `self.decode()` ### 4. Compute log probability distribution over the target vocabulary using the ### combined_outputs returned by the `self.decode()` function. enc_hiddens, dec_init_state = self.encode(source_padded, source_lengths) enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths) combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded) P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1) #(1, len(tgt_vocab)) # Zero out, probabilities for which we have nothing in the target text target_masks = (target_padded != self.vocab.tgt['<pad>']).float() # print('P.shape: {}'.format(P.shape),'\ntget_padded.shape: {}'.format(target_padded.shape), '\nrget_masks.shape: {}'.format(target_masks.shape)) # Compute log probability of generating true target words target_gold_words_log_prob = torch.gather( P.squeeze(dim=2), index=target_padded[1:].unsqueeze(-1), dim=-1 ).squeeze(-1) * target_masks[1:] #(sequence=1, len(tgt_vocab)) scores = target_gold_words_log_prob.sum(dim=0) return scores def encode( self, source_padded: torch.Tensor, source_lengths: List[int] ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """ Apply the encoder to source sentences to obtain encoder hidden states. Additionally, take the final states of the encoder and project them to obtain initial states for decoder. 
@param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where b = batch_size, src_len = maximum source sentence length. Note that these have already been sorted in order of longest to shortest sentence. @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial hidden state and cell. """ enc_hiddens, dec_init_state = None, None ### YOUR CODE HERE (~ 8 Lines) ### TODO: ### 1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings. ### src_len = maximum source sentence length, b = batch size, e = embedding size. Note ### that there is no initial hidden state or cell for the decoder. ### 2. Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`. ### - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X. ### - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens. ### - Note that the shape of the tensor returned by the encoder is (src_len b, h*2) and we want to ### return a tensor of shape (b, src_len, h*2) as `enc_hiddens`. ### 3. Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell): ### - `init_decoder_hidden`: ### `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards. ### Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h). ### Apply the h_projection layer to this in order to compute init_decoder_hidden. ### This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size ### - `init_decoder_cell`: ### `last_cell` is a tensor shape (2, b, h). 
The first dimension corresponds to forwards and backwards. ### Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h). ### Apply the c_projection layer to this in order to compute init_decoder_cell. ### This is c_0^{dec} in the PDF. Here b = batch size, h = hidden size ### ### See the following docs, as you may need to use some of the following functions in your implementation: ### Pack the padded sequence X before passing to the encoder: ### https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pack_padded_sequence ### Pad the packed sequence, enc_hiddens, returned by the encoder: ### https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pad_packed_sequence ### Tensor Concatenation: ### https://pytorch.org/docs/stable/torch.html#torch.cat ### Tensor Permute: ### https://pytorch.org/docs/stable/tensors.html#torch.Tensor.permute x = self.model_embeddings.source(source_padded).permute( 1, 0, 2) #permute前:(inputshape(scr_len,batch_size),embedding_size) x = pack_padded_sequence( x, source_lengths, batch_first=True) #打包(batch,len,embdding_size) enc_hiddens, (last_hidden, last_cell) = self.encoder( x) #x,output(batch, seq, feature) ???这里是否要写入变量hidden和cell呢 enc_hiddens = pad_packed_sequence( enc_hiddens, batch_first=True)[0] #所有的hidden 都在这里了 last_hidden = torch.cat((last_hidden[0, :, :], last_hidden[1, :, :]), 1) init_decoder_hidden = self.h_projection(last_hidden).unsqueeze(0) last_cell = torch.cat((last_cell[0, :, :], last_cell[1, :, :]), 1) init_decoder_cell = self.c_projection(last_cell).unsqueeze(0) dec_init_state = [init_decoder_hidden, init_decoder_cell] ### END YOUR CODE return enc_hiddens, dec_init_state def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor, dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor: """Compute combined output vectors for a batch. 
@param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where b = batch size, src_len = maximum source sentence length. @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where tgt_len = maximum target sentence length, b = batch size. @returns combined_outputs (Tensor): combined output tensor (tgt_len, b, h), where tgt_len = maximum target sentence length, b = batch_size, h = hidden size """ # Chop of the <END> token for max length sentences. target_padded = target_padded[:-1] # Initialize the decoder state (hidden and cell) dec_state = dec_init_state # Initialize previous combined output vector o_{t-1} as zero batch_size = enc_hiddens.size(0) o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device) # Initialize a list we will use to collect the combined output o_t on each step combined_outputs = [] ### YOUR CODE HERE (~9 Lines) ### TODO: ### 1. Apply the attention projection layer to `enc_hiddens` to obtain `enc_hiddens_proj`, ### which should be shape (b, src_len, h), ### where b = batch size, src_len = maximum source length, h = hidden size. ### This is applying W_{attProj} to h^enc, as described in the PDF. ### 2. Construct tensor `Y` of target sentences with shape (tgt_len, b, e) using the target model embeddings. ### where tgt_len = maximum target sentence length, b = batch size, e = embedding size. ### 3. Use the torch.split function to iterate over the time dimension of Y. ### Within the loop, this will give you Y_t of shape (1, b, e) where b = batch size, e = embedding size. ### - Squeeze Y_t into a tensor of dimension (b, e). ### - Construct Ybar_t by concatenating Y_t with o_prev. 
### - Use the step function to compute the the Decoder's next (cell, state) values ### as well as the new combined output o_t. ### - Append o_t to combined_outputs ### - Update o_prev to the new o_t. ### 4. Use torch.stack to convert combined_outputs from a list length tgt_len of ### tensors shape (b, h), to a single tensor shape (tgt_len, b, h) ### where tgt_len = maximum target sentence length, b = batch size, h = hidden size. ### ### Note: ### - When using the squeeze() function make sure to specify the dimension you want to squeeze ### over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1. ### ### Use the following docs to implement this functionality: ### Zeros Tensor: ### https://pytorch.org/docs/stable/torch.html#torch.zeros ### Tensor Splitting (iteration): ### https://pytorch.org/docs/stable/torch.html#torch.split ### Tensor Dimension Squeezing: ### https://pytorch.org/docs/stable/torch.html#torch.squeeze ### Tensor Concatenation: ### https://pytorch.org/docs/stable/torch.html#torch.cat ### Tensor Stacking: ### https://pytorch.org/docs/stable/torch.html#torch.stack enc_hiddens_proj = self.att_projection( enc_hiddens) #再点乘dec_hidden得到attention score y = self.model_embeddings.target(target_padded).permute( 1, 0, 2) #(batch, sequence, embdding_size) for t in range(len(y[1])): y_t = torch.split(y, 1, dim=1)[t] #一个词一个词丢进去(batch, 1, embedding_size) y_t = torch.squeeze(y_t, dim=1) #(batch, embedding_size_) ybar_t = torch.cat((y_t, o_prev), 1) #(batch_size, embedding_size + hidden_size) dec_state, combined_output, e_t = self.step( ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks) combined_outputs.append(combined_output) o_prev = combined_output combined_outputs = torch.stack(combined_outputs, dim=0).unsqueeze(dim=2) #本来是一个列表,每一项都是大小相同的tensor,代表每个词的输出,将他在第0维堆起来(sequence, batch, hidden_size) ### END YOUR CODE return combined_outputs def step( self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor, torch.Tensor], 
enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor, enc_masks: torch.Tensor ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]: """ Compute one forward step of the LSTM decoder, including the attention computation. @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder, where b = batch size, e = embedding size, h = hidden size. @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size. First tensor is decoder's prev hidden state, second tensor is decoder's prev cell. @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size, src_len = maximum source length, h = hidden size. @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h), where b = batch size, src_len = maximum source length, h = hidden size. @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len), where b = batch size, src_len is maximum source length. @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size. First tensor is decoder's new hidden state, second tensor is decoder's new cell. @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size. @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution. Note: You will not use this outside of this function. We are simply returning this value so that we can sanity check your implementation. """ combined_output = None ### YOUR CODE HERE (~3 Lines) ### TODO: ### 1. Apply the decoder to `Ybar_t` and `dec_state`to obtain the new dec_state. ### 2. Split dec_state into its two parts (dec_hidden, dec_cell) ### 3. Compute the attention scores e_t, a Tensor shape (b, src_len). ### Note: b = batch_size, src_len = maximum source length, h = hidden size. 
### ### Hints: ### - dec_hidden is shape (b, h) and corresponds to h^dec_t in the PDF (batched) ### - enc_hiddens_proj is shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched). ### - Use batched matrix multiplication (torch.bmm) to compute e_t. ### - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing. ### - When using the squeeze() function make sure to specify the dimension you want to squeeze ### over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1. ### ### Use the following docs to implement this functionality: ### Batch Multiplication: ### https://pytorch.org/docs/stable/torch.html#torch.bmm ### Tensor Unsqueeze: ### https://pytorch.org/docs/stable/torch.html#torch.unsqueeze ### Tensor Squeeze: ### https://pytorch.org/docs/stable/torch.html#torch.squeeze #Ybar_t(batch, feature) output, dec_state = self.decoder( Ybar_t.unsqueeze(dim=1), dec_state) #输入Ybar_t必须是三个维(batch, seq, feature) dec_hidden, dec_cell = dec_state[0], dec_state[ 1] #size:(sequence = 1, batch, hidden_size) dec_state = (dec_state[0].permute(1, 0, 2), dec_state[1].permute(1, 0, 2)) e_t = torch.bmm( enc_hiddens_proj, dec_hidden.squeeze(dim=0).unsqueeze(dim=2)).squeeze( 2 ) # enc_hiddens_proj:(batch_size, encode_sequence, hidden_size) #要得到dec_hidden:(batch, hidden_size, 1),先移除1 再增加1 ,e_t:(batch, encode_sequence, 1) ### END YOUR CODE # Set e_t to -inf (最大下界)where enc_masks has 1, 使pad token处得到的attention score最小,以至于使注意力不会放在这些地方 #如果全部设置为零,会使长短句的结果出现偏差.短句的attention output会小一些,具体怎么偏差,我也不知道阿 if enc_masks is not None: e_t.data.masked_fill_(enc_masks.to(torch.bool), -float('inf')) ### YOUR CODE HERE (~6 Lines) ### TODO: ### 1. Apply softmax to e_t to yield alpha_t ### 2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the ### attention output vector, a_t. 
#$$ Hints: ### - alpha_t is shape (b, src_len) ### - enc_hiddens is shape (b, src_len, 2h) ### - a_t should be shape (b, 2h) ### - You will need to do some squeezing and unsqueezing. ### Note: b = batch size, src_len = maximum source length, h = hidden size. ### ### 3. Concatenate dec_hidden with a_t to compute tensor U_t ### 4. Apply the combined output projection layer to U_t to compute tensor V_t ### 5. Compute tensor O_t by first applying the Tanh function and then the dropout layer. ### ### Use the following docs to implement this functionality: ### Softmax: ### https://pytorch.org/docs/stable/nn.html#torch.nn.functional.softmax ### Batch Multiplication: ### https://pytorch.org/docs/stable/torch.html#torch.bmm ### Tensor View: ### https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view ### Tensor Concatenation: ### https://pytorch.org/docs/stable/torch.html#torch.cat ### Tanh: ### https://pytorch.org/docs/stable/torch.html#torch.tanh alpha_t = F.softmax(e_t, dim=1).unsqueeze( 1) #(batch, decoder_sequence=1, encoder_sequence) a_t = torch.bmm( alpha_t, enc_hiddens ) #(batch, 1, sequence).*(batch, sequence, hidden*2) = (batch, 1, hidden*2) u_t = torch.cat((a_t, dec_hidden.permute((1, 0, 2))), dim=2) v_t = self.combined_output_projection(u_t) O_t = torch.tanh(v_t) O_t = self.dropout(O_t) #(batch, sequence=1. hidden) ### END YOUR CODE combined_output = O_t.squeeze(dim=1) #希望的输出是(batch, hidden_size) return dec_state, combined_output, e_t #原来写的e_t def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor: """ Generate sentence masks for encoder hidden states. @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size, src_len = max source length, h = hidden size. @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch. @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len), where src_len = max source length, h = hidden size. 
""" enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float) for e_id, src_len in enumerate(source_lengths): enc_masks[e_id, src_len:] = 1 return enc_masks.to(self.device) def beam_search(self, src_sent: List[str], beam_size: int = 5, max_decoding_time_step: int = 70) -> List[Hypothesis]: """ Given a single source sentence, perform beam search, yielding translations in the target language. @param src_sent (List[str]): a single source sentence (words) @param beam_size (int): beam size @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields: value: List[str]: the decoded target sentence, represented as a list of words score: float: the log-likelihood of the target sentence """ src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device) #source sentence 转化为张量 src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)]) #encoding src_encodings_att_linear = self.att_projection(src_encodings) h_tm1 = dec_init_vec #decode 的hidden 和 cell # 报错,转换一下hidden 和 cell 的维度 #h_tm1 = (h_tm1[0].permute((1, 0, 2)), h_tm1[1].permute((1, 0, 2))) att_tm1 = torch.zeros(1, self.hidden_size, device=self.device) #attention 参数(1,h) eos_id = self.vocab.tgt['</s>'] #start hypotheses = [['<s>']] #用来存每一步预测的字符 hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device) #初始化每个预测字符的分数为0 #y_tm1 = torch.zeros([1, len(hypotheses)], dtype=torch.long, device=self.device) completed_hypotheses = [] #确定的结果 t = 0 while len( completed_hypotheses ) < beam_size and t < max_decoding_time_step: #每一步:当确定的的字符的数量小于beamz_size t += 1 hyp_num = len(hypotheses) #当前预测的数量 exp_src_encodings = src_encodings.expand( hyp_num, src_encodings.size(1), src_encodings.size(2) ) #expand(len(hypotheses), len(sequence)=26, 2*hiddend_siz=512) #取前len(预测数量)个 enconding的参数 exp_src_encodings_att_linear = src_encodings_att_linear.expand( 
hyp_num, src_encodings_att_linear.size(1), src_encodings_att_linear.size(2)) # 取前len(预测数量)个attention的参数 y_tm1 = torch.tensor( [self.vocab.tgt[hyp[-1]] for hyp in hypotheses], dtype=torch.long, device=self.device) #存入预测的目标词的张量 y_t_embed = self.model_embeddings.target(y_tm1) #词嵌入 x = torch.cat([y_t_embed, att_tm1], dim=-1) #目标词embedding 和 attention 结合 (h_t, cell_t), att_t, _ = self.step( x, h_tm1, exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None ) #对x即预测词进行处理,得到attention ouput 和 target hiddend state # log probabilities over target words log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1) live_hyp_num = beam_size - len( completed_hypotheses) #算一下还剩多少个可以预测的位置 contiuating_hyp_scores = ( hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view( -1) #将每个分数转化为与log同大小,再相加(len(hyp)) top_cand_hyp_scores, top_cand_hyp_pos = torch.topk( contiuating_hyp_scores, k=live_hyp_num) ###topk:Returns the k largest elements of the given input tensor along a given dimension.返回value和indices prev_hyp_ids = top_cand_hyp_pos / len(self.vocab.tgt) #每个词/总词汇量 hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt) new_hypotheses = [] live_hyp_ids = [] new_hyp_scores = [] #loop算每个词 for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip( prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores): prev_hyp_id = prev_hyp_id.item() hyp_word_id = hyp_word_id.item() cand_new_hyp_score = cand_new_hyp_score.item() hyp_word = self.vocab.tgt.id2word[hyp_word_id] #得到预测的词 new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word ] #将这个词加入到预测的句子中 if hyp_word == '</s>': completed_hypotheses.append( Hypothesis(value=new_hyp_sent[1:-1], score=cand_new_hyp_score)) #如果这个词是开始词,说明已经完事了,然后就将所有的词和分数加入完整的预测中 else: #对新的预测进行一系列的append new_hypotheses.append(new_hyp_sent) live_hyp_ids.append(prev_hyp_id) new_hyp_scores.append(cand_new_hyp_score) if len(completed_hypotheses) == beam_size: break live_hyp_ids = torch.tensor( live_hyp_ids, dtype=torch.long, device=self.device) #将预测词的indices 转化为张量 h_tm1_1 = 
h_t[live_hyp_ids].permute(1, 0, 2) #hidden_state cell_t_1 = cell_t[live_hyp_ids].permute(1, 0, 2) c = ((h_tm1_1, cell_t_1)) #import pdb #pdb.set_trace() h_tm1 = c att_tm1 = att_t[live_hyp_ids] #更新 attention output hypotheses = new_hypotheses #更新预测 hyp_scores = new_hyp_scores hyp_scores = torch.tensor(hyp_scores, dtype=torch.float, device=self.device) #将预测的分数存为张量 if len(completed_hypotheses) == 0: completed_hypotheses.append( Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item())) #加入开始词 completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True) #对所有结果的分数进行排序 return completed_hypotheses @property def device(self) -> torch.device: """ Determine which device to place the Tensors upon, CPU or GPU. """ return self.model_embeddings.source.weight.device @staticmethod def load(model_path: str): """ Load the model from a file. @param model_path (str): path to model """ params = torch.load(model_path, map_location=lambda storage, loc: storage) args = params['args'] model = NMT(vocab=params['vocab'], **args) model.load_state_dict(params['state_dict']) return model def save(self, path: str): """ Save the odel to a file. @param path (str): path to the model """ print('save model parameters to [%s]' % path, file=sys.stderr) params = { 'args': dict(embed_size=self.model_embeddings.embed_size, hidden_size=self.hidden_size, dropout_rate=self.dropout_rate), 'vocab': self.vocab, 'state_dict': self.state_dict() } torch.save(params, path)
class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidirectional LSTM Encoder
        - Unidirectional LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """

    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # Encoder reads embeddings; being bidirectional, it emits 2 * hidden_size features.
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=hidden_size,
                               num_layers=1,
                               bias=True,
                               bidirectional=True)
        # Decoder input is [target embedding ; previous combined output].
        self.decoder = nn.LSTMCell(input_size=embed_size + hidden_size,
                                   hidden_size=hidden_size,
                                   bias=True)
        # Projections of the concatenated (2h) final encoder states to the decoder size (h).
        self.h_projection = nn.Linear(in_features=2 * hidden_size,
                                      out_features=hidden_size,
                                      bias=False)
        self.c_projection = nn.Linear(in_features=2 * hidden_size,
                                      out_features=hidden_size,
                                      bias=False)
        # Projects encoder hidden states (2h) to h so they can be dotted with the decoder state.
        self.att_projection = nn.Linear(in_features=2 * hidden_size,
                                        out_features=hidden_size,
                                        bias=False)
        # Input is [decoder hidden (h) ; attention output (2h)].
        self.combined_output_projection = nn.Linear(
            in_features=3 * hidden_size,
            out_features=hidden_size,
            bias=False)
        self.target_vocab_projection = nn.Linear(in_features=hidden_size,
                                                 out_features=len(vocab.tgt),
                                                 bias=False)
        self.dropout = nn.Dropout(p=dropout_rate)

    def forward(self, source: List[List[str]],
                target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`

        @returns scores (Tensor): a variable/tensor of shape (b, ) representing the
                                  log-likelihood of generating the gold-standard target sentence
                                  for each example in the input batch. Here b = batch size.
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Convert list of lists into tensors
        source_padded = self.vocab.src.to_input_tensor(
            source, device=self.device)  # Tensor: (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(
            target, device=self.device)  # Tensor: (tgt_len, b)

        # Run the network forward: encode, build the padding masks, decode,
        # then turn the combined outputs into log-probabilities over the
        # target vocabulary.
        enc_hiddens, dec_init_state = self.encode(source_padded,
                                                  source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state,
                                       target_padded)
        P = F.log_softmax(self.target_vocab_projection(combined_outputs),
                          dim=-1)

        # Zero out probabilities for which we have nothing in the target text
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Compute log probability of generating true target words.
        # target_padded[1:] drops the leading `<s>` so that step t is scored
        # against the word that follows it.
        target_gold_words_log_prob = torch.gather(
            P, index=target_padded[1:].unsqueeze(-1),
            dim=-1).squeeze(-1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores

    def encode(
        self, source_padded: torch.Tensor, source_lengths: List[int]
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
        Additionally, take the final states of the encoder and project them to obtain
        initial states for decoder.

        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where
                                       b = batch_size, src_len = maximum source sentence length. Note that
                                       these have already been sorted in order of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch
        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
                                       b = batch size, src_len = maximum source sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
                                                         hidden state and cell.
        """
        X = self.model_embeddings.source(source_padded)  # (src_len, b, e)
        packed_X = pack_padded_sequence(X, source_lengths)
        enc_hiddens, (last_hidden, last_cell) = self.encoder(packed_X)
        # batch_first=True yields (b, src_len, 2h), the layout the attention code expects.
        enc_hiddens, _ = pad_packed_sequence(enc_hiddens, batch_first=True)

        # last_hidden / last_cell are (2, b, h): concatenate the forward and
        # backward directions into (b, 2h) before projecting down to (b, h).
        last_hidden_bx2h = torch.cat(torch.unbind(last_hidden, dim=0), dim=1)
        init_decoder_hidden = self.h_projection(last_hidden_bx2h)
        last_cell_bx2h = torch.cat(torch.unbind(last_cell, dim=0), dim=1)
        init_decoder_cell = self.c_projection(last_cell_bx2h)

        dec_init_state = (init_decoder_hidden, init_decoder_cell)
        return enc_hiddens, dec_init_state

    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
               dec_init_state: Tuple[torch.Tensor, torch.Tensor],
               target_padded: torch.Tensor) -> torch.Tensor:
        """Compute combined output vectors for a batch.

        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
                                     b = batch size, src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where
                                       tgt_len = maximum target sentence length, b = batch size.

        @returns combined_outputs (Tensor): combined output tensor  (tgt_len - 1, b, h), i.e. one
                                        step per target token after the final one has been chopped off;
                                        b = batch_size,  h = hidden size
        """
        # Chop off the <END> token for max length sentences.
        target_padded = target_padded[:-1]

        # Initialize the decoder state (hidden and cell)
        dec_state = dec_init_state

        # Initialize previous combined output vector o_{t-1} as zero
        batch_size = enc_hiddens.size(0)
        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

        combined_outputs = []
        # The projection does not depend on the time step, so compute it once.
        enc_hiddens_proj = self.att_projection(enc_hiddens)
        Y = self.model_embeddings.target(target_padded)  # (tgt_len - 1, b, e)

        for Y_t in torch.split(Y, 1, dim=0):
            # Squeeze only the time axis: a bare squeeze() would also drop
            # the batch axis when batch_size == 1 and break the concat below.
            Y_t = torch.squeeze(Y_t, dim=0)
            Ybar_t = torch.cat((Y_t, o_prev), dim=1)
            dec_state, o_t, _ = self.step(Ybar_t, dec_state, enc_hiddens,
                                          enc_hiddens_proj, enc_masks)
            combined_outputs.append(o_t)
            o_prev = o_t

        return torch.stack(combined_outputs)

    def step(
        self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor,
                                                     torch.Tensor],
        enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor,
        enc_masks: torch.Tensor
    ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length. May be None,
                                    in which case no positions are masked.

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). Attention scores before the softmax, with
                                masked positions set to -inf. Returned only so the implementation can be
                                sanity checked.
        """
        dec_state = self.decoder(Ybar_t, dec_state)
        dec_hidden, _dec_cell = dec_state

        # (b, src_len, h) x (b, h, 1) -> (b, src_len, 1) -> (b, src_len)
        e_t = torch.bmm(enc_hiddens_proj,
                        dec_hidden.unsqueeze(dim=2)).squeeze(dim=2)

        # Set e_t to -inf where enc_masks has 1, so padding receives zero
        # attention weight after the softmax.
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

        # Normalise over source positions (dim=1). Normalising over dim=0
        # would mix the examples of the batch with each other.
        alpha_t = F.softmax(e_t, dim=1)
        # (b, 1, src_len) x (b, src_len, 2h) -> (b, 1, 2h) -> (b, 2h)
        a_t = torch.bmm(alpha_t.unsqueeze(dim=1), enc_hiddens).squeeze(dim=1)

        U_t = torch.cat((dec_hidden, a_t), dim=1)  # (b, 3h)
        V_t = self.combined_output_projection(U_t)  # (b, h)
        O_t = self.dropout(torch.tanh(V_t))

        combined_output = O_t
        return dec_state, combined_output, e_t

    def generate_sent_masks(self, enc_hiddens: torch.Tensor,
                            source_lengths: List[int]) -> torch.Tensor:
        """ Generate sentence masks for encoder hidden states.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size.
        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.

        @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len),
                                    where src_len = max source length, h = hidden size.
                                    Entries are 1 at padding positions and 0 elsewhere.
        """
        enc_masks = torch.zeros(enc_hiddens.size(0),
                                enc_hiddens.size(1),
                                dtype=torch.float)
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1
        return enc_masks.to(self.device)

    def beam_search(self,
                    src_sent: List[str],
                    beam_size: int = 5,
                    max_decoding_time_step: int = 70) -> "List[Hypothesis]":
        """ Given a single source sentence, perform beam search, yielding translations in the target language.

        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
        @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var,
                                                  [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        vocab_size = len(self.vocab.tgt)

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses),
                                 dtype=torch.float,
                                 device=self.device)
        completed_hypotheses = []

        t = 0
        while (len(completed_hypotheses) < beam_size
               and t < max_decoding_time_step):
            t += 1
            hyp_num = len(hypotheses)

            # Share the single source encoding across all live hypotheses.
            exp_src_encodings = src_encodings.expand(hyp_num,
                                                     src_encodings.size(1),
                                                     src_encodings.size(2))
            exp_src_encodings_att_linear = src_encodings_att_linear.expand(
                hyp_num, src_encodings_att_linear.size(1),
                src_encodings_att_linear.size(2))

            # Feed the last word of every live hypothesis to the decoder.
            y_tm1 = torch.tensor(
                [self.vocab.tgt[hyp[-1]] for hyp in hypotheses],
                dtype=torch.long,
                device=self.device)
            y_t_embed = self.model_embeddings.target(y_tm1)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            (h_t, cell_t), att_t, _ = self.step(x,
                                                h_tm1,
                                                exp_src_encodings,
                                                exp_src_encodings_att_linear,
                                                enc_masks=None)

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t),
                                    dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            # Flatten (hyp_num, vocab) scores so one top-k ranks every
            # (hypothesis, next word) pair.
            continuing_hyp_scores = (
                hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(
                continuing_hyp_scores, k=live_hyp_num)

            # Flat index = hyp_id * vocab_size + word_id. Floor division is
            # required: `/` on integer tensors is true division in current
            # PyTorch and would yield float ids that cannot index a list.
            prev_hyp_ids = top_cand_hyp_pos // vocab_size
            hyp_word_ids = top_cand_hyp_pos % vocab_size

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(
                    prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    # Finished: strip the leading `<s>` and trailing `</s>`.
                    completed_hypotheses.append(
                        Hypothesis(value=new_hyp_sent[1:-1],
                                   score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            # Keep only the decoder state rows of hypotheses that are still alive.
            live_hyp_ids = torch.tensor(live_hyp_ids,
                                        dtype=torch.long,
                                        device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores,
                                      dtype=torch.float,
                                      device=self.device)

        # Nothing reached `</s>` within the step budget: fall back to the
        # best unfinished hypothesis.
        if not completed_hypotheses:
            completed_hypotheses.append(
                Hypothesis(value=hypotheses[0][1:],
                           score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

        return completed_hypotheses

    @property
    def device(self) -> torch.device:
        """ Determine which device to place the Tensors upon, CPU or GPU.
        """
        return self.model_embeddings.source.weight.device

    @staticmethod
    def load(model_path: str):
        """ Load the model from a file.

        @param model_path (str): path to model
        @returns model (NMT): model rebuilt from the stored args, vocab and state dict
        """
        # NOTE(review): the checkpoint also stores the vocab object, so this
        # relies on unpickling; only load files from a trusted source.
        params = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
        args = params['args']
        model = NMT(vocab=params['vocab'], **args)
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ Save the model to a file.

        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args':
            dict(embed_size=self.model_embeddings.embed_size,
                 hidden_size=self.hidden_size,
                 dropout_rate=self.dropout_rate),
            'vocab':
            self.vocab,
            'state_dict':
            self.state_dict()
        }

        torch.save(params, path)
class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidirectional LSTM Encoder
        - Unidirectional LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """

    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # NOTE: the creation order below is kept stable on purpose; it fixes the
        # order of the state_dict keys and of the random initialisation draws.

        # Encoder: bidirectional LSTM over source embeddings, (src_len, b, e) -> (src_len, b, 2h).
        self.encoder = nn.LSTM(input_size=embed_size, hidden_size=hidden_size,
                               bias=True, bidirectional=True)
        # Decoder: one LSTM cell applied step by step; its input is [y_t ; o_{t-1}] of size e + h.
        self.decoder = nn.LSTMCell(input_size=hidden_size + embed_size,
                                   hidden_size=hidden_size, bias=True)
        # W_h / W_c: project the concatenated final encoder states (2h) to the decoder's
        # initial hidden / cell state (h).
        self.h_projection = nn.Linear(in_features=2 * hidden_size,
                                      out_features=hidden_size, bias=False)
        self.c_projection = nn.Linear(in_features=2 * hidden_size,
                                      out_features=hidden_size, bias=False)
        # W_attProj: project encoder hidden states (2h -> h) for multiplicative attention.
        self.att_projection = nn.Linear(in_features=2 * hidden_size,
                                        out_features=hidden_size, bias=False)
        # W_u: project [a_t ; h_t^dec] (3h) to the combined output (h).
        self.combined_output_projection = nn.Linear(3 * hidden_size, hidden_size, bias=False)
        self.dropout = nn.Dropout(p=self.dropout_rate)
        # W_vocab: project the combined output (h) to target-vocabulary logits.
        self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False)

    def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`

        @returns scores (Tensor): a variable/tensor of shape (b, ) representing the
                                  log-likelihood of generating the gold-standard target sentence
                                  for each example in the input batch. Here b = batch size.
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Convert list of lists into tensors
        source_padded = self.vocab.src.to_input_tensor(source, device=self.device)  # (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(target, device=self.device)  # (tgt_len, b)

        # Encode, build the source padding masks, then decode with teacher forcing.
        enc_hiddens, dec_init_state = self.encode(source_padded, source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded)
        P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1)

        # Zero out probabilities for which we have nothing in the target text
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Log probability of the gold word at each step; target_padded[1:] drops the
        # leading `<s>` so that position t is scored against the word emitted at step t.
        target_gold_words_log_prob = torch.gather(
            P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores

    def encode(
            self, source_padded: torch.Tensor, source_lengths: List[int]
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
            Additionally, take the final states of the encoder and project them to obtain
            initial states for decoder.

        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b),
                                       where b = batch_size, src_len = maximum source sentence
                                       length. Note that these have already been sorted in order
                                       of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of the source
                                           sentences in the batch
        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2),
                                       where b = batch size, src_len = maximum source sentence
                                       length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the
                                       decoder's initial hidden state and cell.
        """
        # X: (src_len, b, e)
        X = self.model_embeddings.source(source_padded)

        # Pack so the LSTM never consumes padding positions. The lengths are passed as a
        # plain list (accepted directly), avoiding a detour through a float tensor.
        packed_input = pack_padded_sequence(X, source_lengths)
        # last_hidden / last_cell: (2, b, h); dim 0 holds the forward and backward directions.
        packed_output, (last_hidden, last_cell) = self.encoder(packed_input)

        # Re-pad to (src_len, b, 2h), then switch to batch-first (b, src_len, 2h).
        enc_hiddens, _ = pad_packed_sequence(packed_output)
        enc_hiddens = enc_hiddens.transpose(0, 1)

        # Concatenate the forward and backward final states -> (b, 2h), then project to (b, h).
        last_hidden = torch.cat((last_hidden[0], last_hidden[1]), 1)
        last_cell = torch.cat((last_cell[0], last_cell[1]), 1)
        init_decoder_hidden = self.h_projection(last_hidden)
        init_decoder_cell = self.c_projection(last_cell)
        dec_init_state = (init_decoder_hidden, init_decoder_cell)

        return enc_hiddens, dec_init_state

    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
               dec_init_state: Tuple[torch.Tensor, torch.Tensor],
               target_padded: torch.Tensor) -> torch.Tensor:
        """Compute combined output vectors for a batch.

        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where b = batch size,
                                     src_len = maximum source sentence length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where b = batch size,
                                   src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where
                                       tgt_len = maximum target sentence length, b = batch size.

        @returns combined_outputs (Tensor): combined output tensor (tgt_len - 1, b, h): one
                                       step per target token except the last one, which is
                                       chopped off before decoding. b = batch_size,
                                       h = hidden size.
        """
        # Chop off the <END> token for max length sentences.
        target_padded = target_padded[:-1]

        # Initialize the decoder state (hidden and cell)
        dec_state = dec_init_state

        # Initialize previous combined output vector o_{t-1} as zero
        batch_size = enc_hiddens.size(0)
        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

        # Collects the combined output o_t of every step
        combined_outputs = []

        # Attention projection is the same for every step, so compute it once: (b, src_len, h).
        enc_hiddens_proj = self.att_projection(enc_hiddens)
        # Y: (tgt_len - 1, b, e)
        Y = self.model_embeddings.target(target_padded)

        for Y_t in Y.split(1, dim=0):
            # (1, b, e) -> (b, e); squeeze only dim 0 so a batch of size 1 survives.
            Y_t_squeezed = Y_t.squeeze(dim=0)
            # Decoder input [y_t ; o_{t-1}]: (b, e + h)
            Ybar_t = torch.cat((Y_t_squeezed, o_prev), dim=1)
            dec_state, o_t, _ = self.step(Ybar_t, dec_state, enc_hiddens,
                                          enc_hiddens_proj, enc_masks)
            combined_outputs.append(o_t)
            o_prev = o_t

        # List of (b, h) tensors -> one tensor with the time dimension first.
        combined_outputs = torch.stack(combined_outputs, dim=0)
        return combined_outputs

    def step(
            self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor, torch.Tensor],
            enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor
    ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h).
                                The input for the decoder, where b = batch size,
                                e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h).
                                First tensor is decoder's prev hidden state, second tensor is
                                decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2).
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2)
                                to h. Tensor is with shape (b, src_len, h).
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len), non-zero at
                                padding positions; may be None to disable masking.

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h).
                                First tensor is decoder's new hidden state, second tensor is
                                decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h).
        @returns e_t (Tensor): Tensor of shape (b, src_len) holding the attention scores
                                (set to -inf at masked positions). Returned for sanity
                                checking only.
        """
        dec_state = self.decoder(Ybar_t, dec_state)
        dec_hidden, dec_cell = dec_state

        # Multiplicative attention scores:
        # (b, src_len, h) x (b, h, 1) -> (b, src_len, 1) -> (b, src_len)
        e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2)

        # Set e_t to -inf where enc_masks has 1, so padding gets zero attention weight.
        # The mask must be boolean: uint8 masks are deprecated for masked_fill_.
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

        # Attention distribution over source positions: (b, src_len)
        alpha_t = F.softmax(e_t, dim=1)
        # Attention output: (b, 1, src_len) x (b, src_len, 2h) -> (b, 1, 2h) -> (b, 2h)
        a_t = alpha_t.unsqueeze(1).bmm(enc_hiddens).squeeze(1)
        # U_t = [a_t ; h_t^dec]: (b, 3h)
        U_t = torch.cat((a_t, dec_hidden), dim=1)
        V_t = self.combined_output_projection(U_t)
        O_t = self.dropout(torch.tanh(V_t))

        combined_output = O_t
        return dec_state, combined_output, e_t

    def generate_sent_masks(self, enc_hiddens: torch.Tensor,
                            source_lengths: List[int]) -> torch.Tensor:
        """ Generate sentence masks for encoder hidden states.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size.
        @param source_lengths (List[int]): List of actual lengths for each of the sentences in
                                     the batch.

        @returns enc_masks (Tensor): Float tensor of sentence masks of shape (b, src_len),
                                     1 at padding positions and 0 elsewhere.
        """
        enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float)
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1
        return enc_masks.to(self.device)

    def beam_search(self, src_sent: List[str], beam_size: int = 5,
                    max_decoding_time_step: int = 70) -> "List[Hypothesis]":
        """ Given a single source sentence, perform beam search, yielding translations in the
        target language.

        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to unroll the
                                             decoding RNN
        @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two
                fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        vocab_size = len(self.vocab.tgt)

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            # Share the single source encoding across all live hypotheses.
            exp_src_encodings = src_encodings.expand(
                hyp_num, src_encodings.size(1), src_encodings.size(2))
            exp_src_encodings_att_linear = src_encodings_att_linear.expand(
                hyp_num, src_encodings_att_linear.size(1), src_encodings_att_linear.size(2))

            y_tm1 = torch.tensor([self.vocab.tgt[hyp[-1]] for hyp in hypotheses],
                                 dtype=torch.long, device=self.device)
            y_t_embed = self.model_embeddings.target(y_tm1)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            (h_t, cell_t), att_t, _ = self.step(
                x, h_tm1, exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None)

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            # Flatten (hyp_num, vocab) scores so topk ranks every (hypothesis, word) pair.
            continuing_hyp_scores = (
                hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(
                continuing_hyp_scores, k=live_hyp_num)

            # Recover (hypothesis, word) from the flat position. This has to be an integer
            # floor division: `/` on an integer tensor yields floats, which cannot be used
            # to index the Python list of hypotheses below.
            prev_hyp_ids = top_cand_hyp_pos // vocab_size
            hyp_word_ids = top_cand_hyp_pos % vocab_size

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(
                    prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    # Finished: strip the `<s>` / `</s>` markers.
                    completed_hypotheses.append(
                        Hypothesis(value=new_hyp_sent[1:-1], score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            # Keep only the decoder states that belong to surviving hypotheses.
            live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device)

        # Nothing finished within the step budget: fall back to the best partial hypothesis.
        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(
                Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

        return completed_hypotheses

    @property
    def device(self) -> torch.device:
        """ Determine which device to place the Tensors upon, CPU or GPU.
        """
        return self.model_embeddings.source.weight.device

    @staticmethod
    def load(model_path: str):
        """ Load the model from a file.

        @param model_path (str): path to model
        @returns model (NMT): model rebuilt from the stored args, vocab and state dict
        """
        # NOTE(review): torch.load unpickles arbitrary objects (the checkpoint stores the
        # Vocab instance), so only load checkpoints from a trusted source.
        params = torch.load(model_path, map_location=lambda storage, loc: storage)
        args = params['args']
        model = NMT(vocab=params['vocab'], **args)
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ Save the model to a file.

        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args': dict(embed_size=self.model_embeddings.embed_size,
                         hidden_size=self.hidden_size,
                         dropout_rate=self.dropout_rate),
            'vocab': self.vocab,
            'state_dict': self.state_dict()
        }

        torch.save(params, path)
class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidirectional LSTM Encoder
        - Unidirectional LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)

    Two optional extensions are selected at construction time:
        - use_pos_embed: source-position embeddings (looked up in the target embedding
          table) are concatenated to the encoder hidden states before attention.
        - use_copy: the attention distribution, zero-padded to `max_src_len`, is appended
          to the input of the combined-output projection.
    """

    def __init__(self, src_embed_size, dst_embed_size, hidden_size, vocab, dropout_rate=0.2,
                 use_pos_embed=False, use_copy=False, max_src_len=48):
        """ Init NMT Model.

        @param src_embed_size (int): encoder Embedding size (dimensionality)
        @param dst_embed_size (int): decoder Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        @param use_pos_embed (bool): concatenate source-position embeddings to the encoder states
        @param use_copy (bool): feed the (padded) attention distribution to the output projection
        @param max_src_len (int): width the attention distribution is padded to when use_copy is set
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(src_embed_size, dst_embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab
        self.use_pos_embed = use_pos_embed
        self.use_copy = use_copy
        self.max_src_len = max_src_len

        # Bidirectional encoder; decoder input is [target embedding ; previous combined output].
        self.encoder = nn.LSTM(src_embed_size, hidden_size, num_layers=1, bias=True, bidirectional=True)
        self.decoder = nn.LSTMCell(dst_embed_size + hidden_size, hidden_size, bias=True)
        # W_h / W_c: project the concatenated final encoder states to the decoder's initial state.
        self.h_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
        self.c_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
        # W_attProj: with position embeddings the attended vectors are 2h + dst_embed_size wide.
        att_in_features = 2 * hidden_size + (dst_embed_size if use_pos_embed else 0)
        self.att_projection = nn.Linear(att_in_features, hidden_size, bias=False)
        # W_u: sized to exactly what `step` concatenates, including when both options are on.
        self.combined_output_projection = nn.Linear(
            self._combined_input_size(hidden_size, dst_embed_size, use_pos_embed, use_copy, max_src_len),
            hidden_size, bias=False)
        # W_vocab
        self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False)
        self.dropout = nn.Dropout(p=dropout_rate)

    @staticmethod
    def _combined_input_size(hidden_size, dst_embed_size, use_pos_embed, use_copy, max_src_len):
        """ Width of the vector `step` feeds to `combined_output_projection`.

        The base is decoder hidden (h) + attention output (2h). Position embeddings widen
        the attention output by dst_embed_size; copy mode appends max_src_len attention
        weights. The two options are additive.
        """
        size = 3 * hidden_size
        if use_pos_embed:
            size += dst_embed_size
        if use_copy:
            size += max_src_len
        return size

    @staticmethod
    def _split_candidate_positions(positions, vocab_size):
        """ Split flat (hypothesis * vocab_size + word) indices into integer (hypothesis, word) ids.

        Floor division is used on purpose: `/` on integer tensors is true division and
        yields floats, which cannot index the Python list of hypotheses.
        """
        return positions // vocab_size, positions % vocab_size

    def forward(self, source: torch.Tensor, source_lengths: List[int], target: torch.Tensor) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        @param source (Tensor): padded source sentences with shape (src_len, b)
        @param source_lengths (List[int]): list of source sentence lengths
        @param target (Tensor): padded target sentences with shape (tgt_len, b)

        @returns scores (Tensor): a variable/tensor of shape (b, ) representing the
                                    log-likelihood of generating the gold-standard target sentence for
                                    each example in the input batch. Here b = batch size.
        """
        enc_hiddens, dec_init_state = self.encode(source, source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target)
        P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1)

        # Zero out, probabilities for which we have nothing in the target text
        target_masks = (target != self.vocab.dst_pad_token_idx).float()

        # Compute log probability of generating true target words
        # (position t of P predicts target token t + 1, hence the [1:] shift).
        target_gold_words_log_prob = torch.gather(
            P, index=target[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores

    def encode(self, source_padded: torch.Tensor, source_lengths: List[int]) -> Tuple[
            torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
            Additionally, take the final states of the encoder and project them to obtain initial states for decoder.

        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where
                                        b = batch_size, src_len = maximum source sentence length. Note that
                                       these have already been sorted in order of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch
        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
                                        b = batch size, src_len = maximum source sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
                                                hidden state and cell.
        """
        X = self.model_embeddings.source(source_padded)
        packed_X = torch.nn.utils.rnn.pack_padded_sequence(X, source_lengths)
        packed_hiddens, (last_hidden, last_cell) = self.encoder(packed_X)
        # batch_first=True unpacks straight into the (b, src_len, 2h) layout the decoder expects.
        enc_hiddens, _ = torch.nn.utils.rnn.pad_packed_sequence(packed_hiddens, batch_first=True)

        # Index 0 / 1 of the final states are the forward / backward directions.
        init_decoder_hidden = self.h_projection(torch.cat([last_hidden[0], last_hidden[1]], 1))
        init_decoder_cell = self.c_projection(torch.cat([last_cell[0], last_cell[1]], 1))
        dec_init_state = (init_decoder_hidden, init_decoder_cell)

        return enc_hiddens, dec_init_state

    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
               dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor:
        """Compute combined output vectors for a batch.

        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
                                     b = batch size, src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where
                                       tgt_len = maximum target sentence length, b = batch size.

        @returns combined_outputs (Tensor): combined output tensor  (tgt_len - 1, b,  h), where
                                        tgt_len = maximum target sentence length, b = batch_size,  h = hidden size
        """
        # Chop of the <END> token for max length sentences.
        target_padded = target_padded[:-1]

        # Initialize the decoder state (hidden and cell)
        dec_state = dec_init_state

        # Initialize previous combined output vector o_{t-1} as zero
        batch_size = enc_hiddens.size(0)
        src_len = enc_hiddens.size(1)
        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

        # With position embeddings, attention runs over [enc_hiddens ; position embedding].
        combined_inputs = enc_hiddens
        if self.use_pos_embed:
            input_pos_embed = self.create_input_pos_embed(src_len, batch_size)
            combined_inputs = torch.cat((enc_hiddens, input_pos_embed), dim=2)
        enc_hiddens_proj = self.att_projection(combined_inputs)

        Y = self.model_embeddings.target(target_padded)

        # Collect the combined output o_t of every step.
        combined_outputs = []
        for Y_t in torch.split(Y, 1, dim=0):
            Y_t = torch.squeeze(Y_t, dim=0)
            Ybar_t = torch.cat([Y_t, o_prev], dim=1)
            dec_state, o_t, _ = self.step(Ybar_t, dec_state, combined_inputs, enc_hiddens_proj, enc_masks)
            combined_outputs.append(o_t)
            o_prev = o_t

        return torch.stack(combined_outputs, dim=0)

    def step(self, Ybar_t: torch.Tensor,
             dec_state: Tuple[torch.Tensor, torch.Tensor],
             enc_hiddens: torch.Tensor,
             enc_hiddens_proj: torch.Tensor,
             enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor (+ input_pos_embed if use_pos_embed),
                                    with shape (b, src_len, h * 2 (+ dst_embed_size)), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected to h.
                                    Tensor is with shape (b, src_len, h), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length. May be None.

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution
                                (before softmax, with masked positions set to -inf).
        """
        dec_hidden, dec_cell = self.decoder(Ybar_t, dec_state)
        dec_state = (dec_hidden, dec_cell)

        # Multiplicative attention scores: one dot product per source position.
        e_t = torch.bmm(enc_hiddens_proj, dec_hidden.unsqueeze(2)).squeeze(2)

        # Set e_t to -inf where enc_masks has 1
        if enc_masks is not None:
            e_t = e_t.masked_fill(enc_masks.bool(), -float('inf'))

        alpha_t = F.softmax(e_t, dim=1)
        a_t = torch.bmm(alpha_t.unsqueeze(1), enc_hiddens).squeeze(1)

        parts = [dec_hidden, a_t]
        if self.use_copy:
            # attention based copy: expose the attention weights themselves, right-padded
            # with zeros so the projection always sees max_src_len of them.
            pad_width = self.max_src_len - alpha_t.size(1)
            if pad_width:
                alpha_t = torch.cat([alpha_t, alpha_t.new_zeros(alpha_t.size(0), pad_width)], 1)
            parts.append(alpha_t)
        U_t = torch.cat(parts, 1)

        V_t = self.combined_output_projection(U_t)
        combined_output = self.dropout(torch.tanh(V_t))

        return dec_state, combined_output, e_t

    def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor:
        """ Generate sentence masks for encoder hidden states.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size.
        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.

        @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len),
                                    with 1 at padded positions and 0 elsewhere.
        """
        enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1),
                                dtype=torch.float, device=self.device)
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1
        return enc_masks

    def beam_search(self, src_sent: torch.Tensor, src_sent_len: int, beam_size: int = 5,
                    max_decoding_time_step: int = 30) -> List["Hypothesis"]:
        """ Given a single source sentence, perform beam search, yielding translations in the target language.

        @param src_sent (Tensor): a single source sentence (word indices) with shape (sentence len, )
        @param src_sent_len (int): src_sent length
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
        @returns hypotheses (List[Hypothesis]): a list of hypothesis, sorted by descending score;
                each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = torch.unsqueeze(src_sent, 1)  # (src_len, 1): a batch of one
        src_encodings, dec_init_vec = self.encode(src_sents_var, [src_sent_len])

        combined_inputs = src_encodings
        if self.use_pos_embed:
            input_pos_embed = self.create_input_pos_embed(src_sent_len, 1)
            combined_inputs = torch.cat((src_encodings, input_pos_embed), dim=2)
        src_encodings_att_linear = self.att_projection(combined_inputs)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        tgt_vocab_size = len(self.vocab.tgt)

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            # Share the single source encoding across all live hypotheses (expand makes no copy).
            exp_src_encodings = combined_inputs.expand(hyp_num,
                                                       combined_inputs.size(1),
                                                       combined_inputs.size(2))
            exp_src_encodings_att_linear = src_encodings_att_linear.expand(hyp_num,
                                                                           src_encodings_att_linear.size(1),
                                                                           src_encodings_att_linear.size(2))

            y_tm1 = torch.tensor([self.vocab.tgt.stoi[hyp[-1]] for hyp in hypotheses],
                                 dtype=torch.long, device=self.device)
            y_t_embed = self.model_embeddings.target(y_tm1)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            (h_t, cell_t), att_t, _ = self.step(x, h_tm1,
                                                exp_src_encodings, exp_src_encodings_att_linear,
                                                enc_masks=None)

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            contiuating_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(contiuating_hyp_scores, k=live_hyp_num)

            prev_hyp_ids, hyp_word_ids = self._split_candidate_positions(top_cand_hyp_pos, tgt_vocab_size)

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids,
                                                                    top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.itos[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    # Strip the leading <s> and trailing </s> from the finished sentence.
                    completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1],
                                                           score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            # Keep only the decoder states that belong to surviving hypotheses.
            live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device)

        if len(completed_hypotheses) == 0:
            # Nothing reached </s> within the step budget: fall back to the best partial hypothesis.
            completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:],
                                                   score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

        return completed_hypotheses

    @property
    def device(self) -> torch.device:
        """ Determine which device to place the Tensors upon, CPU or GPU.
        """
        return self.model_embeddings.source.weight.device

    @staticmethod
    def load(model_path: str, use_pos_embed: bool, use_copy: bool):
        """ Load the model from a file.

        @param model_path (str): path to model
        @param use_pos_embed (bool): must match the value the saved model was built with
        @param use_copy (bool): must match the value the saved model was built with
        @returns model (NMT): model rebuilt from the saved constructor args, vocab and weights
        """
        # NOTE(review): the checkpoint stores a pickled vocab object next to the weights, so
        # only load files from a trusted source. Recent PyTorch releases changed the
        # `weights_only` default of torch.load -- confirm against the installed version.
        params = torch.load(model_path, map_location=lambda storage, loc: storage)
        args = params['args']
        model = NMT(vocab=params['vocab'], use_pos_embed=use_pos_embed, use_copy=use_copy, **args)
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ Save the model to a file.

        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            # max_src_len fixes the width of combined_output_projection in copy mode, so it
            # has to travel with the weights; checkpoints without it fall back to the default.
            'args': dict(src_embed_size=self.model_embeddings.src_embed_size,
                         dst_embed_size=self.model_embeddings.dst_embed_size,
                         hidden_size=self.hidden_size,
                         dropout_rate=self.dropout_rate,
                         max_src_len=self.max_src_len),
            'vocab': self.vocab,
            'state_dict': self.state_dict()
        }

        torch.save(params, path)

    def load_pretrained_embeddings(self, vocab: "Vocab"):
        """ Overwrite the source embedding table in place with `vocab.src.vectors`.

        @param vocab (Vocab): vocabulary whose `src.vectors` holds the pretrained embeddings
        """
        self.model_embeddings.source.weight.data.copy_(vocab.src.vectors)

    def create_input_pos_embed(self, src_len, batch_size):
        """ Build position embeddings for source positions 0 .. src_len - 1.

        Each position index is looked up as the token str(i) in the target vocabulary and
        embedded with the target embedding table.

        @param src_len (int): number of source positions
        @param batch_size (int): number of times the position row is repeated
        @returns input_embed (Tensor): shape (batch_size, src_len, dst_embed_size)
        """
        pos_list = [[self.vocab.tgt.stoi[str(i)] for i in range(src_len)]]
        input_positions = torch.tensor(pos_list, dtype=torch.long, device=self.device)  # [1, src_len]
        input_positions = input_positions.repeat(batch_size, 1)  # [batch_size, src_len]
        return self.model_embeddings.target(input_positions)  # [batch_size, src_len, dst_embed_size]
class TransformerNMT(nn.Module):
    """ Transformer encoder-decoder translation model built on torch.nn.Transformer{Encoder,Decoder}.

    `forward` scores gold target sentences; it returns per-sentence log-likelihoods.
    """

    def __init__(self, vocab, embed_size=512, num_hidden_layers=6, num_attention_heads=8,
                 fc_size=2048, dropout_rate=0.1):
        """ Init the Transformer model.

        @param vocab (Vocab): Vocabulary object containing src and tgt languages
        @param embed_size (int): embedding size, also used as the Transformer d_model
        @param num_hidden_layers (int): number of encoder layers and of decoder layers
        @param num_attention_heads (int): attention heads per layer
        @param fc_size (int): width of the feed-forward sub-layer
        @param dropout_rate (float): dropout probability
        """
        super(TransformerNMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.pos_encoder = PositionalEncoding(embed_size, dropout_rate)
        self.vocab = vocab
        self.device = None  # updated by `to()`
        self.d_model = embed_size
        self.encoder = torch.nn.TransformerEncoder(
            torch.nn.TransformerEncoderLayer(
                d_model=embed_size,
                nhead=num_attention_heads,
                dim_feedforward=fc_size,
                dropout=dropout_rate
            ),
            num_hidden_layers
        )
        self.decoder = torch.nn.TransformerDecoder(
            torch.nn.TransformerDecoderLayer(
                d_model=embed_size,
                nhead=num_attention_heads,
                dim_feedforward=fc_size,
                dropout=dropout_rate
            ),
            num_hidden_layers
        )
        self.tgt_mask = None  # causal mask, cached between calls with the same target length
        self.target_vocab_projection = nn.Linear(embed_size, len(vocab.tgt))

    def _generate_square_subsequent_mask(self, sz):
        """ Build the (sz, sz) causal mask: 0.0 on and below the diagonal, -inf above it.

        @param sz (int): sequence length
        @returns mask (Tensor): float tensor of shape (sz, sz)
        """
        return torch.triu(torch.full((sz, sz), float('-inf')), diagonal=1)

    def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor:
        """ Compute the log-likelihood of each gold target sentence given its source sentence.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens
        @returns scores (Tensor): tensor of shape (b, ), the summed log-probability of the
                                  non-pad target tokens of each example. Here b = batch size.
        """
        # Follow the parameters rather than `self.device`, which is only set by `to()`.
        device = self.target_vocab_projection.weight.device

        source_padded = self.vocab.src.to_input_tensor(source, device=device)  # Tensor: (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(target, device=device)  # Tensor: (tgt_len, b)

        # Embeddings are scaled by sqrt(d_model) before the positional encoding is applied.
        scale = math.sqrt(self.d_model)
        X = self.pos_encoder(self.model_embeddings.source(source_padded) * scale)
        Y = self.pos_encoder(self.model_embeddings.target(target_padded) * scale)

        # True marks a <pad> position; the key-padding arguments want shape (b, len).
        src_is_pad = source_padded == self.vocab.src['<pad>']
        tgt_is_pad = target_padded == self.vocab.tgt['<pad>']

        tgt_len = Y.size(0)
        if (self.tgt_mask is None or self.tgt_mask.size(0) != tgt_len
                or self.tgt_mask.device != Y.device):
            self.tgt_mask = self._generate_square_subsequent_mask(tgt_len).to(Y.device)

        # Mask <pad> inside the encoder too, so real source tokens never attend to padding
        # and a sentence's encoding does not depend on how much its batch was padded.
        memory = self.encoder(X, src_key_padding_mask=src_is_pad.T)
        output = self.decoder(tgt=Y,
                              memory=memory,
                              memory_key_padding_mask=src_is_pad.T,
                              tgt_mask=self.tgt_mask,
                              tgt_key_padding_mask=tgt_is_pad.T)

        out_probs = F.log_softmax(self.target_vocab_projection(output), dim=-1)

        # Position t predicts target token t + 1; pad positions contribute zero.
        tgt_keep = (~tgt_is_pad).float()
        target_gold_words_log_prob = torch.gather(
            out_probs[:-1], index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * tgt_keep[1:]
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores

    def save(self, path: str):
        """ Save the model to a file.

        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'vocab': self.vocab,
            'state_dict': self.state_dict()
        }

        torch.save(params, path)

    def to(self, device):
        """ Move the model to `device` and remember it in `self.device`.

        @param device: target device, forwarded to nn.Module.to
        @returns self
        """
        super().to(device)
        self.device = device
        return self
class NMT(nn.Module): """ Simple Neural Machine Translation Model: - Bidrectional LSTM Encoder - Unidirection LSTM Decoder - Global Attention Model (Luong, et al. 2015) """ def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2): """ Init NMT Model. @param embed_size (int): Embedding size (dimensionality) @param hidden_size (int): Hidden Size (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention """ super(NMT, self).__init__() self.model_embeddings = ModelEmbeddings(embed_size, vocab) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab # default values self.encoder = None self.decoder = None self.h_projection = None self.c_projection = None self.att_projection = None self.combined_output_projection = None self.target_vocab_projection = None self.dropout = None ### YOUR CODE HERE (~8 Lines) ### TODO - Initialize the following variables: ### self.encoder (Bidirectional LSTM with bias); # our input size is based around the number of features * embeddings usually, but slightly different here # for LSTM we are inputting a sentence of size m # each word in the sentence is converted to an embedding of size e # this embedding of size e is then fed in sequentially, so input size is just size of embedding. # I think we just want one layer for the encoder, which is default # also need to add dropout self.encoder = nn.LSTM(input_size=embed_size, hidden_size=self.hidden_size, num_layers=1, bias=True, bidirectional=True) ### self.decoder (LSTM Cell with bias) # decoder is initialized with a linear projection of hidden state & cell state from encoder... 
# from what I am reading the output of encoder should be a single vector rather than a series out outputs # specifically, our input to decoder is just the final hidden state of encoder # we are also going to be feeding in the associated word embedding....which is of size e. # so we concatenate the final hidden state of encoder + word embedding e.... # I am making a guess dimensions of hidden layer size for decoder is still hidden size input to class.... # helpful reading: https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html self.decoder = nn.LSTMCell(input_size=self.hidden_size + embed_size, hidden_size=self.hidden_size, bias=True) ### What are these linear layers actually doing? # These are building a fully connected layer # finding a way to convert encoder's final hidden state to proper dimensions for decoder hidden state? # Same with the cell states. ### self.h_projection (Linear Layer with no bias), called W_{h} in the PDF. # Applies a linear transformation to the incoming data: y = xA^T + b # Based on equation 3 in assignment I think input is 2 * hidden_size, output is hidden_size self.h_projection = nn.Linear(in_features=2 * self.hidden_size, out_features=self.hidden_size, bias=False) ### self.c_projection (Linear Layer with no bias), called W_{c} in the PDF. ## same as above: equation 4 shows that this reducing size self.c_projection = nn.Linear(in_features=2 * self.hidden_size, out_features=self.hidden_size, bias=False) #### Attention is happening after decoder input # input to attention is hidden_encoding, which is 2 * hidden size. output is m * 1.... ### self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF. # equations 7 - 9 show whats happening here...... 
# equation 7 shows theinput to attention projection is h_enc which is of size 2 * hidden_size # equation 8 shows the output of vector h x 2h for W_attproj self.att_projection = nn.Linear(in_features=2 * self.hidden_size, out_features=self.hidden_size, bias=False) ### self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF. # equation 10 - 11 show us the size here: # we concatenate attention output + decoder hidden state; output is size hidden state self.combined_output_projection = nn.Linear( in_features=3 * self.hidden_size, out_features=self.hidden_size, bias=False) ### self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF. # W_vocab is Vt x h, so hidden input, Vt length utput # the vocab.tgt is going to spit out entry for target language..source could be used also\ # should be equivalent to model_embeddings weight shape....one of the aspects of that matrix. self.target_vocab_projection = nn.Linear(in_features=self.hidden_size, out_features=len( self.vocab.tgt), bias=False) ### self.dropout (Dropout Layer) # equation 12: dropout weight vector is h *1, so input is 1, and output is hidden_size self.dropout = nn.Dropout(dropout_rate) ### ### Use the following docs to properly initialize these variables: ### LSTM: ### https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM ### LSTM Cell: ### https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell ### Linear Layer: ### https://pytorch.org/docs/stable/nn.html#torch.nn.Linear ### Dropout Layer: ### https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout self.dropout = nn.Dropout(dropout_rate) ### END YOUR CODE def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor: """ Take a mini-batch of source and target sentences, compute the log-likelihood of target sentences under the language models learned by the NMT system. 
@param source (List[List[str]]): list of source sentence tokens @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>` @returns scores (Tensor): a variable/tensor of shape (b, ) representing the log-likelihood of generating the gold-standard target sentence for each example in the input batch. Here b = batch size. """ # Compute sentence lengths source_lengths = [len(s) for s in source] # Convert list of lists into tensors source_padded = self.vocab.src.to_input_tensor( source, device=self.device) # Tensor: (src_len, b) target_padded = self.vocab.tgt.to_input_tensor( target, device=self.device) # Tensor: (tgt_len, b) ### Run the network forward: ### 1. Apply the encoder to `source_padded` by calling `self.encode()` ### 2. Generate sentence masks for `source_padded` by calling `self.generate_sent_masks()` ### 3. Apply the decoder to compute combined-output by calling `self.decode()` ### 4. Compute log probability distribution over the target vocabulary using the ### combined_outputs returned by the `self.decode()` function. enc_hiddens, dec_init_state = self.encode(source_padded, source_lengths) enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths) combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded) P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1) # Zero out, probabilities for which we have nothing in the target text target_masks = (target_padded != self.vocab.tgt['<pad>']).float() # Compute log probability of generating true target words target_gold_words_log_prob = torch.gather( P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:] scores = target_gold_words_log_prob.sum(dim=0) return scores def encode( self, source_padded: torch.Tensor, source_lengths: List[int] ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """ Apply the encoder to source sentences to obtain encoder hidden states. 
Additionally, take the final states of the encoder and project them to obtain initial states for decoder. @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where b = batch_size, src_len = maximum source sentence length. Note that these have already been sorted in order of longest to shortest sentence. @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial hidden state and cell. """ enc_hiddens, dec_init_state = None, None ### YOUR CODE HERE (~ 8 Lines) ### TODO: ### 1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings. ### src_len = maximum source sentence length, b = batch size, e = embedding size. Note ### that there is no initial hidden state or cell for the decoder. # model embeddings were initialized as 'self.model_embeddings', just need to read in & specify source # looking at other solution it looks like I need to pass in the source_padded....I am not super clear on this # Here is my interpretation: # - We create an instance of ModelEmbeddings during __init__ based on our vocab & expected embedding size # - However, we have not yet passed any of the info in yet # - We limit to our instance.source, which is an instance of nn.embedding: https://pytorch.org/docs/stable/nn.html#torch.nn.Embedding # - We then need to pass in our tensor of padded source sentences, which are (src_len, b) # - The embedding size (e) is already built into our nn.embedding instance, so output will account for this # - Final shape will be (src_len, b, e) X = self.model_embeddings.source(source_padded) ### 2. 
Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`. ### - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X. ### - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens. ### - Note that the shape of the tensor returned by the encoder is (src_len b, h*2) and we want to ### return a tensor of shape (b, src_len, h*2) as `enc_hiddens`. # start by padding X: https://pytorch.org/cppdocs/api/function_namespacetorch_1_1nn_1_1utils_1_1rnn_1a6c14a90e57eb631f51f06e52a600f7f7.html # expects an input matrix along with lengths of each sentence X = pack_padded_sequence(X, source_lengths) # now we can apply the encoder: self.encoder # this is our instance of the LSTM for encoder, defined during __init__ # returns: output, (hidden, cell) # (last_hidden, last_cell); unpacking for now since step 3 seems to need this enc_hiddens, (last_hidden, last_cell) = self.encoder(X) # finally, we need to apply to pad_packed_sequence function to enc_hideens # https://pytorch.org/cppdocs/api/function_namespacetorch_1_1nn_1_1utils_1_1rnn_1aea13e54d273ec16fa8288e2262c28f30.html # https://stackoverflow.com/questions/51030782/why-do-we-pack-the-sequences-in-pytorch # great overview above - in short its going to help us make more computationally efficient moves # The returned Tensor’s data will be of size T x B x *, where T is the length of the longest sequence and B is the batchsize. # we don't need to keep the tensor of lengths, only need to actual tensor data enc_hiddens, _ = pad_packed_sequence(enc_hiddens) ### - Note that the shape of the tensor returned by the encoder is (src_len b, h*2) and we want to ### return a tensor of shape (b, src_len, h*2) as `enc_hiddens`. ### permute makes it easy to just rearrange the order of data: ## we have: (src_len, b, h*2) and we want: (b, src_len, h*2).... ## x.permute(1, 0, 2) enc_hiddens = enc_hiddens.permute(1, 0, 2) ### 3. 
Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell): ### - `init_decoder_hidden`: ### `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards. ### Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h). ### Apply the h_projection layer to this in order to compute init_decoder_hidden. ### This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size ### - `init_decoder_cell`: ### `last_cell` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards. ### Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h). ### Apply the c_projection layer to this in order to compute init_decoder_cell. ### This is c_0^{dec} in the PDF. Here b = batch size, h = hidden size ### # we want to go from (2,b,h) -> (b, 2*h); can do this with concatenation # I believe we want dim = 1 to stack cols cat_hidden = torch.cat((last_hidden[0], last_hidden[1]), 1) # now we pass this value into our h_projection layer init_decoder_hidden = self.h_projection(cat_hidden) # compute last cell, again: we want to go from (2,b,h) -> (b, 2*h) cat_cell = torch.cat((last_cell[0], last_cell[1]), 1) # Apply the c_projection layer to this in order to compute init_decoder_cell. 
init_decoder_cell = self.c_projection(cat_cell) # make a tuple dec_init_state = (init_decoder_hidden, init_decoder_cell) ### See the following docs, as you may need to use some of the following functions in your implementation: ### Pack the padded sequence X before passing to the encoder: ### https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pack_padded_sequence ### Pad the packed sequence, enc_hiddens, returned by the encoder: ### https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pad_packed_sequence ### Tensor Concatenation: ### https://pytorch.org/docs/stable/torch.html#torch.cat ### Tensor Permute: ### https://pytorch.org/docs/stable/tensors.html#torch.Tensor.permute ### END YOUR CODE return enc_hiddens, dec_init_state def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor, dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor: """Compute combined output vectors for a batch. @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where b = batch size, src_len = maximum source sentence length. @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where tgt_len= maximum target sentence length, b = batch size. @returns combined_outputs (Tensor): combined output tensor (tgt_len, b, h), where tgt_len = maximum target sentence length, b = batch_size, h = hidden size """ #### David's interpretation of this step: # We first remove the end token (for training? 
# initialize the decoder state as the init state, which is the final hidden / cell state of the Encoder # the o_prev is just going to represent combined-output vector, which is the concatenation of the attention + decoder hidden state # Note: in the next step we will use decoder hidden state to calculate the attention -- happens in `.step()` # for a target sentence in our batch we will iterate through each target word, going through and: # - passing in the word embeddings for target # - getting updated decoder hidden state & attention vectors # - outputting the o_prev, which will eventually be passed through to build out the probability distribution over target words # Chop of the <END> token for max length sentences. target_padded = target_padded[:-1] # Initialize the decoder state (hidden and cell) dec_state = dec_init_state # Initialize previous combined output vector o_{t-1} as zero --> we start with 0s batch_size = enc_hiddens.size(0) o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device) # Initialize a list we will use to collect the combined output o_t on each step combined_outputs = [] ### YOUR CODE HERE (~9 Lines) ### TODO: ### 1. Apply the attention projection layer to `enc_hiddens` to obtain `enc_hiddens_proj`, ### which should be shape (b, src_len, h), ### where b = batch size, src_len = maximum source length, h = hidden size. ### This is applying W_{attProj} to h^enc, as described in the PDF. enc_hiddens_proj = self.att_projection(enc_hiddens) ### 2. Construct tensor `Y` of target sentences with shape (tgt_len, b, e) using the target model embeddings. ### where tgt_len = maximum target sentence length, b = batch size, e = embedding size. # we need to convert the target language sentencesinto embeddings. Y = self.model_embeddings.target(target_padded) ### 3. Use the torch.split function to iterate over the time dimension of Y. ### Within the loop, this will give you Y_t of shape (1, b, e) where b = batch size, e = embedding size. 
### - Squeeze Y_t into a tensor of dimension (b, e). ### - Construct Ybar_t by concatenating Y_t with o_prev. ### - Use the step function to compute the the Decoder's next (cell, state) values ### as well as the new combined output o_t. ### - Append o_t to combined_outputs ### - Update o_prev to the new o_t. # going to separate ot tgt_len, b, e -> size of (1,b,e) for each word in the sentence (example in my testing notebook) for Y_t in torch.split(Y, 1): # squeeze Y_t into a tensor of dimension (b,e) --> this just removes the 1 in shape # For example, if input is of shape: (A \times 1 \times B \times C \times 1 \times D)(A×1×B×C×1×D) then the out tensor will be of shape: (A \times B \times C \times D)(A×B×C×D) . Y_t = torch.squeeze(Y_t) # Construct Ybar_t by concatenating Y_t with o_prev. # assuming we just dimension 1 again - will need to confirm Ybar_t = torch.cat((Y_t, o_prev), 1) # Use the step function to compute the the Decoder's next (cell, state) values # as well as the new combined output o_t. # e_t looks like it won't actually be used? just added for debugging # we wil get out updated dec_state (used in next iteration) as well as updated o_t, which is stored dec_state, o_t, e_t = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks) # Append o_t to combined_outputs combined_outputs.append(o_t) # Update o_prev to the new o_t. o_prev = o_t ### 4. Use torch.stack to convert combined_outputs from a list length tgt_len of ### tensors shape (b, h), to a single tensor shape (tgt_len, b, h) ### where tgt_len = maximum target sentence length, b = batch size, h = hidden size. # onto the torch.stack process # this can take a list as input and output a stacked tensor...unclear on how to handle dimensions here so using default. combined_outputs = torch.stack(combined_outputs) ### Note: ### - When using the squeeze() function make sure to specify the dimension you want to squeeze ### over. 
Otherwise, you will remove the batch dimension accidentally, if batch_size = 1. ### ### Use the following docs to implement this functionality: ### Zeros Tensor: ### https://pytorch.org/docs/stable/torch.html#torch.zeros ### Tensor Splitting (iteration): ### https://pytorch.org/docs/stable/torch.html#torch.split ### Tensor Dimension Squeezing: ### https://pytorch.org/docs/stable/torch.html#torch.squeeze ### Tensor Concatenation: ### https://pytorch.org/docs/stable/torch.html#torch.cat ### Tensor Stacking: ### https://pytorch.org/docs/stable/torch.html#torch.stack ### END YOUR CODE return combined_outputs def step( self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor, torch.Tensor], enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor, enc_masks: torch.Tensor ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]: """ Compute one forward step of the LSTM decoder, including the attention computation. @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder, where b = batch size, e = embedding size, h = hidden size. @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size. First tensor is decoder's prev hidden state, second tensor is decoder's prev cell. @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size, src_len = maximum source length, h = hidden size. @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h), where b = batch size, src_len = maximum source length, h = hidden size. @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len), where b = batch size, src_len is maximum source length. @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size. First tensor is decoder's new hidden state, second tensor is decoder's new cell. 
@returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size. @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution. Note: You will not use this outside of this function. We are simply returning this value so that we can sanity check your implementation. """ # The goal of this section is to build out the decoder hidden state as well as the attention vector combined_output = None ### YOUR CODE HERE (~3 Lines) ### TODO: ### 1. Apply the decoder to `Ybar_t` and `dec_state`to obtain the new dec_state. # the decoder is an instance of 'LSTMCell' -> outputs h_1 (hidden state) , c_1 (cell state) # i think I can just stored in `dec_state`, which will be (hidden_state, cell_state) dec_state = self.decoder(Ybar_t, dec_state) ### 2. Split dec_state into its two parts (dec_hidden, dec_cell) dec_hidden, dec_cell = dec_state ### 3. Compute the attention scores e_t, a Tensor shape (b, src_len). ### Note: b = batch_size, src_len = maximum source length, h = hidden size. ### # Attention equation is broken down in assignment via equations 7 - 9 # Equation 7: # we start by taking dec_hidden to compute multuplicative attention over each hidden unit from encode ### Hints: ### - dec_hidden is shape (b, h) and corresponds to h^dec_t in the PDF (batched) ### - enc_hiddens_proj is shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched). ### - Use batched matrix multiplication (torch.bmm) to compute e_t. # If input is a (b \times n \times m)(b×n×m) tensor, mat2 is a (b \times m \times p)(b×m×p) tensor, out will be a (b \times n \times p)(b×n×p) tensor. # dec_hidden: (b * h) # enc_hiddens_proj: (b, src_len, h) ### - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing. ### - When using the squeeze() function make sure to specify the dimension you want to squeeze ### over. 
Otherwise, you will remove the batch dimension accidentally, if batch_size = 1. ### # we want dec_hidden to be: (b * h * 1) ---> so must unsqueeze at 3 dimension (referenced as 2) dec_hidden_un = torch.unsqueeze( dec_hidden, 2) # should move us from (b*h) -> (b*h*1) # enc_hiddens is okay at: (b * src_len * h) # we want output e_t to be: (b, src_len), meaning we need to first get: (b, src_len, 1) # this would be (b * src_len * h) * (b * h * 1?) # can then squeeze output e_t = torch.bmm(enc_hiddens_proj, dec_hidden_un) # e_t shape is (b, src_len, 1) so need to squeeze to remove last output at the third dimension (referenced as 2 e_t = torch.squeeze(e_t, 2) ### Use the following docs to implement this functionality: ### Batch Multiplication: ### https://pytorch.org/docs/stable/torch.html#torch.bmm ### Tensor Unsqueeze: ### https://pytorch.org/docs/stable/torch.html#torch.unsqueeze ### Tensor Squeeze: ### https://pytorch.org/docs/stable/torch.html#torch.squeeze ### END YOUR CODE # Set e_t to -inf where enc_masks has 1 if enc_masks is not None: e_t.data.masked_fill_(enc_masks.bool(), -float('inf')) ### YOUR CODE HERE (~6 Lines) ### This is picking back up with the Equation 8 ### TODO: ### 1. Apply softmax to e_t to yield alpha_t: Equation 8 in pdf alpha_t = F.softmax(e_t, dim=1) ### 2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the ### attention output vector, a_t. - Equation 9 in the pdf #$$ Hints: ### - alpha_t is shape (b, src_len) ### - enc_hiddens is shape (b, src_len, 2h) ### - a_t should be shape (b, 2h) ### - You will need to do some squeezing and unsqueezing. ### Note: b = batch size, src_len = maximum source length, h = hidden size. ### # we need to get a_t as our output into (b,2h) # need alpha to go from (b,src_len) -> (b, 1, src_len) alpha_t_u = torch.unsqueeze(alpha_t, 1) # bmm a_t = torch.bmm(alpha_t_u, enc_hiddens) # we then need to unsqueeze output to go from b, 1, 2h -> b, 2h a_t = torch.squeeze(a_t, 1) ### 3. 
Concatenate dec_hidden with a_t to compute tensor U_t: Equation 10 U_t = torch.cat((a_t, dec_hidden), 1) ### 4. Apply the combined output projection layer to U_t to compute tensor V_t: Equation 11 V_t = self.combined_output_projection(U_t) ### 5. Compute tensor O_t by first applying the Tanh function and then the dropout layer: equation 12 O_t = self.dropout(torch.tanh(V_t)) ### ### Use the following docs to implement this functionality: ### Softmax: ### https://pytorch.org/docs/stable/nn.html#torch.nn.functional.softmax ### Batch Multiplication: ### https://pytorch.org/docs/stable/torch.html#torch.bmm ### Tensor View: ### https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view ### Tensor Concatenation: ### https://pytorch.org/docs/stable/torch.html#torch.cat ### Tanh: ### https://pytorch.org/docs/stable/torch.html#torch.tanh ### END YOUR CODE combined_output = O_t return dec_state, combined_output, e_t def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor: """ Generate sentence masks for encoder hidden states. @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size, src_len = max source length, h = hidden size. @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch. @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len), where src_len = max source length, h = hidden size. """ enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float) for e_id, src_len in enumerate(source_lengths): enc_masks[e_id, src_len:] = 1 return enc_masks.to(self.device) def beam_search(self, src_sent: List[str], beam_size: int = 5, max_decoding_time_step: int = 70) -> List[Hypothesis]: """ Given a single source sentence, perform beam search, yielding translations in the target language. 
@param src_sent (List[str]): a single source sentence (words) @param beam_size (int): beam size @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields: value: List[str]: the decoded target sentence, represented as a list of words score: float: the log-likelihood of the target sentence """ src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device) src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)]) src_encodings_att_linear = self.att_projection(src_encodings) h_tm1 = dec_init_vec att_tm1 = torch.zeros(1, self.hidden_size, device=self.device) eos_id = self.vocab.tgt['</s>'] hypotheses = [['<s>']] hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device) completed_hypotheses = [] t = 0 while len(completed_hypotheses ) < beam_size and t < max_decoding_time_step: t += 1 hyp_num = len(hypotheses) exp_src_encodings = src_encodings.expand(hyp_num, src_encodings.size(1), src_encodings.size(2)) exp_src_encodings_att_linear = src_encodings_att_linear.expand( hyp_num, src_encodings_att_linear.size(1), src_encodings_att_linear.size(2)) y_tm1 = torch.tensor( [self.vocab.tgt[hyp[-1]] for hyp in hypotheses], dtype=torch.long, device=self.device) y_t_embed = self.model_embeddings.target(y_tm1) x = torch.cat([y_t_embed, att_tm1], dim=-1) (h_t, cell_t), att_t, _ = self.step(x, h_tm1, exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None) # log probabilities over target words log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1) live_hyp_num = beam_size - len(completed_hypotheses) contiuating_hyp_scores = ( hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1) top_cand_hyp_scores, top_cand_hyp_pos = torch.topk( contiuating_hyp_scores, k=live_hyp_num) prev_hyp_ids = top_cand_hyp_pos / len(self.vocab.tgt) hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt) 
new_hypotheses = [] live_hyp_ids = [] new_hyp_scores = [] for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip( prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores): prev_hyp_id = prev_hyp_id.item() hyp_word_id = hyp_word_id.item() cand_new_hyp_score = cand_new_hyp_score.item() hyp_word = self.vocab.tgt.id2word[hyp_word_id] new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word] if hyp_word == '</s>': completed_hypotheses.append( Hypothesis(value=new_hyp_sent[1:-1], score=cand_new_hyp_score)) else: new_hypotheses.append(new_hyp_sent) live_hyp_ids.append(prev_hyp_id) new_hyp_scores.append(cand_new_hyp_score) if len(completed_hypotheses) == beam_size: break live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device) h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids]) att_tm1 = att_t[live_hyp_ids] hypotheses = new_hypotheses hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device) if len(completed_hypotheses) == 0: completed_hypotheses.append( Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item())) completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True) return completed_hypotheses @property def device(self) -> torch.device: """ Determine which device to place the Tensors upon, CPU or GPU. """ return self.model_embeddings.source.weight.device @staticmethod def load(model_path: str): """ Load the model from a file. @param model_path (str): path to model """ params = torch.load(model_path, map_location=lambda storage, loc: storage) args = params['args'] model = NMT(vocab=params['vocab'], **args) model.load_state_dict(params['state_dict']) return model def save(self, path: str): """ Save the odel to a file. @param path (str): path to the model """ print('save model parameters to [%s]' % path, file=sys.stderr) params = { 'args': dict(embed_size=self.model_embeddings.embed_size, hidden_size=self.hidden_size, dropout_rate=self.dropout_rate), 'vocab': self.vocab, 'state_dict': self.state_dict() } torch.save(params, path)
class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidrectional LSTM Encoder
        - Unidirection LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """

    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt
            languages. See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # Bidirectional LSTM with bias
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=self.hidden_size,
                               bidirectional=True,
                               bias=True)
        # LSTM Cell with bias; its input is [target embedding ; previous
        # combined output], hence e + h.
        self.decoder = nn.LSTMCell(input_size=embed_size + self.hidden_size,
                                   hidden_size=self.hidden_size,
                                   bias=True)
        # Linear Layer with no bias, W_{h}
        self.h_projection = nn.Linear(in_features=self.hidden_size * 2,
                                      out_features=self.hidden_size,
                                      bias=False)
        # Linear Layer with no bias, W_{c}
        self.c_projection = nn.Linear(in_features=self.hidden_size * 2,
                                      out_features=self.hidden_size,
                                      bias=False)
        # Linear Layer with no bias, W_{attProj}
        self.att_projection = nn.Linear(in_features=self.hidden_size * 2,
                                        out_features=self.hidden_size,
                                        bias=False)
        # Linear Layer with no bias, W_{u}
        self.combined_output_projection = nn.Linear(
            in_features=self.hidden_size * 3,
            out_features=self.hidden_size,
            bias=False)
        # Linear Layer with no bias, W_{vocab}
        self.target_vocab_projection = nn.Linear(
            in_features=self.hidden_size,
            out_features=len(vocab.tgt),
            bias=False)
        # Dropout Layer
        self.dropout = nn.Dropout(p=self.dropout_rate)

    def forward(self, source: List[List[str]],
                target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the
        log-likelihood of target sentences under the language models learned
        by the NMT system.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens,
            wrapped by `<s>` and `</s>`

        @returns scores (Tensor): a variable/tensor of shape (b, )
            representing the log-likelihood of generating the gold-standard
            target sentence for each example in the input batch.
            Here b = batch size.
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Convert list of lists into tensors
        source_padded = self.vocab.src.to_input_tensor(
            source, device=self.device)  # Tensor: (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(
            target, device=self.device)  # Tensor: (tgt_len, b)

        # Run the network forward: encode, build the source padding masks,
        # decode, then turn the combined outputs into log-probabilities over
        # the target vocabulary.
        enc_hiddens, dec_init_state = self.encode(source_padded,
                                                  source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks,
                                       dec_init_state, target_padded)
        P = F.log_softmax(self.target_vocab_projection(combined_outputs),
                          dim=-1)

        # Zero out probabilities for which we have nothing in the target text
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Compute log probability of generating true target words. The gold
        # word for decoder step t is target_padded[t + 1], hence the [1:].
        target_gold_words_log_prob = torch.gather(
            P, index=target_padded[1:].unsqueeze(-1),
            dim=-1).squeeze(-1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores

    def encode(
        self, source_padded: torch.Tensor, source_lengths: List[int]
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden
        states. Additionally, take the final states of the encoder and project
        them to obtain initial states for decoder.

        @param source_padded (Tensor): Tensor of padded source sentences with
            shape (src_len, b), where b = batch_size, src_len = maximum source
            sentence length. Note that these have already been sorted in
            order of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of
            the source sentences in the batch

        @returns enc_hiddens (Tensor): Tensor of hidden units with shape
            (b, src_len, h*2), where b = batch size, src_len = maximum source
            sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors
            representing the decoder's initial hidden state and cell.
        """
        # Step 1: embedding lookup, (src_len, b) -> (src_len, b, e).
        X = self.model_embeddings.source(source_padded)

        # Step 2: pack the batch so padded positions are skipped, run the
        # encoder (initial states default to zero), and unpack again.
        X = pack_padded_sequence(X, lengths=source_lengths)
        enc_hiddens, (last_hidden, last_cell) = self.encoder(X)
        enc_hiddens, _ = pad_packed_sequence(
            enc_hiddens)  # (src_len, b, h * 2)
        enc_hiddens = enc_hiddens.transpose(0, 1)  # (b, src_len, h * 2)

        # Step 3: `last_hidden` / `last_cell` are (2, b, h), one slice per
        # direction. Concatenate the two slices to (b, h * 2) and project
        # down to (b, h) to obtain h_0^{dec} and c_0^{dec}.
        last_hidden = torch.cat((last_hidden[0], last_hidden[1]), 1)
        init_decoder_hidden = self.h_projection(last_hidden)

        last_cell = torch.cat((last_cell[0], last_cell[1]), 1)
        init_decoder_cell = self.c_projection(last_cell)

        dec_init_state = (init_decoder_hidden, init_decoder_cell)
        return enc_hiddens, dec_init_state

    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
               dec_init_state: Tuple[torch.Tensor, torch.Tensor],
               target_padded: torch.Tensor) -> torch.Tensor:
        """Compute combined output vectors for a batch.

        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
            b = batch size, src_len = maximum source sentence length,
            h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len),
            where b = batch size, src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell
            for decoder
        @param target_padded (Tensor): Gold-standard padded target sentences
            (tgt_len, b), where tgt_len = maximum target sentence length,
            b = batch size.

        @returns combined_outputs (Tensor): combined output tensor
            (tgt_len, b, h), where tgt_len = maximum target sentence length,
            b = batch_size, h = hidden size
        """
        # Chop off the <END> token for max length sentences.
        target_padded = target_padded[:-1]

        # Initialize the decoder state (hidden and cell)
        dec_state = dec_init_state

        # Initialize previous combined output vector o_{t-1} as zero
        batch_size = enc_hiddens.size(0)
        o_prev = torch.zeros(batch_size, self.hidden_size,
                             device=self.device)

        # Initialize a list we will use to collect the combined output o_t
        # on each step
        combined_outputs = []

        # Step 1: W_{attProj} h^{enc},
        # (b, src_len, h * 2) -> (b, src_len, h). Done once, outside the loop.
        enc_hiddens_proj = self.att_projection(enc_hiddens)

        # Step 2: target embeddings, (tgt_len, b) -> (tgt_len, b, e).
        Y = self.model_embeddings.target(target_padded)

        # Step 3: iterate over the time dimension; each chunk is (1, b, e).
        for Y_t in torch.split(Y, 1, dim=0):
            # Squeeze only the time dimension. A bare squeeze() would also
            # drop the batch dimension when b == 1 and break the cat below.
            squeezed = torch.squeeze(Y_t, dim=0)  # (1, b, e) -> (b, e)
            Ybar_t = torch.cat((squeezed, o_prev),
                               dim=1)  # (b, e) + (b, h) -> (b, e + h)
            dec_state, o_t, e_t = self.step(Ybar_t, dec_state, enc_hiddens,
                                            enc_hiddens_proj, enc_masks)
            combined_outputs.append(o_t)
            o_prev = o_t

        ### step 4
        # Concatenates a sequence of tensors along a new dimension.
combined_outputs = torch.stack( combined_outputs, dim=0) # list of (b, h) -> (tgt_len, b, h) return combined_outputs def step( self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor, torch.Tensor], enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor, enc_masks: torch.Tensor ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]: """ Compute one forward step of the LSTM decoder, including the attention computation. @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder, where b = batch size, e = embedding size, h = hidden size. @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size. First tensor is decoder's prev hidden state, second tensor is decoder's prev cell. @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size, src_len = maximum source length, h = hidden size. @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h), where b = batch size, src_len = maximum source length, h = hidden size. @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len), where b = batch size, src_len is maximum source length. @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size. First tensor is decoder's new hidden state, second tensor is decoder's new cell. @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size. @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution. Note: You will not use this outside of this function. We are simply returning this value so that we can sanity check your implementation. """ combined_output = None # Apply the decoder to `Ybar_t` and `dec_state`to obtain the new dec_state. 
# h_t^{dec}, c_t^{dec} = decoder(\overline(y_t),h_{t-1}^{dec},c_{t-1}^{dec}) dec_state = self.decoder(Ybar_t, dec_state) # Split dec_state into its two parts (dec_hidden, dec_cell) = dec_state # (b, 2 * h) -> ((b, h), (b, h)) # batched matrix multiplication # (b, src_len, h) .dot(b, h, 1) -> (b, src_len, 1) -> (b, src_len) # unsqueeze - Returns a new tensor with a dimension of size one inserted at the specified position. # e_{t, i} = (h_t^{dec})^{\top}W_{attProj}h_i^{enc} e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2) # Set e_t to -inf where enc_masks has 1 if enc_masks is not None: e_t.data.masked_fill_(enc_masks.byte(), -float('inf')) # \alpha_t = Softmax(e_t) alpha_t = torch.unsqueeze(F.softmax(e_t, dim=1), dim=1) # (b, src_len) -> (b, 1, src_len) # (b, 1, src_len) * (b, src_len, 2*h) -> (b, 1, 2*h) -> (b, 2*h) # a_t = \sum_i^m\alpha_{t, i}h_i^{enc} a_t = torch.squeeze(torch.bmm(alpha_t, enc_hiddens), dim=1) # u_t = [a_t;h_t^{dec}] U_t = torch.cat((a_t, dec_hidden), dim=1) # v_t = W_uu_t V_t = self.combined_output_projection(U_t) # o_t = Dropout(Tanh(v_t)) O_t = self.dropout(torch.tanh(V_t)) combined_output = O_t return dec_state, combined_output, e_t def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor: """ Generate sentence masks for encoder hidden states. @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size, src_len = max source length, h = hidden size. @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch. @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len), where src_len = max source length, h = hidden size. 
""" enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float) for e_id, src_len in enumerate(source_lengths): enc_masks[e_id, src_len:] = 1 return enc_masks.to(self.device) def beam_search(self, src_sent: List[str], beam_size: int = 5, max_decoding_time_step: int = 70) -> List[Hypothesis]: """ Given a single source sentence, perform beam search, yielding translations in the target language. @param src_sent (List[str]): a single source sentence (words) @param beam_size (int): beam size @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields: value: List[str]: the decoded target sentence, represented as a list of words score: float: the log-likelihood of the target sentence """ src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device) src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)]) src_encodings_att_linear = self.att_projection(src_encodings) h_tm1 = dec_init_vec att_tm1 = torch.zeros(1, self.hidden_size, device=self.device) eos_id = self.vocab.tgt['</s>'] hypotheses = [['<s>']] hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device) completed_hypotheses = [] t = 0 while len(completed_hypotheses ) < beam_size and t < max_decoding_time_step: t += 1 hyp_num = len(hypotheses) exp_src_encodings = src_encodings.expand(hyp_num, src_encodings.size(1), src_encodings.size(2)) exp_src_encodings_att_linear = src_encodings_att_linear.expand( hyp_num, src_encodings_att_linear.size(1), src_encodings_att_linear.size(2)) y_tm1 = torch.tensor( [self.vocab.tgt[hyp[-1]] for hyp in hypotheses], dtype=torch.long, device=self.device) y_t_embed = self.model_embeddings.target(y_tm1) x = torch.cat([y_t_embed, att_tm1], dim=-1) (h_t, cell_t), att_t, _ = self.step(x, h_tm1, exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None) # log probabilities over target words 
log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1) live_hyp_num = beam_size - len(completed_hypotheses) contiuating_hyp_scores = ( hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1) top_cand_hyp_scores, top_cand_hyp_pos = torch.topk( contiuating_hyp_scores, k=live_hyp_num) prev_hyp_ids = top_cand_hyp_pos / len(self.vocab.tgt) print(prev_hyp_ids) hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt) new_hypotheses = [] live_hyp_ids = [] new_hyp_scores = [] for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip( prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores): prev_hyp_id = prev_hyp_id.item() hyp_word_id = hyp_word_id.item() cand_new_hyp_score = cand_new_hyp_score.item() hyp_word = self.vocab.tgt.id2word[hyp_word_id] new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word] if hyp_word == '</s>': completed_hypotheses.append( Hypothesis(value=new_hyp_sent[1:-1], score=cand_new_hyp_score)) else: new_hypotheses.append(new_hyp_sent) live_hyp_ids.append(prev_hyp_id) new_hyp_scores.append(cand_new_hyp_score) if len(completed_hypotheses) == beam_size: break live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device) h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids]) att_tm1 = att_t[live_hyp_ids] hypotheses = new_hypotheses hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device) if len(completed_hypotheses) == 0: completed_hypotheses.append( Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item())) completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True) return completed_hypotheses @property def device(self) -> torch.device: """ Determine which device to place the Tensors upon, CPU or GPU. """ return self.model_embeddings.source.weight.device @staticmethod def load(model_path: str): """ Load the model from a file. 
@param model_path (str): path to model """ params = torch.load(model_path, map_location=lambda storage, loc: storage) args = params['args'] model = NMT(vocab=params['vocab'], **args) model.load_state_dict(params['state_dict']) return model def save(self, path: str): """ Save the odel to a file. @param path (str): path to the model """ print('save model parameters to [%s]' % path, file=sys.stderr) params = { 'args': dict(embed_size=self.model_embeddings.embed_size, hidden_size=self.hidden_size, dropout_rate=self.dropout_rate), 'vocab': self.vocab, 'state_dict': self.state_dict() } torch.save(params, path)
class NMT(nn.Module):
    """ Attention-based seq2seq Neural Machine Translation model:
        - Bidirectional LSTM Encoder
        - Unidirectional LSTM Decoder
        - Global Attention Model
    """

    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
        @param dropout_rate (float): Dropout probability, for attention
        """
        super().__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # Encoder: word embeddings in, per-step hidden states out (both directions).
        # No `dropout=` argument here: nn.LSTM only applies it *between* stacked
        # layers, so with a single layer it does nothing except raise a warning.
        self.encoder = nn.LSTM(embed_size, self.hidden_size, bidirectional=True)
        # Decoder: an LSTMCell so that every time step can be driven explicitly.
        # Its input is the target embedding concatenated with the previous
        # combined output, hence embed_size + hidden_size.
        self.decoder = nn.LSTMCell(embed_size + self.hidden_size, self.hidden_size)
        # 2h -> h projections of the encoder's final hidden / cell states.
        self.h_projection = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=False)
        self.c_projection = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=False)
        # 2h -> h projection of the encoder hidden states used for attention scores.
        self.att_projection = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=False)
        # 3h -> h projection of [attention output; decoder hidden state].
        self.combined_output_projection = nn.Linear(self.hidden_size * 3, self.hidden_size, bias=False)
        # h -> |target vocabulary| projection.
        self.target_vocab_projection = nn.Linear(self.hidden_size, len(self.vocab.tgt), bias=False)
        self.dropout = nn.Dropout(p=self.dropout_rate)

    def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences and compute the
        log-likelihood of the target sentences under the model.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`

        @returns scores (Tensor): tensor of shape (b, ) holding the log-likelihood of
                                  generating the gold-standard target sentence for each
                                  example in the batch. Here b = batch size.
        """
        # Length of every source sentence
        source_lengths = [len(s) for s in source]

        # Convert lists of tokens into padded index tensors
        source_padded = self.vocab.src.to_input_tensor(source, device=self.device)  # (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(target, device=self.device)  # (tgt_len, b)

        # Run the network forward
        enc_hiddens, dec_init_state = self.encode(source_padded, source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded)
        P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1)

        # Mask that zeroes out the contribution of `<pad>` positions in the target
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Log probability of the gold target words (position 0 is `<s>`, so skip it)
        target_gold_words_log_prob = torch.gather(
            P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores

    def encode(
        self, source_padded: torch.Tensor, source_lengths: List[int]
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
        Additionally, project the encoder's final states to obtain the decoder's
        initial states.

        @param source_padded (Tensor): padded source sentences with shape (src_len, b), where
                                       b = batch size, src_len = maximum source sentence length.
                                       Must already be sorted from longest to shortest sentence.
        @param source_lengths (List[int]): actual length of each source sentence in the batch

        @returns enc_hiddens (Tensor): hidden units with shape (b, src_len, h*2), where
                                       b = batch size, src_len = maximum source sentence length,
                                       h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): the decoder's initial hidden state and cell.
        """
        X = self.model_embeddings.source(source_padded)  # (src_len, b, e)
        X = pack_padded_sequence(X, source_lengths)
        # Per-step hidden states plus the final hidden / cell state of each direction
        enc_hiddens, (last_hidden, last_cell) = self.encoder(X)
        enc_hiddens = pad_packed_sequence(enc_hiddens, batch_first=True)[0]  # (b, src_len, h*2)

        # Concatenate the forward and backward final states, then project 2h -> h
        init_decoder_hidden = self.h_projection(
            torch.cat((last_hidden[0], last_hidden[1]), dim=1))
        init_decoder_cell = self.c_projection(
            torch.cat((last_cell[0], last_cell[1]), dim=1))
        dec_init_state = (init_decoder_hidden, init_decoder_cell)
        return enc_hiddens, dec_init_state

    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
               dec_init_state: Tuple[torch.Tensor, torch.Tensor],
               target_padded: torch.Tensor) -> torch.Tensor:
        """ Compute combined output vectors for a batch.

        @param enc_hiddens (Tensor): hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length,
                                     h = hidden size.
        @param enc_masks (Tensor): sentence masks (b, src_len), where
                                   b = batch size, src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): initial hidden state and cell for the decoder
        @param target_padded (Tensor): gold-standard padded target sentences (tgt_len, b), where
                                       tgt_len = maximum target sentence length, b = batch size.

        @returns combined_outputs (Tensor): combined output tensor (tgt_len, b, h), where
                                            tgt_len is the number of decoded steps (one less than
                                            the input length, since the last token is dropped),
                                            b = batch size, h = hidden size
        """
        # Chop off the <END> token for max length sentences.
        target_padded = target_padded[:-1]

        # Initialize the decoder state (hidden and cell)
        dec_state = dec_init_state

        # Initialize the previous combined output vector o_{t-1} as zero
        batch_size = enc_hiddens.size(0)
        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

        # Collects the combined output o_t of every step
        combined_outputs = []

        enc_hiddens_proj = self.att_projection(enc_hiddens)  # (b, src_len, h)
        Y = self.model_embeddings.target(target_padded)  # (tgt_len, b, e)
        for Y_t in torch.split(Y, 1):  # each chunk is (1, b, e)
            # Squeeze only dim 0 so a batch of size 1 keeps its batch dimension
            Y_t = torch.squeeze(Y_t, dim=0)  # (b, e)
            Ybar_t = torch.cat((Y_t, o_prev), dim=1)  # (b, e + h)
            dec_state, o_t, _ = self.step(Ybar_t, dec_state, enc_hiddens,
                                          enc_hiddens_proj, enc_masks)
            combined_outputs.append(o_t)
            o_prev = o_t

        # list of (b, h) -> (tgt_len, b, h)
        combined_outputs = torch.stack(combined_outputs, dim=0)
        return combined_outputs

    def step(
        self, Ybar_t: torch.Tensor,
        dec_state: Tuple[torch.Tensor, torch.Tensor],
        enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor,
        enc_masks: torch.Tensor
    ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): concatenated tensor [Y_t o_prev] with shape (b, e + h); the decoder
                                input, where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): tensors both with shape (b, h). First tensor is the
                                decoder's previous hidden state, second is its previous cell.
        @param enc_hiddens (Tensor): encoder hidden states, shape (b, src_len, h * 2).
        @param enc_hiddens_proj (Tensor): encoder hidden states projected from (h * 2) to h,
                                shape (b, src_len, h).
        @param enc_masks (Tensor): sentence masks of shape (b, src_len), non-zero at padded
                                positions; may be None to disable masking.

        @returns dec_state (tuple (Tensor, Tensor)): tensors both of shape (b, h). First tensor is the
                                decoder's new hidden state, second is its new cell.
        @returns combined_output (Tensor): combined output at timestep t, shape (b, h).
        @returns e_t (Tensor): attention scores of shape (b, src_len), with masked positions set
                                to -inf. Returned for sanity checks; callers in this class ignore it.
        """
        dec_state = self.decoder(Ybar_t, dec_state)
        dec_hidden, _ = dec_state

        # (b, src_len, h) x (b, h, 1) -> (b, src_len, 1) -> (b, src_len)
        e_t = torch.bmm(enc_hiddens_proj, dec_hidden.unsqueeze(2)).squeeze(2)

        # Padded source positions must receive zero attention weight.
        # Out-of-place masked_fill keeps the operation visible to autograd.
        if enc_masks is not None:
            e_t = e_t.masked_fill(enc_masks.bool(), float('-inf'))

        alpha_t = F.softmax(e_t, dim=1)  # (b, src_len)
        # (b, 1, src_len) x (b, src_len, 2h) -> (b, 1, 2h) -> (b, 2h)
        a_t = torch.bmm(alpha_t.unsqueeze(1), enc_hiddens).squeeze(1)
        U_t = torch.cat((a_t, dec_hidden), dim=1)  # (b, 3h)
        V_t = self.combined_output_projection(U_t)  # (b, h)
        combined_output = self.dropout(torch.tanh(V_t))  # (b, h)
        return dec_state, combined_output, e_t

    def generate_sent_masks(self, enc_hiddens: torch.Tensor,
                            source_lengths: List[int]) -> torch.Tensor:
        """ Generate sentence masks for encoder hidden states.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size.
        @param source_lengths (List[int]): actual length of each sentence in the batch.

        @returns enc_masks (Tensor): float tensor of shape (b, src_len) that is 1 at positions
                                     beyond a sentence's real length (padding) and 0 elsewhere.
        """
        enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float)
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1
        return enc_masks.to(self.device)

    def beam_search(self, src_sent: List[str], beam_size: int = 5,
                    max_decoding_time_step: int = 70) -> "List[Hypothesis]":
        """ Given a single source sentence, perform beam search, yielding translations
        in the target language.

        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN

        @returns hypotheses (List[Hypothesis]): a list of hypotheses sorted by descending score,
                each with two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        vocab_size = len(self.vocab.tgt)

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)  # number of live hypotheses

            # Share the single source encoding across all live hypotheses
            exp_src_encodings = src_encodings.expand(
                hyp_num, src_encodings.size(1), src_encodings.size(2))
            exp_src_encodings_att_linear = src_encodings_att_linear.expand(
                hyp_num, src_encodings_att_linear.size(1),
                src_encodings_att_linear.size(2))

            y_tm1 = torch.tensor([self.vocab.tgt[hyp[-1]] for hyp in hypotheses],
                                 dtype=torch.long, device=self.device)
            y_t_embed = self.model_embeddings.target(y_tm1)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            (h_t, cell_t), att_t, _ = self.step(
                x, h_tm1, exp_src_encodings, exp_src_encodings_att_linear,
                enc_masks=None)

            # Log probabilities over target words, one row per live hypothesis
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            # Flatten to (hyp_num * vocab_size,) so topk ranks every continuation jointly
            contiuating_hyp_scores = (
                hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(
                contiuating_hyp_scores, k=live_hyp_num)

            # Recover (hypothesis index, word index) from the flat position.
            # This must be an integer floor division: `/` on integer tensors is
            # true division and would produce float indices.
            prev_hyp_ids = torch.div(top_cand_hyp_pos, vocab_size, rounding_mode='floor')
            hyp_word_ids = top_cand_hyp_pos % vocab_size

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(
                    prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    # Finished: store without the `<s>` / `</s>` delimiters
                    completed_hypotheses.append(
                        Hypothesis(value=new_hyp_sent[1:-1], score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            # Carry over only the decoder states of hypotheses that are still live
            live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device)

        # Nothing reached `</s>` within the step budget: fall back to the best live hypothesis
        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(
                Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)
        return completed_hypotheses

    @property
    def device(self) -> torch.device:
        """ Device (CPU or GPU) on which the model's source embedding weights live;
        new tensors are created there.
        """
        return self.model_embeddings.source.weight.device

    @staticmethod
    def load(model_path: str):
        """ Load the model from a file.

        @param model_path (str): path to model
        """
        # NOTE(review): torch.load unpickles arbitrary Python objects (the checkpoint
        # stores the vocab object), so only load checkpoints from trusted sources.
        # Newer PyTorch releases may default to weights-only loading and reject such
        # a checkpoint -- confirm against the PyTorch version in use.
        params = torch.load(model_path, map_location=lambda storage, loc: storage)
        args = params['args']
        model = NMT(vocab=params['vocab'], **args)
        model.load_state_dict(params['state_dict'])
        return model

    def save(self, path: str):
        """ Save the model to a file.

        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args': dict(embed_size=self.model_embeddings.embed_size,
                         hidden_size=self.hidden_size,
                         dropout_rate=self.dropout_rate),
            'vocab': self.vocab,
            'state_dict': self.state_dict()
        }

        torch.save(params, path)
class DPPNMT(nn.Module): """ Simple Neural Machine Translation Model: - Bidrectional LSTM Encoder - Unidirection LSTM Decoder - Global Attention Model (Luong, et al. 2015) """ def __init__(self, embed_size=256, hidden_size=256, vocab=None, dropout_rate=0.2, nmt_model=None): """ Init NMT Model. @param embed_size (int): Embedding size (dimensionality) @param hidden_size (int): Hidden Size (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention """ super(DPPNMT, self).__init__() if nmt_model is not None: self.model_embeddings = nmt_model.model_embeddings self.hidden_size = nmt_model.hidden_size self.dropout_rate = nmt_model.dropout_rate self.vocab = nmt_model.vocab self.encoder = nmt_model.encoder self.decoder = nmt_model.decoder self.h_projection = nmt_model.h_projection self.c_projection = nmt_model.c_projection self.att_projection = nmt_model.att_projection self.combined_output_projection = nmt_model.combined_output_projection self.target_vocab_projection = nmt_model.target_vocab_projection self.dropout = nmt_model.dropout else: self.model_embeddings = ModelEmbeddings(embed_size, vocab) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab # default values self.encoder = None self.decoder = None self.h_projection = None self.c_projection = None self.att_projection = None self.combined_output_projection = None self.target_vocab_projection = None self.dropout = None ### YOUR CODE HERE (~8 Lines) ### TODO - Initialize the following variables: ### self.encoder (Bidirectional LSTM with bias) ### self.decoder (LSTM Cell with bias) ### self.h_projection (Linear Layer with no bias), called W_{h} in the PDF. ### self.c_projection (Linear Layer with no bias), called W_{c} in the PDF. ### self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF. 
### self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF. ### self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF. ### self.dropout (Dropout Layer) ### ### Use the following docs to properly initialize these variables: ### LSTM: ### https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM ### LSTM Cell: ### https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell ### Linear Layer: ### https://pytorch.org/docs/stable/nn.html#torch.nn.Linear ### Dropout Layer: ### https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout self.encoder = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, bias=True, bidirectional=True) self.decoder = nn.LSTMCell(input_size=embed_size + hidden_size, hidden_size=hidden_size, bias=True) self.h_projection = nn.Linear(in_features=2 * hidden_size, out_features=hidden_size, bias=False) self.c_projection = nn.Linear(in_features=2 * hidden_size, out_features=hidden_size, bias=False) self.att_projection = nn.Linear(in_features=2 * hidden_size, out_features=hidden_size, bias=False) self.combined_output_projection = nn.Linear( in_features=3 * hidden_size, out_features=hidden_size, bias=False) self.target_vocab_projection = nn.Linear(in_features=hidden_size, out_features=len( vocab.tgt), bias=False) self.dropout = nn.Dropout(p=dropout_rate) ### END YOUR CODE def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor: """ Take a mini-batch of source and target sentences, compute the log-likelihood of target sentences under the language models learned by the NMT system. @param source (List[List[str]]): list of source sentence tokens @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>` @returns scores (Tensor): a variable/tensor of shape (b, ) representing the log-likelihood of generating the gold-standard target sentence for each example in the input batch. Here b = batch size. 
""" # Compute sentence lengths source_lengths = [len(s) for s in source] # Convert list of lists into tensors source_padded = self.vocab.src.to_input_tensor( source, device=self.device) # Tensor: (src_len, b) target_padded = self.vocab.tgt.to_input_tensor( target, device=self.device) # Tensor: (tgt_len, b) ### Run the network forward: ### 1. Apply the encoder to `source_padded` by calling `self.encode()` ### 2. Generate sentence masks for `source_padded` by calling `self.generate_sent_masks()` ### 3. Apply the decoder to compute combined-output by calling `self.decode()` ### 4. Compute log probability distribution over the target vocabulary using the ### combined_outputs returned by the `self.decode()` function. enc_hiddens, dec_init_state = self.encode(source_padded, source_lengths) enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths) combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded) P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1) # Zero out, probabilities for which we have nothing in the target text target_masks = (target_padded != self.vocab.tgt['<pad>']).float() # Compute log probability of generating true target words target_gold_words_log_prob = torch.gather( P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:] scores = target_gold_words_log_prob.sum(dim=0) return scores def encode( self, source_padded: torch.Tensor, source_lengths: List[int] ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """ Apply the encoder to source sentences to obtain encoder hidden states. Additionally, take the final states of the encoder and project them to obtain initial states for decoder. @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where b = batch_size, src_len = maximum source sentence length. Note that these have already been sorted in order of longest to shortest sentence. 
@param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial hidden state and cell. """ enc_hiddens, dec_init_state = None, None ### YOUR CODE HERE (~ 8 Lines) ### TODO: ### 1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings. ### src_len = maximum source sentence length, b = batch size, e = embedding size. Note ### that there is no initial hidden state or cell for the decoder. ### 2. Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`. ### - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X. ### - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens. ### - Note that the shape of the tensor returned by the encoder is (src_len b, h*2) and we want to ### return a tensor of shape (b, src_len, h*2) as `enc_hiddens`. ### 3. Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell): ### - `init_decoder_hidden`: ### `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards. ### Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h). ### Apply the h_projection layer to this in order to compute init_decoder_hidden. ### This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size ### - `init_decoder_cell`: ### `last_cell` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards. ### Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h). ### Apply the c_projection layer to this in order to compute init_decoder_cell. ### This is c_0^{dec} in the PDF. 
def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
           dec_init_state: Tuple[torch.Tensor, torch.Tensor],
           target_padded: torch.Tensor) -> torch.Tensor:
    """Compute combined output vectors for a batch.

    @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
                                 b = batch size, src_len = maximum source sentence length, h = hidden size.
    @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
                               b = batch size, src_len = maximum source sentence length.
    @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
    @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where
                                   tgt_len = maximum target sentence length, b = batch size.

    @returns combined_outputs (Tensor): combined output tensor (tgt_len, b, h), where
                                        tgt_len = maximum target sentence length, b = batch_size, h = hidden size
    """
    # Chop off the <END> token for max length sentences.
    target_padded = target_padded[:-1]

    # Initialize the decoder state (hidden and cell).
    dec_state = dec_init_state

    # Initialize previous combined output vector o_{t-1} as zero.
    batch_size = enc_hiddens.size(0)
    o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

    # Collects the combined output o_t of every time step.
    combined_outputs = []

    # The attention projection of the encoder states does not depend on the
    # time step, so it is computed once up front: (b, src_len, 2h) -> (b, src_len, h).
    enc_hiddens_proj = self.att_projection(enc_hiddens)

    # Target embeddings, one slice per time step along dim 0.
    Y = self.model_embeddings.target(target_padded)
    for Y_t in torch.split(Y, 1):
        # Squeeze only dim 0 so that a batch of size 1 keeps its batch dimension.
        Y_t = torch.squeeze(Y_t, dim=0)
        # Input feeding: the previous combined output is appended to the embedding.
        Ybar_t = torch.cat((Y_t, o_prev), dim=1)
        dec_state, o_t, _ = self.step(Ybar_t, dec_state, enc_hiddens,
                                      enc_hiddens_proj, enc_masks)
        combined_outputs.append(o_t)
        o_prev = o_t

    # List of tgt_len tensors (b, h) -> one tensor (tgt_len, b, h).
    combined_outputs = torch.stack(combined_outputs)
    return combined_outputs


def step(self, Ybar_t: torch.Tensor,
         dec_state: Tuple[torch.Tensor, torch.Tensor],
         enc_hiddens: torch.Tensor,
         enc_hiddens_proj: torch.Tensor,
         enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
    """ Compute one forward step of the LSTM decoder, including the attention computation.

    @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                            where b = batch size, e = embedding size, h = hidden size.
    @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
            First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
    @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                 src_len = maximum source length, h = hidden size.
    @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                      where b = batch size, src_len = maximum source length, h = hidden size.
    @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len), with 1 at padded positions;
                               may be None, in which case no position is masked.

    @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
            First tensor is decoder's new hidden state, second tensor is decoder's new cell.
    @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
    @returns e_t (Tensor): Tensor of shape (b, src_len) holding the (masked) attention scores.
    """
    # Advance the decoder cell by one step.
    dec_state = self.decoder(Ybar_t, dec_state)
    dec_hidden, dec_cell = dec_state

    # Attention scores: (b, src_len, h) x (b, h, 1) -> (b, src_len, 1) -> (b, src_len).
    e_t = torch.squeeze(
        torch.bmm(enc_hiddens_proj, torch.unsqueeze(dec_hidden, dim=2)), dim=2)

    # Set e_t to -inf where enc_masks has 1, so padding receives zero attention.
    # The mask is converted to bool: uint8 masks are deprecated for
    # masked_fill_ and rejected by recent PyTorch releases.
    if enc_masks is not None:
        e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

    # Attention distribution over source positions.
    alpha_t = nn.functional.softmax(e_t, dim=1)
    # Attention output: (b, 1, src_len) x (b, src_len, 2h) -> (b, 1, 2h) -> (b, 2h).
    a_t = torch.squeeze(
        torch.bmm(torch.unsqueeze(alpha_t, dim=1), enc_hiddens), dim=1)
    # The (a_t, dec_hidden) order fixes the column layout seen by the
    # combined output projection; keep it stable for saved weights.
    U_t = torch.cat((a_t, dec_hidden), dim=1)
    V_t = self.combined_output_projection(U_t)
    O_t = self.dropout(torch.tanh(V_t))

    combined_output = O_t
    return dec_state, combined_output, e_t


def generate_sent_masks(self, enc_hiddens: torch.Tensor,
                        source_lengths: List[int]) -> torch.Tensor:
    """ Generate sentence masks for encoder hidden states.

    @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                 src_len = max source length, h = hidden size.
    @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.

    @returns enc_masks (Tensor): Float tensor of sentence masks of shape (b, src_len),
                                 1 at padded positions and 0 elsewhere, placed on self.device.
    """
    enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1),
                            dtype=torch.float)
    for e_id, src_len in enumerate(source_lengths):
        # Everything past the true sentence length is padding.
        enc_masks[e_id, src_len:] = 1
    return enc_masks.to(self.device)


def beam_search(self, src_sent: List[str], beam_size: int = 5,
                max_decoding_time_step: int = 70) -> "List[Hypothesis]":
    """ Given a single source sentence, perform beam search, yielding translations in the target language.

    Candidates for each step are chosen by self.kdpp rather than a plain top-k.

    @param src_sent (List[str]): a single source sentence (words)
    @param beam_size (int): beam size
    @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
    @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields:
            value: List[str]: the decoded target sentence, represented as a list of words
            score: float: the log-likelihood of the target sentence
    """
    src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)

    src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)])
    src_encodings_att_linear = self.att_projection(src_encodings)

    h_tm1 = dec_init_vec
    att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

    vocab_size = len(self.vocab.tgt)

    hypotheses = [['<s>']]
    hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float,
                             device=self.device)
    completed_hypotheses = []

    t = 0
    while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
        t += 1
        hyp_num = len(hypotheses)

        # One copy of the source encodings per live hypothesis (views, no data copy).
        exp_src_encodings = src_encodings.expand(
            hyp_num, src_encodings.size(1), src_encodings.size(2))
        exp_src_encodings_att_linear = src_encodings_att_linear.expand(
            hyp_num, src_encodings_att_linear.size(1),
            src_encodings_att_linear.size(2))

        # Last word of every live hypothesis is the decoder input.
        y_tm1 = torch.tensor([self.vocab.tgt[hyp[-1]] for hyp in hypotheses],
                             dtype=torch.long, device=self.device)
        y_t_embed = self.model_embeddings.target(y_tm1)

        x = torch.cat([y_t_embed, att_tm1], dim=-1)

        (h_t, cell_t), att_t, _ = self.step(
            x, h_tm1, exp_src_encodings, exp_src_encodings_att_linear,
            enc_masks=None)

        # log probabilities over target words
        log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1)

        live_hyp_num = beam_size - len(completed_hypotheses)
        # Flattened (hypothesis, word) scores: index = hyp_id * vocab_size + word_id.
        contiuating_hyp_scores = (
            hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)

        # Candidate selection. For plain beam search use
        # self.topk(contiuating_hyp_scores, live_hyp_num) instead.
        top_cand_hyp_scores, top_cand_hyp_pos = self.kdpp(
            att_t,
            src_encodings,
            src_encodings_att_linear,
            h_t,
            cell_t,
            contiuating_hyp_scores,
            live_hyp_num,
            beam_size,
        )

        # Floor division keeps the indices integral; `/` is true division
        # on integer tensors in PyTorch >= 1.6 and would produce floats
        # that cannot be used as list/tensor indices.
        prev_hyp_ids = top_cand_hyp_pos // vocab_size
        hyp_word_ids = top_cand_hyp_pos % vocab_size

        new_hypotheses = []
        live_hyp_ids = []
        new_hyp_scores = []

        for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(
                prev_hyp_ids.tolist(), hyp_word_ids.tolist(),
                top_cand_hyp_scores.tolist()):
            hyp_word = self.vocab.tgt.id2word[hyp_word_id]
            new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
            if hyp_word == '</s>':
                # Strip the <s> and </s> markers from the finished sentence.
                completed_hypotheses.append(
                    Hypothesis(value=new_hyp_sent[1:-1],
                               score=cand_new_hyp_score))
            else:
                new_hypotheses.append(new_hyp_sent)
                live_hyp_ids.append(prev_hyp_id)
                new_hyp_scores.append(cand_new_hyp_score)

        # kdpp may return more candidates than requested (ADD_TOP_N), so
        # the beam can be over-full; it can also leave nothing to extend.
        if len(completed_hypotheses) >= beam_size or not new_hypotheses:
            break

        live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long,
                                    device=self.device)
        h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
        att_tm1 = att_t[live_hyp_ids]

        hypotheses = new_hypotheses
        hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float,
                                  device=self.device)

    # Nothing finished within the step budget: return the first live hypothesis.
    if not completed_hypotheses:
        completed_hypotheses.append(
            Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item()))

    completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

    if PRINT_HYPOTHESES:
        print(completed_hypotheses)
    return completed_hypotheses


@property
def device(self) -> torch.device:
    """ Determine which device to place the Tensors upon, CPU or GPU.

    The device of the source embedding weights is used as the reference.
    """
    return self.model_embeddings.source.weight.device


@staticmethod
def load(model_path: str):
    """ Load the model from a file.

    The checkpoint is expected to hold the keys 'args', 'vocab' and
    'state_dict', as written by `save`. The restored NMT model is wrapped
    in a DPPNMT instance before being returned.

    @param model_path (str): path to model
    """
    # NOTE(review): torch.load unpickles the file; only load checkpoints
    # from a trusted source.
    # map_location keeps every storage on CPU regardless of where it was saved.
    params = torch.load(model_path,
                        map_location=lambda storage, loc: storage)
    args = params['args']
    nmt_model = NMT(vocab=params['vocab'], **args)
    nmt_model.load_state_dict(params['state_dict'])
    model = DPPNMT(nmt_model=nmt_model)

    return model


def save(self, path: str):
    """ Save the model to a file.

    Stores the constructor arguments, the vocabulary and the state dict.

    @param path (str): path to the model
    """
    print('save model parameters to [%s]' % path, file=sys.stderr)

    params = {
        'args': dict(embed_size=self.model_embeddings.embed_size,
                     hidden_size=self.hidden_size,
                     dropout_rate=self.dropout_rate),
        'vocab': self.vocab,
        'state_dict': self.state_dict()
    }

    torch.save(params, path)


def timer(self, message=None):
    """ Minimal stopwatch for profiling, active only when PRINT_TIMER is set.

    Called without a message (or before any timestamp exists) it stores the
    current time. Called with a message it prints the seconds elapsed since
    the stored timestamp and then resets the timestamp.

    @param message (str): label printed in front of the elapsed time
    """
    if PRINT_TIMER:
        if message is None or not hasattr(
                self, "last_time") or self.last_time is None:
            self.last_time = time.time()
        else:
            new_time = time.time()
            print("%s: %f" % (message, new_time - self.last_time))
            self.last_time = new_time


def topk(self, contiuating_hyp_scores, live_hyp_num):
    """ Plain beam-search selection: the live_hyp_num highest scores.

    @param contiuating_hyp_scores (Tensor): flattened candidate scores
    @param live_hyp_num (int): number of candidates to keep
    @returns (Tensor, Tensor): selected scores and their positions in
                               contiuating_hyp_scores
    """
    top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(
        contiuating_hyp_scores, k=live_hyp_num)
    return top_cand_hyp_scores, top_cand_hyp_pos


def word_embeddings(self):
    """ Return the target embeddings of the whole target vocabulary.

    The result is computed on first use and cached on the instance; the
    cache is never refreshed, so later changes to the embedding weights
    are not reflected.

    @returns embeddings (Tensor): one row per target vocabulary entry
    """
    if not hasattr(self, "word_embeddings_cached"):
        self.timer()
        tgt_vocab = self.vocab.tgt
        word_ids = torch.tensor(
            [tgt_vocab[tgt_vocab.id2word[word_id]]
             for word_id in range(len(tgt_vocab))],
            dtype=torch.long, device=self.device)
        self.word_embeddings_cached = embeddings = \
            self.model_embeddings.target(word_ids)
        if TOGGLE_PRINT:
            print("embeddings", embeddings.shape)
        self.timer("Embeddings")

    return self.word_embeddings_cached


def kdpp(self, att_t, src_encodings, src_encodings_att_linear, h_t, cell_t,
         contiuating_hyp_scores, live_hyp_num, beam_size):
    """ Choose the next beam candidates by sampling from a k-DPP.

    A pool of the INITIAL_SAMPLE_SIZE_RATIO * beam_size best-scoring
    candidates is formed first. Every pooled candidate is advanced by one
    extra decoder step; its L2-normalised new hidden state, scaled by
    sqrt(exp(score)), is its feature vector. The kernel L = F F^T is handed
    to sample_k_dpp, which picks live_hyp_num pool entries. If sampling
    raises, plain top-k selection is used instead.

    @param att_t (Tensor): combined outputs of the current step, one row per live hypothesis
    @param src_encodings (Tensor): encoder hidden states of the source sentence
    @param src_encodings_att_linear (Tensor): attention projection of src_encodings
    @param h_t (Tensor): decoder hidden states of the current step, one row per live hypothesis
    @param cell_t (Tensor): decoder cells of the current step, one row per live hypothesis
    @param contiuating_hyp_scores (Tensor): flattened candidate scores,
            index = hyp_id * vocab_size + word_id
    @param live_hyp_num (int): number of candidates to sample
    @param beam_size (int): beam size, used to size the candidate pool

    @returns (Tensor, Tensor): scores and flat positions of the chosen candidates.
            More than live_hyp_num entries may be returned when ADD_TOP_N > 0.
    """
    self.timer()
    top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(
        contiuating_hyp_scores, k=INITIAL_SAMPLE_SIZE_RATIO * beam_size)
    self.timer("topk")

    vocab_size = len(self.vocab.tgt.word2id)

    # Decompose the flat positions. Floor division is required here: `/`
    # on integer tensors is true division in PyTorch >= 1.6 and yields
    # floats that cannot be used as indices.
    pool_hyp_ids = top_cand_hyp_pos // vocab_size
    pool_word_ids = top_cand_hyp_pos % vocab_size

    # Decoder input for the look-ahead step: [candidate word embedding, att_t of its parent].
    embeddings = self.word_embeddings()
    x = torch.cat([embeddings[pool_word_ids], att_t[pool_hyp_ids]], dim=-1)
    self.timer("newx")

    batch_size = x.shape[0]
    new_exp_src_encodings = src_encodings.expand(
        batch_size, src_encodings.size(1), src_encodings.size(2))
    new_exp_src_encodings_att_linear = src_encodings_att_linear.expand(
        batch_size, src_encodings_att_linear.size(1),
        src_encodings_att_linear.size(2))

    # Each pooled candidate continues from the decoder state of its parent hypothesis.
    self.timer()
    new_h_t = h_t[pool_hyp_ids]
    new_cell_t = cell_t[pool_hyp_ids]
    self.timer("new_h_t/cell_t")

    (h_t_dpp, _), _, _ = self.step(x, (new_h_t, new_cell_t),
                                   new_exp_src_encodings,
                                   new_exp_src_encodings_att_linear,
                                   enc_masks=None)
    self.timer("step")

    # Diversity part: unit-length hidden states. Everything stays on the
    # model's device; mixing a CPU `norms` with a CUDA `h_t_dpp` would fail.
    norms = torch.norm(h_t_dpp, p=2, dim=1, keepdim=True)
    unit_vectors = h_t_dpp.div(norms.expand_as(h_t_dpp))

    # Quality part: sqrt(exp(score)), so that L_ii = exp(score_i).
    quality_scores = torch.exp(
        top_cand_hyp_scores.unsqueeze(1)).expand_as(unit_vectors)
    quality_scores = torch.pow(quality_scores, 1 / 2)
    features = unit_vectors * quality_scores
    self.timer("scores")

    L = torch.mm(features, features.t())
    self.timer("L")

    try:
        # The sampler always receives a CPU tensor (no-op when already on CPU).
        new_top_cand_hyp_pos = sample_k_dpp(L.cpu(), k=live_hyp_num)
    except Exception as e:
        # Deliberate best-effort: any sampling failure degrades to plain top-k.
        print("Error sampling from L, falling back to top k: %s" % e)
        return self.topk(contiuating_hyp_scores, live_hyp_num)

    if ADD_TOP_N > 0:
        # Always keep the ADD_TOP_N best pool entries (the pool is sorted by score).
        new_top_cand_hyp_pos = np.unique(
            np.append(new_top_cand_hyp_pos, range(ADD_TOP_N)))

    self.timer("sample_k_dpp")

    # Map pool indices back to flat positions and look up their scores.
    top_cand_hyp_pos = top_cand_hyp_pos[new_top_cand_hyp_pos]
    top_cand_hyp_scores = contiuating_hyp_scores[top_cand_hyp_pos]

    if TOGGLE_PRINT:
        print("vocab size", vocab_size)
        print("att_t", att_t.shape)
        print("top_cand_hyp_pos", top_cand_hyp_pos.shape)
        print("new_x", x.shape)
        print("src_encodings", new_exp_src_encodings.shape)
        print("src_encodings_att", new_exp_src_encodings_att_linear.shape)
        print("new_h_t", new_h_t.shape)
        print("new_cell_t", new_cell_t.shape)
        print("hidden", h_t_dpp.shape)
        print("norms", norms.shape)
        print("unit_vectors", unit_vectors.shape)
        print("L", L.shape)
        print("L", L)
        print("new_top_cand_hyp_pos", new_top_cand_hyp_pos)
        print(top_cand_hyp_pos)
        print("new_top_hyp_pos", top_cand_hyp_pos.shape)
        print("new_top_hyp_scores", top_cand_hyp_scores.shape)
        print('top chosen: ', new_top_cand_hyp_pos)

    return top_cand_hyp_scores, top_cand_hyp_pos