class NMT(nn.Module):
    """Simple neural machine translation model.

    Components:
        - Bidirectional LSTM encoder
        - Unidirectional LSTM decoder
        - Global attention (Luong et al., 2015)
        - Optional character-level decoder (`CharDecoder`), used as an extra
          training loss in `forward` and to spell out `<unk>` words during
          `beam_search`
    """

    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2, no_char_decoder=False):
        """Build all layers of the model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden size (dimensionality)
        @param vocab (Vocab): Vocabulary object exposing `src` and `tgt` vocabularies
        @param dropout_rate (float): Dropout probability applied to the combined
            attention output
        @param no_char_decoder (bool): If True, no `CharDecoder` is created and
            `self.charDecoder` is None
        """
        super().__init__()

        self.model_embeddings_source = ModelEmbeddings(embed_size, vocab.src)
        self.model_embeddings_target = ModelEmbeddings(embed_size, vocab.tgt)

        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        self.encoder = nn.LSTM(embed_size, hidden_size, bidirectional=True)
        # The decoder input is the target embedding concatenated with the previous
        # combined output vector, hence embed_size + hidden_size.
        self.decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size)
        # Project the concatenated (forward, backward) final encoder states to size h.
        self.h_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False)
        self.c_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False)
        self.att_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False)
        self.combined_output_projection = nn.Linear(hidden_size * 2 + hidden_size, hidden_size, bias=False)
        self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False)
        self.dropout = nn.Dropout(self.dropout_rate)

        if not no_char_decoder:
            self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt)
        else:
            self.charDecoder = None

    def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor:
        """Score a mini-batch of (source, target) sentence pairs.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens, wrapped by
            `<s>` and `</s>`

        @returns scores (Tensor): the log-probabilities of the gold target words
            summed over the whole batch (padding positions are masked out). When a
            character decoder is present, the value returned by
            `self.charDecoder.train_forward` is subtracted from that sum.
        """
        source_lengths = [len(s) for s in source]

        # Word-level target indices are kept for the prediction loss; the encoder
        # and decoder themselves consume character-level tensors.
        target_padded = self.vocab.tgt.to_input_tensor(target, device=self.device)
        source_padded_chars = self.vocab.src.to_input_tensor_char(source, device=self.device)
        target_padded_chars = self.vocab.tgt.to_input_tensor_char(target, device=self.device)

        enc_hiddens, dec_init_state = self.encode(source_padded_chars, source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded_chars)

        P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1)

        # Zero out log-probabilities at positions that are padding in the target.
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Log-probability of each gold target word (position 0 is never predicted).
        target_gold_words_log_prob = torch.gather(
            P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum()

        if self.charDecoder is not None:
            max_word_len = target_padded_chars.shape[-1]
            target_chars = target_padded_chars[1:].contiguous().view(-1, max_word_len)
            # Flatten (tgt_len, b, h) to (tgt_len * b, h). Use the configured hidden
            # size instead of a hard-coded 256 so any hidden_size works.
            target_outputs = combined_outputs.view(-1, self.hidden_size)

            # The combined outputs serve as both the hidden and the cell state of
            # the character decoder.
            oovs_losses = self.charDecoder.train_forward(
                target_chars.t(),
                (target_outputs.unsqueeze(0), target_outputs.unsqueeze(0)))
            scores = scores - oovs_losses

        return scores

    def encode(self, source_padded: torch.Tensor,
               source_lengths: List[int]) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """Run the encoder and derive the decoder's initial state.

        @param source_padded (Tensor): Tensor of padded source sentences with shape
            (src_len, b, max_word_length), where b = batch size and src_len = maximum
            source sentence length. Sentences must be sorted from longest to shortest
            (required by `pack_padded_sequence` with its default arguments).
        @param source_lengths (List[int]): Actual length of each source sentence

        @returns enc_hiddens (Tensor): Encoder hidden states with shape
            (b, src_len, h*2), where h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Decoder's initial hidden
            state and cell, each projected from the concatenated final
            forward/backward encoder states.
        """
        X = self.model_embeddings_source(source_padded)
        X_packed = pack_padded_sequence(X, source_lengths)
        enc_hiddens, (last_hidden, last_cell) = self.encoder(X_packed)
        enc_hiddens, _ = pad_packed_sequence(enc_hiddens)
        # (src_len, b, 2h) -> (b, src_len, 2h)
        enc_hiddens = enc_hiddens.permute(1, 0, 2)

        # Index 0 is the forward direction, index 1 the backward direction.
        init_decoder_hidden = self.h_projection(torch.cat((last_hidden[0], last_hidden[1]), dim=1))
        init_decoder_cell = self.c_projection(torch.cat((last_cell[0], last_cell[1]), dim=1))
        dec_init_state = (init_decoder_hidden, init_decoder_cell)

        return enc_hiddens, dec_init_state

    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
               dec_init_state: Tuple[torch.Tensor, torch.Tensor],
               target_padded: torch.Tensor) -> torch.Tensor:
        """Compute combined output vectors for a batch.

        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
            b = batch size, src_len = maximum source sentence length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len)
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        @param target_padded (Tensor): Gold-standard padded target sentences
            (tgt_len, b, max_word_length), where tgt_len = maximum target sentence length.

        @returns combined_outputs (Tensor): One combined output vector per decoded
            step, stacked along dim 0 (the last target position is dropped, so
            there are tgt_len - 1 steps); each step has shape (b, h).
        """
        # Chop off the final token: it is never fed to the decoder as an input.
        target_padded = target_padded[:-1]

        dec_state = dec_init_state

        # The previous combined output vector o_{t-1} starts as zeros.
        batch_size = enc_hiddens.size(0)
        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

        combined_outputs = []

        # The attention projection of the encoder states is the same at every step.
        enc_hiddens_proj = self.att_projection(enc_hiddens)
        Y = self.model_embeddings_target(target_padded)
        for Y_t in Y:  # iterate over time steps (dim 0)
            Ybar_t = torch.cat([Y_t, o_prev], dim=-1)
            dec_state, o_t, _ = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks)
            combined_outputs.append(o_t)
            o_prev = o_t

        return torch.stack(combined_outputs)

    def step(self, Ybar_t: torch.Tensor,
             dec_state: Tuple[torch.Tensor, torch.Tensor],
             enc_hiddens: torch.Tensor,
             enc_hiddens_proj: torch.Tensor,
             enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """Compute one forward step of the LSTM decoder, including attention.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape
            (b, e + h), where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Previous decoder hidden state and
            cell, both with shape (b, h).
        @param enc_hiddens (Tensor): Encoder hidden states, shape (b, src_len, h * 2).
        @param enc_hiddens_proj (Tensor): Encoder hidden states projected from
            (h * 2) to h, shape (b, src_len, h).
        @param enc_masks (Tensor): Sentence masks of shape (b, src_len); non-zero
            entries mark padding positions. May be None to disable masking.

        @returns dec_state (tuple(Tensor, Tensor)): New decoder hidden state and cell.
        @returns combined_output (Tensor): Combined output at this timestep, shape (b, h).
        @returns e_t (Tensor): Attention scores of shape (b, src_len), with masked
            positions set to -inf. Returned mainly for sanity checking.
        """
        dec_state = self.decoder(Ybar_t, dec_state)
        dec_hidden = dec_state[0]

        # Multiplicative attention scores: (b, src_len, h) x (b, h, 1) -> (b, src_len)
        e_t = torch.bmm(enc_hiddens_proj, dec_hidden.unsqueeze(2)).squeeze(2)

        # Set e_t to -inf where enc_masks is non-zero. A boolean mask is used
        # because uint8 masks are deprecated for masked_fill_.
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

        alpha_t = F.softmax(e_t, dim=-1)
        # Attention-weighted sum of encoder states: (b, 1, src_len) x (b, src_len, 2h)
        a_t = torch.bmm(alpha_t.unsqueeze(1), enc_hiddens).squeeze(1)
        U_t = torch.cat([dec_hidden, a_t], 1)
        V_t = self.combined_output_projection(U_t)
        combined_output = self.dropout(torch.tanh(V_t))

        return dec_state, combined_output, e_t

    def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor:
        """Generate sentence masks for encoder hidden states.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h); only the
            first two dimensions are used.
        @param source_lengths (List[int]): Actual length of each sentence in the batch.

        @returns enc_masks (Tensor): Float tensor of shape (b, src_len) that is 1 at
            padding positions (index >= sentence length) and 0 elsewhere.
        """
        enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float)
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1
        return enc_masks.to(self.device)

    def beam_search(self, src_sent: List[str], beam_size: int = 5,
                    max_decoding_time_step: int = 70) -> "List[Hypothesis]":
        """Translate a single source sentence with beam search.

        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to unroll
            the decoding RNN

        @returns hypotheses (List[Hypothesis]): completed hypotheses sorted by
            descending score; each has two fields:
                value: List[str]: the decoded target sentence as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor_char([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        vocab_size = len(self.vocab.tgt)

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            # Share the single source encoding across all live hypotheses.
            exp_src_encodings = src_encodings.expand(
                hyp_num, src_encodings.size(1), src_encodings.size(2))
            exp_src_encodings_att_linear = src_encodings_att_linear.expand(
                hyp_num, src_encodings_att_linear.size(1), src_encodings_att_linear.size(2))

            y_tm1 = self.vocab.tgt.to_input_tensor_char(
                [[hyp[-1]] for hyp in hypotheses], device=self.device)
            y_t_embed = self.model_embeddings_target(y_tm1)
            y_t_embed = torch.squeeze(y_t_embed, dim=0)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            (h_t, cell_t), att_t, _ = self.step(
                x, h_tm1, exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None)

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            continuing_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(continuing_hyp_scores, k=live_hyp_num)

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            decoderStatesForUNKsHere = []
            for cand_pos, cand_new_hyp_score in zip(top_cand_hyp_pos.tolist(),
                                                    top_cand_hyp_scores.tolist()):
                # A flat position encodes (hypothesis index, word id). Integer
                # divmod on Python ints is used because `/` on integer tensors is
                # true division and would produce float indices.
                prev_hyp_id, hyp_word_id = divmod(cand_pos, vocab_size)

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]

                # Record the output layer in case UNK was generated; the numeric
                # suffix indexes into decoderStatesForUNKsHere.
                # NOTE(review): when self.charDecoder is None this numbered
                # placeholder is never replaced and stays in the hypothesis.
                if hyp_word == "<unk>":
                    hyp_word = "<unk>" + str(len(decoderStatesForUNKsHere))
                    decoderStatesForUNKsHere.append(att_t[prev_hyp_id])

                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    completed_hypotheses.append(
                        Hypothesis(value=new_hyp_sent[1:-1], score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(decoderStatesForUNKsHere) > 0 and self.charDecoder is not None:
                # Spell out each UNK with the character decoder.
                decoderStatesForUNKsHere = torch.stack(decoderStatesForUNKsHere, dim=0)
                decodedWords = self.charDecoder.decode_greedy(
                    (decoderStatesForUNKsHere.unsqueeze(0), decoderStatesForUNKsHere.unsqueeze(0)),
                    max_length=21, device=self.device)
                assert len(decodedWords) == decoderStatesForUNKsHere.size()[0], "Incorrect number of decoded words"
                for hyp in new_hypotheses:
                    if hyp[-1].startswith("<unk>"):
                        # Strip the 5-character "<unk>" prefix to recover the index.
                        hyp[-1] = decodedWords[int(hyp[-1][5:])]

            if len(completed_hypotheses) == beam_size:
                break

            live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device)

        # Nothing finished within the step budget: fall back to the first live hypothesis.
        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(
                Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)
        return completed_hypotheses

    @property
    def device(self) -> torch.device:
        """Return the device on which the model's tensors live.

        The device of the attention projection weight is used as the reference.
        """
        return self.att_projection.weight.device

    @staticmethod
    def load(model_path: str, no_char_decoder=False):
        """Load the model from a file.

        @param model_path (str): path to model
        @param no_char_decoder (bool): forwarded to the constructor
        @returns model (NMT): model rebuilt from the stored args, vocab and state dict
        """
        # NOTE: torch.load unpickles the file; only load checkpoints from trusted sources.
        # map_location keeps every storage on CPU regardless of where it was saved.
        params = torch.load(model_path, map_location=lambda storage, loc: storage)
        args = params['args']
        model = NMT(vocab=params['vocab'], no_char_decoder=no_char_decoder, **args)
        model.load_state_dict(params['state_dict'])
        return model

    def save(self, path: str):
        """Save the model to a file.

        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args': dict(embed_size=self.model_embeddings_source.embed_size,
                         hidden_size=self.hidden_size,
                         dropout_rate=self.dropout_rate),
            'vocab': self.vocab,
            'state_dict': self.state_dict()
        }

        torch.save(params, path)
class DPPNMT(nn.Module): """ Simple Neural Machine Translation Model: - Bidrectional LSTM Encoder - Unidirection LSTM Decoder - Global Attention Model (Luong, et al. 2015) """ def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2, no_char_decoder=False, nmt_model=None): """ Init NMT Model. @param embed_size (int): Embedding size (dimensionality) @param hidden_size (int): Hidden Size (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention @param nmt_model (NMT): a5 NMT Model (without DPP) to initialize layers with """ super(DPPNMT, self).__init__() if nmt_model is None: self.model_embeddings_source = ModelEmbeddings( embed_size, vocab.src) self.model_embeddings_target = ModelEmbeddings( embed_size, vocab.tgt) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab self.embed_size = embed_size self.encoder = nn.LSTM(embed_size, hidden_size, bidirectional=True) self.decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size) self.h_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False) self.c_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False) self.att_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False) self.combined_output_projection = nn.Linear(hidden_size * 2 + hidden_size, hidden_size, bias=False) self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False) self.dropout = nn.Dropout(self.dropout_rate) if not no_char_decoder: self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt) else: self.charDecoder = None else: self.model_embeddings_source = nmt_model.model_embeddings_source self.model_embeddings_target = nmt_model.model_embeddings_target self.hidden_size = nmt_model.hidden_size self.dropout_rate = nmt_model.dropout_rate self.vocab = nmt_model.vocab self.embed_size = nmt_model.model_embeddings_source.embed_size self.encoder 
= nmt_model.encoder self.decoder = nmt_model.decoder self.h_projection = nmt_model.h_projection self.c_projection = nmt_model.c_projection self.att_projection = nmt_model.att_projection self.combined_output_projection = nmt_model.combined_output_projection self.target_vocab_projection = nmt_model.target_vocab_projection self.dropout = nmt_model.dropout self.charDecoder = nmt_model.charDecoder def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor: """ Take a mini-batch of source and target sentences, compute the log-likelihood of target sentences under the language models learned by the NMT system. @param source (List[List[str]]): list of source sentence tokens @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>` @returns scores (Tensor): a variable/tensor of shape (b, ) representing the log-likelihood of generating the gold-standard target sentence for each example in the input batch. Here b = batch size. """ # Compute sentence lengths source_lengths = [len(s) for s in source] # Convert list of lists into tensors target_padded = self.vocab.tgt.to_input_tensor(target, device=self.device) source_padded_chars = self.vocab.src.to_input_tensor_char( source, device=self.device) target_padded_chars = self.vocab.tgt.to_input_tensor_char( target, device=self.device) enc_hiddens, dec_init_state = self.encode(source_padded_chars, source_lengths) enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths) combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded_chars) P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1) # Zero out, probabilities for which we have nothing in the target text target_masks = (target_padded != self.vocab.tgt['<pad>']).float() # Compute log probability of generating true target words target_gold_words_log_prob = torch.gather( P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:] scores = 
target_gold_words_log_prob.sum( ) # mhahn2 Small modification from A4 code. if self.charDecoder is not None: max_word_len = target_padded_chars.shape[-1] target_words = target_padded[1:].contiguous().view(-1) target_chars = target_padded_chars[1:].contiguous().view( -1, max_word_len) target_outputs = combined_outputs.view(-1, 256) target_chars_oov = target_chars #torch.index_select(target_chars, dim=0, index=oovIndices) rnn_states_oov = target_outputs #torch.index_select(target_outputs, dim=0, index=oovIndices) oovs_losses = self.charDecoder.train_forward( target_chars_oov.t(), (rnn_states_oov.unsqueeze(0), rnn_states_oov.unsqueeze(0))) scores = scores - oovs_losses return scores def encode( self, source_padded: torch.Tensor, source_lengths: List[int] ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """ Apply the encoder to source sentences to obtain encoder hidden states. Additionally, take the final states of the encoder and project them to obtain initial states for decoder. @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b, max_word_length), where b = batch_size, src_len = maximum source sentence length. Note that these have already been sorted in order of longest to shortest sentence. @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial hidden state and cell. 
""" enc_hiddens, dec_init_state = None, None X = self.model_embeddings_source(source_padded) X_packed = pack_padded_sequence(X, source_lengths) enc_hiddens, (last_hidden, last_cell) = self.encoder(X_packed) (enc_hiddens, _) = pad_packed_sequence(enc_hiddens) enc_hiddens = enc_hiddens.permute(1, 0, 2) init_decoder_hidden = self.h_projection( torch.cat((last_hidden[0], last_hidden[1]), dim=1)) init_decoder_cell = self.c_projection( torch.cat((last_cell[0], last_cell[1]), dim=1)) dec_init_state = (init_decoder_hidden, init_decoder_cell) return enc_hiddens, dec_init_state def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor, dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor: """Compute combined output vectors for a batch. @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where b = batch size, src_len = maximum source sentence length. @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b, max_word_length), where tgt_len = maximum target sentence length, b = batch size. @returns combined_outputs (Tensor): combined output tensor (tgt_len, b, h), where tgt_len = maximum target sentence length, b = batch_size, h = hidden size """ # Chop of the <END> token for max length sentences. 
target_padded = target_padded[:-1] # Initialize the decoder state (hidden and cell) dec_state = dec_init_state # Initialize previous combined output vector o_{t-1} as zero batch_size = enc_hiddens.size(0) o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device) # Initialize a list we will use to collect the combined output o_t on each step combined_outputs = [] enc_hiddens_proj = self.att_projection(enc_hiddens) Y = self.model_embeddings_target(target_padded) for Y_t in torch.split(Y, split_size_or_sections=1): Y_t = Y_t.squeeze(0) Ybar_t = torch.cat([Y_t, o_prev], dim=-1) dec_state, o_t, _ = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks) combined_outputs.append(o_t) o_prev = o_t combined_outputs = torch.stack(combined_outputs) return combined_outputs def step( self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor, torch.Tensor], enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor, enc_masks: torch.Tensor ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]: """ Compute one forward step of the LSTM decoder, including the attention computation. @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder, where b = batch size, e = embedding size, h = hidden size. @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size. First tensor is decoder's prev hidden state, second tensor is decoder's prev cell. @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size, src_len = maximum source length, h = hidden size. @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h), where b = batch size, src_len = maximum source length, h = hidden size. @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len), where b = batch size, src_len is maximum source length. 
@returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size. First tensor is decoder's new hidden state, second tensor is decoder's new cell. @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size. @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution. Note: You will not use this outside of this function. We are simply returning this value so that we can sanity check your implementation. """ combined_output = None dec_state = self.decoder(Ybar_t, dec_state) (dec_hidden, dec_cell) = dec_state e_t = torch.bmm(enc_hiddens_proj, dec_hidden.unsqueeze(2)).squeeze(2) # Set e_t to -inf where enc_masks has 1 if enc_masks is not None: e_t.data.masked_fill_(enc_masks.byte(), -float('inf')) alpha_t = F.softmax(e_t, dim=-1) alpha_t_view = (alpha_t.size(0), 1, alpha_t.size(1)) a_t = torch.bmm(alpha_t.view(*alpha_t_view), enc_hiddens).squeeze(1) U_t = torch.cat([dec_hidden, a_t], 1) V_t = self.combined_output_projection(U_t) O_t = self.dropout(torch.tanh(V_t)) combined_output = O_t return dec_state, combined_output, e_t def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor: """ Generate sentence masks for encoder hidden states. @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size, src_len = max source length, h = hidden size. @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch. @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len), where src_len = max source length, h = hidden size. 
""" enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float) for e_id, src_len in enumerate(source_lengths): enc_masks[e_id, src_len:] = 1 return enc_masks.to(self.device) def beam_search(self, src_sent: List[str], beam_size: int = 5, max_decoding_time_step: int = 70) -> List[Hypothesis]: """ Given a single source sentence, perform beam search, yielding translations in the target language. @param src_sent (List[str]): a single source sentence (words) @param beam_size (int): beam size @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields: value: List[str]: the decoded target sentence, represented as a list of words score: float: the log-likelihood of the target sentence """ src_sents_var = self.vocab.src.to_input_tensor_char([src_sent], self.device) src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)]) src_encodings_att_linear = self.att_projection(src_encodings) h_tm1 = dec_init_vec att_tm1 = torch.zeros(1, self.hidden_size, device=self.device) eos_id = self.vocab.tgt['</s>'] hypotheses = [['<s>']] hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device) completed_hypotheses = [] t = 0 while len(completed_hypotheses ) < beam_size and t < max_decoding_time_step: if PRINT_HYPOTHESIS_TREE: print(sorted(hypotheses)) t += 1 hyp_num = len(hypotheses) exp_src_encodings = src_encodings.expand(hyp_num, src_encodings.size(1), src_encodings.size(2)) exp_src_encodings_att_linear = src_encodings_att_linear.expand( hyp_num, src_encodings_att_linear.size(1), src_encodings_att_linear.size(2)) y_tm1 = self.vocab.tgt.to_input_tensor_char(list( [hyp[-1]] for hyp in hypotheses), device=self.device) y_t_embed = self.model_embeddings_target(y_tm1) y_t_embed = torch.squeeze(y_t_embed, dim=0) x = torch.cat([y_t_embed, att_tm1], dim=-1) (h_t, cell_t), att_t, _ = self.step(x, h_tm1, 
exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None) if TOGGLE_PRINT: print("att_tm1", att_tm1.shape) # num_hyps x target_embed_size print("y_t_embed", y_t_embed.shape) print("x", x.shape) print("h_tm1", h_tm1[0].shape, h_tm1[1].shape) # same as x print("h_t", h_t.shape) print("cell_t", cell_t.shape) print("att_t", att_t.shape) print(hypotheses) # log probabilities over target words log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1) live_hyp_num = beam_size - len(completed_hypotheses) contiuating_hyp_scores = ( hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1) ###### START TOP K HERE ####### # top_cand_hyp_scores, top_cand_hyp_pos = self.topk(contiuating_hyp_scores, live_hyp_num) ###### END TOP K HERE ####### ###### START DPP HERE ####### top_cand_hyp_scores, top_cand_hyp_pos = self.kdpp( att_t, src_encodings, src_encodings_att_linear, h_t, cell_t, contiuating_hyp_scores, live_hyp_num, beam_size, ) if TOGGLE_PRINT: top_cand_hyp_scores_topk, top_cand_hyp_pos_topk = self.topk( contiuating_hyp_scores, live_hyp_num) print('topk', top_cand_hyp_scores_topk) print('kdpp', top_cand_hyp_scores) print('topk', top_cand_hyp_pos_topk) print('kdpp', top_cand_hyp_pos) #### END DPP HERE #### prev_hyp_ids = top_cand_hyp_pos / len(self.vocab.tgt) hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt) new_hypotheses = [] live_hyp_ids = [] new_hyp_scores = [] decoderStatesForUNKsHere = [] for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip( prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores): prev_hyp_id = prev_hyp_id.item() hyp_word_id = hyp_word_id.item() cand_new_hyp_score = cand_new_hyp_score.item() hyp_word = self.vocab.tgt.id2word[hyp_word_id] # Record output layer in case UNK was generated if hyp_word == "<unk>": hyp_word = "<unk>" + str(len(decoderStatesForUNKsHere)) decoderStatesForUNKsHere.append(att_t[prev_hyp_id]) new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word] if hyp_word == '</s>': completed_hypotheses.append( 
Hypothesis(value=new_hyp_sent[1:-1], score=cand_new_hyp_score)) else: new_hypotheses.append(new_hyp_sent) live_hyp_ids.append(prev_hyp_id) new_hyp_scores.append(cand_new_hyp_score) if len(decoderStatesForUNKsHere ) > 0 and self.charDecoder is not None: # decode UNKs decoderStatesForUNKsHere = torch.stack( decoderStatesForUNKsHere, dim=0) decodedWords = self.charDecoder.decode_greedy( (decoderStatesForUNKsHere.unsqueeze(0), decoderStatesForUNKsHere.unsqueeze(0)), max_length=21, device=self.device) assert len(decodedWords) == decoderStatesForUNKsHere.size( )[0], "Incorrect number of decoded words" for hyp in new_hypotheses: if hyp[-1].startswith("<unk>"): hyp[-1] = decodedWords[int(hyp[-1][5:])] #[:-1] if len(completed_hypotheses) == beam_size: break live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device) h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids]) att_tm1 = att_t[live_hyp_ids] hypotheses = new_hypotheses hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device) if len(completed_hypotheses) == 0: completed_hypotheses.append( Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item())) completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True) if PRINT_HYPOTHESES: print(completed_hypotheses) print("**********************") return completed_hypotheses @property def device(self) -> torch.device: """ Determine which device to place the Tensors upon, CPU or GPU. """ return self.att_projection.weight.device @staticmethod def load(model_path: str, no_char_decoder=False): """ Load the model from a file. 
@param model_path (str): path to model """ params = torch.load(model_path, map_location=lambda storage, loc: storage) args = params['args'] nmt_model = NMT(vocab=params['vocab'], no_char_decoder=no_char_decoder, **args) nmt_model.load_state_dict(params['state_dict']) model = DPPNMT(nmt_model=nmt_model, vocab=params['vocab'], no_char_decoder=no_char_decoder, **args) return model def save(self, path: str): """ Save the odel to a file. @param path (str): path to the model """ print('save model parameters to [%s]' % path, file=sys.stderr) params = { 'args': dict(embed_size=self.model_embeddings_source.embed_size, hidden_size=self.hidden_size, dropout_rate=self.dropout_rate), 'vocab': self.vocab, 'state_dict': self.state_dict() } torch.save(params, path) def timer(self, message=None): if PRINT_TIMER: if message is None or not hasattr( self, "last_time") or self.last_time is None: self.last_time = time.time() else: new_time = time.time() print("%s: %f" % (message, new_time - self.last_time)) self.last_time = new_time def topk(self, contiuating_hyp_scores, live_hyp_num): top_cand_hyp_scores, top_cand_hyp_pos = torch.topk( contiuating_hyp_scores, k=live_hyp_num) return top_cand_hyp_scores, top_cand_hyp_pos def word_embeddings(self): if not hasattr(self, "word_embeddings_cached"): self.timer() words = [[self.vocab.tgt.id2word[id]] for id in range(len(self.vocab.tgt.word2id))] words_char_tensor = self.vocab.tgt.to_input_tensor_char( words, device=self.device) self.word_embeddings_cached = self.model_embeddings_target( words_char_tensor).squeeze(0) if TOGGLE_PRINT: print("embeddings", embeddings.shape) self.timer("Embeddings") return self.word_embeddings_cached def kdpp(self, att_t, src_encodings, src_encodings_att_linear, h_t, cell_t, contiuating_hyp_scores, live_hyp_num, beam_size): # for every element in contiuating_hyp_scores, I need to get the target # word embedding, take another step, get that output, normalize, and multiply by # the corresponding element of log_p_t # 
TODO: need to duplicate each num_hyps times self.timer() #top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(contiuating_hyp_scores, k=INITIAL_SAMPLE_SIZE) top_cand_hyp_scores, top_cand_hyp_pos = torch.topk( contiuating_hyp_scores, k=INITIAL_SAMPLE_SIZE_RATIO * beam_size) self.timer("topk") vocab_size = len(self.vocab.tgt.word2id) num_hyps, embed_size = att_t.shape # TODO: minimize data movement # print("x", x.shape) # att_t_repeated = att_t.repeat(1, vocab_size).view(-1, embed_size) # embeddings_repeated = embeddings.repeat(1, vocab_size).view(-1, embed_size) # x = torch.cat([embeddings_repeated, att_t_repeated], dim=-1) # x = x[top_cand_hyp_pos] embeddings = self.word_embeddings() # print(top_cand_hyp_pos) x_list = [] for hyp_pos in top_cand_hyp_pos: emb_hyp = embeddings[hyp_pos % vocab_size] att_hyp = att_t[hyp_pos / vocab_size] x_partial = torch.cat([emb_hyp, att_hyp]) x_list.append(x_partial.unsqueeze(0)) x = torch.cat(x_list, dim=0) self.timer("newx") batch_size = x.shape[0] new_exp_src_encodings = src_encodings.expand(batch_size, src_encodings.size(1), src_encodings.size(2)) new_exp_src_encodings_att_linear = src_encodings_att_linear.expand( batch_size, src_encodings_att_linear.size(1), src_encodings_att_linear.size(2)) # Might have to stretch h_t, and cell_t # new_h_t = h_t.repeat(1, vocab_size).view(-1, embed_size) # new_cell_t = cell_t.repeat(1, vocab_size).view(-1, embed_size) # new_h_t = new_h_t[top_cand_hyp_pos] # new_cell_t = new_cell_t[top_cand_hyp_pos] self.timer() new_h_t_list = [] new_cell_t_list = [] for hyp_pos in top_cand_hyp_pos: h_t_hyp = h_t[hyp_pos / vocab_size] cell_t_hyp = cell_t[hyp_pos / vocab_size] new_h_t_list.append(h_t_hyp.unsqueeze(0)) new_cell_t_list.append(cell_t_hyp.unsqueeze(0)) new_h_t = torch.cat(new_h_t_list, dim=0) new_cell_t = torch.cat(new_cell_t_list, dim=0) self.timer("new_h_t/cell_t") (h_t_dpp, _), _, _ = self.step(x, (new_h_t, new_cell_t), new_exp_src_encodings, new_exp_src_encodings_att_linear, enc_masks=None) 
self.timer("step") # num_hyps = len(contiuating_hyp_scores.shape[0])/len(self.vocab.tgt) norms = torch.norm(h_t_dpp, p=2, dim=1, keepdim=True) #if norms.is_cuda: # norms = norms.cpu() unit_vectors = h_t_dpp.div(norms.expand_as(h_t_dpp)) # new_p_t = log_p_t.repeat(1, vocab_size).view(-1, vocab_size) # print("new_p_t", log_p_t.shape) # TODO: this returns e^{scores}... correct? quality_scores = torch.exp( top_cand_hyp_scores.unsqueeze(1)).expand_as(unit_vectors) # TODO: maybe normalize the quality_scores? quality_scores = torch.pow(quality_scores, 1 / 2) features = unit_vectors * quality_scores self.timer("scores") L = torch.mm(features, features.t()).cpu() self.timer("L") try: new_top_cand_hyp_pos = sample_k_dpp(L, k=live_hyp_num) except Exception as e: print("Error sampling from L, falling back to top k: %s" % e) return self.topk(contiuating_hyp_scores, live_hyp_num) if ADD_TOP_N > 0: new_top_cand_hyp_pos = np.unique( np.append(new_top_cand_hyp_pos, range(ADD_TOP_N))) self.timer("sample_k_dpp") top_cand_hyp_pos = top_cand_hyp_pos[new_top_cand_hyp_pos] # top_cand_hyp_scores = contiuating_hyp_scores[top_cand_hyp_pos].squeeze(0) top_cand_hyp_scores = contiuating_hyp_scores[top_cand_hyp_pos] scores1, pos1 = self.topk(contiuating_hyp_scores, live_hyp_num) # print('topk pos', pos1) # print('top_cand_hyp_pos', top_cand_hyp_pos) # print('topk scores', scores1) # print('top_cand_hyp_pos', top_cand_hyp_scores) if TOGGLE_PRINT: print("vocab size", vocab_size) print("att_t_repeated", att_t_repeated.shape) print("top_cand_hyp_pos", top_cand_hyp_pos.shape) print("new_x", x.shape) print("src_encodings", new_exp_src_encodings.shape) print("src_encodings_att", new_exp_src_encodings_att_linear.shape) print("new_h_t", new_h_t.shape) print("new_cell_t", new_cell_t.shape) print("hidden", h_t_dpp.shape) print("norms", norms.shape) print("unit_vectors", unit_vectors.shape) print("L", L.shape) print("L", L) print("new_top_cand_hyp_pos", new_top_cand_hyp_pos) print(top_cand_hyp_pos) 
print("new_top_hyp_pos", top_cand_hyp_pos.shape) print("new_top_hyp_scores", top_cand_hyp_scores.shape) print('top chosen: ', new_top_cand_hyp_pos) return top_cand_hyp_scores, top_cand_hyp_pos
class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidirectional LSTM Encoder
        - Unidirectional LSTM Decoder
        - Global Attention Model

    Words are embedded through character-level ``ModelEmbeddings``; an optional
    ``CharDecoder`` is trained alongside and is used during beam search to spell
    out words for which the word-level decoder produced ``<unk>``.
    """

    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2, no_char_decoder=False):
        """ Initialize the NMT Model.

        :param int embed_size: Embedding size (dimensionality)
        :param int hidden_size: Hidden Size (dimensionality)
        :param Vocab vocab: Vocabulary object containing src and tgt languages
                            See vocab.py for documentation.
        :param float dropout_rate: Dropout probability, for the attention combination layer
        :param bool no_char_decoder: if True, no character-level decoder is created
                                     (``self.charDecoder`` is set to None)
        """
        super(NMT, self).__init__()

        self.model_embeddings_source = ModelEmbeddings(embed_size, vocab.src)
        self.model_embeddings_target = ModelEmbeddings(embed_size, vocab.tgt)

        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        self.encoder = nn.LSTM(embed_size, hidden_size, bidirectional=True)
        # The decoder input is the word embedding concatenated with the previous combined output.
        self.decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size)

        # Project the concatenated (backward, forward) final encoder hidden state down to size h.
        self.h_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
        # Same projection for the concatenated final encoder cell state.
        self.c_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
        # Projects encoder hidden states (2h) to h so they can be scored against the decoder state.
        self.att_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
        # Maps [attention output (2h) ; decoder hidden (h)] to the combined output (h).
        self.combined_output_projection = nn.Linear(3 * hidden_size, hidden_size, bias=False)
        # Maps the combined output (h) to scores over the target vocabulary.
        self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False)
        self.dropout = nn.Dropout(dropout_rate)

        if not no_char_decoder:
            self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt)
        else:
            self.charDecoder = None

    def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        :param List[List[str]] source: list of source sentence tokens
        :param List[List[str]] target: list of target sentence tokens, wrapped by `<s>` and `</s>`
        :return Tensor: a single-value tensor: the log-likelihood of the gold-standard target
                        words summed over the whole batch, minus the character-decoder loss
                        when a character decoder is present.
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Word-level target ids are kept for scoring; the encoder/decoder consume character tensors.
        target_padded = self.vocab.tgt.to_input_tensor(target, device=self.device)
        source_padded_chars = self.vocab.src.to_input_tensor_char(source, device=self.device)
        target_padded_chars = self.vocab.tgt.to_input_tensor_char(target, device=self.device)

        enc_hiddens, dec_init_state = self.encode(source_padded_chars, source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded_chars)

        # Log-probabilities over the target vocabulary for every decoder step (padded steps included)
        P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1)

        # Zero out probabilities for which we have nothing in the target text (pad tokens)
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Log probability of generating the true target words (ignoring the start token)
        gold_log_prob = torch.gather(P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1)
        target_gold_words_log_prob = gold_log_prob * target_masks[1:]
        scores = target_gold_words_log_prob.sum()

        if self.charDecoder is not None:
            max_word_len = target_padded_chars.shape[-1]

            # One row of characters per target word, skipping the leading <s> of every sentence.
            target_chars = target_padded_chars[1:].reshape(-1, max_word_len)
            # One combined-output vector per target word. The width must follow the configured
            # hidden size rather than a fixed constant, otherwise any hidden_size != 256 breaks.
            target_outputs = combined_outputs.view(-1, self.hidden_size)

            # The same vector initialises both the hidden and the cell state of the char decoder.
            oovs_losses = self.charDecoder.train_forward(
                target_chars.t(),
                (target_outputs.unsqueeze(0), target_outputs.unsqueeze(0)))
            scores = scores - oovs_losses

        return scores

    def encode(self, source_padded: torch.Tensor, source_lengths: List[int]) -> \
            Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
        Additionally, take the final states of the encoder and project them to obtain
        initial states for decoder.

        :param Tensor source_padded: Tensor of padded source sentences with shape
                                     (src_len, b, max_word_length), where b = batch_size,
                                     src_len = maximum source sentence length (already sorted
                                     in order of longest to shortest sentence).
        :param List[int] source_lengths: List of actual lengths for each of the source
                                         sentences in the batch.
        :return Tensor: Tensor of hidden units with shape (b, src_len, h*2), where b = batch size,
                        src_len = maximum source sentence length, h = hidden size.
        :return tuple(Tensor, Tensor): Tuple of tensors representing the decoder's initial
                                       hidden state and cell.
        """
        X = self.model_embeddings_source(source_padded)
        X = nn.utils.rnn.pack_padded_sequence(X, source_lengths)
        enc_hiddens, (last_hidden, last_cell) = self.encoder(X)
        # batch_first=True returns the hidden states batch-major.
        enc_hiddens, _ = nn.utils.rnn.pad_packed_sequence(enc_hiddens, batch_first=True)

        # Concatenate the two directions' final states, then project each down to size h.
        init_decoder_hidden = self.h_projection(torch.cat((last_hidden[0], last_hidden[1]), 1))
        init_decoder_cell = self.c_projection(torch.cat((last_cell[0], last_cell[1]), 1))

        return enc_hiddens, (init_decoder_hidden, init_decoder_cell)

    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
               dec_init_state: Tuple[torch.Tensor, torch.Tensor],
               target_padded: torch.Tensor) -> torch.Tensor:
        """Compute combined output vectors for a batch.

        :param Tensor enc_hiddens: Hidden states (b, src_len, h*2), where b = batch size,
                                   src_len = maximum source sentence length, h = hidden size.
        :param Tensor enc_masks: Tensor of sentence masks (b, src_len), where b = batch size,
                                 src_len = maximum source sentence length.
        :param tuple(Tensor, Tensor) dec_init_state: Initial state and cell for decoder
        :param Tensor target_padded: Gold-standard padded target sentences
                                     (tgt_len, b, max_word_length), where tgt_len = maximum
                                     target sentence length, b = batch size.
        :return Tensor: combined output tensor (tgt_len, b, h), where tgt_len = maximum target
                        sentence length, b = batch_size, h = hidden size
        """
        # Remove the <END> token for max length sentences.
        target_padded = target_padded[:-1]

        # Initialize the decoder state (hidden and cell)
        dec_state = dec_init_state

        # Initialize previous combined output vector o_{t-1} as zero
        batch_size = enc_hiddens.size(0)
        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

        # Collects the combined output o_t of each decoder step
        combined_outputs = []

        # The attention projection does not depend on the time step, so compute it once.
        enc_hiddens_proj = self.att_projection(enc_hiddens)
        Y = self.model_embeddings_target(target_padded)

        for Y_t in torch.split(Y, 1, 0):
            Y_t = torch.squeeze(Y_t, dim=0)
            Ybar_t = torch.cat((Y_t, o_prev), dim=1)
            dec_state, o_t, _ = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks)
            combined_outputs.append(o_t)
            o_prev = o_t

        return torch.stack(combined_outputs, dim=0)

    def step(self, Ybar_t: torch.Tensor,
             dec_state: Tuple[torch.Tensor, torch.Tensor],
             enc_hiddens: torch.Tensor,
             enc_hiddens_proj: torch.Tensor,
             enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        :param Tensor Ybar_t: Concatenated Tensor of [Y_t o_prev], with shape (b, e + h).
                              The input for the decoder, where b = batch size,
                              e = embedding size, h = hidden size.
        :param tuple(Tensor, Tensor) dec_state: Tuple of tensors both with shape (b, h), where
                              b = batch size, h = hidden size. First tensor is decoder's prev
                              hidden state, second tensor is decoder's prev cell.
        :param Tensor enc_hiddens: Encoder hidden states Tensor, with shape (b, src_len, h * 2),
                              where b = batch size, src_len = maximum source length,
                              h = hidden size.
        :param Tensor enc_hiddens_proj: Encoder hidden states Tensor, projected from (h * 2)
                              to h. Tensor is of shape (b, src_len, h).
        :param Tensor enc_masks: Tensor of sentence masks shape (b, src_len), or None to
                              attend over every source position.
        :return tuple(Tensor, Tensor): Tuple of tensors both shape (b, h). First tensor is
                              decoder's new hidden state, second tensor is decoder's new cell.
        :return Tensor: Combined output Tensor at timestep t, shape (b, h).
        :return Tensor: Tensor of shape (b, src_len) holding the attention scores e_t
                              (pre-softmax; masked positions are set to -inf).
        """
        dec_state = self.decoder(Ybar_t, dec_state)
        dec_hidden, _ = dec_state

        # Attention scores: dot product of each projected encoder state with the decoder state.
        e_t = torch.bmm(enc_hiddens_proj, torch.unsqueeze(dec_hidden, 2))
        e_t = torch.squeeze(e_t, dim=2)

        # Set e_t to -inf where enc_masks has 1, so padded positions get zero attention weight.
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

        alpha_t = nn.functional.softmax(e_t, 1)
        alpha_t = torch.unsqueeze(alpha_t, dim=1)

        # Attention output: attention-weighted sum of the encoder hidden states.
        a_t = torch.bmm(alpha_t, enc_hiddens)
        a_t = torch.squeeze(a_t, dim=1)

        U_t = torch.cat((a_t, dec_hidden), dim=1)
        V_t = self.combined_output_projection(U_t)
        combined_output = self.dropout(torch.tanh(V_t))

        return dec_state, combined_output, e_t

    def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor:
        """ Generate sentence masks for encoder hidden states.

        :param Tensor enc_hiddens: encodings of shape (b, src_len, 2*h), where b = batch size,
                                   src_len = max source length, h = hidden size.
        :param List[int] source_lengths: List of actual lengths for each of the sentences
                                         in the batch.
        :return Tensor: Float tensor of sentence masks of shape (b, src_len) with 1 at padded
                        positions and 0 elsewhere, placed on the model's device.
        """
        enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float)
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1
        return enc_masks.to(self.device)

    def beam_search(self, src_sent: List[str], beam_size: int = 5,
                    max_decoding_time_step: int = 70) -> "List[Hypothesis]":
        """ Given a single source sentence, perform beam search, yielding translations in
        the target language.

        :param List[str] src_sent: a single source sentence (words)
        :param int beam_size: beam size
        :param int max_decoding_time_step: maximum number of time steps to unroll the decoding RNN
        :return List[Hypothesis]: a list of hypothesis sorted by decreasing score, each
                hypothesis has two fields:
                value List[str]: the decoded target sentence, represented as a list of words
                score float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor_char([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        vocab_size = len(self.vocab.tgt)

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            # Share the single source encoding across all live hypotheses (no copy).
            exp_src_encodings = src_encodings.expand(hyp_num,
                                                     src_encodings.size(1),
                                                     src_encodings.size(2))
            exp_src_encodings_att_linear = src_encodings_att_linear.expand(
                hyp_num,
                src_encodings_att_linear.size(1),
                src_encodings_att_linear.size(2))

            # Feed the last word of every live hypothesis as the next decoder input.
            y_tm1 = self.vocab.tgt.to_input_tensor_char([[hyp[-1]] for hyp in hypotheses],
                                                        device=self.device)
            y_t_embed = self.model_embeddings_target(y_tm1)
            y_t_embed = torch.squeeze(y_t_embed, dim=0)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            (h_t, cell_t), att_t, _ = self.step(x, h_tm1, exp_src_encodings,
                                                exp_src_encodings_att_linear, enc_masks=None)

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            # Flattened (hypothesis, word) scores; position = hyp_index * vocab_size + word_id.
            contiuating_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(contiuating_hyp_scores, k=live_hyp_num)

            # Integer (floor) division is required here: `/` performs true division on integer
            # tensors in current PyTorch and would yield float ids unusable as list indices.
            prev_hyp_ids = top_cand_hyp_pos // vocab_size
            hyp_word_ids = top_cand_hyp_pos % vocab_size

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            decoder_states_for_unks = []
            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids,
                                                                    top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]

                # Record output layer in case UNK was generated; the numeric suffix is the
                # index of the recorded state, resolved after character decoding below.
                if hyp_word == "<unk>":
                    hyp_word = "<unk>" + str(len(decoder_states_for_unks))
                    decoder_states_for_unks.append(att_t[prev_hyp_id])

                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1],
                                                           score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if decoder_states_for_unks and self.charDecoder is not None:  # decode UNKs
                decoder_states_for_unks = torch.stack(decoder_states_for_unks, dim=0)
                decoded_words = self.charDecoder.decode_greedy(
                    (decoder_states_for_unks.unsqueeze(0), decoder_states_for_unks.unsqueeze(0)),
                    max_length=21, device=self.device)
                assert len(decoded_words) == decoder_states_for_unks.size()[0], \
                    "Incorrect number of decoded words"
                for hyp in new_hypotheses:
                    if hyp[-1].startswith("<unk>"):
                        # Replace the "<unk>N" placeholder with the N-th character-decoded word.
                        hyp[-1] = decoded_words[int(hyp[-1][5:])]

            if len(completed_hypotheses) == beam_size:
                break

            # Keep only the decoder states of hypotheses that are still live.
            live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device)

        # Nothing reached </s> within the step budget: fall back to the first live hypothesis.
        if not completed_hypotheses:
            completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:],
                                                   score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)
        return completed_hypotheses

    @property
    def device(self) -> torch.device:
        """ Determine which device to place the Tensors upon, CPU or GPU.

        The device of the attention projection weights is used as the model's device.
        """
        return self.att_projection.weight.device

    @staticmethod
    def load(model_path: str, no_char_decoder=False):
        """ Load the model from a file.

        :param str model_path: path to model
        :param boolean no_char_decoder: whether to build the model without the char-level decoder
        :return NMT: model rebuilt from the stored constructor args, vocab and state dict
        """
        # NOTE(review): torch.load unpickles the checkpoint (it stores the vocab object),
        # so only load files from a trusted source.
        params = torch.load(model_path, map_location=lambda storage, loc: storage)
        args = params['args']
        model = NMT(vocab=params['vocab'], no_char_decoder=no_char_decoder, **args)
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ Save the model to a file.

        Stores the constructor arguments, the vocabulary and the state dict, which is
        the layout `load` reads back.

        :param str path: path to the model parameters
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args': dict(embed_size=self.model_embeddings_source.embed_size,
                         hidden_size=self.hidden_size,
                         dropout_rate=self.dropout_rate),
            'vocab': self.vocab,
            'state_dict': self.state_dict()
        }

        torch.save(params, path)
class NMT(nn.Module): """ Simple Neural Machine Translation Model: - Bidrectional LSTM Encoder - Unidirection LSTM Decoder - Global Attention Model (Luong, et al. 2015) """ def __init__(self, word_embed_size, hidden_size, vocab, dropout_rate=0.3, no_char_decoder=False): """ Init NMT Model. @param word_embed_size (int): Embedding size (dimensionality) of word @param hidden_size (int): Hidden Size (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention """ super(NMT, self).__init__() self.model_embeddings_source = ModelEmbeddings(word_embed_size, vocab.src) self.model_embeddings_target = ModelEmbeddings(word_embed_size, vocab.tgt) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab ### COPY OVER YOUR CODE FROM ASSIGNMENT 4 # default values self.encoder = None self.decoder = None self.h_projection = None self.c_projection = None self.att_projection = None self.combined_output_projection = None self.target_vocab_projection = None self.dropout = None # For sanity check only, not relevant to implementation self.gen_sanity_check = False self.counter = 0 self.encoder = nn.LSTM(word_embed_size, hidden_size, bias=True, bidirectional=True) self.decoder = nn.LSTMCell(word_embed_size + hidden_size, hidden_size, bias=True) self.h_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False) # Wh self.c_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False) # Wc self.att_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False) self.combined_output_projection = nn.Linear(hidden_size * 3, hidden_size, bias=False) # Wu self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False) # Wvocab self.dropout = nn.Dropout(self.dropout_rate) ### END YOUR CODE FROM ASSIGNMENT 4 if not no_char_decoder: self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt) else: self.charDecoder = 
None def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor: """ Take a mini-batch of source and target sentences, compute the log-likelihood of target sentences under the language models learned by the NMT system. @param source (List[List[str]]): list of source sentence tokens @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>` @returns scores (Tensor): a variable/tensor of one number representing the log-likelihood of generating the gold-standard target sentence for each example in the input batch. Here b = batch size. """ # Compute sentence lengths source_lengths = [len(s) for s in source] # Convert list of lists into tensors # source_padded = self.vocab.src.to_input_tensor(source, device=self.device) # Tensor: (src_len, b) ### YOUR CODE HERE for part 1i ### TODO: ### Modify the code lines above as needed to fetch the character-level tensor ### to feed into encode() and decode(). You should: ### - Keep `target_padded` from A4 code above for predictions target_padded = self.vocab.tgt.to_input_tensor( target, device=self.device) # Tensor: (tgt_len, b) ### - Add `source_padded_chars` for character level padded encodings for source source_padded_chars = self.vocab.src.to_input_tensor_char( source, self.device) # tensor of (max_sentence_length, bs, max_word_length) ### - Add `target_padded_chars` for character level padded encodings for target target_padded_chars = self.vocab.tgt.to_input_tensor_char( target, self.device) ### - Modify calls to encode() and decode() to use the character level encodings enc_hiddens, dec_init_state = self.encode(source_padded_chars, source_lengths) enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths) combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded_chars) ### END YOUR CODE P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1) # Zero out, probabilities for which we have nothing in the target text 
target_masks = (target_padded != self.vocab.tgt['<pad>']).float() # Compute log probability of generating true target words target_gold_words_log_prob = torch.gather( P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:] scores = target_gold_words_log_prob.sum( ) # mhahn2 Small modification from A4 code. if self.charDecoder is not None: # note that max_sentence_length is the number of words for the longest sentence # max_word_length is the number of characters for the longest word (of the entire batch) # and bs is the number of sentences in 1 batch # target_padded_chars: tensor of (max_sentence_length, bs, max_word_length) max_word_len = target_padded_chars.shape[-1] # target_padded: tensor of (max_sent_length, bs) target_words = target_padded[1:].contiguous().view( -1) # (max_sent_length * bs), this is not used target_chars = target_padded_chars[1:].view( -1, max_word_len) # (max_sent_length * bs, max_word_length). # Note that we skip the first word of each sentence, which is a <sentence start> token # combined_outputs: tensor shape (tgt_len, bs, hidden size) target_outputs = combined_outputs.view( -1, 256) # (max_sent_length * bs, hidden size) target_chars_oov = target_chars # torch.index_select(target_chars, dim=0, index=oovIndices) rnn_states_oov = target_outputs # torch.index_select(target_outputs, dim=0, index=oovIndices) oovs_losses = self.charDecoder.train_forward( target_chars_oov.t().contiguous(), (rnn_states_oov.unsqueeze(0), rnn_states_oov.unsqueeze(0))) scores = scores - oovs_losses return scores def encode( self, source_padded: torch.Tensor, source_lengths: List[int] ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """ Apply the encoder to source sentences to obtain encoder hidden states. Additionally, take the final states of the encoder and project them to obtain initial states for decoder. 
@param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b, max_word_length), where b = batch_size, src_len = maximum source sentence length. Note that these have already been sorted in order of longest to shortest sentence. @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial hidden state and cell. """ enc_hiddens, dec_init_state = None, None ### COPY OVER YOUR CODE FROM ASSIGNMENT 4 ### 1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings. ### src_len = maximum source sentence length, b = batch size, e = embedding size. Note ### that there is no initial hidden state or cell for the decoder. X = self.model_embeddings_source( source_padded) #(sen_len,bs,wordemb_sz) ### 2. Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`. ### - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X. X = pack_padded_sequence( X, source_lengths ) # since we pad each sentence, we need to call this to pack them into tensor enc_hiddens, (last_hidden, last_cell) = self.encoder(X) # foir LSTM, if (h_0, c_0) is not provided, both h_0 and c_0 default to zero. # enc_hiddens: (sen,bs,h*2) # last_hidden: (2 b/c biLSTM,bs,h) # last_cell: (2,bs,h) ### - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens. enc_hiddens, _ = pad_packed_sequence(enc_hiddens) # (sen_len,bs,h*2) ### - Note that the shape of the tensor returned by the encoder is (src_len, b, h*2) and we want to ### return a tensor of shape (bs, src_len, h*2) as `enc_hiddens`. 
enc_hiddens = enc_hiddens.permute(1, 0, 2) #(bs, src_len, h*2) ### 3. Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell): ### - `init_decoder_hidden`: ### `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards. ### Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h). ### Apply the h_projection layer to this in order to compute init_decoder_hidden. ### This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size last_hidden = torch.cat((last_hidden[0], last_hidden[1]), dim=1) # (2,b,h) -> (b,h*2) init_decoder_hidden = self.h_projection(last_hidden) ### - `init_decoder_cell`: ### `last_cell` is a tensor shape (2, b, h). The first dimension corresponds to forwards ### and backwards. ### Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h). ### Apply the c_projection layer to this in order to compute init_decoder_cell. ### This is c_0^{dec} in the PDF. Here b = batch size, h = hidden size last_cell = torch.cat((last_cell[0], last_cell[1]), dim=1) init_decoder_cell = self.c_projection(last_cell) dec_init_state = init_decoder_hidden, init_decoder_cell ### END YOUR CODE FROM ASSIGNMENT 4 return enc_hiddens, dec_init_state def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor, dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor: """Compute combined output vectors for a batch. @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where b = batch size, src_len = maximum source sentence length. @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b, max_word_length), where tgt_len = maximum target sentence length, b = batch size. 
@returns combined_outputs (Tensor): combined output tensor (tgt_len, b, h), where tgt_len = maximum target sentence length, b = batch_size, h = hidden size """ # Chop of the <END> token for max length sentences. target_padded = target_padded[:-1] # Initialize the decoder state (hidden and cell) dec_state = dec_init_state # Initialize previous combined output vector o_{t-1} as zeros batch_size = enc_hiddens.size(0) o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device) # Initialize a list we will use to collect the combined output o_t on each step combined_outputs = [] ### COPY OVER YOUR CODE FROM ASSIGNMENT 4 ### 1. Apply the attention projection layer to `enc_hiddens` to obtain `enc_hiddens_proj`, ### which should be shape (b, src_len, h), ### where b = batch size, src_len = maximum source length, h = hidden size. ### This is applying W_{attProj} to h^enc, as described in the PDF. enc_hiddens_proj = self.att_projection(enc_hiddens) # (b, src_len, h) ### 2. Construct tensor `Y` of target sentences with shape (tgt_len, b, e) using the target model embeddings. ### where tgt_len = maximum target sentence length, b = batch size, e = embedding size. Y = self.model_embeddings_target( target_padded) # (tgt_len,bs,wordemb_sz) ### 3. Use the torch.split function to iterate over the time dimension of Y. ### Within the loop, this will give you Y_t of shape (1, b, e) where b = batch size, e = embedding size. ### - Squeeze Y_t into a tensor of dimension (b, e). ### - Construct Ybar_t by concatenating Y_t with o_prev on their last dimension ### - Use the step function to compute the the Decoder's next (cell, state) values ### as well as the new combined output o_t. ### - Append o_t to combined_outputs ### - Update o_prev to the new o_t. 
for Y_t in torch.split( Y, 1, dim=0): # same as looping through 1st dimension of this tensor Y_t = torch.squeeze(Y_t, dim=0) Ybar_t = torch.cat([Y_t, o_prev], dim=1) dec_state, o_t, _ = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks) combined_outputs.append(o_t) o_prev = o_t ### 4. Use torch.stack to convert combined_outputs from a list length tgt_len of ### tensors shape (b, h), to a single tensor shape (tgt_len, b, h) ### where tgt_len = maximum target sentence length, b = batch size, h = hidden size. ### combined_outputs = torch.stack(combined_outputs, dim=0) ### END YOUR CODE FROM ASSIGNMENT 4 return combined_outputs def step( self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor, torch.Tensor], enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor, enc_masks: torch.Tensor ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]: """ Compute one forward step of the LSTM decoder, including the attention computation. @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder, where b = batch size, e = embedding size, h = hidden size. @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size. First tensor is decoder's prev hidden state, second tensor is decoder's prev cell. @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size, src_len = maximum source length, h = hidden size. @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h), where b = batch size, src_len = maximum source length, h = hidden size. @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len), where b = batch size, src_len is maximum source length. @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size. 
First tensor is decoder's new hidden state, second tensor is decoder's new cell. @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size. @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution. Note: You will not use this outside of this function. We are simply returning this value so that we can sanity check your implementation. """ combined_output = None ### COPY OVER YOUR CODE FROM ASSIGNMENT 4 ### 1. Apply the decoder to `Ybar_t` and `dec_state`to obtain the new dec_state. dec_state = self.decoder(Ybar_t, dec_state) ### 2. Split dec_state into its two parts (dec_hidden, dec_cell) dec_hidden, dec_cell = dec_state ### 3. Compute the attention scores e_t, a Tensor shape (b, src_len). ### Note: b = batch_size, src_len = maximum source length, h = hidden size. ### ### Hints: ### - dec_hidden is shape (b, h) and corresponds to h^dec_t in the PDF (batched) ### - enc_hiddens_proj is shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched). ### - Use batched matrix multiplication (torch.bmm) to compute e_t (be careful about the input/ output shapes!) ### - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing. ### - When using the squeeze() function make sure to specify the dimension you want to squeeze ### over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1. 
### e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2) # (bs,1,src_len) @ (bs,src_len,2h) = (bs,1,2h) = (bs,2h) ### END YOUR CODE FROM ASSIGNMENT 4 # Set e_t to -inf where enc_masks has 1 # So that when do softmax on these paddings of this sentence, the attribution score will be 0 (e^-inf = 0) # example: sentence [il,a,m,entarte,<PAD>] (max source length = 5) will have enc_masks [0,0,0,0,1] # and with attribution score (pre_softmax) e_t such as [3,-1,0,-2,5], # 5 will be such a high attribution score for a meaningless padding # so we need to neutralize it by applying mask on so that e_t will be [3,-1,0,-2,-inf] if enc_masks is not None: e_t.data.masked_fill_(enc_masks.bool(), -float('inf')) ### COPY OVER YOUR CODE FROM ASSIGNMENT 4 ### 1. Apply softmax to e_t to yield alpha_t alpha_t = F.softmax(e_t, dim=1) # (bs, src_len) ### 2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the ### attention output vector, a_t. ### - alpha_t is shape (b, src_len) ### - enc_hiddens is shape (b, src_len, 2h) ### - a_t should be shape (b, 2h) ### - You will need to do some squeezing and unsqueezing. ### Note: b = batch size, src_len = maximum source length, h = hidden size. ### # att_view = (alpha_t.size(0), 1, alpha_t.size(1)) # a_t = torch.bmm(alpha_t.view(*att_view), enc_hiddens).squeeze(1) # (b,2h,src_len) @ (b,src_len,1) a_t = enc_hiddens.permute(0, 2, 1).bmm(alpha_t.unsqueeze(2)).squeeze(2) ### 3. Concatenate dec_hidden with a_t to compute tensor U_t U_t = torch.cat([dec_hidden, a_t], dim=1) ### 4. Apply the combined output projection layer to U_t to compute tensor V_t V_t = self.combined_output_projection(U_t) ### 5. Compute tensor O_t by first applying the Tanh function and then the dropout layer. 
O_t = self.dropout(torch.tanh(V_t)) ### END YOUR CODE FROM ASSIGNMENT 4 combined_output = O_t return dec_state, combined_output, e_t def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor: """ Generate sentence masks for encoder hidden states. @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size, src_len = max source length, h = hidden size. @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch. @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len), where src_len = max source length, h = hidden size. """ enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float) for e_id, src_len in enumerate(source_lengths): enc_masks[e_id, src_len:] = 1 return enc_masks.to(self.device) def beam_search(self, src_sent: List[str], beam_size: int = 5, max_decoding_time_step: int = 70) -> List[Hypothesis]: """ Given a single source sentence, perform beam search, yielding translations in the target language. 
@param src_sent (List[str]): a single source sentence (words) @param beam_size (int): beam size @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields: value: List[str]: the decoded target sentence, represented as a list of words score: float: the log-likelihood of the target sentence """ src_sents_var = self.vocab.src.to_input_tensor_char([src_sent], self.device) src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)]) src_encodings_att_linear = self.att_projection(src_encodings) h_tm1 = dec_init_vec att_tm1 = torch.zeros(1, self.hidden_size, device=self.device) eos_id = self.vocab.tgt['</s>'] hypotheses = [['<s>']] hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device) completed_hypotheses = [] t = 0 while len(completed_hypotheses ) < beam_size and t < max_decoding_time_step: t += 1 hyp_num = len(hypotheses) exp_src_encodings = src_encodings.expand(hyp_num, src_encodings.size(1), src_encodings.size(2)) exp_src_encodings_att_linear = src_encodings_att_linear.expand( hyp_num, src_encodings_att_linear.size(1), src_encodings_att_linear.size(2)) y_tm1 = self.vocab.tgt.to_input_tensor_char(list( [hyp[-1]] for hyp in hypotheses), device=self.device) y_t_embed = self.model_embeddings_target(y_tm1) y_t_embed = torch.squeeze(y_t_embed, dim=0) x = torch.cat([y_t_embed, att_tm1], dim=-1) (h_t, cell_t), att_t, _ = self.step(x, h_tm1, exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None) # log probabilities over target words log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1) live_hyp_num = beam_size - len(completed_hypotheses) contiuating_hyp_scores = ( hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1) top_cand_hyp_scores, top_cand_hyp_pos = torch.topk( contiuating_hyp_scores, k=live_hyp_num) prev_hyp_ids = top_cand_hyp_pos / len(self.vocab.tgt) hyp_word_ids = 
top_cand_hyp_pos % len(self.vocab.tgt) new_hypotheses = [] live_hyp_ids = [] new_hyp_scores = [] decoderStatesForUNKsHere = [] for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip( prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores): prev_hyp_id = prev_hyp_id.item() hyp_word_id = hyp_word_id.item() cand_new_hyp_score = cand_new_hyp_score.item() hyp_word = self.vocab.tgt.id2word[hyp_word_id] # Record output layer in case UNK was generated if hyp_word == "<unk>": hyp_word = "<unk>" + str(len(decoderStatesForUNKsHere)) decoderStatesForUNKsHere.append(att_t[prev_hyp_id]) new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word] if hyp_word == '</s>': completed_hypotheses.append( Hypothesis(value=new_hyp_sent[1:-1], score=cand_new_hyp_score)) else: new_hypotheses.append(new_hyp_sent) live_hyp_ids.append(prev_hyp_id) new_hyp_scores.append(cand_new_hyp_score) if len(decoderStatesForUNKsHere ) > 0 and self.charDecoder is not None: # decode UNKs decoderStatesForUNKsHere = torch.stack( decoderStatesForUNKsHere, dim=0) decodedWords = self.charDecoder.decode_greedy( (decoderStatesForUNKsHere.unsqueeze(0), decoderStatesForUNKsHere.unsqueeze(0)), max_length=21, device=self.device) assert len(decodedWords) == decoderStatesForUNKsHere.size( )[0], "Incorrect number of decoded words" for hyp in new_hypotheses: if hyp[-1].startswith("<unk>"): hyp[-1] = decodedWords[int(hyp[-1][5:])] # [:-1] if len(completed_hypotheses) == beam_size: break live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device) h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids]) att_tm1 = att_t[live_hyp_ids] hypotheses = new_hypotheses hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device) if len(completed_hypotheses) == 0: completed_hypotheses.append( Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item())) completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True) return completed_hypotheses @property def device(self) -> torch.device: """ Determine which 
device to place the Tensors upon, CPU or GPU. """ return self.att_projection.weight.device @staticmethod def load(model_path: str, no_char_decoder=False): """ Load the model from a file. @param model_path (str): path to model """ params = torch.load(model_path, map_location=lambda storage, loc: storage) args = params['args'] model = NMT(vocab=params['vocab'], no_char_decoder=no_char_decoder, **args) model.load_state_dict(params['state_dict']) return model def save(self, path: str): """ Save the odel to a file. @param path (str): path to the model """ print('save model parameters to [%s]' % path, file=sys.stderr) params = { 'args': dict(word_embed_size=self.model_embeddings_source.word_embed_size, hidden_size=self.hidden_size, dropout_rate=self.dropout_rate), 'vocab': self.vocab, 'state_dict': self.state_dict() } torch.save(params, path)