def __init__(self, word_embed_size, hidden_size, vocab, dropout_rate=0.3, no_char_decoder=False):
    """ Init NMT Model.

    @param word_embed_size (int): Embedding size (dimensionality) of word
    @param hidden_size (int): Hidden Size (dimensionality)
    @param vocab (Vocab): Vocabulary object containing src and tgt languages
                          See vocab.py for documentation.
    @param dropout_rate (float): Dropout probability, for attention
    """
    super(NMT, self).__init__()
    self.model_embeddings_source = ModelEmbeddings(word_embed_size, vocab.src)
    self.model_embeddings_target = ModelEmbeddings(word_embed_size, vocab.tgt)

    self.hidden_size = hidden_size
    self.dropout_rate = dropout_rate
    self.vocab = vocab

    ### COPY OVER YOUR CODE FROM ASSIGNMENT 4
    # default values
    self.encoder = nn.LSTM(word_embed_size, self.hidden_size, bidirectional=True)
    self.decoder = nn.LSTMCell(word_embed_size + self.hidden_size, self.hidden_size)
    self.h_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
    self.c_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
    self.att_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
    self.combined_output_projection = nn.Linear(3 * self.hidden_size, self.hidden_size, bias=False)
    self.target_vocab_projection = nn.Linear(self.hidden_size, len(vocab.tgt), bias=False)
    self.dropout = nn.Dropout(p=self.dropout_rate)

    # For sanity check only, not relevant to implementation
    self.gen_sanity_check = False
    self.counter = 0
    ### END YOUR CODE FROM ASSIGNMENT 4

    if not no_char_decoder:
        self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt)
    else:
        self.charDecoder = None

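# A minimal shape-check sketch (not part of the assignment code; the toy sizes below are
# hypothetical) showing why h_projection/c_projection above map 2*hidden_size to hidden_size:
# the bidirectional encoder returns final states of shape (2, b, h), whose forward and
# backward halves are concatenated to (b, 2h) before the projection produces (b, h).
import torch
import torch.nn as nn

word_embed_size, hidden_size, batch_size, src_len = 8, 16, 4, 10
encoder = nn.LSTM(word_embed_size, hidden_size, bidirectional=True)
h_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)

x = torch.randn(src_len, batch_size, word_embed_size)
enc_hiddens, (last_hidden, last_cell) = encoder(x)
print(enc_hiddens.shape)          # (src_len, batch_size, 2 * hidden_size)
print(last_hidden.shape)          # (2, batch_size, hidden_size): one slice per direction
init_decoder_hidden = h_projection(torch.cat((last_hidden[0], last_hidden[1]), dim=1))
print(init_decoder_hidden.shape)  # (batch_size, hidden_size)
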
def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2, no_char_decoder=False):
    """ Init NMT Model.

    @param embed_size (int): Embedding size (dimensionality)
    @param hidden_size (int): Hidden Size (dimensionality)
    @param vocab (Vocab): Vocabulary object containing src and tgt languages
                          See vocab.py for documentation.
    @param dropout_rate (float): Dropout probability, for attention
    """
    super(NMT, self).__init__()
    self.model_embeddings_source = ModelEmbeddings(embed_size, vocab.src)
    self.model_embeddings_target = ModelEmbeddings(embed_size, vocab.tgt)

    self.hidden_size = hidden_size
    self.dropout_rate = dropout_rate
    self.vocab = vocab

    ### COPY OVER YOUR CODE FROM ASSIGNMENT 4
    ### END YOUR CODE FROM ASSIGNMENT 4

    if not no_char_decoder:
        self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt)
    else:
        self.charDecoder = None

def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2, no_char_decoder=False):
    """ Init NMT Model.

    @param embed_size (int): Embedding size (dimensionality)
    @param hidden_size (int): Hidden Size (dimensionality)
    @param vocab (Vocab): Vocabulary object containing src and tgt languages
                          See vocab.py for documentation.
    @param dropout_rate (float): Dropout probability, for attention
    """
    super(NMT, self).__init__()
    self.model_embeddings_source = ModelEmbeddings(embed_size, vocab.src)
    self.model_embeddings_target = ModelEmbeddings(embed_size, vocab.tgt)

    self.hidden_size = hidden_size
    self.dropout_rate = dropout_rate
    self.vocab = vocab

    ### COPY OVER YOUR CODE FROM ASSIGNMENT 4
    self.encoder = nn.LSTM(embed_size, hidden_size, bidirectional=True)
    self.decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size)
    self.h_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
    self.c_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
    self.att_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
    self.combined_output_projection = nn.Linear(3 * hidden_size, hidden_size, bias=False)
    self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False)
    self.dropout = nn.Dropout(dropout_rate)
    ### END YOUR CODE FROM ASSIGNMENT 4

    if not no_char_decoder:
        self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt)
    else:
        self.charDecoder = None

def __init__(self, word_embed_size, hidden_size, vocab, dropout_rate=0.3, no_char_decoder=False,
             with_contex=False, contex_LSTM=False, multi_encoder=False):
    """ Init NMT Model.

    @param word_embed_size (int): Embedding size (dimensionality) of word
    @param hidden_size (int): Hidden Size (dimensionality)
    @param vocab (Vocab): Vocabulary object containing src and tgt languages
                          See vocab.py for documentation.
    @param dropout_rate (float): Dropout probability, for attention
    """
    super(NMT, self).__init__()
    if with_contex and contex_LSTM:
        self.model_embeddings_source = ContexAwareEmbeddings_LSTM(word_embed_size, vocab.src)
        self.model_embeddings_target = ContexAwareEmbeddings_LSTM(word_embed_size, vocab.tgt)
    elif with_contex and not contex_LSTM:
        self.model_embeddings_source = ContexAwareEmbeddings(word_embed_size, vocab.src)
        self.model_embeddings_target = ContexAwareEmbeddings(word_embed_size, vocab.tgt)
    else:
        self.model_embeddings_source = ModelEmbeddings(word_embed_size, vocab.src)
        self.model_embeddings_target = ModelEmbeddings(word_embed_size, vocab.tgt)

    self.hidden_size = hidden_size
    self.dropout_rate = dropout_rate
    self.vocab = vocab
    self.multi_encoder = multi_encoder

    ### COPY OVER YOUR CODE FROM ASSIGNMENT 4
    self.encoder = nn.LSTM(word_embed_size, self.hidden_size, bias=True, bidirectional=True)
    if self.multi_encoder:
        self.project_1 = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.encoder_2 = nn.LSTM(hidden_size, self.hidden_size, bias=True, bidirectional=True)
        self.project_2 = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.connect_2 = SelectiveConnect(self.hidden_size)
        self.encoder_3 = nn.LSTM(hidden_size, self.hidden_size, bias=True, bidirectional=True)
        self.project_3 = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.connect_3 = SelectiveConnect(self.hidden_size)

    self.decoder = nn.LSTMCell(word_embed_size + self.hidden_size, self.hidden_size, bias=True)
    self.h_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
    self.c_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
    if self.multi_encoder:
        self.att_projection = nn.Linear(self.hidden_size, self.hidden_size, bias=False)
        self.combined_output_projection = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=False)
    else:
        self.att_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
        self.combined_output_projection = nn.Linear(self.hidden_size * 3, self.hidden_size, bias=False)
    self.target_vocab_projection = nn.Linear(self.hidden_size, len(self.vocab.tgt), bias=False)
    self.dropout = nn.Dropout(p=self.dropout_rate)
    ### END YOUR CODE FROM ASSIGNMENT 4

    if not no_char_decoder:
        self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt)
    else:
        self.charDecoder = None

def __init__(self, word_embed_size, hidden_size, vocab, dropout_rate=0.3, no_char_decoder=False):
    """ Init NMT Model.

    @param word_embed_size (int): Embedding size (dimensionality) of word
    @param hidden_size (int): Hidden Size (dimensionality)
    @param vocab (Vocab): Vocabulary object containing src and tgt languages
                          See vocab.py for documentation.
    @param dropout_rate (float): Dropout probability, for attention
    """
    super(NMT, self).__init__()
    self.model_embeddings_source = ModelEmbeddings(word_embed_size, vocab.src)
    self.model_embeddings_target = ModelEmbeddings(word_embed_size, vocab.tgt)

    self.hidden_size = hidden_size
    self.dropout_rate = dropout_rate
    self.vocab = vocab

    self.encoder = nn.LSTM(word_embed_size, self.hidden_size, 1, bias=True, bidirectional=True)
    # Note: this variant uses a single-layer unidirectional nn.LSTM as the decoder
    # (rather than an nn.LSTMCell as in the other variants).
    self.decoder = nn.LSTM((word_embed_size + self.hidden_size), self.hidden_size, 1,
                           bias=True, bidirectional=False)
    self.h_projection = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=False)
    self.c_projection = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=False)
    self.att_projection = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=False)
    self.combined_output_projection = nn.Linear(self.hidden_size * 3, self.hidden_size, bias=False)
    self.target_vocab_projection = nn.Linear(self.hidden_size, len(vocab.tgt), bias=False)
    self.dropout = nn.Dropout(self.dropout_rate)

    if not no_char_decoder:
        self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt)
    else:
        self.charDecoder = None

def __init__(self, word_embed_size, hidden_size, vocab, dropout_rate=0.3, no_char_decoder=False):
    """ Init NMT Model.

    @param word_embed_size (int): Embedding size (dimensionality) of word
    @param hidden_size (int): Hidden Size (dimensionality)
    @param vocab (Vocab): Vocabulary object containing src and tgt languages
                          See vocab.py for documentation.
    @param dropout_rate (float): Dropout probability, for attention
    """
    super(NMT, self).__init__()
    self.model_embeddings_source = ModelEmbeddings(word_embed_size, vocab.src)
    self.model_embeddings_target = ModelEmbeddings(word_embed_size, vocab.tgt)

    self.hidden_size = hidden_size
    self.dropout_rate = dropout_rate
    self.vocab = vocab

    ### COPY OVER YOUR CODE FROM ASSIGNMENT 4
    self.encoder = nn.LSTM(word_embed_size, self.hidden_size, bidirectional=True, bias=True)
    self.decoder = nn.LSTMCell(word_embed_size + self.hidden_size, self.hidden_size, bias=True)
    # Projects the concatenated final encoder hidden states
    self.h_projection = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=False)
    # Projects the concatenated final encoder cell states
    self.c_projection = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=False)
    self.att_projection = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=False)
    self.combined_output_projection = nn.Linear(self.hidden_size * 3, self.hidden_size, bias=False)
    # Projects the combined output directly to vocabulary logits, so no separate output embedding is needed
    self.target_vocab_projection = nn.Linear(self.hidden_size, len(self.vocab.tgt), bias=False)
    self.dropout = nn.Dropout(self.dropout_rate)
    ### END YOUR CODE FROM ASSIGNMENT 4

    if not no_char_decoder:
        self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt)
    else:
        self.charDecoder = None

def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2, no_char_decoder=False):
    """ Init NMT Model.

    @param embed_size (int): Embedding size (dimensionality)
    @param hidden_size (int): Hidden Size (dimensionality)
    @param vocab (Vocab): Vocabulary object containing src and tgt languages
                          See vocab.py for documentation.
    @param dropout_rate (float): Dropout probability, for attention
    """
    super(NMT, self).__init__()
    self.model_embeddings_source = ModelEmbeddings(embed_size, vocab.src)
    self.model_embeddings_target = ModelEmbeddings(embed_size, vocab.tgt)

    self.hidden_size = hidden_size
    self.dropout_rate = dropout_rate
    self.vocab = vocab

    ### COPY OVER YOUR CODE FROM ASSIGNMENT 4
    self.e_word = embed_size
    self.encoder = nn.LSTM(embed_size, hidden_size, bias=True, bidirectional=True)
    self.decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size, bias=True)
    # Projects the final encoder hidden state from R^{2h} down to R^h
    self.h_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False)
    self.c_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False)
    # (1 x 2h) h_enc_i * (2h x h) W * (h x 1) h_dec_t = (1 x 1) = e_{t,i}
    self.att_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False)
    # Applied after concatenating the attention output with h_dec
    self.combined_output_projection = nn.Linear(hidden_size * 3, hidden_size, bias=False)
    # Produces the logits for the final softmax over the target vocabulary
    self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False)
    self.dropout = nn.Dropout(self.dropout_rate)
    ### END YOUR CODE FROM ASSIGNMENT 4

    if not no_char_decoder:
        self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt)
    else:
        self.charDecoder = None

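# A brief sketch (hypothetical toy sizes) of the batched multiplicative attention score
# described in the dimension comment above: e_{t,i} = h_dec_t^T (W_attProj h_enc_i),
# computed for every source position at once with torch.bmm.
import torch
import torch.nn as nn

b, src_len, h = 4, 7, 16
att_projection = nn.Linear(2 * h, h, bias=False)
enc_hiddens = torch.randn(b, src_len, 2 * h)    # encoder states h_enc_i for all positions
dec_hidden = torch.randn(b, h)                  # decoder state h_dec_t at the current step

enc_hiddens_proj = att_projection(enc_hiddens)  # (b, src_len, h)
e_t = torch.bmm(enc_hiddens_proj, dec_hidden.unsqueeze(2)).squeeze(2)
print(e_t.shape)                                # (b, src_len): one attention score per source word
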
def test_greedy_decode():
    char_vocab = DummyVocab()

    decoder = CharDecoder(
        hidden_size=HIDDEN_SIZE,
        char_embedding_size=EMBED_SIZE,
        target_vocab=char_vocab)
    max_word_length = 21
    decoder.forward = mock_decoder_forward

    # Initial (hidden, cell) states, each of shape (1, BATCH_SIZE, HIDDEN_SIZE)
    initial_states = (
        torch.tensor([[[0] * HIDDEN_SIZE] * BATCH_SIZE]),
        torch.tensor([[[0] * HIDDEN_SIZE] * BATCH_SIZE]))

    result = decoder.decode_greedy(
        initialStates=initial_states,
        device=decoder.char_output_projection.weight.device,
        max_length=max_word_length)

    for decoded_word in result:
        assert decoded_word == "a" * (max_word_length - 1)

def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2, no_char_decoder=False):
    """ Init NMT Model.

    @param embed_size (int): Embedding size (dimensionality)
    @param hidden_size (int): Hidden Size (dimensionality)
    @param vocab (Vocab): Vocabulary object containing src and tgt languages
                          See vocab.py for documentation.
    @param dropout_rate (float): Dropout probability, for attention
    """
    super(NMT, self).__init__()
    self.model_embeddings_source = ModelEmbeddings(embed_size, vocab.src)
    self.model_embeddings_target = ModelEmbeddings(embed_size, vocab.tgt)

    self.embed_size = embed_size
    self.hidden_size = hidden_size
    self.dropout_rate = dropout_rate
    self.vocab = vocab

    # Initialize the encoder (bidirectional LSTM with bias)
    self.encoder = nn.LSTM(input_size=self.embed_size, hidden_size=self.hidden_size,
                           bidirectional=True, bias=True)

    # Initialize the decoder (LSTM cell with bias);
    # its input is the word embedding concatenated with the attention output from the previous step
    self.decoder = nn.LSTMCell(input_size=self.embed_size + self.hidden_size,
                               hidden_size=self.hidden_size, bias=True)

    # Initialize the linear layers for the hidden/cell state projections
    self.h_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
    self.c_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)

    # Initialize the attention projection layer,
    # used to compute the (multiplicative) attention scores
    self.att_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)

    # Initialize the output projection layer,
    # which combines the attention output with the decoder hidden state
    self.combined_output_projection = nn.Linear(3 * self.hidden_size, self.hidden_size, bias=False)

    # Initialize the target vocabulary projection layer
    self.target_vocab_projection = nn.Linear(self.hidden_size, len(self.vocab.tgt), bias=False)

    # Initialize dropout
    self.dropout = nn.Dropout(p=self.dropout_rate, inplace=False)

    if not no_char_decoder:
        self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt)
    else:
        self.charDecoder = None

def main(): """ Main func. """ args = docopt(__doc__) # Check Python & PyTorch Versions assert (sys.version_info >= (3, 5)), "Please update your installation of Python to version >= 3.5" assert ( torch.__version__ >= "1.0.0"), "Please update your installation of PyTorch. You have {} and you should have version 1.0.0 or greater".format( torch.__version__) # Seed the Random Number Generators seed = 1234 torch.manual_seed(seed) torch.cuda.manual_seed(seed) np.random.seed(seed * 13 // 7) vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json') # Create NMT Model model = NMT( embed_size=EMBED_SIZE, hidden_size=HIDDEN_SIZE, dropout_rate=DROPOUT_RATE, vocab=vocab) char_vocab = DummyVocab() # Initialize CharDecoder decoder = CharDecoder( hidden_size=HIDDEN_SIZE, char_embedding_size=EMBED_SIZE, target_vocab=char_vocab) if args['1a']: question_1a_sanity_check() elif args['1b']: question_1b_sanity_check() elif args['1c']: question_1c_sanity_check() elif args['1d']: question_1d_sanity_check() elif args['1e']: question_1e_sanity_check() elif args['1f']: question_1f_sanity_check(model) elif args['2a']: question_2a_sanity_check(decoder, char_vocab) elif args['2b']: question_2b_sanity_check(decoder, char_vocab) elif args['2c']: question_2c_sanity_check(decoder) elif args['2c2']: question_2c2_sanity_check(decoder) elif args['2d']: question_2d_sanity_check(decoder) else: raise RuntimeError('invalid run mode')
def main(): """ Main func. """ # args = docopt(__doc__) char_emb_size = 50 # Check Python & PyTorch Versions assert (sys.version_info >= (3, 5)), "Please update your installation of Python to version >= 3.5" assert ( torch.__version__ >= "1.0.0" ), "Please update your installation of PyTorch. You have {} and you should have version 1.0.0".format( torch.__version__) # Seed the Random Number Generators seed = 1234 torch.manual_seed(seed) torch.cuda.manual_seed(seed) np.random.seed(seed * 13 // 7) vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json') # cnn layer cnn = CNN(char_emb_size, EMBED_SIZE, 5) # Create NMT Model model = NMT(word_embed_size=EMBED_SIZE, hidden_size=HIDDEN_SIZE, dropout_rate=DROPOUT_RATE, vocab=vocab) # highway layer hiway_layer = Highway(EMBED_SIZE) char_vocab = DummyVocab() # Initialize CharDecoder decoder = CharDecoder(hidden_size=HIDDEN_SIZE, char_embedding_size=EMBED_SIZE, target_vocab=char_vocab) # if args['1e']: # question_1e_sanity_check() # elif args['1f']: # question_1f_sanity_check(hiway_layer) # elif args['1g']: # question_1g_sanity_check(cnn) # elif args['1h']: # question_1h_sanity_check(model) # elif args['2a']: # question_2a_sanity_check(decoder, char_vocab) # elif args['2b']: # question_2b_sanity_check(decoder) # elif args['2c']: question_2c_sanity_check(decoder)
def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2, no_char_decoder=False):
    """ Initialize the NMT Model.

    :param int embed_size: Embedding size (dimensionality)
    :param int hidden_size: Hidden Size (dimensionality)
    :param Vocab vocab: Vocabulary object containing src and tgt languages
                        See vocab.py for documentation.
    :param float dropout_rate: Dropout probability, for the attention combination layer
    """
    super(NMT, self).__init__()
    self.model_embeddings_source = ModelEmbeddings(embed_size, vocab.src)
    self.model_embeddings_target = ModelEmbeddings(embed_size, vocab.tgt)

    self.hidden_size = hidden_size
    self.dropout_rate = dropout_rate
    self.vocab = vocab

    self.encoder = nn.LSTM(embed_size, hidden_size, bidirectional=True)
    self.decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size)
    # Takes the transpose of [h_enc(1)(<-) ; h_enc(m)(->)]; the output is 1 x h
    self.h_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
    # Takes the transpose of [c_enc(1)(<-) ; c_enc(m)(->)]; the output is 1 x h
    self.c_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
    self.att_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
    # Takes the transpose of u(t); the output is 1 x h, i.e. v(t)
    self.combined_output_projection = nn.Linear(3 * hidden_size, hidden_size, bias=False)
    # Takes the transpose of o(t); the output is 1 x |V_tgt|
    self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False)
    self.dropout = nn.Dropout(dropout_rate)

    if not no_char_decoder:
        self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt)
    else:
        self.charDecoder = None

def main(): """ Main func """ args = docopt(__doc__) # Check Python & PyTorch Versions assert (sys.version_info >= (3, 5)), "Please update your installation of Python to version >= 3.5" assert ( torch.__version__ == "1.1.0" ), "Please update your installation of PyTorch. You have {} and you should have version 1.0.0".format( torch.__version__) # Seed the Random Number Generators seed = 1234 torch.manual_seed(seed) torch.cuda.manual_seed(seed) np.random.seed(seed * 13 // 7) vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json') # Create NMT Model model = NMT(embed_size=EMBED_SIZE, hidden_size=HIDDEN_SIZE, dropout_rate=DROPOUT_RATE, vocab=vocab) char_vocab = DummyVocab() # Initialize CharDecoder decoder = CharDecoder(hidden_size=HIDDEN_SIZE, char_embedding_size=EMBED_SIZE, target_vocab=char_vocab) # initialize highway highway_model = Highway(embed_size_word=EMBED_SIZE_WORD, dropout_rate=DROPOUT_RATE) # initialize cnn cnn_model = CNN(EMBED_SIZE, MAX_WORD_LEN, EMBED_SIZE_WORD, 5) if args['hw']: highway_sanity_check(highway_model) elif args['generate_data']: generate_highway_data() elif args['gen_cnn_data']: generate_cnn_data() elif args['cnn']: cnn_sanity_check(cnn_model) else: raise RuntimeError('invalid run mode')
def test_char_encoder():
    print("===" * 30)
    print("\nCharDecoder/decode_greedy test")

    char_vocab = DummyVocab()
    HIDDEN_SIZE = 6
    EMBED_SIZE = 3
    BATCH_SIZE = 4

    h0 = torch.randn(1, BATCH_SIZE, HIDDEN_SIZE, dtype=torch.float)
    c0 = torch.randn(1, BATCH_SIZE, HIDDEN_SIZE, dtype=torch.float)

    decoder = CharDecoder(
        hidden_size=HIDDEN_SIZE,
        char_embedding_size=EMBED_SIZE,
        target_vocab=char_vocab)

    start_time = time.time()
    decodedWords = decoder.decode_greedy((h0, c0), device)
    print("\n--- decode_greedy takes %s seconds ---" % (time.time() - start_time))
    print("\n\ndecodedWords:\n{}\n".format(decodedWords))

    print("\nCharDecoder/decode_greedy Test Passed!\n")
    print("===" * 30)

def setUp(cls):
    # Initialize CharDecoder
    cls.decoder = CharDecoder(
        hidden_size=HIDDEN_SIZE,
        char_embedding_size=EMBED_SIZE,
        target_vocab=char_vocab)

    cls.vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json')

    # Create NMT Model
    cls.model = NMT(
        embed_size=EMBED_SIZE,
        hidden_size=HIDDEN_SIZE,
        dropout_rate=DROPOUT_RATE,
        vocab=cls.vocab)

    cls.char_vocab = DummyVocab()

def main(): """ Main func. """ args = docopt(__doc__) seed = 1234 torch.manual_seed(seed) torch.cuda.manual_seed(seed) np.random.seed(seed * 13 // 7) vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json') # Create NMT Model model = NMT(embed_size=EMBED_SIZE, hidden_size=HIDDEN_SIZE, dropout_rate=DROPOUT_RATE, vocab=vocab) char_vocab = DummyVocab() # Initialize Highway highway = Highway(word_embed_size=EMBED_SIZE, dropout_rate=DROPOUT_RATE) # Initialize CharDecoder decoder = CharDecoder(hidden_size=HIDDEN_SIZE, char_embedding_size=EMBED_SIZE, target_vocab=char_vocab) cnn = CNN(char_embed_size=EMBED_SIZE, num_filters=NUM_FILTER, max_word_length=MAX_WORD_LEN, kernel_size=KERNEl_SIZE) if args['highway']: question_1h_sanity_check(highway) elif args['cnn']: question_1g_sanity_check(cnn) elif args['generate']: question_1h_generate_data() question_1g_generate_data() else: raise RuntimeError('invalid run mode')
def main(): """ Main func. """ # args = docopt(__doc__) # Check Python & PyTorch Versions assert (sys.version_info >= (3, 5)), "Please update your installation of Python to version >= 3.5" assert ( torch.__version__ == "1.0.0" ), "Please update your installation of PyTorch. You have {} and you should have version 1.0.0".format( torch.__version__) # Seed the Random Number Generators seed = 1234 torch.manual_seed(seed) torch.cuda.manual_seed(seed) np.random.seed(seed * 13 // 7) vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json') # Create NMT Model model = NMT(embed_size=EMBED_SIZE, hidden_size=HIDDEN_SIZE, dropout_rate=DROPOUT_RATE, vocab=vocab) char_vocab = DummyVocab() # Initialize CharDecoder decoder = CharDecoder(hidden_size=HIDDEN_SIZE, char_embedding_size=EMBED_SIZE, target_vocab=char_vocab) # question_1f_sanity_check() question_2d_sanity_check(decoder)
class NMT(nn.Module): """ Simple Neural Machine Translation Model: - Bidrectional LSTM Encoder - Unidirection LSTM Decoder - Global Attention Model (Luong, et al. 2015) """ def __init__( self, embed_size, hidden_size, vocab, weights, no_char_decoder=False, dropout_rate=0.2, ): """ Init NMT Model. @param embed_size (int): Embedding size (dimensionality) @param hidden_size (int): Hidden Size (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention """ super(NMT, self).__init__() self.model_word_embeddings = ModelWordEmbeddings( embed_size, vocab, weights) self.model_char_embeddings_source = ModelCharEmbeddings(50, vocab.src) self.model_char_embeddings_target = ModelCharEmbeddings(50, vocab.src) # we set the embed_size = 2 * embed_size self.d = embed_size + 50 # hidden_size = embed_size + 50 self.highway = Highway(self.d) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab # default values self.encoder = None self.decoder = None self.h_projection = None self.c_projection = None self.att_projection = None self.combined_output_projection = None self.target_vocab_projection = None self.dropout = None ### YOUR CODE HERE (~8 Lines) ### TODO - Initialize the following variables: ### self.encoder (Bidirectional LSTM with bias) ### self.decoder (LSTM Cell with bias) ### self.h_projection (Linear Layer with no bias), called W_{h} in the PDF. ### self.c_projection (Linear Layer with no bias), called W_{c} in the PDF. ### self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF. ### self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF. ### self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF. ### self.dropout (Dropout Layer) ### ### Use the following docs to properly initialize these variables: ### LSTM: ### https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM ### LSTM Cell: ### https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell ### Linear Layer: ### https://pytorch.org/docs/stable/nn.html#torch.nn.Linear ### Dropout Layer: ### https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout # print(embed_size + hidden_size) self.encoder = nn.LSTM(self.d, hidden_size, bidirectional=True, bias=True) #changed bias=True self.decoder = nn.LSTMCell((50 + hidden_size), hidden_size, bias=True) self.h_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False) self.c_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False) self.att_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False) self.combined_output_projection = nn.Linear(3 * hidden_size, hidden_size, bias=False) self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.src), bias=False) self.dropout = nn.Dropout(p=self.dropout_rate) if not no_char_decoder: self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.src) else: self.charDecoder = None ### END YOUR CODE def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor: """ Take a mini-batch of source and target sentences, compute the log-likelihood of target sentences under the language models learned by the NMT system. 
@param source (List[List[str]]): list of source sentence tokens @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>` @returns scores (Tensor): a variable/tensor of shape (b, ) representing the log-likelihood of generating the gold-standard target sentence for each example in the input batch. Here b = batch size. """ # Compute sentence lengths source_lengths = [len(s) for s in source] # Convert list of lists into tensors source_padded = self.vocab.src.to_input_tensor( source, device=self.device) # Tensor: (src_len, b) target_padded = self.vocab.src.to_input_tensor( target, device=self.device) # Tensor: (tgt_len, b) source_padded_chars = self.vocab.src.to_input_tensor_char( source, device=self.device) # Tensor: (src_len, b) target_padded_chars = self.vocab.src.to_input_tensor_char( target, device=self.device) # Tensor: (tgt_len, b) char_embs = self.model_char_embeddings_source(source_padded_chars) word_embs = self.model_word_embeddings.source(source_padded) embed = self.contextual_embedding_layer(word_embs, char_embs) # print(embed.shape) ### Run the network forward: ### 1. Apply the encoder to `source_padded` by calling `self.encode()` ### 2. Generate sentence masks for `source_padded` by calling `self.generate_sent_masks()` ### 3. Apply the decoder to compute combined-output by calling `self.decode()` ### 4. Compute log probability distribution over the target vocabulary using the ### combined_outputs returned by the `self.decode()` function. enc_hiddens, dec_init_state = self.encode(embed, source_lengths) # print(enc_hiddens.shape) enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths) combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded_chars) # logits =self.target_vocab_projection(combined_outputs) # loss = nn.CrossEntropyLoss(reduction='sum',ignore_index=self.vocab.tgt['<pad>']) # xentropy_loss = loss(logits.permute(0,2,1),target_padded[1:]) # return xentropy_loss P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1) # Zero out, probabilities for which we have nothing in the target text target_masks = (target_padded != self.vocab.src['<pad>']).float() # Compute log probability of generating true target words target_gold_words_log_prob = torch.gather( P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:] scores = target_gold_words_log_prob.sum( ) # mhahn2 Small modification from A4 code. if self.charDecoder is not None: max_word_len = target_padded_chars.shape[-1] target_words = target_padded[1:].contiguous().view(-1) target_chars = target_padded_chars[1:].view(-1, max_word_len) target_outputs = combined_outputs.view(-1, 256) target_chars_oov = target_chars #torch.index_select(target_chars, dim=0, index=oovIndices) rnn_states_oov = target_outputs #torch.index_select(target_outputs, dim=0, index=oovIndices) oovs_losses = self.charDecoder.train_forward( target_chars_oov.t(), (rnn_states_oov.unsqueeze(0), rnn_states_oov.unsqueeze(0))) scores = scores - oovs_losses return scores def contextual_embedding_layer( self, p_word_source: torch.Tensor, p_char_source: torch.Tensor) -> torch.Tensor: #p_word_source # Highway Networks for 1. and 2. 
embd = torch.cat((p_word_source, p_char_source), -1) # (batch, max_sent_len, embd_size*2) embd = self.highway(embd) # (batch, max_sent_len, embd_size*2) return embd def encode( self, embeds: torch.Tensor, source_lengths: List[int] ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """ Apply the encoder to source sentences to obtain encoder hidden states. Additionally, take the final states of the encoder and project them to obtain initial states for decoder. @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where b = batch_size, src_len = maximum source sentence length. Note that these have already been sorted in order of longest to shortest sentence. @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial hidden state and cell. """ enc_hiddens, dec_init_state = None, None ### YOUR CODE HERE (~ 8 Lines) ### TODO: ### 1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings. ### src_len = maximum source sentence length, b = batch size, e = embedding size. Note ### that there is no initial hidden state or cell for the decoder. ### 2. Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`. ### - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X. ### - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens. ### - Note that the shape of the tensor returned by the encoder is (src_len, b, h*2) and we want to ### return a tensor of shape (b, src_len, h*2) as `enc_hiddens`. ### 3. Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell): ### - `init_decoder_hidden`: ### `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards. ### Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h). ### Apply the h_projection layer to this in order to compute init_decoder_hidden. ### This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size ### - `init_decoder_cell`: ### `last_cell` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards. ### Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h). ### Apply the c_projection layer to this in order to compute init_decoder_cell. ### This is c_0^{dec} in the PDF. 
Here b = batch size, h = hidden size ### ### See the following docs, as you may need to use some of the following functions in your implementation: ### Pack the padded sequence X before passing to the encoder: ### https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pack_padded_sequence ### Pad the packed sequence, enc_hiddens, returned by the encoder: ### https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pad_packed_sequence ### Tensor Concatenation: ### https://pytorch.org/docs/stable/torch.html#torch.cat ### Tensor Permute: ### https://pytorch.org/docs/stable/tensors.html#torch.Tensor.permute pack_padded_X = pack_padded_sequence(embeds, source_lengths) packed_enc_hiddens, (last_hidden, last_cell) = self.encoder(pack_padded_X) enc_hiddens, _ = pad_packed_sequence(packed_enc_hiddens) enc_hiddens = enc_hiddens.permute(1, 0, 2) init_decoder_hidden = self.h_projection( torch.cat((last_hidden[0], last_hidden[1]), 1)) init_decoder_cell = self.c_projection( torch.cat((last_cell[0], last_cell[1]), 1)) dec_init_state = (init_decoder_hidden, init_decoder_cell) ### END YOUR CODE return enc_hiddens, dec_init_state def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor, dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor: """Compute combined output vectors for a batch. @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where b = batch size, src_len = maximum source sentence length. @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where tgt_len = maximum target sentence length, b = batch size. @returns combined_outputs (Tensor): combined output tensor (tgt_len, b, h), where tgt_len = maximum target sentence length, b = batch_size, h = hidden size """ # Chop of the <END> token for max length sentences. target_padded = target_padded[:-1] # Initialize the decoder state (hidden and cell) dec_state = dec_init_state # Initialize previous combined output vector o_{t-1} as zero batch_size = enc_hiddens.size(0) o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device) # Initialize a list we will use to collect the combined output o_t on each step combined_outputs = [] ### YOUR CODE HERE (~9 Lines) ### TODO: ### 1. Apply the attention projection layer to `enc_hiddens` to obtain `enc_hiddens_proj`, ### which should be shape (b, src_len, h), ### where b = batch size, src_len = maximum source length, h = hidden size. ### This is applying W_{attProj} to h^enc, as described in the PDF. ### 2. Construct tensor `Y` of target sentences with shape (tgt_len, b, e) using the target model embeddings. ### where tgt_len = maximum target sentence length, b = batch size, e = embedding size. ### 3. Use the torch.split function to iterate over the time dimension of Y. ### Within the loop, this will give you Y_t of shape (1, b, e) where b = batch size, e = embedding size. ### - Squeeze Y_t into a tensor of dimension (b, e). ### - Construct Ybar_t by concatenating Y_t with o_prev. ### - Use the step function to compute the the Decoder's next (cell, state) values ### as well as the new combined output o_t. ### - Append o_t to combined_outputs ### - Update o_prev to the new o_t. ### 4. 
Use torch.stack to convert combined_outputs from a list length tgt_len of ### tensors shape (b, h), to a single tensor shape (tgt_len, b, h) ### where tgt_len = maximum target sentence length, b = batch size, h = hidden size. ### ### Note: ### - When using the squeeze() function make sure to specify the dimension you want to squeeze ### over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1. ### ### Use the following docs to implement this functionality: ### Zeros Tensor: ### https://pytorch.org/docs/stable/torch.html#torch.zeros ### Tensor Splitting (iteration): ### https://pytorch.org/docs/stable/torch.html#torch.split ### Tensor Dimension Squeezing: ### https://pytorch.org/docs/stable/torch.html#torch.squeeze ### Tensor Concatenation: ### https://pytorch.org/docs/stable/torch.html#torch.cat ### Tensor Stacking: ### https://pytorch.org/docs/stable/torch.html#torch.stack enc_hiddens_proj = self.att_projection(enc_hiddens) Y = self.model_char_embeddings_source( target_padded) #cause source_padded already a tensor. #major error for Y_t in torch.split(Y, 1): Y_t = torch.squeeze(Y_t) # post concat it should be (b,h+e) Ybar_t = torch.cat((Y_t, o_prev), 1) dec_state, o_t, _ = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks) combined_outputs.append(o_t) o_prev = o_t combined_outputs = torch.stack(combined_outputs, 0) ### END YOUR CODE return combined_outputs def step( self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor, torch.Tensor], enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor, enc_masks: torch.Tensor ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]: """ Compute one forward step of the LSTM decoder, including the attention computation. @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder, where b = batch size, e = embedding size, h = hidden size. @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size. First tensor is decoder's prev hidden state, second tensor is decoder's prev cell. @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size, src_len = maximum source length, h = hidden size. @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h), where b = batch size, src_len = maximum source length, h = hidden size. @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len), where b = batch size, src_len is maximum source length. @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size. First tensor is decoder's new hidden state, second tensor is decoder's new cell. @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size. @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution. Note: You will not use this outside of this function. We are simply returning this value so that we can sanity check your implementation. """ combined_output = None ### YOUR CODE HERE (~3 Lines) ### TODO: ### 1. Apply the decoder to `Ybar_t` and `dec_state`to obtain the new dec_state. ### 2. Split dec_state into its two parts (dec_hidden, dec_cell) ### 3. Compute the attention scores e_t, a Tensor shape (b, src_len). ### Note: b = batch_size, src_len = maximum source length, h = hidden size. 
### ### Hints: ### - dec_hidden is shape (b, h) and corresponds to h^dec_t in the PDF (batched) ### - enc_hiddens_proj is shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched). ### - Use batched matrix multiplication (torch.bmm) to compute e_t. ### - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing. ### - When using the squeeze() function make sure to specify the dimension you want to squeeze ### over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1. ### ### Use the following docs to implement this functionality: ### Batch Multiplication: ### https://pytorch.org/docs/stable/torch.html#torch.bmm ### Tensor Unsqueeze: ### https://pytorch.org/docs/stable/torch.html#torch.unsqueeze ### Tensor Squeeze: ### https://pytorch.org/docs/stable/torch.html#torch.squeeze dec_state = self.decoder(Ybar_t, dec_state) (dec_hidden, dec_cell) = dec_state e_t = torch.squeeze( torch.bmm(enc_hiddens_proj, torch.unsqueeze(dec_hidden, 2)), 2) ### END YOUR CODE # Set e_t to -inf where enc_masks has 1 if enc_masks is not None: e_t.data.masked_fill_(enc_masks.byte(), -float('inf')) ### YOUR CODE HERE (~6 Lines) ### TODO: ### 1. Apply softmax to e_t to yield alpha_t ### 2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the ### attention output vector, a_t. #$$ Hints: ### - alpha_t is shape (b, src_len) ### - enc_hiddens is shape (b, src_len, 2h) ### - a_t should be shape (b, 2h) ### - You will need to do some squeezing and unsqueezing. ### Note: b = batch size, src_len = maximum source length, h = hidden size. ### ### 3. Concatenate dec_hidden with a_t to compute tensor U_t ### 4. Apply the combined output projection layer to U_t to compute tensor V_t ### 5. Compute tensor O_t by first applying the Tanh function and then the dropout layer. ### ### Use the following docs to implement this functionality: ### Softmax: ### https://pytorch.org/docs/stable/nn.html#torch.nn.functional.softmax ### Batch Multiplication: ### https://pytorch.org/docs/stable/torch.html#torch.bmm ### Tensor View: ### https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view ### Tensor Concatenation: ### https://pytorch.org/docs/stable/torch.html#torch.cat ### Tanh: ### https://pytorch.org/docs/stable/torch.html#torch.tanh alpha_t = nn.functional.softmax(e_t, 1) a_t = torch.squeeze( torch.bmm(torch.unsqueeze(alpha_t, 1), enc_hiddens), 1) U_t = torch.cat((dec_hidden, a_t), 1) V_t = self.combined_output_projection(U_t) O_t = self.dropout(torch.tanh(V_t)) ### END YOUR CODE combined_output = O_t return dec_state, combined_output, e_t def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor: """ Generate sentence masks for encoder hidden states. @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size, src_len = max source length, h = hidden size. @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch. @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len), where src_len = max source length, h = hidden size. 
""" enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float) for e_id, src_len in enumerate(source_lengths): enc_masks[e_id, src_len:] = 1 return enc_masks.to(self.device) def beam_search(self, src_sent: List[str], beam_size: int = 5, max_decoding_time_step: int = 70) -> List[Hypothesis]: """ Given a single source sentence, perform beam search, yielding translations in the target language. @param src_sent (List[str]): a single source sentence (words) @param beam_size (int): beam size @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields: value: List[str]: the decoded target sentence, represented as a list of words score: float: the log-likelihood of the target sentence """ ## A4 code # src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device) ## End A4 code src_char_sents_var = self.vocab.src.to_input_tensor_char([src_sent], self.device) src_word_sents_var = self.vocab.src.to_input_tensor( [src_sent], device=self.device) # Tensor: (src_len, b) char_embs = self.model_char_embeddings_source(src_char_sents_var) word_embs = self.model_word_embeddings.source(src_word_sents_var) embd = self.contextual_embedding_layer(word_embs, char_embs) src_encodings, dec_init_vec = self.encode(embd, [len(src_sent)]) src_encodings_att_linear = self.att_projection(src_encodings) h_tm1 = dec_init_vec att_tm1 = torch.zeros(1, self.hidden_size, device=self.device) eos_id = self.vocab.tgt['</s>'] hypotheses = [['<s>']] hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device) completed_hypotheses = [] t = 0 while len(completed_hypotheses ) < beam_size and t < max_decoding_time_step: t += 1 hyp_num = len(hypotheses) exp_src_encodings = src_encodings.expand(hyp_num, src_encodings.size(1), src_encodings.size(2)) exp_src_encodings_att_linear = src_encodings_att_linear.expand( hyp_num, src_encodings_att_linear.size(1), src_encodings_att_linear.size(2)) ## A4 code # y_tm1 = self.vocab.tgt.to_input_tensor(list([hyp[-1]] for hyp in hypotheses), device=self.device) # y_t_embed = self.model_embeddings_target(y_tm1) ## End A4 code y_tm1 = self.vocab.src.to_input_tensor_char(list( [hyp[-1]] for hyp in hypotheses), device=self.device) y_t_embed = self.model_char_embeddings_target(y_tm1) y_t_embed = torch.squeeze(y_t_embed, dim=0) x = torch.cat([y_t_embed, att_tm1], dim=-1) (h_t, cell_t), att_t, _ = self.step(x, h_tm1, exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None) # log probabilities over target words log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1) live_hyp_num = beam_size - len(completed_hypotheses) contiuating_hyp_scores = ( hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1) top_cand_hyp_scores, top_cand_hyp_pos = torch.topk( contiuating_hyp_scores, k=live_hyp_num) prev_hyp_ids = top_cand_hyp_pos / len(self.vocab.tgt) hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt) new_hypotheses = [] live_hyp_ids = [] new_hyp_scores = [] decoderStatesForUNKsHere = [] for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip( prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores): prev_hyp_id = prev_hyp_id.item() hyp_word_id = hyp_word_id.item() cand_new_hyp_score = cand_new_hyp_score.item() hyp_word = self.vocab.tgt.id2word[hyp_word_id] # Record output layer in case UNK was generated if hyp_word == "<unk>": hyp_word = "<unk>" + str(len(decoderStatesForUNKsHere)) 
decoderStatesForUNKsHere.append(att_t[prev_hyp_id]) new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word] if hyp_word == '</s>': completed_hypotheses.append( Hypothesis(value=new_hyp_sent[1:-1], score=cand_new_hyp_score)) else: new_hypotheses.append(new_hyp_sent) live_hyp_ids.append(prev_hyp_id) new_hyp_scores.append(cand_new_hyp_score) if len(decoderStatesForUNKsHere ) > 0 and self.charDecoder is not None: # decode UNKs decoderStatesForUNKsHere = torch.stack( decoderStatesForUNKsHere, dim=0) decodedWords = self.charDecoder.decode_greedy( (decoderStatesForUNKsHere.unsqueeze(0), decoderStatesForUNKsHere.unsqueeze(0)), max_length=21, device=self.device) assert len(decodedWords) == decoderStatesForUNKsHere.size( )[0], "Incorrect number of decoded words" for hyp in new_hypotheses: if hyp[-1].startswith("<unk>"): hyp[-1] = decodedWords[int(hyp[-1][5:])] #[:-1] if len(completed_hypotheses) == beam_size: break live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device) h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids]) att_tm1 = att_t[live_hyp_ids] hypotheses = new_hypotheses hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device) if len(completed_hypotheses) == 0: completed_hypotheses.append( Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item())) completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True) return completed_hypotheses def beam_search_incorrect( self, src_sent: List[str], beam_size: int = 5, max_decoding_time_step: int = 70) -> List[Hypothesis]: """ Given a single source sentence, perform beam search, yielding translations in the target language. @param src_sent (List[str]): a single source sentence (words) @param beam_size (int): beam size @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields: value: List[str]: the decoded target sentence, represented as a list of words score: float: the log-likelihood of the target sentence """ # Convert list of lists into tensors src_char_sents_var = self.vocab.src.to_input_tensor_char([src_sent], self.device) src_word_sents_var = self.vocab.src.to_input_tensor( [src_sent], device=self.device) # Tensor: (src_len, b) char_embs = self.model_char_embeddings_source(src_char_sents_var) word_embs = self.model_word_embeddings.source(src_word_sents_var) embd = self.contextual_embedding_layer(word_embs, char_embs) src_encodings, dec_init_vec = self.encode(embd, [len(src_sent)]) src_encodings_att_linear = self.att_projection(src_encodings) h_tm1 = dec_init_vec att_tm1 = torch.zeros(1, self.hidden_size, device=self.device) eos_id = self.vocab.src['</s>'] hypotheses = [['<s>']] hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device) completed_hypotheses = [] t = 0 while len(completed_hypotheses ) < beam_size and t < max_decoding_time_step: t += 1 hyp_num = len(hypotheses) exp_src_encodings = src_encodings.expand(hyp_num, src_encodings.size(1), src_encodings.size(2)) exp_src_encodings_att_linear = src_encodings_att_linear.expand( hyp_num, src_encodings_att_linear.size(1), src_encodings_att_linear.size(2)) # y_tm1 = torch.tensor([self.vocab.src[hyp[-1]] for hyp in hypotheses], dtype=torch.long, device=self.device) # y_t_embed = self.model_word_embeddings.source(y_tm1) y_tm1 = self.vocab.src.to_input_tensor_char(list( [hyp[-1]] for hyp in hypotheses), device=self.device) y_t_embed = self.model_char_embeddings_target(y_tm1) y_t_embed = 
torch.squeeze(y_t_embed, dim=0) x = torch.cat([y_t_embed, att_tm1], dim=-1) (h_t, cell_t), att_t, _ = self.step(x, h_tm1, exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None) # log probabilities over target words log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1) live_hyp_num = beam_size - len(completed_hypotheses) contiuating_hyp_scores = ( hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1) top_cand_hyp_scores, top_cand_hyp_pos = torch.topk( contiuating_hyp_scores, k=live_hyp_num) prev_hyp_ids = top_cand_hyp_pos / len(self.vocab.src) hyp_word_ids = top_cand_hyp_pos % len(self.vocab.src) new_hypotheses = [] live_hyp_ids = [] new_hyp_scores = [] for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip( prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores): prev_hyp_id = prev_hyp_id.item() hyp_word_id = hyp_word_id.item() cand_new_hyp_score = cand_new_hyp_score.item() hyp_word = self.vocab.src.id2word[hyp_word_id] new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word] if hyp_word == '</s>': completed_hypotheses.append( Hypothesis(value=new_hyp_sent[1:-1], score=cand_new_hyp_score)) else: new_hypotheses.append(new_hyp_sent) live_hyp_ids.append(prev_hyp_id) new_hyp_scores.append(cand_new_hyp_score) if len(completed_hypotheses) == beam_size: break live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device) h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids]) att_tm1 = att_t[live_hyp_ids] hypotheses = new_hypotheses hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device) if len(completed_hypotheses) == 0: completed_hypotheses.append( Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item())) completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True) return completed_hypotheses @property def device(self) -> torch.device: """ Determine which device to place the Tensors upon, CPU or GPU. """ return self.model_word_embeddings.source.weight.device @staticmethod def load(model_path: str, no_char_decoder=False): """ Load the model from a file. @param model_path (str): path to model """ params = torch.load(model_path, map_location=lambda storage, loc: storage) args = params['args'] model = NMT(vocab=params['vocab'], no_char_decoder=False, **args) model.load_state_dict(params['state_dict']) return model def save(self, path: str): """ Save the odel to a file. @param path (str): path to the model """ print('save model parameters to [%s]' % path, file=sys.stderr) params = { 'args': dict(embed_size=self.model_word_embeddings.embed_size, hidden_size=self.hidden_size, weights=self.model_word_embeddings.source.weight, dropout_rate=self.dropout_rate), 'vocab': self.vocab, 'state_dict': self.state_dict() } torch.save(params, path)
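# A hypothetical save/load round trip for the NMT class defined above (the file name and
# the pre-existing `model` instance are illustrative only): save() stores the constructor
# args, the vocab and the state_dict, and the static load() rebuilds the model from them.
model.save('model.bin')
reloaded = NMT.load('model.bin', no_char_decoder=False)
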
class NMT(nn.Module): """ Simple Neural Machine Translation Model: - Bidrectional LSTM Encoder - Unidirection LSTM Decoder - Global Attention Model (Luong, et al. 2015) """ def __init__(self, word_embed_size, hidden_size, vocab, dropout_rate=0.3, no_char_decoder=False): """ Init NMT Model. @param word_embed_size (int): Embedding size (dimensionality) of word @param hidden_size (int): Hidden Size (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention """ super(NMT, self).__init__() self.model_embeddings_source = ModelEmbeddings(word_embed_size, vocab.src) self.model_embeddings_target = ModelEmbeddings(word_embed_size, vocab.tgt) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab ### COPY OVER YOUR CODE FROM ASSIGNMENT 4 # default values self.encoder = None self.decoder = None self.h_projection = None self.c_projection = None self.att_projection = None self.combined_output_projection = None self.target_vocab_projection = None self.dropout = None # For sanity check only, not relevant to implementation self.gen_sanity_check = False self.counter = 0 self.encoder = nn.LSTM(word_embed_size, hidden_size, bias=True, bidirectional=True) self.decoder = nn.LSTMCell(word_embed_size + hidden_size, hidden_size, bias=True) self.h_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False) # Wh self.c_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False) # Wc self.att_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False) self.combined_output_projection = nn.Linear(hidden_size * 3, hidden_size, bias=False) # Wu self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False) # Wvocab self.dropout = nn.Dropout(self.dropout_rate) ### END YOUR CODE FROM ASSIGNMENT 4 if not no_char_decoder: self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt) else: self.charDecoder = None def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor: """ Take a mini-batch of source and target sentences, compute the log-likelihood of target sentences under the language models learned by the NMT system. @param source (List[List[str]]): list of source sentence tokens @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>` @returns scores (Tensor): a variable/tensor of one number representing the log-likelihood of generating the gold-standard target sentence for each example in the input batch. Here b = batch size. """ # Compute sentence lengths source_lengths = [len(s) for s in source] # Convert list of lists into tensors # source_padded = self.vocab.src.to_input_tensor(source, device=self.device) # Tensor: (src_len, b) ### YOUR CODE HERE for part 1i ### TODO: ### Modify the code lines above as needed to fetch the character-level tensor ### to feed into encode() and decode(). 
You should: ### - Keep `target_padded` from A4 code above for predictions target_padded = self.vocab.tgt.to_input_tensor( target, device=self.device) # Tensor: (tgt_len, b) ### - Add `source_padded_chars` for character level padded encodings for source source_padded_chars = self.vocab.src.to_input_tensor_char( source, self.device) # tensor of (max_sentence_length, bs, max_word_length) ### - Add `target_padded_chars` for character level padded encodings for target target_padded_chars = self.vocab.tgt.to_input_tensor_char( target, self.device) ### - Modify calls to encode() and decode() to use the character level encodings enc_hiddens, dec_init_state = self.encode(source_padded_chars, source_lengths) enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths) combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded_chars) ### END YOUR CODE P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1) # Zero out, probabilities for which we have nothing in the target text target_masks = (target_padded != self.vocab.tgt['<pad>']).float() # Compute log probability of generating true target words target_gold_words_log_prob = torch.gather( P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:] scores = target_gold_words_log_prob.sum( ) # mhahn2 Small modification from A4 code. if self.charDecoder is not None: # note that max_sentence_length is the number of words for the longest sentence # max_word_length is the number of characters for the longest word (of the entire batch) # and bs is the number of sentences in 1 batch # target_padded_chars: tensor of (max_sentence_length, bs, max_word_length) max_word_len = target_padded_chars.shape[-1] # target_padded: tensor of (max_sent_length, bs) target_words = target_padded[1:].contiguous().view( -1) # (max_sent_length * bs), this is not used target_chars = target_padded_chars[1:].view( -1, max_word_len) # (max_sent_length * bs, max_word_length). # Note that we skip the first word of each sentence, which is a <sentence start> token # combined_outputs: tensor shape (tgt_len, bs, hidden size) target_outputs = combined_outputs.view( -1, 256) # (max_sent_length * bs, hidden size) target_chars_oov = target_chars # torch.index_select(target_chars, dim=0, index=oovIndices) rnn_states_oov = target_outputs # torch.index_select(target_outputs, dim=0, index=oovIndices) oovs_losses = self.charDecoder.train_forward( target_chars_oov.t().contiguous(), (rnn_states_oov.unsqueeze(0), rnn_states_oov.unsqueeze(0))) scores = scores - oovs_losses return scores def encode( self, source_padded: torch.Tensor, source_lengths: List[int] ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """ Apply the encoder to source sentences to obtain encoder hidden states. Additionally, take the final states of the encoder and project them to obtain initial states for decoder. @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b, max_word_length), where b = batch_size, src_len = maximum source sentence length. Note that these have already been sorted in order of longest to shortest sentence. @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial hidden state and cell. 
""" enc_hiddens, dec_init_state = None, None ### COPY OVER YOUR CODE FROM ASSIGNMENT 4 ### 1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings. ### src_len = maximum source sentence length, b = batch size, e = embedding size. Note ### that there is no initial hidden state or cell for the decoder. X = self.model_embeddings_source( source_padded) #(sen_len,bs,wordemb_sz) ### 2. Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`. ### - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X. X = pack_padded_sequence( X, source_lengths ) # since we pad each sentence, we need to call this to pack them into tensor enc_hiddens, (last_hidden, last_cell) = self.encoder(X) # foir LSTM, if (h_0, c_0) is not provided, both h_0 and c_0 default to zero. # enc_hiddens: (sen,bs,h*2) # last_hidden: (2 b/c biLSTM,bs,h) # last_cell: (2,bs,h) ### - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens. enc_hiddens, _ = pad_packed_sequence(enc_hiddens) # (sen_len,bs,h*2) ### - Note that the shape of the tensor returned by the encoder is (src_len, b, h*2) and we want to ### return a tensor of shape (bs, src_len, h*2) as `enc_hiddens`. enc_hiddens = enc_hiddens.permute(1, 0, 2) #(bs, src_len, h*2) ### 3. Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell): ### - `init_decoder_hidden`: ### `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards. ### Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h). ### Apply the h_projection layer to this in order to compute init_decoder_hidden. ### This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size last_hidden = torch.cat((last_hidden[0], last_hidden[1]), dim=1) # (2,b,h) -> (b,h*2) init_decoder_hidden = self.h_projection(last_hidden) ### - `init_decoder_cell`: ### `last_cell` is a tensor shape (2, b, h). The first dimension corresponds to forwards ### and backwards. ### Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h). ### Apply the c_projection layer to this in order to compute init_decoder_cell. ### This is c_0^{dec} in the PDF. Here b = batch size, h = hidden size last_cell = torch.cat((last_cell[0], last_cell[1]), dim=1) init_decoder_cell = self.c_projection(last_cell) dec_init_state = init_decoder_hidden, init_decoder_cell ### END YOUR CODE FROM ASSIGNMENT 4 return enc_hiddens, dec_init_state def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor, dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor: """Compute combined output vectors for a batch. @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where b = batch size, src_len = maximum source sentence length. @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b, max_word_length), where tgt_len = maximum target sentence length, b = batch size. @returns combined_outputs (Tensor): combined output tensor (tgt_len, b, h), where tgt_len = maximum target sentence length, b = batch_size, h = hidden size """ # Chop of the <END> token for max length sentences. 
target_padded = target_padded[:-1] # Initialize the decoder state (hidden and cell) dec_state = dec_init_state # Initialize previous combined output vector o_{t-1} as zeros batch_size = enc_hiddens.size(0) o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device) # Initialize a list we will use to collect the combined output o_t on each step combined_outputs = [] ### COPY OVER YOUR CODE FROM ASSIGNMENT 4 ### 1. Apply the attention projection layer to `enc_hiddens` to obtain `enc_hiddens_proj`, ### which should be shape (b, src_len, h), ### where b = batch size, src_len = maximum source length, h = hidden size. ### This is applying W_{attProj} to h^enc, as described in the PDF. enc_hiddens_proj = self.att_projection(enc_hiddens) # (b, src_len, h) ### 2. Construct tensor `Y` of target sentences with shape (tgt_len, b, e) using the target model embeddings. ### where tgt_len = maximum target sentence length, b = batch size, e = embedding size. Y = self.model_embeddings_target( target_padded) # (tgt_len,bs,wordemb_sz) ### 3. Use the torch.split function to iterate over the time dimension of Y. ### Within the loop, this will give you Y_t of shape (1, b, e) where b = batch size, e = embedding size. ### - Squeeze Y_t into a tensor of dimension (b, e). ### - Construct Ybar_t by concatenating Y_t with o_prev on their last dimension ### - Use the step function to compute the the Decoder's next (cell, state) values ### as well as the new combined output o_t. ### - Append o_t to combined_outputs ### - Update o_prev to the new o_t. for Y_t in torch.split( Y, 1, dim=0): # same as looping through 1st dimension of this tensor Y_t = torch.squeeze(Y_t, dim=0) Ybar_t = torch.cat([Y_t, o_prev], dim=1) dec_state, o_t, _ = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks) combined_outputs.append(o_t) o_prev = o_t ### 4. Use torch.stack to convert combined_outputs from a list length tgt_len of ### tensors shape (b, h), to a single tensor shape (tgt_len, b, h) ### where tgt_len = maximum target sentence length, b = batch size, h = hidden size. ### combined_outputs = torch.stack(combined_outputs, dim=0) ### END YOUR CODE FROM ASSIGNMENT 4 return combined_outputs def step( self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor, torch.Tensor], enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor, enc_masks: torch.Tensor ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]: """ Compute one forward step of the LSTM decoder, including the attention computation. @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder, where b = batch size, e = embedding size, h = hidden size. @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size. First tensor is decoder's prev hidden state, second tensor is decoder's prev cell. @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size, src_len = maximum source length, h = hidden size. @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h), where b = batch size, src_len = maximum source length, h = hidden size. @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len), where b = batch size, src_len is maximum source length. @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size. 
First tensor is decoder's new hidden state, second tensor is decoder's new cell. @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size. @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution. Note: You will not use this outside of this function. We are simply returning this value so that we can sanity check your implementation. """ combined_output = None ### COPY OVER YOUR CODE FROM ASSIGNMENT 4 ### 1. Apply the decoder to `Ybar_t` and `dec_state`to obtain the new dec_state. dec_state = self.decoder(Ybar_t, dec_state) ### 2. Split dec_state into its two parts (dec_hidden, dec_cell) dec_hidden, dec_cell = dec_state ### 3. Compute the attention scores e_t, a Tensor shape (b, src_len). ### Note: b = batch_size, src_len = maximum source length, h = hidden size. ### ### Hints: ### - dec_hidden is shape (b, h) and corresponds to h^dec_t in the PDF (batched) ### - enc_hiddens_proj is shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched). ### - Use batched matrix multiplication (torch.bmm) to compute e_t (be careful about the input/ output shapes!) ### - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing. ### - When using the squeeze() function make sure to specify the dimension you want to squeeze ### over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1. ### e_t = enc_hiddens_proj.bmm(dec_hidden.unsqueeze(2)).squeeze(2) # (bs,1,src_len) @ (bs,src_len,2h) = (bs,1,2h) = (bs,2h) ### END YOUR CODE FROM ASSIGNMENT 4 # Set e_t to -inf where enc_masks has 1 # So that when do softmax on these paddings of this sentence, the attribution score will be 0 (e^-inf = 0) # example: sentence [il,a,m,entarte,<PAD>] (max source length = 5) will have enc_masks [0,0,0,0,1] # and with attribution score (pre_softmax) e_t such as [3,-1,0,-2,5], # 5 will be such a high attribution score for a meaningless padding # so we need to neutralize it by applying mask on so that e_t will be [3,-1,0,-2,-inf] if enc_masks is not None: e_t.data.masked_fill_(enc_masks.bool(), -float('inf')) ### COPY OVER YOUR CODE FROM ASSIGNMENT 4 ### 1. Apply softmax to e_t to yield alpha_t alpha_t = F.softmax(e_t, dim=1) # (bs, src_len) ### 2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the ### attention output vector, a_t. ### - alpha_t is shape (b, src_len) ### - enc_hiddens is shape (b, src_len, 2h) ### - a_t should be shape (b, 2h) ### - You will need to do some squeezing and unsqueezing. ### Note: b = batch size, src_len = maximum source length, h = hidden size. ### # att_view = (alpha_t.size(0), 1, alpha_t.size(1)) # a_t = torch.bmm(alpha_t.view(*att_view), enc_hiddens).squeeze(1) # (b,2h,src_len) @ (b,src_len,1) a_t = enc_hiddens.permute(0, 2, 1).bmm(alpha_t.unsqueeze(2)).squeeze(2) ### 3. Concatenate dec_hidden with a_t to compute tensor U_t U_t = torch.cat([dec_hidden, a_t], dim=1) ### 4. Apply the combined output projection layer to U_t to compute tensor V_t V_t = self.combined_output_projection(U_t) ### 5. Compute tensor O_t by first applying the Tanh function and then the dropout layer. O_t = self.dropout(torch.tanh(V_t)) ### END YOUR CODE FROM ASSIGNMENT 4 combined_output = O_t return dec_state, combined_output, e_t def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor: """ Generate sentence masks for encoder hidden states. 
@param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size, src_len = max source length, h = hidden size. @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch. @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len), where src_len = max source length, h = hidden size. """ enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float) for e_id, src_len in enumerate(source_lengths): enc_masks[e_id, src_len:] = 1 return enc_masks.to(self.device) def beam_search(self, src_sent: List[str], beam_size: int = 5, max_decoding_time_step: int = 70) -> List[Hypothesis]: """ Given a single source sentence, perform beam search, yielding translations in the target language. @param src_sent (List[str]): a single source sentence (words) @param beam_size (int): beam size @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields: value: List[str]: the decoded target sentence, represented as a list of words score: float: the log-likelihood of the target sentence """ src_sents_var = self.vocab.src.to_input_tensor_char([src_sent], self.device) src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)]) src_encodings_att_linear = self.att_projection(src_encodings) h_tm1 = dec_init_vec att_tm1 = torch.zeros(1, self.hidden_size, device=self.device) eos_id = self.vocab.tgt['</s>'] hypotheses = [['<s>']] hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device) completed_hypotheses = [] t = 0 while len(completed_hypotheses ) < beam_size and t < max_decoding_time_step: t += 1 hyp_num = len(hypotheses) exp_src_encodings = src_encodings.expand(hyp_num, src_encodings.size(1), src_encodings.size(2)) exp_src_encodings_att_linear = src_encodings_att_linear.expand( hyp_num, src_encodings_att_linear.size(1), src_encodings_att_linear.size(2)) y_tm1 = self.vocab.tgt.to_input_tensor_char(list( [hyp[-1]] for hyp in hypotheses), device=self.device) y_t_embed = self.model_embeddings_target(y_tm1) y_t_embed = torch.squeeze(y_t_embed, dim=0) x = torch.cat([y_t_embed, att_tm1], dim=-1) (h_t, cell_t), att_t, _ = self.step(x, h_tm1, exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None) # log probabilities over target words log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1) live_hyp_num = beam_size - len(completed_hypotheses) contiuating_hyp_scores = ( hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1) top_cand_hyp_scores, top_cand_hyp_pos = torch.topk( contiuating_hyp_scores, k=live_hyp_num) prev_hyp_ids = top_cand_hyp_pos / len(self.vocab.tgt) hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt) new_hypotheses = [] live_hyp_ids = [] new_hyp_scores = [] decoderStatesForUNKsHere = [] for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip( prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores): prev_hyp_id = prev_hyp_id.item() hyp_word_id = hyp_word_id.item() cand_new_hyp_score = cand_new_hyp_score.item() hyp_word = self.vocab.tgt.id2word[hyp_word_id] # Record output layer in case UNK was generated if hyp_word == "<unk>": hyp_word = "<unk>" + str(len(decoderStatesForUNKsHere)) decoderStatesForUNKsHere.append(att_t[prev_hyp_id]) new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word] if hyp_word == '</s>': completed_hypotheses.append( Hypothesis(value=new_hyp_sent[1:-1], score=cand_new_hyp_score)) else: 
new_hypotheses.append(new_hyp_sent) live_hyp_ids.append(prev_hyp_id) new_hyp_scores.append(cand_new_hyp_score) if len(decoderStatesForUNKsHere ) > 0 and self.charDecoder is not None: # decode UNKs decoderStatesForUNKsHere = torch.stack( decoderStatesForUNKsHere, dim=0) decodedWords = self.charDecoder.decode_greedy( (decoderStatesForUNKsHere.unsqueeze(0), decoderStatesForUNKsHere.unsqueeze(0)), max_length=21, device=self.device) assert len(decodedWords) == decoderStatesForUNKsHere.size( )[0], "Incorrect number of decoded words" for hyp in new_hypotheses: if hyp[-1].startswith("<unk>"): hyp[-1] = decodedWords[int(hyp[-1][5:])] # [:-1] if len(completed_hypotheses) == beam_size: break live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device) h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids]) att_tm1 = att_t[live_hyp_ids] hypotheses = new_hypotheses hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device) if len(completed_hypotheses) == 0: completed_hypotheses.append( Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item())) completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True) return completed_hypotheses @property def device(self) -> torch.device: """ Determine which device to place the Tensors upon, CPU or GPU. """ return self.att_projection.weight.device @staticmethod def load(model_path: str, no_char_decoder=False): """ Load the model from a file. @param model_path (str): path to model """ params = torch.load(model_path, map_location=lambda storage, loc: storage) args = params['args'] model = NMT(vocab=params['vocab'], no_char_decoder=no_char_decoder, **args) model.load_state_dict(params['state_dict']) return model def save(self, path: str): """ Save the odel to a file. @param path (str): path to the model """ print('save model parameters to [%s]' % path, file=sys.stderr) params = { 'args': dict(word_embed_size=self.model_embeddings_source.word_embed_size, hidden_size=self.hidden_size, dropout_rate=self.dropout_rate), 'vocab': self.vocab, 'state_dict': self.state_dict() } torch.save(params, path)
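# Illustrative sketch, separate from the model code above: the comment block in
# step() explains why padded source positions are pushed to -inf before the
# softmax.  This helper replays that masking logic on random tensors (b = batch
# size, src_len = max source length, h = hidden size) so the effect on alpha_t
# can be checked in isolation.
def _attention_masking_sketch():
    import torch
    import torch.nn.functional as F

    b, src_len, h = 2, 4, 3
    dec_hidden = torch.randn(b, h)                 # h^dec_t
    enc_hiddens = torch.randn(b, src_len, 2 * h)   # h^enc (bidirectional)
    enc_hiddens_proj = torch.randn(b, src_len, h)  # W_attProj applied to h^enc
    source_lengths = [4, 2]                        # second sentence has two pads

    enc_masks = torch.zeros(b, src_len)
    for i, length in enumerate(source_lengths):
        enc_masks[i, length:] = 1                  # 1 marks a padding position

    e_t = torch.bmm(enc_hiddens_proj, dec_hidden.unsqueeze(2)).squeeze(2)  # (b, src_len)
    e_t = e_t.masked_fill(enc_masks.bool(), -float('inf'))
    alpha_t = F.softmax(e_t, dim=1)                # pads receive exactly zero weight
    a_t = torch.bmm(alpha_t.unsqueeze(1), enc_hiddens).squeeze(1)          # (b, 2h)

    assert torch.allclose(alpha_t[1, 2:], torch.zeros(2))
    return a_t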
def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2, spectrum_cnn_kernel_size=3, location_attention_window=64, no_char_decoder=False):
    """ Init NMT Model.
    @param embed_size (int): Embedding size (dimensionality)
    @param hidden_size (int): Hidden Size (dimensionality)
    @param vocab (Vocab): Vocabulary object containing src and tgt languages
                          See vocab.py for documentation.
    @param dropout_rate (float): Dropout probability, for attention
    @param spectrum_cnn_kernel_size (int): Kernel size of the Conv1d applied to the input spectrum
    @param location_attention_window (int): Window size used for location-based attention
    """
    super(NMT, self).__init__()
    # self.voiceCNN = VoiceCNN(embed_size, 5)
    self.location_attention_window = location_attention_window
    self.spectrum_cnn_kernel_size = spectrum_cnn_kernel_size
    self.spectrumCNN = nn.Conv1d(embed_size, embed_size, self.spectrum_cnn_kernel_size)
    # self.model_embeddings_source = ModelEmbeddings(embed_size, vocab.src)
    self.model_embeddings_target = ModelEmbeddings(embed_size, vocab.tgt)
    self.embed_size = embed_size
    self.hidden_size = hidden_size
    self.dropout_rate = dropout_rate
    self.vocab = vocab

    # COPY OVER YOUR CODE FROM ASSIGNMENT 4
    self.encoder = torch.nn.LSTM(embed_size, hidden_size, bidirectional=True)
    self.decoder = torch.nn.LSTMCell(embed_size + hidden_size, hidden_size)
    self.h_projection = torch.nn.Linear(2 * hidden_size, hidden_size, bias=False)
    self.c_projection = torch.nn.Linear(2 * hidden_size, hidden_size, bias=False)
    self.loc_window = 5
    self.loc_att_projection = torch.nn.Linear(embed_size, 1, bias=False)
    self.loc_att_conv1D = nn.Conv1d(self.loc_window, embed_size, 1)
    self.att_projection = torch.nn.Linear(2 * hidden_size, hidden_size, bias=False)
    self.combined_output_projection = torch.nn.Linear(3 * hidden_size, hidden_size, bias=False)
    self.target_vocab_projection = torch.nn.Linear(hidden_size, len(vocab.tgt), bias=False)
    self.dropout = nn.Dropout(p=dropout_rate)
    # END YOUR CODE FROM ASSIGNMENT 4

    if not no_char_decoder:
        self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt)
    else:
        self.charDecoder = None
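# Illustrative sketch, separate from the constructor above: spectrumCNN is an
# nn.Conv1d over the input features.  Conv1d expects (batch, channels, time),
# and with kernel_size=3 and no padding the time dimension shrinks by
# kernel_size - 1.  The sizes below (40 channels, 100 frames) are arbitrary and
# only demonstrate the shape change.
def _spectrum_cnn_shape_sketch():
    import torch
    import torch.nn as nn

    embed_size, kernel_size = 40, 3
    spectrum_cnn = nn.Conv1d(embed_size, embed_size, kernel_size)

    frames = torch.randn(8, embed_size, 100)       # (batch, channels, time)
    out = spectrum_cnn(frames)
    assert out.shape == (8, embed_size, 100 - (kernel_size - 1))
    return out.shape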
class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidirectional LSTM Encoder
        - Unidirectional LSTM Decoder
        - Global Attention Model
    """

    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2, no_char_decoder=False):
        """ Initialize the NMT Model.
        :param int embed_size: Embedding size (dimensionality)
        :param int hidden_size: Hidden Size (dimensionality)
        :param Vocab vocab: Vocabulary object containing src and tgt languages
                            See vocab.py for documentation.
        :param float dropout_rate: Dropout probability, for the attention combination layer
        """
        super(NMT, self).__init__()
        self.model_embeddings_source = ModelEmbeddings(embed_size, vocab.src)
        self.model_embeddings_target = ModelEmbeddings(embed_size, vocab.tgt)

        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        self.encoder = nn.LSTM(embed_size, hidden_size, bidirectional=True)
        self.decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size)
        # Need to feed in transpose of [h_enc(1)(<-) ; h_enc(m)(->)], and output is 1xh
        self.h_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
        # Need to feed in transpose of [c_enc(1)(<-); c_enc(m)(->)], and output is 1xh
        self.c_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
        self.att_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
        # Need to feed in transpose of u(t), and output is 1xh (v(t))
        self.combined_output_projection = nn.Linear(3 * hidden_size, hidden_size, bias=False)
        # Need to feed in transpose of o(t), and output is 1x|Vtg|
        self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False)
        self.dropout = nn.Dropout(dropout_rate)

        if not no_char_decoder:
            self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt)
        else:
            self.charDecoder = None

    def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        :param List[List[str]] source: list of source sentence tokens
        :param List[List[str]] target: list of target sentence tokens, wrapped by `<s>` and `</s>`
        :return Tensor: a variable/tensor of shape (b, ) representing the
            log-likelihood of generating the gold-standard target sentence for
            each example in the input batch. Here b = batch size.
""" # Compute sentence lengths source_lengths = [len(s) for s in source] # Convert list of lists into tensors target_padded = self.vocab.tgt.to_input_tensor(target, device=self.device) source_padded_chars = self.vocab.src.to_input_tensor_char(source, device=self.device) target_padded_chars = self.vocab.tgt.to_input_tensor_char(target, device=self.device) enc_hiddens, dec_init_state = self.encode(source_padded_chars, source_lengths) enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths) combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded_chars) # Compute the softmax scores for all hidden states from the decoder (all in the batch, including masked ones) P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1) # Zero out, probabilities for which we have nothing in the target text (we get zeros for pad tokens) target_masks = (target_padded != self.vocab.tgt['<pad>']).float() # Compute log probability of generating true target words (ignoring the start token) target_gold_words_log_prob = torch.gather(P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze( -1) * target_masks[1:] scores = target_gold_words_log_prob.sum() if self.charDecoder is not None: max_word_len = target_padded_chars.shape[-1] target_words = target_padded[1:].contiguous().view(-1) target_chars = target_padded_chars[1:].reshape(-1, max_word_len) target_outputs = combined_outputs.view(-1, 256) target_chars_oov = target_chars # torch.index_select(target_chars, dim=0, index=oovIndices) rnn_states_oov = target_outputs # torch.index_select(target_outputs, dim=0, index=oovIndices) oovs_losses = self.charDecoder.train_forward(target_chars_oov.t(), (rnn_states_oov.unsqueeze(0), rnn_states_oov.unsqueeze(0))) scores = scores - oovs_losses return scores def encode(self, source_padded: torch.Tensor, source_lengths: List[int]) -> \ Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """ Apply the encoder to source sentences to obtain encoder hidden states. Additionally, take the final states of the encoder and project them to obtain initial states for decoder. :param Tensor source_padded: Tensor of padded source sentences with shape (src_len, b, max_word_length), where b = batch_size, src_len = maximum source sentence length (already sorted in order of longest to shortest sentence). :param List[int] source_lengths: List of actual lengths for each of the source sentences in the batch. :return Tensor: Tensor of hidden units with shape (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. :return tuple(Tensor, Tensor): Tuple of tensors representing the decoder's initial hidden state and cell. """ enc_hiddens, dec_init_state = None, None X = self.model_embeddings_source(source_padded) X = nn.utils.rnn.pack_padded_sequence(X, source_lengths) enc_hiddens, (last_hidden, last_cell) = self.encoder(X) enc_hiddens, _ = nn.utils.rnn.pad_packed_sequence(enc_hiddens, batch_first=True) init_decoder_hidden = torch.cat((last_hidden[0], last_hidden[1]), 1) init_decoder_hidden = self.h_projection(init_decoder_hidden) init_decoder_cell = torch.cat((last_cell[0], last_cell[1]), 1) init_decoder_cell = self.c_projection(init_decoder_cell) dec_init_state = (init_decoder_hidden, init_decoder_cell) return enc_hiddens, dec_init_state def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor, dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor: """Compute combined output vectors for a batch. 
:param Tensor enc_hiddens: Hidden states (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. :param Tensor enc_masks: Tensor of sentence masks (b, src_len), where b = batch size, src_len = maximum source sentence length. :param tuple(Tensor, Tensor) dec_init_state: Initial state and cell for decoder :param Tensor target_padded: Gold-standard padded target sentences (tgt_len, b, max_word_length), where tgt_len = maximum target sentence length, b = batch size. :return Tensor: combined output tensor (tgt_len, b, h), where tgt_len = maximum target sentence length, b = batch_size, h = hidden size """ # Remove the <END> token for max length sentences. target_padded = target_padded[:-1] # Initialize the decoder state (hidden and cell) dec_state = dec_init_state # Initialize previous combined output vector o_{t-1} as zero batch_size = enc_hiddens.size(0) o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device) # Initialize a list we will use to collect the combined output o_t on each step (output of each decoder step) combined_outputs = [] enc_hiddens_proj = self.att_projection(enc_hiddens) Y = self.model_embeddings_target(target_padded) for Y_t in torch.split(Y, 1, 0): Y_t = torch.squeeze(Y_t, dim=0) Ybar_t = torch.cat((Y_t, o_prev), dim=1) dec_state, o_t, _ = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks) combined_outputs.append(o_t) o_prev = o_t combined_outputs = torch.stack(combined_outputs, dim=0) return combined_outputs def step(self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor, torch.Tensor], enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor, enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]: """ Compute one forward step of the LSTM decoder, including the attention computation. :param Tensor Ybar_t: Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder, where b = batch size, e = embedding size, h = hidden size. :param tuple(Tensor, Tensor) dec_state: Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size. First tensor is decoder's prev hidden state, second tensor is decoder's prev cell. :param Tensor enc_hiddens: Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size, src_len = maximum source length, h = hidden size. :param Tensor enc_hiddens_proj: Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is of shape (b, src_len, h), where b = batch size, src_len = maximum source length, h = hidden size. :param Tensor enc_masks: Tensor of sentence masks shape (b, src_len), where b = batch size, src_len is maximum source length. :return tuple(Tensor, Tensor): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size. First tensor is decoder's new hidden state, second tensor is decoder's new cell. :return Tensor: Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size. :return Tensor: Tensor of shape (b, src_len). It is attention scores distribution. 
""" combined_output = None dec_state = self.decoder(Ybar_t, dec_state) dec_hidden, dec_cell = dec_state batch2 = torch.unsqueeze(dec_hidden, 2) e_t = torch.bmm(enc_hiddens_proj, batch2) e_t = torch.squeeze(e_t, dim=2) # Set e_t to -inf where enc_masks has 1 if enc_masks is not None: e_t.data.masked_fill_(enc_masks.bool(), -float('inf')) alpha_t = nn.functional.softmax(e_t, 1) alpha_t = torch.unsqueeze(alpha_t, dim=1) a_t = torch.bmm(alpha_t, enc_hiddens) a_t = torch.squeeze(a_t, dim=1) U_t = torch.cat((a_t, dec_hidden), dim=1) V_t = self.combined_output_projection(U_t) O_t = self.dropout(torch.tanh(V_t)) combined_output = O_t return dec_state, combined_output, e_t def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor: """ Generate sentence masks for encoder hidden states. :param Tensor enc_hiddens: encodings of shape (b, src_len, 2*h), where b = batch size, src_len = max source length, h = hidden size. :param List[int] source_lengths: List of actual lengths for each of the sentences in the batch. :return Tensor: Tensor of sentence masks of shape (b, src_len), where src_len = max source length, h = hidden size. """ enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float) for e_id, src_len in enumerate(source_lengths): enc_masks[e_id, src_len:] = 1 return enc_masks.to(self.device) def beam_search(self, src_sent: List[str], beam_size: int = 5, max_decoding_time_step: int = 70) -> List[ Hypothesis]: """ Given a single source sentence, perform beam search, yielding translations in the target language. :param List[str] src_sent: a single source sentence (words) :param int beam_size: beam size :param int max_decoding_time_step: maximum number of time steps to unroll the decoding RNN :return List[Hypothesis]: a list of hypothesis, each hypothesis has two fields: value List[str]: the decoded target sentence, represented as a list of words score float: the log-likelihood of the target sentence """ src_sents_var = self.vocab.src.to_input_tensor_char([src_sent], self.device) src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)]) src_encodings_att_linear = self.att_projection(src_encodings) h_tm1 = dec_init_vec att_tm1 = torch.zeros(1, self.hidden_size, device=self.device) eos_id = self.vocab.tgt['</s>'] hypotheses = [['<s>']] hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device) completed_hypotheses = [] t = 0 while len(completed_hypotheses) < beam_size and t < max_decoding_time_step: t += 1 hyp_num = len(hypotheses) exp_src_encodings = src_encodings.expand(hyp_num, src_encodings.size(1), src_encodings.size(2)) exp_src_encodings_att_linear = src_encodings_att_linear.expand(hyp_num, src_encodings_att_linear.size(1), src_encodings_att_linear.size(2)) y_tm1 = self.vocab.tgt.to_input_tensor_char(list([hyp[-1]] for hyp in hypotheses), device=self.device) y_t_embed = self.model_embeddings_target(y_tm1) y_t_embed = torch.squeeze(y_t_embed, dim=0) x = torch.cat([y_t_embed, att_tm1], dim=-1) (h_t, cell_t), att_t, _ = self.step(x, h_tm1, exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None) # log probabilities over target words log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1) live_hyp_num = beam_size - len(completed_hypotheses) contiuating_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1) top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(contiuating_hyp_scores, k=live_hyp_num) prev_hyp_ids = top_cand_hyp_pos / len(self.vocab.tgt) 
hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt) new_hypotheses = [] live_hyp_ids = [] new_hyp_scores = [] decoder_states_for_unks = [] for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores): prev_hyp_id = prev_hyp_id.item() hyp_word_id = hyp_word_id.item() cand_new_hyp_score = cand_new_hyp_score.item() hyp_word = self.vocab.tgt.id2word[hyp_word_id] # Record output layer in case UNK was generated if hyp_word == "<unk>": hyp_word = "<unk>" + str(len(decoder_states_for_unks)) decoder_states_for_unks.append(att_t[prev_hyp_id]) new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word] if hyp_word == '</s>': completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1], score=cand_new_hyp_score)) else: new_hypotheses.append(new_hyp_sent) live_hyp_ids.append(prev_hyp_id) new_hyp_scores.append(cand_new_hyp_score) if len(decoder_states_for_unks) > 0 and self.charDecoder is not None: # decode UNKs decoder_states_for_unks = torch.stack(decoder_states_for_unks, dim=0) decoded_words = self.charDecoder.decode_greedy((decoder_states_for_unks.unsqueeze(0), decoder_states_for_unks.unsqueeze(0)), max_length=21, device=self.device) assert len(decoded_words) == decoder_states_for_unks.size()[0], "Incorrect number of decoded words" for hyp in new_hypotheses: if hyp[-1].startswith("<unk>"): hyp[-1] = decoded_words[int(hyp[-1][5:])] if len(completed_hypotheses) == beam_size: break live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device) h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids]) att_tm1 = att_t[live_hyp_ids] hypotheses = new_hypotheses hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device) if len(completed_hypotheses) == 0: completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item())) completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True) return completed_hypotheses @property def device(self) -> torch.device: """ Determine which device to place the Tensors upon, CPU or GPU. """ return self.att_projection.weight.device @staticmethod def load(model_path: str, no_char_decoder=False): """ Load the model from a file. :param str model_path: path to model :param boolean no_char_decoder: whether the char-level decoder is also used """ params = torch.load(model_path, map_location=lambda storage, loc: storage) args = params['args'] model = NMT(vocab=params['vocab'], no_char_decoder=no_char_decoder, **args) model.load_state_dict(params['state_dict']) return model def save(self, path: str): """ Save the model to a file. :param str path: path to the model parameters """ print('save model parameters to [%s]' % path, file=sys.stderr) params = { 'args': dict(embed_size=self.model_embeddings_source.embed_size, hidden_size=self.hidden_size, dropout_rate=self.dropout_rate), 'vocab': self.vocab, 'state_dict': self.state_dict() } torch.save(params, path)
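# Illustrative sketch, separate from the model code above: encode() packs the
# padded batch before the bidirectional LSTM and unpacks it afterwards with
# batch_first=True.  This helper replays that round trip on random data
# (embedding size 5, hidden size 4, two sentences of lengths 3 and 2) so the
# resulting shapes can be inspected directly.
def _pack_unpack_sketch():
    import torch
    import torch.nn as nn
    from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

    embed_size, hidden_size = 5, 4
    encoder = nn.LSTM(embed_size, hidden_size, bidirectional=True)

    source_lengths = [3, 2]                         # already sorted, longest first
    X = torch.randn(3, 2, embed_size)               # (src_len, b, e), padded
    packed = pack_padded_sequence(X, source_lengths)
    enc_hiddens, (last_hidden, last_cell) = encoder(packed)
    enc_hiddens, _ = pad_packed_sequence(enc_hiddens, batch_first=True)

    assert enc_hiddens.shape == (2, 3, 2 * hidden_size)    # (b, src_len, 2h)
    assert last_hidden.shape == (2, 2, hidden_size)        # (directions, b, h)
    return enc_hiddens.shape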
class DPPNMT(nn.Module): """ Simple Neural Machine Translation Model: - Bidrectional LSTM Encoder - Unidirection LSTM Decoder - Global Attention Model (Luong, et al. 2015) """ def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2, no_char_decoder=False, nmt_model=None): """ Init NMT Model. @param embed_size (int): Embedding size (dimensionality) @param hidden_size (int): Hidden Size (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention @param nmt_model (NMT): a5 NMT Model (without DPP) to initialize layers with """ super(DPPNMT, self).__init__() if nmt_model is None: self.model_embeddings_source = ModelEmbeddings( embed_size, vocab.src) self.model_embeddings_target = ModelEmbeddings( embed_size, vocab.tgt) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab self.embed_size = embed_size self.encoder = nn.LSTM(embed_size, hidden_size, bidirectional=True) self.decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size) self.h_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False) self.c_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False) self.att_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False) self.combined_output_projection = nn.Linear(hidden_size * 2 + hidden_size, hidden_size, bias=False) self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False) self.dropout = nn.Dropout(self.dropout_rate) if not no_char_decoder: self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt) else: self.charDecoder = None else: self.model_embeddings_source = nmt_model.model_embeddings_source self.model_embeddings_target = nmt_model.model_embeddings_target self.hidden_size = nmt_model.hidden_size self.dropout_rate = nmt_model.dropout_rate self.vocab = nmt_model.vocab self.embed_size = nmt_model.model_embeddings_source.embed_size self.encoder = nmt_model.encoder self.decoder = nmt_model.decoder self.h_projection = nmt_model.h_projection self.c_projection = nmt_model.c_projection self.att_projection = nmt_model.att_projection self.combined_output_projection = nmt_model.combined_output_projection self.target_vocab_projection = nmt_model.target_vocab_projection self.dropout = nmt_model.dropout self.charDecoder = nmt_model.charDecoder def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor: """ Take a mini-batch of source and target sentences, compute the log-likelihood of target sentences under the language models learned by the NMT system. @param source (List[List[str]]): list of source sentence tokens @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>` @returns scores (Tensor): a variable/tensor of shape (b, ) representing the log-likelihood of generating the gold-standard target sentence for each example in the input batch. Here b = batch size. 
""" # Compute sentence lengths source_lengths = [len(s) for s in source] # Convert list of lists into tensors target_padded = self.vocab.tgt.to_input_tensor(target, device=self.device) source_padded_chars = self.vocab.src.to_input_tensor_char( source, device=self.device) target_padded_chars = self.vocab.tgt.to_input_tensor_char( target, device=self.device) enc_hiddens, dec_init_state = self.encode(source_padded_chars, source_lengths) enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths) combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded_chars) P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1) # Zero out, probabilities for which we have nothing in the target text target_masks = (target_padded != self.vocab.tgt['<pad>']).float() # Compute log probability of generating true target words target_gold_words_log_prob = torch.gather( P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:] scores = target_gold_words_log_prob.sum( ) # mhahn2 Small modification from A4 code. if self.charDecoder is not None: max_word_len = target_padded_chars.shape[-1] target_words = target_padded[1:].contiguous().view(-1) target_chars = target_padded_chars[1:].contiguous().view( -1, max_word_len) target_outputs = combined_outputs.view(-1, 256) target_chars_oov = target_chars #torch.index_select(target_chars, dim=0, index=oovIndices) rnn_states_oov = target_outputs #torch.index_select(target_outputs, dim=0, index=oovIndices) oovs_losses = self.charDecoder.train_forward( target_chars_oov.t(), (rnn_states_oov.unsqueeze(0), rnn_states_oov.unsqueeze(0))) scores = scores - oovs_losses return scores def encode( self, source_padded: torch.Tensor, source_lengths: List[int] ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """ Apply the encoder to source sentences to obtain encoder hidden states. Additionally, take the final states of the encoder and project them to obtain initial states for decoder. @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b, max_word_length), where b = batch_size, src_len = maximum source sentence length. Note that these have already been sorted in order of longest to shortest sentence. @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial hidden state and cell. """ enc_hiddens, dec_init_state = None, None X = self.model_embeddings_source(source_padded) X_packed = pack_padded_sequence(X, source_lengths) enc_hiddens, (last_hidden, last_cell) = self.encoder(X_packed) (enc_hiddens, _) = pad_packed_sequence(enc_hiddens) enc_hiddens = enc_hiddens.permute(1, 0, 2) init_decoder_hidden = self.h_projection( torch.cat((last_hidden[0], last_hidden[1]), dim=1)) init_decoder_cell = self.c_projection( torch.cat((last_cell[0], last_cell[1]), dim=1)) dec_init_state = (init_decoder_hidden, init_decoder_cell) return enc_hiddens, dec_init_state def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor, dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor: """Compute combined output vectors for a batch. 
@param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where b = batch size, src_len = maximum source sentence length. @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b, max_word_length), where tgt_len = maximum target sentence length, b = batch size. @returns combined_outputs (Tensor): combined output tensor (tgt_len, b, h), where tgt_len = maximum target sentence length, b = batch_size, h = hidden size """ # Chop of the <END> token for max length sentences. target_padded = target_padded[:-1] # Initialize the decoder state (hidden and cell) dec_state = dec_init_state # Initialize previous combined output vector o_{t-1} as zero batch_size = enc_hiddens.size(0) o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device) # Initialize a list we will use to collect the combined output o_t on each step combined_outputs = [] enc_hiddens_proj = self.att_projection(enc_hiddens) Y = self.model_embeddings_target(target_padded) for Y_t in torch.split(Y, split_size_or_sections=1): Y_t = Y_t.squeeze(0) Ybar_t = torch.cat([Y_t, o_prev], dim=-1) dec_state, o_t, _ = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks) combined_outputs.append(o_t) o_prev = o_t combined_outputs = torch.stack(combined_outputs) return combined_outputs def step( self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor, torch.Tensor], enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor, enc_masks: torch.Tensor ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]: """ Compute one forward step of the LSTM decoder, including the attention computation. @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder, where b = batch size, e = embedding size, h = hidden size. @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size. First tensor is decoder's prev hidden state, second tensor is decoder's prev cell. @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size, src_len = maximum source length, h = hidden size. @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h), where b = batch size, src_len = maximum source length, h = hidden size. @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len), where b = batch size, src_len is maximum source length. @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size. First tensor is decoder's new hidden state, second tensor is decoder's new cell. @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size. @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution. Note: You will not use this outside of this function. We are simply returning this value so that we can sanity check your implementation. 
""" combined_output = None dec_state = self.decoder(Ybar_t, dec_state) (dec_hidden, dec_cell) = dec_state e_t = torch.bmm(enc_hiddens_proj, dec_hidden.unsqueeze(2)).squeeze(2) # Set e_t to -inf where enc_masks has 1 if enc_masks is not None: e_t.data.masked_fill_(enc_masks.byte(), -float('inf')) alpha_t = F.softmax(e_t, dim=-1) alpha_t_view = (alpha_t.size(0), 1, alpha_t.size(1)) a_t = torch.bmm(alpha_t.view(*alpha_t_view), enc_hiddens).squeeze(1) U_t = torch.cat([dec_hidden, a_t], 1) V_t = self.combined_output_projection(U_t) O_t = self.dropout(torch.tanh(V_t)) combined_output = O_t return dec_state, combined_output, e_t def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor: """ Generate sentence masks for encoder hidden states. @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size, src_len = max source length, h = hidden size. @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch. @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len), where src_len = max source length, h = hidden size. """ enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float) for e_id, src_len in enumerate(source_lengths): enc_masks[e_id, src_len:] = 1 return enc_masks.to(self.device) def beam_search(self, src_sent: List[str], beam_size: int = 5, max_decoding_time_step: int = 70) -> List[Hypothesis]: """ Given a single source sentence, perform beam search, yielding translations in the target language. @param src_sent (List[str]): a single source sentence (words) @param beam_size (int): beam size @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields: value: List[str]: the decoded target sentence, represented as a list of words score: float: the log-likelihood of the target sentence """ src_sents_var = self.vocab.src.to_input_tensor_char([src_sent], self.device) src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)]) src_encodings_att_linear = self.att_projection(src_encodings) h_tm1 = dec_init_vec att_tm1 = torch.zeros(1, self.hidden_size, device=self.device) eos_id = self.vocab.tgt['</s>'] hypotheses = [['<s>']] hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device) completed_hypotheses = [] t = 0 while len(completed_hypotheses ) < beam_size and t < max_decoding_time_step: if PRINT_HYPOTHESIS_TREE: print(sorted(hypotheses)) t += 1 hyp_num = len(hypotheses) exp_src_encodings = src_encodings.expand(hyp_num, src_encodings.size(1), src_encodings.size(2)) exp_src_encodings_att_linear = src_encodings_att_linear.expand( hyp_num, src_encodings_att_linear.size(1), src_encodings_att_linear.size(2)) y_tm1 = self.vocab.tgt.to_input_tensor_char(list( [hyp[-1]] for hyp in hypotheses), device=self.device) y_t_embed = self.model_embeddings_target(y_tm1) y_t_embed = torch.squeeze(y_t_embed, dim=0) x = torch.cat([y_t_embed, att_tm1], dim=-1) (h_t, cell_t), att_t, _ = self.step(x, h_tm1, exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None) if TOGGLE_PRINT: print("att_tm1", att_tm1.shape) # num_hyps x target_embed_size print("y_t_embed", y_t_embed.shape) print("x", x.shape) print("h_tm1", h_tm1[0].shape, h_tm1[1].shape) # same as x print("h_t", h_t.shape) print("cell_t", cell_t.shape) print("att_t", att_t.shape) print(hypotheses) # log probabilities over target words log_p_t = 
F.log_softmax(self.target_vocab_projection(att_t), dim=-1) live_hyp_num = beam_size - len(completed_hypotheses) contiuating_hyp_scores = ( hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1) ###### START TOP K HERE ####### # top_cand_hyp_scores, top_cand_hyp_pos = self.topk(contiuating_hyp_scores, live_hyp_num) ###### END TOP K HERE ####### ###### START DPP HERE ####### top_cand_hyp_scores, top_cand_hyp_pos = self.kdpp( att_t, src_encodings, src_encodings_att_linear, h_t, cell_t, contiuating_hyp_scores, live_hyp_num, beam_size, ) if TOGGLE_PRINT: top_cand_hyp_scores_topk, top_cand_hyp_pos_topk = self.topk( contiuating_hyp_scores, live_hyp_num) print('topk', top_cand_hyp_scores_topk) print('kdpp', top_cand_hyp_scores) print('topk', top_cand_hyp_pos_topk) print('kdpp', top_cand_hyp_pos) #### END DPP HERE #### prev_hyp_ids = top_cand_hyp_pos / len(self.vocab.tgt) hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt) new_hypotheses = [] live_hyp_ids = [] new_hyp_scores = [] decoderStatesForUNKsHere = [] for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip( prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores): prev_hyp_id = prev_hyp_id.item() hyp_word_id = hyp_word_id.item() cand_new_hyp_score = cand_new_hyp_score.item() hyp_word = self.vocab.tgt.id2word[hyp_word_id] # Record output layer in case UNK was generated if hyp_word == "<unk>": hyp_word = "<unk>" + str(len(decoderStatesForUNKsHere)) decoderStatesForUNKsHere.append(att_t[prev_hyp_id]) new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word] if hyp_word == '</s>': completed_hypotheses.append( Hypothesis(value=new_hyp_sent[1:-1], score=cand_new_hyp_score)) else: new_hypotheses.append(new_hyp_sent) live_hyp_ids.append(prev_hyp_id) new_hyp_scores.append(cand_new_hyp_score) if len(decoderStatesForUNKsHere ) > 0 and self.charDecoder is not None: # decode UNKs decoderStatesForUNKsHere = torch.stack( decoderStatesForUNKsHere, dim=0) decodedWords = self.charDecoder.decode_greedy( (decoderStatesForUNKsHere.unsqueeze(0), decoderStatesForUNKsHere.unsqueeze(0)), max_length=21, device=self.device) assert len(decodedWords) == decoderStatesForUNKsHere.size( )[0], "Incorrect number of decoded words" for hyp in new_hypotheses: if hyp[-1].startswith("<unk>"): hyp[-1] = decodedWords[int(hyp[-1][5:])] #[:-1] if len(completed_hypotheses) == beam_size: break live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device) h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids]) att_tm1 = att_t[live_hyp_ids] hypotheses = new_hypotheses hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device) if len(completed_hypotheses) == 0: completed_hypotheses.append( Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item())) completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True) if PRINT_HYPOTHESES: print(completed_hypotheses) print("**********************") return completed_hypotheses @property def device(self) -> torch.device: """ Determine which device to place the Tensors upon, CPU or GPU. """ return self.att_projection.weight.device @staticmethod def load(model_path: str, no_char_decoder=False): """ Load the model from a file. 
@param model_path (str): path to model """ params = torch.load(model_path, map_location=lambda storage, loc: storage) args = params['args'] nmt_model = NMT(vocab=params['vocab'], no_char_decoder=no_char_decoder, **args) nmt_model.load_state_dict(params['state_dict']) model = DPPNMT(nmt_model=nmt_model, vocab=params['vocab'], no_char_decoder=no_char_decoder, **args) return model def save(self, path: str): """ Save the odel to a file. @param path (str): path to the model """ print('save model parameters to [%s]' % path, file=sys.stderr) params = { 'args': dict(embed_size=self.model_embeddings_source.embed_size, hidden_size=self.hidden_size, dropout_rate=self.dropout_rate), 'vocab': self.vocab, 'state_dict': self.state_dict() } torch.save(params, path) def timer(self, message=None): if PRINT_TIMER: if message is None or not hasattr( self, "last_time") or self.last_time is None: self.last_time = time.time() else: new_time = time.time() print("%s: %f" % (message, new_time - self.last_time)) self.last_time = new_time def topk(self, contiuating_hyp_scores, live_hyp_num): top_cand_hyp_scores, top_cand_hyp_pos = torch.topk( contiuating_hyp_scores, k=live_hyp_num) return top_cand_hyp_scores, top_cand_hyp_pos def word_embeddings(self): if not hasattr(self, "word_embeddings_cached"): self.timer() words = [[self.vocab.tgt.id2word[id]] for id in range(len(self.vocab.tgt.word2id))] words_char_tensor = self.vocab.tgt.to_input_tensor_char( words, device=self.device) self.word_embeddings_cached = self.model_embeddings_target( words_char_tensor).squeeze(0) if TOGGLE_PRINT: print("embeddings", embeddings.shape) self.timer("Embeddings") return self.word_embeddings_cached def kdpp(self, att_t, src_encodings, src_encodings_att_linear, h_t, cell_t, contiuating_hyp_scores, live_hyp_num, beam_size): # for every element in contiuating_hyp_scores, I need to get the target # word embedding, take another step, get that output, normalize, and multiply by # the corresponding element of log_p_t # TODO: need to duplicate each num_hyps times self.timer() #top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(contiuating_hyp_scores, k=INITIAL_SAMPLE_SIZE) top_cand_hyp_scores, top_cand_hyp_pos = torch.topk( contiuating_hyp_scores, k=INITIAL_SAMPLE_SIZE_RATIO * beam_size) self.timer("topk") vocab_size = len(self.vocab.tgt.word2id) num_hyps, embed_size = att_t.shape # TODO: minimize data movement # print("x", x.shape) # att_t_repeated = att_t.repeat(1, vocab_size).view(-1, embed_size) # embeddings_repeated = embeddings.repeat(1, vocab_size).view(-1, embed_size) # x = torch.cat([embeddings_repeated, att_t_repeated], dim=-1) # x = x[top_cand_hyp_pos] embeddings = self.word_embeddings() # print(top_cand_hyp_pos) x_list = [] for hyp_pos in top_cand_hyp_pos: emb_hyp = embeddings[hyp_pos % vocab_size] att_hyp = att_t[hyp_pos / vocab_size] x_partial = torch.cat([emb_hyp, att_hyp]) x_list.append(x_partial.unsqueeze(0)) x = torch.cat(x_list, dim=0) self.timer("newx") batch_size = x.shape[0] new_exp_src_encodings = src_encodings.expand(batch_size, src_encodings.size(1), src_encodings.size(2)) new_exp_src_encodings_att_linear = src_encodings_att_linear.expand( batch_size, src_encodings_att_linear.size(1), src_encodings_att_linear.size(2)) # Might have to stretch h_t, and cell_t # new_h_t = h_t.repeat(1, vocab_size).view(-1, embed_size) # new_cell_t = cell_t.repeat(1, vocab_size).view(-1, embed_size) # new_h_t = new_h_t[top_cand_hyp_pos] # new_cell_t = new_cell_t[top_cand_hyp_pos] self.timer() new_h_t_list = [] new_cell_t_list = [] for 
hyp_pos in top_cand_hyp_pos: h_t_hyp = h_t[hyp_pos / vocab_size] cell_t_hyp = cell_t[hyp_pos / vocab_size] new_h_t_list.append(h_t_hyp.unsqueeze(0)) new_cell_t_list.append(cell_t_hyp.unsqueeze(0)) new_h_t = torch.cat(new_h_t_list, dim=0) new_cell_t = torch.cat(new_cell_t_list, dim=0) self.timer("new_h_t/cell_t") (h_t_dpp, _), _, _ = self.step(x, (new_h_t, new_cell_t), new_exp_src_encodings, new_exp_src_encodings_att_linear, enc_masks=None) self.timer("step") # num_hyps = len(contiuating_hyp_scores.shape[0])/len(self.vocab.tgt) norms = torch.norm(h_t_dpp, p=2, dim=1, keepdim=True) #if norms.is_cuda: # norms = norms.cpu() unit_vectors = h_t_dpp.div(norms.expand_as(h_t_dpp)) # new_p_t = log_p_t.repeat(1, vocab_size).view(-1, vocab_size) # print("new_p_t", log_p_t.shape) # TODO: this returns e^{scores}... correct? quality_scores = torch.exp( top_cand_hyp_scores.unsqueeze(1)).expand_as(unit_vectors) # TODO: maybe normalize the quality_scores? quality_scores = torch.pow(quality_scores, 1 / 2) features = unit_vectors * quality_scores self.timer("scores") L = torch.mm(features, features.t()).cpu() self.timer("L") try: new_top_cand_hyp_pos = sample_k_dpp(L, k=live_hyp_num) except Exception as e: print("Error sampling from L, falling back to top k: %s" % e) return self.topk(contiuating_hyp_scores, live_hyp_num) if ADD_TOP_N > 0: new_top_cand_hyp_pos = np.unique( np.append(new_top_cand_hyp_pos, range(ADD_TOP_N))) self.timer("sample_k_dpp") top_cand_hyp_pos = top_cand_hyp_pos[new_top_cand_hyp_pos] # top_cand_hyp_scores = contiuating_hyp_scores[top_cand_hyp_pos].squeeze(0) top_cand_hyp_scores = contiuating_hyp_scores[top_cand_hyp_pos] scores1, pos1 = self.topk(contiuating_hyp_scores, live_hyp_num) # print('topk pos', pos1) # print('top_cand_hyp_pos', top_cand_hyp_pos) # print('topk scores', scores1) # print('top_cand_hyp_pos', top_cand_hyp_scores) if TOGGLE_PRINT: print("vocab size", vocab_size) print("att_t_repeated", att_t_repeated.shape) print("top_cand_hyp_pos", top_cand_hyp_pos.shape) print("new_x", x.shape) print("src_encodings", new_exp_src_encodings.shape) print("src_encodings_att", new_exp_src_encodings_att_linear.shape) print("new_h_t", new_h_t.shape) print("new_cell_t", new_cell_t.shape) print("hidden", h_t_dpp.shape) print("norms", norms.shape) print("unit_vectors", unit_vectors.shape) print("L", L.shape) print("L", L) print("new_top_cand_hyp_pos", new_top_cand_hyp_pos) print(top_cand_hyp_pos) print("new_top_hyp_pos", top_cand_hyp_pos.shape) print("new_top_hyp_scores", top_cand_hyp_scores.shape) print('top chosen: ', new_top_cand_hyp_pos) return top_cand_hyp_scores, top_cand_hyp_pos
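# Illustrative sketch, separate from the DPPNMT code above: kdpp() scores
# candidates with a quality-times-diversity kernel.  Each candidate's decoder
# hidden state is normalised to a unit vector (diversity), scaled by the square
# root of its exponentiated beam score (quality), and the feature matrix then
# defines L = features @ features.T, from which sample_k_dpp (an external
# helper, not reproduced here) draws k candidates.  This helper only rebuilds L
# for random inputs; sizes are arbitrary.  Note that when mapping a flattened
# candidate index back to (hypothesis, word), floor division (idx // vocab_size)
# avoids the float indices that `/` produces on recent PyTorch versions.
def _dpp_kernel_sketch():
    import torch

    n_candidates, hidden_size = 10, 6
    h_t_dpp = torch.randn(n_candidates, hidden_size)        # hidden state per candidate
    cand_log_scores = torch.randn(n_candidates)              # beam log-scores per candidate

    norms = torch.norm(h_t_dpp, p=2, dim=1, keepdim=True)
    unit_vectors = h_t_dpp / norms                            # diversity features
    quality = torch.exp(cand_log_scores).unsqueeze(1).sqrt()  # quality weights
    features = unit_vectors * quality                         # (n_candidates, hidden_size)

    L = features @ features.t()                               # (n_candidates, n_candidates) PSD kernel
    assert torch.allclose(L, L.t(), atol=1e-6)
    return L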
def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2, no_char_decoder=False, nmt_model=None): """ Init NMT Model. @param embed_size (int): Embedding size (dimensionality) @param hidden_size (int): Hidden Size (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention @param nmt_model (NMT): a5 NMT Model (without DPP) to initialize layers with """ super(DPPNMT, self).__init__() if nmt_model is None: self.model_embeddings_source = ModelEmbeddings( embed_size, vocab.src) self.model_embeddings_target = ModelEmbeddings( embed_size, vocab.tgt) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab self.embed_size = embed_size self.encoder = nn.LSTM(embed_size, hidden_size, bidirectional=True) self.decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size) self.h_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False) self.c_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False) self.att_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False) self.combined_output_projection = nn.Linear(hidden_size * 2 + hidden_size, hidden_size, bias=False) self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False) self.dropout = nn.Dropout(self.dropout_rate) if not no_char_decoder: self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt) else: self.charDecoder = None else: self.model_embeddings_source = nmt_model.model_embeddings_source self.model_embeddings_target = nmt_model.model_embeddings_target self.hidden_size = nmt_model.hidden_size self.dropout_rate = nmt_model.dropout_rate self.vocab = nmt_model.vocab self.embed_size = nmt_model.model_embeddings_source.embed_size self.encoder = nmt_model.encoder self.decoder = nmt_model.decoder self.h_projection = nmt_model.h_projection self.c_projection = nmt_model.c_projection self.att_projection = nmt_model.att_projection self.combined_output_projection = nmt_model.combined_output_projection self.target_vocab_projection = nmt_model.target_vocab_projection self.dropout = nmt_model.dropout self.charDecoder = nmt_model.charDecoder
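# Illustrative sketch, separate from the constructor above: when nmt_model is
# given, DPPNMT does not copy weights, it stores references to the existing
# sub-modules, so the wrapper and the wrapped NMT share one set of parameters.
# The toy modules below (names made up for this example) demonstrate that
# assigning a sub-module to a second nn.Module keeps a single parameter tensor.
def _shared_submodule_sketch():
    import torch
    import torch.nn as nn

    class Base(nn.Module):
        def __init__(self):
            super().__init__()
            self.proj = nn.Linear(4, 4, bias=False)

    class Wrapper(nn.Module):
        def __init__(self, base):
            super().__init__()
            self.proj = base.proj          # reference, not a copy

    base = Base()
    wrapper = Wrapper(base)
    assert wrapper.proj.weight is base.proj.weight

    with torch.no_grad():
        base.proj.weight.zero_()
    assert torch.all(wrapper.proj.weight == 0)   # change is visible through both
    return wrapper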
def __init__(self, embed_size, hidden_size, vocab, weights,
             no_char_decoder=False, dropout_rate=0.2):
    """ Init NMT Model.
    @param embed_size (int): Embedding size (dimensionality)
    @param hidden_size (int): Hidden Size (dimensionality)
    @param vocab (Vocab): Vocabulary object containing src and tgt languages
                          See vocab.py for documentation.
    @param weights: pretrained word-embedding weights passed to ModelWordEmbeddings
    @param dropout_rate (float): Dropout probability, for attention
    """
    super(NMT, self).__init__()
    self.model_word_embeddings = ModelWordEmbeddings(embed_size, vocab, weights)
    self.model_char_embeddings_source = ModelCharEmbeddings(50, vocab.src)
    self.model_char_embeddings_target = ModelCharEmbeddings(50, vocab.tgt)
    # Encoder input dimension: word embedding (embed_size) concatenated with a 50-dim char embedding
    self.d = embed_size + 50
    # hidden_size = embed_size + 50
    self.highway = Highway(self.d)
    self.hidden_size = hidden_size
    self.dropout_rate = dropout_rate
    self.vocab = vocab

    # default values
    self.encoder = None
    self.decoder = None
    self.h_projection = None
    self.c_projection = None
    self.att_projection = None
    self.combined_output_projection = None
    self.target_vocab_projection = None
    self.dropout = None

    ### YOUR CODE HERE (~8 Lines)
    ### TODO - Initialize the following variables:
    ###     self.encoder (Bidirectional LSTM with bias)
    ###     self.decoder (LSTM Cell with bias)
    ###     self.h_projection (Linear Layer with no bias), called W_{h} in the PDF.
    ###     self.c_projection (Linear Layer with no bias), called W_{c} in the PDF.
    ###     self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF.
    ###     self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF.
    ###     self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF.
    ###     self.dropout (Dropout Layer)
    ###
    ### Use the following docs to properly initialize these variables:
    ###     LSTM:
    ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
    ###     LSTM Cell:
    ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell
    ###     Linear Layer:
    ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
    ###     Dropout Layer:
    ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout
    # print(embed_size + hidden_size)
    self.encoder = nn.LSTM(self.d, hidden_size, bidirectional=True, bias=True)  # changed bias=True
    self.decoder = nn.LSTMCell(50 + hidden_size, hidden_size, bias=True)
    self.h_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
    self.c_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
    self.att_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
    self.combined_output_projection = nn.Linear(3 * hidden_size, hidden_size, bias=False)
    self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False)
    self.dropout = nn.Dropout(p=self.dropout_rate)

    if not no_char_decoder:
        self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt)
    else:
        self.charDecoder = None
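# The constructor above calls Highway(self.d). A minimal highway-layer sketch is shown below,
# following the usual projection-plus-gate formulation over a single dimensionality; the
# project's actual Highway module may differ in details.
import torch
import torch.nn as nn

class Highway(nn.Module):
    def __init__(self, dim: int):
        super(Highway, self).__init__()
        self.proj = nn.Linear(dim, dim)
        self.gate = nn.Linear(dim, dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x_proj = torch.relu(self.proj(x))
        x_gate = torch.sigmoid(self.gate(x))
        # Gated mix of the transformed input and the skip connection.
        return x_gate * x_proj + (1.0 - x_gate) * x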
class DummyVocab(): def __init__(self): self.char2id = json.load(open('./sanity_check_en_es_data/char_vocab_sanity_check.json', 'r')) self.id2char = {id: char for char, id in self.char2id.items()} self.char_unk = self.char2id['<unk>'] self.start_of_word = self.char2id["{"] self.end_of_word = self.char2id["}"] char_vocab = DummyVocab() # Initialize CharDecoder decoder = CharDecoder( hidden_size=HIDDEN_SIZE, char_embedding_size=EMBED_SIZE, target_vocab=char_vocab) class TestEverything(unittest.TestCase): def setUp(cls): # Initialize CharDecoder cls.decoder = CharDecoder( hidden_size=HIDDEN_SIZE, char_embedding_size=EMBED_SIZE, target_vocab=char_vocab) cls.vocab = Vocab.load('./sanity_check_en_es_data/vocab_sanity_check.json') # cl NMT Model cls.model = NMT(
class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidirectional LSTM Encoder
        - Unidirectional LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """

    def __init__(self, word_embed_size, hidden_size, vocab, dropout_rate=0.3, no_char_decoder=False):
        """ Init NMT Model.
        @param word_embed_size (int): Embedding size (dimensionality) of word
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings_source = ModelEmbeddings(word_embed_size, vocab.src)
        self.model_embeddings_target = ModelEmbeddings(word_embed_size, vocab.tgt)

        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        ### COPY OVER YOUR CODE FROM ASSIGNMENT 4
        self.encoder = nn.LSTM(word_embed_size, self.hidden_size, bidirectional=True, bias=True)
        self.decoder = nn.LSTMCell(word_embed_size + self.hidden_size, self.hidden_size, bias=True)
        self.h_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
        self.c_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
        self.att_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
        self.combined_output_projection = nn.Linear(3 * self.hidden_size, self.hidden_size, bias=False)
        self.target_vocab_projection = nn.Linear(self.hidden_size, len(self.vocab.tgt), bias=False)
        self.dropout = nn.Dropout(dropout_rate)
        ### END YOUR CODE FROM ASSIGNMENT 4

        if not no_char_decoder:
            self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt)
        else:
            self.charDecoder = None

    def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.
        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`
        @returns scores (Tensor): a variable/tensor of one number representing the
                                  log-likelihood of generating the gold-standard target sentence for
                                  each example in the input batch. Here b = batch size.
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Convert list of lists into tensors
        # source_padded = self.vocab.src.to_input_tensor(source, device=self.device)  # Tensor: (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(
            target, device=self.device)  # Tensor: (tgt_len, b)

        ### YOUR CODE HERE for part 1i
        ### TODO:
        ###     Modify the code lines above as needed to fetch the character-level tensor
        ###     to feed into encode() and decode().
###     You should:
        ###     - Keep `target_padded` from A4 code above for predictions
        ###     - Add `source_padded_chars` for character level padded encodings for source
        ###     - Add `target_padded_chars` for character level padded encodings for target
        ###     - Modify calls to encode() and decode() to use the character level encodings
        ### END YOUR CODE

        # Fetch the character-level input tensors (three lines).
        source_padded_chars = self.vocab.src.to_input_tensor_char(
            source, device=self.device)
        target_padded_chars = self.vocab.tgt.to_input_tensor_char(
            target, device=self.device)

        enc_hiddens, dec_init_state = self.encode(source_padded_chars, source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state,
                                       target_padded_chars)

        P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1)

        # Zero out probabilities for which we have nothing in the target text
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Compute log probability of generating true target words
        target_gold_words_log_prob = torch.gather(
            P, index=target_padded[1:].unsqueeze(-1),
            dim=-1).squeeze(-1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum()  # mhahn2 Small modification from A4 code.

        if self.charDecoder is not None:
            max_word_len = target_padded_chars.shape[-1]

            target_words = target_padded[1:].contiguous().view(-1)
            target_chars = target_padded_chars[1:].view(-1, max_word_len)
            target_outputs = combined_outputs.view(-1, self.hidden_size)

            target_chars_oov = target_chars  # torch.index_select(target_chars, dim=0, index=oovIndices)
            rnn_states_oov = target_outputs  # torch.index_select(target_outputs, dim=0, index=oovIndices)
            oovs_losses = self.charDecoder.train_forward(
                target_chars_oov.t().contiguous(),
                (rnn_states_oov.unsqueeze(0), rnn_states_oov.unsqueeze(0)))
            scores = scores - oovs_losses

        return scores

    def encode(
        self, source_padded: torch.Tensor, source_lengths: List[int]
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
            Additionally, take the final states of the encoder and project them to obtain initial states for decoder.
        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b, max_word_length), where
                                       b = batch_size, src_len = maximum source sentence length. Note that
                                       these have already been sorted in order of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch
        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
                                       b = batch size, src_len = maximum source sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
                                                         hidden state and cell.
""" enc_hiddens, dec_init_state = None, None ### COPY OVER YOUR CODE FROM ASSIGNMENT 4 ### Except replace "self.model_embeddings.source" with "self.model_embeddings_source" X = self.model_embeddings_source( source_padded) # (sentence_length, batch_size, word_embed_size) X = pack_padded_sequence(X, source_lengths) enc_hiddens, (last_hidden, last_cell) = self.encoder(X) enc_hiddens, _ = pad_packed_sequence( enc_hiddens, padding_value=self.vocab.src['<pad>'], batch_first=True) init_decoder_hidden = torch.cat((last_hidden[0], last_hidden[1]), 1) init_decoder_hidden = self.h_projection(init_decoder_hidden) init_decoder_cell = torch.cat((last_cell[0], last_cell[1]), 1) init_decoder_cell = self.c_projection(init_decoder_cell) dec_init_state = (init_decoder_hidden, init_decoder_cell) ### END YOUR CODE FROM ASSIGNMENT 4 return enc_hiddens, dec_init_state def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor, dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor: """Compute combined output vectors for a batch. @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where b = batch size, src_len = maximum source sentence length. @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b, max_word_length), where tgt_len = maximum target sentence length, b = batch size. @returns combined_outputs (Tensor): combined output tensor (tgt_len, b, h), where tgt_len = maximum target sentence length, b = batch_size, h = hidden size """ # Chop of the <END> token for max length sentences. target_padded = target_padded[:-1] # Initialize the decoder state (hidden and cell) dec_state = dec_init_state # Initialize previous combined output vector o_{t-1} as zeros batch_size = enc_hiddens.size(0) o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device) # Initialize a list we will use to collect the combined output o_t on each step combined_outputs = [] ### COPY OVER YOUR CODE FROM ASSIGNMENT 4 ### Except replace "self.model_embeddings.target" with "self.model_embeddings_target" enc_hiddens_proj = self.att_projection(enc_hiddens) Y = self.model_embeddings_target(target_padded) for Y_t in torch.split(Y, 1, dim=0): Y_t = Y_t.squeeze(0) Ybar_t = torch.cat((Y_t, o_prev), dim=1) dec_state, o_t, _ = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks) combined_outputs.append(o_t) o_prev = o_t combined_outputs = torch.stack(combined_outputs, dim=0) ### END YOUR CODE FROM ASSIGNMENT 4 return combined_outputs def step( self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor, torch.Tensor], enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor, enc_masks: torch.Tensor ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]: """ Compute one forward step of the LSTM decoder, including the attention computation. @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder, where b = batch size, e = embedding size, h = hidden size. @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size. First tensor is decoder's prev hidden state, second tensor is decoder's prev cell. 
@param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size, src_len = maximum source length, h = hidden size. @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h), where b = batch size, src_len = maximum source length, h = hidden size. @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len), where b = batch size, src_len is maximum source length. @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size. First tensor is decoder's new hidden state, second tensor is decoder's new cell. @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size. @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution. Note: You will not use this outside of this function. We are simply returning this value so that we can sanity check your implementation. """ combined_output = None ### COPY OVER YOUR CODE FROM ASSIGNMENT 4 dec_state = self.decoder(Ybar_t, dec_state) dec_hidden, dec_cell = dec_state e_t = torch.bmm(enc_hiddens_proj, dec_hidden.unsqueeze(-1)).squeeze(-1) ### END YOUR CODE FROM ASSIGNMENT 4 # Set e_t to -inf where enc_masks has 1 if enc_masks is not None: e_t.data.masked_fill_(enc_masks.bool(), -float('inf')) ### COPY OVER YOUR CODE FROM ASSIGNMENT 4 alpha_t = F.softmax(e_t, dim=1) a_t = torch.bmm(alpha_t.unsqueeze(1), enc_hiddens).squeeze(1) U_t = torch.cat((a_t, dec_hidden), dim=1) V_t = self.combined_output_projection(U_t) O_t = self.dropout(torch.tanh(V_t)) ### END YOUR CODE FROM ASSIGNMENT 4 combined_output = O_t return dec_state, combined_output, e_t def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor: """ Generate sentence masks for encoder hidden states. @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size, src_len = max source length, h = hidden size. @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch. @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len), where src_len = max source length, h = hidden size. """ enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float) for e_id, src_len in enumerate(source_lengths): enc_masks[e_id, src_len:] = 1 return enc_masks.to(self.device) def beam_search(self, src_sent: List[str], beam_size: int = 5, max_decoding_time_step: int = 70) -> List[Hypothesis]: """ Given a single source sentence, perform beam search, yielding translations in the target language. 
@param src_sent (List[str]): a single source sentence (words) @param beam_size (int): beam size @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields: value: List[str]: the decoded target sentence, represented as a list of words score: float: the log-likelihood of the target sentence """ src_sents_var = self.vocab.src.to_input_tensor_char([src_sent], self.device) src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)]) src_encodings_att_linear = self.att_projection(src_encodings) h_tm1 = dec_init_vec att_tm1 = torch.zeros(1, self.hidden_size, device=self.device) eos_id = self.vocab.tgt['</s>'] hypotheses = [['<s>']] hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device) completed_hypotheses = [] t = 0 while len(completed_hypotheses ) < beam_size and t < max_decoding_time_step: t += 1 hyp_num = len(hypotheses) exp_src_encodings = src_encodings.expand(hyp_num, src_encodings.size(1), src_encodings.size(2)) exp_src_encodings_att_linear = src_encodings_att_linear.expand( hyp_num, src_encodings_att_linear.size(1), src_encodings_att_linear.size(2)) y_tm1 = self.vocab.tgt.to_input_tensor_char(list( [hyp[-1]] for hyp in hypotheses), device=self.device) y_t_embed = self.model_embeddings_target(y_tm1) y_t_embed = torch.squeeze(y_t_embed, dim=0) x = torch.cat([y_t_embed, att_tm1], dim=-1) (h_t, cell_t), att_t, _ = self.step(x, h_tm1, exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None) # log probabilities over target words log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1) live_hyp_num = beam_size - len(completed_hypotheses) contiuating_hyp_scores = ( hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1) top_cand_hyp_scores, top_cand_hyp_pos = torch.topk( contiuating_hyp_scores, k=live_hyp_num) prev_hyp_ids = top_cand_hyp_pos // len(self.vocab.tgt) hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt) new_hypotheses = [] live_hyp_ids = [] new_hyp_scores = [] decoderStatesForUNKsHere = [] for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip( prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores): prev_hyp_id = prev_hyp_id.item() hyp_word_id = hyp_word_id.item() cand_new_hyp_score = cand_new_hyp_score.item() hyp_word = self.vocab.tgt.id2word[hyp_word_id] # Record output layer in case UNK was generated if hyp_word == "<unk>": hyp_word = "<unk>" + str(len(decoderStatesForUNKsHere)) decoderStatesForUNKsHere.append(att_t[prev_hyp_id]) new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word] if hyp_word == '</s>': completed_hypotheses.append( Hypothesis(value=new_hyp_sent[1:-1], score=cand_new_hyp_score)) else: new_hypotheses.append(new_hyp_sent) live_hyp_ids.append(prev_hyp_id) new_hyp_scores.append(cand_new_hyp_score) if len(decoderStatesForUNKsHere ) > 0 and self.charDecoder is not None: # decode UNKs decoderStatesForUNKsHere = torch.stack( decoderStatesForUNKsHere, dim=0) decodedWords = self.charDecoder.decode_greedy( (decoderStatesForUNKsHere.unsqueeze(0), decoderStatesForUNKsHere.unsqueeze(0)), max_length=21, device=self.device) assert len(decodedWords) == decoderStatesForUNKsHere.size( )[0], "Incorrect number of decoded words" for hyp in new_hypotheses: if hyp[-1].startswith("<unk>"): hyp[-1] = decodedWords[int(hyp[-1][5:])] # [:-1] if len(completed_hypotheses) == beam_size: break live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device) h_tm1 = (h_t[live_hyp_ids], 
cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores,
                                      dtype=torch.float,
                                      device=self.device)

        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(
                Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)
        return completed_hypotheses

    @property
    def device(self) -> torch.device:
        """ Determine which device to place the Tensors upon, CPU or GPU.
        """
        return self.att_projection.weight.device

    @staticmethod
    def load(model_path: str, no_char_decoder=False):
        """ Load the model from a file.
        @param model_path (str): path to model
        """
        params = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
        args = params['args']
        model = NMT(vocab=params['vocab'],
                    no_char_decoder=no_char_decoder, **args)
        model.load_state_dict(params['state_dict'])
        return model

    def save(self, path: str):
        """ Save the model to a file.
        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args': dict(word_embed_size=self.model_embeddings_source.word_embed_size,
                         hidden_size=self.hidden_size,
                         dropout_rate=self.dropout_rate),
            'vocab': self.vocab,
            'state_dict': self.state_dict()
        }

        torch.save(params, path)
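# A minimal save/load/beam-search round trip for the class above, assuming a Vocab object
# `vocab` has already been built (e.g. via Vocab.load). The path, sizes, source sentence, and
# the untrained weights are placeholders; this only demonstrates the API shown above.
model = NMT(word_embed_size=256, hidden_size=256, vocab=vocab, dropout_rate=0.3)
model.save('tmp_model.bin')
reloaded = NMT.load('tmp_model.bin', no_char_decoder=False)
hyps = reloaded.beam_search(
    ['wie', 'geht', 'es', 'dir', '?'],   # a tokenized source sentence (placeholder)
    beam_size=5,
    max_decoding_time_step=70,
)
print(hyps[0].value, hyps[0].score)      # best hypothesis tokens and its log-likelihood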
def __init__(self, word_embed_size, hidden_size, vocab, dropout_rate=0.3, no_char_decoder=False): """ Init NMT Model. @param word_embed_size (int): Embedding size (dimensionality) of word @param hidden_size (int): Hidden Size (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention """ super(NMT, self).__init__() self.model_embeddings_source = ModelEmbeddings(word_embed_size, vocab.src) self.model_embeddings_target = ModelEmbeddings(word_embed_size, vocab.tgt) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab ### COPY OVER YOUR CODE FROM ASSIGNMENT 4 # default values self.encoder = None self.decoder = None self.h_projection = None self.c_projection = None self.att_projection = None self.combined_output_projection = None self.target_vocab_projection = None self.dropout = None # For sanity check only, not relevant to implementation self.gen_sanity_check = False self.counter = 0 # This attribute added by TaoJian self.word_embed_size = word_embed_size # YOUR CODE HERE (~8 Lines) # TODO - Initialize the following variables: # self.encoder (Bidirectional LSTM with bias) # self.decoder (LSTM Cell with bias) # self.h_projection (Linear Layer with no bias), called W_{h} in the PDF. # self.c_projection (Linear Layer with no bias), called W_{c} in the PDF. # self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF. # self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF. # self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF. # self.dropout (Dropout Layer) ### # Use the following docs to properly initialize these variables: # LSTM: # https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM # LSTM Cell: # https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell # Linear Layer: # https://pytorch.org/docs/stable/nn.html#torch.nn.Linear # Dropout Layer: # https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout self.encoder = nn.LSTM(input_size=self.word_embed_size, hidden_size=self.hidden_size, bias=True, bidirectional=True) self.decoder = nn.LSTMCell(self.word_embed_size + self.hidden_size, self.hidden_size, bias=True) self.h_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False) self.c_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False) self.att_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False) self.combined_output_projection = nn.Linear(3 * self.hidden_size, self.hidden_size, bias=False) self.target_vocab_projection = nn.Linear(self.hidden_size, self.vocab.tgt.__len__(), bias=False) self.dropout = nn.Dropout(self.dropout_rate, inplace=False) # END YOUR CODE ### END YOUR CODE FROM ASSIGNMENT 4 if not no_char_decoder: self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt) else: self.charDecoder = None
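# A quick shape check for the layers initialized above, assuming this __init__ belongs to the
# NMT class and a Vocab object `vocab` is available; the sizes are placeholders. It illustrates
# why the projections are 2h -> h (bidirectional encoder states are concatenated) and 3h -> h
# (attention context of width 2h concatenated with the decoder hidden state of width h).
model = NMT(word_embed_size=256, hidden_size=200, vocab=vocab)
h = model.hidden_size
assert model.h_projection.weight.shape == (h, 2 * h)
assert model.c_projection.weight.shape == (h, 2 * h)
assert model.combined_output_projection.weight.shape == (h, 3 * h)
assert model.target_vocab_projection.weight.shape == (len(vocab.tgt), h)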