def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2): """ Init NMT Model. @param embed_size (int): Embedding size (dimensionality) @param hidden_size (int): Hidden Size (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention """ super(NMT, self).__init__() self.model_embeddings = ModelEmbeddings(embed_size, vocab) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab # Bidirectional LSTM with bias self.encoder = nn.LSTM(input_size=embed_size, hidden_size=self.hidden_size, bidirectional=True, bias=True) # LSTM Cell with bias self.decoder = nn.LSTMCell(input_size=embed_size + self.hidden_size, hidden_size=self.hidden_size, bias=True) # Linear Layer with no bias, W_{h} self.h_projection = nn.Linear(in_features=self.hidden_size * 2, out_features=self.hidden_size, bias=False) # Linear Layer with no bias, W_{c} self.c_projection = nn.Linear(in_features=self.hidden_size * 2, out_features=self.hidden_size, bias=False) # Linear Layer with no bias, W_{attProj} self.att_projection = nn.Linear(in_features=self.hidden_size * 2, out_features=self.hidden_size, bias=False) # Linear Layer with no bias, W_{u} self.combined_output_projection = nn.Linear( in_features=self.hidden_size * 3, out_features=self.hidden_size, bias=False) # Linear Layer with no bias, W_{vocab} self.target_vocab_projection = nn.Linear(in_features=self.hidden_size, out_features=len(vocab.tgt), bias=False) # Dropout Layer self.dropout = nn.Dropout(p=self.dropout_rate)
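A quick way to check the wiring above is to push random tensors through the layers and assert the shapes; this is a minimal, self-contained sketch (arbitrary sizes, no Vocab object needed) that exercises only the encoder and h_projection:

import torch
import torch.nn as nn

embed_size, hidden_size, batch, src_len = 16, 32, 4, 10
encoder = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, bidirectional=True, bias=True)
h_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False)

x = torch.randn(src_len, batch, embed_size)         # (src_len, b, e)
enc_hiddens, (last_hidden, last_cell) = encoder(x)  # enc_hiddens: (src_len, b, 2h); last_hidden: (2, b, h)
init_dec_hidden = h_projection(torch.cat((last_hidden[0], last_hidden[1]), dim=1))
assert init_dec_hidden.shape == (batch, hidden_size)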
def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2): """Init NMT Model. @param embed_size (int): Embedding size (dimensionality) @param hidden_size (int): Hidden Size, the size of hidden states (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention """ super(NMT, self).__init__() self.model_embeddings = ModelEmbeddings(embed_size, vocab) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab # For sanity check only, not relevant to implementation self.gen_sanity_check = False self.counter = 0 self.encoder = torch.nn.LSTM( input_size=embed_size, hidden_size=self.hidden_size, bias=True, bidirectional=True, ) self.decoder = torch.nn.LSTMCell( input_size=embed_size + hidden_size, hidden_size=self.hidden_size, bias=True, ) self.h_projection = torch.nn.Linear( in_features=2 * self.hidden_size, out_features=self.hidden_size, bias=False ) self.c_projection = torch.nn.Linear( in_features=2 * self.hidden_size, out_features=self.hidden_size, bias=False ) self.att_projection = torch.nn.Linear( in_features=2 * self.hidden_size, out_features=self.hidden_size, bias=False ) self.combined_output_projection = torch.nn.Linear( in_features=3 * self.hidden_size, out_features=self.hidden_size, bias=False ) self.target_vocab_projection = torch.nn.Linear( in_features=self.hidden_size, out_features=len(self.vocab.tgt), bias=False ) self.dropout = torch.nn.Dropout(p=self.dropout_rate)
def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2, no_char_decoder=False): """ Init NMT Model. @param embed_size (int): Embedding size (dimensionality) @param hidden_size (int): Hidden Size (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention @param no_char_decoder (bool): If True, do not build the character-level decoder """ super(NMT, self).__init__() self.model_embeddings_source = ModelEmbeddings(embed_size, vocab.src) self.model_embeddings_target = ModelEmbeddings(embed_size, vocab.tgt) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab self.encoder = nn.LSTM(embed_size, hidden_size, bidirectional=True) self.decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size) self.h_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False) self.c_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False) self.att_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False) self.combined_output_projection = nn.Linear(hidden_size * 2 + hidden_size, hidden_size, bias=False) self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False) self.dropout = nn.Dropout(self.dropout_rate) if not no_char_decoder: self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt) else: self.charDecoder = None
def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2): """ Init NMT Model. @param embed_size (int): Embedding size (dimensionality) @param hidden_size (int): Hidden Size (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention """ super(NMT, self).__init__() self.model_embeddings = ModelEmbeddings(embed_size, vocab) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab # default values self.encoder = None self.decoder = None self.h_projection = None self.c_projection = None self.att_projection = None self.combined_output_projection = None self.target_vocab_projection = None self.dropout = None ### YOUR CODE HERE (~8 Lines) self.encoder = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, num_layers=1, bias=True, bidirectional=True) self.decoder = nn.LSTMCell(input_size=embed_size + hidden_size, hidden_size=hidden_size, bias=True) self.h_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False) self.c_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False) self.att_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False) self.combined_output_projection = nn.Linear(3 * hidden_size, hidden_size, bias=False) self.target_vocab_projection = nn.Linear(hidden_size, len(self.vocab.tgt), bias=False) self.dropout = nn.Dropout(p=dropout_rate)  # nn.Dropout, not nn.Dropout2d: the combined outputs are (b, h) vectors, not 2D feature maps
def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2): """ Init NMT Model. @param embed_size (int): Embedding size (dimensionality) @param hidden_size (int): Hidden Size (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention """ super(NMT, self).__init__() self.model_embeddings = ModelEmbeddings(embed_size, vocab) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab # default values self.encoder = None self.decoder = None self.h_projection = None self.c_projection = None self.att_projection = None self.combined_output_projection = None self.target_vocab_projection = None self.dropout = None ### YOUR CODE HERE (~8 Lines) ### TODO - Initialize the following variables: ### self.encoder (Bidirectional LSTM with bias) ### self.decoder (LSTM Cell with bias) ### self.h_projection (Linear Layer with no bias), called W_{h} in the PDF. ### self.c_projection (Linear Layer with no bias), called W_{c} in the PDF. ### self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF. ### self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF. ### self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF. ### self.dropout (Dropout Layer) ### ### Use the following docs to properly initialize these variables: ### LSTM: ### https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM ### LSTM Cell: ### https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell ### Linear Layer: ### https://pytorch.org/docs/stable/nn.html#torch.nn.Linear ### Dropout Layer: ### https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout self.encoder = nn.LSTM(input_size=embed_size, hidden_size=self.hidden_size, num_layers=1, bias=True, batch_first=False, bidirectional=True)  # dropout is omitted: it has no effect on a single-layer LSTM self.decoder = nn.LSTMCell(input_size=embed_size + self.hidden_size, hidden_size=self.hidden_size, bias=True)  # input is the target-language embedding concatenated with the previous combined output (e + h) self.h_projection = nn.Linear(in_features=2*self.hidden_size, out_features=self.hidden_size, bias=False) self.c_projection = nn.Linear(in_features=2*self.hidden_size, out_features=self.hidden_size, bias=False) self.att_projection = nn.Linear(in_features=2*self.hidden_size, out_features=self.hidden_size, bias=False) self.combined_output_projection = nn.Linear(in_features=3*self.hidden_size, out_features=self.hidden_size, bias=False) self.target_vocab_projection = nn.Linear(in_features=self.hidden_size, out_features=len(self.vocab.tgt), bias=False) self.dropout = nn.Dropout(p=self.dropout_rate)
def __init__(self, embed_size, hidden_size, src_vocab: Vocabulary, dst_vocab: Vocabulary, device, dropout_rate=0.2): super(NMT, self).__init__() self.device = device self.model_embeddings = ModelEmbeddings(embed_size, src_vocab, dst_vocab) self.hidden_size = hidden_size self.src_vocab = src_vocab self.dst_vocab = dst_vocab self.dropout_rate = dropout_rate # the encoder is a bidirectional LSTM with bias (the dropout argument is omitted: it has no effect on a single-layer LSTM) self.encoder = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, bidirectional=True, bias=True) # the decoder is a unidirectional LSTM cell with bias self.decoder = nn.LSTMCell( input_size=embed_size + hidden_size, # input feeding: the attention vector is concatenated with the next time step's input, so alignment decisions also take past alignment information into account hidden_size=hidden_size, bias=True) # h_projection and c_projection initialize the decoder's hidden and cell states from the source encodings self.h_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False) self.c_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False) # att_projection maps the source encodings into the decoder's hidden space (for the context vector) self.att_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False) # the attention vector is concatenated with the decoder hidden state and projected back to h self.combined_output_projection = nn.Linear(hidden_size * 2 + hidden_size, hidden_size, bias=False) # maps the decoder output to the target vocabulary self.target_vocab_projection = nn.Linear(hidden_size, len(dst_vocab), bias=False) self.dropout = nn.Dropout(dropout_rate)
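The input-feeding step described in the comments above can be sketched in isolation; a minimal example with arbitrary sizes and zero initial states (not the model's actual forward pass, just the concatenation pattern):

import torch
import torch.nn as nn

embed_size, hidden_size, batch = 16, 32, 4
decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size, bias=True)

y_embed = torch.randn(batch, embed_size)       # embedding of the current target word
o_prev = torch.zeros(batch, hidden_size)       # previous combined output (zeros at t = 0)
ybar_t = torch.cat([y_embed, o_prev], dim=1)   # (b, e + h), the decoder input
h_t, c_t = decoder(ybar_t, (torch.zeros(batch, hidden_size), torch.zeros(batch, hidden_size)))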
def __init__(self, word_embed_size, hidden_size, vocab, dropout_rate=0.3, no_char_decoder=False): """ Init NMT Model. @param word_embed_size (int): Embedding size (dimensionality) of word @param hidden_size (int): Hidden Size (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention @param no_char_decoder (bool): If True, do not build the character-level decoder """ super(NMT, self).__init__() self.model_embeddings_source = ModelEmbeddings(word_embed_size, vocab.src) self.model_embeddings_target = ModelEmbeddings(word_embed_size, vocab.tgt) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab # COPY OVER YOUR CODE FROM ASSIGNMENT 4 self.encoder = nn.LSTM(word_embed_size, self.hidden_size, bidirectional=True, bias=True) self.decoder = nn.LSTMCell(word_embed_size + self.hidden_size, self.hidden_size, bias=True) self.h_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False) self.c_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False) self.att_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False) self.combined_output_projection = nn.Linear(3 * self.hidden_size, self.hidden_size, bias=False) self.target_vocab_projection = nn.Linear(self.hidden_size, len(vocab.tgt), bias=False) self.dropout = nn.Dropout(self.dropout_rate) # END YOUR CODE FROM ASSIGNMENT 4 if not no_char_decoder: self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt) else: self.charDecoder = None
def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2): """ Init NMT Model. @param embed_size (int): Embedding size (dimensionality) @param hidden_size (int): Hidden Size (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention """ super(NMT, self).__init__() self.model_embeddings = ModelEmbeddings(embed_size, vocab) # src & tgt embedding init self.hidden_size = hidden_size # hidden size self.dropout_rate = dropout_rate # Dropout self.vocab = vocab # default values self.encoder = None self.decoder = None self.h_projection = None self.c_projection = None self.att_projection = None self.combined_output_projection = None self.target_vocab_projection = None self.dropout = None ### YOUR CODE HERE (~8 Lines) ### TODO - Initialize the following variables: ### self.encoder (Bidirectional LSTM with bias) ### self.decoder (LSTM Cell with bias) ### self.h_projection (Linear Layer with no bias), called W_{h} in the PDF. ### self.c_projection (Linear Layer with no bias), called W_{c} in the PDF. ### self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF. ### self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF. ### self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF. ### self.dropout (Dropout Layer) ### ### Use the following docs to properly initialize these variables: ### LSTM: ### https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM ### LSTM Cell: ### https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell ### Linear Layer: ### https://pytorch.org/docs/stable/nn.html#torch.nn.Linear ### Dropout Layer: ### https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout self.encoder = nn.LSTM(embed_size, hidden_size, bias=True, bidirectional=True) self.decoder = nn.LSTMCell(embed_size+hidden_size, hidden_size, bias=True) self.h_projection = nn.Linear(2*hidden_size, hidden_size, bias=False) self.c_projection = nn.Linear(2*hidden_size, hidden_size, bias=False) self.att_projection = nn.Linear(2*hidden_size, hidden_size, bias=False) self.combined_output_projection = nn.Linear(3*hidden_size, hidden_size, bias=False) self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False) self.dropout = nn.Dropout(p=dropout_rate)
def load_dev_data(embed_size=50, dev_perct=1., binary=False): M = ModelEmbeddings(embed_size=embed_size) X = [ labeledTree.to_labeled_lines()[0][1].split(" ") for labeledTree in data['dev'] ] Y = [labeledTree.to_labeled_lines()[0][0] for labeledTree in data['dev']] if binary: X = [x for (x, y) in list(zip(X, Y)) if y != 3] Y = [1 if y > 3 else 0 for y in Y if y != 3] dev_size = int(len(X) * dev_perct) X = X[:dev_size] Y = Y[:dev_size] X = M.embed_sentence(X) # dev data needs no augmentation, so it is zipped here, ready to be passed into model.forward() return list(zip(X, Y))
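A hypothetical call site for the loader above (the global `data` dict and `ModelEmbeddings.embed_sentence` are assumed to exist exactly as the snippet uses them): keep 10% of the dev set and collapse the 5-way sentiment labels to binary.

dev_data = load_dev_data(embed_size=50, dev_perct=0.1, binary=True)  # hypothetical usage
embedded_sentence, label = dev_data[0]  # each item is (embedded sentence, 0/1 label)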
def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2): """ Init NMT Model. @param embed_size (int): Embedding size (dimensionality) @param hidden_size (int): Hidden Size (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention """ super(NMT, self).__init__() self.model_embeddings = ModelEmbeddings(embed_size, vocab) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab # default values self.encoder = None self.decoder = None self.h_projection = None self.c_projection = None self.att_projection = None self.combined_output_projection = None self.target_vocab_projection = None self.dropout = None ### ### YOUR CODE HERE (~8 Lines) ### self.embed_size = embed_size self.encoder = nn.LSTM(self.embed_size, self.hidden_size, bias=True, bidirectional=True) # bidirectional=True handles both directions internally, so hidden_size stays h self.decoder = nn.LSTMCell(self.hidden_size + embed_size, self.hidden_size, bias=True) # the input concatenates the embedding of the current word with the previous combined output, hence h + e self.h_projection = nn.Linear(2*self.hidden_size, self.hidden_size, bias=False) # W_h self.c_projection = nn.Linear(2*self.hidden_size, self.hidden_size, bias=False) # W_c self.att_projection = nn.Linear(2*self.hidden_size, self.hidden_size, bias=False) # W_attProj, applied to h^enc_i so it can be scored against h^dec_t self.combined_output_projection = nn.Linear(3*self.hidden_size, self.hidden_size, bias=False) # W_u self.target_vocab_projection = nn.Linear(self.hidden_size, len(self.vocab.tgt), bias=False) # W_vocab, projecting onto the target vocabulary of size len(self.vocab.tgt) self.dropout = nn.Dropout(self.dropout_rate) # Dropout layer ### ### END YOUR CODE ###
def __init__(self, vocab, embed_size, hidden_size, output_size, batch_size, dropout_rate=0.2): super(LSTMClassifier, self).__init__() self.embed_size = embed_size self.hidden_size = hidden_size self.batch_size = batch_size self.embedding = ModelEmbeddings(vocab, embed_size) self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=False) self.proj = nn.Linear(hidden_size, output_size, bias=True) self.dropout = nn.Dropout(dropout_rate) self.softmax = nn.LogSoftmax(dim=1) self.hidden = self.init_hidden()
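The constructor above calls `self.init_hidden()`, which is not shown; a minimal sketch consistent with a single-layer, unidirectional nn.LSTM (an assumption, since the real method may differ) would be:

def init_hidden(self):
    # Hypothetical: (h_0, c_0), each of shape
    # (num_layers * num_directions, batch, hidden) = (1, b, h)
    return (torch.zeros(1, self.batch_size, self.hidden_size),
            torch.zeros(1, self.batch_size, self.hidden_size))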
def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2, no_char_decoder=False): """ Initialize the NMT Model. :param int embed_size: Embedding size (dimensionality) :param int hidden_size: Hidden Size (dimensionality) :param Vocab vocab: Vocabulary object containing src and tgt languages See vocab.py for documentation. :param float dropout_rate: Dropout probability, for the attention combination layer :param bool no_char_decoder: If True, do not build the character-level decoder """ super(NMT, self).__init__() self.model_embeddings_source = ModelEmbeddings(embed_size, vocab.src) self.model_embeddings_target = ModelEmbeddings(embed_size, vocab.tgt) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab self.encoder = nn.LSTM(embed_size, hidden_size, bidirectional=True) self.decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size) # Need to feed in transpose of [h_enc(1)(<-) ; h_enc(m)(->)], and output is 1xh self.h_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False) # Need to feed in transpose of [c_enc(1)(<-); c_enc(m)(->)], and output is 1xh self.c_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False) self.att_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False) # Need to feed in transpose of u(t), and output is 1xh (v(t)) self.combined_output_projection = nn.Linear(3 * hidden_size, hidden_size, bias=False) # Need to feed in transpose of o(t), and output is 1x|Vtg| self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False) self.dropout = nn.Dropout(dropout_rate) if not no_char_decoder: self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt) else: self.charDecoder = None
def __init__(self, vocab, embed_size, hidden_size, enc_bidir, attn_size, dropout=0.2): super(QGModel, self).__init__() self.vocab = vocab self.args = { 'embed_size': embed_size, 'hidden_size': hidden_size, 'dropout': dropout, 'enc_bidir': enc_bidir, 'attn_size': attn_size } self.embeddings = ModelEmbeddings(embed_size, vocab) self.encoder = Encoder(embed_size, hidden_size, dropout, enc_bidir) self.decoder_init_hidden_proj = nn.Linear(self.encoder.hidden_size, hidden_size) self.decoder = Decoder(embed_size, hidden_size, attn_size, len(vocab.tgt), dropout)
def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2): """ Init NMT Model. @param embed_size (int): Embedding size (dimensionality) @param hidden_size (int): Hidden Size (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages @param dropout_rate (float): Dropout probability, for attention """ super(NMT, self).__init__() self.model_embeddings = ModelEmbeddings(embed_size, vocab) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab # Initialize the layers # LSTM layer: takes word embeddings as input, outputs hidden states self.encoder = nn.LSTM(embed_size, self.hidden_size, bidirectional=True) # bidirectional; the dropout argument is omitted because it has no effect on a single-layer LSTM # LSTMCell: takes the word embedding concatenated with the hidden state, outputs the next hidden state self.decoder = nn.LSTMCell(embed_size + self.hidden_size, self.hidden_size) # gives control over each individual time step self.h_projection = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=False) # project 2h -> h self.c_projection = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=False) # project 2h -> h self.att_projection = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=False) # project 2h -> h self.combined_output_projection = nn.Linear(self.hidden_size * 3, self.hidden_size, bias=False) # project 3h -> h self.target_vocab_projection = nn.Linear(self.hidden_size, len(self.vocab.tgt), bias=False) # project the output onto the vocabulary self.dropout = nn.Dropout(p=self.dropout_rate)
def __init__(self, embed_size, hidden_size, vocab, dropout_rate): super(Node2, self).__init__() self.model_embeddings = ModelEmbeddings(embed_size, vocab) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab self.num_labels = vocab.num_labels self.encoder0 = nn.LSTM( input_size=embed_size, hidden_size=hidden_size, bias=True, bidirectional=True) self.encoder1 = nn.LSTM( input_size=embed_size, hidden_size=hidden_size, bias=True, bidirectional=True) self.dropout1 = nn.Dropout(self.dropout_rate) self.attention_projection = nn.Linear(in_features=2 * hidden_size, out_features=self.num_labels, bias=False) self.attention_softmax = nn.Softmax(dim=0) self.labels_projection = nn.Linear(in_features=2 * hidden_size, out_features=100, bias=False) self.labels_projection2 = nn.Linear(in_features=100, out_features=1, bias=False)
def __init__(self, vocab, embed_size, embeddings, sim_scale=5): """ @param vocab (Vocab): vocab object @param embed_size (int): embedding size @param embeddings (torch.tensor (len(vocab), embed_dim)): pretrained word embeddings @param sim_scale (float): scale the sim score by this scalar """ super(AvgSim, self).__init__() self.pretrained_embeddings = embeddings self.embeddings = ModelEmbeddings(vocab, embed_size, self.pretrained_embeddings) self.vocab = vocab self.sim_scale = sim_scale self.scoring_fn = nn.CosineSimilarity(dim=-1)
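To see what `scoring_fn` computes, here is a toy, self-contained example of nn.CosineSimilarity(dim=-1) on batches of vectors, scaled the way AvgSim scales by sim_scale (all values below are arbitrary):

import torch
import torch.nn as nn

scoring_fn = nn.CosineSimilarity(dim=-1)
a = torch.randn(8, 50)         # e.g. averaged embeddings of 8 sentence pairs
b = torch.randn(8, 50)
scores = scoring_fn(a, b) * 5  # scaled by sim_scale, as in AvgSim
print(scores.shape)            # torch.Size([8])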
def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2): """ Init NMT Model. @param embed_size (int): Embedding size (dimensionality) @param hidden_size (int): Hidden Size (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention """ super(NMT, self).__init__() self.model_embeddings = ModelEmbeddings(embed_size, vocab) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab # default values self.encoder = None self.decoder = None self.h_projection = None self.c_projection = None self.att_projection = None self.combined_output_projection = None self.target_vocab_projection = None self.dropout = None
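This variant leaves every layer as None; one completion consistent with the other snippets in this section (bidirectional LSTM encoder, LSTMCell decoder with input feeding, bias-free projections, dropout on the combined output) would be:

self.encoder = nn.LSTM(embed_size, hidden_size, bias=True, bidirectional=True)
self.decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size, bias=True)
self.h_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
self.c_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
self.att_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
self.combined_output_projection = nn.Linear(3 * hidden_size, hidden_size, bias=False)
self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False)
self.dropout = nn.Dropout(p=dropout_rate)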
def __init__(self, input_size, hidden_size, vocab, fasttext_model, device='cpu'): super(LSTMModel, self).__init__() self.hidden_size = hidden_size self.input_size = input_size self.vocab = vocab self.embedding = ModelEmbeddings(input_size, vocab, fasttext_model, device) self.lstm = nn.LSTM(input_size, hidden_size, bidirectional=True) self.linear = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=True) self.linear2 = nn.Linear(self.hidden_size, self.hidden_size, bias=True) self.attention = Attention(self.hidden_size)
def __init__(self, vocab, embed_size, embeddings, hidden_size, dropout_rate): """ @param vocab (Vocab): vocab object @param embed_size (int): embedding size @param embeddings (torch.tensor (len(vocab), embed_dim)): pretrained word embeddings @param hidden_size (int): hidden size @param dropout_rate (float): dropout prob """ super(NeuralModel, self).__init__() self.pretrained_embeddings = embeddings self.embeddings = ModelEmbeddings(vocab, embed_size, self.pretrained_embeddings) self.vocab = vocab self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.h_projection = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=False) self.c_projection = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=False) self.att_projection = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=False) self.combined_out_projection = nn.Linear(self.hidden_size * 3, self.hidden_size, bias=False) self.vocab_projection = nn.Linear(self.hidden_size, len(self.vocab), bias=False) self.dropout = nn.Dropout(self.dropout_rate) self.encoder = nn.LSTM(input_size=embed_size, hidden_size=self.hidden_size, bias=True, bidirectional=True) self.decoder = nn.LSTMCell(input_size=embed_size + self.hidden_size, hidden_size=self.hidden_size, bias=True)
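The multiplicative attention these projections support can be verified shape-by-shape with torch.bmm, mirroring the step() computation that appears later in this section (sizes are illustrative):

import torch
import torch.nn as nn

batch, src_len, h = 4, 10, 32
att_projection = nn.Linear(2 * h, h, bias=False)

enc_hiddens = torch.randn(batch, src_len, 2 * h)   # (b, src_len, 2h)
enc_hiddens_proj = att_projection(enc_hiddens)     # (b, src_len, h)
dec_hidden = torch.randn(batch, h)                 # (b, h)
e_t = torch.bmm(enc_hiddens_proj, dec_hidden.unsqueeze(2)).squeeze(2)
assert e_t.shape == (batch, src_len)               # one attention score per source position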
def __init__(self, embed_size, hidden_size, vocab, dropout_rate): super(Node, self).__init__() self.model_embeddings = ModelEmbeddings(embed_size, vocab) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab self.num_labels = vocab.num_labels self.first_bilstm = BiLSTM(embed_size=embed_size, hidden_size=hidden_size, dropout_rate=dropout_rate, vocab=vocab) self.second_bilstm = BiLSTM(embed_size=embed_size, hidden_size=hidden_size, dropout_rate=dropout_rate, vocab=vocab)
class NMT(nn.Module): """ Simple Neural Machine Translation Model: - Bidirectional LSTM Encoder - Unidirectional LSTM Decoder - Global Attention Model (Luong, et al. 2015) """ def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2): """ Init NMT Model. @param embed_size (int): Embedding size (dimensionality) @param hidden_size (int): Hidden Size, the size of hidden states (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention """ super(NMT, self).__init__() self.model_embeddings = ModelEmbeddings(embed_size, vocab) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab # default values self.encoder = None self.decoder = None self.h_projection = None self.c_projection = None self.att_projection = None self.combined_output_projection = None self.target_vocab_projection = None self.dropout = None # For sanity check only, not relevant to implementation self.gen_sanity_check = False self.counter = 0 ### YOUR CODE HERE (~8 Lines) ### TODO - Initialize the following variables: ### self.encoder (Bidirectional LSTM with bias) ### self.decoder (LSTM Cell with bias) ### self.h_projection (Linear Layer with no bias), called W_{h} in the PDF. ### self.c_projection (Linear Layer with no bias), called W_{c} in the PDF. ### self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF. ### self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF. ### self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF. ### self.dropout (Dropout Layer) ### ### Use the following docs to properly initialize these variables: ### LSTM: ### https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM ### LSTM Cell: ### https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell ### Linear Layer: ### https://pytorch.org/docs/stable/nn.html#torch.nn.Linear ### Dropout Layer: ### https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout self.encoder = nn.LSTM(embed_size, self.hidden_size, bidirectional=True) self.decoder = nn.LSTMCell(self.hidden_size + embed_size, self.hidden_size) self.h_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False) self.c_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False) self.att_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False) self.combined_output_projection = nn.Linear(3 * self.hidden_size, self.hidden_size, bias=False) self.target_vocab_projection = nn.Linear(self.hidden_size, len(self.vocab.tgt), bias=False) self.dropout = nn.Dropout(self.dropout_rate) ### END YOUR CODE def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor: """ Take a mini-batch of source and target sentences, compute the log-likelihood of target sentences under the language models learned by the NMT system. @param source (List[List[str]]): list of source sentence tokens @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>` @returns scores (Tensor): a variable/tensor of shape (b, ) representing the log-likelihood of generating the gold-standard target sentence for each example in the input batch. Here b = batch size. 
""" # Compute sentence lengths source_lengths = [len(s) for s in source] # Convert list of lists into tensors source_padded = self.vocab.src.to_input_tensor( source, device=self.device) # Tensor: (src_len, b) target_padded = self.vocab.tgt.to_input_tensor( target, device=self.device) # Tensor: (tgt_len, b) ### Run the network forward: ### 1. Apply the encoder to `source_padded` by calling `self.encode()` ### 2. Generate sentence masks for `source_padded` by calling `self.generate_sent_masks()` ### 3. Apply the decoder to compute combined-output by calling `self.decode()` ### 4. Compute log probability distribution over the target vocabulary using the ### combined_outputs returned by the `self.decode()` function. enc_hiddens, dec_init_state = self.encode(source_padded, source_lengths) enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths) combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded) P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1) # Zero out, probabilities for which we have nothing in the target text target_masks = (target_padded != self.vocab.tgt['<pad>']).float() # Compute log probability of generating true target words target_gold_words_log_prob = torch.gather( P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:] scores = target_gold_words_log_prob.sum(dim=0) return scores def encode( self, source_padded: torch.Tensor, source_lengths: List[int] ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """ Apply the encoder to source sentences to obtain encoder hidden states. Additionally, take the final states of the encoder and project them to obtain initial states for decoder. @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where b = batch_size, src_len = maximum source sentence length. Note that these have already been sorted in order of longest to shortest sentence. @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial hidden state and cell. """ enc_hiddens, dec_init_state = None, None ### YOUR CODE HERE (~ 8 Lines) ### TODO: ### 1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings. ### src_len = maximum source sentence length, b = batch size, e = embedding size. Note ### that there is no initial hidden state or cell for the decoder. ### 2. Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`. ### - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X. ### - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens. ### - Note that the shape of the tensor returned by the encoder is (src_len, b, h*2) and we want to ### return a tensor of shape (b, src_len, h*2) as `enc_hiddens`. ### 3. Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell): ### - `init_decoder_hidden`: ### `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards. ### Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h). ### Apply the h_projection layer to this in order to compute init_decoder_hidden. 
### This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size ### - `init_decoder_cell`: ### `last_cell` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards. ### Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h). ### Apply the c_projection layer to this in order to compute init_decoder_cell. ### This is c_0^{dec} in the PDF. Here b = batch size, h = hidden size ### ### See the following docs, as you may need to use some of the following functions in your implementation: ### Pack the padded sequence X before passing to the encoder: ### https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pack_padded_sequence ### Pad the packed sequence, enc_hiddens, returned by the encoder: ### https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pad_packed_sequence ### Tensor Concatenation: ### https://pytorch.org/docs/stable/torch.html#torch.cat ### Tensor Permute: ### https://pytorch.org/docs/stable/tensors.html#torch.Tensor.permute X = self.model_embeddings.source(source_padded) X = pack_padded_sequence(X, source_lengths) enc_hiddens, (last_hidden, last_cell) = self.encoder(X) enc_hiddens, _ = pad_packed_sequence(enc_hiddens) enc_hiddens = enc_hiddens.permute(1, 0, 2) init_decoder_hidden = self.h_projection( torch.cat((last_hidden[0], last_hidden[1]), 1)) init_decoder_cell = self.c_projection( torch.cat((last_cell[0], last_cell[1]), 1)) dec_init_state = (init_decoder_hidden, init_decoder_cell) ### END YOUR CODE return enc_hiddens, dec_init_state def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor, dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor: """Compute combined output vectors for a batch. @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where b = batch size, src_len = maximum source sentence length. @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where tgt_len = maximum target sentence length, b = batch size. @returns combined_outputs (Tensor): combined output tensor (tgt_len, b, h), where tgt_len = maximum target sentence length, b = batch_size, h = hidden size """ # Chop off the <END> token for max length sentences. target_padded = target_padded[:-1] # Initialize the decoder state (hidden and cell) dec_state = dec_init_state # Initialize previous combined output vector o_{t-1} as zero batch_size = enc_hiddens.size(0) o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device) # Initialize a list we will use to collect the combined output o_t on each step combined_outputs = [] ### YOUR CODE HERE (~9 Lines) ### TODO: ### 1. Apply the attention projection layer to `enc_hiddens` to obtain `enc_hiddens_proj`, ### which should be shape (b, src_len, h), ### where b = batch size, src_len = maximum source length, h = hidden size. ### This is applying W_{attProj} to h^enc, as described in the PDF. ### 2. Construct tensor `Y` of target sentences with shape (tgt_len, b, e) using the target model embeddings. ### where tgt_len = maximum target sentence length, b = batch size, e = embedding size. ### 3. Use the torch.split function to iterate over the time dimension of Y. 
### Within the loop, this will give you Y_t of shape (1, b, e) where b = batch size, e = embedding size. ### - Squeeze Y_t into a tensor of dimension (b, e). ### - Construct Ybar_t by concatenating Y_t with o_prev on their last dimension ### - Use the step function to compute the Decoder's next (cell, state) values ### as well as the new combined output o_t. ### - Append o_t to combined_outputs ### - Update o_prev to the new o_t. ### 4. Use torch.stack to convert combined_outputs from a list length tgt_len of ### tensors shape (b, h), to a single tensor shape (tgt_len, b, h) ### where tgt_len = maximum target sentence length, b = batch size, h = hidden size. ### ### Note: ### - When using the squeeze() function make sure to specify the dimension you want to squeeze ### over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1. ### ### You may find some of these functions useful: ### Zeros Tensor: ### https://pytorch.org/docs/stable/torch.html#torch.zeros ### Tensor Splitting (iteration): ### https://pytorch.org/docs/stable/torch.html#torch.split ### Tensor Dimension Squeezing: ### https://pytorch.org/docs/stable/torch.html#torch.squeeze ### Tensor Concatenation: ### https://pytorch.org/docs/stable/torch.html#torch.cat ### Tensor Stacking: ### https://pytorch.org/docs/stable/torch.html#torch.stack enc_hiddens_proj = self.att_projection(enc_hiddens) Y = self.model_embeddings.target(target_padded) for i in torch.split(Y, 1, dim=0): Y_t = i.squeeze(dim=0) Ybar_t = torch.cat((Y_t, o_prev), 1) dec_state, o_t, e_t = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks) combined_outputs.append(o_t) o_prev = o_t combined_outputs = torch.stack(combined_outputs, dim=0) ### END YOUR CODE return combined_outputs def step( self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor, torch.Tensor], enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor, enc_masks: torch.Tensor ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]: """ Compute one forward step of the LSTM decoder, including the attention computation. @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder, where b = batch size, e = embedding size, h = hidden size. @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size. First tensor is decoder's prev hidden state, second tensor is decoder's prev cell. @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size, src_len = maximum source length, h = hidden size. @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h), where b = batch size, src_len = maximum source length, h = hidden size. @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len), where b = batch size, src_len is maximum source length. @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size. First tensor is decoder's new hidden state, second tensor is decoder's new cell. @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size. @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution. Note: You will not use this outside of this function. We are simply returning this value so that we can sanity check your implementation. 
""" combined_output = None ### YOUR CODE HERE (~3 Lines) ### TODO: ### 1. Apply the decoder to `Ybar_t` and `dec_state`to obtain the new dec_state. ### 2. Split dec_state into its two parts (dec_hidden, dec_cell) ### 3. Compute the attention scores e_t, a Tensor shape (b, src_len). ### Note: b = batch_size, src_len = maximum source length, h = hidden size. ### ### Hints: ### - dec_hidden is shape (b, h) and corresponds to h^dec_t in the PDF (batched) ### - enc_hiddens_proj is shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched). ### - Use batched matrix multiplication (torch.bmm) to compute e_t (be careful about the input/ output shapes!) ### - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing. ### - When using the squeeze() function make sure to specify the dimension you want to squeeze ### over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1. ### ### Use the following docs to implement this functionality: ### Batch Multiplication: ### https://pytorch.org/docs/stable/torch.html#torch.bmm ### Tensor Unsqueeze: ### https://pytorch.org/docs/stable/torch.html#torch.unsqueeze ### Tensor Squeeze: ### https://pytorch.org/docs/stable/torch.html#torch.squeeze dec_state = self.decoder(Ybar_t, dec_state) (dec_hidden, dec_cell) = dec_state e_t = torch.squeeze(torch.bmm(enc_hiddens_proj, torch.unsqueeze(dec_hidden, dim=2)), dim=2) ### END YOUR CODE # Set e_t to -inf where enc_masks has 1 if enc_masks is not None: e_t.data.masked_fill_(enc_masks.bool(), -float('inf')) ### YOUR CODE HERE (~6 Lines) ### TODO: ### 1. Apply softmax to e_t to yield alpha_t ### 2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the ### attention output vector, a_t. ### - alpha_t is shape (b, src_len) ### - enc_hiddens is shape (b, src_len, 2h) ### - a_t should be shape (b, 2h) ### - You will need to do some squeezing and unsqueezing. ### Note: b = batch size, src_len = maximum source length, h = hidden size. ### ### 3. Concatenate dec_hidden with a_t to compute tensor U_t ### 4. Apply the combined output projection layer to U_t to compute tensor V_t ### 5. Compute tensor O_t by first applying the Tanh function and then the dropout layer. ### ### Use the following docs to implement this functionality: ### Softmax: ### https://pytorch.org/docs/stable/nn.html#torch.nn.functional.softmax ### Batch Multiplication: ### https://pytorch.org/docs/stable/torch.html#torch.bmm ### Tensor View: ### https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view ### Tensor Concatenation: ### https://pytorch.org/docs/stable/torch.html#torch.cat ### Tanh: ### https://pytorch.org/docs/stable/torch.html#torch.tanh #print(e_t.shape) alpha_t = F.softmax(e_t, dim=1) a_t = torch.bmm(alpha_t.unsqueeze(dim=1), enc_hiddens).squeeze(1) U_t = torch.cat((dec_hidden, a_t), dim=1) V_t = self.combined_output_projection(U_t) O_t = self.dropout(torch.tanh(V_t)) ### END YOUR CODE combined_output = O_t return dec_state, combined_output, e_t def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor: """ Generate sentence masks for encoder hidden states. @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size, src_len = max source length, h = hidden size. @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch. 
@returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len), where src_len = max source length, h = hidden size. """ enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float) for e_id, src_len in enumerate(source_lengths): enc_masks[e_id, src_len:] = 1 return enc_masks.to(self.device) def beam_search(self, src_sent: List[str], beam_size: int = 5, max_decoding_time_step: int = 70) -> List[Hypothesis]: """ Given a single source sentence, perform beam search, yielding translations in the target language. @param src_sent (List[str]): a single source sentence (words) @param beam_size (int): beam size @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN @returns hypotheses (List[Hypothesis]): a list of hypothesis, each hypothesis has two fields: value: List[str]: the decoded target sentence, represented as a list of words score: float: the log-likelihood of the target sentence """ src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device) src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)]) src_encodings_att_linear = self.att_projection(src_encodings) h_tm1 = dec_init_vec att_tm1 = torch.zeros(1, self.hidden_size, device=self.device) eos_id = self.vocab.tgt['</s>'] hypotheses = [['<s>']] hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device) completed_hypotheses = [] t = 0 while len(completed_hypotheses) < beam_size and t < max_decoding_time_step: t += 1 hyp_num = len(hypotheses) exp_src_encodings = src_encodings.expand(hyp_num, src_encodings.size(1), src_encodings.size(2)) exp_src_encodings_att_linear = src_encodings_att_linear.expand( hyp_num, src_encodings_att_linear.size(1), src_encodings_att_linear.size(2)) y_tm1 = torch.tensor( [self.vocab.tgt[hyp[-1]] for hyp in hypotheses], dtype=torch.long, device=self.device) y_t_embed = self.model_embeddings.target(y_tm1) x = torch.cat([y_t_embed, att_tm1], dim=-1) (h_t, cell_t), att_t, _ = self.step(x, h_tm1, exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None) # log probabilities over target words log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1) live_hyp_num = beam_size - len(completed_hypotheses) continuing_hyp_scores = ( hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1) top_cand_hyp_scores, top_cand_hyp_pos = torch.topk( continuing_hyp_scores, k=live_hyp_num) prev_hyp_ids = top_cand_hyp_pos // len(self.vocab.tgt) # integer division recovers the index of the originating hypothesis hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt) new_hypotheses = [] live_hyp_ids = [] new_hyp_scores = [] for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip( prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores): prev_hyp_id = prev_hyp_id.item() hyp_word_id = hyp_word_id.item() cand_new_hyp_score = cand_new_hyp_score.item() hyp_word = self.vocab.tgt.id2word[hyp_word_id] new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word] if hyp_word == '</s>': completed_hypotheses.append( Hypothesis(value=new_hyp_sent[1:-1], score=cand_new_hyp_score)) else: new_hypotheses.append(new_hyp_sent) live_hyp_ids.append(prev_hyp_id) new_hyp_scores.append(cand_new_hyp_score) if len(completed_hypotheses) == beam_size: break live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device) h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids]) att_tm1 = att_t[live_hyp_ids] hypotheses = new_hypotheses hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device) if len(completed_hypotheses) == 0: completed_hypotheses.append( 
Hypothesis(value=hypotheses[0][1:], score=hyp_scores[0].item())) completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True) return completed_hypotheses @property def device(self) -> torch.device: """ Determine which device to place the Tensors upon, CPU or GPU. """ return self.model_embeddings.source.weight.device @staticmethod def load(model_path: str): """ Load the model from a file. @param model_path (str): path to model """ params = torch.load(model_path, map_location=lambda storage, loc: storage) args = params['args'] model = NMT(vocab=params['vocab'], **args) model.load_state_dict(params['state_dict']) return model def save(self, path: str): """ Save the model to a file. @param path (str): path to the model """ print('save model parameters to [%s]' % path, file=sys.stderr) params = { 'args': dict(embed_size=self.model_embeddings.embed_size, hidden_size=self.hidden_size, dropout_rate=self.dropout_rate), 'vocab': self.vocab, 'state_dict': self.state_dict() } torch.save(params, path)
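The encode() method above depends on the pack/pad round trip; a self-contained sketch of that pattern with plain tensors (lengths sorted longest-first, as the docstring requires):

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

embed_size, hidden_size = 16, 32
encoder = nn.LSTM(embed_size, hidden_size, bidirectional=True)

X = torch.randn(10, 3, embed_size)                 # (src_len, b, e), already padded
source_lengths = [10, 7, 4]                        # true lengths, longest first
packed = pack_padded_sequence(X, source_lengths)
enc_hiddens, (last_hidden, last_cell) = encoder(packed)
enc_hiddens, _ = pad_packed_sequence(enc_hiddens)  # back to (src_len, b, 2h)
enc_hiddens = enc_hiddens.permute(1, 0, 2)         # (b, src_len, 2h), as encode() returns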
def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2): """ Init NMT Model. @param embed_size (int): Embedding size (dimensionality) @param hidden_size (int): Hidden Size, the size of hidden states (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention """ super(NMT, self).__init__() self.model_embeddings = ModelEmbeddings(embed_size, vocab) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab # default values self.encoder = None self.decoder = None self.h_projection = None self.c_projection = None self.att_projection = None self.combined_output_projection = None self.target_vocab_projection = None self.dropout = None # For sanity check only, not relevant to implementation self.gen_sanity_check = False self.counter = 0 ### YOUR CODE HERE (~8 Lines) ### TODO - Initialize the following variables: ### self.encoder (Bidirectional LSTM with bias) ### self.decoder (LSTM Cell with bias) ### self.h_projection (Linear Layer with no bias), called W_{h} in the PDF. ### self.c_projection (Linear Layer with no bias), called W_{c} in the PDF. ### self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF. ### self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF. ### self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF. ### self.dropout (Dropout Layer) ### ### Use the following docs to properly initialize these variables: ### LSTM: ### https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM ### LSTM Cell: ### https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell ### Linear Layer: ### https://pytorch.org/docs/stable/nn.html#torch.nn.Linear ### Dropout Layer: ### https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout self.encoder = nn.LSTM(embed_size, self.hidden_size, bidirectional=True, bias=True) self.decoder = nn.LSTMCell(embed_size + self.hidden_size, self.hidden_size, bias=True) self.h_projection = nn.Linear( self.hidden_size * 2, self.hidden_size, bias=False) # projects the concatenated final hidden states (2h -> h) self.c_projection = nn.Linear( self.hidden_size * 2, self.hidden_size, bias=False) # projects the concatenated final cell states (2h -> h) self.att_projection = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=False) self.combined_output_projection = nn.Linear(self.hidden_size * 3, self.hidden_size, bias=False) self.target_vocab_projection = nn.Linear( self.hidden_size, len(self.vocab.tgt), bias=False ) # projects the combined output to vocabulary-size logits; no output-side embedding is needed self.dropout = nn.Dropout(self.dropout_rate)
class NMT(nn.Module): """ Simple Neural Machine Translation Model: - Bidirectional LSTM Encoder - Unidirectional LSTM Decoder - Global Attention Model (Luong, et al. 2015) """ def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2): """ Init NMT Model. @param embed_size (int): Embedding size (dimensionality) @param hidden_size (int): Hidden Size (dimensionality) @param vocab (Vocab): Vocabulary object containing src and tgt languages See vocab.py for documentation. @param dropout_rate (float): Dropout probability, for attention """ super(NMT, self).__init__() self.model_embeddings = ModelEmbeddings(embed_size, vocab) self.hidden_size = hidden_size self.dropout_rate = dropout_rate self.vocab = vocab self.encoder = nn.LSTM(embed_size, hidden_size, bias=True, bidirectional=True) self.decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size, bias=True) self.h_projection = nn.Linear(2*hidden_size, hidden_size, bias=False) self.c_projection = nn.Linear(2*hidden_size, hidden_size, bias=False) self.att_projection = nn.Linear(2*hidden_size, hidden_size, bias=False) self.combined_output_projection = nn.Linear(3*hidden_size, hidden_size, bias=False) self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False) self.dropout = nn.Dropout(dropout_rate) def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor: """ Take a mini-batch of source and target sentences, compute the log-likelihood of target sentences under the language models learned by the NMT system. @param source (List[List[str]]): list of source sentence tokens @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>` @returns scores (Tensor): a variable/tensor of shape (b, ) representing the log-likelihood of generating the gold-standard target sentence for each example in the input batch. Here b = batch size. """ # Compute sentence lengths source_lengths = [len(s) for s in source] # Convert list of lists into tensors source_padded = self.vocab.src.to_input_tensor(source, device=self.device) # Tensor: (src_len, b) target_padded = self.vocab.tgt.to_input_tensor(target, device=self.device) # Tensor: (tgt_len, b) ### Run the network forward: ### 1. Apply the encoder to `source_padded` by calling `self.encode()` ### 2. Generate sentence masks for `source_padded` by calling `self.generate_sent_masks()` ### 3. Apply the decoder to compute combined-output by calling `self.decode()` ### 4. Compute log probability distribution over the target vocabulary using the ### combined_outputs returned by the `self.decode()` function. enc_hiddens, dec_init_state = self.encode(source_padded, source_lengths) enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths) combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded) P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1) # Zero out probabilities for which we have nothing in the target text target_masks = (target_padded != self.vocab.tgt['<pad>']).float() # Compute log probability of generating true target words target_gold_words_log_prob = torch.gather(P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:] scores = target_gold_words_log_prob.sum(dim=0) return scores def encode(self, source_padded: torch.Tensor, source_lengths: List[int]) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]: """ Apply the encoder to source sentences to obtain encoder hidden states. 
Additionally, take the final states of the encoder and project them to obtain initial states for decoder. @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where b = batch_size, src_len = maximum source sentence length. Note that these have already been sorted in order of longest to shortest sentence. @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial hidden state and cell. """ enc_hiddens, dec_init_state = None, None X = self.model_embeddings.source(source_padded) packed_sequence = torch.nn.utils.rnn.pack_padded_sequence(X, source_lengths) enc_hiddens, (last_hidden, last_cell) = self.encoder(packed_sequence) enc_hiddens = torch.nn.utils.rnn.pad_packed_sequence(enc_hiddens)[0] enc_hiddens = enc_hiddens.permute(1,0,2) concat_last_hidden = torch.cat((last_hidden[0], last_hidden[1]), 1) concat_last_cell = torch.cat((last_cell[0], last_cell[1]), 1) dec_init_state = (self.h_projection(concat_last_hidden), self.c_projection(concat_last_cell)) return enc_hiddens, dec_init_state def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor, dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor: """Compute combined output vectors for a batch. @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where b = batch size, src_len = maximum source sentence length, h = hidden size. @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where b = batch size, src_len = maximum source sentence length. @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where tgt_len = maximum target sentence length, b = batch size. @returns combined_outputs (Tensor): combined output tensor (tgt_len, b, h), where tgt_len = maximum target sentence length, b = batch_size, h = hidden size """ # Chop off the <END> token for max length sentences. target_padded = target_padded[:-1] # Initialize the decoder state (hidden and cell) dec_state = dec_init_state # Initialize previous combined output vector o_{t-1} as zero batch_size = enc_hiddens.size(0) o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device) # Initialize a list we will use to collect the combined output o_t on each step combined_outputs = [] enc_hiddens_proj = self.att_projection(enc_hiddens) Y = self.model_embeddings.target(target_padded) for Y_t in torch.split(Y, 1): Y_t = torch.squeeze(Y_t, dim=0) # specify dim so the batch dimension survives when batch_size = 1 Ybar_t = torch.cat((Y_t, o_prev), 1) dec_state, o_t, et = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks) combined_outputs.append(o_t) o_prev = o_t combined_outputs = torch.stack(combined_outputs) return combined_outputs def step(self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor, torch.Tensor], enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor, enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]: """ Compute one forward step of the LSTM decoder, including the attention computation. @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder, where b = batch size, e = embedding size, h = hidden size. 
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                     src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                          where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                   where b = batch size, src_len is maximum source length.

        @returns dec_state (tuple(Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is attention scores distribution.
                               Note: You will not use this outside of this function.
                                     We are simply returning this value so that we can sanity check
                                     your implementation.
        """
        combined_output = None

        dec_state = self.decoder(Ybar_t, dec_state)
        dec_hidden, dec_cell = dec_state
        e_t = torch.bmm(enc_hiddens_proj, torch.unsqueeze(dec_hidden, 2))
        e_t = torch.squeeze(e_t, 2)

        # Set e_t to -inf where enc_masks has 1 (padding positions).
        # masked_fill_ expects a boolean mask; .byte() is deprecated for this use.
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

        alpha_t = torch.nn.functional.softmax(e_t, 1)
        a_t = torch.squeeze(torch.bmm(torch.unsqueeze(alpha_t, 1), enc_hiddens), 1)
        U_t = torch.cat((a_t, dec_hidden), 1)
        V_t = self.combined_output_projection(U_t)
        # Apply dropout to the combined output. The dropout layer is defined in
        # __init__ for exactly this purpose; the original left this call commented out.
        O_t = self.dropout(torch.tanh(V_t))

        combined_output = O_t
        return dec_state, combined_output, e_t

    def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor:
        """ Generate sentence masks for encoder hidden states.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size.
        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.

        @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len),
                                     where b = batch size, src_len = max source length.
        """
        enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float)
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1
        return enc_masks.to(self.device)

    def beam_search(self, src_sent: List[str], beam_size: int = 5, max_decoding_time_step: int = 70) -> List[Hypothesis]:
        """ Given a single source sentence, perform beam search, yielding translations in the target language.
        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN

        @returns hypotheses (List[Hypothesis]): a list of hypotheses, each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        eos_id = self.vocab.tgt['</s>']

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            exp_src_encodings = src_encodings.expand(hyp_num,
                                                     src_encodings.size(1),
                                                     src_encodings.size(2))
            exp_src_encodings_att_linear = src_encodings_att_linear.expand(hyp_num,
                                                                           src_encodings_att_linear.size(1),
                                                                           src_encodings_att_linear.size(2))

            y_tm1 = torch.tensor([self.vocab.tgt[hyp[-1]] for hyp in hypotheses], dtype=torch.long, device=self.device)
            y_t_embed = self.model_embeddings.target(y_tm1)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            (h_t, cell_t), att_t, _ = self.step(x, h_tm1,
                                                exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None)

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            continuing_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(continuing_hyp_scores, k=live_hyp_num)

            # Integer division recovers which hypothesis each candidate extends;
            # plain `/` yields floats on recent PyTorch versions.
            prev_hyp_ids = torch.div(top_cand_hyp_pos, len(self.vocab.tgt), rounding_mode='floor')
            hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt)

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1],
                                                           score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device)

        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:],
                                                   score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

        return completed_hypotheses

    @property
    def device(self) -> torch.device:
        """ Determine which device to place the Tensors upon, CPU or GPU.
        """
        return self.model_embeddings.source.weight.device

    @staticmethod
    def load(model_path: str):
        """ Load the model from a file.
        @param model_path (str): path to model
        """
        params = torch.load(model_path, map_location=lambda storage, loc: storage)
        args = params['args']
        model = NMT(vocab=params['vocab'], **args)
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ Save the model to a file.

        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args': dict(embed_size=self.model_embeddings.embed_size, hidden_size=self.hidden_size,
                         dropout_rate=self.dropout_rate),
            'vocab': self.vocab,
            'state_dict': self.state_dict()
        }

        torch.save(params, path)
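# Usage sketch (illustrative only): how the pieces above fit together during
# training and decoding. `Vocab.load`, the vocab file path, and the toy batch
# below are assumptions about the surrounding project (vocab.py / run.py),
# not code confirmed by this file. Note the source batch must be sorted from
# longest to shortest sentence, as encode() requires.
#
#     from vocab import Vocab
#
#     vocab = Vocab.load('vocab.json')                   # hypothetical path
#     model = NMT(embed_size=256, hidden_size=256, vocab=vocab)
#
#     src_sents = [['hello', 'world'], ['hi']]           # toy batch, sorted by length
#     tgt_sents = [['<s>', 'bonjour', '</s>'], ['<s>', 'salut', '</s>']]
#
#     scores = model(src_sents, tgt_sents)               # (b,) per-example log-likelihoods
#     loss = -scores.sum()                               # standard NLL training loss
#     loss.backward()
#
#     hyps = model.beam_search(['hello', 'world'], beam_size=5)
#     model.save('model.bin')                            # round-trip via save()/load()
#     model = NMT.load('model.bin')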
def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
    """ Init NMT Model.

    @param embed_size (int): Embedding size (dimensionality)
    @param hidden_size (int): Hidden Size (dimensionality)
    @param vocab (Vocab): Vocabulary object containing src and tgt languages
                          See vocab.py for documentation.
    @param dropout_rate (float): Dropout probability, for attention
    """
    super(NMT, self).__init__()
    self.model_embeddings = ModelEmbeddings(embed_size, vocab)
    self.hidden_size = hidden_size
    self.dropout_rate = dropout_rate
    self.vocab = vocab

    # default values
    self.encoder = None
    self.decoder = None
    self.h_projection = None
    self.c_projection = None
    self.att_projection = None
    self.combined_output_projection = None
    self.target_vocab_projection = None
    self.dropout = None

    # YOUR CODE HERE (~8 Lines)
    # TODO - Initialize the following variables:
    #   self.encoder (Bidirectional LSTM with bias)
    #   self.decoder (LSTM Cell with bias)
    #   self.h_projection (Linear Layer with no bias), called W_{h} in the PDF.
    #   self.c_projection (Linear Layer with no bias), called W_{c} in the PDF.
    #   self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF.
    #   self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF.
    #   self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF.
    #   self.dropout (Dropout Layer)
    #
    # Use the following docs to properly initialize these variables:
    #   LSTM: https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
    #   LSTM Cell: https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell
    #   Linear Layer: https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
    #   Dropout Layer: https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout

    self.encoder = nn.LSTM(embed_size, self.hidden_size, bias=True, bidirectional=True)
    # embed_size + hidden_size: the decoder input concatenates the target embedding
    # with the combined output of the previous step.
    self.decoder = nn.LSTMCell(embed_size + self.hidden_size, self.hidden_size)
    # Used to initialize the hidden state of the decoder.
    self.h_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
    # Used to initialize the cell state of the decoder.
    self.c_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
    # Projects each encoder hidden state from (2h, 1) down to (h, 1).
    self.att_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
    # Attention is a sum of the encoder hidden states weighted by softmax scores.
    self.combined_output_projection = nn.Linear(3 * self.hidden_size, self.hidden_size, bias=False)
    # Applies dropout to the final combined output state.
    self.dropout = nn.Dropout(dropout_rate)
    # Projects the final hidden state to vocabulary size; a softmax over this
    # gives a probability for every word.
    self.target_vocab_projection = nn.Linear(self.hidden_size, len(vocab.tgt), bias=False)
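# Shape walk-through for the attention dimensions wired up above: a standalone
# sketch with made-up sizes, independent of the class. att_projection maps the
# (2h)-dim bidirectional encoder states down to h so they can be dotted with the
# h-dim decoder state, and combined_output_projection maps [a_t; h_dec] (3h) to h.
import torch
import torch.nn as nn
import torch.nn.functional as F

b, src_len, h = 4, 7, 16                        # batch, source length, hidden size
enc_hiddens = torch.randn(b, src_len, 2 * h)    # bidirectional encoder output
dec_hidden = torch.randn(b, h)                  # decoder hidden state

att_projection = nn.Linear(2 * h, h, bias=False)
combined_output_projection = nn.Linear(3 * h, h, bias=False)

enc_proj = att_projection(enc_hiddens)                         # (b, src_len, h)
e_t = torch.bmm(enc_proj, dec_hidden.unsqueeze(2)).squeeze(2)  # (b, src_len) scores
alpha_t = F.softmax(e_t, dim=1)                                # attention weights
a_t = torch.bmm(alpha_t.unsqueeze(1), enc_hiddens).squeeze(1)  # (b, 2h) context
u_t = torch.cat([a_t, dec_hidden], dim=1)                      # (b, 3h)
o_t = torch.tanh(combined_output_projection(u_t))              # (b, h) combined output
assert o_t.shape == (b, h)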
def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
    """ Init NMT Model.

    @param embed_size (int): Embedding size (dimensionality)
    @param hidden_size (int): Hidden Size (dimensionality)
    @param vocab (Vocab): Vocabulary object containing src and tgt languages
                          See vocab.py for documentation.
    @param dropout_rate (float): Dropout probability, for attention
    """
    super(NMT, self).__init__()
    self.model_embeddings = ModelEmbeddings(embed_size, vocab)
    self.hidden_size = hidden_size
    self.dropout_rate = dropout_rate
    self.vocab = vocab

    # default values
    self.encoder = None
    self.decoder = None
    self.h_projection = None
    self.c_projection = None
    self.att_projection = None
    self.combined_output_projection = None
    self.target_vocab_projection = None
    self.dropout = None

    ### YOUR CODE HERE (~8 Lines)
    ### TODO - Initialize the following variables:
    ###     self.encoder (Bidirectional LSTM with bias)
    ###     self.decoder (LSTM Cell with bias)
    ###     self.h_projection (Linear Layer with no bias), called W_{h} in the PDF.
    ###     self.c_projection (Linear Layer with no bias), called W_{c} in the PDF.
    ###     self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF.
    ###     self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF.
    ###     self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF.
    ###     self.dropout (Dropout Layer)
    ###
    ### Use the following docs to properly initialize these variables:
    ###     LSTM: https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
    ###     LSTM Cell: https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell
    ###     Linear Layer: https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
    ###     Dropout Layer: https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout

    # LSTM parameters (from the PyTorch docs):
    #   input_size - The number of expected features in the input x
    #   hidden_size - The number of features in the hidden state h
    #   num_layers - Number of recurrent layers. E.g., setting num_layers=2 would mean
    #                stacking two LSTMs together to form a stacked LSTM, with the second
    #                LSTM taking in outputs of the first LSTM and computing the final
    #                results. Default: 1
    #   bias - If False, then the layer does not use bias weights b_ih and b_hh. Default: True
    #   batch_first - If True, then the input and output tensors are provided as
    #                 (batch, seq, feature). Default: False
    #   dropout - If non-zero, introduces a Dropout layer on the outputs of each LSTM layer
    #             except the last layer, with dropout probability equal to dropout. Default: 0
    #   bidirectional - If True, becomes a bidirectional LSTM. Default: False
    self.encoder = nn.LSTM(input_size=embed_size,
                           hidden_size=hidden_size,
                           num_layers=1,
                           bias=True,
                           bidirectional=True)

    # LSTMCell parameters:
    #   input_size - The number of expected features in the input x
    #   hidden_size - The number of features in the hidden state h
    #   bias - If False, then the layer does not use bias weights b_ih and b_hh. Default: True
    self.decoder = nn.LSTMCell(input_size=embed_size + hidden_size,
                               hidden_size=hidden_size,
                               bias=True)

    # Linear parameters:
    #   in_features - size of each input sample
    #   out_features - size of each output sample
    #   bias - If set to False, the layer will not learn an additive bias. Default: True
    self.h_projection = nn.Linear(in_features=2 * hidden_size, out_features=hidden_size, bias=False)
    self.c_projection = nn.Linear(in_features=2 * hidden_size, out_features=hidden_size, bias=False)
    self.att_projection = nn.Linear(in_features=2 * hidden_size, out_features=hidden_size, bias=False)
    self.combined_output_projection = nn.Linear(in_features=3 * hidden_size, out_features=hidden_size, bias=False)
    self.target_vocab_projection = nn.Linear(in_features=hidden_size, out_features=len(self.vocab.tgt), bias=False)
    self.dropout = nn.Dropout(p=dropout_rate)
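# Quick standalone check of the "Linear Layer with no bias" requirement the
# comments above quote from the docs: with bias=False, nn.Linear registers no
# bias parameter at all, so only the weight matrix is learned.
import torch.nn as nn

proj = nn.Linear(in_features=512, out_features=256, bias=False)
assert proj.bias is None                  # no additive bias is created
assert proj.weight.shape == (256, 512)    # weight is (out_features, in_features)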
def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
    """ Init NMT Model.

    @param embed_size (int): Embedding size (dimensionality)
    @param hidden_size (int): Hidden Size (dimensionality)
    @param vocab (Vocab): Vocabulary object containing src and tgt languages
                          See vocab.py for documentation.
    @param dropout_rate (float): Dropout probability, for attention
    """
    super(NMT, self).__init__()
    src_embed_size = 768  # fixed by the BERT model's hidden width
    tgt_embed_size = embed_size
    self.model_embeddings = ModelEmbeddings(src_embed_size, tgt_embed_size, vocab)

    self.hidden_size = hidden_size
    self.dropout_rate = dropout_rate
    self.vocab = vocab

    # default values
    self.encoder = None
    self.decoder = None
    self.h_projection = None
    self.c_projection = None
    self.att_projection = None
    self.combined_output_projection = None
    self.target_vocab_projection = None
    self.dropout = None

    ### YOUR CODE HERE (~8 Lines)
    ### TODO - Initialize the following variables:
    ###     self.encoder (Bidirectional LSTM with bias)
    ###     self.decoder (LSTM Cell with bias)
    ###     self.h_projection (Linear Layer with no bias), called W_{h} in the PDF.
    ###     self.c_projection (Linear Layer with no bias), called W_{c} in the PDF.
    ###     self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF.
    ###     self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF.
    ###     self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF.
    ###     self.dropout (Dropout Layer)
    ###
    ### Use the following docs to properly initialize these variables:
    ###     LSTM: https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
    ###     LSTM Cell: https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell
    ###     Linear Layer: https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
    ###     Dropout Layer: https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout

    # An 8-layer bidirectional LSTM over the 768-dim BERT source features,
    # with inter-layer dropout.
    self.encoder = nn.LSTM(src_embed_size, self.hidden_size, num_layers=8,
                           dropout=self.dropout_rate, bidirectional=True)
    # Decoder input is the target embedding concatenated with the previous
    # combined output o_{t-1}; this variant sizes it as 2 * hidden_size,
    # which assumes the target embedding size equals hidden_size.
    self.decoder = nn.LSTMCell(self.hidden_size * 2, self.hidden_size)
    # Input is twice the hidden size (concatenated forward and backward final
    # states); output is the hidden size.
    self.h_projection = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=False)
    self.c_projection = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=False)
    self.att_projection = nn.Linear(self.hidden_size * 2, self.hidden_size, bias=False)
    self.combined_output_projection = nn.Linear(self.hidden_size * 3, self.hidden_size, bias=False)
    # Projects the combined output h to vocabulary logits, W_{vocab} h.
    self.target_vocab_projection = nn.Linear(self.hidden_size, len(self.vocab.tgt), bias=False)
    self.dropout = nn.Dropout(self.dropout_rate)
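# Sketch of where the 768-dim source features in this variant could come from.
# This is an assumption about what ModelEmbeddings does for the source side,
# shown here with the HuggingFace `transformers` package (recent versions);
# nothing in this file confirms these exact calls.
import torch
from transformers import BertModel, BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert = BertModel.from_pretrained('bert-base-uncased')

inputs = tokenizer('a short source sentence', return_tensors='pt')
with torch.no_grad():
    features = bert(**inputs).last_hidden_state  # (1, seq_len, 768), matching src_embed_size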
def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2, no_char_decoder=False):
    """ Init NMT Model.

    @param embed_size (int): Embedding size (dimensionality)
    @param hidden_size (int): Hidden Size (dimensionality)
    @param vocab (Vocab): Vocabulary object containing src and tgt languages
                          See vocab.py for documentation.
    @param dropout_rate (float): Dropout probability, for attention
    """
    super(NMT, self).__init__()
    self.model_embeddings_source = ModelEmbeddings(embed_size, vocab.src)
    self.model_embeddings_target = ModelEmbeddings(embed_size, vocab.tgt)
    self.hidden_size = hidden_size
    self.dropout_rate = dropout_rate
    self.vocab = vocab
    self.embed_size = embed_size

    ### COPY OVER YOUR CODE FROM ASSIGNMENT 4
    # nn.LSTM runs a full RNN over an entire input sequence.
    self.encoder = nn.LSTM(input_size=embed_size, hidden_size=hidden_size, bidirectional=True, bias=True)
    # nn.LSTMCell computes a single timestep, so the decoder is unrolled manually.
    self.decoder = nn.LSTMCell(input_size=embed_size + hidden_size, hidden_size=hidden_size, bias=True)
    self.h_projection = nn.Linear(in_features=2 * hidden_size, out_features=hidden_size, bias=False)
    self.c_projection = nn.Linear(in_features=2 * hidden_size, out_features=hidden_size, bias=False)
    self.att_projection = nn.Linear(in_features=2 * hidden_size, out_features=hidden_size, bias=False)
    self.combined_output_projection = nn.Linear(in_features=3 * hidden_size, out_features=hidden_size, bias=False)
    self.target_vocab_projection = nn.Linear(in_features=hidden_size, out_features=len(vocab.tgt), bias=False)
    self.dropout = nn.Dropout(p=dropout_rate)
    ### END YOUR CODE FROM ASSIGNMENT 4

    if not no_char_decoder:
        self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt)
    else:
        self.charDecoder = None
def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2,
             spectrum_cnn_kernel_size=3, location_attention_window=64, no_char_decoder=False):
    """ Init NMT Model.

    @param embed_size (int): Embedding size (dimensionality)
    @param hidden_size (int): Hidden Size (dimensionality)
    @param vocab (Vocab): Vocabulary object containing src and tgt languages
                          See vocab.py for documentation.
    @param dropout_rate (float): Dropout probability, for attention
    """
    super(NMT, self).__init__()
    self.location_attention_window = location_attention_window
    self.spectrum_cnn_kernel_size = spectrum_cnn_kernel_size
    # 1-D convolution over the input spectrum frames.
    self.spectrumCNN = nn.Conv1d(embed_size, embed_size, self.spectrum_cnn_kernel_size)
    self.model_embeddings_target = ModelEmbeddings(embed_size, vocab.tgt)
    self.embed_size = embed_size
    self.hidden_size = hidden_size
    self.dropout_rate = dropout_rate
    self.vocab = vocab

    # COPY OVER YOUR CODE FROM ASSIGNMENT 4
    self.encoder = nn.LSTM(embed_size, hidden_size, bidirectional=True)
    self.decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size)
    self.h_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
    self.c_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
    # Location-aware attention components over a small window of previous alignments.
    self.loc_window = 5
    self.loc_att_projection = nn.Linear(embed_size, 1, bias=False)
    self.loc_att_conv1D = nn.Conv1d(self.loc_window, embed_size, 1)
    self.att_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
    self.combined_output_projection = nn.Linear(3 * hidden_size, hidden_size, bias=False)
    self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False)
    self.dropout = nn.Dropout(p=dropout_rate)
    # END YOUR CODE FROM ASSIGNMENT 4

    if not no_char_decoder:
        self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt)
    else:
        self.charDecoder = None
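# Shape sketch for the spectrum CNN above (standalone, with made-up sizes):
# nn.Conv1d expects (batch, channels, time), so a spectrogram with embed_size
# feature bins per frame is convolved along the time axis, and with no padding
# the output is shorter by kernel_size - 1 frames.
import torch
import torch.nn as nn

embed_size, kernel_size = 80, 3
spectrum_cnn = nn.Conv1d(embed_size, embed_size, kernel_size)

frames = torch.randn(2, embed_size, 100)  # (batch, feature bins, time frames)
out = spectrum_cnn(frames)                # (2, embed_size, 98)
assert out.shape == (2, embed_size, 100 - (kernel_size - 1))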
def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
    """ Init NMT Model.

    @param embed_size (int): Embedding size (dimensionality)
    @param hidden_size (int): Hidden Size (dimensionality)
    @param vocab (Vocab): Vocabulary object containing src and tgt languages
                          See vocab.py for documentation.
    @param dropout_rate (float): Dropout probability, for attention
    """
    super(NMT, self).__init__()
    self.model_embeddings = ModelEmbeddings(embed_size, vocab)
    self.hidden_size = hidden_size
    self.dropout_rate = dropout_rate
    self.vocab = vocab

    # default values
    self.encoder = None
    self.decoder = None
    self.h_projection = None
    self.c_projection = None
    self.att_projection = None
    self.combined_output_projection = None
    self.target_vocab_projection = None
    self.dropout = None

    ### YOUR CODE HERE (~8 Lines)
    ### TODO - Initialize the following variables:
    ###     self.encoder (Bidirectional LSTM with bias)
    ###     self.decoder (LSTM Cell with bias)
    ###     self.h_projection (Linear Layer with no bias), called W_{h} in the PDF.
    ###     self.c_projection (Linear Layer with no bias), called W_{c} in the PDF.
    ###     self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF.
    ###     self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF.
    ###     self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF.
    ###     self.dropout (Dropout Layer)
    ###
    ### Use the following docs to properly initialize these variables:
    ###     LSTM: https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
    ###     LSTM Cell: https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell
    ###     Linear Layer: https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
    ###     Dropout Layer: https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout

    ### (c) Initialize the layers as required by the TODO above.
    ## LSTM layer. Its constructor arguments include:
    ##   input_size: input dimension (i.e. the word-vector dimension)
    ##   hidden_size: hidden-state dimension (the dimension of h)
    ##   num_layers: number of LSTM layers (vertical stacking depth)
    ##   bias: whether to use a bias, default True
    ##   batch_first: whether to move batch to the first dimension (for inputs not
    ##     already shaped (batch_size, seq_length, embedding_dim)), default False
    ##   dropout: dropout probability, default 0
    ##   bidirectional: whether the LSTM is bidirectional, default False
    self.encoder = nn.LSTM(embed_size, self.hidden_size, bias=True, bidirectional=True)

    ## LSTMCell layer: a single LSTM unit (structurally also a one-layer LSTM).
    ## (It differs from nn.LSTM in that an LSTMCell takes a single input x_t, while
    ## nn.LSTM takes the whole sequence x_0 ... x_T; to process a full sequence with
    ## an LSTMCell you must loop over timesteps.)
    ## Its constructor arguments include:
    ##   input_size: input dimension (i.e. the word-vector dimension)
    ##   hidden_size: hidden-state dimension (the dimension of h)
    ##   bias: whether to use a bias, default True
    self.decoder = nn.LSTMCell(embed_size + self.hidden_size, self.hidden_size, bias=True)

    ## Linear layers, as the name suggests. Arguments in order: input dimension,
    ## output dimension, and whether a bias is needed.
    self.h_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
    self.c_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
    self.att_projection = nn.Linear(2 * self.hidden_size, self.hidden_size, bias=False)
    self.combined_output_projection = nn.Linear(3 * self.hidden_size, self.hidden_size, bias=False)
    self.target_vocab_projection = nn.Linear(self.hidden_size, len(self.vocab.tgt), bias=False)

    ## Dropout layer, configured with dropout_rate.
    self.dropout = nn.Dropout(dropout_rate)
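# Illustration of the LSTM vs. LSTMCell distinction described in the comments
# above (standalone sketch with made-up sizes): nn.LSTM consumes a whole
# sequence at once, while nn.LSTMCell handles one timestep, so the decoder
# must be unrolled with an explicit loop.
import torch
import torch.nn as nn

seq_len, batch, input_size, hidden = 5, 3, 8, 16
x = torch.randn(seq_len, batch, input_size)

cell = nn.LSTMCell(input_size, hidden)
h = torch.zeros(batch, hidden)
c = torch.zeros(batch, hidden)
outputs = []
for t in range(seq_len):          # one LSTMCell call per timestep
    h, c = cell(x[t], (h, c))
    outputs.append(h)
outputs = torch.stack(outputs)    # (seq_len, batch, hidden), like nn.LSTM's output
assert outputs.shape == (seq_len, batch, hidden)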
class Node2(nn.Module):
    """ Node Class that inherits the BiLSTM models created in bilstm_model.py
        Beginning Node models have 2 BiLSTMs; future versions can have more
    """

    def __init__(self, embed_size, hidden_size, vocab, dropout_rate):
        super(Node2, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab
        print("vocab.num_labels: ", vocab.num_labels)
        self.num_labels = vocab.num_labels

        # One BiLSTM per half of each note.
        self.encoder0 = nn.LSTM(input_size=embed_size,
                                hidden_size=hidden_size,
                                bias=True,
                                bidirectional=True)
        self.encoder1 = nn.LSTM(input_size=embed_size,
                                hidden_size=hidden_size,
                                bias=True,
                                bidirectional=True)
        # Use the configured dropout probability (the original nn.Dropout() ignored it).
        self.dropout1 = nn.Dropout(p=dropout_rate)
        self.attention_projection = nn.Linear(in_features=2 * hidden_size,
                                              out_features=self.num_labels,
                                              bias=False)
        self.attention_softmax = nn.Softmax(dim=0)
        # Two-layer label head: 2h -> 100 -> 1.
        self.labels_projection = nn.Linear(in_features=2 * hidden_size,
                                           out_features=100,
                                           bias=False)
        self.labels_projection2 = nn.Linear(in_features=100,
                                            out_features=1,
                                            bias=False)

    def forward(self, in_sents: List[List[str]], target_labels: List[List[int]]):
        # in_sents is (num_notes, note_length); each note is split in half, and
        # each half is run through its own BiLSTM.
        num_notes = len(in_sents)
        length_of_each_note = len(in_sents[0]) // 2

        # Convert list of lists into a tensor: (src_len, b)
        source_padded = self.vocab.notes_.to_input_tensor(in_sents, device=self.device)
        X = self.model_embeddings.note_embeds(source_padded)

        X0 = X[:length_of_each_note, :, :]
        X1 = X[length_of_each_note:, :, :]
        enc_hiddens0, _ = self.encoder0(X0)
        enc_hiddens1, _ = self.encoder1(X1)
        enc_hiddens = torch.cat([enc_hiddens0, enc_hiddens1], 0)

        # Per-label attention over all timesteps, then a weighted sum of the
        # encoder states: M has shape (b, num_labels, 2h).
        alpha = self.attention_projection(enc_hiddens)
        alpha_soft = self.attention_softmax(alpha)
        M = torch.bmm(alpha_soft.permute([1, 2, 0]), enc_hiddens.permute([1, 0, 2]))

        M = self.dropout1(M)
        M = self.labels_projection(M)
        M = F.relu(M)
        scores = self.labels_projection2(M)
        scores = torch.sigmoid(torch.squeeze(scores, -1))
        return scores

    def encode(self, source_padded: torch.Tensor,
               source_lengths: List[int]) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        pass

    @property
    def device(self) -> torch.device:
        """ Determine which device to place the Tensors upon, CPU or GPU.
""" return self.model_embeddings.note_embeds.weight.device @staticmethod def load(model_path: str): """ Load the model from a file. @param model_path (str): path to model """ # params = torch.load(model_path, map_location=lambda storage, loc: storage) # args = params['args'] # model = NMT(vocab=params['vocab'], **args) # model.load_state_dict(params['state_dict']) params = torch.load(model_path, map_location=lambda storage, loc: storage) args = params['args'] model = Node(vocab=params['vocab'], **args) model.load_state_dict(params['state_dict']) return model def save(self, path: str): """ Save the odel to a file. @param path (str): path to the model """ print('save model parameters to [%s]' % path, file=sys.stderr) # params = { # 'args': dict(embed_size=self.model_embeddings.embed_size, hidden_size=self.hidden_size, dropout_rate=self.dropout_rate), # 'vocab': self.vocab, # 'state_dict': self.state_dict() # } params = { 'args': dict(embed_size=self.model_embeddings.embed_size, hidden_size=self.hidden_size, dropout_rate=self.dropout_rate), 'vocab': self.vocab, 'state_dict': self.state_dict() } torch.save(params, path)