def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # Bidirectional LSTM with bias
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=self.hidden_size,
                               bidirectional=True,
                               bias=True)
        # LSTM Cell with bias
        self.decoder = nn.LSTMCell(input_size=embed_size + self.hidden_size,
                                   hidden_size=self.hidden_size,
                                   bias=True)
        # Linear Layer with no bias, W_{h}
        self.h_projection = nn.Linear(in_features=self.hidden_size * 2,
                                      out_features=self.hidden_size,
                                      bias=False)
        # Linear Layer with no bias, W_{c}
        self.c_projection = nn.Linear(in_features=self.hidden_size * 2,
                                      out_features=self.hidden_size,
                                      bias=False)
        # Linear Layer with no bias, W_{attProj}
        self.att_projection = nn.Linear(in_features=self.hidden_size * 2,
                                        out_features=self.hidden_size,
                                        bias=False)
        # Linear Layer with no bias, W_{u}
        self.combined_output_projection = nn.Linear(
            in_features=self.hidden_size * 3,
            out_features=self.hidden_size,
            bias=False)
        # Linear Layer with no bias, W_{vocab}
        self.target_vocab_projection = nn.Linear(in_features=self.hidden_size,
                                                 out_features=len(vocab.tgt),
                                                 bias=False)
        # Dropout Layer
        self.dropout = nn.Dropout(p=self.dropout_rate)
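
A minimal instantiation sketch for the constructor above (a sketch only: the vocab file name and the Vocab.load helper are assumptions based on the vocab.py mentioned in the docstring, not part of this snippet):

from vocab import Vocab

vocab = Vocab.load('vocab.json')  # Vocab exposing .src and .tgt vocabulary entries
model = NMT(embed_size=256, hidden_size=256, vocab=vocab, dropout_rate=0.2)
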
Example 2
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size, the size of hidden states (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # For sanity check only, not relevant to implementation
        self.gen_sanity_check = False
        self.counter = 0

        self.encoder = torch.nn.LSTM(
            input_size=embed_size,
            hidden_size=self.hidden_size,
            bias=True,
            bidirectional=True,
        )
        self.decoder = torch.nn.LSTMCell(
            input_size=embed_size + hidden_size,
            hidden_size=self.hidden_size,
            bias=True,
        )
        self.h_projection = torch.nn.Linear(
            in_features=2 * self.hidden_size, out_features=self.hidden_size, bias=False
        )
        self.c_projection = torch.nn.Linear(
            in_features=2 * self.hidden_size, out_features=self.hidden_size, bias=False
        )
        self.att_projection = torch.nn.Linear(
            in_features=2 * self.hidden_size, out_features=self.hidden_size, bias=False
        )
        self.combined_output_projection = torch.nn.Linear(
            in_features=3 * self.hidden_size, out_features=self.hidden_size, bias=False
        )
        self.target_vocab_projection = torch.nn.Linear(
            in_features=self.hidden_size, out_features=len(self.vocab.tgt), bias=False
        )
        self.dropout = torch.nn.Dropout(p=self.dropout_rate)
Example 3
    def __init__(self,
                 embed_size,
                 hidden_size,
                 vocab,
                 dropout_rate=0.2,
                 no_char_decoder=False):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()

        self.model_embeddings_source = ModelEmbeddings(embed_size, vocab.src)
        # print(self.model_embeddings_source.parameter_counter)

        self.model_embeddings_target = ModelEmbeddings(embed_size, vocab.tgt)

        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        self.encoder = nn.LSTM(embed_size, hidden_size, bidirectional=True)
        self.decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size)

        self.h_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False)
        self.c_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False)
        self.att_projection = nn.Linear(hidden_size * 2,
                                        hidden_size,
                                        bias=False)
        self.combined_output_projection = nn.Linear(hidden_size * 2 +
                                                    hidden_size,
                                                    hidden_size,
                                                    bias=False)
        self.target_vocab_projection = nn.Linear(hidden_size,
                                                 len(vocab.tgt),
                                                 bias=False)
        self.dropout = nn.Dropout(self.dropout_rate)

        if not no_char_decoder:
            self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt)
        else:
            self.charDecoder = None
Example 4
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # default values
        self.encoder = None
        self.decoder = None
        self.h_projection = None
        self.c_projection = None
        self.att_projection = None
        self.combined_output_projection = None
        self.target_vocab_projection = None
        self.dropout = None

        ### YOUR CODE HERE (~8 Lines)
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=hidden_size,
                               num_layers=1,
                               bias=True,
                               bidirectional=True)
        self.decoder = nn.LSTMCell(input_size=embed_size + hidden_size,
                                   hidden_size=hidden_size,
                                   bias=True)
        self.h_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
        self.c_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
        self.att_projection = nn.Linear(2 * hidden_size,
                                        hidden_size,
                                        bias=False)
        self.combined_output_projection = nn.Linear(3 * hidden_size,
                                                    hidden_size,
                                                    bias=False)
        self.target_vocab_projection = nn.Linear(hidden_size,
                                                 len(self.vocab.tgt),
                                                 bias=False)
        self.dropout = nn.Dropout(p=dropout_rate)  # element-wise dropout on the combined output (Dropout2d would drop whole channels)
Example 5
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # default values
        self.encoder = None 
        self.decoder = None
        self.h_projection = None
        self.c_projection = None
        self.att_projection = None
        self.combined_output_projection = None
        self.target_vocab_projection = None
        self.dropout = None


        ### YOUR CODE HERE (~8 Lines)
        ### TODO - Initialize the following variables:
        ###     self.encoder (Bidirectional LSTM with bias)
        ###     self.decoder (LSTM Cell with bias)
        ###     self.h_projection (Linear Layer with no bias), called W_{h} in the PDF.
        ###     self.c_projection (Linear Layer with no bias), called W_{c} in the PDF.
        ###     self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF.
        ###     self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF.
        ###     self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF.
        ###     self.dropout (Dropout Layer)
        ###
        ### Use the following docs to properly initialize these variables:
        ###     LSTM:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
        ###     LSTM Cell:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell
        ###     Linear Layer:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
        ###     Dropout Layer:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=self.hidden_size, num_layers=1,
                               bias=True, batch_first=False,
                               dropout=self.dropout_rate, bidirectional=True)
        # decoder input: the target-word embedding concatenated with the previous
        # combined output o_{t-1} (size h), i.e. input feeding
        self.decoder = nn.LSTMCell(input_size=embed_size + self.hidden_size,
                                   hidden_size=self.hidden_size, bias=True)
        self.h_projection = nn.Linear(in_features=2 * self.hidden_size,
                                      out_features=self.hidden_size, bias=False)
        self.c_projection = nn.Linear(in_features=2 * self.hidden_size,
                                      out_features=self.hidden_size, bias=False)
        self.att_projection = nn.Linear(in_features=2 * self.hidden_size,
                                        out_features=self.hidden_size, bias=False)
        self.combined_output_projection = nn.Linear(in_features=3 * self.hidden_size,
                                                    out_features=self.hidden_size, bias=False)
        self.target_vocab_projection = nn.Linear(in_features=self.hidden_size,
                                                 out_features=len(self.vocab.tgt), bias=False)
        self.dropout = nn.Dropout(p=self.dropout_rate)
Example 6
 def __init__(self,
              embed_size,
              hidden_size,
              src_vocab: Vocabulary,
              dst_vocab: Vocabulary,
              device,
              dropout_rate=0.2):
     super(NMT, self).__init__()
     self.device = device
     self.model_embeddings = ModelEmbeddings(embed_size, src_vocab,
                                             dst_vocab)
     self.hidden_size = hidden_size
     self.src_vocab = src_vocab
     self.dst_vocab = dst_vocab
     self.dropout_rate = dropout_rate
     # The encoder is a bidirectional LSTM with bias
     self.encoder = nn.LSTM(input_size=embed_size,
                            hidden_size=hidden_size,
                            bidirectional=True,
                            dropout=dropout_rate,
                            bias=True)
     # The decoder is a unidirectional LSTM cell with bias
     self.decoder = nn.LSTMCell(
         input_size=embed_size + hidden_size,
         # Input feeding: the attention vector is concatenated with the next
         # time step's input, so alignment decisions also use past alignment information
         hidden_size=hidden_size,
         bias=True)
     # h_projection and c_projection initialize the decoder hidden state and cell from the source encodings
     self.h_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False)
     self.c_projection = nn.Linear(hidden_size * 2, hidden_size, bias=False)
     # att_projection maps the 2h-dim source encodings into the decoder's h-dim space (used for the context vector)
     self.att_projection = nn.Linear(hidden_size * 2,
                                     hidden_size,
                                     bias=False)
     # the attention vector is concatenated with the next time step's input and fed to the decoder
     self.combined_output_projection = nn.Linear(hidden_size * 2 +
                                                 hidden_size,
                                                 hidden_size,
                                                 bias=False)
     # maps the decoder output to the target vocabulary
     self.target_vocab_projection = nn.Linear(hidden_size,
                                              len(dst_vocab),
                                              bias=False)
     self.dropout = nn.Dropout(dropout_rate)
Example 7
    def __init__(self, word_embed_size, hidden_size, vocab, dropout_rate=0.3,
                 no_char_decoder=False):
        """ Init NMT Model.

        @param word_embed_size (int): Embedding size (dimensionality) of word
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()

        self.model_embeddings_source = ModelEmbeddings(word_embed_size, vocab.src)
        self.model_embeddings_target = ModelEmbeddings(word_embed_size, vocab.tgt)

        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # COPY OVER YOUR CODE FROM ASSIGNMENT 4

        self.encoder = nn.LSTM(word_embed_size, self.hidden_size, bidirectional=True,
                               bias=True)
        self.decoder = nn.LSTMCell(word_embed_size + self.hidden_size, self.hidden_size,
                                   bias=True)
        self.h_projection = nn.Linear(2 * self.hidden_size, self.hidden_size,
                                      bias=False)
        self.c_projection = nn.Linear(2 * self.hidden_size, self.hidden_size,
                                      bias=False)
        self.att_projection = nn.Linear(2 * self.hidden_size, self.hidden_size,
                                        bias=False)
        self.combined_output_projection = nn.Linear(3 * self.hidden_size,
                                                    self.hidden_size, bias=False)
        self.target_vocab_projection = nn.Linear(self.hidden_size,
                                                 len(vocab.tgt), bias=False)
        self.dropout = nn.Dropout(self.dropout_rate)

        # END YOUR CODE FROM ASSIGNMENT 4

        if not no_char_decoder:
            self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt)
        else:
            self.charDecoder = None
Example 8
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab) # src&tgt Embedding Init
        self.hidden_size = hidden_size # hidden size
        self.dropout_rate = dropout_rate # Dropout
        self.vocab = vocab # 

        # default values
        self.encoder = None 
        self.decoder = None
        self.h_projection = None
        self.c_projection = None
        self.att_projection = None
        self.combined_output_projection = None
        self.target_vocab_projection = None
        self.dropout = None


        ### YOUR CODE HERE (~8 Lines)
        ### TODO - Initialize the following variables:
        ###     self.encoder (Bidirectional LSTM with bias)
        ###     self.decoder (LSTM Cell with bias)
        ###     self.h_projection (Linear Layer with no bias), called W_{h} in the PDF.
        ###     self.c_projection (Linear Layer with no bias), called W_{c} in the PDF.
        ###     self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF.
        ###     self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF.
        ###     self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF.
        ###     self.dropout (Dropout Layer)
        ###
        ### Use the following docs to properly initialize these variables:
        ###     LSTM:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
        ###     LSTM Cell:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell
        ###     Linear Layer:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
        ###     Dropout Layer:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout
        self.encoder = nn.LSTM(embed_size, hidden_size, bias=True, bidirectional=True)
        self.decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size, bias=True)
        self.h_projection = nn.Linear(2*hidden_size, hidden_size, bias=False)
        self.c_projection = nn.Linear(2*hidden_size, hidden_size, bias=False)
        self.att_projection = nn.Linear(2*hidden_size, hidden_size, bias=False)
        self.combined_output_projection = nn.Linear(3*hidden_size, hidden_size, bias=False) 
        self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False)
        self.dropout = nn.Dropout(p=dropout_rate)
Example 9
def load_dev_data(embed_size=50, dev_perct=1., binary=False):
    M = ModelEmbeddings(embed_size=embed_size)
    X = [
        labeledTree.to_labeled_lines()[0][1].split(" ")
        for labeledTree in data['dev']
    ]
    Y = [labeledTree.to_labeled_lines()[0][0] for labeledTree in data['dev']]

    if binary:
        X = [x for (x, y) in list(zip(X, Y)) if y != 3]
        Y = [1 if y > 3 else 0 for y in Y if y != 3]

    dev_size = int(len(X) * dev_perct)
    X = X[:dev_size]
    Y = Y[:dev_size]
    X = M.embed_sentence(X)

    # dev data doesn't need to be augmented, so it is zipped here and ready
    # to be passed into model.forward()
    return list(zip(X, Y))
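
A brief usage sketch for load_dev_data (the embedding size, split fraction, and loop variable names are illustrative assumptions):

dev_data = load_dev_data(embed_size=50, dev_perct=0.5, binary=True)
for sentence_embedding, label in dev_data:
    # each (embedded sentence, label) pair is ready for the model's forward pass
    pass
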
Example 10
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for  documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # default values
        self.encoder = None
        self.decoder = None
        self.h_projection = None
        self.c_projection = None
        self.att_projection = None
        self.combined_output_projection = None
        self.target_vocab_projection = None
        self.dropout = None

        ###
        ### YOUR CODE HERE (~8 Lines)
        ###
        self.embed_size = embed_size
        self.encoder = nn.LSTM(self.embed_size, self.hidden_size, bias=True, bidirectional=True)  # bidirectional=True handles both directions internally, so hidden_size stays h
        self.decoder = nn.LSTMCell(self.hidden_size + embed_size, self.hidden_size, bias=True)  # input is the current word embedding concatenated with the previous combined output, hence size e + h
        self.h_projection = nn.Linear(2*self.hidden_size, self.hidden_size, bias=False)  # W_h
        self.c_projection = nn.Linear(2*self.hidden_size, self.hidden_size, bias=False)  # W_c
        self.att_projection = nn.Linear(2*self.hidden_size, self.hidden_size, bias=False)  # W_attProj: projects each 2h-dim encoder state down to h so it can be dotted with h^dec_t
        self.combined_output_projection = nn.Linear(3*self.hidden_size, self.hidden_size, bias=False)  # W_u
        self.target_vocab_projection = nn.Linear(self.hidden_size, len(self.vocab.tgt), bias=False)  # W_vocab: projects onto the target vocabulary
        self.dropout = nn.Dropout(self.dropout_rate)  # dropout layer
        ###
        ### END YOUR CODE
        ###
Example 11
    def __init__(self, vocab, embed_size, hidden_size, output_size, batch_size, dropout_rate=0.2):
        super(LSTMClassifier, self).__init__()
        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.batch_size = batch_size

        self.embedding = ModelEmbeddings(vocab, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, bidirectional=False)
        self.proj = nn.Linear(hidden_size, output_size, bias=True)
        self.dropout = nn.Dropout(dropout_rate)
        self.softmax = nn.LogSoftmax(dim=1)
        self.hidden = self.init_hidden()
Example 12
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2, no_char_decoder=False):
        """ Initalize the NMT Model.

        :param int embed_size: Embedding size (dimensionality)
        :param int hidden_size: Hidden Size (dimensionality)
        :param Vocab vocab: Vocabulary object containing src and tgt languages
                             See vocab.py for documentation.
        :param float dropout_rate: Dropout probability, for the attention combination layer
        """
        super(NMT, self).__init__()

        self.model_embeddings_source = ModelEmbeddings(embed_size, vocab.src)
        self.model_embeddings_target = ModelEmbeddings(embed_size, vocab.tgt)

        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        self.encoder = nn.LSTM(embed_size, hidden_size, bidirectional=True)
        self.decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size)

        # Need to feed in transpose of [h_enc(1)(<-) ; h_enc(m)(->)], and output is 1xh
        self.h_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
        # Need to feed in transpose of [c_enc(1)(<-); c_enc(m)(->)], and output is 1xh
        self.c_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)

        self.att_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)

        # Need to feed in transpose of u(t), and output is 1xh (v(t))
        self.combined_output_projection = nn.Linear(3 * hidden_size, hidden_size, bias=False)

        # Need to feed in transpose of o(t), and output is 1x|Vtg|
        self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False)

        self.dropout = nn.Dropout(dropout_rate)

        if not no_char_decoder:
            self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt)
        else:
            self.charDecoder = None
Example 13
 def __init__(self,
              vocab,
              embed_size,
              hidden_size,
              enc_bidir,
              attn_size,
              dropout=0.2):
     super(QGModel, self).__init__()
     self.vocab = vocab
     self.args = {
         'embed_size': embed_size,
         'hidden_size': hidden_size,
         'dropout': dropout,
         'enc_bidir': enc_bidir,
         'attn_size': attn_size
     }
     self.embeddings = ModelEmbeddings(embed_size, vocab)
     self.encoder = Encoder(embed_size, hidden_size, dropout, enc_bidir)
     self.decoder_init_hidden_proj = nn.Linear(self.encoder.hidden_size,
                                               hidden_size)
     self.decoder = Decoder(embed_size, hidden_size, attn_size,
                            len(vocab.tgt), dropout)
Example 14
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ 初始化 NMT 模型.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing both src and tgt
        @param dropout_rate (float): Dropout probability for the attention output
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # Initialize the layers
        # LSTM encoder: takes word embeddings as input, outputs hidden states
        self.encoder = nn.LSTM(embed_size,
                               self.hidden_size,
                               dropout=self.dropout_rate,
                               bidirectional=True)  # bidirectional encoder
        # LSTMCell decoder: input is the word embedding concatenated with the
        # previous combined output; stepped one time step at a time
        self.decoder = nn.LSTMCell(embed_size + self.hidden_size,
                                   self.hidden_size)
        self.h_projection = nn.Linear(self.hidden_size * 2,
                                      self.hidden_size,
                                      bias=False)  # project 2h -> h
        self.c_projection = nn.Linear(self.hidden_size * 2,
                                      self.hidden_size,
                                      bias=False)  # project 2h -> h
        self.att_projection = nn.Linear(self.hidden_size * 2,
                                        self.hidden_size,
                                        bias=False)  # project 2h -> h
        self.combined_output_projection = nn.Linear(self.hidden_size * 3,
                                                    self.hidden_size,
                                                    bias=False)  # project 3h -> h
        self.target_vocab_projection = nn.Linear(self.hidden_size,
                                                 len(self.vocab.tgt),
                                                 bias=False)  # project onto the target vocabulary
        self.dropout = nn.Dropout(p=self.dropout_rate)
Example 15
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate):
        super(Node2, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab
        print("vocab.num_labels: ", vocab.num_labels)
        self.num_labels = vocab.num_labels

        self.encoder0 = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            bias=True,
            # dropout=self.dropout_rate,
            bidirectional=True)

        self.encoder1 = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            bias=True,
            # dropout=self.dropout_rate,
            bidirectional=True)

        self.dropout1 = nn.Dropout()

        self.attention_projection = nn.Linear(in_features=2 * hidden_size,
                                              out_features=self.num_labels,
                                              bias=False)
        self.attention_softmax = nn.Softmax(dim=0)
        #         self.labels_projection = nn.Linear(in_features=2*hidden_size,
        #                                           out_features=1,
        #                                           bias=False)
        self.labels_projection = nn.Linear(in_features=2 * hidden_size,
                                           out_features=100,
                                           bias=False)

        self.labels_projection2 = nn.Linear(in_features=100,
                                            out_features=1,
                                            bias=False)
Example 16
    def __init__(self, vocab, embed_size, embeddings, sim_scale=5):
        """
        @param vocab (Vocab): vocab object
        @param embed_size (int): embedding size
        @param embeddings (torch.tensor (len(vocab), embed_dim)): pretrained word embeddings
        @param sim_scale (float): scale the sim score by this scalar
        """
        super(AvgSim, self).__init__()
        self.pretrained_embeddings = embeddings
        self.embeddings = ModelEmbeddings(vocab, embed_size, self.pretrained_embeddings)
        self.vocab = vocab
        self.sim_scale = sim_scale

        self.scoring_fn = nn.CosineSimilarity(dim=-1)
Example 17
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for  documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # default values
        self.encoder = None
        self.decoder = None
        self.h_projection = None
        self.c_projection = None
        self.att_projection = None
        self.combined_output_projection = None
        self.target_vocab_projection = None
        self.dropout = None
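
        # The snippet above stops at the placeholder values. Below is a hedged
        # completion mirroring the layer shapes used by the other examples in this
        # listing (bidirectional encoder, input-feeding decoder, bias-free
        # projections); it is a sketch, not the original author's code.
        self.encoder = nn.LSTM(embed_size, hidden_size, bias=True, bidirectional=True)
        self.decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size, bias=True)
        self.h_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
        self.c_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
        self.att_projection = nn.Linear(2 * hidden_size, hidden_size, bias=False)
        self.combined_output_projection = nn.Linear(3 * hidden_size, hidden_size, bias=False)
        self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False)
        self.dropout = nn.Dropout(p=dropout_rate)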
Example 18
 def __init__(self,
              input_size,
              hidden_size,
              vocab,
              fasttext_model,
              device='cpu'):
     super(LSTMModel, self).__init__()
     self.hidden_size = hidden_size
     self.input_size = input_size
     self.vocab = vocab
     self.embedding = ModelEmbeddings(input_size, vocab, fasttext_model,
                                      device)
     self.lstm = nn.LSTM(input_size, hidden_size, bidirectional=True)
     self.linear = nn.Linear(self.hidden_size * 2,
                             self.hidden_size,
                             bias=True)
     self.linear2 = nn.Linear(self.hidden_size, self.hidden_size, bias=True)
     self.attention = Attention(self.hidden_size)
Example 19
    def __init__(self, vocab, embed_size, embeddings, hidden_size,
                 dropout_rate):
        """
        @param vocab (Vocab): vocab object
        @param embed_size (int): embedding size
        @param embeddings (torch.tensor (len(vocab), embed_dim)): pretrained word embeddings
        @param hidden_size (int): hidden size
        @param dropout_rate (float): dropout prob
        """
        super(NeuralModel, self).__init__()
        self.pretrained_embeddings = embeddings
        self.embeddings = ModelEmbeddings(vocab, embed_size,
                                          self.pretrained_embeddings)
        self.vocab = vocab
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate

        self.h_projection = nn.Linear(self.hidden_size * 2,
                                      self.hidden_size,
                                      bias=False)
        self.c_projection = nn.Linear(self.hidden_size * 2,
                                      self.hidden_size,
                                      bias=False)
        self.att_projection = nn.Linear(self.hidden_size * 2,
                                        self.hidden_size,
                                        bias=False)
        self.combined_out_projection = nn.Linear(self.hidden_size * 3,
                                                 self.hidden_size,
                                                 bias=False)
        self.vocab_projection = nn.Linear(self.hidden_size,
                                          len(self.vocab),
                                          bias=False)

        self.dropout = nn.Dropout(self.dropout_rate)

        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=self.hidden_size,
                               bias=True,
                               bidirectional=True)

        self.decoder = nn.LSTMCell(input_size=embed_size + self.hidden_size,
                                   hidden_size=self.hidden_size,
                                   bias=True)
Example 20
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate):
        super(Node, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab
        print("vocab.num_labels: ", vocab.num_labels)
        self.num_labels = vocab.num_labels
        
#         self.encoder = nn.LSTM(input_size=embed_size,
#                                hidden_size=hidden_size, 
#                                bias=True, 
#                                # dropout=self.dropout_rate,
#                                bidirectional=True)
        
        self.first_bilstm = BiLSTM(embed_size=embed_size,
                                    hidden_size=hidden_size,
                                    dropout_rate=dropout_rate,
                                    vocab=vocab)
        self.second_bilstm = BiLSTM(embed_size=embed_size,
                                    hidden_size=hidden_size,
                                    dropout_rate=dropout_rate,
                                    vocab=vocab)
Example 21
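# Module header assumed by the class below (a plausible reconstruction; the listing
# omits the top of the file, so the exact imports are assumptions):
from collections import namedtuple
from typing import List, Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from model_embeddings import ModelEmbeddings

# Hypothesis pairs a decoded sentence with its log-likelihood during beam search
Hypothesis = namedtuple('Hypothesis', ['value', 'score'])
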
class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidirectional LSTM Encoder
        - Unidirectional LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size, the size of hidden states (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # default values
        self.encoder = None
        self.decoder = None
        self.h_projection = None
        self.c_projection = None
        self.att_projection = None
        self.combined_output_projection = None
        self.target_vocab_projection = None
        self.dropout = None
        # For sanity check only, not relevant to implementation
        self.gen_sanity_check = False
        self.counter = 0

        ### YOUR CODE HERE (~8 Lines)
        ### TODO - Initialize the following variables:
        ###     self.encoder (Bidirectional LSTM with bias)
        ###     self.decoder (LSTM Cell with bias)
        ###     self.h_projection (Linear Layer with no bias), called W_{h} in the PDF.
        ###     self.c_projection (Linear Layer with no bias), called W_{c} in the PDF.
        ###     self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF.
        ###     self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF.
        ###     self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF.
        ###     self.dropout (Dropout Layer)
        ###
        ### Use the following docs to properly initialize these variables:
        ###     LSTM:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
        ###     LSTM Cell:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell
        ###     Linear Layer:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
        ###     Dropout Layer:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout

        self.encoder = nn.LSTM(embed_size,
                               self.hidden_size,
                               bidirectional=True)
        self.decoder = nn.LSTMCell(self.hidden_size + embed_size,
                                   self.hidden_size)
        self.h_projection = nn.Linear(2 * self.hidden_size,
                                      self.hidden_size,
                                      bias=False)
        self.c_projection = nn.Linear(2 * self.hidden_size,
                                      self.hidden_size,
                                      bias=False)
        self.att_projection = nn.Linear(2 * self.hidden_size,
                                        self.hidden_size,
                                        bias=False)
        self.combined_output_projection = nn.Linear(3 * self.hidden_size,
                                                    self.hidden_size,
                                                    bias=False)
        self.target_vocab_projection = nn.Linear(self.hidden_size,
                                                 len(self.vocab.tgt),
                                                 bias=False)
        self.dropout = nn.Dropout(self.dropout_rate)

        ### END YOUR CODE

    def forward(self, source: List[List[str]],
                target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`

        @returns scores (Tensor): a variable/tensor of shape (b, ) representing the
                                    log-likelihood of generating the gold-standard target sentence for
                                    each example in the input batch. Here b = batch size.
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Convert list of lists into tensors
        source_padded = self.vocab.src.to_input_tensor(
            source, device=self.device)  # Tensor: (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(
            target, device=self.device)  # Tensor: (tgt_len, b)

        ###     Run the network forward:
        ###     1. Apply the encoder to `source_padded` by calling `self.encode()`
        ###     2. Generate sentence masks for `source_padded` by calling `self.generate_sent_masks()`
        ###     3. Apply the decoder to compute combined-output by calling `self.decode()`
        ###     4. Compute log probability distribution over the target vocabulary using the
        ###        combined_outputs returned by the `self.decode()` function.

        enc_hiddens, dec_init_state = self.encode(source_padded,
                                                  source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state,
                                       target_padded)
        P = F.log_softmax(self.target_vocab_projection(combined_outputs),
                          dim=-1)

        # Zero out probabilities for which we have nothing in the target text
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()

        # Compute log probability of generating true target words
        target_gold_words_log_prob = torch.gather(
            P, index=target_padded[1:].unsqueeze(-1),
            dim=-1).squeeze(-1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores
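
    # Usage sketch (illustrative; `model`, `src_sents`, and `tgt_sents` are assumed
    # training-loop variables, not defined in this snippet). forward() returns
    # per-sentence log-likelihoods, so a training step typically minimizes their
    # negative sum:
    #
    #     example_losses = -model(src_sents, tgt_sents)  # shape (b,)
    #     batch_loss = example_losses.sum()
    #     batch_loss.backward()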

    def encode(
        self, source_padded: torch.Tensor, source_lengths: List[int]
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
            Additionally, take the final states of the encoder and project them to obtain initial states for decoder.

        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where
                                        b = batch_size, src_len = maximum source sentence length. Note that
                                       these have already been sorted in order of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch
        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
                                        b = batch size, src_len = maximum source sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
                                                hidden state and cell.
        """
        enc_hiddens, dec_init_state = None, None

        ### YOUR CODE HERE (~ 8 Lines)
        ### TODO:
        ###     1. Construct Tensor `X` of source sentences with shape (src_len, b, e) using the source model embeddings.
        ###         src_len = maximum source sentence length, b = batch size, e = embedding size. Note
        ###         that there is no initial hidden state or cell for the decoder.
        ###     2. Compute `enc_hiddens`, `last_hidden`, `last_cell` by applying the encoder to `X`.
        ###         - Before you can apply the encoder, you need to apply the `pack_padded_sequence` function to X.
        ###         - After you apply the encoder, you need to apply the `pad_packed_sequence` function to enc_hiddens.
        ###         - Note that the shape of the tensor returned by the encoder is (src_len, b, h*2) and we want to
        ###           return a tensor of shape (b, src_len, h*2) as `enc_hiddens`.
        ###     3. Compute `dec_init_state` = (init_decoder_hidden, init_decoder_cell):
        ###         - `init_decoder_hidden`:
        ###             `last_hidden` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
        ###             Apply the h_projection layer to this in order to compute init_decoder_hidden.
        ###             This is h_0^{dec} in the PDF. Here b = batch size, h = hidden size
        ###         - `init_decoder_cell`:
        ###             `last_cell` is a tensor shape (2, b, h). The first dimension corresponds to forwards and backwards.
        ###             Concatenate the forwards and backwards tensors to obtain a tensor shape (b, 2*h).
        ###             Apply the c_projection layer to this in order to compute init_decoder_cell.
        ###             This is c_0^{dec} in the PDF. Here b = batch size, h = hidden size
        ###
        ### See the following docs, as you may need to use some of the following functions in your implementation:
        ###     Pack the padded sequence X before passing to the encoder:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pack_padded_sequence
        ###     Pad the packed sequence, enc_hiddens, returned by the encoder:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.utils.rnn.pad_packed_sequence
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tensor Permute:
        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.permute
        X = self.model_embeddings.source(source_padded)
        X = pack_padded_sequence(X, source_lengths)

        enc_hiddens, (last_hidden, last_cell) = self.encoder(X)
        enc_hiddens, _ = pad_packed_sequence(enc_hiddens)
        enc_hiddens = enc_hiddens.permute(1, 0, 2)

        init_decoder_hidden = self.h_projection(
            torch.cat((last_hidden[0], last_hidden[1]), 1))
        init_decoder_cell = self.c_projection(
            torch.cat((last_cell[0], last_cell[1]), 1))

        dec_init_state = (init_decoder_hidden, init_decoder_cell)
        # print(enc_hiddens.shape, init_decoder_cell.shape, init_decoder_hidden.shape)
        ### END YOUR CODE

        return enc_hiddens, dec_init_state

    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
               dec_init_state: Tuple[torch.Tensor, torch.Tensor],
               target_padded: torch.Tensor) -> torch.Tensor:
        """Compute combined output vectors for a batch.

        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
                                     b = batch size, src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where
                                       tgt_len = maximum target sentence length, b = batch size.

        @returns combined_outputs (Tensor): combined output tensor  (tgt_len, b,  h), where
                                        tgt_len = maximum target sentence length, b = batch_size,  h = hidden size
        """
        # Chop off the <END> token for max-length sentences.
        target_padded = target_padded[:-1]

        # Initialize the decoder state (hidden and cell)
        dec_state = dec_init_state

        # Initialize previous combined output vector o_{t-1} as zero
        batch_size = enc_hiddens.size(0)
        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

        # Initialize a list we will use to collect the combined output o_t on each step
        combined_outputs = []

        ### YOUR CODE HERE (~9 Lines)
        ### TODO:
        ###     1. Apply the attention projection layer to `enc_hiddens` to obtain `enc_hiddens_proj`,
        ###         which should be shape (b, src_len, h),
        ###         where b = batch size, src_len = maximum source length, h = hidden size.
        ###         This is applying W_{attProj} to h^enc, as described in the PDF.
        ###     2. Construct tensor `Y` of target sentences with shape (tgt_len, b, e) using the target model embeddings.
        ###         where tgt_len = maximum target sentence length, b = batch size, e = embedding size.
        ###     3. Use the torch.split function to iterate over the time dimension of Y.
        ###         Within the loop, this will give you Y_t of shape (1, b, e) where b = batch size, e = embedding size.
        ###             - Squeeze Y_t into a tensor of dimension (b, e).
        ###             - Construct Ybar_t by concatenating Y_t with o_prev on their last dimension
        ###             - Use the step function to compute the Decoder's next (cell, state) values
        ###               as well as the new combined output o_t.
        ###             - Append o_t to combined_outputs
        ###             - Update o_prev to the new o_t.
        ###     4. Use torch.stack to convert combined_outputs from a list length tgt_len of
        ###         tensors shape (b, h), to a single tensor shape (tgt_len, b, h)
        ###         where tgt_len = maximum target sentence length, b = batch size, h = hidden size.
        ###
        ### Note:
        ###    - When using the squeeze() function make sure to specify the dimension you want to squeeze
        ###      over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
        ###
        ### You may find some of these functions useful:
        ###     Zeros Tensor:
        ###         https://pytorch.org/docs/stable/torch.html#torch.zeros
        ###     Tensor Splitting (iteration):
        ###         https://pytorch.org/docs/stable/torch.html#torch.split
        ###     Tensor Dimension Squeezing:
        ###         https://pytorch.org/docs/stable/torch.html#torch.squeeze
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tensor Stacking:
        ###         https://pytorch.org/docs/stable/torch.html#torch.stack

        enc_hiddens_proj = self.att_projection(enc_hiddens)
        Y = self.model_embeddings.target(target_padded)
        for i in torch.split(Y, 1, dim=0):
            Y_t = i.squeeze(dim=0)
            Ybar_t = torch.cat((Y_t, o_prev), 1)
            dec_state, o_t, e_t = self.step(Ybar_t, dec_state, enc_hiddens,
                                            enc_hiddens_proj, enc_masks)
            combined_outputs.append(o_t)
            o_prev = o_t
        combined_outputs = torch.stack(combined_outputs, dim=0)

        ### END YOUR CODE

        return combined_outputs

    def step(
            self, Ybar_t: torch.Tensor, dec_state: Tuple[torch.Tensor,
                                                         torch.Tensor],
            enc_hiddens: torch.Tensor, enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor
    ) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length.

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is the attention scores distribution.
                                Note: You will not use this outside of this function.
                                      We are simply returning this value so that we can sanity check
                                      your implementation.
        """

        combined_output = None

        ### YOUR CODE HERE (~3 Lines)
        ### TODO:
        ###     1. Apply the decoder to `Ybar_t` and `dec_state`to obtain the new dec_state.
        ###     2. Split dec_state into its two parts (dec_hidden, dec_cell)
        ###     3. Compute the attention scores e_t, a Tensor shape (b, src_len).
        ###        Note: b = batch_size, src_len = maximum source length, h = hidden size.
        ###
        ###       Hints:
        ###         - dec_hidden is shape (b, h) and corresponds to h^dec_t in the PDF (batched)
        ###         - enc_hiddens_proj is shape (b, src_len, h) and corresponds to W_{attProj} h^enc (batched).
        ###         - Use batched matrix multiplication (torch.bmm) to compute e_t (be careful about the input/ output shapes!)
        ###         - To get the tensors into the right shapes for bmm, you will need to do some squeezing and unsqueezing.
        ###         - When using the squeeze() function make sure to specify the dimension you want to squeeze
        ###             over. Otherwise, you will remove the batch dimension accidentally, if batch_size = 1.
        ###
        ### Use the following docs to implement this functionality:
        ###     Batch Multiplication:
        ###        https://pytorch.org/docs/stable/torch.html#torch.bmm
        ###     Tensor Unsqueeze:
        ###         https://pytorch.org/docs/stable/torch.html#torch.unsqueeze
        ###     Tensor Squeeze:
        ###         https://pytorch.org/docs/stable/torch.html#torch.squeeze

        dec_state = self.decoder(Ybar_t, dec_state)
        (dec_hidden, dec_cell) = dec_state
        e_t = torch.squeeze(torch.bmm(enc_hiddens_proj,
                                      torch.unsqueeze(dec_hidden, dim=2)),
                            dim=2)

        ### END YOUR CODE

        # Set e_t to -inf where enc_masks has 1
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

        ### YOUR CODE HERE (~6 Lines)
        ### TODO:
        ###     1. Apply softmax to e_t to yield alpha_t
        ###     2. Use batched matrix multiplication between alpha_t and enc_hiddens to obtain the
        ###         attention output vector, a_t.
        ###           - alpha_t is shape (b, src_len)
        ###           - enc_hiddens is shape (b, src_len, 2h)
        ###           - a_t should be shape (b, 2h)
        ###           - You will need to do some squeezing and unsqueezing.
        ###     Note: b = batch size, src_len = maximum source length, h = hidden size.
        ###
        ###     3. Concatenate dec_hidden with a_t to compute tensor U_t
        ###     4. Apply the combined output projection layer to U_t to compute tensor V_t
        ###     5. Compute tensor O_t by first applying the Tanh function and then the dropout layer.
        ###
        ### Use the following docs to implement this functionality:
        ###     Softmax:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.functional.softmax
        ###     Batch Multiplication:
        ###        https://pytorch.org/docs/stable/torch.html#torch.bmm
        ###     Tensor View:
        ###         https://pytorch.org/docs/stable/tensors.html#torch.Tensor.view
        ###     Tensor Concatenation:
        ###         https://pytorch.org/docs/stable/torch.html#torch.cat
        ###     Tanh:
        ###         https://pytorch.org/docs/stable/torch.html#torch.tanh

        #print(e_t.shape)
        alpha_t = F.softmax(e_t, dim=1)
        a_t = torch.bmm(alpha_t.unsqueeze(dim=1), enc_hiddens).squeeze(1)
        U_t = torch.cat((dec_hidden, a_t), dim=1)
        V_t = self.combined_output_projection(U_t)
        O_t = self.dropout(torch.tanh(V_t))

        ### END YOUR CODE

        combined_output = O_t
        return dec_state, combined_output, e_t
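
    # Shape recap for step(), following the hints above: enc_hiddens_proj is
    # (b, src_len, h) and dec_hidden is (b, h), so
    #     e_t     = bmm((b, src_len, h), (b, h, 1)) squeezed       -> (b, src_len)
    #     alpha_t = softmax(e_t)                                   -> (b, src_len)
    #     a_t     = bmm((b, 1, src_len), (b, src_len, 2h)) squeezed -> (b, 2h)
    #     U_t = [dec_hidden; a_t] -> (b, 3h);  V_t = W_u U_t -> (b, h);  O_t = dropout(tanh(V_t))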

    def generate_sent_masks(self, enc_hiddens: torch.Tensor,
                            source_lengths: List[int]) -> torch.Tensor:
        """ Generate sentence masks for encoder hidden states.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size.
        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.

        @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len),
                                    where b = batch size, src_len = max source length.
        """
        enc_masks = torch.zeros(enc_hiddens.size(0),
                                enc_hiddens.size(1),
                                dtype=torch.float)
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1
        return enc_masks.to(self.device)
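
# Standalone sketch (not part of the class above) with hypothetical lengths, showing the
# masks produced by generate_sent_masks(): positions past each sentence's true length are
# set to 1 so the attention step can fill them with -inf.
import torch

source_lengths = [3, 1]
src_len = 4
enc_masks = torch.zeros(len(source_lengths), src_len, dtype=torch.float)
for e_id, l in enumerate(source_lengths):
    enc_masks[e_id, l:] = 1
# enc_masks == tensor([[0., 0., 0., 1.],
#                      [0., 1., 1., 1.]])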

    def beam_search(self,
                    src_sent: List[str],
                    beam_size: int = 5,
                    max_decoding_time_step: int = 70) -> List[Hypothesis]:
        """ Given a single source sentence, perform beam search, yielding translations in the target language.
        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
        @returns hypotheses (List[Hypothesis]): a list of hypotheses; each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var,
                                                  [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        eos_id = self.vocab.tgt['</s>']

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses),
                                 dtype=torch.float,
                                 device=self.device)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses
                  ) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            exp_src_encodings = src_encodings.expand(hyp_num,
                                                     src_encodings.size(1),
                                                     src_encodings.size(2))

            exp_src_encodings_att_linear = src_encodings_att_linear.expand(
                hyp_num, src_encodings_att_linear.size(1),
                src_encodings_att_linear.size(2))

            y_tm1 = torch.tensor(
                [self.vocab.tgt[hyp[-1]] for hyp in hypotheses],
                dtype=torch.long,
                device=self.device)
            y_t_embed = self.model_embeddings.target(y_tm1)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            (h_t, cell_t), att_t, _ = self.step(x,
                                                h_tm1,
                                                exp_src_encodings,
                                                exp_src_encodings_att_linear,
                                                enc_masks=None)

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t),
                                    dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            continuing_hyp_scores = (
                hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(
                continuing_hyp_scores, k=live_hyp_num)

            prev_hyp_ids = top_cand_hyp_pos // len(self.vocab.tgt)
            hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt)

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(
                    prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    completed_hypotheses.append(
                        Hypothesis(value=new_hyp_sent[1:-1],
                                   score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            live_hyp_ids = torch.tensor(live_hyp_ids,
                                        dtype=torch.long,
                                        device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores,
                                      dtype=torch.float,
                                      device=self.device)

        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(
                Hypothesis(value=hypotheses[0][1:],
                           score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

        return completed_hypotheses
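
# Standalone sketch (toy numbers, not part of the class above) of how beam_search splits a
# flattened top-k index back into (previous hypothesis id, target word id): floor-divide by
# the target vocabulary size and take the remainder.
import torch

vocab_size = 10
top_cand_hyp_pos = torch.tensor([23, 7, 15])     # indices into a (hyp_num * vocab_size,) score vector
prev_hyp_ids = top_cand_hyp_pos // vocab_size    # tensor([2, 0, 1])
hyp_word_ids = top_cand_hyp_pos % vocab_size     # tensor([3, 7, 5])
assert prev_hyp_ids.tolist() == [2, 0, 1] and hyp_word_ids.tolist() == [3, 7, 5]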

    @property
    def device(self) -> torch.device:
        """ Determine which device to place the Tensors upon, CPU or GPU.
        """
        return self.model_embeddings.source.weight.device

    @staticmethod
    def load(model_path: str):
        """ Load the model from a file.
        @param model_path (str): path to model
        """
        params = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
        args = params['args']
        model = NMT(vocab=params['vocab'], **args)
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ Save the odel to a file.
        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args':
            dict(embed_size=self.model_embeddings.embed_size,
                 hidden_size=self.hidden_size,
                 dropout_rate=self.dropout_rate),
            'vocab':
            self.vocab,
            'state_dict':
            self.state_dict()
        }

        torch.save(params, path)
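
# Standalone sketch of the attention shapes used in step(), with hypothetical sizes
# (b = 2, src_len = 5, h = 4); the tensors are random stand-ins, not model outputs.
import torch
import torch.nn.functional as F

b, src_len, h = 2, 5, 4
dec_hidden = torch.randn(b, h)                   # decoder hidden state at time t
enc_hiddens = torch.randn(b, src_len, 2 * h)     # encoder hidden states
enc_hiddens_proj = torch.randn(b, src_len, h)    # W_attProj applied to enc_hiddens

e_t = torch.bmm(enc_hiddens_proj, dec_hidden.unsqueeze(2)).squeeze(2)   # (b, src_len)
alpha_t = F.softmax(e_t, dim=1)                                         # (b, src_len)
a_t = torch.bmm(alpha_t.unsqueeze(1), enc_hiddens).squeeze(1)           # (b, 2h)
U_t = torch.cat((dec_hidden, a_t), dim=1)                               # (b, 3h)
assert e_t.shape == (b, src_len) and a_t.shape == (b, 2 * h) and U_t.shape == (b, 3 * h)
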
Esempio n. 22
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size, the size of hidden states (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # default values
        self.encoder = None
        self.decoder = None
        self.h_projection = None
        self.c_projection = None
        self.att_projection = None
        self.combined_output_projection = None
        self.target_vocab_projection = None
        self.dropout = None
        # For sanity check only, not relevant to implementation
        self.gen_sanity_check = False
        self.counter = 0

        ### YOUR CODE HERE (~8 Lines)
        ### TODO - Initialize the following variables:
        ###     self.encoder (Bidirectional LSTM with bias)
        ###     self.decoder (LSTM Cell with bias)
        ###     self.h_projection (Linear Layer with no bias), called W_{h} in the PDF.
        ###     self.c_projection (Linear Layer with no bias), called W_{c} in the PDF.
        ###     self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF.
        ###     self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF.
        ###     self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF.
        ###     self.dropout (Dropout Layer)
        ###
        ### Use the following docs to properly initialize these variables:
        ###     LSTM:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
        ###     LSTM Cell:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell
        ###     Linear Layer:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
        ###     Dropout Layer:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout
        self.encoder = nn.LSTM(embed_size,
                               self.hidden_size,
                               bidirectional=True,
                               bias=True)
        self.decoder = nn.LSTMCell(embed_size + self.hidden_size,
                                   self.hidden_size,
                                   bias=True)
        self.h_projection = nn.Linear(
            self.hidden_size * 2, self.hidden_size,
            bias=False)  # The final vector of hidden states
        self.c_projection = nn.Linear(
            self.hidden_size * 2, self.hidden_size,
            bias=False)  # The final vector of cell states
        self.att_projection = nn.Linear(self.hidden_size * 2,
                                        self.hidden_size,
                                        bias=False)
        self.combined_output_projection = nn.Linear(self.hidden_size * 3,
                                                    self.hidden_size,
                                                    bias=False)
        self.target_vocab_projection = nn.Linear(
            self.hidden_size, len(self.vocab.tgt), bias=False
        )  # projects the hidden state to vocabulary logits; no separate output embedding is needed
        self.dropout = nn.Dropout(self.dropout_rate)
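
# Standalone sketch, with hypothetical sizes, of how the bidirectional encoder's final
# states would be concatenated and fed through h_projection to initialize the decoder.
import torch
import torch.nn as nn

h, b = 4, 3
h_projection = nn.Linear(2 * h, h, bias=False)
last_hidden = torch.randn(2, b, h)   # (num_directions, b, h) from a one-layer BiLSTM
init_decoder_hidden = h_projection(torch.cat((last_hidden[0], last_hidden[1]), dim=1))
assert init_decoder_hidden.shape == (b, h)
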
Esempio n. 23
class NMT(nn.Module):
    """ Simple Neural Machine Translation Model:
        - Bidirectional LSTM Encoder
        - Unidirectional LSTM Decoder
        - Global Attention Model (Luong, et al. 2015)
    """
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # default values
        self.encoder = nn.LSTM(embed_size, hidden_size, bias=True, bidirectional=True) 
        self.decoder = nn.LSTMCell(embed_size + hidden_size, hidden_size, bias=True)
        self.h_projection = nn.Linear(2*hidden_size, hidden_size, bias=False)
        self.c_projection = nn.Linear(2*hidden_size, hidden_size, bias=False)
        self.att_projection = nn.Linear(2*hidden_size, hidden_size, bias=False)
        self.combined_output_projection = nn.Linear(3*hidden_size, hidden_size, bias=False)
        self.target_vocab_projection = nn.Linear(hidden_size, len(vocab.tgt), bias=False)
        self.dropout = nn.Dropout(dropout_rate)


    def forward(self, source: List[List[str]], target: List[List[str]]) -> torch.Tensor:
        """ Take a mini-batch of source and target sentences, compute the log-likelihood of
        target sentences under the language models learned by the NMT system.

        @param source (List[List[str]]): list of source sentence tokens
        @param target (List[List[str]]): list of target sentence tokens, wrapped by `<s>` and `</s>`

        @returns scores (Tensor): a variable/tensor of shape (b, ) representing the
                                    log-likelihood of generating the gold-standard target sentence for
                                    each example in the input batch. Here b = batch size.
        """
        # Compute sentence lengths
        source_lengths = [len(s) for s in source]

        # Convert list of lists into tensors
        source_padded = self.vocab.src.to_input_tensor(source, device=self.device)   # Tensor: (src_len, b)
        target_padded = self.vocab.tgt.to_input_tensor(target, device=self.device)   # Tensor: (tgt_len, b)

        ###     Run the network forward:
        ###     1. Apply the encoder to `source_padded` by calling `self.encode()`
        ###     2. Generate sentence masks for `source_padded` by calling `self.generate_sent_masks()`
        ###     3. Apply the decoder to compute combined-output by calling `self.decode()`
        ###     4. Compute log probability distribution over the target vocabulary using the
        ###        combined_outputs returned by the `self.decode()` function.

        enc_hiddens, dec_init_state = self.encode(source_padded, source_lengths)
        enc_masks = self.generate_sent_masks(enc_hiddens, source_lengths)
        combined_outputs = self.decode(enc_hiddens, enc_masks, dec_init_state, target_padded)
        P = F.log_softmax(self.target_vocab_projection(combined_outputs), dim=-1)

        # Zero out probabilities for which we have nothing in the target text
        target_masks = (target_padded != self.vocab.tgt['<pad>']).float()
        
        # Compute log probability of generating true target words
        target_gold_words_log_prob = torch.gather(P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:]
        scores = target_gold_words_log_prob.sum(dim=0)
        return scores
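
# Standalone sketch (toy sizes, random tensors) of the gather-based log-likelihood above:
# pick each gold word's log-probability at every step, zero out padding, sum over time.
import torch

tgt_len, b, V = 5, 2, 7
P = torch.log_softmax(torch.randn(tgt_len - 1, b, V), dim=-1)   # one prediction per step except the last
target_padded = torch.randint(1, V, (tgt_len, b))               # id 0 stands in for a hypothetical <pad>
target_masks = (target_padded != 0).float()
gold_log_prob = torch.gather(P, index=target_padded[1:].unsqueeze(-1), dim=-1).squeeze(-1) * target_masks[1:]
scores = gold_log_prob.sum(dim=0)                               # (b,) log-likelihood per sentence
assert scores.shape == (b,)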


    def encode(self, source_padded: torch.Tensor, source_lengths: List[int]) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        """ Apply the encoder to source sentences to obtain encoder hidden states.
            Additionally, take the final states of the encoder and project them to obtain initial states for decoder.

        @param source_padded (Tensor): Tensor of padded source sentences with shape (src_len, b), where
                                        b = batch_size, src_len = maximum source sentence length. Note that 
                                       these have already been sorted in order of longest to shortest sentence.
        @param source_lengths (List[int]): List of actual lengths for each of the source sentences in the batch
        @returns enc_hiddens (Tensor): Tensor of hidden units with shape (b, src_len, h*2), where
                                        b = batch size, src_len = maximum source sentence length, h = hidden size.
        @returns dec_init_state (tuple(Tensor, Tensor)): Tuple of tensors representing the decoder's initial
                                                hidden state and cell.
        """
        enc_hiddens, dec_init_state = None, None

        X = self.model_embeddings.source(source_padded)
        packed_sequence = torch.nn.utils.rnn.pack_padded_sequence(X, source_lengths)
        enc_hiddens, (last_hidden, last_cell) = self.encoder(packed_sequence)
        enc_hiddens = torch.nn.utils.rnn.pad_packed_sequence(enc_hiddens)[0]
        enc_hiddens = enc_hiddens.permute(1,0,2)
        concat_last_hidden = torch.cat((last_hidden[0], last_hidden[1]), 1)
        concat_last_cell = torch.cat((last_cell[0], last_cell[1]), 1)
        dec_init_state = (self.h_projection(concat_last_hidden), self.c_projection(concat_last_cell))
        return enc_hiddens, dec_init_state
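
# Standalone sketch (toy sizes, batch already sorted longest-to-shortest) mirroring encode():
# pad -> pack -> BiLSTM -> unpack, then permute to (b, src_len, 2h).
import torch
import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

e, h, src_len = 5, 4, 6
source_lengths = [6, 3]
X = torch.randn(src_len, len(source_lengths), e)        # (src_len, b, e)
encoder = nn.LSTM(e, h, bidirectional=True)
packed = pack_padded_sequence(X, source_lengths)
enc_hiddens, (last_hidden, last_cell) = encoder(packed)
enc_hiddens = pad_packed_sequence(enc_hiddens)[0].permute(1, 0, 2)
assert enc_hiddens.shape == (len(source_lengths), src_len, 2 * h)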


    def decode(self, enc_hiddens: torch.Tensor, enc_masks: torch.Tensor,
                dec_init_state: Tuple[torch.Tensor, torch.Tensor], target_padded: torch.Tensor) -> torch.Tensor:
        """Compute combined output vectors for a batch.

        @param enc_hiddens (Tensor): Hidden states (b, src_len, h*2), where
                                     b = batch size, src_len = maximum source sentence length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks (b, src_len), where
                                     b = batch size, src_len = maximum source sentence length.
        @param dec_init_state (tuple(Tensor, Tensor)): Initial state and cell for decoder
        @param target_padded (Tensor): Gold-standard padded target sentences (tgt_len, b), where
                                       tgt_len = maximum target sentence length, b = batch size. 

        @returns combined_outputs (Tensor): combined output tensor  (tgt_len, b,  h), where
                                        tgt_len = maximum target sentence length, b = batch_size,  h = hidden size
        """
        # Chop off the <END> token for max length sentences.
        target_padded = target_padded[:-1]

        # Initialize the decoder state (hidden and cell)
        dec_state = dec_init_state

        # Initialize previous combined output vector o_{t-1} as zero
        batch_size = enc_hiddens.size(0)
        o_prev = torch.zeros(batch_size, self.hidden_size, device=self.device)

        # Initialize a list we will use to collect the combined output o_t on each step
        combined_outputs = []


        enc_hiddens_proj = self.att_projection(enc_hiddens)
        Y = self.model_embeddings.target(target_padded)

        for Y_t in torch.split(Y, 1):
            Y_t = torch.squeeze(Y_t, dim=0)
            Ybar_t = torch.cat((Y_t, o_prev), 1)
            dec_state, o_t, et = self.step(Ybar_t, dec_state, enc_hiddens, enc_hiddens_proj, enc_masks)
            combined_outputs.append(o_t)
            o_prev = o_t

        combined_outputs = torch.stack(combined_outputs)

        return combined_outputs
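
# Standalone sketch (hypothetical sizes) of the per-step split used in decode(): each Y_t
# from torch.split(Y, 1) has shape (1, b, e); squeezing dim=0 explicitly keeps the batch
# dimension even when b == 1.
import torch

tgt_len, b, e = 3, 1, 6
Y = torch.randn(tgt_len, b, e)
for Y_t in torch.split(Y, 1):
    Y_t = Y_t.squeeze(dim=0)     # (b, e); a bare squeeze() would also drop b when b == 1
    assert Y_t.shape == (b, e)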


    def step(self, Ybar_t: torch.Tensor,
            dec_state: Tuple[torch.Tensor, torch.Tensor],
            enc_hiddens: torch.Tensor,
            enc_hiddens_proj: torch.Tensor,
            enc_masks: torch.Tensor) -> Tuple[Tuple, torch.Tensor, torch.Tensor]:
        """ Compute one forward step of the LSTM decoder, including the attention computation.

        @param Ybar_t (Tensor): Concatenated Tensor of [Y_t o_prev], with shape (b, e + h). The input for the decoder,
                                where b = batch size, e = embedding size, h = hidden size.
        @param dec_state (tuple(Tensor, Tensor)): Tuple of tensors both with shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's prev hidden state, second tensor is decoder's prev cell.
        @param enc_hiddens (Tensor): Encoder hidden states Tensor, with shape (b, src_len, h * 2), where b = batch size,
                                    src_len = maximum source length, h = hidden size.
        @param enc_hiddens_proj (Tensor): Encoder hidden states Tensor, projected from (h * 2) to h. Tensor is with shape (b, src_len, h),
                                    where b = batch size, src_len = maximum source length, h = hidden size.
        @param enc_masks (Tensor): Tensor of sentence masks shape (b, src_len),
                                    where b = batch size, src_len is maximum source length. 

        @returns dec_state (tuple (Tensor, Tensor)): Tuple of tensors both shape (b, h), where b = batch size, h = hidden size.
                First tensor is decoder's new hidden state, second tensor is decoder's new cell.
        @returns combined_output (Tensor): Combined output Tensor at timestep t, shape (b, h), where b = batch size, h = hidden size.
        @returns e_t (Tensor): Tensor of shape (b, src_len). It is the attention scores distribution.
                                Note: You will not use this outside of this function.
                                      We are simply returning this value so that we can sanity check
                                      your implementation.
        """

        combined_output = None

        dec_state = self.decoder(Ybar_t, dec_state)
        dec_hidden, dec_cell = dec_state
        e_t = torch.bmm(enc_hiddens_proj, torch.unsqueeze(dec_hidden, 2))
        e_t = torch.squeeze(e_t, 2)

        # Set e_t to -inf where enc_masks has 1
        if enc_masks is not None:
            e_t.data.masked_fill_(enc_masks.bool(), -float('inf'))

        alpha_t = torch.nn.functional.softmax(e_t, dim=1)
        a_t = torch.squeeze(torch.bmm(torch.unsqueeze(alpha_t, 1), enc_hiddens), 1)
        U_t = torch.cat((a_t, dec_hidden), 1)
        V_t = self.combined_output_projection(U_t)
        O_t = self.dropout(torch.tanh(V_t))

        combined_output = O_t
        return dec_state, combined_output, e_t

    def generate_sent_masks(self, enc_hiddens: torch.Tensor, source_lengths: List[int]) -> torch.Tensor:
        """ Generate sentence masks for encoder hidden states.

        @param enc_hiddens (Tensor): encodings of shape (b, src_len, 2*h), where b = batch size,
                                     src_len = max source length, h = hidden size. 
        @param source_lengths (List[int]): List of actual lengths for each of the sentences in the batch.
        
        @returns enc_masks (Tensor): Tensor of sentence masks of shape (b, src_len),
                                    where b = batch size, src_len = max source length.
        """
        enc_masks = torch.zeros(enc_hiddens.size(0), enc_hiddens.size(1), dtype=torch.float)
        for e_id, src_len in enumerate(source_lengths):
            enc_masks[e_id, src_len:] = 1
        return enc_masks.to(self.device)


    def beam_search(self, src_sent: List[str], beam_size: int=5, max_decoding_time_step: int=70) -> List[Hypothesis]:
        """ Given a single source sentence, perform beam search, yielding translations in the target language.
        @param src_sent (List[str]): a single source sentence (words)
        @param beam_size (int): beam size
        @param max_decoding_time_step (int): maximum number of time steps to unroll the decoding RNN
        @returns hypotheses (List[Hypothesis]): a list of hypotheses; each hypothesis has two fields:
                value: List[str]: the decoded target sentence, represented as a list of words
                score: float: the log-likelihood of the target sentence
        """
        src_sents_var = self.vocab.src.to_input_tensor([src_sent], self.device)

        src_encodings, dec_init_vec = self.encode(src_sents_var, [len(src_sent)])
        src_encodings_att_linear = self.att_projection(src_encodings)

        h_tm1 = dec_init_vec
        att_tm1 = torch.zeros(1, self.hidden_size, device=self.device)

        eos_id = self.vocab.tgt['</s>']

        hypotheses = [['<s>']]
        hyp_scores = torch.zeros(len(hypotheses), dtype=torch.float, device=self.device)
        completed_hypotheses = []

        t = 0
        while len(completed_hypotheses) < beam_size and t < max_decoding_time_step:
            t += 1
            hyp_num = len(hypotheses)

            exp_src_encodings = src_encodings.expand(hyp_num,
                                                     src_encodings.size(1),
                                                     src_encodings.size(2))

            exp_src_encodings_att_linear = src_encodings_att_linear.expand(hyp_num,
                                                                           src_encodings_att_linear.size(1),
                                                                           src_encodings_att_linear.size(2))

            y_tm1 = torch.tensor([self.vocab.tgt[hyp[-1]] for hyp in hypotheses], dtype=torch.long, device=self.device)
            y_t_embed = self.model_embeddings.target(y_tm1)

            x = torch.cat([y_t_embed, att_tm1], dim=-1)

            (h_t, cell_t), att_t, _  = self.step(x, h_tm1,
                                                      exp_src_encodings, exp_src_encodings_att_linear, enc_masks=None)

            # log probabilities over target words
            log_p_t = F.log_softmax(self.target_vocab_projection(att_t), dim=-1)

            live_hyp_num = beam_size - len(completed_hypotheses)
            continuing_hyp_scores = (hyp_scores.unsqueeze(1).expand_as(log_p_t) + log_p_t).view(-1)
            top_cand_hyp_scores, top_cand_hyp_pos = torch.topk(continuing_hyp_scores, k=live_hyp_num)

            prev_hyp_ids = top_cand_hyp_pos // len(self.vocab.tgt)
            hyp_word_ids = top_cand_hyp_pos % len(self.vocab.tgt)

            new_hypotheses = []
            live_hyp_ids = []
            new_hyp_scores = []

            for prev_hyp_id, hyp_word_id, cand_new_hyp_score in zip(prev_hyp_ids, hyp_word_ids, top_cand_hyp_scores):
                prev_hyp_id = prev_hyp_id.item()
                hyp_word_id = hyp_word_id.item()
                cand_new_hyp_score = cand_new_hyp_score.item()

                hyp_word = self.vocab.tgt.id2word[hyp_word_id]
                new_hyp_sent = hypotheses[prev_hyp_id] + [hyp_word]
                if hyp_word == '</s>':
                    completed_hypotheses.append(Hypothesis(value=new_hyp_sent[1:-1],
                                                           score=cand_new_hyp_score))
                else:
                    new_hypotheses.append(new_hyp_sent)
                    live_hyp_ids.append(prev_hyp_id)
                    new_hyp_scores.append(cand_new_hyp_score)

            if len(completed_hypotheses) == beam_size:
                break

            live_hyp_ids = torch.tensor(live_hyp_ids, dtype=torch.long, device=self.device)
            h_tm1 = (h_t[live_hyp_ids], cell_t[live_hyp_ids])
            att_tm1 = att_t[live_hyp_ids]

            hypotheses = new_hypotheses
            hyp_scores = torch.tensor(new_hyp_scores, dtype=torch.float, device=self.device)

        if len(completed_hypotheses) == 0:
            completed_hypotheses.append(Hypothesis(value=hypotheses[0][1:],
                                                   score=hyp_scores[0].item()))

        completed_hypotheses.sort(key=lambda hyp: hyp.score, reverse=True)

        return completed_hypotheses

    @property
    def device(self) -> torch.device:
        """ Determine which device to place the Tensors upon, CPU or GPU.
        """
        return self.model_embeddings.source.weight.device

    @staticmethod
    def load(model_path: str):
        """ Load the model from a file.
        @param model_path (str): path to model
        """
        params = torch.load(model_path, map_location=lambda storage, loc: storage)
        args = params['args']
        model = NMT(vocab=params['vocab'], **args)
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ Save the odel to a file.
        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args': dict(embed_size=self.model_embeddings.embed_size, hidden_size=self.hidden_size, dropout_rate=self.dropout_rate),
            'vocab': self.vocab,
            'state_dict': self.state_dict()
        }

        torch.save(params, path)
Esempio n. 24
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # default values
        self.encoder = None
        self.decoder = None
        self.h_projection = None
        self.c_projection = None
        self.att_projection = None
        self.combined_output_projection = None
        self.target_vocab_projection = None
        self.dropout = None

        # YOUR CODE HERE (~8 Lines)
        # TODO - Initialize the following variables:
        # self.encoder (Bidirectional LSTM with bias)
        # self.decoder (LSTM Cell with bias)
        # self.h_projection (Linear Layer with no bias), called W_{h} in the PDF.
        # self.c_projection (Linear Layer with no bias), called W_{c} in the PDF.
        # self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF.
        # self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF.
        # self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF.
        # self.dropout (Dropout Layer)
        ###
        # Use the following docs to properly initialize these variables:
        # LSTM:
        # https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
        # LSTM Cell:
        # https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell
        # Linear Layer:
        # https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
        # Dropout Layer:
        # https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout

        self.encoder = nn.LSTM(embed_size,
                               self.hidden_size,
                               bias=True,
                               bidirectional=True)

        self.decoder = nn.LSTMCell(
            embed_size + self.hidden_size, self.hidden_size
        )  # embed_size + hidden_size: the input embedding is concatenated with the previous step's combined output
        self.h_projection = nn.Linear(
            2 * self.hidden_size, self.hidden_size,
            bias=False)  # used to init the Hidden state of Decoder
        self.c_projection = nn.Linear(
            2 * self.hidden_size, self.hidden_size,
            bias=False)  # used to init the Cell state of Decoder

        self.att_projection = nn.Linear(
            2 * self.hidden_size, self.hidden_size, bias=False
        )  # change Encoder hidden state which is (2h, 1) to (h, 1)

        self.combined_output_projection = nn.Linear(
            3 * self.hidden_size, self.hidden_size,
            bias=False)  # attention is the weighted sum of the encoder hidden states, with weights from the softmax

        self.dropout = nn.Dropout(dropout_rate)  # dropout applied to the final combined hidden state

        self.target_vocab_projection = nn.Linear(
            self.hidden_size,
            len(vocab.tgt),
            bias=False  # projects the final hidden state to vocabulary size; softmax then gives a probability per word
        )
Esempio n. 25
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # default values
        self.encoder = None
        self.decoder = None
        self.h_projection = None
        self.c_projection = None
        self.att_projection = None
        self.combined_output_projection = None
        self.target_vocab_projection = None
        self.dropout = None

        ### YOUR CODE HERE (~8 Lines)
        ### TODO - Initialize the following variables:
        ###     self.encoder (Bidirectional LSTM with bias)
        ###     self.decoder (LSTM Cell with bias)
        ###     self.h_projection (Linear Layer with no bias), called W_{h} in the PDF.
        ###     self.c_projection (Linear Layer with no bias), called W_{c} in the PDF.
        ###     self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF.
        ###     self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF.
        ###     self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF.
        ###     self.dropout (Dropout Layer)
        ###
        ### Use the following docs to properly initialize these variables:
        ###     LSTM:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
        ###     LSTM Cell:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell
        ###     Linear Layer:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
        ###     Dropout Layer:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout

        # LSTM parameters
        # input_size – The number of expected features in the input x
        # hidden_size – The number of features in the hidden state h
        # num_layers – Number of recurrent layers. E.g., setting num_layers=2 would mean stacking two LSTMs together to form a stacked LSTM, with the second LSTM taking in outputs of the first LSTM and computing the final results. Default: 1
        # bias – If False, then the layer does not use bias weights b_ih and b_hh. Default: True
        # batch_first – If True, then the input and output tensors are provided as (batch, seq, feature). Default: False
        # dropout – If non-zero, introduces a Dropout layer on the outputs of each LSTM layer except the last layer, with dropout probability equal to dropout. Default: 0
        # bidirectional – If True, becomes a bidirectional LSTM. Default: False
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=hidden_size,
                               num_layers=1,
                               bias=True,
                               bidirectional=True)

        # LSTMCell parameters
        # input_size – The number of expected features in the input x
        # hidden_size – The number of features in the hidden state h
        # bias – If False, then the layer does not use bias weights b_ih and b_hh. Default: True
        self.decoder = nn.LSTMCell(input_size=embed_size + hidden_size,
                                   hidden_size=hidden_size,
                                   bias=True)

        # Linear parameters
        # in_features – size of each input sample
        # out_features – size of each output sample
        # bias – If set to False, the layer will not learn an additive bias. Default: True
        self.h_projection = nn.Linear(in_features=2 * hidden_size,
                                      out_features=hidden_size,
                                      bias=False)
        self.c_projection = nn.Linear(in_features=2 * hidden_size,
                                      out_features=hidden_size,
                                      bias=False)
        self.att_projection = nn.Linear(in_features=2 * hidden_size,
                                        out_features=hidden_size,
                                        bias=False)
        self.combined_output_projection = nn.Linear(in_features=3 *
                                                    hidden_size,
                                                    out_features=hidden_size,
                                                    bias=False)
        self.target_vocab_projection = nn.Linear(in_features=hidden_size,
                                                 out_features=len(
                                                     self.vocab.tgt),
                                                 bias=False)

        self.dropout = nn.Dropout(p=dropout_rate)
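
# Standalone quick shape check (hypothetical sizes) for the bidirectional encoder configured
# above: input (src_len, b, e) -> outputs (src_len, b, 2h), final states (num_directions, b, h).
import torch
import torch.nn as nn

e, h, src_len, b = 6, 4, 5, 3
encoder = nn.LSTM(input_size=e, hidden_size=h, num_layers=1, bias=True, bidirectional=True)
out, (h_n, c_n) = encoder(torch.randn(src_len, b, e))
assert out.shape == (src_len, b, 2 * h)
assert h_n.shape == (2, b, h) and c_n.shape == (2, b, h)
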
Esempio n. 26
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        src_embed_size = 768  # given by BERT model
        tgt_embed_size = embed_size
        self.model_embeddings = ModelEmbeddings(src_embed_size, tgt_embed_size,
                                                vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # default values
        self.encoder = None
        self.decoder = None
        self.h_projection = None
        self.c_projection = None
        self.att_projection = None
        self.combined_output_projection = None
        self.target_vocab_projection = None
        self.dropout = None

        ### YOUR CODE HERE (~8 Lines)
        ### TODO - Initialize the following variables:
        ###     self.encoder (Bidirectional LSTM with bias)
        ###     self.decoder (LSTM Cell with bias)
        ###     self.h_projection (Linear Layer with no bias), called W_{h} in the PDF.
        ###     self.c_projection (Linear Layer with no bias), called W_{c} in the PDF.
        ###     self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF.
        ###     self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF.
        ###     self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF.
        ###     self.dropout (Dropout Layer)
        ###
        ### Use the following docs to properly initialize these variables:
        ###     LSTM:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
        ###     LSTM Cell:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell
        ###     Linear Layer:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
        ###     Dropout Layer:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout
        # self.encoder = nn.LSTM(embed_size, self.hidden_size, num_layers = 1, dropout=self.dropout_rate, bidirectional=True)
        self.encoder = nn.LSTM(src_embed_size,
                               self.hidden_size,
                               num_layers=8,
                               dropout=self.dropout_rate,
                               bidirectional=True)
        self.decoder = nn.LSTMCell(
            self.hidden_size * 2, self.hidden_size
        )  # in this variant the decoder input size is hidden_size * 2
        self.h_projection = nn.Linear(
            self.hidden_size * 2, self.hidden_size, bias=False
        )  # input is twice the hidden size and output is the hidden size
        self.c_projection = nn.Linear(self.hidden_size * 2,
                                      self.hidden_size,
                                      bias=False)
        self.att_projection = nn.Linear(self.hidden_size * 2,
                                        self.hidden_size,
                                        bias=False)
        self.combined_output_projection = nn.Linear(self.hidden_size * 3,
                                                    self.hidden_size,
                                                    bias=False)
        self.target_vocab_projection = nn.Linear(self.hidden_size,
                                                 len(self.vocab.tgt),
                                                 bias=False)  # V_t * h
        self.dropout = nn.Dropout(self.dropout_rate)
Esempio n. 27
    def __init__(self,
                 embed_size,
                 hidden_size,
                 vocab,
                 dropout_rate=0.2,
                 no_char_decoder=False):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()

        self.model_embeddings_source = ModelEmbeddings(embed_size, vocab.src)
        self.model_embeddings_target = ModelEmbeddings(embed_size, vocab.tgt)

        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab
        self.embed_size = embed_size

        ### COPY OVER YOUR CODE FROM ASSIGNMENT 4

        # LSTM is an RNN
        self.encoder = nn.LSTM(input_size=embed_size,
                               hidden_size=hidden_size,
                               bidirectional=True,
                               bias=True)

        # LSTMCell is just one Cell
        self.decoder = nn.LSTMCell(input_size=embed_size + hidden_size,
                                   hidden_size=hidden_size,
                                   bias=True)

        self.h_projection = nn.Linear(in_features=2 * hidden_size,
                                      out_features=hidden_size,
                                      bias=False)
        self.c_projection = nn.Linear(in_features=2 * hidden_size,
                                      out_features=hidden_size,
                                      bias=False)
        self.att_projection = nn.Linear(in_features=2 * hidden_size,
                                        out_features=hidden_size,
                                        bias=False)
        self.combined_output_projection = nn.Linear(in_features=3 *
                                                    hidden_size,
                                                    out_features=hidden_size,
                                                    bias=False)
        self.target_vocab_projection = nn.Linear(in_features=hidden_size,
                                                 out_features=len(vocab.tgt),
                                                 bias=False)
        self.dropout = nn.Dropout(p=dropout_rate)

        ### END YOUR CODE FROM ASSIGNMENT 4

        if not no_char_decoder:
            self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt)
        else:
            self.charDecoder = None
Esempio n. 28
    def __init__(self,
                 embed_size,
                 hidden_size,
                 vocab,
                 dropout_rate=0.2,
                 spectrum_cnn_kernel_size=3,
                 location_attention_window=64,
                 no_char_decoder=False):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()

        # self.voiceCNN = VoiceCNN(embed_size, 5)
        self.location_attention_window = location_attention_window
        self.spectrum_cnn_kernel_size = spectrum_cnn_kernel_size
        self.spectrumCNN = nn.Conv1d(embed_size, embed_size,
                                     self.spectrum_cnn_kernel_size)
        # self.model_embeddings_source = ModelEmbeddings(embed_size, vocab.src)
        self.model_embeddings_target = ModelEmbeddings(embed_size, vocab.tgt)

        self.embed_size = embed_size
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # COPY OVER YOUR CODE FROM ASSIGNMENT 4

        self.encoder = torch.nn.LSTM(embed_size,
                                     hidden_size,
                                     bidirectional=True)
        self.decoder = torch.nn.LSTMCell(embed_size + hidden_size, hidden_size)
        self.h_projection = torch.nn.Linear(2 * hidden_size,
                                            hidden_size,
                                            bias=False)
        self.c_projection = torch.nn.Linear(2 * hidden_size,
                                            hidden_size,
                                            bias=False)
        self.loc_window = 5
        self.loc_att_projection = torch.nn.Linear(embed_size, 1, bias=False)
        self.loc_att_conv1D = nn.Conv1d(self.loc_window, embed_size, 1)
        self.att_projection = torch.nn.Linear(2 * hidden_size,
                                              hidden_size,
                                              bias=False)
        self.combined_output_projection = torch.nn.Linear(3 * hidden_size,
                                                          hidden_size,
                                                          bias=False)
        self.target_vocab_projection = torch.nn.Linear(hidden_size,
                                                       len(vocab.tgt),
                                                       bias=False)
        self.dropout = nn.Dropout(p=dropout_rate)

        # END YOUR CODE FROM ASSIGNMENT 4

        if not no_char_decoder:
            self.charDecoder = CharDecoder(hidden_size, target_vocab=vocab.tgt)
        else:
            self.charDecoder = None
Esempio n. 29
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate=0.2):
        """ Init NMT Model.

        @param embed_size (int): Embedding size (dimensionality)
        @param hidden_size (int): Hidden Size (dimensionality)
        @param vocab (Vocab): Vocabulary object containing src and tgt languages
                              See vocab.py for documentation.
        @param dropout_rate (float): Dropout probability, for attention
        """
        super(NMT, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab

        # default values
        self.encoder = None
        self.decoder = None
        self.h_projection = None
        self.c_projection = None
        self.att_projection = None
        self.combined_output_projection = None
        self.target_vocab_projection = None
        self.dropout = None

        ### YOUR CODE HERE (~8 Lines)
        ### TODO - Initialize the following variables:
        ###     self.encoder (Bidirectional LSTM with bias)
        ###     self.decoder (LSTM Cell with bias)
        ###     self.h_projection (Linear Layer with no bias), called W_{h} in the PDF.
        ###     self.c_projection (Linear Layer with no bias), called W_{c} in the PDF.
        ###     self.att_projection (Linear Layer with no bias), called W_{attProj} in the PDF.
        ###     self.combined_output_projection (Linear Layer with no bias), called W_{u} in the PDF.
        ###     self.target_vocab_projection (Linear Layer with no bias), called W_{vocab} in the PDF.
        ###     self.dropout (Dropout Layer)
        ###
        ### Use the following docs to properly initialize these variables:
        ###     LSTM:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTM
        ###     LSTM Cell:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.LSTMCell
        ###     Linear Layer:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Linear
        ###     Dropout Layer:
        ###         https://pytorch.org/docs/stable/nn.html#torch.nn.Dropout

        ### (c) Initialize the layers as required by the TODO above

        ## LSTM layer; its constructor arguments include:
        ## input_size: input dimensionality (i.e., the word-embedding dimension)
        ## hidden_size: hidden-state dimensionality (the dimension of h)
        ## num_layers: number of stacked LSTM layers (depth)
        ## bias: whether to use bias terms, default True
        ## batch_first: whether to put the batch dimension first, i.e., inputs of shape (batch_size, seq_length, embedding_dim); default False
        ## dropout: dropout probability between layers, default 0
        ## bidirectional: whether the LSTM is bidirectional, default False
        self.encoder = nn.LSTM(embed_size,
                               self.hidden_size,
                               bias=True,
                               bidirectional=True)

        ## LSTMCell layer: a single LSTM cell (structurally, it also amounts to a one-layer LSTM).
        ## (Unlike nn.LSTM, an LSTMCell takes a single input x_t rather than the whole sequence x_0...x_T,
        ##  so running a full sequence through an LSTMCell requires an explicit loop.)
        ## Its constructor arguments include:
        ## input_size: input dimensionality (i.e., the word-embedding dimension)
        ## hidden_size: hidden-state dimensionality (the dimension of h)
        ## bias: whether to use bias terms, default True
        self.decoder = nn.LSTMCell(embed_size + self.hidden_size,
                                   self.hidden_size,
                                   bias=True)

        ## Linear layers, as the name suggests. Arguments, in order: input dimension, output dimension, whether to use a bias
        self.h_projection = nn.Linear(2 * self.hidden_size,
                                      self.hidden_size,
                                      bias=False)
        self.c_projection = nn.Linear(2 * self.hidden_size,
                                      self.hidden_size,
                                      bias=False)
        self.att_projection = nn.Linear(2 * self.hidden_size,
                                        self.hidden_size,
                                        bias=False)
        self.combined_output_projection = nn.Linear(3 * self.hidden_size,
                                                    self.hidden_size,
                                                    bias=False)
        self.target_vocab_projection = nn.Linear(self.hidden_size,
                                                 len(self.vocab.tgt),
                                                 bias=False)

        ## Dropout layer, configured with dropout_rate
        self.dropout = nn.Dropout(dropout_rate)
Esempio n. 30
class Node2(nn.Module):
    """
    Node Class that inherits the BiLSTM models created in bilstm_model.py
    Beginning Node models have 2 BiLSTMs, future versions can have more
    
    """
    def __init__(self, embed_size, hidden_size, vocab, dropout_rate):
        super(Node2, self).__init__()
        self.model_embeddings = ModelEmbeddings(embed_size, vocab)
        self.hidden_size = hidden_size
        self.dropout_rate = dropout_rate
        self.vocab = vocab
        print("vocab.num_labels: ", vocab.num_labels)
        self.num_labels = vocab.num_labels

        self.encoder0 = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            bias=True,
            # dropout=self.dropout_rate,
            bidirectional=True)

        self.encoder1 = nn.LSTM(
            input_size=embed_size,
            hidden_size=hidden_size,
            bias=True,
            # dropout=self.dropout_rate,
            bidirectional=True)

        self.dropout1 = nn.Dropout()

        self.attention_projection = nn.Linear(in_features=2 * hidden_size,
                                              out_features=self.num_labels,
                                              bias=False)
        self.attention_softmax = nn.Softmax(dim=0)
        #         self.labels_projection = nn.Linear(in_features=2*hidden_size,
        #                                           out_features=1,
        #                                           bias=False)
        self.labels_projection = nn.Linear(in_features=2 * hidden_size,
                                           out_features=100,
                                           bias=False)

        self.labels_projection2 = nn.Linear(in_features=100,
                                            out_features=1,
                                            bias=False)

    def forward(self, in_sents: List[List[str]],
                target_labels: List[List[int]]):

        # in_sents should be (1000, whatever)
        # split in half
        num_notes = len(in_sents)
        length_of_each_note = int(len(in_sents[0]) / 2)

        # Convert list of lists into tensors
        source_padded = self.vocab.notes_.to_input_tensor(in_sents,
                                                          device=self.device)
        # Tensor: (src_len, b)
        #         print(num_notes)

        X = self.model_embeddings.note_embeds(source_padded)
        #         print("X.shape: ", X.shape) # (1000, 16, 256)
        #         print(in_sents[0])
        X0 = X[:length_of_each_note, :, :]
        X1 = X[length_of_each_note:, :, :]

        enc_hiddens0, _ = self.encoder0(X0)
        enc_hiddens1, _ = self.encoder1(X1)
        #         print(enc_hiddens0.shape)
        #         print(enc_hiddens1.shape)

        enc_hiddens = torch.cat([enc_hiddens0, enc_hiddens1], 0)
        #         print(enc_hiddens.shape)

        #         scores0 = self.first_bilstm(in_sents0,target_labels)
        #         scores1 = self.second_bilstm(in_sents1,target_labels)
        #         print(scores0[0])
        #         print(scores1[0])

        alpha = self.attention_projection(enc_hiddens)
        #         print("alpha.shape: ", alpha.shape)

        #         alpha = self.dropout1(alpha)

        alpha_soft = self.attention_softmax(alpha)
        #         print(np.sum(alpha_soft.detach().numpy(),axis=0))

        M = torch.bmm(alpha_soft.permute([1, 2, 0]),
                      enc_hiddens.permute([1, 0, 2]))
        #         print("M.shape: ", M.shape)
        #         torch.stack(combined_outputs, dim=0)

        M = self.dropout1(M)

        M = self.labels_projection(M)

        M = F.relu(M)

        scores = self.labels_projection2(M)

        scores = torch.sigmoid(torch.squeeze(scores, -1))

        #         print("scores.shape: ", scores.shape)

        return scores


    def encode(
        self, source_padded: torch.Tensor, source_lengths: List[int]
    ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
        pass

    @property
    def device(self) -> torch.device:
        """ Determine which device to place the Tensors upon, CPU or GPU.
        """
        return self.model_embeddings.note_embeds.weight.device

    @staticmethod
    def load(model_path: str):
        """ Load the model from a file.
        @param model_path (str): path to model
        """

        params = torch.load(model_path,
                            map_location=lambda storage, loc: storage)
        args = params['args']
        model = Node2(vocab=params['vocab'], **args)
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ Save the odel to a file.
        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)


        params = {
            'args':
            dict(embed_size=self.model_embeddings.embed_size,
                 hidden_size=self.hidden_size,
                 dropout_rate=self.dropout_rate),
            'vocab':
            self.vocab,
            'state_dict':
            self.state_dict()
        }

        torch.save(params, path)
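
# Standalone sketch (hypothetical sizes) of the label-wise attention pooling in
# Node2.forward(): per-label softmax weights over time are combined with the BiLSTM
# hidden states via a batched matrix multiply.
import torch
import torch.nn.functional as F

T, b, h2, num_labels = 6, 2, 8, 3                  # time steps, batch, 2*hidden, labels
enc_hiddens = torch.randn(T, b, h2)
alpha = torch.randn(T, b, num_labels)
alpha_soft = F.softmax(alpha, dim=0)               # normalize over time, as in the class
M = torch.bmm(alpha_soft.permute(1, 2, 0),         # (b, num_labels, T)
              enc_hiddens.permute(1, 0, 2))        # (b, T, 2h)
assert M.shape == (b, num_labels, h2)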