Example #1
 def __init__(self, params):
     super(DecoderLayer, self).__init__()
     self.layer_norm = nn.LayerNorm(params.hidden_dim, eps=1e-6)
     self.self_attention = MultiHeadAttention(params)  # masked multi-head attention part
     self.encoder_attention = MultiHeadAttention(params)  # multi-head attention part
     self.position_wise_ffn = PositionWiseFeedForward(params)  # FFNN sub-layer
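The constructor above only wires up the sub-modules. Below is a self-contained sketch of the pre-norm decoder-layer forward pass it implies, using torch.nn.MultiheadAttention as a stand-in for the project's MultiHeadAttention; the call signatures, the shared layer norm, and the feed-forward shape are assumptions, not code from the original project.

import torch
import torch.nn as nn

class SketchDecoderLayer(nn.Module):
    # Hedged sketch only: mirrors the structure of the DecoderLayer constructor above.
    def __init__(self, hidden_dim, n_heads, dropout=0.1):
        super().__init__()
        self.layer_norm = nn.LayerNorm(hidden_dim, eps=1e-6)
        self.self_attention = nn.MultiheadAttention(hidden_dim, n_heads,
                                                    dropout=dropout, batch_first=True)
        self.encoder_attention = nn.MultiheadAttention(hidden_dim, n_heads,
                                                       dropout=dropout, batch_first=True)
        self.position_wise_ffn = nn.Sequential(
            nn.Linear(hidden_dim, 4 * hidden_dim), nn.ReLU(),
            nn.Linear(4 * hidden_dim, hidden_dim))

    def forward(self, target, encoder_output, target_mask=None):
        # masked self-attention sub-layer (pre-norm + residual)
        x = self.layer_norm(target)
        attn_out, _ = self.self_attention(x, x, x, attn_mask=target_mask)
        target = target + attn_out
        # encoder-decoder attention sub-layer
        x = self.layer_norm(target)
        attn_out, _ = self.encoder_attention(x, encoder_output, encoder_output)
        target = target + attn_out
        # position-wise feed-forward sub-layer
        return target + self.position_wise_ffn(self.layer_norm(target))

layer = SketchDecoderLayer(hidden_dim=64, n_heads=4)
out = layer(torch.randn(2, 10, 64), torch.randn(2, 12, 64))
print(out.shape)  # torch.Size([2, 10, 64])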
Example #2
    def __init__(self, model_dim, heads_num):
        super().__init__()
        self.lnorm_1 = nn.LayerNorm(model_dim)
        self.attention_1 = MultiHeadAttention(heads_num, model_dim)
        self.dropout_1 = nn.Dropout(0.1)

        self.lnorm_2 = nn.LayerNorm(model_dim)
        self.attention_2 = MultiHeadAttention(heads_num, model_dim)
        self.dropout_2 = nn.Dropout(0.1)

        self.lnorm_3 = nn.LayerNorm(model_dim)
        self.ff = FeedForward(model_dim)
        self.dropout_3 = nn.Dropout(0.1)
Example #3
 def __init__(self, d_model, n_head, d_k, d_v, dropout=0.1):
     super(StarEncoderLayer, self).__init__()
     self.slf_attn_satellite = MultiHeadAttention(n_head,
                                                  d_model,
                                                  d_k,
                                                  d_v,
                                                  use_star=True,
                                                  dropout=dropout)
     self.slf_attn_relay = MultiHeadAttention(n_head,
                                              d_model,
                                              d_k,
                                              d_v,
                                              use_star=True,
                                              dropout=dropout)
Example #4
    def __init__(
            self,
            num_layers,
            num_input_features,
            bn_size,
            growth_rate,
            drop_rate,
            num_heads,
            total_key_filters,
            total_value_filters,
            output_filters):
        super(DenseBlock, self).__init__()
        self.denselayers = nn.ModuleDict()
        self.multi_attn = MultiHeadAttention(
            num_heads=num_heads,
            num_input_channels=num_input_features,
            total_key_filters=total_key_filters,
            total_value_filters=total_value_filters,
            output_filters=output_filters)

        for i in range(num_layers):
            layer = _DenseLayer(num_input_features + i * growth_rate,
                                growth_rate, bn_size, drop_rate)
            self.denselayers.add_module('denselayer%d' % (i + 1), layer)
Example #5
 def __init__(self, seq_len, d_model, h, dropout=0.5):
     super().__init__()
     self.attention = MultiHeadAttention(d_model, h)
     self.layer_norm = nn.LayerNorm(torch.Size([seq_len, d_model]))
     self.residential = Residential()
     self.dropout = nn.Dropout(dropout)
     self.transition = nn.Linear(d_model, d_model)
     self.pos_embedding = PositionalEncoding(d_model, seq_len)
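One detail worth noting in the layer above: nn.LayerNorm(torch.Size([seq_len, d_model])) normalizes over the last two dimensions jointly and ties the module to a fixed sequence length, whereas the more common nn.LayerNorm(d_model) normalizes each position independently. A small self-contained check (shapes chosen arbitrarily):

import torch
import torch.nn as nn

seq_len, d_model = 6, 8
x = torch.randn(2, seq_len, d_model)

per_token = nn.LayerNorm(d_model)                # statistics over the last dim only
per_sequence = nn.LayerNorm([seq_len, d_model])  # statistics over the last two dims

print(per_token(x).shape, per_sequence(x).shape)       # both torch.Size([2, 6, 8])
print(per_token(x).mean(dim=-1).abs().max())           # ~0 for every position
print(per_sequence(x).mean(dim=(-2, -1)).abs().max())  # ~0 only per (seq_len, d_model) slab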
Example #6
 def __init__(self, params):
     super(EncoderLayer, self).__init__()
     self.layer_norm = nn.LayerNorm(params.hidden_dim, eps=1e-6)  # performs layer normalization
     self.self_attention = MultiHeadAttention(params)  # performs multi-head attention
     self.position_wise_ffn = PositionWiseFeedForward(params)  # passes through the PositionWiseFeedForward network
Example #7
 def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
     super(Attblock, self).__init__()
     self.att_residual = MultiHeadAttention(n_head, d_model, d_k, d_v,
                                            dropout)
     self.fc1 = nn.Linear(d_model, d_model)
     self.fc2 = nn.Linear(d_model, d_model)
     self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
     self.dropout = nn.Dropout(dropout)
Example #8
    def __init__(self, num_heads, d_model, dff, rate=0.1):
        super(DecoderLayer, self).__init__()

        mha_param = {"num_heads": num_heads, "d_model": d_model}
        self.masked_mha = MultiHeadAttention(**mha_param)
        self.mha = MultiHeadAttention(**mha_param)

        ffn_params = {"dff": dff, "d_model": d_model}
        self.ffn = PointWiseFeedForwardNetwork(**ffn_params)

        self.layer_norm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layer_norm3 = keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)
        self.dropout3 = keras.layers.Dropout(rate)
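The Keras example defines only the constructor. A hedged sketch of the call() wiring such a layer typically uses (post-norm residual blocks, in the style of the standard TensorFlow Transformer tutorial) follows; the MultiHeadAttention call signature and its two return values are assumptions about this project, not confirmed by the snippet.

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        # masked self-attention block (assumed to return (output, attention_weights))
        attn1, _ = self.masked_mha(x, x, x, look_ahead_mask)
        out1 = self.layer_norm1(x + self.dropout1(attn1, training=training))

        # encoder-decoder attention block
        attn2, _ = self.mha(out1, enc_output, enc_output, padding_mask)
        out2 = self.layer_norm2(out1 + self.dropout2(attn2, training=training))

        # point-wise feed-forward block
        ffn_out = self.ffn(out2)
        return self.layer_norm3(out2 + self.dropout3(ffn_out, training=training))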
Example #9
 def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
     super(EncoderLayer, self).__init__()
     self.slf_attn = MultiHeadAttention(n_head,
                                        d_model,
                                        d_k,
                                        d_v,
                                        dropout=dropout)
     self.pos_ffn = PositionwiseFeedForward(d_model,
                                            d_inner,
                                            dropout=dropout)
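For this encoder layer, the corresponding forward pass is typically just the two sub-layers applied in sequence; the sketch below assumes the common convention that MultiHeadAttention takes (q, k, v, mask=...) and also returns the attention weights.

 def forward(self, enc_input, slf_attn_mask=None):
     # self-attention sub-layer followed by the position-wise feed-forward sub-layer
     enc_output, enc_slf_attn = self.slf_attn(enc_input, enc_input, enc_input,
                                              mask=slf_attn_mask)
     enc_output = self.pos_ffn(enc_output)
     return enc_output, enc_slf_attn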
Example #10
 def __init__(self, params):
     super(DecoderLayer, self).__init__()
     self.layer_norm = nn.LayerNorm(params.hidden_dim)
     self.self_attention = MultiHeadAttention(params)
     self.encoder_attention = MultiHeadAttention(params)
     self.position_wise_ffn = PositionWiseFeedForward(params)
Example #11
    def forward(self, input, skip_input_list, hidden=None, attn=None):
        """
            input: variable (batch, seq_len), batch = 1
            skip_input_list: [skip_input, volatile_flag]
            skip_input: a three-dimensional list of length seq_len; each element is a list of matched word ids and their lengths.
                        example: [[], [[25, 13], [2, 3]]], where 25 and 13 are word ids and 2 and 3 are their lengths.
        """

        volatile_flag = skip_input_list[1]
        skip_input = skip_input_list[0]
        if not self.left2right:
            skip_input = convert_forward_gaz_to_backward(skip_input)
        input = input.transpose(1, 0)
        seq_len = input.size(0)

        if self.use_attn and not config['shared_attn']:
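            # note: a fresh MultiHeadAttention (with newly initialized weights) is created on
            # every forward() call here and is never registered as a sub-module of this model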
            Attn = MultiHeadAttention(self.n_head, input.size(2),
                                      int(input.size(2) / self.n_head),
                                      int(input.size(2) / self.n_head))
            if self.gpu:
                Attn = Attn.cuda()

            q = input

            if self.gpu:
                q = q.cuda()
            q = q.permute(1, 0, 2)

            output, _ = Attn.forward(q, q, q)

        batch_size = input.size(1)
        assert (batch_size == 1)
        hidden_out = []
        memory_out = []
        if hidden:
            (hx, cx) = hidden
        else:
            hx = autograd.Variable(torch.zeros(batch_size, self.hidden_dim))
            cx = autograd.Variable(torch.zeros(batch_size, self.hidden_dim))
            if self.gpu:
                hx = hx.cuda()
                cx = cx.cuda()

        id_list = list(range(seq_len))
        if not self.left2right:
            id_list = list(reversed(id_list))
        # removed by the original author:
        # input_c_list = init_list_of_objects(seq_len)

        for t in id_list:
            if skip_input[t]:
                matched_num = len(skip_input[t][0])
                word_var = autograd.Variable(torch.LongTensor(
                    skip_input[t][0]),
                                             volatile=volatile_flag)
                if self.gpu:
                    word_var = word_var.cuda()
                word_emb = self.word_emb(word_var)
                word_emb = self.word_dropout(word_emb)
                # ct=self.word_rnn(word_emb,(hx,cx))
                # assert(ct.size(0)==len(skip_input[t][1]))
                # for idx in range(matched_num):
                #     length=skip_input[t][1][idx]
                #     if self.left2right:
                #         input_c_list[t].append(ct[idx,:].unsqueeze(0))
                if self.use_attn:
                    if config['shared_attn']:
                        hx, cx = self.rnn(input[t], word_emb, (hx, cx), attn)
                    if not config['shared_attn']:
                        hx, cx = self.rnn(input[t], word_emb, (hx, cx), output)
                else:
                    hx, cx = self.rnn(input[t], word_emb, (hx, cx))
            else:
                if self.use_attn:
                    if config['shared_attn']:
                        hx, cx = self.rnn(input[t], [], (hx, cx), attn)
                    if not config['shared_attn']:
                        hx, cx = self.rnn(input[t], [], (hx, cx), output)
                else:
                    hx, cx = self.rnn(input[t], [], (hx, cx))

            hidden_out.append(hx)
            memory_out.append(cx)

        if not self.left2right:
            hidden_out = list(reversed(hidden_out))
            memory_out = list(reversed(memory_out))
        output_hidden = torch.cat(hidden_out, 0)
        output_memory = torch.cat(memory_out, 0)
        # (batch, seq_len, hidden_dim)
        # print output_hidden.size()
        return output_hidden.unsqueeze(0), output_memory.unsqueeze(0)
Example #12
 def _encoder_layer(d_model, d_k, d_v, n_heads):
     # d_hidden is not a parameter here; it presumably comes from the enclosing scope in the original project
     return EncoderLayer(d_model, [
         MultiHeadAttention(d_model, d_k, d_v, n_heads),
         PositionWiseFeedForwardNet(d_model, d_hidden)
     ])
Example #13
    def get_lstm_features(self, gaz_list, word_inputs, biword_inputs,
                          word_seq_lengths, char_inputs, char_seq_lengths,
                          char_seq_recover):
        """
            input:
                word_inputs: (batch_size, sent_len)
                gaz_list:
                word_seq_lengths: sentence lengths for the batch, (batch_size, 1)
                char_inputs: (batch_size*sent_len, word_length)
                char_seq_lengths: char lengths for the whole batch, (batch_size*sent_len, 1)
                char_seq_recover: variable that records the original char order, used to restore it
            output:
                Variable(sent_len, batch_size, hidden_dim)
        """
        batch_size = word_inputs.size(0)
        sent_len = word_inputs.size(1)
        word_embs = self.word_embeddings(word_inputs)
        if self.use_bigram:
            biword_embs = self.biword_embeddings(biword_inputs)
            word_embs = torch.cat([word_embs, biword_embs], 2)
        if self.use_char:
            ## calculate char lstm last hidden
            char_features = self.char_feature.get_last_hiddens(
                char_inputs,
                char_seq_lengths.cpu().numpy())
            char_features = char_features[char_seq_recover]
            char_features = char_features.view(batch_size, sent_len, -1)
            ## concat word and char together
            word_embs = torch.cat([word_embs, char_features], 2)
        word_embs = self.drop(word_embs)
        # packed_words = pack_padded_sequence(word_embs, word_seq_lengths.cpu().numpy(), True)
        hidden = None

        if self.use_attn and config['shared_attn']:

            input = word_embs.transpose(1, 0)

            attn = MultiHeadAttention(self.n_head,
                                      input.size(2),
                                      int(input.size(2) / self.n_head),
                                      int(input.size(2) / self.n_head),
                                      gpu=self.gpu)
            if self.gpu:
                attn = attn.cuda()

            q = input

            if self.gpu:
                q = q.cuda()
            q = q.permute(1, 0, 2)

            output, _ = attn.forward(q, q, q)
            lstm_out, hidden = self.forward_lstm(word_embs,
                                                 gaz_list,
                                                 hidden=hidden,
                                                 attn=output)

        else:
            lstm_out, hidden = self.forward_lstm(word_embs,
                                                 gaz_list,
                                                 hidden=hidden)

        if self.bilstm_flag:
            backward_hidden = None

            if self.use_attn and config['shared_attn']:
                backward_lstm_out, backward_hidden = self.backward_lstm(
                    word_embs, gaz_list, hidden=backward_hidden, attn=output)
            else:
                backward_lstm_out, backward_hidden = self.backward_lstm(
                    word_embs, gaz_list, hidden=backward_hidden)
            lstm_out = torch.cat([lstm_out, backward_lstm_out], 2)
        # lstm_out, _ = pad_packed_sequence(lstm_out)
        lstm_out = self.droplstm(lstm_out)
        return lstm_out
Example #14
 def test_MultiHeadAttention(self):
     mha = MultiHeadAttention(512, 1024)
     q = torch.randn(5, 4, 512)
     v = torch.randn(5, 10, 1024)
     k = torch.randn(5, 10, 512)
     assert_list_equal(mha(v, k, q).shape, [5, 4, 1024])
Example #15
 def _decoder_layer(self, d_model, d_k, d_v, n_heads):
     # d_hidden is not a parameter here; it presumably comes from the enclosing scope in the original project
     return DecoderLayer(d_model, [
         MultiHeadAttention(d_model, d_k, d_v, n_heads),  # self-attention
         MultiHeadAttention(d_model, d_k, d_v, n_heads),  # encoder-decoder attention
         FeedForwardLayer(d_model, d_hidden)
     ])