def __init__(self, params):
    super(DecoderLayer, self).__init__()
    self.layer_norm = nn.LayerNorm(params.hidden_dim, eps=1e-6)
    self.self_attention = MultiHeadAttention(params)          # masked multi-head attention sublayer
    self.encoder_attention = MultiHeadAttention(params)       # encoder-decoder multi-head attention sublayer
    self.position_wise_ffn = PositionWiseFeedForward(params)  # position-wise FFN sublayer
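# The snippet above only declares the sublayers. Below is a minimal, hypothetical
# forward sketch for this params-style DecoderLayer, assuming a pre-LayerNorm
# residual arrangement and that MultiHeadAttention(params) is called as
# (query, key, value, mask) and returns the attended representation; none of
# this is confirmed by the source.
def forward(self, target, encoder_output, target_mask, source_mask):
    # masked self-attention over the target sequence
    residual = target
    target = residual + self.self_attention(
        self.layer_norm(target), self.layer_norm(target), self.layer_norm(target), target_mask)
    # encoder-decoder attention over the encoder output
    residual = target
    target = residual + self.encoder_attention(
        self.layer_norm(target), encoder_output, encoder_output, source_mask)
    # position-wise feed-forward sublayer
    return target + self.position_wise_ffn(self.layer_norm(target))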
def __init__(self, model_dim, heads_num):
    super().__init__()
    self.lnorm_1 = nn.LayerNorm(model_dim)
    self.attention_1 = MultiHeadAttention(heads_num, model_dim)
    self.dropout_1 = nn.Dropout(0.1)
    self.lnorm_2 = nn.LayerNorm(model_dim)
    self.attention_2 = MultiHeadAttention(heads_num, model_dim)
    self.dropout_2 = nn.Dropout(0.1)
    self.lnorm_3 = nn.LayerNorm(model_dim)
    self.ff = FeedForward(model_dim)
    self.dropout_3 = nn.Dropout(0.1)
def __init__(self, d_model, n_head, d_k, d_v, dropout=0.1):
    super(StarEncoderLayer, self).__init__()
    self.slf_attn_satellite = MultiHeadAttention(
        n_head, d_model, d_k, d_v, use_star=True, dropout=dropout)
    self.slf_attn_relay = MultiHeadAttention(
        n_head, d_model, d_k, d_v, use_star=True, dropout=dropout)
def __init__(self, num_layers, num_input_features, bn_size, growth_rate,
             drop_rate, num_heads, total_key_filters, total_value_filters,
             output_filters):
    super(DenseBlock, self).__init__()
    self.denselayers = nn.ModuleDict()
    self.multi_attn = MultiHeadAttention(
        num_heads=num_heads,
        num_input_channels=num_input_features,
        total_key_filters=total_key_filters,
        total_value_filters=total_value_filters,
        output_filters=output_filters)
    for i in range(num_layers):
        layer = _DenseLayer(num_input_features + i * growth_rate,
                            growth_rate, bn_size, drop_rate)
        self.denselayers.add_module('denselayer%d' % (i + 1), layer)
def __init__(self, seq_len, d_model, h, dropout=0.5):
    super().__init__()
    self.attention = MultiHeadAttention(d_model, h)
    self.layer_norm = nn.LayerNorm(torch.Size([seq_len, d_model]))
    self.residential = Residential()
    self.dropout = nn.Dropout(dropout)
    self.transition = nn.Linear(d_model, d_model)
    self.pos_embedding = PositionalEncoding(d_model, seq_len)
def __init__(self, params):
    super(EncoderLayer, self).__init__()
    self.layer_norm = nn.LayerNorm(params.hidden_dim, eps=1e-6)  # layer normalization
    self.self_attention = MultiHeadAttention(params)             # multi-head self-attention
    self.position_wise_ffn = PositionWiseFeedForward(params)     # position-wise feed-forward network
def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
    super(Attblock, self).__init__()
    self.att_residual = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout)
    self.fc1 = nn.Linear(d_model, d_model)
    self.fc2 = nn.Linear(d_model, d_model)
    self.layer_norm = nn.LayerNorm(d_model, eps=1e-6)
    self.dropout = nn.Dropout(dropout)
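# A hedged sketch of how Attblock's pieces could be wired in forward(). The
# (query, key, value, mask) call convention and the (output, attn) return value
# of MultiHeadAttention are assumptions, as is the placement of the residual
# connection and layer norm.
def forward(self, x, memory, mask=None):
    attn_out, _ = self.att_residual(x, memory, memory, mask=mask)
    # small position-wise feed-forward block built from fc1/fc2
    ff_out = self.fc2(torch.relu(self.fc1(attn_out)))
    return self.layer_norm(attn_out + self.dropout(ff_out))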
def __init__(self, num_heads, d_model, dff, rate=0.1):
    super(DecoderLayer, self).__init__()
    mha_param = {"num_heads": num_heads, "d_model": d_model}
    self.masked_mha = MultiHeadAttention(**mha_param)
    self.mha = MultiHeadAttention(**mha_param)

    ffn_params = {"dff": dff, "d_model": d_model}
    self.ffn = PointWiseFeedForwardNetwork(**ffn_params)

    self.layer_norm1 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.layer_norm2 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.layer_norm3 = keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = keras.layers.Dropout(rate)
    self.dropout2 = keras.layers.Dropout(rate)
    self.dropout3 = keras.layers.Dropout(rate)
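# A hypothetical call() matching the Keras DecoderLayer above, following the
# post-norm layout used in the standard TensorFlow Transformer tutorial. The
# assumption that each attention module returns (output, attention_weights)
# is not confirmed by the source.
def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
    # masked self-attention sublayer
    attn1, attn_weights1 = self.masked_mha(x, x, x, look_ahead_mask)
    out1 = self.layer_norm1(x + self.dropout1(attn1, training=training))

    # encoder-decoder attention sublayer
    attn2, attn_weights2 = self.mha(out1, enc_output, enc_output, padding_mask)
    out2 = self.layer_norm2(out1 + self.dropout2(attn2, training=training))

    # point-wise feed-forward sublayer
    ffn_out = self.ffn(out2)
    out3 = self.layer_norm3(out2 + self.dropout3(ffn_out, training=training))
    return out3, attn_weights1, attn_weights2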
def __init__(self, d_model, d_inner, n_head, d_k, d_v, dropout=0.1):
    super(EncoderLayer, self).__init__()
    self.slf_attn = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
    self.pos_ffn = PositionwiseFeedForward(d_model, d_inner, dropout=dropout)
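# For this two-sublayer encoder layer the forward pass is usually short. The
# sketch below assumes the attention module is called as (q, k, v, mask) and
# returns (output, attn), as in common Transformer reference implementations;
# treat it as an illustration, not the original author's code.
def forward(self, enc_input, slf_attn_mask=None):
    enc_output, enc_slf_attn = self.slf_attn(
        enc_input, enc_input, enc_input, mask=slf_attn_mask)
    enc_output = self.pos_ffn(enc_output)
    return enc_output, enc_slf_attn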
def __init__(self, params):
    super(DecoderLayer, self).__init__()
    self.layer_norm = nn.LayerNorm(params.hidden_dim)
    self.self_attention = MultiHeadAttention(params)
    self.encoder_attention = MultiHeadAttention(params)
    self.position_wise_ffn = PositionWiseFeedForward(params)
def forward(self, input, skip_input_list, hidden=None, attn=None):
    """
    input: variable (batch, seq_len), batch = 1
    skip_input_list: [skip_input, volatile_flag]
    skip_input: three-dimensional list whose length is seq_len. Each element is a
        list of matched word ids and their lengths.
        example: [[], [[25, 13], [2, 3]]], where 25/13 are word ids and 2/3 are word lengths.
    """
    volatile_flag = skip_input_list[1]
    skip_input = skip_input_list[0]
    if not self.left2right:
        skip_input = convert_forward_gaz_to_backward(skip_input)
    input = input.transpose(1, 0)
    seq_len = input.size(0)
    if self.use_attn and not config['shared_attn']:
        # build a per-call attention module when attention is not shared
        Attn = MultiHeadAttention(self.n_head, input.size(2),
                                  int(input.size(2) / self.n_head),
                                  int(input.size(2) / self.n_head))
        if self.gpu:
            Attn = Attn.cuda()
        q = input
        if self.gpu:
            q = q.cuda()
        q = q.permute(1, 0, 2)
        output, _ = Attn.forward(q, q, q)
    batch_size = input.size(1)
    assert batch_size == 1
    hidden_out = []
    memory_out = []
    if hidden:
        (hx, cx) = hidden
    else:
        hx = autograd.Variable(torch.zeros(batch_size, self.hidden_dim))
        cx = autograd.Variable(torch.zeros(batch_size, self.hidden_dim))
        if self.gpu:
            hx = hx.cuda()
            cx = cx.cuda()
    id_list = list(range(seq_len))
    if not self.left2right:
        id_list = list(reversed(id_list))
    for t in id_list:
        if skip_input[t]:
            matched_num = len(skip_input[t][0])
            word_var = autograd.Variable(torch.LongTensor(skip_input[t][0]),
                                         volatile=volatile_flag)
            if self.gpu:
                word_var = word_var.cuda()
            word_emb = self.word_emb(word_var)
            word_emb = self.word_dropout(word_emb)
            if self.use_attn:
                if config['shared_attn']:
                    hx, cx = self.rnn(input[t], word_emb, (hx, cx), attn)
                if not config['shared_attn']:
                    hx, cx = self.rnn(input[t], word_emb, (hx, cx), output)
            else:
                hx, cx = self.rnn(input[t], word_emb, (hx, cx))
        else:
            if self.use_attn:
                if config['shared_attn']:
                    hx, cx = self.rnn(input[t], [], (hx, cx), attn)
                if not config['shared_attn']:
                    hx, cx = self.rnn(input[t], [], (hx, cx), output)
            else:
                hx, cx = self.rnn(input[t], [], (hx, cx))
        hidden_out.append(hx)
        memory_out.append(cx)
    if not self.left2right:
        hidden_out = list(reversed(hidden_out))
        memory_out = list(reversed(memory_out))
    output_hidden, output_memory = torch.cat(hidden_out, 0), torch.cat(memory_out, 0)
    # (batch, seq_len, hidden_dim)
    return output_hidden.unsqueeze(0), output_memory.unsqueeze(0)
def _encoder_layer(d_model, d_k, d_v, n_heads):
    # d_hidden is assumed to be available in the enclosing scope
    # (e.g. a module-level constant); it is not a parameter here.
    return EncoderLayer(d_model, [
        MultiHeadAttention(d_model, d_k, d_v, n_heads),
        PositionWiseFeedForwardNet(d_model, d_hidden)
    ])
def get_lstm_features(self, gaz_list, word_inputs, biword_inputs, word_seq_lengths,
                      char_inputs, char_seq_lengths, char_seq_recover):
    """
    input:
        word_inputs: (batch_size, sent_len)
        gaz_list:
        word_seq_lengths: list of batch_size, (batch_size, 1)
        char_inputs: (batch_size*sent_len, word_length)
        char_seq_lengths: list of whole batch_size for char, (batch_size*sent_len, 1)
        char_seq_recover: variable which records the char order information, used to recover char order
    output:
        Variable(sent_len, batch_size, hidden_dim)
    """
    batch_size = word_inputs.size(0)
    sent_len = word_inputs.size(1)
    word_embs = self.word_embeddings(word_inputs)
    if self.use_bigram:
        biword_embs = self.biword_embeddings(biword_inputs)
        word_embs = torch.cat([word_embs, biword_embs], 2)
    if self.use_char:
        # calculate char LSTM last hidden states
        char_features = self.char_feature.get_last_hiddens(
            char_inputs, char_seq_lengths.cpu().numpy())
        char_features = char_features[char_seq_recover]
        char_features = char_features.view(batch_size, sent_len, -1)
        # concatenate word and char representations
        word_embs = torch.cat([word_embs, char_features], 2)
    word_embs = self.drop(word_embs)
    hidden = None
    if self.use_attn and config['shared_attn']:
        input = word_embs.transpose(1, 0)
        attn = MultiHeadAttention(self.n_head, input.size(2),
                                  int(input.size(2) / self.n_head),
                                  int(input.size(2) / self.n_head),
                                  gpu=self.gpu)
        if self.gpu:
            attn = attn.cuda()
        q = input
        if self.gpu:
            q = q.cuda()
        q = q.permute(1, 0, 2)
        output, _ = attn.forward(q, q, q)
        lstm_out, hidden = self.forward_lstm(word_embs, gaz_list, hidden=hidden, attn=output)
    else:
        lstm_out, hidden = self.forward_lstm(word_embs, gaz_list, hidden=hidden)
    if self.bilstm_flag:
        backward_hidden = None
        if self.use_attn and config['shared_attn']:
            backward_lstm_out, backward_hidden = self.backward_lstm(
                word_embs, gaz_list, hidden=backward_hidden, attn=output)
        else:
            backward_lstm_out, backward_hidden = self.backward_lstm(
                word_embs, gaz_list, hidden=backward_hidden)
        lstm_out = torch.cat([lstm_out, backward_lstm_out], 2)
    lstm_out = self.droplstm(lstm_out)
    return lstm_out
def test_MultiHeadAttention(self):
    mha = MultiHeadAttention(512, 1024)
    q = torch.randn(5, 4, 512)
    v = torch.randn(5, 10, 1024)
    k = torch.randn(5, 10, 512)
    assert_list_equal(mha(v, k, q).shape, [5, 4, 1024])
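# A minimal, self-contained multi-head attention sketch that satisfies the shape
# contract this test exercises: the constructor takes (d_qk, d_v), the module is
# called as mha(value, key, query), and it returns (batch, query_len, d_v).
# The class name, head count, and projection scheme here are illustrative
# assumptions, not the implementation under test.
import math
import torch
import torch.nn as nn

class MinimalMultiHeadAttention(nn.Module):
    def __init__(self, d_qk, d_v, num_heads=8):
        super().__init__()
        assert d_qk % num_heads == 0 and d_v % num_heads == 0
        self.num_heads = num_heads
        self.w_q = nn.Linear(d_qk, d_qk)
        self.w_k = nn.Linear(d_qk, d_qk)
        self.w_v = nn.Linear(d_v, d_v)
        self.w_o = nn.Linear(d_v, d_v)

    def _split(self, x):
        # (batch, len, dim) -> (batch, heads, len, dim_per_head)
        b, n, d = x.shape
        return x.view(b, n, self.num_heads, d // self.num_heads).transpose(1, 2)

    def forward(self, value, key, query):
        q = self._split(self.w_q(query))
        k = self._split(self.w_k(key))
        v = self._split(self.w_v(value))
        scores = q @ k.transpose(-2, -1) / math.sqrt(q.size(-1))
        context = torch.softmax(scores, dim=-1) @ v
        b, h, n, d = context.shape
        return self.w_o(context.transpose(1, 2).reshape(b, n, h * d))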
def _decoder_layer(self, d_model, d_k, d_v, n_heads):
    # d_hidden is assumed to be available in the enclosing scope; it is not a parameter here.
    return DecoderLayer(d_model, [
        MultiHeadAttention(d_model, d_k, d_v, n_heads),  # self-attention
        MultiHeadAttention(d_model, d_k, d_v, n_heads),  # encoder-decoder attention
        FeedForwardLayer(d_model, d_hidden)
    ])
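# These factory helpers suggest the layers are stacked elsewhere. A hedged usage
# sketch, stacking n_layers decoder layers into an nn.ModuleList; the helper name
# _build_decoder and its parameters are illustrative assumptions.
def _build_decoder(self, n_layers, d_model, d_k, d_v, n_heads):
    layers = [self._decoder_layer(d_model, d_k, d_v, n_heads) for _ in range(n_layers)]
    return nn.ModuleList(layers)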