class GeneratorN(nn.Module):
    """Generator stage that attends over word embeddings, refines and upsamples.

    The stage is assembled from project-level building blocks (``Residual``,
    ``Attention``, ``upsample_block``, ``self_attn_block``) sized by the
    module-level constants ``D_GF``, ``D_HIDDEN`` and ``RESIDUALS``.
    """

    def __init__(self, use_self_attention=False):
        """Create the sub-modules and print the parameter counts.

        :param use_self_attention: when true, an extra self-attention block is
            created and applied to the attention context in ``forward``
        """
        super().__init__()

        residual_stack = [Residual(D_GF * 2) for _ in range(RESIDUALS)]
        self.residuals = nn.Sequential(*residual_stack)
        self.attn = Attention(D_GF, D_HIDDEN)
        self.upsample = upsample_block(D_GF * 2, D_GF)

        self.use_self_attention = use_self_attention
        if use_self_attention:
            # Only registered on demand, so the attribute is absent otherwise.
            self.self_attn = self_attn_block()

        p_trainable, p_non_trainable = count_params(self)
        print(f'GeneratorN params: trainable {p_trainable} - non_trainable {p_non_trainable}')

    def forward(self, h_code, c_code, word_embs, mask):
        """Run one refinement step.

        Shapes as noted by the original author (not checked by this code):
        ``h_code`` is batch x D_GF x ih x iw, ``word_embs`` is
        batch x D_COND x seq_len, and the attention map is
        batch x sourceL x ih x iw.

        :param h_code: output of the previous generator, used as the query
        :param c_code: ignored on input; it is recomputed from the attention
        :param word_embs: word embeddings used as the attention context
        :param mask: mask handed to the attention module via ``applyMask``
        :return: tuple ``(out_code, att)`` with the upsampled features and the
            attention map returned by the attention module
        """
        self.attn.applyMask(mask)

        # Image-text attention first, image-image attention second.
        context, att = self.attn(h_code, word_embs)
        if self.use_self_attention:
            context = self.self_attn(context)

        # Stack query and context along dim 1 before refining.
        fused = torch.cat((h_code, context), 1)
        refined = self.residuals(fused)
        upsampled = self.upsample(refined)
        return upsampled, att
def __init__(self, vocab_size, pos_size, word_embeddings=None):
    """Build the embedding, LSTM, attention and scoring layers plus the optimizer.

    :param vocab_size: number of word ids; the word embedding table gets
        ``vocab_size + 1`` rows
    :param pos_size: number of POS ids; the POS embedding is a
        ``(pos_size + 1) x (pos_size + 1)`` identity matrix (one-hot lookup)
    :param word_embeddings: optional numpy array used to replace the word
        embedding weights; it is converted to float32
    """
    super(CorefTagger, self).__init__()
    self.vocab_size = vocab_size
    self.pos_size = pos_size

    # Place the explicitly-built weights on the GPU when one is present and
    # fall back to the CPU otherwise, so the model can still be constructed
    # on machines without CUDA.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    self.WordEmbedding = nn.Embedding(self.vocab_size + 1, EMBEDDING_DIM)
    if word_embeddings is not None:
        self.WordEmbedding.weight = nn.Parameter(
            torch.from_numpy(word_embeddings).float().to(device))

    self.WordLSTM = nn.LSTM(EMBEDDING_DIM, 256, num_layers=1,
                            batch_first=True, bidirectional=True)
    # 256 hidden units per direction -> 512 features.
    self.Attention = Attention(256 * 2)

    self.PosEmbedding = nn.Embedding(self.pos_size + 1, self.pos_size + 1)
    self.PosEmbedding.weight = nn.Parameter(
        torch.eye(self.pos_size + 1).float().to(device))
    self.PosLSTM = nn.LSTM(self.pos_size + 1, 16, num_layers=1,
                           batch_first=True, bidirectional=True)
    # 16 hidden units per direction -> 32 features.
    self.AttentionLSTM = Attention(16 * 2)

    self.PairHidden_1 = nn.Linear(2 * (512 + 32) + 2 + 1, 256)
    self.PairHidden_2 = nn.Linear(256, 128)

    self.Context = nn.Linear(128, 128)
    self.Decoder = nn.Linear(256, 64)
    self.Out = nn.Linear(64 * 2, 2)

    # Created last so that it sees every parameter registered above.
    self.optimizer = optim.SGD(self.parameters(), lr=0.01, weight_decay=0)
def __init__(self, params, device):
    """Create the prenet, the two LSTM cells, the attention and the output heads.

    :param params: accepted but not used by this constructor
    :param device: stored on the instance as ``self.device``
    """
    super(Decoder, self).__init__()
    self.device = device

    # Widths shared by several layers below.
    rnn_size = 1024
    context_size = 512
    rnn_plus_context = rnn_size + context_size

    self.prenet = Prenet()
    self.attention_rnn = nn.LSTMCell(256 + context_size, rnn_size)
    self.attention_layer = Attention(rnn_size, context_size, 128, 32, 31)
    self.decoder_rnn = nn.LSTMCell(rnn_plus_context, rnn_size)
    self.linear_projection = nn.Linear(rnn_plus_context, 80)
    self.gate_layer = nn.Linear(rnn_plus_context, 1)
def __init__(self, use_self_attention=False):
    """Create the sub-modules and print the parameter counts.

    :param use_self_attention: when true, an extra self-attention block is
        created and stored as ``self.self_attn``
    """
    super().__init__()

    residual_stack = [Residual(D_GF * 2) for _ in range(RESIDUALS)]
    self.residuals = nn.Sequential(*residual_stack)
    self.attn = Attention(D_GF, D_HIDDEN)
    self.upsample = upsample_block(D_GF * 2, D_GF)

    self.use_self_attention = use_self_attention
    if use_self_attention:
        # Only registered on demand, so the attribute is absent otherwise.
        self.self_attn = self_attn_block()

    p_trainable, p_non_trainable = count_params(self)
    print(f'GeneratorN params: trainable {p_trainable} - non_trainable {p_non_trainable}')
def __init__(self, name, data, path=None):
    """Build the symbolic classifier graph and compile its Theano functions.

    The network is embedding -> LSTM -> attention -> dense -> dropout ->
    softmax dense. Two functions are compiled: ``train_model`` (returns the
    cost and applies the ``adagrad`` updates) and ``acc_model`` (returns the
    number of correct predictions).

    :param name: stored as ``self.name``
    :param data: object providing ``n_voc``, ``n_lab`` and ``ndim``
    :param path: forwarded unchanged to every layer constructor
    """
    self.name = name
    self.data = data
    vocab_size, label_count, width = data.n_voc, data.n_lab, data.ndim

    # Symbolic inputs.
    tokens = T.imatrix('x')
    mask = T.fmatrix('mask')
    labels = T.ivector('y')
    train_flag = T.iscalar('train_flag')

    # Layer stack: every layer consumes the output of the one before it.
    embedding = Embedding('embedding', tokens, vocab_size, width, path)
    lstm = LSTM('lstm', embedding.output, mask, width, width, width, path)
    attention = Attention('attention', lstm.output, T.mean(lstm.output, 0),
                          mask, width, width, width, path)
    hidden = Dense('full_connection', attention.output, width, width, path)
    dropout = Dropout('dropout', hidden.output, 0.5, train_flag, path)
    softmax = Dense('softmax', dropout.output, width, int(label_count), path,
                    activation=T.nnet.softmax)
    self.layers = [embedding, lstm, attention, hidden, dropout, softmax]

    # Cost: negative mean log-probability of the gold label of each row.
    gold_log_probs = T.log(softmax.output)[T.arange(labels.shape[0]), labels]
    self.cost = -T.mean(gold_log_probs, acc_dtype='float32')
    predictions = T.argmax(softmax.output, axis=1)
    correct = T.sum(T.eq(predictions, labels), acc_dtype='int32')

    # Gradients with respect to every parameter of every layer.
    params = [param for layer in self.layers for param in layer.params.values()]
    gparams = T.grad(self.cost, wrt=params)
    updates = adagrad(params, gparams)

    # Both compiled functions take the same four inputs.
    self.train_model = theano.function(
        inputs=[train_flag, tokens, mask, labels],
        outputs=self.cost,
        updates=updates)
    self.acc_model = theano.function(
        inputs=[train_flag, tokens, mask, labels],
        outputs=[correct])

    self.index = {'valid': 1, 'test': 1}
    self.best_valid_acc = 0.0
    self.out_len = 0
def __init__(self, embed_dim, decoder_dim, vocab_size, encoder_dim=2048, dropout=0.5):
    """
    Create the attention network, embedding, LSTM cell and linear heads.

    :param embed_dim: embedding size
    :param decoder_dim: size of decoder's RNN
    :param vocab_size: size of vocabulary
    :param encoder_dim: feature size of encoded images
    :param dropout: dropout probability used for the ``nn.Dropout`` layer
    """
    super(DecoderWithAttention, self).__init__()

    self.encoder_dim = encoder_dim
    self.embed_dim = embed_dim
    self.decoder_dim = decoder_dim
    self.vocab_size = vocab_size

    # Attention network over the encoder features.
    self.attention = Attention(encoder_dim, decoder_dim)
    # Word embedding layer.
    self.embedding = nn.Embedding(vocab_size, embed_dim)
    # ``self.dropout`` ends up holding the module, not the probability.
    self.dropout = nn.Dropout(p=dropout)
    # Decoding cell: consumes the embedding concatenated with encoder features.
    self.decode_step = nn.LSTMCell(embed_dim + encoder_dim, decoder_dim, bias=True)
    # Linear layers producing the initial hidden and cell states of the LSTMCell.
    self.init_h = nn.Linear(encoder_dim, decoder_dim)
    self.init_c = nn.Linear(encoder_dim, decoder_dim)
    # Linear layer feeding a sigmoid-activated gate.
    self.f_beta = nn.Linear(decoder_dim, encoder_dim)
    self.sigmoid = nn.Sigmoid()
    # Linear layer producing scores over the vocabulary.
    self.fc = nn.Linear(decoder_dim, vocab_size)

    self.init_weights()
def __init__(self, h_dim, c_num):
    """Create the attention module and the linear output layer.

    :param h_dim: size passed to ``Attention`` and input size of the linear layer
    :param c_num: output size of the linear layer
    """
    super(AttnRegressor, self).__init__()
    self.attn = Attention(h_dim)
    self.main = nn.Linear(in_features=h_dim, out_features=c_num)
def get_model(self, embedded_sequences_1, embedded_sequences_2, sequences_1_length, sequences_2_length):
    """Build the sentence-pair feature tensor for ``self.model_style``.

    Both embedded sequences go through the same encoder. For every style
    except ``'multi_attention'`` the two encodings are combined as
    ``concatenate([SubtractAbs()([x, y]), multiply([x, y])])``.

    :param embedded_sequences_1: embedded first sequence
    :param embedded_sequences_2: embedded second sequence
    :param sequences_1_length: unused, kept for interface compatibility
    :param sequences_2_length: unused, kept for interface compatibility
    :return: the combined feature tensor
    :raises ValueError: if ``self.model_style`` is not a supported style
    """
    style = self.model_style
    supported_styles = (
        'bi_lstm', 'ap_bi_lstm', 'bi_gru', 'ap_bi_gru',
        'cnn', 'ap_cnn', 'multi_attention', 'bi_gru_multi_attention',
    )
    if style not in supported_styles:
        # Fail with a clear error rather than trying to call a missing encoder.
        print("did not find this style model")
        raise ValueError('unknown model_style: {!r}'.format(style))

    num_nn = Application.model_params['num_nn']

    def pair_features(left, right):
        # Interaction features shared by all pairwise styles.
        return concatenate([SubtractAbs()([left, right]), multiply([left, right])])

    if style in ('bi_lstm', 'bi_gru'):
        print('using model {}!!!'.format(style))
        cell = LSTM if style == 'bi_lstm' else GRU
        encoder = Bidirectional(cell(num_nn))
        x_1 = encoder(embedded_sequences_1)
        y_1 = encoder(embedded_sequences_2)
        return pair_features(x_1, y_1)

    if style in ('ap_bi_lstm', 'ap_bi_gru'):
        print('using model {}!!!'.format(style))
        cell = LSTM if style == 'ap_bi_lstm' else GRU
        encoder = Bidirectional(cell(num_nn, return_sequences=True))
        pooling = Attention()
        x_1 = encoder(embedded_sequences_1)
        x_1 = pooling(x_1)
        y_1 = encoder(embedded_sequences_2)
        y_1 = pooling(y_1)
        return pair_features(x_1, y_1)

    if style == 'cnn':
        encoder = Conv1D(num_nn, 4, padding='valid', activation='relu', strides=1)
        x_1 = encoder(embedded_sequences_1)
        y_1 = encoder(embedded_sequences_2)
        x_1 = GlobalMaxPooling1D()(x_1)
        y_1 = GlobalMaxPooling1D()(y_1)
        return pair_features(x_1, y_1)

    if style == 'ap_cnn':
        encoder = Conv1D(num_nn, 2, padding='valid', activation='relu', strides=1)
        pooling = Attention()
        x_1 = encoder(embedded_sequences_1)
        y_1 = encoder(embedded_sequences_2)
        x_1 = pooling(x_1)
        y_1 = pooling(y_1)
        return pair_features(x_1, y_1)

    if style == 'multi_attention':
        print('using model multi_attention!!!')
        # Each input gets its own Dense projection (weights are not shared).
        x_1_1 = Dense(num_nn)(embedded_sequences_1)
        y_1_1 = Dense(num_nn)(embedded_sequences_2)

        x_2, y_2 = multi_head_self_attention(x_1_1, y_1_1)
        x_3, y_3 = multi_head_mutual_attention(x_1_1, y_1_1)

        # Join the two sides along axis 2 first, then max-pool.
        z_2 = concatenate([x_2, y_2], axis=2)
        z_2 = GlobalMaxPooling1D()(z_2)
        z_3 = concatenate([x_3, y_3], axis=2)
        z_3 = GlobalMaxPooling1D()(z_3)
        return concatenate([z_2, z_3])

    # Only 'bi_gru_multi_attention' remains at this point.
    head = Application.model_params['head']
    encoder = Bidirectional(GRU(num_nn, return_sequences=True))
    attention = MultiHeadAttention(head, int(num_nn / head))
    x_1 = encoder(embedded_sequences_1)
    y_1 = encoder(embedded_sequences_2)
    # Self-attention: the same tensor is passed three times.
    x_2 = attention([x_1, x_1, x_1])
    y_2 = attention([y_1, y_1, y_1])
    x_3 = GlobalMaxPooling1D()(x_2)
    y_3 = GlobalMaxPooling1D()(y_2)
    return pair_features(x_3, y_3)
def __init__(self, vocab_size, embedding_size, encoder_hidden, bidirectional, decoder_hidden, n_layers, dropout=None, attention_mode="general", input_feeding=False, normalize=False):
    """
    Build the embedding, LSTM, attention modules and output heads.

    :param vocab_size: size of decoder vocabulary
    :param embedding_size: dimension of word embedding
    :param encoder_hidden: dimension of hidden state of encoder LSTM
    :param bidirectional: whether to use bidirectional LSTM
    :param decoder_hidden: dimension of hidden state of decoder LSTM
    :param n_layers: number of layers of decoder LSTM network
    :param dropout: dropout rate; required when ``n_layers > 1`` (it is then
        also applied between LSTM layers). ``None`` means no dropout.
    :param attention_mode: attention_mode to choose (dot, general, or concat)
    :param input_feeding: whether to use input_feeding
    :param normalize: whether to normalize encoder_decoder attention over time
        steps, set this parameter True if you want to mitigate repetition
    :raises ValueError: if ``n_layers > 1`` and ``dropout`` is ``None``
    """
    super(CopyDecoder, self).__init__()
    self.pad_token = PAD
    self.vocab_size = vocab_size
    self.input_feeding = input_feeding
    # Third positional argument of nn.Embedding is padding_idx.
    self.embedding = nn.Embedding(vocab_size, embedding_size, self.pad_token)
    self.num_directions = 2 if bidirectional else 1
    self.hidden_dim = decoder_hidden
    self.decoder_hidden = decoder_hidden

    self.rnn = nn.LSTM(input_size=embedding_size,
                       hidden_size=decoder_hidden,
                       num_layers=n_layers,
                       bidirectional=bidirectional,
                       batch_first=True)
    if n_layers > 1:
        # An explicit raise survives ``python -O``, unlike an assert.
        if dropout is None:
            raise ValueError("dropout must be provided when n_layers > 1")
        self.rnn.dropout = dropout

    self.enc_dec_attn = Attention(encoder_hidden, decoder_hidden,
                                  method=attention_mode, scale=True,
                                  normalize=normalize)
    self.dec_self_attn = Attention(decoder_hidden, decoder_hidden,
                                   method="dot", scale=True, normalize=False)

    self.softmax = nn.LogSoftmax(dim=-1)
    self.tanh = nn.Tanh()
    # nn.Dropout cannot take None, so a missing rate means "no dropout".
    self.dropout_layer = nn.Dropout(0.0 if dropout is None else dropout)

    self.decoder2vocab = nn.Linear(decoder_hidden * 3, self.vocab_size)
    self.copy_switch = nn.Sequential(nn.Linear(self.hidden_dim * 3, 1),
                                     nn.Sigmoid())

    if self.input_feeding:
        self.dec_input_bridge = nn.Linear(
            decoder_hidden * 2 + embedding_size, embedding_size)
class Decoder(nn.Module):
    """Autoregressive mel-spectrogram decoder with attention over encoder outputs.

    Per-utterance decoding state (hidden/cell states, attention context and
    weights, memory and mask) is kept on the instance: it is reset by
    ``initialize_decoder_states`` and advanced by ``decode``.
    """

    def __init__(self, params, device):
        """Create the prenet, the two LSTM cells, the attention and the heads.

        :param params: accepted but not used by this constructor
        :param device: device on which the state tensors are allocated
        """
        super(Decoder, self).__init__()
        self.device = device

        self.prenet = Prenet()
        self.attention_rnn = nn.LSTMCell(256 + 512, 1024)
        self.attention_layer = Attention(1024, 512, 128, 32, 31)
        self.decoder_rnn = nn.LSTMCell(1024 + 512, 1024)
        self.linear_projection = nn.Linear(1024 + 512, 80)
        self.gate_layer = nn.Linear(1024 + 512, 1)

    def decode(self, decoder_input):
        '''
        Decoder main part for mel spectrogram's one frame

        Updates the state stored on ``self`` as a side effect.

        :param decoder_input: previous mel output after prenet (B, 256)
        :return: decoder_output, gate_output, attention_weights
        '''
        # concatenated prev mel and attention_context vector (B, 256 + 512)
        cell_input = torch.cat((decoder_input, self.attention_context), -1)

        # first LSTMCell with hidden_size 1024
        self.attention_hidden, self.attention_cell = self.attention_rnn(
            cell_input, (self.attention_hidden, self.attention_cell))
        # (B, 1024); passing self.training switches dropout off in eval mode
        self.attention_hidden = F.dropout(
            self.attention_hidden, 0.1, self.training)

        # The attention layer receives the previous and the cumulative weights.
        attention_weights_cat = torch.cat(
            (self.attention_weights.unsqueeze(1),
             self.attention_weights_cum.unsqueeze(1)), dim=1)
        self.attention_context, self.attention_weights = self.attention_layer(
            self.attention_hidden, self.memory, self.processed_memory,
            attention_weights_cat, self.mask)
        self.attention_weights_cum += self.attention_weights

        # (B, 1024 + 512)
        decoder_input = torch.cat(
            (self.attention_hidden, self.attention_context), -1)

        # Second LSTMCell with hidden_size 1024
        self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
            decoder_input, (self.decoder_hidden, self.decoder_cell))
        # (B, 1024); dropout again only while training
        self.decoder_hidden = F.dropout(
            self.decoder_hidden, 0.1, self.training)

        # (B, 1024 + 512)
        decoder_hidden_attention_context = torch.cat(
            (self.decoder_hidden, self.attention_context), dim=1)

        # linear layer for mel prediction (B, 80)
        decoder_output = self.linear_projection(
            decoder_hidden_attention_context)

        # binary classifier for stop token (B, 1)
        gate_prediction = torch.sigmoid(
            self.gate_layer(decoder_hidden_attention_context))
        return decoder_output, gate_prediction, self.attention_weights

    def initialize_decoder_states(self, memory, mask):
        """Reset all per-utterance decoding state stored on the instance.

        :param memory: encoder outputs; dim 0 is read as the batch size and
            dim 1 as the number of encoder frames
        :param mask: stored and later passed to the attention layer
        """
        batch_size = memory.size(0)
        num_frames = memory.size(1)

        self.mask = mask
        self.memory = memory
        self.processed_memory = self.attention_layer.memory(memory)

        self.attention_context = torch.zeros((batch_size, 512)).to(self.device)

        self.attention_hidden = torch.zeros((batch_size, 1024)).to(self.device)
        self.attention_cell = torch.zeros((batch_size, 1024)).to(self.device)

        self.decoder_hidden = torch.zeros((batch_size, 1024)).to(self.device)
        self.decoder_cell = torch.zeros((batch_size, 1024)).to(self.device)

        self.attention_weights = torch.zeros(
            (batch_size, num_frames), requires_grad=True).to(self.device)
        self.attention_weights_cum = torch.zeros(
            (batch_size, num_frames)).to(self.device)

    def forward(self, memory, decoder_inputs, memory_lengths):
        """
        Teacher-forced decoding of a whole batch.

        :param memory: encoder outputs (B, T, 512)
        :param decoder_inputs: mel from previous step (B, num_mels, T)
        :param memory_lengths: (B, )
        :return: mel_outputs, gate_outputs, alignments
        """
        # start mel frame with zeros (1, B, num_mels)
        go_frame = torch.zeros((1, memory.size(0), 80)).to(self.device)
        # (B, num_mels, T) -> (T, B, num_mels)
        decoder_inputs = decoder_inputs.permute(2, 0, 1)
        # (T + 1, B, num_mels)
        decoder_inputs = torch.cat((go_frame, decoder_inputs), dim=0)
        decoder_inputs = self.prenet(decoder_inputs)  # (T + 1, B, 256)

        self.initialize_decoder_states(
            memory, mask=~get_mask_from_lengths(memory_lengths, self.device))

        mel_outputs, gate_outputs, alignments = [], [], []
        # we don't need last frame for prediction
        for step in range(decoder_inputs.size(0) - 1):
            mel_output, gate_output, attention_weights = self.decode(
                decoder_inputs[step])
            mel_outputs += [mel_output.squeeze(1)]
            # Squeeze only dim 1: a bare squeeze() would also drop the batch
            # axis when B == 1 and break the transpose below.
            gate_outputs += [gate_output.squeeze(1)]
            alignments += [attention_weights]

        alignments = torch.stack(alignments).transpose(0, 1)
        gate_outputs = torch.stack(gate_outputs).transpose(0, 1)
        mel_outputs = torch.stack(mel_outputs).transpose(0, 1)
        mel_outputs = mel_outputs.view(mel_outputs.size(0), -1, 80)
        # (B, T_out, num_mels) -> (B, num_mels, T_out)
        mel_outputs = mel_outputs.transpose(1, 2)
        return mel_outputs, gate_outputs, alignments

    def inference(self, memory, max_decoder_steps=1000, gate_threshold=0.6):
        """
        Free-running decoding that feeds each predicted frame back in.

        The stop test uses ``gate_output.item()``, so this only works when
        the gate holds a single element (batch size 1).

        :param memory: encoder outputs (B, T, 512)
        :param max_decoder_steps: upper bound on the number of generated frames
        :param gate_threshold: decoding stops once the gate exceeds this value
        :return: mel_outputs, gate_outputs, alignments
        """
        # all-zero start frame (B, num_mels)
        decoder_input = torch.zeros((memory.size(0), 80)).to(self.device)

        # NOTE(review): the mask here has the shape of ``memory`` and is all
        # ones, unlike the mask built in ``forward`` -- confirm this is what
        # the attention layer expects.
        self.initialize_decoder_states(memory, mask=torch.ones_like(memory))

        mel_outputs, gate_outputs, alignments = [], [], []
        for _ in range(max_decoder_steps):
            decoder_input = self.prenet(decoder_input)
            mel_output, gate_output, attention_weights = self.decode(
                decoder_input)
            mel_outputs += [mel_output.squeeze(1)]
            gate_outputs += [gate_output.squeeze(0)]
            alignments += [attention_weights]
            if gate_output.item() > gate_threshold:
                break
            decoder_input = mel_output

        alignments = torch.stack(alignments).transpose(0, 1)
        gate_outputs = torch.stack(gate_outputs).transpose(0, 1)
        mel_outputs = torch.stack(mel_outputs).transpose(0, 1)
        mel_outputs = mel_outputs.view(mel_outputs.size(0), -1, 80)
        # (B, T_out, num_mels) -> (B, num_mels, T_out)
        mel_outputs = mel_outputs.transpose(1, 2)
        return mel_outputs, gate_outputs, alignments