Beispiel #1
0
class GeneratorN(nn.Module):
    """Generator stage: word attention, optional self-attention, residuals, upsampling.

    The stage attends from the incoming image features to the word embeddings,
    optionally refines the attended context with a self-attention block,
    concatenates that context with the image features along dim 1, and runs
    the result through the residual stack and the upsampling block.
    """

    def __init__(self, use_self_attention=False):
        """Create the sub-modules and print the parameter counts.

        :param use_self_attention: when true, a block from ``self_attn_block()``
            is created and applied to the attended context in ``forward``.
        """
        super().__init__()

        residual_stack = [Residual(D_GF * 2) for _ in range(RESIDUALS)]
        self.residuals = nn.Sequential(*residual_stack)
        self.attn = Attention(D_GF, D_HIDDEN)
        self.upsample = upsample_block(D_GF * 2, D_GF)

        self.use_self_attention = use_self_attention
        if use_self_attention:
            self.self_attn = self_attn_block()

        n_trainable, n_frozen = count_params(self)
        print(f'GeneratorN params: trainable {n_trainable} - non_trainable {n_frozen}')

    def forward(self, h_code, c_code, word_embs, mask):
        """Run one generator stage.

        Shapes below are taken from the original author's notes:

        :param h_code: query, output of the previous generator,
            batch x D_GF x ih x iw (queryL = ih x iw)
        :param c_code: accepted for interface compatibility; the incoming
            value is not read, it is replaced by the attention output
        :param word_embs: context, batch x D_COND x seq_len
        :param mask: forwarded to ``self.attn.applyMask`` before attending
        :return: ``(out_code, att)``: the upsampled features and the
            attention map (batch x sourceL x ih x iw)
        """
        self.attn.applyMask(mask)
        attended, att = self.attn(h_code, word_embs)

        # Image-text attention comes first, image-image attention second.
        if self.use_self_attention:
            attended = self.self_attn(attended)

        fused = torch.cat((h_code, attended), 1)
        upsampled = self.upsample(self.residuals(fused))
        return upsampled, att
Beispiel #2
0
    def __init__(self, vocab_size, pos_size, word_embeddings=None):
        """Build the embedding, LSTM, attention and scoring layers plus the optimizer.

        :param vocab_size: number of word ids; the word embedding table gets
            ``vocab_size + 1`` rows
        :param pos_size: number of POS tag ids; the POS embedding is a
            ``(pos_size + 1)``-dimensional identity (one-hot) table
        :param word_embeddings: optional NumPy array used as the initial word
            embedding weights (converted to float32)
        """
        super(CorefTagger, self).__init__()
        self.vocab_size = vocab_size
        self.pos_size = pos_size

        # Pre-initialised weights used to be forced onto the GPU via
        # torch.cuda.FloatTensor, which crashes on CPU-only hosts. Use CUDA
        # when it is available (same placement as before) and CPU otherwise.
        weight_device = torch.device(
            'cuda' if torch.cuda.is_available() else 'cpu')

        self.WordEmbedding = nn.Embedding(self.vocab_size + 1, EMBEDDING_DIM)
        if word_embeddings is not None:
            self.WordEmbedding.weight = nn.Parameter(
                torch.from_numpy(word_embeddings).float().to(weight_device))
        self.WordLSTM = nn.LSTM(EMBEDDING_DIM,
                                256,
                                num_layers=1,
                                batch_first=True,
                                bidirectional=True)
        # 256 * 2: forward and backward states of the bidirectional word LSTM.
        self.Attention = Attention(256 * 2)

        # POS tags are embedded as one-hot rows. The table is wrapped in
        # nn.Parameter, so it is still trainable.
        self.PosEmbedding = nn.Embedding(self.pos_size + 1, self.pos_size + 1)
        self.PosEmbedding.weight = nn.Parameter(
            torch.eye(self.pos_size + 1).float().to(weight_device))
        self.PosLSTM = nn.LSTM(self.pos_size + 1,
                               16,
                               num_layers=1,
                               batch_first=True,
                               bidirectional=True)
        self.AttentionLSTM = Attention(16 * 2)

        # presumably two mentions, each (512 word + 32 POS) features, plus
        # 2 + 1 extra pair features -- TODO confirm against forward()
        self.PairHidden_1 = nn.Linear(2 * (512 + 32) + 2 + 1, 256)
        self.PairHidden_2 = nn.Linear(256, 128)
        self.Context = nn.Linear(128, 128)
        self.Decoder = nn.Linear(256, 64)
        # self.Harmonize = nn.Linear(64 * 3, 8)
        self.Out = nn.Linear(64 * 2, 2)

        # Created last so that it sees every parameter registered above.
        self.optimizer = optim.SGD(self.parameters(), lr=0.01, weight_decay=0)
Beispiel #3
0
 def __init__(self, params, device):
     """Create the decoder's sub-modules.

     :param params: accepted for interface compatibility; not read here
     :param device: stored on the instance as ``self.device``
     """
     super(Decoder, self).__init__()
     self.device = device

     # Layer widths shared by several sub-modules below.
     frame_size, context_size, hidden_size = 256, 512, 1024
     joint_size = hidden_size + context_size

     self.prenet = Prenet()
     self.attention_rnn = nn.LSTMCell(frame_size + context_size, hidden_size)
     self.attention_layer = Attention(hidden_size, context_size, 128, 32, 31)
     self.decoder_rnn = nn.LSTMCell(joint_size, hidden_size)
     # 80 outputs for the projection, 1 output for the gate.
     self.linear_projection = nn.Linear(joint_size, 80)
     self.gate_layer = nn.Linear(joint_size, 1)
Beispiel #4
0
    def __init__(self, use_self_attention=False):
        """Create the sub-modules and print the parameter counts.

        :param use_self_attention: when true, an extra block from
            ``self_attn_block()`` is created and stored as ``self.self_attn``
        """
        super().__init__()

        residual_stack = [Residual(D_GF * 2) for _ in range(RESIDUALS)]
        self.residuals = nn.Sequential(*residual_stack)
        self.attn = Attention(D_GF, D_HIDDEN)
        self.upsample = upsample_block(D_GF * 2, D_GF)

        self.use_self_attention = use_self_attention
        if use_self_attention:
            self.self_attn = self_attn_block()

        # Report the model size once all sub-modules are registered.
        n_trainable, n_frozen = count_params(self)
        print(f'GeneratorN params: trainable {n_trainable} - non_trainable {n_frozen}')
Beispiel #5
0
    def __init__(self, name, data, path=None):
        """Build the symbolic graph and compile the train / accuracy functions.

        Layer stack: embedding -> LSTM -> attention -> dense -> dropout ->
        softmax dense.

        :param name: stored as ``self.name``
        :param data: dataset object; ``n_voc``, ``n_lab`` and ``ndim`` are read
            from it
        :param path: forwarded unchanged to every layer constructor
        """
        self.name = name
        self.data = data

        vocab_size, label_count, width = data.n_voc, data.n_lab, data.ndim

        # Symbolic inputs.
        x = T.imatrix('x')
        m = T.fmatrix('mask')
        y = T.ivector('y')
        is_train = T.iscalar('train_flag')

        # Layer stack; every layer consumes the previous layer's output.
        embedding = Embedding('embedding', x, vocab_size, width, path)
        recurrent = LSTM('lstm', embedding.output, m, width, width, width, path)
        attention = Attention('attention', recurrent.output, T.mean(recurrent.output, 0),
                              m, width, width, width, path)
        hidden = Dense('full_connection', attention.output, width, width, path)
        dropped = Dropout('dropout', hidden.output, 0.5, is_train, path)
        classifier = Dense('softmax', dropped.output, width, int(label_count), path,
                           activation=T.nnet.softmax)
        self.layers = [embedding, recurrent, attention, hidden, dropped, classifier]

        # Cost: mean negative log of the output at the gold label of each row.
        probabilities = classifier.output
        row_index = T.arange(y.shape[0])
        self.cost = -T.mean(T.log(probabilities)[row_index, y], acc_dtype='float32')
        # Number of rows whose argmax equals the gold label.
        correct = T.sum(T.eq(T.argmax(probabilities, axis=1), y), acc_dtype='int32')

        # Gradients for every layer parameter, in layer order.
        params = [p for layer in self.layers for p in layer.params.values()]
        gparams = T.grad(self.cost, wrt=params)
        updates = adagrad(params, gparams)

        # Compiled functions: one applies the updates, one only counts hits.
        graph_inputs = [is_train, x, m, y]
        self.train_model = theano.function(
            inputs=graph_inputs,
            outputs=self.cost,
            updates=updates)
        self.acc_model = theano.function(
            inputs=graph_inputs,
            outputs=[correct])

        self.index = {'valid': 1, 'test': 1}
        self.best_valid_acc = 0.0
        self.out_len = 0
    def __init__(self,
                 embed_dim,
                 decoder_dim,
                 vocab_size,
                 encoder_dim=2048,
                 dropout=0.5):
        """
        Create the layers of the attention decoder and initialise their weights.

        :param embed_dim: embedding size
        :param decoder_dim: size of decoder's RNN
        :param vocab_size: size of vocabulary
        :param encoder_dim: feature size of encoded images
        :param dropout: dropout probability; ``self.dropout`` ends up holding
            the ``nn.Dropout`` module built from it
        """
        super(DecoderWithAttention, self).__init__()

        # Remember the layer sizes.
        self.encoder_dim = encoder_dim
        self.embed_dim = embed_dim
        self.decoder_dim = decoder_dim
        self.vocab_size = vocab_size

        # Attention network.
        self.attention = Attention(encoder_dim, decoder_dim)

        # Embedding layer and dropout.
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.dropout = nn.Dropout(p=dropout)

        # Decoding LSTMCell; its input is embed_dim + encoder_dim wide.
        step_input_dim = embed_dim + encoder_dim
        self.decode_step = nn.LSTMCell(step_input_dim, decoder_dim, bias=True)

        # Linear layers that produce the initial hidden and cell state of the
        # LSTMCell.
        self.init_h = nn.Linear(encoder_dim, decoder_dim)
        self.init_c = nn.Linear(encoder_dim, decoder_dim)

        # Linear layer used to create a sigmoid-activated gate.
        self.f_beta = nn.Linear(decoder_dim, encoder_dim)
        self.sigmoid = nn.Sigmoid()

        # Linear layer that produces scores over the vocabulary.
        self.fc = nn.Linear(decoder_dim, vocab_size)

        self.init_weights()
 def __init__(self, h_dim, c_num):
     """Create the attention module and the linear output layer.

     :param h_dim: passed to ``Attention`` and used as the input width of
         the linear layer
     :param c_num: output width of the linear layer
     """
     super(AttnRegressor, self).__init__()
     self.attn = Attention(h_dim)
     self.main = nn.Linear(in_features=h_dim, out_features=c_num)
Beispiel #8
0
    def get_model(self, embedded_sequences_1, embedded_sequences_2, sequences_1_length, sequences_2_length):
        """Encode both embedded sequences with shared layers and merge them.

        The encoder is selected by ``self.model_style``. Every style applies
        the same layer instances to both inputs. All styles except
        ``'multi_attention'`` return ``concat(|x - y|, x * y)`` of the two
        encodings; ``'multi_attention'`` returns the concatenation of the
        max-pooled self-attention and mutual-attention features.

        :param embedded_sequences_1: embedded first sequence
        :param embedded_sequences_2: embedded second sequence
        :param sequences_1_length: unused, kept for interface compatibility
        :param sequences_2_length: unused, kept for interface compatibility
        :return: merged feature tensor for the sequence pair
        :raises ValueError: if ``self.model_style`` is not a supported style
            (previously this surfaced as ``TypeError`` from calling ``None``)
        """
        style = self.model_style
        supported_styles = ('bi_lstm', 'ap_bi_lstm', 'bi_gru', 'ap_bi_gru',
                            'cnn', 'ap_cnn', 'multi_attention',
                            'bi_gru_multi_attention')
        if style not in supported_styles:
            print("did not find this style model")
            raise ValueError('unsupported model_style: %r' % (style,))

        num_nn = Application.model_params['num_nn']

        def merge_pair(left, right):
            """Return concat(|left - right|, left * right)."""
            return concatenate([SubtractAbs()([left, right]), multiply([left, right])])

        if style in ('bi_lstm', 'bi_gru'):
            print('using model %s!!!' % style)
            cell = LSTM if style == 'bi_lstm' else GRU
            encoder = Bidirectional(cell(num_nn))
            x_1 = encoder(embedded_sequences_1)
            y_1 = encoder(embedded_sequences_2)
            return merge_pair(x_1, y_1)

        if style in ('ap_bi_lstm', 'ap_bi_gru'):
            print('using model %s!!!' % style)
            cell = LSTM if style == 'ap_bi_lstm' else GRU
            encoder = Bidirectional(cell(num_nn, return_sequences=True))
            pooling = Attention()
            x_1 = pooling(encoder(embedded_sequences_1))
            y_1 = pooling(encoder(embedded_sequences_2))
            return merge_pair(x_1, y_1)

        if style == 'cnn':
            encoder = Conv1D(num_nn, 4,
                             padding='valid', activation='relu', strides=1)
            x_1 = encoder(embedded_sequences_1)
            y_1 = encoder(embedded_sequences_2)
            x_1 = GlobalMaxPooling1D()(x_1)
            y_1 = GlobalMaxPooling1D()(y_1)
            return merge_pair(x_1, y_1)

        if style == 'ap_cnn':
            encoder = Conv1D(num_nn, 2,
                             padding='valid', activation='relu', strides=1)
            pooling = Attention()
            x_1 = encoder(embedded_sequences_1)
            y_1 = encoder(embedded_sequences_2)
            x_1 = pooling(x_1)
            y_1 = pooling(y_1)
            return merge_pair(x_1, y_1)

        if style == 'multi_attention':
            print('using model multi_attention!!!')
            # The two Dense projections are separate layers, not shared.
            x_1_1 = Dense(num_nn)(embedded_sequences_1)
            y_1_1 = Dense(num_nn)(embedded_sequences_2)

            x_2, y_2 = multi_head_self_attention(x_1_1, y_1_1)
            x_3, y_3 = multi_head_mutual_attention(x_1_1, y_1_1)

            # Join both sides on the feature axis, then max-pool.
            z_2 = concatenate([x_2, y_2], axis=2)
            z_2 = GlobalMaxPooling1D()(z_2)
            z_3 = concatenate([x_3, y_3], axis=2)
            z_3 = GlobalMaxPooling1D()(z_3)
            return concatenate([z_2, z_3])

        # Only 'bi_gru_multi_attention' remains after the validation above.
        head = Application.model_params['head']
        encoder = Bidirectional(GRU(num_nn, return_sequences=True))
        attention = MultiHeadAttention(head, int(num_nn / head))
        x_1 = encoder(embedded_sequences_1)
        y_1 = encoder(embedded_sequences_2)
        x_2 = attention([x_1, x_1, x_1])
        y_2 = attention([y_1, y_1, y_1])
        x_3 = GlobalMaxPooling1D()(x_2)
        y_3 = GlobalMaxPooling1D()(y_2)
        return merge_pair(x_3, y_3)
Beispiel #9
0
    def __init__(self,
                 vocab_size,
                 embedding_size,
                 encoder_hidden,
                 bidirectional,
                 decoder_hidden,
                 n_layers,
                 dropout=None,
                 attention_mode="general",
                 input_feeding=False,
                 normalize=False):
        """
        Build the embedding, LSTM, attention and output layers of the decoder.

        :param vocab_size: size of decoder vocabulary
        :param embedding_size: dimension of word embedding
        :param encoder_hidden: dimension of hidden state of encoder LSTM
        :param bidirectional: whether to use bidirectional LSTM
        :param decoder_hidden: dimension of hidden state of decoder LSTM
        :param n_layers: number of layers of decoder LSTM network
        :param dropout: dropout rate; applied between LSTM layers only when
        ``n_layers > 1`` (it is then required) and used by the standalone
        dropout layer. ``None`` means no dropout.
        :param attention_mode: attention_mode to choose(dot, general, or concat)
        :param input_feeding: whether to use input_feeding
        :param normalize: whether to normalize encoder_decoder attention over
        time steps, set this parameter True if you want to mitigate repetition
        """

        super(CopyDecoder, self).__init__()
        self.pad_token = PAD
        self.vocab_size = vocab_size
        self.input_feeding = input_feeding
        self.embedding = nn.Embedding(vocab_size, embedding_size,
                                      padding_idx=self.pad_token)

        self.num_directions = 2 if bidirectional else 1
        self.hidden_dim = decoder_hidden
        self.decoder_hidden = decoder_hidden

        # Inter-layer dropout only exists for stacked LSTMs. Pass it through
        # the constructor rather than patching the attribute afterwards.
        if n_layers > 1:
            assert dropout is not None, "dropout is required when n_layers > 1"
            rnn_dropout = dropout
        else:
            rnn_dropout = 0.0
        self.rnn = nn.LSTM(input_size=embedding_size,
                           hidden_size=decoder_hidden,
                           num_layers=n_layers,
                           bidirectional=bidirectional,
                           batch_first=True,
                           dropout=rnn_dropout)

        self.enc_dec_attn = Attention(encoder_hidden,
                                      decoder_hidden,
                                      method=attention_mode,
                                      scale=True,
                                      normalize=normalize)

        self.dec_self_attn = Attention(decoder_hidden,
                                       decoder_hidden,
                                       method="dot",
                                       scale=True,
                                       normalize=False)

        self.softmax = nn.LogSoftmax(dim=-1)
        self.tanh = nn.Tanh()
        # nn.Dropout(None) raises, so the default dropout=None used to make
        # the constructor fail; treat None as "no dropout".
        self.dropout_layer = nn.Dropout(0.0 if dropout is None else dropout)

        self.decoder2vocab = nn.Linear(decoder_hidden * 3, self.vocab_size)
        self.copy_switch = nn.Sequential(nn.Linear(self.hidden_dim * 3, 1),
                                         nn.Sigmoid())

        if self.input_feeding:
            self.dec_input_bridge = nn.Linear(
                decoder_hidden * 2 + embedding_size, embedding_size)
Beispiel #10
0
class Decoder(nn.Module):
    """Attention decoder that emits one 80-bin mel frame and one stop probability per step.

    The per-utterance recurrent state (LSTM hidden/cell states, attention
    context, attention weights and their running sum) is stored on the
    instance by ``initialize_decoder_states`` and updated by ``decode``.
    """

    def __init__(self, params, device):
        """Create the sub-modules.

        :param params: accepted for interface compatibility; not read here
        :param device: device on which the state tensors are allocated
        """
        super(Decoder, self).__init__()
        self.device = device
        self.prenet = Prenet()
        self.attention_rnn = nn.LSTMCell(256 + 512, 1024)
        self.attention_layer = Attention(1024, 512, 128, 32, 31)
        self.decoder_rnn = nn.LSTMCell(1024 + 512, 1024)
        self.linear_projection = nn.Linear(1024 + 512, 80)
        self.gate_layer = nn.Linear(1024 + 512, 1)

    def decode(self, decoder_input):
        '''
        Decoder main part for mel spectrogram's one frame.

        Dropout on the two LSTM hidden states follows ``self.training``, so
        it is disabled after ``model.eval()``.

        :param decoder_input: previous mel output after prenet (B, 256)
        :return: decoder_output, gate_output, attention_weights
        '''

        # concatenated prev mel and attention_context vector (B, 256 + 512)
        cell_input = torch.cat((decoder_input, self.attention_context), -1)

        # first LSTMCell with hidden_size 1024
        self.attention_hidden, self.attention_cell = self.attention_rnn(
            cell_input, (self.attention_hidden, self.attention_cell))

        # (B, 1024); F.dropout defaults to training=True, so the module's
        # mode has to be passed explicitly.
        self.attention_hidden = F.dropout(
            self.attention_hidden, 0.1, self.training)

        # previous and cumulative attention weights stacked on dim 1
        attention_weights_cat = torch.cat(
            (self.attention_weights.unsqueeze(1),
             self.attention_weights_cum.unsqueeze(1)),
            dim=1)

        self.attention_context, self.attention_weights = self.attention_layer(
            self.attention_hidden, self.memory, self.processed_memory,
            attention_weights_cat, self.mask)
        self.attention_weights_cum += self.attention_weights

        # (B, 1024 + 512)
        decoder_input = torch.cat(
            (self.attention_hidden, self.attention_context), -1)

        # Second LSTMCell with hidden_size 1024
        self.decoder_hidden, self.decoder_cell = self.decoder_rnn(
            decoder_input, (self.decoder_hidden, self.decoder_cell))

        # (B, 1024)
        self.decoder_hidden = F.dropout(
            self.decoder_hidden, 0.1, self.training)

        # (B, 1024 + 512)
        decoder_hidden_attention_context = torch.cat(
            (self.decoder_hidden, self.attention_context), dim=1)

        # linear layer for mel prediction (B, 80)
        decoder_output = self.linear_projection(
            decoder_hidden_attention_context)

        # binary classifier for stop token (B, 1)
        gate_prediction = torch.sigmoid(
            self.gate_layer(decoder_hidden_attention_context))

        return decoder_output, gate_prediction, self.attention_weights

    def initialize_decoder_states(self, memory, mask):
        """Reset all per-utterance state before decoding a new batch.

        :param memory: encoder outputs; dim 0 is the batch size and dim 1 the
            number of encoder frames
        :param mask: stored and forwarded to the attention layer on every step
        """
        batch_size = memory.size(0)
        num_frames = memory.size(1)
        self.mask = mask
        self.memory = memory
        self.processed_memory = self.attention_layer.memory(memory)
        self.attention_context = torch.zeros((batch_size, 512)).to(self.device)
        self.attention_hidden = torch.zeros((batch_size, 1024)).to(self.device)
        self.attention_cell = torch.zeros((batch_size, 1024)).to(self.device)
        self.decoder_hidden = torch.zeros((batch_size, 1024)).to(self.device)
        self.decoder_cell = torch.zeros((batch_size, 1024)).to(self.device)
        self.attention_weights = torch.zeros(
            (batch_size, num_frames), requires_grad=True).to(self.device)
        self.attention_weights_cum = torch.zeros(
            (batch_size, num_frames)).to(self.device)

    def forward(self, memory, decoder_inputs, memory_lengths):
        """
        Teacher-forced decoding over the ground-truth mel frames.

        :param memory: encoder outputs (B, T, 512)
        :param decoder_inputs: mel from previous step (B, num_mels, T)
        :param memory_lengths: (B, )
        :return: mel_outputs, gate_outputs, alignments
        """
        # start mel frame with zeros (1, B, num_mels)
        decoder_input = torch.zeros((1, memory.size(0), 80)).to(self.device)
        # (B, num_mels, T) -> (T, B, num_mels)
        decoder_inputs = decoder_inputs.permute(2, 0, 1)
        # (T + 1, B, num_mels)
        decoder_inputs = torch.cat((decoder_input, decoder_inputs), dim=0)
        # (T + 1, B, 256) per the prenet output size noted in decode()
        decoder_inputs = self.prenet(decoder_inputs)
        self.initialize_decoder_states(
            memory, mask=~get_mask_from_lengths(memory_lengths, self.device))

        mel_outputs, gate_outputs, alignments = [], [], []

        # we don't need last frame for prediction
        for step in range(decoder_inputs.size(0) - 1):
            mel_output, gate_output, attention_weights = self.decode(
                decoder_inputs[step])
            mel_outputs += [mel_output.squeeze(1)]
            # Drop only the trailing singleton: a bare squeeze() would also
            # remove the batch axis when B == 1 and break the transpose below.
            gate_outputs += [gate_output.squeeze(1)]
            alignments += [attention_weights]
        alignments = torch.stack(alignments).transpose(0, 1)
        gate_outputs = torch.stack(gate_outputs).transpose(0, 1)
        mel_outputs = torch.stack(mel_outputs).transpose(0, 1)
        mel_outputs = mel_outputs.view(mel_outputs.size(0), -1, 80)
        # (B, T_out, num_mels) -> (B, num_mels, T_out)
        mel_outputs = mel_outputs.transpose(1, 2)
        return mel_outputs, gate_outputs, alignments

    def inference(self, memory):
        """
        Free-running decoding that feeds each predicted frame back in.

        Only a batch of one is supported because the stop test calls
        ``gate_output.item()``. Decoding stops when the gate exceeds 0.6 or
        after 1000 frames.

        :param memory: encoder outputs (B, T, 512)
        :return: mel_outputs, gate_outputs, alignments
        """

        decoder_input = torch.zeros(
            (1, memory.size(0), 80)).to(self.device).squeeze(0)
        # NOTE(review): forward() passes a boolean mask built from
        # memory_lengths, while this mask is all ones with the shape of
        # `memory` -- confirm that Attention treats it as "nothing masked".
        self.initialize_decoder_states(memory, mask=torch.ones_like(memory))

        mel_outputs, gate_outputs, alignments = [], [], []
        # mean length of our mels is about 800-900
        for i in range(1000):
            decoder_input = self.prenet(decoder_input)
            mel_output, gate_output, attention_weights = self.decode(
                decoder_input)
            mel_outputs += [mel_output.squeeze(1)]
            gate_outputs += [gate_output.squeeze(0)]
            alignments += [attention_weights]
            # empirically chosen stop threshold for the sigmoid gate
            if gate_output.item() > 0.6:
                break
            decoder_input = mel_output
        alignments = torch.stack(alignments).transpose(0, 1)
        gate_outputs = torch.stack(gate_outputs).transpose(0, 1)
        mel_outputs = torch.stack(mel_outputs).transpose(0, 1)
        mel_outputs = mel_outputs.view(mel_outputs.size(0), -1, 80)
        # (B, T_out, num_mels) -> (B, num_mels, T_out)
        mel_outputs = mel_outputs.transpose(1, 2)
        return mel_outputs, gate_outputs, alignments