Example #1
0
    def decode(self, y_tm1, mask=None):
        """
        Run one decoder timestep: prenet -> adapter -> attention decoder ->
        first regression layer (+ adapter) -> end-prediction layer.

        Args:
            y_tm1 : 2-D input (batchsize x dec_in_size), one timestep only
            mask : passed through unchanged to self.dec_att_lyr

        Return:
            res_first : output of self.dec_first_reg_lyr after its adapter
            res_dec : result of self.dec_att_lyr (read via key 'dec_output')
            res_bern_end : output of self.dec_bernoulli_end_lyr
        """
        assert y_tm1.dim() == 2, "batchsize x dec_in_size ( 1 timestep only)"

        hidden = y_tm1
        for idx, prenet in enumerate(self.dec_prenet_lyr):
            hidden = prenet(hidden)
            hidden = generator_act_fn(self.dec_prenet_fn)(hidden)
            hidden = F.dropout(hidden,
                               self.dec_prenet_do[idx],
                               training=self.training)

        # adapter between prenet and decoder RNN; it reuses the prenet
        # activation and the last prenet dropout rate
        hidden = self.dec_adapter_lyr.prenet_to_rnn(hidden)
        hidden = generator_act_fn(self.dec_prenet_fn)(hidden)
        hidden = F.dropout(hidden,
                           self.dec_prenet_do[-1],
                           training=self.training)

        # attention decoder step #
        res_dec = self.dec_att_lyr(hidden, mask)
        dec_output = res_dec['dec_output']

        # NOTE: no adapter is applied between the decoder output and the
        # first regression layer; only the regression output is adapted
        res_first = self.dec_first_reg_lyr(dec_output)
        res_first = self.dec_adapter_lyr.first_reg(res_first)

        # end prediction sees the regression output and the decoder output #
        bern_input = torch.cat([res_first, dec_output], 1)
        res_bern_end = self.dec_bernoulli_end_lyr(bern_input)
        return res_first, res_dec, res_bern_end
Example #2
0
    def decode(self, y_tm1, mask=None):
        """
        Run one decoder timestep, adding projected speaker information to the
        last prenet layer and to the first core generator layer.

        Args:
            y_tm1 : 2-D input (batchsize x dec_in_size), one timestep only
            mask : passed through unchanged to self.dec_att_lyr

        Return:
            res_first : core (Mel) prediction
            res_dec : decoder RNN Attention result
            res_bern_end : final frame prediction
        """
        assert y_tm1.dim() == 2, "batchsize x dec_in_size ( 1 timestep only)"
        assert self.speaker_vector is not None, "set speaker vector into with method set_aux_info"
        assert self.speaker_vector.shape[0] == y_tm1.shape[
            0] == self.ctx.shape[0], "batch size is different"

        # OPTIONAL #
        hidden = self.mask_dec_feat(y_tm1)

        last_prenet = len(self.dec_prenet_lyr) - 1
        for idx, prenet in enumerate(self.dec_prenet_lyr):
            hidden = prenet(hidden)
            if idx == last_prenet:
                # speaker projection is added before the activation
                spk = self.speaker_module_lyr.dec_proj_prenet_lyr(
                    self.speaker_vector)
                spk = generator_act_fn(self.speaker_integrate_fn)(spk)
                hidden += spk
            hidden = generator_act_fn(self.dec_prenet_fn)(hidden)
            hidden = F.dropout(hidden,
                               self.dec_prenet_do[idx],
                               training=self.training)

        # compute decoder rnn #
        res_dec = self.dec_att_lyr(hidden, mask)

        # generate mel-spec prediction #
        res_first = res_dec['dec_output']
        last_core = len(self.dec_core_gen_lyr) - 1
        for idx, core_lyr in enumerate(self.dec_core_gen_lyr):
            res_first = core_lyr(res_first)

            if idx == 0:
                # integrate speaker info #
                spk = self.speaker_module_lyr.dec_proj_core_gen_lyr(
                    self.speaker_vector)
                spk = generator_act_fn(self.speaker_integrate_fn)(spk)
                res_first = res_first + spk

            if idx == last_core:
                # the last layer gets neither activation nor dropout
                continue
            res_first = generator_act_fn(self.dec_core_gen_fn)(res_first)
            res_first = F.dropout(res_first,
                                  self.dec_core_gen_do[idx],
                                  training=self.training)

        # predict frame ending #
        # stop gradient produce better result
        bern_input = torch.cat([res_first, res_dec['dec_output']], 1).detach()
        res_bern_end = self.dec_bernoulli_end_lyr(bern_input)
        return res_first, res_dec, res_bern_end
Example #3
0
    def encode(self, input, src_len=None):
        """
        Encode a batch of source sequences and register the result as the
        attention context of the decoder.

        Args:
            input : (batch x max_src_len), passed to self.enc_emb_lyr
            src_len : per-sequence lengths; when None every sequence is
                treated as having the full length max_src_len

        Side effects:
            sets self.ctx, self.ctx_mask and self.src_len, and passes the
            context and its mask to self.dec_att_lyr.set_ctx.
            Nothing is returned.
        """
        batch, max_src_len = input.size()
        if src_len is None:
            src_len = [max_src_len] * batch

        res = self.enc_emb_lyr(input)  # batch x max_src_len x emb_dim #
        res = F.dropout(res, self.enc_emb_do, training=self.training)

        # prenet layers run on a flattened (batch * max_src_len) x dim view
        res = res.view(batch * max_src_len, -1)
        for ii, prenet in enumerate(self.enc_prenet_lyr):
            res = prenet(res)
            res = generator_act_fn(self.enc_prenet_fn)(res)
            res = F.dropout(res,
                            p=self.enc_prenet_do[ii],
                            training=self.training)
        res = res.view(batch, max_src_len, -1)

        ctx = self.enc_core_lyr(res, src_len)

        # src_len is never None at this point (it is defaulted above), so a
        # mask is always built; its length follows the core layer's output
        ctx_mask = Variable(
            generate_seq_mask(src_len, self, max_len=ctx.size(1)))

        self.ctx = ctx
        self.ctx_mask = ctx_mask
        self.src_len = src_len

        self.dec_att_lyr.set_ctx(ctx, ctx_mask)
Example #4
0
    def encode_raw(self, input, src_len=None):
        """
        Apply the CNN and NiN stacks to every frame, then average-pool each
        frame's feature map down to a single value per channel.

        Args:
            input : (batch x max_src_len x in_size)
            src_len : kept for interface compatibility; not used here

        Return:
            tensor of shape (batch x max_src_len x ch)
        """
        batch, max_src_len, in_size = input.size()

        # every frame becomes its own single-channel (in_size x 1) map
        res = input.view(batch * max_src_len, 1, in_size, 1)
        enc_cnn_act = generator_act_fn(self.enc_cnn_act)

        # apply conv #
        for ii, conv in enumerate(self.enc_cnn):
            res = F.dropout(enc_cnn_act(conv(res)),
                            self.enc_cnn_do[ii], self.training)
            if self.use_bn:
                res = self.enc_cnn_bn[ii](res)

        # apply NiN #
        for nin in self.enc_nin:
            res = enc_cnn_act(nin(res))

        final_h, final_w = res.size()[2:]
        res = F.avg_pool2d(
            res, (final_h, final_w))  # (batch * seq_len) x ch x 1 x 1 #
        # the view folds the pooled 1 x 1 spatial dims into the feature axis,
        # so no explicit squeeze is needed
        res = res.view(batch, max_src_len, -1)
        return res
Example #5
0
    def forward(self, x, src_len=None):
        """
        Feed-forward stack -> RNN -> pre-softmax projection.

        x : (batch x seq x ndim)
        src_len : (batch); defaults to the full sequence length per item

        Return:
            projection output transposed to (seq x batch x out_dim), and
            src_len wrapped as an IntTensor Variable
        """
        n_batch, n_step, n_feat = x.size()
        lengths = [n_step] * n_batch if src_len is None else src_len

        ### FNN ###
        # flatten batch and time so the FNN layers see 2-D input #
        hidden = x.contiguous().view(n_step * n_batch, n_feat)
        for idx, _ in enumerate(self.fnn_sizes):
            act_fn = generator_act_fn(self.fnn_act)
            hidden = act_fn(self.fnn_lyr[idx](hidden))
            hidden = F.dropout(hidden, self.fnn_do, training=self.training)

        ### RNN ###
        # restore (batch x seq x dim) for the RNN #
        hidden = hidden.view(n_batch, n_step, -1)
        if self.use_pack:
            packed = pack(hidden, lengths, batch_first=True)
            hidden, _ = unpack(self.rnn_lyr(packed)[0], batch_first=True)
        else:
            hidden = self.rnn_lyr(hidden)[0]

        ### PRE SOFTMAX ###
        n_batch, n_step_out, n_out = hidden.size()
        hidden = hidden.contiguous().view(n_step_out * n_batch, n_out)
        hidden = self.pre_softmax(hidden)
        hidden = hidden.view(n_batch, n_step_out, -1).transpose(1, 0)
        return hidden, Variable(torch.IntTensor(lengths))
Example #6
0
    def forward(self, input, seq_len=None):
        """
        Run the conv bank over the transposed input, then the linear
        prediction layer. When seq_len is given, a sequence mask is applied
        to the input, after every conv layer and to the final output.

        Args:
            input : 3-D tensor; dims 1 and 2 are swapped for the conv layers
                and swapped back before the linear layer
            seq_len : lengths used to build the mask, or None for no masking

        Return:
            output of self.lin_pred_lyr (masked when seq_len is given)
        """
        use_mask = seq_len is not None
        if use_mask:
            mask_input = Variable(
                generate_seq_mask(
                    seq_len=seq_len,
                    device=self).unsqueeze(-1))  # batch x seq_len x 1 #
            mask_input_conv = mask_input.transpose(1, 2)  # batch x 1 x seq_len
            input = input * mask_input

        res = input.transpose(1, 2)
        for idx, conv in enumerate(self.conv_bank_lyrs):
            res = conv(res)
            res = generator_act_fn(self.conv_fn_act)(res)
            drop_p = self.conv_do[idx]
            if drop_p > 0.0:
                res = F.dropout(res, p=drop_p, training=self.training)
            if use_mask:
                res = res * mask_input_conv

        # back to batch x seq_len x ndim for the linear layer #
        res = self.lin_pred_lyr(res.transpose(1, 2))
        if use_mask:
            res = res * mask_input
        return res
Example #7
0
    def decode(self, y_tm1, mask=None):
        """
        Run one decoder timestep: prenet -> attention decoder -> projection
        stack, plus a frame-stop prediction.

        Args:
            y_tm1 : 2-D input (batchsize x dec_in_size), one timestep only
            mask : passed through unchanged to self.dec_att_lyr

        Return:
            res : output of the projection stack
            res_dec_att : result of self.dec_att_lyr
            res_bern_end : output of self.dec_bern_end_lyr
        """
        assert y_tm1.dim() == 2, "batchsize x dec_in_size ( 1 timestep only)"

        # OPTIONAL #
        res = self.mask_dec_feat(y_tm1)

        # apply dec prenet lyr #
        for idx, prenet in enumerate(self.dec_prenet_lyr):
            res = prenet(res)
            res = generator_act_fn(self.dec_prenet_fn)(res)
            res = F.dropout(res,
                            p=self.dec_prenet_do[idx],
                            training=self.training)

        # apply dec att lyr #
        res_dec_att = self.dec_att_lyr(res, mask)
        res = res_dec_att['dec_output']

        # apply lin proj lyr #
        last_proj = len(self.dec_proj_lyr) - 1
        for idx, proj in enumerate(self.dec_proj_lyr):
            res = proj(res)
            if idx == last_proj:
                # final projection stays linear: no activation, no dropout
                break
            res = generator_act_fn(self.dec_proj_fn)(res)
            res = F.dropout(res,
                            p=self.dec_proj_do[idx],
                            training=self.training)

        # predict frame stopping #
        # input = spec + dec_att_output + att_context
        bern_parts = [
            res,
            res_dec_att['dec_output'],
            res_dec_att['att_output']['expected_ctx'],
        ]
        # stop gradient
        res_bern_end = self.dec_bern_end_lyr(
            torch.cat(bern_parts, dim=1).detach())

        return res, res_dec_att, res_bern_end
Example #8
0
    def decode(self, y_tm1, mask=None):
        """
        Run one decoder timestep: prenet -> attention decoder -> core
        generator stack, plus the frame-end prediction.

        Args:
            y_tm1 : 2-D input (batchsize x dec_in_size), one timestep only
            mask : passed through unchanged to self.dec_att_lyr

        Return:
            res_first : core (Mel) prediction
            res_dec : decoder RNN Attention result
            res_bern_end : final frame prediction
        """
        assert y_tm1.dim() == 2, "batchsize x dec_in_size ( 1 timestep only)"

        # OPTIONAL #
        hidden = self.mask_dec_feat(y_tm1)

        for idx, prenet in enumerate(self.dec_prenet_lyr):
            hidden = prenet(hidden)
            hidden = generator_act_fn(self.dec_prenet_fn)(hidden)
            hidden = F.dropout(hidden,
                               self.dec_prenet_do[idx],
                               training=self.training)

        # compute decoder rnn #
        res_dec = self.dec_att_lyr(hidden, mask)

        # generate mel-spec prediction #
        res_first = res_dec['dec_output']
        last_core = len(self.dec_core_gen_lyr) - 1
        for idx, core_lyr in enumerate(self.dec_core_gen_lyr):
            res_first = core_lyr(res_first)
            if idx == last_core:
                # the last layer gets neither activation nor dropout
                continue
            res_first = generator_act_fn(self.dec_core_gen_fn)(res_first)
            res_first = F.dropout(res_first,
                                  self.dec_core_gen_do[idx],
                                  training=self.training)

        # stop gradient produce better result
        bern_input = torch.cat([res_first, res_dec['dec_output']], 1).detach()
        res_bern_end = self.dec_bernoulli_end_lyr(bern_input)
        return res_first, res_dec, res_bern_end
Example #9
0
    def decode(self, y_tm1, mask=None):
        """
        Run one decoder timestep, adding a projected speaker embedding to the
        last prenet layer output and to the decoder RNN output.

        Args:
            y_tm1 : 2-D input (batchsize x dec_in_size), one timestep only
            mask : passed through unchanged to self.dec_att_lyr

        Return:
            res_first : output of self.dec_first_reg_lyr
            res_dec : result of self.dec_att_lyr (read via key 'dec_output')
            res_bern_end : output of self.dec_bernoulli_end_lyr

        Reads self.input_spk_emb, which must have been set beforehand.
        """
        assert y_tm1.dim() == 2, "batchsize x dec_in_size ( 1 timestep only)"
        res = y_tm1

        # OPTIONAL #
        res = self.mask_dec_feat(res)

        last_prenet = len(self.dec_prenet_lyr) - 1
        for ii, prenet in enumerate(self.dec_prenet_lyr):
            res = prenet(res)
            if ii == last_prenet:  # last layer #
                # add (not concatenate) the projected speaker embedding,
                # before the prenet activation
                res_spk = self.spk_module_lyr.dec_lin_prenet_lyr(
                    self.input_spk_emb)
                res_spk = generator_act_fn(self.speaker_act_fn)(res_spk)
                res += res_spk
            res = generator_act_fn(self.dec_prenet_fn)(res)
            res = F.dropout(res,
                            self.dec_prenet_do[ii],
                            training=self.training)

        # compute decoder rnn #
        res_dec = self.dec_att_lyr(res, mask)
        res = res_dec['dec_output']

        # add the projected speaker embedding to the decoder output #
        res_spk = self.spk_module_lyr.dec_lin_pre_reg_first_lyr(
            self.input_spk_emb)
        res_spk = generator_act_fn(self.speaker_act_fn)(res_spk)
        res = res + res_spk

        res_first = self.dec_first_reg_lyr(res)

        # end prediction uses the raw decoder output, without speaker info
        res_bern_end = self.dec_bernoulli_end_lyr(
            torch.cat([res_first, res_dec['dec_output']], 1))
        return res_first, res_dec, res_bern_end
Example #10
0
    def encode(self, input, input_aux, src_len=None):
        """
        Encode a batch of source sequences, store the speaker embedding for
        later use, and register the result as the decoder attention context.

        Args:
            input : feat matrix (batch x max_src_len), passed to
                self.enc_emb_lyr
            input_aux : map contains additional info speaker embedding ID;
                the entry under key 'spk' is given to self.get_speaker_emb
            src_len : per-sequence lengths; when None every sequence is
                treated as having the full length max_src_len

        Side effects:
            sets self.input_spk_emb, self.ctx, self.ctx_mask and
            self.src_len, and passes the context and its mask to
            self.dec_att_lyr.set_ctx. Nothing is returned.
        """
        batch, max_src_len = input.size()
        self.input_spk_emb = self.get_speaker_emb(input_aux['spk'])
        assert self.input_spk_emb.size(0) == batch

        if src_len is None:
            src_len = [max_src_len] * batch

        res = self.enc_emb_lyr(input)  # batch x max_src_len x emb_dim #
        res = F.dropout(res, self.enc_emb_do, training=self.training)

        # prenet layers run on a flattened (batch * max_src_len) x dim view
        res = res.view(batch * max_src_len, -1)
        for ii, prenet in enumerate(self.enc_prenet_lyr):
            res = prenet(res)
            res = generator_act_fn(self.enc_prenet_fn)(res)
            res = F.dropout(res,
                            p=self.enc_prenet_do[ii],
                            training=self.training)
        res = res.view(batch, max_src_len, -1)

        # NOTE: the speaker embedding is not mixed into the encoder features
        # here; it is only stored on self.input_spk_emb
        ctx = self.enc_core_lyr(res, src_len)

        # src_len is never None at this point (it is defaulted above), so a
        # mask is always built; its length follows the core layer's output
        ctx_mask = Variable(
            generate_seq_mask(src_len, self, max_len=ctx.size(1)))

        self.ctx = ctx
        self.ctx_mask = ctx_mask
        self.src_len = src_len

        self.dec_att_lyr.set_ctx(ctx, ctx_mask)
Example #11
0
    def decode(self, y_tm1, mask=None):
        """
        Run one decoder timestep: prenet -> attention decoder -> first
        regression layer.

        Args:
            y_tm1 : 2-D input (batchsize x dec_in_size), one timestep only
            mask : passed through unchanged to self.dec_att_lyr

        Return:
            res_first : output of self.dec_first_reg_lyr
            res_dec : result of self.dec_att_lyr (read via key 'dec_output')
        """
        assert y_tm1.dim() == 2, "batchsize x dec_in_size ( 1 timestep only)"

        # OPTIONAL #
        hidden = self.mask_dec_feat(y_tm1)

        for idx, prenet in enumerate(self.dec_prenet_lyr):
            hidden = prenet(hidden)
            hidden = generator_act_fn(self.dec_prenet_fn)(hidden)
            hidden = F.dropout(hidden,
                               self.dec_prenet_do[idx],
                               training=self.training)

        # compute decoder rnn #
        res_dec = self.dec_att_lyr(hidden, mask)
        res_first = self.dec_first_reg_lyr(res_dec['dec_output'])
        return res_first, res_dec
Example #12
0
    def forward(self, input, input_len=None):
        """
        Turn a batch of feature sequences into one unit-norm embedding each.

        The input is run through the conv / residual stack, pooled over the
        sequence axis (avg or max, per self.pool_fn), averaged over the
        feature axis, projected by self.lin_emb_lyr and L2-normalised.

        Args:
            input : (batch x max_input_len x in_size)
            input_len : per-sequence lengths used to mask the input before
                the conv stack; None disables masking

        Return:
            (batch x emb_dim) tensor with rows scaled to unit L2 norm; a row
            whose norm is zero is returned as zeros instead of NaN

        Raises:
            ValueError : if self.pool_fn is neither 'avg' nor 'max'
        """
        batch, max_input_len, in_size = input.size()

        # apply masking #
        if input_len is not None:
            mask_input = Variable(
                generate_seq_mask(input_len,
                                  device=self,
                                  max_len=max_input_len).unsqueeze(-1))
            input = input * mask_input

        # convert to batch, channel, seq_len, n_dim
        res = input.unsqueeze(1)

        # apply conv
        for ii in range(self.num_layers):
            res = self.conv_lyr[ii](res)
            res = generator_act_fn(self.conv_fn_act)(res)
            res = self.resblock_lyr[ii](res)

        # res = [batch, out_channel, seq_len, n_dim] #
        # pool across seq_len #
        if self.pool_fn == 'avg':
            pool_seq = F.avg_pool2d
        elif self.pool_fn == 'max':
            pool_seq = F.max_pool2d
        else:
            raise ValueError("pool_fn {} is not implemented".format(
                self.pool_fn))
        res = pool_seq(res, kernel_size=[res.size(2), 1], stride=1)

        # res = [batch, out_channel, 1, n_dim] #
        # average across the feature axis #
        res = F.avg_pool2d(res, kernel_size=[1, res.size(-1)], stride=1)
        # res = [batch, out_channel, 1, 1] #
        res = res.squeeze(-1).squeeze(-1)  # res = [batch, out_channel]

        # affine transform #
        res = self.lin_emb_lyr(res)

        # normalize to unit-norm; F.normalize clamps the norm from below by
        # eps, so an all-zero row yields zeros rather than 0 / 0 = NaN
        res = F.normalize(res, p=2, dim=1)
        return res