Example #1
0
 def __init__(self, use_postnet=True, n_spkers=1, n_emotes=1):
     """Build the FastSpeech2 network with optional speaker/emotion embeddings.

     Args:
         use_postnet: when truthy, a ``PostNet`` is created as ``self.postnet``.
         n_spkers: number of entries in the speaker embedding table; only
             read when ``hp.use_spk_embed`` is truthy.
         n_emotes: number of entries in the emotion embedding table; only
             read when ``hp.use_emo_embed`` is truthy.  The constructor
             previously referenced this name without defining it, so it is
             now an explicit keyword argument with a default.
     """
     super(FastSpeech2, self).__init__()

     ### Speaker Embedding Table ###
     self.use_spk_embed = hp.use_spk_embed
     if self.use_spk_embed:
         self.n_spkers = n_spkers
         self.spk_embed_dim = hp.spk_embed_dim
         self.spk_embed_weight_std = hp.spk_embed_weight_std
         self.embed_speakers = Embedding(n_spkers, self.spk_embed_dim, padding_idx=None, std=self.spk_embed_weight_std)

     ### Emotion Embedding Table ###
     self.use_emo_embed = hp.use_emo_embed
     if self.use_emo_embed:
         self.n_emotes = n_emotes
         self.emo_embed_dim = hp.emo_embed_dim
         self.emo_embed_weight_std = hp.emo_embed_weight_std
         self.embed_emotions = Embedding(n_emotes, self.emo_embed_dim, padding_idx=None, std=self.emo_embed_weight_std)

     ### Encoder, Speaker Integrator, Variance Adaptor, Decoder, Postnet ###
     self.encoder = Encoder()
     # The speaker integrator only exists when speaker embeddings are enabled.
     if self.use_spk_embed:
         self.speaker_integrator = SpeakerIntegrator()
     self.variance_adaptor = VarianceAdaptor()
     self.decoder = Decoder()
     # Projects decoder features (hp.decoder_hidden) to hp.n_mel_channels.
     self.mel_linear = nn.Linear(hp.decoder_hidden, hp.n_mel_channels)
     self.use_postnet = use_postnet
     if self.use_postnet:
         self.postnet = PostNet()
    def __init__(self, opt):
        """Restore a trained ``Decoder`` from a checkpoint and put it in eval mode.

        Args:
            opt: options object; this constructor reads ``opt.cuda`` (whether
                to run on GPU) and ``opt.model`` (checkpoint path passed to
                ``torch.load``).

        The checkpoint is expected to contain a ``'settings'`` entry (the
        hyper-parameters used to rebuild the decoder) and a ``'model'`` entry
        (the state dict loaded into it).
        """
        self.opt = opt
        self.tt = torch.cuda if opt.cuda else torch

        # When running without CUDA, remap every storage to CPU so that a
        # checkpoint saved from a GPU run can still be loaded on a CPU-only
        # host.  With CUDA enabled the default placement is kept.
        # NOTE(review): torch.load unpickles the file -- only load checkpoints
        # from trusted sources.
        map_location = None if opt.cuda else (lambda storage, loc: storage)
        checkpoint = torch.load(opt.model, map_location=map_location)
        model_opt = checkpoint['settings']
        self.model_opt = model_opt

        model = Decoder(model_opt.user_size,
                        d_k=model_opt.d_k,
                        d_v=model_opt.d_v,
                        d_model=model_opt.d_model,
                        d_word_vec=model_opt.d_word_vec,
                        kernel_size=model_opt.window_size,
                        finit=model_opt.finit,
                        d_inner_hid=model_opt.d_inner_hid,
                        n_head=model_opt.n_head,
                        dropout=model_opt.dropout)

        # NOTE(review): Softmax is created without an explicit ``dim``; the
        # axis is then chosen implicitly from the input rank -- confirm the
        # intended axis against the caller before pinning it.
        prob_projection = nn.Softmax()

        model.load_state_dict(checkpoint['model'])
        print('[Info] Trained model state loaded.')

        if opt.cuda:
            model.cuda()
            prob_projection.cuda()
        else:
            model.cpu()
            prob_projection.cpu()

        # Attach the softmax to the model so both travel together.
        model.prob_projection = prob_projection

        self.model = model
        self.model.eval()
Example #3
0
    def __init__(self):
        """Assemble the FastSpeech2 sub-modules.

        Created in order: encoder, variance adaptor, decoder, a linear
        projection from ``hp.decoder_hidden`` to ``hp.n_mel_channels``, and
        the postnet.
        """
        super(FastSpeech2, self).__init__()

        # Main stages, each built with its default arguments.
        self.encoder = Encoder()
        self.variance_adaptor = VarianceAdaptor()
        self.decoder = Decoder()

        # Output projection sizes come from the hyper-parameter module.
        hidden_size, mel_channels = hp.decoder_hidden, hp.n_mel_channels
        self.mel_linear = Linear(hidden_size, mel_channels)
        self.postnet = PostNet()
Example #4
0
    def __init__(self):
        """Create the FastSpeech sub-modules.

        Created in order: encoder, length regulator, decoder, a linear
        projection from ``hp.decoder_output_size`` to ``hp.num_mels``, and
        the postnet.
        """
        super(FastSpeech, self).__init__()

        # Main stages, each built with its default arguments.
        self.encoder = Encoder()
        self.length_regulator = LengthRegulator()
        self.decoder = Decoder()

        # Output projection sizes come from the hyper-parameter module.
        in_features = hp.decoder_output_size
        out_features = hp.num_mels
        self.mel_linear = Linear(in_features, out_features)
        self.postnet = PostNet()
Example #5
0
    def __init__(self):
        """Create the FastSpeech sub-modules with a CBHG postnet.

        Besides encoder, length regulator and decoder, this builds a linear
        projection to ``hp.num_mels``, a ``CBHG`` postnet, and a final linear
        layer that maps ``2 * hp.num_mels`` features back to ``hp.num_mels``.
        """
        super(FastSpeech, self).__init__()

        self.encoder = Encoder()
        self.length_regulator = LengthRegulator()
        self.decoder = Decoder()

        decoder_dim = hp.decoder_dim
        n_mels = hp.num_mels

        self.mel_linear = Linear(decoder_dim, n_mels)
        self.postnet = CBHG(n_mels, K=8, projections=[256, n_mels])
        # Input width is twice the mel dimension.
        self.last_linear = Linear(n_mels * 2, n_mels)
Example #6
0
    def __init__(self, use_postnet=True):
        """Build the STYLER sub-modules.

        Args:
            use_postnet: when truthy, a ``PostNet`` is created as
                ``self.postnet``; the flag itself is stored as
                ``self.use_postnet``.
        """
        super(STYLER, self).__init__()

        self.style_modeling = StyleModeling()

        self.decoder = Decoder()
        # Projects decoder features (hp.decoder_hidden) to hp.n_mel_channels.
        self.mel_linear = nn.Linear(hp.decoder_hidden, hp.n_mel_channels)

        self.use_postnet = use_postnet
        if self.use_postnet:
            self.postnet = PostNet()
    def __init__(self, use_postnet=True):
        """Build the FastSpeech2 sub-modules with a ``TacotronDuration`` adaptor.

        Args:
            use_postnet: stored as ``self.use_postnet``; when truthy a
                ``PostNet`` is additionally created as ``self.postnet``.
        """
        super(FastSpeech2, self).__init__()

        self.encoder = Encoder()
        # The variance-adaptor slot is filled by TacotronDuration here.
        self.variance_adaptor = TacotronDuration()

        self.decoder = Decoder()
        hidden_size, mel_channels = hp.decoder_hidden, hp.n_mel_channels
        self.mel_linear = nn.Linear(hidden_size, mel_channels)

        self.use_postnet = use_postnet
        if use_postnet:
            self.postnet = PostNet()
Example #8
0
    def __init__(self, py_vocab_size, hz_vocab_size=None, use_postnet=True):
        """Build the FastSpeech2 sub-modules for a two-vocabulary encoder.

        Args:
            py_vocab_size: forwarded positionally to ``Encoder``.
            hz_vocab_size: forwarded to ``Encoder`` as ``hz_vocab_size``.
            use_postnet: stored as ``self.use_postnet``; when truthy a
                ``UNet(scale=8)`` is created as ``self.postnet``.
        """
        super(FastSpeech2, self).__init__()

        self.encoder = Encoder(py_vocab_size, hz_vocab_size=hz_vocab_size)
        self.variance_adaptor = VarianceAdaptor()

        self.decoder = Decoder()
        hidden_size, mel_channels = hp.decoder_hidden, hp.n_mel_channels
        self.mel_linear = nn.Linear(hidden_size, mel_channels)

        self.use_postnet = use_postnet
        if use_postnet:
            # The postnet in this variant is a UNet rather than a PostNet.
            self.postnet = UNet(scale=8)
Example #9
0
    def __init__(self, n_layers=hp.variance_predictor_layer):
        """Build a variance predictor: a ``Decoder`` followed by a scalar head.

        Args:
            n_layers: number of decoder layers; the default is taken from
                ``hp.variance_predictor_layer`` when the method is defined.
        """
        super(VariancePredictor, self).__init__()

        hidden = hp.variance_predictor_hidden
        heads = hp.variance_predictor_head
        per_head = hidden // heads  # shared by d_k and d_v

        decoder_kwargs = dict(
            len_max_seq=hp.max_seq_len,
            d_word_vec=hidden,
            n_layers=n_layers,
            n_head=heads,
            d_k=per_head,
            d_v=per_head,
            d_model=hidden,
            d_inner=hp.fft_conv1d_filter_size,
            dropout=hp.variance_predictor_dropout,
        )
        self.decoder = Decoder(**decoder_kwargs)
        # Maps each hidden vector to a single output value.
        self.linear_layer = nn.Linear(hidden, 1)
Example #10
0
    def __init__(self):
        """Build a length predictor: a ``Decoder``, a scalar head and a tanh.

        All sizes are read from the ``hp.length_predictor_*`` hyper-parameters,
        plus ``hp.max_seq_len`` and ``hp.fft_conv1d_filter_size``.
        """
        super(LengthPredictor, self).__init__()

        hidden = hp.length_predictor_hidden
        heads = hp.length_predictor_head
        per_head = hidden // heads  # shared by d_k and d_v

        decoder_kwargs = dict(
            len_max_seq=hp.max_seq_len,
            d_word_vec=hidden,
            n_layers=hp.length_predictor_layer,
            n_head=heads,
            d_k=per_head,
            d_v=per_head,
            d_model=hidden,
            d_inner=hp.fft_conv1d_filter_size,
            dropout=hp.length_predictor_dropout,
        )
        self.decoder = Decoder(**decoder_kwargs)
        # Maps each hidden vector to a single output value.
        self.linear_layer = nn.Linear(hidden, 1)
        self.tanh = nn.Tanh()
Example #11
0
    def __init__(self, use_postnet=True):
        """Build the FastSpeech2 sub-modules with vocoder-dependent output heads.

        When ``hp.vocoder`` equals ``'WORLD'`` two linear heads are created
        (``ap_linear`` and ``sp_linear``); otherwise a single ``mel_linear``
        head is created.

        Args:
            use_postnet: stored as ``self.use_postnet``; when truthy a
                ``PostNet`` is created as ``self.postnet``.
        """
        super(FastSpeech2, self).__init__()

        # A GST module (self.gst = GST()) is currently disabled here.
        self.encoder = Encoder()
        self.variance_adaptor = VarianceAdaptor()

        self.decoder = Decoder()

        hidden_size = hp.decoder_hidden
        if hp.vocoder != 'WORLD':
            self.mel_linear = nn.Linear(hidden_size, hp.n_mel_channels)
        else:
            # A separate f0 decoder (self.f0_decoder = Decoder()) is disabled.
            self.ap_linear = nn.Linear(hidden_size, hp.n_ap_channels)
            self.sp_linear = nn.Linear(hidden_size, hp.n_sp_channels)

        self.use_postnet = use_postnet
        if use_postnet:
            self.postnet = PostNet()
Example #12
0
    def __init__(self,
                 src_vocab_size,
                 tgt_vocab_size,
                 n_layer=6,
                 d_model=512,
                 d_ff=2048,
                 n_head=8,
                 dropout=0.1):
        """Build the Transformer: embeddings, encoder, decoder and generator.

        Args:
            src_vocab_size: vocabulary size passed to the source ``Embeddings``.
            tgt_vocab_size: vocabulary size passed to the target ``Embeddings``
                and to the ``Generator``.
            n_layer: layer count forwarded to ``Encoder`` and ``Decoder``.
            d_model: model width shared by every sub-module.
            d_ff: feed-forward width forwarded to ``Encoder`` and ``Decoder``.
            n_head: head count forwarded to ``Encoder`` and ``Decoder``.
            dropout: dropout value forwarded to the positional encodings and
                to ``Encoder`` / ``Decoder``.
        """
        super(Transformer, self).__init__()

        source_lookup = Embeddings(d_model, src_vocab_size)
        source_positions = PositionalEncoding(d_model, dropout)
        self.src_embed = nn.Sequential(source_lookup, source_positions)

        target_lookup = Embeddings(d_model, tgt_vocab_size)
        target_positions = PositionalEncoding(d_model, dropout)
        self.tgt_embed = nn.Sequential(target_lookup, target_positions)

        # Encoder and decoder take the same positional arguments.
        stack_args = (n_head, d_model, d_ff, dropout, n_layer)
        self.encoder = Encoder(*stack_args)
        self.decoder = Decoder(*stack_args)
        self.generator = Generator(d_model, tgt_vocab_size)

        # Initialize parameters with Glorot / fan_avg; only tensors with more
        # than one dimension are re-initialised.
        matrices = (param for param in self.parameters() if param.dim() > 1)
        for matrix in matrices:
            nn.init.xavier_uniform_(matrix)
Example #13
0
    def __init__(self,
                 bert_model_path,
                 n_tgt_vocab,
                 len_max_seq,
                 d_word_vec=768,
                 d_model=768,
                 d_inner=3072,
                 n_layers=12,
                 n_head=12,
                 d_k=64,
                 d_v=64,
                 dropout=0.1):
        """Build a model made of a pretrained BERT and a ``Decoder``.

        Args:
            bert_model_path: location handed to ``BertModel.from_pretrained``;
                ``bert_config.json`` is looked up inside it.
            n_tgt_vocab: target vocabulary size (decoder and output projection).
            len_max_seq: forwarded to ``Decoder`` as ``len_max_seq``.
            d_word_vec, d_model, d_inner, n_layers, n_head, d_k, d_v, dropout:
                forwarded unchanged to ``Decoder``; ``d_model`` also sizes the
                linear layers created here.
        """
        # Local import keeps this block self-contained.
        import os

        super().__init__()

        self.bert = BertModel.from_pretrained(bert_model_path)
        # os.path.join yields the same path as plain concatenation when
        # bert_model_path already ends with a separator, and also works when
        # it does not.
        self.config = BertConfig(
            os.path.join(bert_model_path, 'bert_config.json'))
        self.decoder = Decoder(n_tgt_vocab=n_tgt_vocab,
                               len_max_seq=len_max_seq,
                               d_word_vec=d_word_vec,
                               d_model=d_model,
                               d_inner=d_inner,
                               n_layers=n_layers,
                               n_head=n_head,
                               d_k=d_k,
                               d_v=d_v,
                               dropout=dropout)

        # Output projection to the target vocabulary.
        self.tgt_word_prj = nn.Linear(d_model, n_tgt_vocab, bias=False)
        # This initialisation is overwritten by the weight tying on the next
        # line; it is kept so the constructor consumes random numbers in the
        # same sequence as before.
        nn.init.xavier_normal_(self.tgt_word_prj.weight)
        # Tie the projection weight to the decoder's target word embedding.
        self.tgt_word_prj.weight = self.decoder.tgt_word_emb.weight
        self.x_logit_scale = (d_model**-0.5)

        # d_model -> 512 -> 1 stack of linear layers.
        self.o_l = nn.Linear(d_model, 512, bias=False)
        self.h_l = nn.Linear(512, 1, bias=True)
        nn.init.xavier_normal_(self.o_l.weight)
        nn.init.xavier_normal_(self.h_l.weight)

        # Two parallel d_model -> 512 projections.
        self.a_l_1 = nn.Linear(d_model, 512, bias=False)
        self.a_l_2 = nn.Linear(d_model, 512, bias=False)
        nn.init.xavier_normal_(self.a_l_1.weight)
        nn.init.xavier_normal_(self.a_l_2.weight)
Example #14
0
    def __init__(self, n_layers):
        """Build a variance predictor: a ``Decoder`` plus a scalar linear head.

        Args:
            n_layers: passed positionally to ``Decoder``.
        """
        super(VariancePredictor, self).__init__()

        self.decoder = Decoder(n_layers)
        # Maps hp.decoder_hidden features to a single output value.
        hidden_size = hp.decoder_hidden
        self.linear_layer = nn.Linear(hidden_size, 1)
Example #15
0
    def __init__(self, n_layers):
        """Build a length predictor: a ``Decoder``, a scalar head and a tanh.

        Args:
            n_layers: passed positionally to ``Decoder``.
        """
        super(LengthPredictor, self).__init__()

        self.decoder = Decoder(n_layers)
        # Maps hp.decoder_hidden features to a single output value.
        hidden_size = hp.decoder_hidden
        self.linear_layer = nn.Linear(hidden_size, 1)
        self.tanh = nn.Tanh()
Example #16
0
def main():
    """Parse command-line options, build the decoder and run training.

    Steps: parse arguments, create the training ``DataLoader``, construct the
    ``Decoder`` from the parsed options, wrap Adam in ``ScheduledOptim``,
    build a weighted cross-entropy criterion, optionally move model and
    criterion to CUDA, then call ``train``.
    """
    parser = argparse.ArgumentParser()

    parser.add_argument('-epoch', type=int, default=20)
    parser.add_argument('-batch_size', type=int, default=32)

    parser.add_argument('-d_model', type=int, default=64)
    parser.add_argument('-d_inner_hid', type=int, default=64)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-window_size', type=int, default=3)
    parser.add_argument('-finit', type=int, default=0)

    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_warmup_steps', type=int, default=1000)

    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')

    parser.add_argument('-save_model', default='Lastfm_test')
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    # Derived options: CUDA is on unless -no_cuda is given, and the word
    # vector size always follows d_model.
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    #========= Preparing DataLoader =========#
    train_data = DataLoader(use_valid=False,
                            load_dict=True,
                            batch_size=opt.batch_size,
                            cuda=opt.cuda)
    opt.user_size = train_data.user_size

    #========= Preparing Model =========#

    # -finit is forwarded so the command-line value actually reaches the
    # model; previously the option was parsed and then ignored.
    decoder = Decoder(opt.user_size,
                      d_k=opt.d_k,
                      d_v=opt.d_v,
                      d_model=opt.d_model,
                      d_word_vec=opt.d_word_vec,
                      d_inner_hid=opt.d_inner_hid,
                      n_head=opt.n_head,
                      kernel_size=opt.window_size,
                      finit=opt.finit,
                      dropout=opt.dropout)

    optimizer = ScheduledOptim(
        optim.Adam(decoder.parameters(), betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    def get_criterion(user_size):
        """Return a cross-entropy loss with zero weight on PAD and EOS."""
        weight = torch.ones(user_size)
        weight[Constants.PAD] = 0
        weight[Constants.EOS] = 0
        # NOTE(review): size_average is the legacy spelling of the reduction
        # setting; it is kept as-is for compatibility with the PyTorch
        # version this script targets -- confirm before migrating.
        return nn.CrossEntropyLoss(weight, size_average=False)

    crit = get_criterion(train_data.user_size)

    if opt.cuda:
        decoder = decoder.cuda()
        crit = crit.cuda()

    train(decoder, train_data, crit, optimizer, opt)
Example #17
0
        else:
            length_regulator_output, decoder_pos = self.length_regulator(
                encoder_output, encoder_mask, alpha=alpha)

            decoder_output = self.decoder(length_regulator_output, decoder_pos)

            mel_output = self.mel_linear(decoder_output)
            mel_output_postnet = self.postnet(mel_output) + mel_output

            return mel_output, mel_output_postnet


if __name__ == "__main__":
    # Ad-hoc smoke test: instantiate the encoder and decoder with their
    # default arguments and build a few small hand-written tensors.
    test_encoder = Encoder()
    test_decoder = Decoder()
    # print(test_encoder)
    # print(test_decoder)

    # Two rows of eight values stacked into one tensor and cast with .long().
    # NOTE(review): the trailing zeros in the first row look like padding --
    # confirm against the encoder's padding index.
    test_src = torch.stack([
        torch.Tensor([1, 2, 4, 3, 2, 5, 0, 0]),
        torch.Tensor([3, 4, 2, 6, 7, 1, 2, 3])
    ]).long()
    # Same layout as test_src; the non-zero entries count up from 1.
    # Presumably 1-based positions with 0 marking padded slots -- verify.
    test_pos = torch.stack([
        torch.Tensor([1, 2, 3, 4, 5, 6, 0, 0]),
        torch.Tensor([1, 2, 3, 4, 5, 6, 7, 8])
    ]).long()
    # Unlike the two tensors above, this one is not cast with .long().
    test_target = torch.stack([
        torch.Tensor([0, 2, 3, 0, 3, 2, 1, 0]),
        torch.Tensor([1, 2, 3, 2, 2, 0, 3, 6])
    ])