def __init__(self, use_postnet=True, n_spkers=1, n_emotes=1):
    super(FastSpeech2, self).__init__()

    ### Speaker Embedding Table ###
    self.use_spk_embed = hp.use_spk_embed
    if self.use_spk_embed:
        self.n_spkers = n_spkers
        self.spk_embed_dim = hp.spk_embed_dim
        self.spk_embed_weight_std = hp.spk_embed_weight_std
        self.embed_speakers = Embedding(n_spkers,
                                        self.spk_embed_dim,
                                        padding_idx=None,
                                        std=self.spk_embed_weight_std)

    ### Emotion Embedding Table ###
    self.use_emo_embed = hp.use_emo_embed
    if self.use_emo_embed:
        self.n_emotes = n_emotes
        self.emo_embed_dim = hp.emo_embed_dim
        self.emo_embed_weight_std = hp.emo_embed_weight_std
        self.embed_emotions = Embedding(n_emotes,
                                        self.emo_embed_dim,
                                        padding_idx=None,
                                        std=self.emo_embed_weight_std)

    ### Encoder, Speaker Integrator, Variance Adaptor, Decoder, Postnet ###
    self.encoder = Encoder()
    if self.use_spk_embed:
        self.speaker_integrator = SpeakerIntegrator()
    self.variance_adaptor = VarianceAdaptor()
    self.decoder = Decoder()
    self.mel_linear = nn.Linear(hp.decoder_hidden, hp.n_mel_channels)
    self.use_postnet = use_postnet
    if self.use_postnet:
        self.postnet = PostNet()
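# --- A minimal sketch of the `Embedding` helper assumed above: a stock
# nn.Embedding whose weights are re-drawn from N(0, std). The
# (padding_idx, std) signature is inferred from the call site, not
# confirmed against the original repository.
import torch.nn as nn

def Embedding(num_embeddings, embedding_dim, padding_idx=None, std=0.01):
    m = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx)
    nn.init.normal_(m.weight, mean=0.0, std=std)
    return m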
def __init__(self, opt):
    self.opt = opt
    self.tt = torch.cuda if opt.cuda else torch

    checkpoint = torch.load(opt.model)
    model_opt = checkpoint['settings']
    self.model_opt = model_opt

    model = Decoder(model_opt.user_size,
                    d_k=model_opt.d_k,
                    d_v=model_opt.d_v,
                    d_model=model_opt.d_model,
                    d_word_vec=model_opt.d_word_vec,
                    kernel_size=model_opt.window_size,
                    finit=model_opt.finit,
                    d_inner_hid=model_opt.d_inner_hid,
                    n_head=model_opt.n_head,
                    dropout=model_opt.dropout)
    # Softmax over the last (vocabulary) dimension; dim made explicit
    # since bare nn.Softmax() is deprecated.
    prob_projection = nn.Softmax(dim=-1)

    model.load_state_dict(checkpoint['model'])
    print('[Info] Trained model state loaded.')

    if opt.cuda:
        model.cuda()
        prob_projection.cuda()
    else:
        model.cpu()
        prob_projection.cpu()

    model.prob_projection = prob_projection
    self.model = model
    self.model.eval()
def __init__(self):
    super(FastSpeech2, self).__init__()
    self.encoder = Encoder()
    self.variance_adaptor = VarianceAdaptor()
    self.decoder = Decoder()
    self.mel_linear = Linear(hp.decoder_hidden, hp.n_mel_channels)
    self.postnet = PostNet()
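# --- Hypothetical smoke test for the minimal FastSpeech2 above; assumes
# hp and the submodules (Encoder, VarianceAdaptor, Decoder, PostNet) are
# importable from the surrounding project.
if __name__ == "__main__":
    model = FastSpeech2()
    n_params = sum(p.numel() for p in model.parameters())
    print(f"FastSpeech2 parameters: {n_params:,}")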
def __init__(self):
    super(FastSpeech, self).__init__()
    self.encoder = Encoder()
    self.length_regulator = LengthRegulator()
    self.decoder = Decoder()
    self.mel_linear = Linear(hp.decoder_output_size, hp.num_mels)
    self.postnet = PostNet()
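# --- A hedged sketch of the core of the LengthRegulator above: repeat
# each encoder frame by its predicted integer duration. Names are
# illustrative; the real module also builds decoder positions and masks.
import torch

def expand_by_duration(x, durations):
    # x: (seq_len, hidden); durations: (seq_len,) int64 tensor
    return torch.repeat_interleave(x, durations, dim=0)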
def __init__(self):
    super(FastSpeech, self).__init__()
    self.encoder = Encoder()
    self.length_regulator = LengthRegulator()
    self.decoder = Decoder()
    self.mel_linear = Linear(hp.decoder_dim, hp.num_mels)
    self.postnet = CBHG(hp.num_mels, K=8, projections=[256, hp.num_mels])
    self.last_linear = Linear(hp.num_mels * 2, hp.num_mels)
def __init__(self, use_postnet=True):
    super(STYLER, self).__init__()
    self.style_modeling = StyleModeling()
    self.decoder = Decoder()
    self.mel_linear = nn.Linear(hp.decoder_hidden, hp.n_mel_channels)
    self.use_postnet = use_postnet
    if self.use_postnet:
        self.postnet = PostNet()
def __init__(self, use_postnet=True):
    super(FastSpeech2, self).__init__()
    self.encoder = Encoder()
    self.variance_adaptor = TacotronDuration()
    self.decoder = Decoder()
    self.mel_linear = nn.Linear(hp.decoder_hidden, hp.n_mel_channels)
    self.use_postnet = use_postnet
    if self.use_postnet:
        self.postnet = PostNet()
def __init__(self, py_vocab_size, hz_vocab_size=None, use_postnet=True):
    super(FastSpeech2, self).__init__()
    self.encoder = Encoder(py_vocab_size, hz_vocab_size=hz_vocab_size)
    self.variance_adaptor = VarianceAdaptor()
    self.decoder = Decoder()
    self.mel_linear = nn.Linear(hp.decoder_hidden, hp.n_mel_channels)
    self.use_postnet = use_postnet
    if self.use_postnet:
        self.postnet = UNet(scale=8)
def __init__(self, n_layers=hp.variance_predictor_layer):
    super(VariancePredictor, self).__init__()
    self.decoder = Decoder(
        len_max_seq=hp.max_seq_len,
        d_word_vec=hp.variance_predictor_hidden,
        n_layers=n_layers,
        n_head=hp.variance_predictor_head,
        d_k=hp.variance_predictor_hidden // hp.variance_predictor_head,
        d_v=hp.variance_predictor_hidden // hp.variance_predictor_head,
        d_model=hp.variance_predictor_hidden,
        d_inner=hp.fft_conv1d_filter_size,
        dropout=hp.variance_predictor_dropout)
    self.linear_layer = nn.Linear(hp.variance_predictor_hidden, 1)
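# --- Hedged sketch of the usual forward pass for this predictor: run the
# decoder stack, project each position to a scalar, and squeeze the last
# axis. The decoder's (output, attentions) return shape is an assumption.
def forward(self, x, pos):
    out = self.decoder(x, pos)
    if isinstance(out, tuple):
        out = out[0]
    out = self.linear_layer(out)   # (batch, len, 1)
    return out.squeeze(-1)         # (batch, len)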
def __init__(self):
    super(LengthPredictor, self).__init__()
    self.decoder = Decoder(
        len_max_seq=hp.max_seq_len,
        d_word_vec=hp.length_predictor_hidden,
        n_layers=hp.length_predictor_layer,
        n_head=hp.length_predictor_head,
        d_k=hp.length_predictor_hidden // hp.length_predictor_head,
        d_v=hp.length_predictor_hidden // hp.length_predictor_head,
        d_model=hp.length_predictor_hidden,
        d_inner=hp.fft_conv1d_filter_size,
        dropout=hp.length_predictor_dropout)
    self.linear_layer = nn.Linear(hp.length_predictor_hidden, 1)
    self.tanh = nn.Tanh()
def __init__(self, use_postnet=True):
    super(FastSpeech2, self).__init__()
    # self.gst = GST()
    self.encoder = Encoder()
    self.variance_adaptor = VarianceAdaptor()
    self.decoder = Decoder()
    if hp.vocoder == 'WORLD':
        # self.f0_decoder = Decoder()
        self.ap_linear = nn.Linear(hp.decoder_hidden, hp.n_ap_channels)
        self.sp_linear = nn.Linear(hp.decoder_hidden, hp.n_sp_channels)
    else:
        self.mel_linear = nn.Linear(hp.decoder_hidden, hp.n_mel_channels)
    self.use_postnet = use_postnet
    if self.use_postnet:
        self.postnet = PostNet()
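# --- Design note: with the WORLD vocoder the model predicts vocoder
# features directly -- aperiodicity (ap) and spectral envelope (sp) --
# instead of a mel spectrogram, hence the two separate linear heads above.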
def __init__(self, src_vocab_size, tgt_vocab_size, n_layer=6, d_model=512,
             d_ff=2048, n_head=8, dropout=0.1):
    super(Transformer, self).__init__()
    self.src_embed = nn.Sequential(Embeddings(d_model, src_vocab_size),
                                   PositionalEncoding(d_model, dropout))
    self.tgt_embed = nn.Sequential(Embeddings(d_model, tgt_vocab_size),
                                   PositionalEncoding(d_model, dropout))
    self.encoder = Encoder(n_head, d_model, d_ff, dropout, n_layer)
    self.decoder = Decoder(n_head, d_model, d_ff, dropout, n_layer)
    self.generator = Generator(d_model, tgt_vocab_size)

    # Initialize parameters with Glorot / fan_avg.
    for p in self.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
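# --- A helper commonly paired with this encoder-decoder (a sketch, not
# necessarily this project's own utility): the causal mask that stops
# position i from attending to positions j > i during decoding.
import torch

def subsequent_mask(size):
    # True where attention is allowed (lower triangle incl. diagonal).
    future = torch.triu(torch.ones(1, size, size, dtype=torch.bool), diagonal=1)
    return ~future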
def __init__(self, bert_model_path, n_tgt_vocab, len_max_seq, d_word_vec=768,
             d_model=768, d_inner=3072, n_layers=12, n_head=12, d_k=64,
             d_v=64, dropout=0.1):
    super().__init__()
    self.bert = BertModel.from_pretrained(bert_model_path)
    self.config = BertConfig(bert_model_path + 'bert_config.json')
    self.decoder = Decoder(
        n_tgt_vocab=n_tgt_vocab, len_max_seq=len_max_seq,
        d_word_vec=d_word_vec, d_model=d_model, d_inner=d_inner,
        n_layers=n_layers, n_head=n_head, d_k=d_k, d_v=d_v,
        dropout=dropout)

    self.tgt_word_prj = nn.Linear(d_model, n_tgt_vocab, bias=False)
    nn.init.xavier_normal_(self.tgt_word_prj.weight)
    # Tie the output projection to the target embedding; this overwrites
    # the Xavier init above, and logits are rescaled by d_model ** -0.5.
    self.tgt_word_prj.weight = self.decoder.tgt_word_emb.weight
    self.x_logit_scale = d_model ** -0.5

    self.o_l = nn.Linear(d_model, 512, bias=False)
    self.h_l = nn.Linear(512, 1, bias=True)
    nn.init.xavier_normal_(self.o_l.weight)
    nn.init.xavier_normal_(self.h_l.weight)

    self.a_l_1 = nn.Linear(d_model, 512, bias=False)
    self.a_l_2 = nn.Linear(d_model, 512, bias=False)
    nn.init.xavier_normal_(self.a_l_1.weight)
    nn.init.xavier_normal_(self.a_l_2.weight)
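# --- Sketch of how the tied projection is typically applied at decode
# time (dec_output assumed to be (batch, len, d_model); name hypothetical):
#
#     seq_logit = self.tgt_word_prj(dec_output) * self.x_logit_scale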
def __init__(self, n_layers):
    super(VariancePredictor, self).__init__()
    self.decoder = Decoder(n_layers)
    self.linear_layer = nn.Linear(hp.decoder_hidden, 1)
def __init__(self, n_layers):
    super(LengthPredictor, self).__init__()
    self.decoder = Decoder(n_layers)
    self.linear_layer = nn.Linear(hp.decoder_hidden, 1)
    self.tanh = nn.Tanh()
def main():
    ''' Main function '''
    parser = argparse.ArgumentParser()
    parser.add_argument('-epoch', type=int, default=20)
    parser.add_argument('-batch_size', type=int, default=32)
    parser.add_argument('-d_model', type=int, default=64)
    parser.add_argument('-d_inner_hid', type=int, default=64)
    parser.add_argument('-d_k', type=int, default=64)
    parser.add_argument('-d_v', type=int, default=64)
    parser.add_argument('-window_size', type=int, default=3)
    parser.add_argument('-finit', type=int, default=0)
    parser.add_argument('-n_head', type=int, default=8)
    parser.add_argument('-n_warmup_steps', type=int, default=1000)
    parser.add_argument('-dropout', type=float, default=0.1)
    parser.add_argument('-embs_share_weight', action='store_true')
    parser.add_argument('-proj_share_weight', action='store_true')
    parser.add_argument('-save_model', default='Lastfm_test')
    parser.add_argument('-no_cuda', action='store_true')

    opt = parser.parse_args()
    opt.cuda = not opt.no_cuda
    opt.d_word_vec = opt.d_model

    #========= Preparing DataLoader =========#
    train_data = DataLoader(use_valid=False,
                            load_dict=True,
                            batch_size=opt.batch_size,
                            cuda=opt.cuda)
    opt.user_size = train_data.user_size

    #========= Preparing Model =========#
    decoder = Decoder(opt.user_size,
                      d_k=opt.d_k,
                      d_v=opt.d_v,
                      d_model=opt.d_model,
                      d_word_vec=opt.d_word_vec,
                      d_inner_hid=opt.d_inner_hid,
                      n_head=opt.n_head,
                      kernel_size=opt.window_size,
                      dropout=opt.dropout)

    optimizer = ScheduledOptim(
        optim.Adam(decoder.parameters(), betas=(0.9, 0.98), eps=1e-09),
        opt.d_model, opt.n_warmup_steps)

    def get_criterion(user_size):
        ''' With PAD token zero weight '''
        weight = torch.ones(user_size)
        weight[Constants.PAD] = 0
        weight[Constants.EOS] = 0
        # reduction='sum' replaces the deprecated size_average=False.
        return nn.CrossEntropyLoss(weight, reduction='sum')

    crit = get_criterion(train_data.user_size)

    if opt.cuda:
        decoder = decoder.cuda()
        crit = crit.cuda()

    train(decoder, train_data, crit, optimizer, opt)
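# --- Example invocation (script name hypothetical):
#   python train.py -batch_size 32 -d_model 64 -n_head 8 -save_model Lastfm_test
if __name__ == '__main__':
    main()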
    else:
        length_regulator_output, decoder_pos = self.length_regulator(
            encoder_output, encoder_mask, alpha=alpha)
        decoder_output = self.decoder(length_regulator_output, decoder_pos)

        mel_output = self.mel_linear(decoder_output)
        mel_output_postnet = self.postnet(mel_output) + mel_output

        return mel_output, mel_output_postnet


if __name__ == "__main__":
    # Test
    test_encoder = Encoder()
    test_decoder = Decoder()
    # print(test_encoder)
    # print(test_decoder)

    test_src = torch.stack([
        torch.Tensor([1, 2, 4, 3, 2, 5, 0, 0]),
        torch.Tensor([3, 4, 2, 6, 7, 1, 2, 3])
    ]).long()
    test_pos = torch.stack([
        torch.Tensor([1, 2, 3, 4, 5, 6, 0, 0]),
        torch.Tensor([1, 2, 3, 4, 5, 6, 7, 8])
    ]).long()
    test_target = torch.stack([
        torch.Tensor([0, 2, 3, 0, 3, 2, 1, 0]),
        torch.Tensor([1, 2, 3, 2, 2, 0, 3, 6])
    ])