def create_model(hparams):
    """Build a Tacotron or Tacotron2 model plus its matching loss criterion.

    Args:
        hparams: hyper-parameter object providing `tacotron_config` (path to
            a JSON model-config file), `tacotron_version` ("1" or "2"), and
            the model sizes (`num_symbols`, `symbols_embed_dim`, `mel_dim`,
            `max_decoder_steps`, `stop_threshold`, `r`).

    Returns:
        A `(model, criterion)` tuple.

    Raises:
        ValueError: if `hparams.tacotron_version` is neither "1" nor "2".
    """
    # Model config
    with open(hparams.tacotron_config, 'r') as f:
        model_cfg = json.load(f)

    if hparams.tacotron_version == "1":
        # Tacotron model.  NOTE(review): linear_dim is wired to mel_dim —
        # presumably intentional (no separate linear-spectrogram size); confirm.
        model = Tacotron(n_vocab=hparams.num_symbols,
                         embed_dim=hparams.symbols_embed_dim,
                         mel_dim=hparams.mel_dim,
                         linear_dim=hparams.mel_dim,
                         max_decoder_steps=hparams.max_decoder_steps,
                         stop_threshold=hparams.stop_threshold,
                         r=hparams.r,
                         model_cfg=model_cfg)
        # Loss criterion
        criterion = TacotronLoss()
    elif hparams.tacotron_version == "2":
        # Tacotron2 model
        model = Tacotron2(n_vocab=hparams.num_symbols,
                          embed_dim=hparams.symbols_embed_dim,
                          mel_dim=hparams.mel_dim,
                          max_decoder_steps=hparams.max_decoder_steps,
                          stop_threshold=hparams.stop_threshold,
                          r=hparams.r,
                          model_cfg=model_cfg)
        # Loss criterion
        criterion = Tacotron2Loss()
    else:
        raise ValueError("Unsupported Tacotron version: {} ".format(
            hparams.tacotron_version))

    # BUGFIX(review): in the source the trailing return read
    # "# return model, criterion", which would make this factory return None;
    # restored the explicit return so callers receive the pair.
    return model, criterion
def evluate(path1, path2):
    """Run inference over a held-out set and report, per utterance, the
    Euclidean distance between predicted and ground-truth mel spectrograms.

    NOTE(review): "evluate" is a typo for "evaluate"; kept as-is because
    callers elsewhere may reference this name.

    Args:
        path1: path to the test-set text data.
        path2: path to the test-set audio data.
    """
    # Load the test-set text data and lift it into a tensor.
    input_ids, vocab_inp_size = dataset_txt(path1)
    input_ids = tf.convert_to_tensor(input_ids)

    # Load the test-set audio (ground-truth mels and their lengths).
    mel_gts, mel_len_wav = dataset_wave(path2, config)

    # NOTE(review): the vocabulary size returned by dataset_txt is overridden
    # with a hard-coded 55 — presumably to match the size the checkpoint was
    # trained with; confirm against the training configuration.
    vocab_inp_size = 55

    # Build the model and restore the most recent checkpoint.
    model = Tacotron2(vocab_inp_size, config)
    ckpt_dir = './training_checkpoints2'
    load_checkpoint(model, ckpt_dir)
    print('已恢复至最新的检查点!')

    # Score every utterance independently with batch size 1.
    for idx in range(input_ids.shape[0]):
        text_batch = tf.expand_dims(input_ids[idx], axis=0)
        mel_outputs, mel_outputs_postnet, gate_outputs, alignments = \
            model.inference(text_batch)
        target_batch = tf.expand_dims(mel_gts[idx], axis=0)
        print("欧式距离为:")
        compute_distence(mel_outputs_postnet, target_batch)
#取数据 input_ids,vocab_inp_size = dataset_txt(text_train_path) input_ids = tf.convert_to_tensor(input_ids) mel_gts,mel_len_wav = dataset_wave(wave_train_path,config) # 生成stop_token的参照值 tar_token = tar_stop_token(mel_len_wav, mel_gts, config.max_len) tar_token = tf.convert_to_tensor(tar_token) #生成真实的声音 mel_gts = tf.transpose(mel_gts, [0, 2, 1]) wav = melspectrogram2wav(mel_gts[0].numpy()) sr = 22050 wave.write('真实1.wav', rate=sr, data=wav) #建立输入输出流 dataset,steps_per_epoch = create_dataset(batch_size, input_ids, mel_gts, tar_token) # 初始化模型和优化器 tacotron2 = Tacotron2(vocab_inp_size, config) optimizer = tf.keras.optimizers.Adam(lr=0.0001) #检查点 checkpoint_dir = './training_checkpoints2' checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt") checkpoint = tf.train.Checkpoint(tacotron2=tacotron2) #训练 epochs = 2 mel_outputs = train(tacotron2, optimizer, dataset, epochs, steps_per_epoch, checkpoint) #生成预测的声音 wav = melspectrogram2wav(mel_outputs[0].numpy()) sr = 22050 wave.write('预测1.wav', rate=sr, data=wav) #画图 plt.figure() mel_gts = tf.transpose(mel_gts, [0, 2, 1])
def train(dataset_dir, log_dir, load_path=None):
    """Train a Tacotron2 model with a Noam-style learning-rate schedule.

    Args:
        dataset_dir: directory handed to `SpeechDataset`.
        log_dir: directory where checkpoints and mel-comparison PNGs are saved.
        load_path: optional checkpoint path to resume training from.
    """
    # init Tacotron2
    model = Tacotron2()
    # init loss fn
    criterion = Tacotron2Loss()
    # init optimizer; the base lr is rescaled every step by the schedule below
    optimizer = torch.optim.Adam(model.parameters(), lr=0.05)

    epoch = 0
    max_epoch = 100
    iteration = 1
    save_iters = 100  # checkpoint + snapshot cadence, in iterations

    if load_path is not None:
        model, optimizer, iteration = load_model(load_path, model, optimizer)

    # Noam / Transformer warmup-then-decay schedule with 4000 warmup steps.
    lr_lambda = lambda step: 4000**0.5 * min((step + 1) * 4000**-1.5,
                                             (step + 1)**-0.5)
    if load_path is not None:
        # NOTE(review): resuming with last_epoch=iteration assumes the loaded
        # optimizer state carries 'initial_lr'; otherwise LambdaLR raises —
        # confirm load_model restores the full optimizer state dict.
        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
                                                      lr_lambda=lr_lambda,
                                                      last_epoch=iteration)
    else:
        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
                                                      lr_lambda=lr_lambda)

    # prepare data loader
    dataset = SpeechDataset(dataset_dir)
    collate_fn = SpeechCollate()
    batch_size = 2
    dataloader = DataLoader(dataset,
                            num_workers=0,
                            shuffle=True,
                            batch_size=batch_size,
                            drop_last=True,
                            collate_fn=collate_fn)

    # change train mode
    model.train()

    # BUGFIX(review): removed the per-epoch `total_loss` accumulator — it was
    # written every step but never read anywhere (dead code).
    while epoch < max_epoch:
        for batch in dataloader:
            stime = time()
            mel_padded, output_lengths, text_padded, input_lengths = batch
            mel_predict = model((text_padded.long(), input_lengths.long(),
                                 mel_padded.float(), output_lengths.long()))
            loss, loss_item = criterion(mel_predict, mel_padded)

            model.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()

            dur_time = time() - stime
            lr = optimizer.param_groups[0]['lr']
            print(
                'epoch : {}, iteration : {}, loss : {:.8f}, time : {:.1f}s/it (lr : {})'
                .format(epoch + 1, iteration, loss_item, dur_time, lr))

            if iteration % save_iters == 0:
                # Persist the model and dump a predicted-vs-target mel image
                # for visual inspection.
                save_model(log_dir, model, optimizer, iteration)
                mel_output = mel_predict[0].detach().numpy().astype(np.float32)
                mel_target = mel_padded[0].detach().numpy().astype(np.float32)
                png_path = os.path.join(log_dir, 'mel_{}.png'.format(iteration))
                save_png((mel_output, mel_target), png_path)

            iteration += 1
        epoch += 1