Esempio n. 1
0
def create_model(hparams):
    """Build a Tacotron/Tacotron2 model and its matching loss criterion.

    Args:
        hparams: hyper-parameter object providing ``tacotron_version``
            ("1" or "2"), ``tacotron_config`` (path to a JSON model-config
            file), and the model dimensions (``num_symbols``,
            ``symbols_embed_dim``, ``mel_dim``, ``max_decoder_steps``,
            ``stop_threshold``, ``r``).

    Returns:
        Tuple ``(model, criterion)``.

    Raises:
        ValueError: if ``hparams.tacotron_version`` is neither "1" nor "2".
    """
    # Fail fast on an unsupported version before touching the config file,
    # so a bad version never surfaces as a config-file I/O error.
    if hparams.tacotron_version not in ("1", "2"):
        raise ValueError("Unsupported Tacotron version: {}".format(
            hparams.tacotron_version))

    # Model config: architecture details live in an external JSON file.
    with open(hparams.tacotron_config, 'r') as f:
        model_cfg = json.load(f)

    if hparams.tacotron_version == "1":
        # Tacotron additionally predicts a linear spectrogram; this setup
        # reuses the mel dimension for that output.
        model = Tacotron(n_vocab=hparams.num_symbols,
                         embed_dim=hparams.symbols_embed_dim,
                         mel_dim=hparams.mel_dim,
                         linear_dim=hparams.mel_dim,
                         max_decoder_steps=hparams.max_decoder_steps,
                         stop_threshold=hparams.stop_threshold,
                         r=hparams.r,
                         model_cfg=model_cfg)
        criterion = TacotronLoss()
    else:
        model = Tacotron2(n_vocab=hparams.num_symbols,
                          embed_dim=hparams.symbols_embed_dim,
                          mel_dim=hparams.mel_dim,
                          max_decoder_steps=hparams.max_decoder_steps,
                          stop_threshold=hparams.stop_threshold,
                          r=hparams.r,
                          model_cfg=model_cfg)
        criterion = Tacotron2Loss()

    return model, criterion
Esempio n. 2
0
def evluate(path1, path2):
    """Run inference over the test set and report, per sample, the
    Euclidean distance between the predicted and ground-truth mel
    spectrograms.

    Args:
        path1: path to the test-set text data.
        path2: path to the test-set audio data.
    """
    # Load and tensorize the test-set text.
    input_ids, vocab_inp_size = dataset_txt(path1)
    input_ids = tf.convert_to_tensor(input_ids)
    # Load the test-set audio (mel spectrograms + lengths).
    mel_gts, mel_len_wav = dataset_wave(path2, config)
    # NOTE(review): this discards the vocabulary size returned by
    # dataset_txt in favour of a hard-coded value — confirm 55 matches
    # the vocabulary the checkpoint was trained with.
    vocab_inp_size = 55
    # Build the model and restore the most recent checkpoint.
    tacotron2 = Tacotron2(vocab_inp_size, config)
    ckpt_dir = './training_checkpoints2'
    load_checkpoint(tacotron2, ckpt_dir)
    print('已恢复至最新的检查点!')
    for idx in range(input_ids.shape[0]):
        # Add a batch dimension to the single sample.
        sample = tf.expand_dims(input_ids[idx], axis=0)
        mel_outputs, mel_outputs_postnet, gate_outputs, alignments = \
            tacotron2.inference(sample)
        target = tf.expand_dims(mel_gts[idx], axis=0)
        print("欧式距离为:")
        compute_distence(mel_outputs_postnet, target)
Esempio n. 3
0
 # NOTE(review): this fragment is uniformly indented by one space and is
 # presumably the body of an enclosing scope not visible here.
 # Load the tokenised training text (ids + input vocabulary size).
 input_ids,vocab_inp_size = dataset_txt(text_train_path)
 input_ids = tf.convert_to_tensor(input_ids)
 # Load the training audio as mel spectrograms plus per-utterance lengths.
 mel_gts,mel_len_wav = dataset_wave(wave_train_path,config)
 # Build the stop-token reference targets, padded to config.max_len.
 tar_token = tar_stop_token(mel_len_wav, mel_gts, config.max_len)
 tar_token = tf.convert_to_tensor(tar_token)
 # Synthesize ground-truth audio from the first sample.
 # Transpose assumes mel_gts is (batch, time, mel) -> (batch, mel, time);
 # TODO(review): confirm against dataset_wave's output layout.
 mel_gts = tf.transpose(mel_gts, [0, 2, 1])
 wav = melspectrogram2wav(mel_gts[0].numpy())
 sr = 22050  # output sample rate in Hz
 wave.write('真实1.wav', rate=sr, data=wav)  # filename means "ground truth 1"
 # Build the batched input pipeline.
 dataset,steps_per_epoch = create_dataset(batch_size, input_ids, mel_gts, tar_token)
 # Initialise the model and optimiser.
 tacotron2 = Tacotron2(vocab_inp_size, config)
 # NOTE(review): `lr=` is the legacy Keras kwarg; newer versions use
 # `learning_rate=` — confirm against the pinned TF version.
 optimizer = tf.keras.optimizers.Adam(lr=0.0001)
 # Checkpointing setup (model weights only; optimizer state is not tracked).
 checkpoint_dir = './training_checkpoints2'
 checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
 checkpoint = tf.train.Checkpoint(tacotron2=tacotron2)
 # Run training; train() returns the last batch of predicted mels.
 epochs = 2
 mel_outputs = train(tacotron2, optimizer, dataset, epochs, steps_per_epoch, checkpoint)
 # Synthesize audio from the predicted mel spectrogram.
 wav = melspectrogram2wav(mel_outputs[0].numpy())
 sr = 22050
 wave.write('预测1.wav', rate=sr, data=wav)  # filename means "prediction 1"
 # Plotting (the plot code appears to continue beyond this fragment).
 plt.figure()
 mel_gts = tf.transpose(mel_gts, [0, 2, 1])
Esempio n. 4
0
def train(dataset_dir, log_dir, load_path=None):
    """Train a Tacotron2 model, periodically saving checkpoints and mel plots.

    Args:
        dataset_dir: directory handed to SpeechDataset.
        log_dir: directory where checkpoints and mel PNGs are written.
        load_path: optional checkpoint path to resume training from.
    """
    # init Tacotron2
    model = Tacotron2()

    # init loss fn
    criterion = Tacotron2Loss()

    # init optimizer; base_lr is rescaled every step by the Noam lambda below.
    base_lr = 0.05
    optimizer = torch.optim.Adam(model.parameters(), lr=base_lr)
    epoch = 0
    max_epoch = 100
    iteration = 1
    save_iters = 100  # checkpoint/plot every N iterations

    if load_path is not None:
        model, optimizer, iteration = load_model(load_path, model, optimizer)

    # Noam-style schedule: linear warmup over 4000 steps, then 1/sqrt decay.
    lr_lambda = lambda step: 4000**0.5 * min((step + 1) * 4000**-1.5,
                                             (step + 1)**-0.5)
    if load_path is not None:
        # BUGFIX: resuming LambdaLR with last_epoch != -1 requires each
        # param group to carry 'initial_lr', otherwise the constructor
        # raises KeyError. Also, last_epoch is the index of the last
        # *completed* step, hence iteration - 1 (iteration starts at 1).
        for group in optimizer.param_groups:
            group.setdefault('initial_lr', base_lr)
        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
                                                      lr_lambda=lr_lambda,
                                                      last_epoch=iteration - 1)
    else:
        scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer,
                                                      lr_lambda=lr_lambda)

    # prepare data loader
    dataset = SpeechDataset(dataset_dir)
    collate_fn = SpeechCollate()

    batch_size = 2
    dataloader = DataLoader(dataset,
                            num_workers=0,
                            shuffle=True,
                            batch_size=batch_size,
                            drop_last=True,  # keep batch shapes uniform
                            collate_fn=collate_fn)

    # put the model in train mode (enables dropout etc.)
    model.train()

    while epoch < max_epoch:
        total_loss = 0
        for batch in dataloader:
            stime = time()
            mel_padded, output_lengths, text_padded, input_lengths = batch
            mel_predict = model((text_padded.long(), input_lengths.long(),
                                 mel_padded.float(), output_lengths.long()))

            # criterion returns the backprop-able loss and a float for logging
            loss, loss_item = criterion(mel_predict, mel_padded)
            total_loss += loss_item
            model.zero_grad()
            loss.backward()
            optimizer.step()
            scheduler.step()  # per-iteration schedule (not per-epoch)

            dur_time = time() - stime
            lr = optimizer.param_groups[0]['lr']
            print(
                'epoch : {}, iteration : {}, loss : {:.8f}, time : {:.1f}s/it (lr : {})'
                .format(epoch + 1, iteration, loss_item, dur_time, lr))

            if iteration % save_iters == 0:
                save_model(log_dir, model, optimizer, iteration)
                # Plot the first sample of the batch: prediction vs target.
                mel_output = mel_predict[0].detach().numpy().astype(np.float32)
                mel_target = mel_padded[0].detach().numpy().astype(np.float32)
                png_path = os.path.join(log_dir,
                                        'mel_{}.png'.format(iteration))
                save_png((mel_output, mel_target), png_path)

            iteration += 1
        epoch += 1