def tts_predict(model, ppg, id_speaker):
    # 准备输入的数据并转换到GPU
    ppg = Variable(torch.from_numpy(ppg)).unsqueeze(0).float()
    id_speaker = torch.LongTensor([id_speaker])
    print('orig:', id_speaker)
    print(id_speaker.shape)
    # id_speaker = id_speaker.unsqueeze(0)
    print(ppg.size())
    print(ppg.shape)
    print(ppg.type())
    print('---------- id_speaker')
    print(id_speaker.size())
    print(id_speaker.shape)
    print(id_speaker.type())
    print(id_speaker)
    if use_cuda:
        ppg = ppg.cuda()
        id_speaker = id_speaker.cuda()

    # 进行预测并数据转换到CPU
    mel_pred, spec_pred = model(ppg, id_speaker)
    mel_pred = mel_pred[0].cpu().data.numpy()
    spec_pred = spec_pred[0].cpu().data.numpy()

    # vocoder合成音频波形文件
    mel_pred_audio = normalized_db_mel2wav(mel_pred)
    spec_pred_audio = normalized_db_spec2wav(spec_pred)

    return mel_pred, spec_pred, mel_pred_audio, spec_pred_audio
Example #2
0
def main():
    #这一部分用于处理LJSpeech格式的数据集
    a = open(meta_path, 'r').readlines()
    b = []
    i = 0
    while i < len(a):
        t = a[i][0:6]
        b.append(t)
        i += 2
    print(b[:2])
    a = b
    # a = [i.strip().split('|')[0] for i in a]
    cnt = 0
    cnt_list = []
    bad_cnt = 0
    bad_list = []
    for fname in tqdm(a):
        try:
            # 提取声学参数
            wav_f = os.path.join(wav_dir, fname + '.wav')
            wav_arr = load_wav(wav_f)
            mfcc_feats = wav2unnormalized_mfcc(wav_arr)
            mel_feats = wav2normalized_db_mel(wav_arr)
            spec_feats = wav2normalized_db_spec(wav_arr)
            
            # 验证声学参数提取的对
            save_name = fname + '.npy'
            save_mel_rec_name = fname + '_mel_rec.wav'
            save_spec_rec_name = fname + '_spec_rec.wav'
            # 这句话有可能错,不知道为什么,可能是服务器临时变动有关
            ppg_already_feats = np.load(os.path.join(ppg_dir, save_name))

            assert ppg_already_feats.shape[0] == mfcc_feats.shape[0]
            assert mfcc_feats.shape[0] == mel_feats.shape[0] and mel_feats.shape[0] == spec_feats.shape[0]
            write_wav(os.path.join(rec_wav_dir, save_mel_rec_name), normalized_db_mel2wav(mel_feats))
            write_wav(os.path.join(rec_wav_dir, save_spec_rec_name), normalized_db_spec2wav(spec_feats))
            
            # 存储声学参数
            mfcc_save_name = os.path.join(mfcc_dir, save_name)
            mel_save_name = os.path.join(mel_dir, save_name)
            spec_save_name = os.path.join(spec_dir, save_name)
            np.save(mfcc_save_name, mfcc_feats)
            np.save(mel_save_name, mel_feats)
            np.save(spec_save_name, spec_feats)

            f_good_meta.write(fname + '\n')
            cnt_list.append(fname)
            cnt += 1
        except:
            bad_list.append(fname)
            bad_cnt += 1
        
        # print(cnt)
        # break

    print(cnt)
    print('bad:', bad_cnt)
    print(bad_list)

    return
def tts_predict(model, ppg):
    # 准备输入的数据并转换到GPU
    ppg = Variable(torch.from_numpy(ppg)).unsqueeze(0).float()
    if use_cuda:
        ppg = ppg.cuda()

    # 进行预测并数据转换到CPU
    mel_pred, spec_pred = model(ppg)
    mel_pred = mel_pred[0].cpu().data.numpy()
    spec_pred = spec_pred[0].cpu().data.numpy()

    # vocoder合成音频波形文件
    mel_pred_audio = normalized_db_mel2wav(mel_pred)
    spec_pred_audio = normalized_db_spec2wav(spec_pred)

    return mel_pred, spec_pred, mel_pred_audio, spec_pred_audio