def tts_predict(model, ppg, id_speaker):
    """Run speaker-conditioned TTS inference for one utterance.

    Args:
        model: trained network, called as ``model(ppg, id_speaker)`` and
            expected to return batched ``(mel_pred, spec_pred)`` tensors.
        ppg: numpy array of phonetic posteriorgrams — presumably shaped
            (frames, ppg_dims); TODO confirm against caller.
        id_speaker: integer speaker id.

    Returns:
        Tuple ``(mel_pred, spec_pred, mel_pred_audio, spec_pred_audio)``:
        the predicted mel / linear spectrograms as numpy arrays (batch dim
        stripped) and the waveforms synthesized from them by the vocoder
        helpers.
    """
    # Prepare inputs and move them to GPU if available.
    # torch.autograd.Variable has been a no-op since PyTorch 0.4 — use
    # tensors directly. The debug prints from the original are removed.
    ppg = torch.from_numpy(ppg).unsqueeze(0).float()
    id_speaker = torch.LongTensor([id_speaker])
    if use_cuda:
        ppg = ppg.cuda()
        id_speaker = id_speaker.cuda()

    # Inference only — disable autograd bookkeeping, then move results
    # back to CPU numpy, dropping the batch dimension.
    with torch.no_grad():
        mel_pred, spec_pred = model(ppg, id_speaker)
    mel_pred = mel_pred[0].cpu().numpy()
    spec_pred = spec_pred[0].cpu().numpy()

    # Vocoder: synthesize audible waveforms from the predicted features.
    mel_pred_audio = normalized_db_mel2wav(mel_pred)
    spec_pred_audio = normalized_db_spec2wav(spec_pred)
    return mel_pred, spec_pred, mel_pred_audio, spec_pred_audio
def main():
    """Extract acoustic features for an LJSpeech-format dataset.

    Reads utterance ids from ``meta_path`` (first 6 characters of every
    other line), then for each id: loads the wav, computes MFCC / mel /
    linear-spectrogram features, checks their frame counts match the
    precomputed PPGs, writes reconstruction wavs for sanity listening,
    and saves the feature arrays. Ids that fail any step are collected
    and reported at the end (best-effort, as in the original).
    """
    # Every other metadata line, first 6 chars, is a file id.
    # NOTE(review): the stride-2 / 6-char format is assumed from the
    # original code — confirm against the actual metadata file.
    with open(meta_path, 'r') as meta_f:
        lines = meta_f.readlines()
    fnames = [lines[i][0:6] for i in range(0, len(lines), 2)]
    print(fnames[:2])

    cnt = 0
    cnt_list = []
    bad_cnt = 0
    bad_list = []
    for fname in tqdm(fnames):
        try:
            # Extract acoustic features from the waveform.
            wav_f = os.path.join(wav_dir, fname + '.wav')
            wav_arr = load_wav(wav_f)
            mfcc_feats = wav2unnormalized_mfcc(wav_arr)
            mel_feats = wav2normalized_db_mel(wav_arr)
            spec_feats = wav2normalized_db_spec(wav_arr)

            # Verify frame alignment against the precomputed PPGs.
            # (The np.load occasionally failed in the original — possibly
            # a transient server/storage issue.)
            save_name = fname + '.npy'
            save_mel_rec_name = fname + '_mel_rec.wav'
            save_spec_rec_name = fname + '_spec_rec.wav'
            ppg_already_feats = np.load(os.path.join(ppg_dir, save_name))
            assert ppg_already_feats.shape[0] == mfcc_feats.shape[0]
            assert mfcc_feats.shape[0] == mel_feats.shape[0] == spec_feats.shape[0]
            write_wav(os.path.join(rec_wav_dir, save_mel_rec_name),
                      normalized_db_mel2wav(mel_feats))
            write_wav(os.path.join(rec_wav_dir, save_spec_rec_name),
                      normalized_db_spec2wav(spec_feats))

            # Persist the feature arrays and record the good id.
            np.save(os.path.join(mfcc_dir, save_name), mfcc_feats)
            np.save(os.path.join(mel_dir, save_name), mel_feats)
            np.save(os.path.join(spec_dir, save_name), spec_feats)
            f_good_meta.write(fname + '\n')
            cnt_list.append(fname)
            cnt += 1
        except Exception:
            # The original used a bare except; narrowed to Exception so
            # KeyboardInterrupt/SystemExit still propagate, while keeping
            # the deliberate best-effort skip of bad utterances.
            bad_list.append(fname)
            bad_cnt += 1

    print(cnt)
    print('bad:', bad_cnt)
    print(bad_list)
    return
def tts_predict(model, ppg, id_speaker=None):
    """Run TTS inference for one utterance, optionally speaker-conditioned.

    NOTE(review): this definition shadows an earlier ``tts_predict`` in
    the file that required an ``id_speaker``; the optional parameter
    restores that capability while remaining backward-compatible with
    existing no-speaker callers.

    Args:
        model: trained network, called as ``model(ppg)`` or
            ``model(ppg, id_speaker)``; returns batched
            ``(mel_pred, spec_pred)`` tensors.
        ppg: numpy array of phonetic posteriorgrams — presumably shaped
            (frames, ppg_dims); TODO confirm against caller.
        id_speaker: optional integer speaker id; ``None`` keeps the
            original unconditioned call.

    Returns:
        Tuple ``(mel_pred, spec_pred, mel_pred_audio, spec_pred_audio)``.
    """
    # Prepare input and move to GPU if available. Variable is a no-op
    # since PyTorch 0.4, so plain tensors are used.
    ppg = torch.from_numpy(ppg).unsqueeze(0).float()
    if use_cuda:
        ppg = ppg.cuda()

    # Inference only — no autograd. Results back to CPU numpy, batch
    # dimension stripped.
    with torch.no_grad():
        if id_speaker is None:
            mel_pred, spec_pred = model(ppg)
        else:
            spk = torch.LongTensor([id_speaker])
            if use_cuda:
                spk = spk.cuda()
            mel_pred, spec_pred = model(ppg, spk)
    mel_pred = mel_pred[0].cpu().numpy()
    spec_pred = spec_pred[0].cpu().numpy()

    # Vocoder: synthesize waveforms from the predicted features.
    mel_pred_audio = normalized_db_mel2wav(mel_pred)
    spec_pred_audio = normalized_db_spec2wav(spec_pred)
    return mel_pred, spec_pred, mel_pred_audio, spec_pred_audio