def main():
    """Write Chinese-projected and original English linear-spectrogram wavs
    for every English utterance, for listening comparison.

    NOTE(review): in the original source the lines that build
    ``en_ppg_l``/``cn_ppg_l``, load ``en_final_cn_idx`` and build
    ``en_file_list`` are commented out, so several names used below are
    undefined at runtime — this looks like a dead/superseded variant of the
    full pipeline ``main`` later in the file. Preserved for reference.
    """
    # en_ppg_l, en_linear_l = for_loop_en()  # per-frame English PPG list: en_l = [en_ppg1, en_ppg2, ...]
    cn_ppg_l, cn_linear_l = for_loop_cn()  # per-frame Chinese PPG list: cn_l = [cn_ppg1, cn_ppg2, ...]
    all_ppg_l = en_ppg_l + cn_ppg_l  # combined Chinese+English PPG list
    # en_final_cn_idx = np.load(en_final_cn_idx_path)
    # en_file_list = en_text2list(file=en_raw_list_path)
    # en_ppgs_ls = []

    now = 0  # global English frame counter across all files
    for f in tqdm(en_file_list):
        wav_ppgs, linears = get_single_data_pair(
            f, ppgs_dir=en_raw_ppg_path, linears_dir=en_raw_linear_dir)

        # English frame ids, numbered from zero across the whole corpus
        e_ppg_id = list(range(now, now + wav_ppgs.shape[0]))
        now += wav_ppgs.shape[0]
        print('en id from 0:', e_ppg_id[:10])

        # Map each English frame id to its nearest Chinese frame id
        # (Chinese ids also start from zero).
        c_ppg_id_projected = ppg_project(e_ppg_id, en_final_cn_idx)

        # Gather the corresponding Chinese linear-spectrogram frames.
        c_lineas_projected = np.asarray(
            [cn_linear_l[i] for i in c_ppg_id_projected])

        save_linear_name = f + '_cn_linear_projected.wav'
        write_wav(os.path.join(projected_wav_dir, save_linear_name),
                  normalized_db_spec2wav(c_lineas_projected))

        save_linear_original_name = f + '_en_linear_original.wav'
        write_wav(os.path.join(projected_wav_dir, save_linear_original_name),
                  normalized_db_spec2wav(linears))
def generate_pair_wav(spec, spec_pred, log_dir, global_step, suffix_name):
    """Vocode and save a predicted/original spectrogram pair for inspection.

    For each of the two spectrograms this writes a reconstructed ``.wav``
    (via ``normalized_db_spec2wav``) and the raw spectrogram as ``.npy``
    into ``log_dir``, tagged with the training step and ``suffix_name``.

    Args:
        spec: ground-truth (normalized-dB) linear spectrogram.
        spec_pred: model-predicted spectrogram, same convention as ``spec``.
        log_dir: output directory for the four files.
        global_step: training step, embedded in the filenames.
        suffix_name: per-sample tag, embedded in the filenames.
    """
    def _dump(spectrogram, tag):
        # One wav + one npy per spectrogram; filenames match the original
        # "step_<step>_<suffix>_<tag>.{wav,npy}" scheme exactly.
        base = os.path.join(
            log_dir,
            "step_" + str(global_step) + "_" + suffix_name + "_" + tag)
        write_wav(base + ".wav", normalized_db_spec2wav(spectrogram))
        np.save(base + ".npy", spectrogram)

    _dump(spec_pred, "predvalidation")
    _dump(spec, "original")
def tts_predict(model, ppg, id_speaker):
    """Run multi-speaker TTS inference for one utterance.

    Args:
        model: network mapping ``(ppg, speaker_id)`` to
            ``(mel_pred, spec_pred)`` batched tensors.
        ppg: frame-level PPG features as a numpy array
            (assumed shape (T, D) — TODO confirm against caller).
        id_speaker: integer speaker index.

    Returns:
        Tuple ``(mel_pred, spec_pred, mel_pred_audio, spec_pred_audio)``:
        the predicted mel/linear spectrograms (numpy) and the vocoded
        waveforms for each.
    """
    # Prepare inputs and move to GPU. torch.autograd.Variable has been a
    # no-op wrapper since PyTorch 0.4, so tensors are used directly; the
    # leftover debug prints from the original were removed.
    ppg = torch.from_numpy(ppg).unsqueeze(0).float()  # add batch dim
    id_speaker = torch.LongTensor([id_speaker])
    if use_cuda:
        ppg = ppg.cuda()
        id_speaker = id_speaker.cuda()

    # Predict and move results back to CPU numpy.
    mel_pred, spec_pred = model(ppg, id_speaker)
    mel_pred = mel_pred[0].cpu().data.numpy()
    spec_pred = spec_pred[0].cpu().data.numpy()

    # Vocoder: synthesize waveforms from both feature streams.
    mel_pred_audio = normalized_db_mel2wav(mel_pred)
    spec_pred_audio = normalized_db_spec2wav(spec_pred)
    return mel_pred, spec_pred, mel_pred_audio, spec_pred_audio
def main():
    """Preprocess an LJSpeech-format dataset: extract acoustic features.

    For each utterance id read from ``meta_path`` (every other line; the id
    is the first 6 characters), this loads the wav, extracts MFCC, mel and
    linear-spectrogram features, sanity-checks frame counts against the
    precomputed PPGs, writes reconstruction wavs for auditing, saves the
    features as ``.npy``, and records good/bad file ids.
    """
    # Read the metadata; use a context manager so the handle is closed.
    with open(meta_path, 'r') as meta_file:
        meta_lines = meta_file.readlines()
    # Every other line is a record; first 6 chars are the utterance id.
    fnames = [line[0:6] for line in meta_lines[::2]]
    print(fnames[:2])
    # a = [i.strip().split('|')[0] for i in a]

    cnt = 0
    cnt_list = []
    bad_cnt = 0
    bad_list = []
    for fname in tqdm(fnames):
        try:
            # Extract acoustic features.
            wav_f = os.path.join(wav_dir, fname + '.wav')
            wav_arr = load_wav(wav_f)
            mfcc_feats = wav2unnormalized_mfcc(wav_arr)
            mel_feats = wav2normalized_db_mel(wav_arr)
            spec_feats = wav2normalized_db_spec(wav_arr)

            # Verify feature extraction: frame counts must agree with the
            # precomputed PPGs. (The np.load occasionally fails; possibly
            # related to transient server/storage issues.)
            save_name = fname + '.npy'
            save_mel_rec_name = fname + '_mel_rec.wav'
            save_spec_rec_name = fname + '_spec_rec.wav'
            ppg_already_feats = np.load(os.path.join(ppg_dir, save_name))
            assert ppg_already_feats.shape[0] == mfcc_feats.shape[0]
            assert (mfcc_feats.shape[0] == mel_feats.shape[0]
                    and mel_feats.shape[0] == spec_feats.shape[0])
            write_wav(os.path.join(rec_wav_dir, save_mel_rec_name),
                      normalized_db_mel2wav(mel_feats))
            write_wav(os.path.join(rec_wav_dir, save_spec_rec_name),
                      normalized_db_spec2wav(spec_feats))

            # Store the acoustic features.
            np.save(os.path.join(mfcc_dir, save_name), mfcc_feats)
            np.save(os.path.join(mel_dir, save_name), mel_feats)
            np.save(os.path.join(spec_dir, save_name), spec_feats)

            f_good_meta.write(fname + '\n')
            cnt_list.append(fname)
            cnt += 1
        except Exception:
            # Narrowed from a bare except: still best-effort per file, but
            # no longer swallows KeyboardInterrupt/SystemExit.
            bad_list.append(fname)
            bad_cnt += 1
        # print(cnt)
        # break

    print(cnt)
    print('bad:', bad_cnt)
    print(bad_list)
    return
def eval_model_generate(spec, spec_pred, length, log_dir, global_step):
    """Save evaluation artifacts: vocoded wav + raw spectrogram for both
    the predicted and the ground-truth spectrogram at a checkpoint step.

    Args:
        spec: ground-truth (normalized-dB) linear spectrogram.
        spec_pred: predicted spectrogram.
        length: sequence length, logged only (not used for trimming here).
        log_dir: output directory.
        global_step: checkpoint step, embedded in filenames.
    """
    print("EVAL LENGTH:", length)
    print("EVAL SPEC PRED SHAPE:", spec_pred.shape)
    y_pred = normalized_db_spec2wav(spec_pred)
    pred_wav_path = os.path.join(
        log_dir, "checkpoint_step_{}_pred.wav".format(global_step))
    write_wav(pred_wav_path, y_pred)
    pred_spec_path = os.path.join(
        log_dir, "checkpoint_step_{}_pred_spec.npy".format(global_step))
    np.save(pred_spec_path, spec_pred)

    print("EVAL LENGTH:", length)
    print("EVAL SPEC SHAPE:", spec.shape)
    y = normalized_db_spec2wav(spec)
    orig_wav_path = os.path.join(
        log_dir, "checkpoint_step_{}_original.wav".format(global_step))
    write_wav(orig_wav_path, y)
    orig_spec_path = os.path.join(
        log_dir, "checkpoint_step_{}_orig_spec.npy".format(global_step))
    np.save(orig_spec_path, spec)
def tts_predict(model, ppg):
    """Run single-speaker TTS inference for one utterance.

    Args:
        model: network mapping a batched PPG tensor to
            ``(mel_pred, spec_pred)``.
        ppg: frame-level PPG features as a numpy array
            (assumed shape (T, D) — TODO confirm against caller).

    Returns:
        Tuple ``(mel_pred, spec_pred, mel_pred_audio, spec_pred_audio)``:
        predicted mel/linear spectrograms (numpy) and the vocoded
        waveform for each.
    """
    # Prepare input and move to GPU. torch.autograd.Variable has been a
    # no-op wrapper since PyTorch 0.4, so the tensor is used directly.
    ppg = torch.from_numpy(ppg).unsqueeze(0).float()  # add batch dim
    if use_cuda:
        ppg = ppg.cuda()

    # Predict and move results back to CPU numpy.
    mel_pred, spec_pred = model(ppg)
    mel_pred = mel_pred[0].cpu().data.numpy()
    spec_pred = spec_pred[0].cpu().data.numpy()

    # Vocoder: synthesize waveforms from both feature streams.
    mel_pred_audio = normalized_db_mel2wav(mel_pred)
    spec_pred_audio = normalized_db_spec2wav(spec_pred)
    return mel_pred, spec_pred, mel_pred_audio, spec_pred_audio
def main():
    """Full EN→CN PPG projection pipeline.

    Steps:
      1. Load every English and Chinese PPG frame (and linears) in memory.
      2. K-means-cluster the combined frames into ``K_small`` classes.
      3. Build one KD-tree per class over the Chinese frames in it.
      4. For each English frame, find its nearest Chinese frame within its
         class; save the resulting index map ``en_final_cn_idx``.
      5. "findA": for each English utterance, save projected Chinese
         PPGs/linears/wav alongside the English originals for comparison.
    """
    print('start program')
    program_time = time.time()
    last_time = time.time()
    en_ppg_l, en_linear_l = for_loop_en()  # per-frame English PPG list: en_l = [en_ppg1, en_ppg2, ...]
    cn_ppg_l, cn_linear_l = for_loop_cn()  # per-frame Chinese PPG list: cn_l = [cn_ppg1, cn_ppg2, ...]
    all_ppg_l = en_ppg_l + cn_ppg_l  # combined Chinese+English PPG list
    print('end put ppg in memory, use:', time.time() - last_time)

    last_time = time.time()
    print('start cluster...')
    # Fast clustering of all frames:
    # all_ppg_l = [en_ppg1, ..., cn_ppg1, ...] ->
    # all_class = [en_label, ..., cn_label, ...]
    all_class = cluster_kmeans(all_ppg_l, K_small)
    print('end cluster..., k-means use:', time.time() - last_time)

    last_time = time.time()
    # Per-class containers: for each of the K_small classes, the Chinese
    # frame indices, their PPG values, and a KD-tree over those values.
    class_cn_ppgs = [list() for _ in range(K_small)]
    class_cn_ppgs_value = [list() for _ in range(K_small)]
    class_cn_ppgs_value_kdtree = list()

    # Class bookkeeping: which Chinese PPG frames fall in each class
    # (~100 Chinese frames per class on average).
    en_ppg_l_len = len(en_ppg_l)
    for i in range(len(cn_ppg_l)):
        # Chinese labels follow the English ones inside all_class.
        now_class = all_class[i + en_ppg_l_len]  # class id in [0, K_small)
        class_cn_ppgs[now_class].append(i)
        class_cn_ppgs_value[now_class].append(cn_ppg_l[i])
    print('prepare for class infomation use:', time.time() - last_time)

    print('start construct kdtree')
    all_last_time = time.time()
    have_cnt = 0
    for i in tqdm(range(K_small)):
        l = len(class_cn_ppgs[i])
        if l > 0:
            have_cnt += 1
            print('cluster', i, 'len', l, 'start construct kd-tree')
            last_time = time.time()
            class_cn_ppgs_value[i] = np.asarray(class_cn_ppgs_value[i])
            class_cn_ppgs_value_kdtree.append(
                KDTree(class_cn_ppgs_value[i], leaf_size=40))
            print('end cluster', i, 'kd-tree use:', time.time() - last_time)
        else:
            # BUGFIX: the original appended nothing for empty classes, so
            # class_cn_ppgs_value_kdtree[now_class] below would be
            # misaligned whenever any class had no Chinese frames. Keep the
            # list index-aligned with class ids via a placeholder.
            class_cn_ppgs_value_kdtree.append(None)
    print('have class:', have_cnt)
    print('end construct all kdtrees, tot use:', time.time() - all_last_time)

    # For each English frame, find the closest Chinese frame in its class.
    print('start get cloest map array for all en ppg')
    last_time = time.time()
    # Index map en -> cn; integer dtype since the values are list indices.
    en_final_cn_idx = np.zeros((en_ppg_l_len), dtype=np.int64)
    for i in tqdm(range(en_ppg_l_len)):
        now_class = all_class[i]  # English class id in [0, K_small)
        # Brute-force alternative (kept for cross-checking):
        # ans1, ans_id1 = bruce_find_closest(i, now_class, en_ppg_l,
        #                                    cn_ppg_l, class_cn_ppgs)
        ans, ans_id = kdtree_find_closest(
            i, en_ppg_l, class_cn_ppgs_value_kdtree[now_class],
            class_cn_ppgs[now_class])
        # assert np.absolute(ans1 - ans) < eps and ans_id1 == ans_id
        en_final_cn_idx[i] = ans_id
    np.save(en_final_cn_idx_path, en_final_cn_idx)
    print('end write map array, all use:', time.time() - last_time)

    # "findA": write projected Chinese artifacts next to English originals.
    print('start findA')
    last_time = time.time()
    en_file_list = en_text2list(file=en_raw_list_path)
    now = 0  # global English frame counter across files
    for f in tqdm(en_file_list):
        wav_ppgs, linears = get_single_data_pair(
            f, ppgs_dir=en_raw_ppg_path, linears_dir=en_raw_linear_dir)

        # English frame ids, numbered from zero across the whole corpus.
        e_ppg_id = list(range(now, now + wav_ppgs.shape[0]))
        now += wav_ppgs.shape[0]
        print('en id from 0:', e_ppg_id[:10])

        # Map to Chinese frame ids (Chinese ids also start from zero).
        c_ppg_id_projected = ppg_project(e_ppg_id, en_final_cn_idx)

        # Gather and store the projected PPGs.
        c_ppgs_projected = np.asarray(
            [cn_ppg_l[i] for i in c_ppg_id_projected])
        save_ppg_name_projected = f + '_cn_ppg_projected.npy'
        np.save(os.path.join(projected_wav_dir, save_ppg_name_projected),
                c_ppgs_projected)

        # Gather and store the projected linears.
        c_lineas_projected = np.asarray(
            [cn_linear_l[i] for i in c_ppg_id_projected])
        save_linear_name_projected = f + '_cn_linear_projected.npy'
        np.save(os.path.join(projected_wav_dir, save_linear_name_projected),
                c_lineas_projected)

        # Vocode and store the projected wav.
        save_wav_name_projected = f + '_cn_wav_projected.wav'
        write_wav(os.path.join(projected_wav_dir, save_wav_name_projected),
                  normalized_db_spec2wav(c_lineas_projected))

        # ---- originals (ppg, linear, wav) stored for comparison ----
        save_ppg_name_original = f + '_en_ppg_original.npy'
        np.save(os.path.join(projected_wav_dir, save_ppg_name_original),
                wav_ppgs)
        save_linear_name_original = f + '_en_linear_original.npy'
        np.save(os.path.join(projected_wav_dir, save_linear_name_original),
                linears)
        save_wav_name_original = f + '_en_wav_original.wav'
        write_wav(os.path.join(projected_wav_dir, save_wav_name_original),
                  normalized_db_spec2wav(linears))
    print('end findA, use:', time.time() - last_time)
    print('program use:', time.time() - program_time)