def _pad_frames(x, left, right):
    """Replicate-pad ``x`` along dim 1 with ``left``/``right`` extra entries.

    ``F.pad`` pads the last dimension, so dims 1 and 2 are swapped around the call.
    """
    return F.pad(x.transpose(1, 2), (left, right), "replicate").transpose(1, 2)


def _crop_frames(x, left, right):
    """Return ``x[:, left:-right]``, keeping the tail when ``right`` is not positive.

    A plain ``x[:, left:-right]`` would be empty for ``right == 0``; this helper
    replaces the repeated ``if right > 0: ... else: ...`` pairs.
    """
    stop = -right if right > 0 else None
    return x[:, left:stop]


def _spk_index_seq(spk_idx, n_frames):
    """Build a ``(1, n_frames)`` CUDA long tensor filled with ``spk_idx``."""
    return (torch.ones((1, n_frames)) * spk_idx).cuda().long()


def decode_RNN(feat_list, gpu, cvlist=None, mcd_cvlist_src=None, mcdstd_cvlist_src=None,
               mcdpow_cvlist_src=None, mcdpowstd_cvlist_src=None,
               mcd_cvlist_cyc=None, mcdstd_cvlist_cyc=None, mcdpow_cvlist_cyc=None,
               mcdpowstd_cvlist_cyc=None,
               mcd_cvlist=None, mcdstd_cvlist=None, mcdpow_cvlist=None, mcdpowstd_cvlist=None,
               lat_dist_rmse_list=None, lat_dist_cosim_list=None):
    """Reconstruct, convert and cyclically reconstruct every feature file on one GPU.

    The encoder, decoder and post-net are built from the module-level ``config``
    and restored from the checkpoint at ``args.model``. For each file in
    ``feat_list`` the function

    * reconstructs the spectrum with the source speaker index, converts it with
      ``trg_idx``, re-encodes the converted spectrum and decodes it back with
      the source speaker index (cyclic reconstruction);
    * logs mel-cepstral distortion statistics and appends them to the given lists;
    * writes ``*_anasyn.wav``, ``*_rec.wav``, ``*_cv.wav`` and ``*_cvGV.wav``
      into ``args.outdir``;
    * writes the converted feature matrix to
      ``<grandparent dir>/<spk_src>-<args.spk_trg>/<basename>`` under the HDF5
      path ``args.string_path``.

    Besides ``config`` and ``args`` the body reads these module-level names:
    ``n_spk``, ``spk_list``, ``trg_idx``, ``src_f0_mean``, ``src_f0_std``,
    ``trg_f0_mean``, ``trg_f0_std``, ``gv_mean_trg``, ``cvgv_mean`` and ``IRLEN``.

    Args:
        feat_list: Iterable of source feature file paths. The name of each
            file's parent directory is looked up in ``spk_list``.
        gpu: CUDA device passed to ``torch.cuda.device``.
        cvlist: Receives the per-file variance of the converted mcep
            (coefficient 0 excluded).
        mcd_cvlist_src, mcdstd_cvlist_src: Receive mean / std of the
            source-vs-reconstruction distortion, coefficient 0 excluded.
        mcdpow_cvlist_src, mcdpowstd_cvlist_src: Same, with all coefficients.
        mcd_cvlist_cyc, mcdstd_cvlist_cyc, mcdpow_cvlist_cyc, mcdpowstd_cvlist_cyc:
            Same statistics for source vs. cyclic reconstruction.
        mcd_cvlist, mcdstd_cvlist, mcdpow_cvlist, mcdpowstd_cvlist: Same
            statistics for converted vs. target; only appended to when the
            parallel target file exists.
        lat_dist_rmse_list, lat_dist_cosim_list: Receive the two symmetric
            latent distances; only appended to when the target file exists.

    The ``*_src``, ``*_cyc`` lists and ``cvlist`` are appended to
    unconditionally, so they must be supplied despite defaulting to ``None``.
    The remaining lists are only needed when a parallel target file exists.
    # NOTE(review): the lists are presumably shared across worker processes
    # (one call per GPU) -- confirm against the caller.
    # NOTE(review): a second function named ``decode_RNN`` with a different
    # signature appears later in this source; if both live in one module the
    # later definition shadows this one -- confirm.
    """
    with torch.cuda.device(gpu):
        # define model and load parameters
        with torch.no_grad():
            model_encoder = GRU_VAE_ENCODER(
                in_dim=config.mcep_dim+config.excit_dim,
                n_spk=n_spk,
                lat_dim=config.lat_dim,
                hidden_layers=config.hidden_layers_enc,
                hidden_units=config.hidden_units_enc,
                kernel_size=config.kernel_size_enc,
                dilation_size=config.dilation_size_enc,
                causal_conv=config.causal_conv_enc,
                bi=False,
                ar=False,
                pad_first=True,
                right_size=config.right_size_enc)
            logging.info(model_encoder)
            model_decoder = GRU_SPEC_DECODER(
                feat_dim=config.lat_dim,
                out_dim=config.mcep_dim,
                n_spk=n_spk,
                hidden_layers=config.hidden_layers_dec,
                hidden_units=config.hidden_units_dec,
                kernel_size=config.kernel_size_dec,
                dilation_size=config.dilation_size_dec,
                causal_conv=config.causal_conv_dec,
                bi=False,
                ar=False,
                pad_first=True,
                right_size=config.right_size_dec)
            logging.info(model_decoder)
            model_post = GRU_POST_NET(
                spec_dim=config.mcep_dim,
                excit_dim=2,
                n_spk=n_spk,
                hidden_layers=config.hidden_layers_post,
                hidden_units=config.hidden_units_post,
                kernel_size=config.kernel_size_post,
                dilation_size=config.dilation_size_post,
                causal_conv=config.causal_conv_post,
                pad_first=True,
                right_size=config.right_size_post)
            logging.info(model_post)

            # Deserialize the checkpoint once instead of once per sub-model.
            checkpoint = torch.load(args.model)
            model_encoder.load_state_dict(checkpoint["model_encoder"])
            model_decoder.load_state_dict(checkpoint["model_decoder"])
            model_post.load_state_dict(checkpoint["model_post"])
            del checkpoint

            models = (model_encoder, model_decoder, model_post)
            for model in models:
                model.remove_weight_norm()
            for model in models:
                model.cuda()
            for model in models:
                model.eval()
            for model in models:
                for param in model.parameters():
                    param.requires_grad = False

        # The input is padded for two full passes through the three networks
        # (forward pass + cyclic pass).
        pad_left = (model_encoder.pad_left + model_decoder.pad_left + model_post.pad_left)*2
        pad_right = (model_encoder.pad_right + model_decoder.pad_right + model_post.pad_right)*2

        # outpad_*[k] is the padding still left after the k-th network in the
        # chain encoder -> decoder -> post-net -> encoder -> decoder.
        outpad_lefts = []
        outpad_rights = []
        remain_left, remain_right = pad_left, pad_right
        for model in (model_encoder, model_decoder, model_post, model_encoder, model_decoder):
            remain_left = remain_left - model.pad_left
            remain_right = remain_right - model.pad_right
            outpad_lefts.append(remain_left)
            outpad_rights.append(remain_right)
        logging.info(f'{pad_left} {pad_right}')
        logging.info(outpad_lefts)
        logging.info(outpad_rights)

        for feat_file in feat_list:
            # convert mcep
            spk_src = os.path.basename(os.path.dirname(feat_file))
            src_idx = spk_list.index(spk_src)
            logging.info('%s --> %s' % (spk_src, args.spk_trg))

            # A parallel target utterance, if present, enables the
            # converted-vs-target and latent-distance measurements.
            file_trg = os.path.join(os.path.dirname(os.path.dirname(feat_file)), args.spk_trg,
                                    os.path.basename(feat_file))
            trg_exist = False
            if os.path.exists(file_trg):
                logging.info('exist: %s' % (file_trg))
                feat_trg = read_hdf5(file_trg, config.string_path)
                mcep_trg = feat_trg[:,-config.mcep_dim:]
                logging.info(mcep_trg.shape)
                trg_exist = True

            feat_org = read_hdf5(feat_file, config.string_path)
            mcep = np.array(feat_org[:,-config.mcep_dim:])
            codeap = np.array(np.rint(feat_org[:,2:3])*(-np.exp(feat_org[:,3:config.excit_dim])))
            sp = np.array(ps.mc2sp(mcep, args.mcep_alpha, args.fftl))
            ap = pw.decode_aperiodicity(codeap, args.fs, args.fftl)
            feat_cvf0_lin = np.expand_dims(
                convert_f0(np.exp(feat_org[:,1]), src_f0_mean, src_f0_std, trg_f0_mean, trg_f0_std),
                axis=-1)
            feat_cv = np.c_[feat_org[:,:1], np.log(feat_cvf0_lin), feat_org[:,2:config.excit_dim]]

            logging.info("generate")
            with torch.no_grad():
                feat = _pad_frames(torch.FloatTensor(feat_org).cuda().unsqueeze(0), pad_left, pad_right)
                feat_excit = torch.FloatTensor(feat_org[:,:config.excit_dim]).cuda().unsqueeze(0)
                feat_excit_cv = torch.FloatTensor(feat_cv).cuda().unsqueeze(0)

                spk_logits, _, lat_src, _ = model_encoder(feat, sampling=False)
                logging.info('input spkpost')
                logging.info(torch.mean(F.softmax(
                    _crop_frames(spk_logits, outpad_lefts[0], outpad_rights[0]), dim=-1), 1))

                if trg_exist:
                    spk_trg_logits, _, lat_trg, _ = model_encoder(
                        _pad_frames(torch.FloatTensor(feat_trg).cuda().unsqueeze(0),
                                    model_encoder.pad_left, model_encoder.pad_right),
                        sampling=False)
                    logging.info('target spkpost')
                    logging.info(torch.mean(F.softmax(spk_trg_logits, dim=-1), 1))

                # reconstruction with the source speaker index
                cvmcep_src, _ = model_decoder(_spk_index_seq(src_idx, lat_src.shape[1]), lat_src)
                cvmcep_src_post, _ = model_post(
                    cvmcep_src,
                    y=_spk_index_seq(src_idx, cvmcep_src.shape[1]),
                    e=_pad_frames(feat_excit[:,:,:2], outpad_lefts[1], outpad_rights[1]))
                spk_logits, _, _, _ = model_encoder(
                    torch.cat((_pad_frames(feat_excit, outpad_lefts[2], outpad_rights[2]),
                               _crop_frames(cvmcep_src, model_post.pad_left, model_post.pad_right)), 2),
                    sampling=False)
                logging.info('rec spkpost')
                logging.info(torch.mean(F.softmax(
                    _crop_frames(spk_logits, outpad_lefts[3], outpad_rights[3]), dim=-1), 1))

                # conversion with the target speaker index
                cvmcep, _ = model_decoder(_spk_index_seq(trg_idx, lat_src.shape[1]), lat_src)
                cvmcep_post, _ = model_post(
                    cvmcep,
                    y=_spk_index_seq(trg_idx, cvmcep.shape[1]),
                    e=_pad_frames(feat_excit_cv[:,:,:2], outpad_lefts[1], outpad_rights[1]))
                spk_logits, _, lat_cv, _ = model_encoder(
                    torch.cat((_pad_frames(feat_excit_cv, outpad_lefts[2], outpad_rights[2]),
                               _crop_frames(cvmcep, model_post.pad_left, model_post.pad_right)), 2),
                    sampling=False)
                logging.info('cv spkpost')
                logging.info(torch.mean(F.softmax(
                    _crop_frames(spk_logits, outpad_lefts[3], outpad_rights[3]), dim=-1), 1))

                # cyclic reconstruction: decode the re-encoded conversion as the source speaker
                cvmcep_cyc, _ = model_decoder(_spk_index_seq(src_idx, lat_cv.shape[1]), lat_cv)
                cvmcep_cyc_post, _ = model_post(
                    cvmcep_cyc,
                    y=_spk_index_seq(src_idx, cvmcep_cyc.shape[1]),
                    e=_pad_frames(feat_excit[:,:,:2], outpad_lefts[4], outpad_rights[4]))

                # Strip the padding left over after the first post-net pass;
                # the cyclic output is used as returned.
                cvmcep_src = np.array(
                    _crop_frames(cvmcep_src_post, outpad_lefts[2], outpad_rights[2])[0].cpu().data.numpy(),
                    dtype=np.float64)
                cvmcep = np.array(
                    _crop_frames(cvmcep_post, outpad_lefts[2], outpad_rights[2])[0].cpu().data.numpy(),
                    dtype=np.float64)
                cvmcep_cyc = np.array(cvmcep_cyc_post[0].cpu().data.numpy(), dtype=np.float64)

                if trg_exist:
                    lat_src = _crop_frames(lat_src, outpad_lefts[0], outpad_rights[0])

            logging.info(cvmcep_src.shape)
            logging.info(cvmcep.shape)
            logging.info(cvmcep_cyc.shape)
            if trg_exist:
                logging.info(lat_src.shape)
                logging.info(lat_trg.shape)

            cvlist.append(np.var(cvmcep[:,1:], axis=0))

            logging.info("cvf0lin")
            f0_range = read_hdf5(feat_file, "/f0_range")
            cvf0_range_lin = convert_f0(f0_range, src_f0_mean, src_f0_std, trg_f0_mean, trg_f0_std)
            uv_range_lin, cont_f0_range_lin = convert_continuos_f0(np.array(cvf0_range_lin))
            unique, counts = np.unique(uv_range_lin, return_counts=True)
            logging.info(dict(zip(unique, counts)))
            cont_f0_lpf_range_lin = low_pass_filter(
                cont_f0_range_lin, int(1.0 / (args.shiftms * 0.001)), cutoff=20)
            uv_range_lin = np.expand_dims(uv_range_lin, axis=-1)
            cont_f0_lpf_range_lin = np.expand_dims(cont_f0_lpf_range_lin, axis=-1)
            # plain converted feat for neural vocoder
            feat_cv = np.c_[uv_range_lin, np.log(cont_f0_lpf_range_lin),
                            feat_cv[:,2:config.excit_dim], cvmcep]
            logging.info(feat_cv.shape)

            logging.info("mcd acc")
            spcidx = np.array(read_hdf5(feat_file, "/spcidx_range")[0])
            _, mcdpow_arr = dtw.calc_mcd(np.array(mcep[spcidx], dtype=np.float64),
                                         np.array(cvmcep_src[spcidx], dtype=np.float64))
            _, mcd_arr = dtw.calc_mcd(np.array(mcep[spcidx,1:], dtype=np.float64),
                                      np.array(cvmcep_src[spcidx,1:], dtype=np.float64))
            mcdpow_mean = np.mean(mcdpow_arr)
            mcdpow_std = np.std(mcdpow_arr)
            mcd_mean = np.mean(mcd_arr)
            mcd_std = np.std(mcd_arr)
            logging.info("mcdpow_src_cv: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std))
            logging.info("mcd_src_cv: %.6f dB +- %.6f" % (mcd_mean, mcd_std))
            mcdpow_cvlist_src.append(mcdpow_mean)
            mcdpowstd_cvlist_src.append(mcdpow_std)
            mcd_cvlist_src.append(mcd_mean)
            mcdstd_cvlist_src.append(mcd_std)

            if trg_exist:
                spcidx_trg = np.array(read_hdf5(file_trg, "/spcidx_range")[0])
                _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(
                    np.array(cvmcep[spcidx], dtype=np.float64),
                    np.array(mcep_trg[spcidx_trg], dtype=np.float64))
                _, _, _, mcd_arr = dtw.dtw_org_to_trg(
                    np.array(cvmcep[spcidx,1:], dtype=np.float64),
                    np.array(mcep_trg[spcidx_trg,1:], dtype=np.float64))
                mcdpow_mean = np.mean(mcdpow_arr)
                mcdpow_std = np.std(mcdpow_arr)
                mcd_mean = np.mean(mcd_arr)
                mcd_std = np.std(mcd_arr)
                logging.info("mcdpow_trg: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std))
                logging.info("mcd_trg: %.6f dB +- %.6f" % (mcd_mean, mcd_std))
                mcdpow_cvlist.append(mcdpow_mean)
                mcdpowstd_cvlist.append(mcdpow_std)
                mcd_cvlist.append(mcd_mean)
                mcdstd_cvlist.append(mcd_std)

                # latent-space distance between source and target, aligned in both directions
                spcidx_src = torch.LongTensor(spcidx).cuda()
                spcidx_trg = torch.LongTensor(spcidx_trg).cuda()
                trj_lat_src = np.array(
                    torch.index_select(lat_src[0],0,spcidx_src).cpu().data.numpy(), dtype=np.float64)
                trj_lat_trg = np.array(
                    torch.index_select(lat_trg[0],0,spcidx_trg).cpu().data.numpy(), dtype=np.float64)
                aligned_lat_srctrg, _, _, _ = dtw.dtw_org_to_trg(trj_lat_src, trj_lat_trg)
                lat_dist_srctrg = np.mean(np.sqrt(np.mean((aligned_lat_srctrg-trj_lat_trg)**2, axis=0)))
                _, _, lat_cdist_srctrg, _ = dtw.dtw_org_to_trg(trj_lat_trg, trj_lat_src, mcd=0)
                aligned_lat_trgsrc, _, _, _ = dtw.dtw_org_to_trg(trj_lat_trg, trj_lat_src)
                lat_dist_trgsrc = np.mean(np.sqrt(np.mean((aligned_lat_trgsrc-trj_lat_src)**2, axis=0)))
                _, _, lat_cdist_trgsrc, _ = dtw.dtw_org_to_trg(trj_lat_src, trj_lat_trg, mcd=0)
                logging.info("%lf %lf %lf %lf" % (lat_dist_srctrg, lat_cdist_srctrg,
                                                  lat_dist_trgsrc, lat_cdist_trgsrc))
                lat_dist_rmse = (lat_dist_srctrg+lat_dist_trgsrc)/2
                lat_dist_cosim = (lat_cdist_srctrg+lat_cdist_trgsrc)/2
                lat_dist_rmse_list.append(lat_dist_rmse)
                lat_dist_cosim_list.append(lat_dist_cosim)
                logging.info("lat_dist: %.6f %.6f" % (lat_dist_rmse, lat_dist_cosim))

            # source vs. cyclic reconstruction (no target file required)
            _, mcdpow_arr = dtw.calc_mcd(np.array(mcep[spcidx], dtype=np.float64),
                                         np.array(cvmcep_cyc[spcidx], dtype=np.float64))
            _, mcd_arr = dtw.calc_mcd(np.array(mcep[spcidx,1:], dtype=np.float64),
                                      np.array(cvmcep_cyc[spcidx,1:], dtype=np.float64))
            mcdpow_mean = np.mean(mcdpow_arr)
            mcdpow_std = np.std(mcdpow_arr)
            mcd_mean = np.mean(mcd_arr)
            mcd_std = np.std(mcd_arr)
            logging.info("mcdpow_cyc_cv: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std))
            logging.info("mcd_cyc_cv: %.6f dB +- %.6f" % (mcd_mean, mcd_std))
            mcdpow_cvlist_cyc.append(mcdpow_mean)
            mcdpowstd_cvlist_cyc.append(mcdpow_std)
            mcd_cvlist_cyc.append(mcd_mean)
            mcdstd_cvlist_cyc.append(mcd_std)

            logging.info("synth anasyn")
            wav = np.clip(pw.synthesize(f0_range, sp, ap, args.fs, frame_period=args.shiftms), -1, 1)
            wavpath = os.path.join(args.outdir,os.path.basename(feat_file).replace(".h5","_anasyn.wav"))
            sf.write(wavpath, wav, args.fs, 'PCM_16')
            logging.info(wavpath)

            logging.info("synth voco rec")
            cvsp_src = ps.mc2sp(cvmcep_src, args.mcep_alpha, args.fftl)
            logging.info(cvsp_src.shape)
            wav = np.clip(pw.synthesize(f0_range, cvsp_src, ap, args.fs, frame_period=args.shiftms), -1, 1)
            wavpath = os.path.join(args.outdir, os.path.basename(feat_file).replace(".h5", "_rec.wav"))
            sf.write(wavpath, wav, args.fs, 'PCM_16')
            logging.info(wavpath)

            logging.info("synth voco cv")
            cvsp = ps.mc2sp(cvmcep, args.mcep_alpha, args.fftl)
            logging.info(cvsp.shape)
            wav = np.clip(pw.synthesize(cvf0_range_lin, cvsp, ap, args.fs, frame_period=args.shiftms), -1, 1)
            wavpath = os.path.join(args.outdir, os.path.basename(feat_file).replace(".h5", "_cv.wav"))
            sf.write(wavpath, wav, args.fs, 'PCM_16')
            logging.info(wavpath)

            logging.info("synth voco cv GV")
            # Blend the converted mcep with a variance-rescaled copy, weighted by args.gv_coeff.
            datamean = np.mean(cvmcep[:,1:], axis=0)
            cvmcep_gv = np.c_[cvmcep[:,0],
                              args.gv_coeff*(np.sqrt(gv_mean_trg/cvgv_mean) *
                                             (cvmcep[:,1:]-datamean) + datamean)
                              + (1-args.gv_coeff)*cvmcep[:,1:]]
            cvmcep_gv = mod_pow(cvmcep_gv, cvmcep, alpha=args.mcep_alpha, irlen=IRLEN)
            cvsp_gv = ps.mc2sp(cvmcep_gv, args.mcep_alpha, args.fftl)
            logging.info(cvsp_gv.shape)
            wav = np.clip(pw.synthesize(cvf0_range_lin, cvsp_gv, ap, args.fs, frame_period=args.shiftms), -1, 1)
            wavpath = os.path.join(args.outdir, os.path.basename(feat_file).replace(".h5", "_cvGV.wav"))
            sf.write(wavpath, wav, args.fs, 'PCM_16')
            logging.info(wavpath)

            logging.info('write to h5')
            outh5dir = os.path.join(os.path.dirname(os.path.dirname(feat_file)), spk_src+"-"+args.spk_trg)
            # exist_ok avoids the check-then-create race if several workers
            # reach the same directory at once.
            os.makedirs(outh5dir, exist_ok=True)
            out_file = os.path.join(outh5dir, os.path.basename(feat_file))
            # cv
            write_path = args.string_path
            logging.info(out_file + ' ' + write_path)
            logging.info(feat_cv.shape)
            write_hdf5(out_file, write_path, feat_cv)
def decode_RNN(wav_list, gpu, cvlist=None, cvlist_src=None,
               mcd_cvlist_src=None, mcdstd_cvlist_src=None, mcdpow_cvlist_src=None,
               mcdpowstd_cvlist_src=None):
    """Analyze, reconstruct and convert every wav file in ``wav_list`` on one GPU.

    The encoder and decoder are built from the module-level ``config`` and
    restored from the checkpoint at ``args.model``. For each wav file the
    function extracts features, decodes the latent sequence once with the
    source speaker code and once with the target speaker code, then

    * writes the converted feature matrix to
      ``args.h5outdir/<basename>.h5`` under ``'/feat_cvmcep_cycvae-' + model_epoch``;
    * logs source-vs-reconstruction distortion and appends it to the lists;
    * writes ``*_cv.wav`` and ``*_anasyn.wav`` into ``args.outdir``.

    Besides ``config`` and ``args`` the body reads these module-level names:
    ``n_spk``, ``src_code_idx``, ``trg_code_idx``, ``model_epoch``,
    ``LP_CUTOFF`` and the ``f0_range_{mean,std}_{src,trg}`` statistics.

    Args:
        wav_list: Iterable of wav file paths.
        gpu: CUDA device passed to ``torch.cuda.device``.
        cvlist: Receives the per-file variance of the converted mcep
            (coefficient 0 excluded).
        cvlist_src: Receives the same variance for the reconstructed mcep.
        mcd_cvlist_src, mcdstd_cvlist_src: Receive mean / std of the
            source-vs-reconstruction distortion, coefficient 0 excluded.
        mcdpow_cvlist_src, mcdpowstd_cvlist_src: Same, with all coefficients.

    All list arguments are appended to unconditionally, so they must be
    supplied despite defaulting to ``None``.
    # NOTE(review): another function named ``decode_RNN`` with a different
    # signature appears earlier in this source; if both live in one module,
    # this definition shadows the earlier one -- confirm.
    """
    with torch.cuda.device(gpu):
        mean_trg = torch.FloatTensor(
            read_hdf5(args.stats_jnt, "/mean_feat_org_lf0")[config.stdim:]).cuda()
        std_trg = torch.FloatTensor(
            read_hdf5(args.stats_jnt, "/scale_feat_org_lf0")[config.stdim:]).cuda()

        # define model and load parameters
        logging.info(config)
        logging.info("model")
        with torch.no_grad():
            model_encoder = GRU_RNN_STOCHASTIC(
                in_dim=config.in_dim,
                out_dim=config.lat_dim,
                hidden_layers=config.hidden_layers,
                hidden_units=config.hidden_units,
                kernel_size=config.kernel_size_enc,
                dilation_size=config.dilation_size_enc,
                arparam=config.arparam,
                spk_dim=n_spk,
                causal_conv=config.causal_conv,
                scale_out_flag=False)
            model_decoder = GRU_RNN(
                in_dim=config.lat_dim + n_spk,
                out_dim=config.out_dim,
                hidden_layers=config.hidden_layers,
                hidden_units=config.hidden_units,
                kernel_size=config.kernel_size_dec,
                dilation_size=config.dilation_size_dec,
                causal_conv=config.causal_conv,
                scale_in_flag=False)
            logging.info(model_encoder)
            logging.info(model_decoder)

            # Deserialize the checkpoint once instead of once per sub-model.
            checkpoint = torch.load(args.model)
            model_encoder.load_state_dict(checkpoint["model_encoder"])
            model_decoder.load_state_dict(checkpoint["model_decoder"])
            del checkpoint

            model_encoder.cuda()
            model_decoder.cuda()
            model_encoder.eval()
            model_decoder.eval()
            for param in model_encoder.parameters():
                param.requires_grad = False
            for param in model_decoder.parameters():
                param.requires_grad = False

            # Initial encoder input; its width depends on config.arparam.
            if config.arparam:
                init_pp = np.zeros((1, 1, config.lat_dim * 2 + n_spk))
            else:
                init_pp = np.zeros((1, 1, config.lat_dim + n_spk))
            y_in_pp = torch.FloatTensor(init_pp).cuda()
            # Initial decoder input: an all-zero vector after mean/scale normalization.
            y_in_src = y_in_trg = torch.unsqueeze(
                torch.unsqueeze((0 - mean_trg) / std_trg, 0), 0)

        fft_size = args.fftl
        mcep_dim = model_decoder.out_dim - 1

        for wav_file in wav_list:
            # convert mcep
            feat_file = os.path.join(
                args.h5outdir, os.path.basename(wav_file).replace(".wav", ".h5"))
            logging.info("cvmcep " + feat_file + " " + wav_file)

            # The sampling rate returned by read_wav is used for everything below.
            fs, x = read_wav(wav_file, cutoff=70)
            _, f0, sp, ap = analyze_range(x, fs=fs, minf0=args.minf0, maxf0=args.maxf0,
                                          fperiod=args.shiftms, fftl=args.fftl)
            logging.info(sp.shape)
            mcep = ps.sp2mc(sp, mcep_dim, args.mcep_alpha)
            logging.info(mcep.shape)
            codeap = pw.code_aperiodicity(ap, fs)
            logging.info(codeap.shape)
            npow = spc2npow(sp)
            logging.info(npow.shape)
            _, spcidx = extfrm(mcep, npow, power_threshold=args.pow)
            spcidx = spcidx[0]
            logging.info(spcidx.shape)
            uv, contf0 = convert_continuos_f0(np.array(f0))
            uv = np.expand_dims(uv, axis=-1)
            logging.info(uv.shape)
            cont_f0_lpf = low_pass_filter(contf0, int(1.0 / (args.shiftms * 0.001)), cutoff=LP_CUTOFF)
            logcontf0 = np.expand_dims(np.log(cont_f0_lpf), axis=-1)
            logging.info(logcontf0.shape)
            feat = np.c_[uv, logcontf0, codeap, mcep]
            logging.info(feat.shape)

            logging.info("generate")
            with torch.no_grad():
                lat_feat_src, _, _, _, _ = model_encoder(
                    torch.FloatTensor(feat).cuda(), y_in_pp, sampling=False)

                # one-hot speaker codes, one row per latent frame
                src_code = np.zeros((lat_feat_src.shape[0], n_spk))
                src_code[:, src_code_idx] = 1
                src_code = torch.FloatTensor(src_code).cuda()
                trg_code = np.zeros((lat_feat_src.shape[0], n_spk))
                trg_code[:, trg_code_idx] = 1
                trg_code = torch.FloatTensor(trg_code).cuda()

                cvmcep_src, _, _ = model_decoder(
                    torch.cat((src_code, lat_feat_src), 1), y_in_src)
                cvmcep_src = np.array(cvmcep_src.cpu().data.numpy(), dtype=np.float64)
                cvmcep, _, _ = model_decoder(
                    torch.cat((trg_code, lat_feat_src), 1), y_in_trg)
                cvmcep = np.array(cvmcep.cpu().data.numpy(), dtype=np.float64)

            logging.info(lat_feat_src.shape)
            logging.info(cvmcep_src.shape)
            logging.info(cvmcep.shape)

            cvf0 = convert_f0(f0, f0_range_mean_src, f0_range_std_src,
                              f0_range_mean_trg, f0_range_std_trg)
            uv_cv, contf0_cv = convert_continuos_f0(np.array(cvf0))
            uv_cv = np.expand_dims(uv_cv, axis=-1)
            logging.info(uv_cv.shape)
            cont_f0_lpf_cv = low_pass_filter(contf0_cv, int(1.0 / (args.shiftms * 0.001)), cutoff=LP_CUTOFF)
            logcontf0_cv = np.expand_dims(np.log(cont_f0_lpf_cv), axis=-1)
            logging.info(logcontf0_cv.shape)
            feat_cv = np.c_[uv_cv, logcontf0_cv, codeap]
            logging.info(feat_cv.shape)
            feat_cvmcep = np.c_[feat_cv, cvmcep]
            logging.info(feat_cvmcep.shape)
            write_path = '/feat_cvmcep_cycvae-' + model_epoch
            logging.info(feat_file + ' ' + write_path)
            write_hdf5(feat_file, write_path, feat_cvmcep)
            cvlist.append(np.var(cvmcep[:, 1:], axis=0))

            # Distortion between analyzed and reconstructed mcep on the frames
            # selected by extfrm; built once instead of per indexing expression.
            frame_idx = np.array(spcidx)
            _, mcdpow_arr = dtw.calc_mcd(np.array(mcep[frame_idx,:], dtype=np.float64),
                                         np.array(cvmcep_src[frame_idx,:], dtype=np.float64))
            _, mcd_arr = dtw.calc_mcd(np.array(mcep[frame_idx,1:], dtype=np.float64),
                                      np.array(cvmcep_src[frame_idx,1:], dtype=np.float64))
            mcdpow_mean = np.mean(mcdpow_arr)
            mcdpow_std = np.std(mcdpow_arr)
            mcd_mean = np.mean(mcd_arr)
            mcd_std = np.std(mcd_arr)
            logging.info("mcdpow_src_cv: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std))
            logging.info("mcd_src_cv: %.6f dB +- %.6f" % (mcd_mean, mcd_std))
            mcdpow_cvlist_src.append(mcdpow_mean)
            mcdpowstd_cvlist_src.append(mcdpow_std)
            mcd_cvlist_src.append(mcd_mean)
            mcdstd_cvlist_src.append(mcd_std)
            cvlist_src.append(np.var(cvmcep_src[:, 1:], axis=0))

            logging.info("synth voco")
            cvsp = ps.mc2sp(np.ascontiguousarray(cvmcep), args.mcep_alpha, fft_size)
            logging.info(cvsp.shape)
            wav = np.clip(
                pw.synthesize(cvf0, cvsp, ap, fs, frame_period=args.shiftms), -1, 1)
            wavpath = os.path.join(
                args.outdir, os.path.basename(wav_file).replace(".wav", "_cv.wav"))
            sf.write(wavpath, wav, fs, 'PCM_16')
            logging.info(wavpath)

            logging.info("synth anasyn")
            wav = np.clip(
                pw.synthesize(f0, sp, ap, fs, frame_period=args.shiftms), -1, 1)
            wavpath = os.path.join(
                args.outdir, os.path.basename(wav_file).replace(".wav", "_anasyn.wav"))
            sf.write(wavpath, wav, fs, 'PCM_16')
            logging.info(wavpath)
def _fill_nonpositive(values):
    """Replace non-positive entries of a 1-D sequence in place.

    Entries are visited left to right. Each non-positive entry becomes the
    average of its nearest positive neighbours on the left and on the right;
    when a positive neighbour exists on one side only, that neighbour's value
    is used. Earlier repairs are visible to later entries because the sequence
    is modified in place.

    Raises:
        ValueError: If the sequence has a non-positive entry but no positive
            entry to repair it from.
    """
    length = len(values)
    for i in range(length):
        if not values[i] <= 0:
            continue
        left_val = next(
            (values[j] for j in range(i - 1, -1, -1) if values[j] > 0), None)
        right_val = next(
            (values[j] for j in range(i + 1, length) if values[j] > 0), None)
        if left_val is None and right_val is None:
            raise ValueError("cannot repair F0 contour: no positive value available")
        if left_val is None:
            values[i] = right_val
        elif right_val is None:
            values[i] = left_val
        else:
            values[i] = (left_val + right_val) / 2


def gpu_decode(feat_list, gpu, cvlist=None, mcdpow_cvlist=None, mcdpowstd_cvlist=None,
               mcd_cvlist=None, mcdstd_cvlist=None, cvlist_dv=None, mcdpow_cvlist_dv=None,
               mcdpowstd_cvlist_dv=None, mcd_cvlist_dv=None, mcdstd_cvlist_dv=None):
    """Cyclically reconstruct the mcep of every feature file on one GPU.

    The encoder and decoder are built from the module-level ``config`` and
    restored from the checkpoint at ``args.model``. For each file the features
    are encoded and decoded with the source speaker code, the result is
    re-encoded together with the F0-converted excitation features, and decoded
    again with the target speaker code. The result is compared against the
    file's own mcep and written to
    ``<grandparent dir>/<spk_trg>-<spk_src>-<spk_trg>/<basename>`` under
    ``'/feat_recmcep_cycvae-' + model_epoch``.

    Besides ``config`` and ``args`` the body reads these module-level names:
    ``n_spk``, ``src_code_idx``, ``trg_code_idx``, ``spk_src``, ``spk_trg``,
    ``model_epoch``, ``LP_CUTOFF`` and the ``f0_range_{mean,std}_{src,trg}``
    statistics.

    Args:
        feat_list: Iterable of feature file paths. The second ``'/'``-separated
            path component, up to its first ``'_'``, selects the result lists:
            a prefix containing ``'tr'`` uses the plain lists, one containing
            ``'dv'`` uses the ``*_dv`` lists, anything else is only logged.
        gpu: CUDA device passed to ``torch.cuda.device``.
        cvlist, cvlist_dv: Receive the per-file variance of the reconstructed
            mcep (coefficient 0 excluded).
        mcd_cvlist, mcdstd_cvlist, mcd_cvlist_dv, mcdstd_cvlist_dv: Receive
            mean / std of the distortion, coefficient 0 excluded.
        mcdpow_cvlist, mcdpowstd_cvlist, mcdpow_cvlist_dv, mcdpowstd_cvlist_dv:
            Same, with all coefficients.

    Raises:
        ValueError: If the filtered F0 contour of a file contains a
            non-positive value but no positive value to repair it from.
    # NOTE(review): the output directory name suggests the inputs are
    # target-speaker files -- confirm against the caller.
    """
    with torch.cuda.device(gpu):
        mean_jnt = torch.FloatTensor(
            read_hdf5(args.stats_jnt, "/mean_feat_org_lf0")[config.stdim:]).cuda()
        std_jnt = torch.FloatTensor(
            read_hdf5(args.stats_jnt, "/scale_feat_org_lf0")[config.stdim:]).cuda()

        # define model and load parameters
        logging.info("model")
        logging.info(config)
        with torch.no_grad():
            model_encoder = GRU_RNN_STOCHASTIC(
                in_dim=config.in_dim,
                out_dim=config.lat_dim,
                hidden_layers=config.hidden_layers,
                hidden_units=config.hidden_units,
                kernel_size=config.kernel_size,
                dilation_size=config.dilation_size,
                spk_dim=n_spk,
                scale_out_flag=False)
            model_decoder = GRU_RNN(
                in_dim=config.lat_dim + n_spk,
                out_dim=config.out_dim,
                hidden_layers=config.hidden_layers,
                hidden_units=config.hidden_units,
                kernel_size=config.kernel_size,
                dilation_size=config.dilation_size,
                scale_in_flag=False)
            logging.info(model_encoder)
            logging.info(model_decoder)

            # Deserialize the checkpoint once instead of once per sub-model.
            checkpoint = torch.load(args.model)
            model_encoder.load_state_dict(checkpoint["model_encoder"])
            model_decoder.load_state_dict(checkpoint["model_decoder"])
            del checkpoint

            model_encoder.cuda()
            model_decoder.cuda()
            model_encoder.eval()
            model_decoder.eval()
            for param in model_encoder.parameters():
                param.requires_grad = False
            for param in model_decoder.parameters():
                param.requires_grad = False

            init_pp = np.zeros((1, 1, config.lat_dim * 2 + n_spk))
            y_in_pp = torch.FloatTensor(init_pp).cuda()
            # Initial decoder input: an all-zero vector after mean/scale normalization.
            y_in_src = y_in_trg = torch.unsqueeze(
                torch.unsqueeze((0 - mean_jnt) / std_jnt, 0), 0)

        for feat_file in feat_list:
            # convert mcep
            logging.info("recmcep " + feat_file)
            feat = read_hdf5(feat_file, "/feat_org_lf0")
            logging.info(feat.shape)
            f0 = read_hdf5(feat_file, "/f0_range")
            # F0 statistics are passed in trg -> src order here.
            cvf0 = convert_f0(f0, f0_range_mean_trg, f0_range_std_trg,
                              f0_range_mean_src, f0_range_std_src)
            cvuv, cont_f0 = convert_continuos_f0(cvf0)
            cvuv = np.expand_dims(cvuv, axis=-1)
            cont_f0_lpf = low_pass_filter(cont_f0, int(1.0 / (args.shiftms * 0.001)), cutoff=LP_CUTOFF)
            # The logarithm below needs strictly positive values, so repair
            # any non-positive samples of the filtered contour first.
            if np.min(cont_f0_lpf) <= 0:
                _fill_nonpositive(cont_f0_lpf)
            cvlogf0fil = np.expand_dims(np.log(cont_f0_lpf), axis=-1)
            feat_cv = np.c_[cvuv, cvlogf0fil, feat[:, 2:config.stdim]]

            with torch.no_grad():
                # first pass: encode the original features, decode with the source code
                lat_feat, _, _, _, _ = model_encoder(
                    torch.FloatTensor(feat).cuda(), y_in_pp, sampling=False)
                src_code = np.zeros((lat_feat.shape[0], n_spk))
                src_code[:, src_code_idx] = 1
                src_code = torch.FloatTensor(src_code).cuda()
                cvmcep, _, _ = model_decoder(
                    torch.cat((src_code, lat_feat), 1), y_in_src)

                # second pass: re-encode the converted features, decode with the target code
                lat_feat, _, _, _, _ = model_encoder(
                    torch.cat((torch.FloatTensor(feat_cv).cuda(), cvmcep),1),
                    y_in_pp, sampling=False)
                trg_code = np.zeros((lat_feat.shape[0], n_spk))
                trg_code[:, trg_code_idx] = 1
                trg_code = torch.FloatTensor(trg_code).cuda()
                cvmcep, _, _ = model_decoder(
                    torch.cat((trg_code, lat_feat), 1), y_in_trg)

            cvmcep = np.array(cvmcep.cpu().data.numpy(), dtype=np.float64)
            logging.info(cvmcep.shape)

            mcep = feat[:, config.stdim:]
            spcidx = read_hdf5(feat_file, "/spcidx_range")[0]
            frame_idx = np.array(spcidx)
            _, mcdpow_arr = dtw.calc_mcd(np.array(mcep[frame_idx,:], dtype=np.float64),
                                         np.array(cvmcep[frame_idx,:], dtype=np.float64))
            _, mcd_arr = dtw.calc_mcd(np.array(mcep[frame_idx,1:], dtype=np.float64),
                                      np.array(cvmcep[frame_idx,1:], dtype=np.float64))
            mcdpow_mean = np.mean(mcdpow_arr)
            mcdpow_std = np.std(mcdpow_arr)
            mcd_mean = np.mean(mcd_arr)
            mcd_std = np.std(mcd_arr)

            # Route the statistics by the data-set prefix in the file path.
            dataset = feat_file.split('/')[1].split('_')[0]
            if 'tr' in dataset:
                logging.info('trn')
                mcdpow_cvlist.append(mcdpow_mean)
                mcdpowstd_cvlist.append(mcdpow_std)
                mcd_cvlist.append(mcd_mean)
                mcdstd_cvlist.append(mcd_std)
                cvlist.append(np.var(cvmcep[:, 1:], axis=0))
                logging.info(len(cvlist))
            elif 'dv' in dataset:
                logging.info('dev')
                mcdpow_cvlist_dv.append(mcdpow_mean)
                mcdpowstd_cvlist_dv.append(mcdpow_std)
                mcd_cvlist_dv.append(mcd_mean)
                mcdstd_cvlist_dv.append(mcd_std)
                cvlist_dv.append(np.var(cvmcep[:, 1:], axis=0))
                logging.info(len(cvlist_dv))
            logging.info("mcdpow_rec: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std))
            logging.info("mcd_rec: %.6f dB +- %.6f" % (mcd_mean, mcd_std))

            feat_cvmcep = np.c_[feat[:, :config.stdim], cvmcep]
            logging.info(feat_cvmcep.shape)
            write_path = '/feat_recmcep_cycvae-' + model_epoch
            outh5dir = os.path.join(
                os.path.dirname(os.path.dirname(feat_file)),
                spk_trg + "-" + spk_src + "-" + spk_trg)
            # exist_ok avoids the check-then-create race if several workers
            # reach the same directory at once.
            os.makedirs(outh5dir, exist_ok=True)
            feat_file_cyc = os.path.join(outh5dir, os.path.basename(feat_file))
            logging.info(feat_file_cyc + ' ' + write_path)
            write_hdf5(feat_file_cyc, write_path, feat_cvmcep)