def gpu_decode(feat_list, gpu, cvlist=None, mcdpow_cvlist=None, mcdpowstd_cvlist=None, mcd_cvlist=None,
               mcdstd_cvlist=None, cvlist_dv=None, mcdpow_cvlist_dv=None, mcdpowstd_cvlist_dv=None,
               mcd_cvlist_dv=None, mcdstd_cvlist_dv=None,
               f0rmse_cvlist=None, f0corr_cvlist=None, caprmse_cvlist=None,
               f0rmse_cvlist_dv=None, f0corr_cvlist_dv=None, caprmse_cvlist_dv=None,
               cvlist_cyc=None, mcdpow_cvlist_cyc=None, mcdpowstd_cvlist_cyc=None, mcd_cvlist_cyc=None,
               mcdstd_cvlist_cyc=None, cvlist_cyc_dv=None, mcdpow_cvlist_cyc_dv=None, mcdpowstd_cvlist_cyc_dv=None,
               mcd_cvlist_cyc_dv=None, mcdstd_cvlist_cyc_dv=None,
               f0rmse_cvlist_cyc=None, f0corr_cvlist_cyc=None, caprmse_cvlist_cyc=None,
               f0rmse_cvlist_cyc_dv=None, f0corr_cvlist_cyc_dv=None, caprmse_cvlist_cyc_dv=None):
    """Reconstruct mcep+excitation features on one GPU and score them.

    For each utterance in `feat_list`:
      1. encode the input features with separate mcep / excitation VAE encoders,
      2. vector-quantize both latents against the shared `model_vq` codebook,
      3. decode with the source speaker code (reconstruction pass),
      4. re-encode/decode the reconstruction (cyclic pass),
      5. log F0 RMSE/corr, code-aperiodicity RMSE and DTW-based mel-cepstral
         distortion against the original features,
      6. write the reconstructed and cyclic feature matrices to HDF5.

    All `*_cvlist*` parameters are shared accumulator lists (train split for
    the plain names, dev split for the `_dv` names; `_cyc` for the cyclic
    pass); per-utterance statistics are appended in place.

    NOTE(review): relies on module-level globals (`config`, `args`, `n_spk`,
    `spk_idx`, `spk_list`-style metadata, model classes, `read_hdf5`,
    `write_hdf5`, `nn_search_batch`, `dtw`) — not callable in isolation.
    """
    with torch.cuda.device(gpu):
        # define model and load parameters
        with torch.no_grad():
            model_encoder_mcep = GRU_VAE_ENCODER(
                in_dim=config.mcep_dim+config.excit_dim,
                n_spk=n_spk,
                lat_dim=config.lat_dim,
                hidden_layers=config.hidden_layers_enc,
                hidden_units=config.hidden_units_enc,
                kernel_size=config.kernel_size_enc,
                dilation_size=config.dilation_size_enc,
                causal_conv=config.causal_conv_enc,
                bi=config.bi_enc,
                cont=False,
                pad_first=True,
                right_size=config.right_size,
                ar=config.ar_enc)
            logging.info(model_encoder_mcep)
            model_decoder_mcep = GRU_SPEC_DECODER(
                feat_dim=config.lat_dim,
                out_dim=config.mcep_dim,
                n_spk=n_spk,
                hidden_layers=config.hidden_layers_dec,
                hidden_units=config.hidden_units_dec,
                kernel_size=config.kernel_size_dec,
                dilation_size=config.dilation_size_dec,
                causal_conv=config.causal_conv_dec,
                bi=config.bi_dec,
                spkidtr_dim=config.spkidtr_dim,
                pad_first=True,
                ar=config.ar_dec)
            logging.info(model_decoder_mcep)
            model_encoder_excit = GRU_VAE_ENCODER(
                in_dim=config.mcep_dim+config.excit_dim,
                n_spk=n_spk,
                lat_dim=config.lat_dim_e,
                hidden_layers=config.hidden_layers_enc,
                hidden_units=config.hidden_units_enc,
                kernel_size=config.kernel_size_enc,
                dilation_size=config.dilation_size_enc,
                causal_conv=config.causal_conv_enc,
                bi=config.bi_enc,
                cont=False,
                pad_first=True,
                right_size=config.right_size,
                ar=config.ar_enc)
            logging.info(model_encoder_excit)
            model_decoder_excit = GRU_EXCIT_DECODER(
                feat_dim=config.lat_dim_e,
                cap_dim=config.cap_dim,
                n_spk=n_spk,
                hidden_layers=config.hidden_layers_lf0,
                hidden_units=config.hidden_units_lf0,
                kernel_size=config.kernel_size_lf0,
                dilation_size=config.dilation_size_lf0,
                causal_conv=config.causal_conv_lf0,
                bi=config.bi_lf0,
                spkidtr_dim=config.spkidtr_dim,
                pad_first=True,
                ar=config.ar_f0)
            logging.info(model_decoder_excit)
            model_vq = torch.nn.Embedding(config.ctr_size, config.lat_dim)
            logging.info(model_vq)
            # Load the checkpoint once; the previous code re-read the whole
            # file from disk for every sub-model state dict.
            checkpoint = torch.load(args.model)
            model_encoder_mcep.load_state_dict(checkpoint["model_encoder_mcep"])
            model_decoder_mcep.load_state_dict(checkpoint["model_decoder_mcep"])
            model_encoder_excit.load_state_dict(checkpoint["model_encoder_excit"])
            model_decoder_excit.load_state_dict(checkpoint["model_decoder_excit"])
            model_vq.load_state_dict(checkpoint["model_vq"])
            model_encoder_mcep.cuda()
            model_decoder_mcep.cuda()
            model_encoder_excit.cuda()
            model_decoder_excit.cuda()
            model_vq.cuda()
            model_encoder_mcep.eval()
            model_decoder_mcep.eval()
            model_encoder_excit.eval()
            model_decoder_excit.eval()
            model_vq.eval()
            # Freeze every parameter; this function is inference/eval only.
            for param in model_encoder_mcep.parameters():
                param.requires_grad = False
            for param in model_decoder_mcep.parameters():
                param.requires_grad = False
            for param in model_encoder_excit.parameters():
                param.requires_grad = False
            for param in model_decoder_excit.parameters():
                param.requires_grad = False
            for param in model_vq.parameters():
                param.requires_grad = False
            # Initial hidden inputs for the autoregressive variants.
            if config.ar_enc:
                yz_in = torch.zeros((1, 1, n_spk+config.lat_dim)).cuda()
                yz_in_e = torch.zeros((1, 1, n_spk+config.lat_dim_e)).cuda()
            if config.ar_dec or config.ar_f0:
                mean_stats = torch.FloatTensor(read_hdf5(config.stats,
                                "/mean_"+config.string_path.replace("/", "")))
                scale_stats = torch.FloatTensor(read_hdf5(config.stats,
                                "/scale_"+config.string_path.replace("/", "")))
                if config.ar_dec:
                    # Normalized zero-frame for the mcep decoder's first step.
                    x_in = ((torch.zeros((1, 1, config.mcep_dim))
                             - mean_stats[config.excit_dim:])
                            / scale_stats[config.excit_dim:]).cuda()
                if config.ar_f0:
                    # [u/v flag, norm. log-F0, cap u/v flag, norm. codeap] zero-frame.
                    e_in = torch.cat((torch.zeros(1, 1, 1),
                                      (torch.zeros(1, 1, 1)-mean_stats[1:2])/scale_stats[1:2],
                                      torch.zeros(1, 1, 1),
                                      (torch.zeros(1, 1, config.cap_dim)
                                       - mean_stats[3:config.excit_dim])
                                      / scale_stats[3:config.excit_dim]), 2).cuda()
        count = 0
        # The input is padded for the full enc->dec->enc->dec chain; each
        # outpad_* entry is the residual padding left after each stage.
        pad_left = (model_encoder_mcep.pad_left + model_decoder_mcep.pad_left)*2
        pad_right = (model_encoder_mcep.pad_right + model_decoder_mcep.pad_right)*2
        outpad_lefts = [None]*3
        outpad_rights = [None]*3
        outpad_lefts[0] = pad_left-model_encoder_mcep.pad_left
        outpad_rights[0] = pad_right-model_encoder_mcep.pad_right
        outpad_lefts[1] = outpad_lefts[0]-model_decoder_mcep.pad_left
        outpad_rights[1] = outpad_rights[0]-model_decoder_mcep.pad_right
        outpad_lefts[2] = outpad_lefts[1]-model_encoder_mcep.pad_left
        outpad_rights[2] = outpad_rights[1]-model_encoder_mcep.pad_right
        for feat_file in feat_list:
            # convert mcep
            logging.info("recmcep " + feat_file)
            feat_org = read_hdf5(feat_file, "/feat_mceplf0cap")
            logging.info(feat_org.shape)
            with torch.no_grad():
                feat = F.pad(torch.FloatTensor(feat_org).cuda().unsqueeze(0).transpose(1, 2),
                             (pad_left, pad_right), "replicate").transpose(1, 2)
                # --- source encoding -------------------------------------
                if config.ar_enc:
                    spk_logits, lat_src, _, _ = model_encoder_mcep(feat, yz_in=yz_in)
                    spk_logits_e, lat_src_e, _, _ = model_encoder_excit(feat, yz_in=yz_in)
                else:
                    spk_logits, lat_src, _ = model_encoder_mcep(feat)
                    spk_logits_e, lat_src_e, _ = model_encoder_excit(feat)
                # Quantize both latents against the shared codebook.
                idx_vq = nn_search_batch(lat_src, model_vq.weight)
                lat_src = model_vq(idx_vq)
                if outpad_rights[0] > 0:
                    unique, counts = np.unique(idx_vq[:, outpad_lefts[0]:-outpad_rights[0]].cpu().data.numpy(),
                                               return_counts=True)
                else:
                    unique, counts = np.unique(idx_vq[:, outpad_lefts[0]:].cpu().data.numpy(),
                                               return_counts=True)
                logging.info("input vq")
                logging.info(dict(zip(unique, counts)))
                idx_vq_e = nn_search_batch(lat_src_e, model_vq.weight)
                lat_src_e = model_vq(idx_vq_e)
                if outpad_rights[0] > 0:
                    unique, counts = np.unique(idx_vq_e[:, outpad_lefts[0]:-outpad_rights[0]].cpu().data.numpy(),
                                               return_counts=True)
                else:
                    unique, counts = np.unique(idx_vq_e[:, outpad_lefts[0]:].cpu().data.numpy(),
                                               return_counts=True)
                logging.info("input vq_e")
                logging.info(dict(zip(unique, counts)))
                logging.info('input spkpost')
                if outpad_rights[0] > 0:
                    logging.info(torch.mean(F.softmax(spk_logits[:, outpad_lefts[0]:-outpad_rights[0]], dim=-1), 1))
                else:
                    logging.info(torch.mean(F.softmax(spk_logits[:, outpad_lefts[0]:], dim=-1), 1))
                logging.info('input spkpost_e')
                if outpad_rights[0] > 0:
                    logging.info(torch.mean(F.softmax(spk_logits_e[:, outpad_lefts[0]:-outpad_rights[0]], dim=-1), 1))
                else:
                    logging.info(torch.mean(F.softmax(spk_logits_e[:, outpad_lefts[0]:], dim=-1), 1))
                # --- reconstruction decode (same speaker) ----------------
                src_code = (torch.ones((1, lat_src.shape[1]))*spk_idx).cuda().long()
                if config.ar_dec:
                    cvmcep_src, _, _ = model_decoder_mcep(src_code, lat_src, x_in=x_in)
                else:
                    cvmcep_src, _ = model_decoder_mcep(src_code, lat_src)
                if config.ar_f0:
                    cvlf0_src, _, _ = model_decoder_excit(src_code, lat_src_e, e_in=e_in)
                else:
                    cvlf0_src, _ = model_decoder_excit(src_code, lat_src_e)
                # --- cyclic pass: re-encode the reconstruction -----------
                cv_feat = torch.cat((cvlf0_src, cvmcep_src), 2)
                if config.ar_enc:
                    spk_logits, lat_rec, _, _ = model_encoder_mcep(cv_feat, yz_in=yz_in)
                    spk_logits_e, lat_rec_e, _, _ = model_encoder_excit(cv_feat, yz_in=yz_in)
                else:
                    spk_logits, lat_rec, _ = model_encoder_mcep(cv_feat)
                    spk_logits_e, lat_rec_e, _ = model_encoder_excit(cv_feat)
                idx_vq = nn_search_batch(lat_rec, model_vq.weight)
                lat_rec = model_vq(idx_vq)
                if outpad_rights[2] > 0:
                    unique, counts = np.unique(idx_vq[:, outpad_lefts[2]:-outpad_rights[2]].cpu().data.numpy(),
                                               return_counts=True)
                else:
                    unique, counts = np.unique(idx_vq[:, outpad_lefts[2]:].cpu().data.numpy(),
                                               return_counts=True)
                # Fixed log label: this is the reconstruction-stage codebook
                # histogram (was mislabeled "input vq"; the sibling decoder
                # script logs "rec vq" at this stage).
                logging.info("rec vq")
                logging.info(dict(zip(unique, counts)))
                idx_vq_e = nn_search_batch(lat_rec_e, model_vq.weight)
                lat_rec_e = model_vq(idx_vq_e)
                if outpad_rights[2] > 0:
                    unique, counts = np.unique(idx_vq_e[:, outpad_lefts[2]:-outpad_rights[2]].cpu().data.numpy(),
                                               return_counts=True)
                else:
                    unique, counts = np.unique(idx_vq_e[:, outpad_lefts[2]:].cpu().data.numpy(),
                                               return_counts=True)
                logging.info("rec vq_e")
                logging.info(dict(zip(unique, counts)))
                logging.info('rec spkpost')
                if outpad_rights[2] > 0:
                    logging.info(torch.mean(F.softmax(spk_logits[:, outpad_lefts[2]:-outpad_rights[2]], dim=-1), 1))
                else:
                    logging.info(torch.mean(F.softmax(spk_logits[:, outpad_lefts[2]:], dim=-1), 1))
                logging.info('rec spkpost_e')
                if outpad_rights[2] > 0:
                    logging.info(torch.mean(F.softmax(spk_logits_e[:, outpad_lefts[2]:-outpad_rights[2]], dim=-1), 1))
                else:
                    logging.info(torch.mean(F.softmax(spk_logits_e[:, outpad_lefts[2]:], dim=-1), 1))
                src_code = (torch.ones((1, lat_rec.shape[1]))*spk_idx).cuda().long()
                if config.ar_dec:
                    cvmcep_cyc, _, _ = model_decoder_mcep(src_code, lat_rec, x_in=x_in)
                else:
                    cvmcep_cyc, _ = model_decoder_mcep(src_code, lat_rec)
                if config.ar_f0:
                    cvlf0_cyc, _, _ = model_decoder_excit(src_code, lat_rec_e, e_in=e_in)
                else:
                    cvlf0_cyc, _ = model_decoder_excit(src_code, lat_rec_e)
                # Trim the residual padding off the reconstruction outputs
                # (the cyclic outputs consumed all of it through the chain).
                if outpad_rights[1] > 0:
                    cvmcep_src = cvmcep_src[:, outpad_lefts[1]:-outpad_rights[1]]
                    cvlf0_src = cvlf0_src[:, outpad_lefts[1]:-outpad_rights[1]]
                else:
                    cvmcep_src = cvmcep_src[:, outpad_lefts[1]:]
                    cvlf0_src = cvlf0_src[:, outpad_lefts[1]:]
                # Round the u/v flags (cols 0 and 2) back to hard {0,1}.
                feat_rec = torch.cat((torch.round(cvlf0_src[:, :, :1]), cvlf0_src[:, :, 1:2],
                                      torch.round(cvlf0_src[:, :, 2:3]), cvlf0_src[:, :, 3:], cvmcep_src),
                                     2)[0].cpu().data.numpy()
                feat_cyc = torch.cat((torch.round(cvlf0_cyc[:, :, :1]), cvlf0_cyc[:, :, 1:2],
                                      torch.round(cvlf0_cyc[:, :, 2:3]), cvlf0_cyc[:, :, 3:], cvmcep_cyc),
                                     2)[0].cpu().data.numpy()
                cvmcep_src = np.array(cvmcep_src[0].cpu().data.numpy(), dtype=np.float64)
                cvlf0_src = np.array(cvlf0_src[0].cpu().data.numpy(), dtype=np.float64)
                cvmcep_cyc = np.array(cvmcep_cyc[0].cpu().data.numpy(), dtype=np.float64)
                cvlf0_cyc = np.array(cvlf0_cyc[0].cpu().data.numpy(), dtype=np.float64)
            logging.info(cvlf0_src.shape)
            logging.info(cvmcep_src.shape)
            logging.info(cvlf0_cyc.shape)
            logging.info(cvmcep_cyc.shape)
            # Ground-truth F0 / code-aperiodicity recovered from the feature
            # layout [uv, log-F0, cap-uv, codeap..., mcep...].
            mcep = np.array(feat_org[:, -model_decoder_mcep.out_dim:])
            f0 = np.array(np.rint(feat_org[:, 0])*np.exp(feat_org[:, 1]))
            codeap = np.array(np.rint(feat_org[:, 2:3])
                              * (-np.exp(feat_org[:, 3:feat_org.shape[-1]-model_decoder_mcep.out_dim])))
            cvf0_src = np.array(np.rint(cvlf0_src[:, 0])*np.exp(cvlf0_src[:, 1]))
            cvcodeap_src = np.array(np.rint(cvlf0_src[:, 2:3])*(-np.exp(cvlf0_src[:, 3:])))
            f0_rmse = np.sqrt(np.mean((cvf0_src-f0)**2))
            logging.info('F0_rmse_rec: %lf Hz' % (f0_rmse))
            cvf0_src_mean = np.mean(cvf0_src)
            f0_mean = np.mean(f0)
            f0_corr = np.sum((cvf0_src-cvf0_src_mean)*(f0-f0_mean)) / \
                (np.sqrt(np.sum((cvf0_src-cvf0_src_mean)**2))*np.sqrt(np.sum((f0-f0_mean)**2)))
            logging.info('F0_corr_rec: %lf' % (f0_corr))
            codeap_rmse = np.sqrt(np.mean((cvcodeap_src-codeap)**2, axis=0))
            for i in range(codeap_rmse.shape[-1]):
                logging.info('codeap-%d_rmse_rec: %lf dB' % (i+1, codeap_rmse[i]))
            cvf0_cyc = np.array(np.rint(cvlf0_cyc[:, 0])*np.exp(cvlf0_cyc[:, 1]))
            cvcodeap_cyc = np.array(np.rint(cvlf0_cyc[:, 2:3])*(-np.exp(cvlf0_cyc[:, 3:])))
            f0_rmse_cyc = np.sqrt(np.mean((cvf0_cyc-f0)**2))
            logging.info('F0_rmse_cyc: %lf Hz' % (f0_rmse_cyc))
            cvf0_cyc_mean = np.mean(cvf0_cyc)
            f0_mean = np.mean(f0)
            f0_corr_cyc = np.sum((cvf0_cyc-cvf0_cyc_mean)*(f0-f0_mean)) / \
                (np.sqrt(np.sum((cvf0_cyc-cvf0_cyc_mean)**2))*np.sqrt(np.sum((f0-f0_mean)**2)))
            logging.info('F0_corr_cyc: %lf' % (f0_corr_cyc))
            codeap_rmse_cyc = np.sqrt(np.mean((cvcodeap_cyc-codeap)**2, axis=0))
            for i in range(codeap_rmse_cyc.shape[-1]):
                logging.info('codeap-%d_rmse_cyc: %lf dB' % (i+1, codeap_rmse_cyc[i]))
            # Mel-cepstral distortion on speech frames only, DTW-aligned;
            # "pow" includes the 0th (power) coefficient, plain mcd excludes it.
            spcidx = read_hdf5(feat_file, "/spcidx_range")[0]
            _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(np.array(cvmcep_src[np.array(spcidx), :],
                                                              dtype=np.float64),
                                                     np.array(mcep[np.array(spcidx), :], dtype=np.float64))
            _, _, _, mcd_arr = dtw.dtw_org_to_trg(np.array(cvmcep_src[np.array(spcidx), 1:],
                                                           dtype=np.float64),
                                                  np.array(mcep[np.array(spcidx), 1:], dtype=np.float64))
            mcdpow_mean = np.mean(mcdpow_arr)
            mcdpow_std = np.std(mcdpow_arr)
            mcd_mean = np.mean(mcd_arr)
            mcd_std = np.std(mcd_arr)
            logging.info("mcdpow_rec: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std))
            logging.info("mcd_rec: %.6f dB +- %.6f" % (mcd_mean, mcd_std))
            _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(np.array(cvmcep_cyc[np.array(spcidx), :],
                                                              dtype=np.float64),
                                                     np.array(mcep[np.array(spcidx), :], dtype=np.float64))
            _, _, _, mcd_arr = dtw.dtw_org_to_trg(np.array(cvmcep_cyc[np.array(spcidx), 1:],
                                                           dtype=np.float64),
                                                  np.array(mcep[np.array(spcidx), 1:], dtype=np.float64))
            mcdpow_mean_cyc = np.mean(mcdpow_arr)
            mcdpow_std_cyc = np.std(mcdpow_arr)
            mcd_mean_cyc = np.mean(mcd_arr)
            mcd_std_cyc = np.std(mcd_arr)
            logging.info("mcdpow_cyc: %.6f dB +- %.6f" % (mcdpow_mean_cyc, mcdpow_std_cyc))
            logging.info("mcd_cyc: %.6f dB +- %.6f" % (mcd_mean_cyc, mcd_std_cyc))
            logging.info('org f0')
            logging.info(f0[10:15])
            logging.info('rec f0')
            logging.info(cvf0_src[10:15])
            logging.info('cyc f0')
            logging.info(cvf0_cyc[10:15])
            logging.info('org cap')
            logging.info(codeap[10:15])
            logging.info('rec cap')
            logging.info(cvcodeap_src[10:15])
            logging.info('cyc cap')
            logging.info(cvcodeap_cyc[10:15])
            # Route per-utterance stats to the train ('tr') or dev ('dv')
            # accumulators based on the dataset directory component.
            dataset = feat_file.split('/')[1].split('_')[0]
            if 'tr' in dataset:
                logging.info('trn')
                f0rmse_cvlist.append(f0_rmse)
                f0corr_cvlist.append(f0_corr)
                caprmse_cvlist.append(codeap_rmse)
                # Fixed: previous code appended mcdpow_mean (and the cyc
                # counterpart below) twice, desynchronizing these lists
                # from their matching *std lists.
                mcdpow_cvlist.append(mcdpow_mean)
                mcdpowstd_cvlist.append(mcdpow_std)
                mcd_cvlist.append(mcd_mean)
                mcdstd_cvlist.append(mcd_std)
                cvlist.append(np.var(cvmcep_src[:, 1:], axis=0))
                logging.info(len(cvlist))
                f0rmse_cvlist_cyc.append(f0_rmse_cyc)
                f0corr_cvlist_cyc.append(f0_corr_cyc)
                caprmse_cvlist_cyc.append(codeap_rmse_cyc)
                mcdpow_cvlist_cyc.append(mcdpow_mean_cyc)
                mcdpowstd_cvlist_cyc.append(mcdpow_std_cyc)
                mcd_cvlist_cyc.append(mcd_mean_cyc)
                mcdstd_cvlist_cyc.append(mcd_std_cyc)
                cvlist_cyc.append(np.var(cvmcep_cyc[:, 1:], axis=0))
            elif 'dv' in dataset:
                logging.info('dev')
                f0rmse_cvlist_dv.append(f0_rmse)
                f0corr_cvlist_dv.append(f0_corr)
                caprmse_cvlist_dv.append(codeap_rmse)
                mcdpow_cvlist_dv.append(mcdpow_mean)
                mcdpowstd_cvlist_dv.append(mcdpow_std)
                mcd_cvlist_dv.append(mcd_mean)
                mcdstd_cvlist_dv.append(mcd_std)
                cvlist_dv.append(np.var(cvmcep_src[:, 1:], axis=0))
                logging.info(len(cvlist_dv))
                f0rmse_cvlist_cyc_dv.append(f0_rmse_cyc)
                f0corr_cvlist_cyc_dv.append(f0_corr_cyc)
                caprmse_cvlist_cyc_dv.append(codeap_rmse_cyc)
                mcdpow_cvlist_cyc_dv.append(mcdpow_mean_cyc)
                mcdpowstd_cvlist_cyc_dv.append(mcdpow_std_cyc)
                mcd_cvlist_cyc_dv.append(mcd_mean_cyc)
                mcdstd_cvlist_cyc_dv.append(mcd_std_cyc)
                cvlist_cyc_dv.append(np.var(cvmcep_cyc[:, 1:], axis=0))
            logging.info('write rec to h5')
            outh5dir = os.path.join(os.path.dirname(os.path.dirname(feat_file)),
                                    args.spk+"-"+args.spk)
            if not os.path.exists(outh5dir):
                os.makedirs(outh5dir)
            feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
            logging.info(feat_file + ' ' + args.string_path)
            logging.info(feat_rec.shape)
            write_hdf5(feat_file, args.string_path, feat_rec)
            logging.info('write cyc to h5')
            # NOTE(review): feat_file has been rebased into the rec dir, so
            # this resolves to "<parent>/<spk>-<spk>-<spk>" — presumably
            # intentional (same layout as the sibling decode script).
            outh5dir = os.path.join(os.path.dirname(os.path.dirname(feat_file)),
                                    args.spk+"-"+args.spk+"-"+args.spk)
            if not os.path.exists(outh5dir):
                os.makedirs(outh5dir)
            feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
            logging.info(feat_file + ' ' + args.string_path)
            logging.info(feat_cyc.shape)
            write_hdf5(feat_file, args.string_path, feat_cyc)
            count += 1
def gpu_decode(feat_list, gpu, cvlist=None, mcdpow_cvlist=None, mcdpowstd_cvlist=None, mcd_cvlist=None,
               mcdstd_cvlist=None, cvlist_dv=None, mcdpow_cvlist_dv=None, mcdpowstd_cvlist_dv=None,
               mcd_cvlist_dv=None, mcdstd_cvlist_dv=None,
               cvlist_cyc=None, mcdpow_cvlist_cyc=None, mcdpowstd_cvlist_cyc=None, mcd_cvlist_cyc=None,
               mcdstd_cvlist_cyc=None, cvlist_cyc_dv=None, mcdpow_cvlist_cyc_dv=None, mcdpowstd_cvlist_cyc_dv=None,
               mcd_cvlist_cyc_dv=None, mcdstd_cvlist_cyc_dv=None):
    """Reconstruct mcep features (single encoder/decoder + VQ) on one GPU.

    Per utterance: encode the mcep+excitation input, quantize the latent
    against `model_vq`, decode with the source speaker code (reconstruction),
    re-encode the excitation+reconstructed-mcep pair and decode once more
    (cyclic pass), then log DTW mel-cepstral distortion against the original
    and write reconstructed / cyclic feature matrices to HDF5.  The
    `*_cvlist*` parameters are shared accumulator lists ('tr' train split for
    plain names, 'dv' dev split for `_dv`, `_cyc` for the cyclic pass);
    statistics are appended in place.

    NOTE(review): same name as an earlier `gpu_decode` definition — if both
    live in one module this one shadows it; verify they belong to separate
    scripts.  Relies on module-level globals (`config`, `args`, `n_spk`,
    `spk_idx`, model classes, `read_hdf5`, `write_hdf5`, `nn_search_batch`,
    `dtw`) — not callable in isolation.
    """
    with torch.cuda.device(gpu):
        # define model and load parameters
        with torch.no_grad():
            model_encoder = GRU_VAE_ENCODER(
                in_dim=config.mcep_dim + config.excit_dim,
                n_spk=n_spk,
                lat_dim=config.lat_dim,
                hidden_layers=config.hidden_layers_enc,
                hidden_units=config.hidden_units_enc,
                kernel_size=config.kernel_size_enc,
                dilation_size=config.dilation_size_enc,
                causal_conv=config.causal_conv_enc,
                bi=config.bi_enc,
                cont=False,
                pad_first=True,
                right_size=config.right_size,
                ar=config.ar_enc)
            logging.info(model_encoder)
            model_decoder = GRU_SPEC_DECODER(
                feat_dim=config.lat_dim,
                out_dim=config.mcep_dim,
                n_spk=n_spk,
                hidden_layers=config.hidden_layers_dec,
                hidden_units=config.hidden_units_dec,
                kernel_size=config.kernel_size_dec,
                dilation_size=config.dilation_size_dec,
                causal_conv=config.causal_conv_dec,
                bi=config.bi_dec,
                pad_first=True,
                ar=config.ar_dec)
            logging.info(model_decoder)
            model_vq = torch.nn.Embedding(config.ctr_size, config.lat_dim)
            logging.info(model_vq)
            # Load the checkpoint once; the previous code re-read the whole
            # file from disk for every sub-model state dict.
            checkpoint = torch.load(args.model)
            model_encoder.load_state_dict(checkpoint["model_encoder"])
            model_decoder.load_state_dict(checkpoint["model_decoder"])
            model_vq.load_state_dict(checkpoint["model_vq"])
            model_encoder.cuda()
            model_decoder.cuda()
            model_vq.cuda()
            model_encoder.eval()
            model_decoder.eval()
            model_vq.eval()
            # Freeze every parameter; this function is inference/eval only.
            for param in model_encoder.parameters():
                param.requires_grad = False
            for param in model_decoder.parameters():
                param.requires_grad = False
            for param in model_vq.parameters():
                param.requires_grad = False
            # Initial hidden inputs for the autoregressive variants.
            if config.ar_enc:
                yz_in = torch.zeros((1, 1, n_spk + config.lat_dim)).cuda()
            if config.ar_dec:
                mean_stats = torch.FloatTensor(
                    read_hdf5(config.stats,
                              "/mean_" + config.string_path.replace("/", "")))
                scale_stats = torch.FloatTensor(
                    read_hdf5(config.stats,
                              "/scale_" + config.string_path.replace("/", "")))
                # Normalized zero-frame for the decoder's first AR step.
                x_in = ((torch.zeros((1, 1, config.mcep_dim))
                         - mean_stats[config.excit_dim:])
                        / scale_stats[config.excit_dim:]).cuda()
        count = 0
        # The input is padded for the full enc->dec->enc->dec chain; each
        # outpad_* entry is the residual padding left after each stage.
        pad_left = (model_encoder.pad_left + model_decoder.pad_left) * 2
        pad_right = (model_encoder.pad_right + model_decoder.pad_right) * 2
        outpad_lefts = [None] * 3
        outpad_rights = [None] * 3
        outpad_lefts[0] = pad_left - model_encoder.pad_left
        outpad_rights[0] = pad_right - model_encoder.pad_right
        outpad_lefts[1] = outpad_lefts[0] - model_decoder.pad_left
        outpad_rights[1] = outpad_rights[0] - model_decoder.pad_right
        outpad_lefts[2] = outpad_lefts[1] - model_encoder.pad_left
        outpad_rights[2] = outpad_rights[1] - model_encoder.pad_right
        for feat_file in feat_list:
            # convert mcep
            logging.info("recmcep " + feat_file)
            feat_org = read_hdf5(feat_file, "/feat_mceplf0cap")
            logging.info(feat_org.shape)
            mcep = np.array(feat_org[:, -model_decoder.out_dim:])
            with torch.no_grad():
                feat = torch.FloatTensor(feat_org).cuda().unsqueeze(0)
                # Leading columns are the excitation block; passed through
                # unchanged and re-attached to the decoded mcep below.
                feat_excit = feat[:, :, :config.excit_dim]
                # --- source encoding -------------------------------------
                if config.ar_enc:
                    spk_logits, lat_src, _, _ = model_encoder(
                        F.pad(feat.transpose(1, 2), (pad_left, pad_right),
                              "replicate").transpose(1, 2), yz_in=yz_in)
                else:
                    spk_logits, lat_src, _ = model_encoder(
                        F.pad(feat.transpose(1, 2), (pad_left, pad_right),
                              "replicate").transpose(1, 2))
                idx_vq = nn_search_batch(lat_src, model_vq.weight)
                lat_src = model_vq(idx_vq)
                if outpad_rights[0] > 0:
                    unique, counts = np.unique(
                        idx_vq[:, outpad_lefts[0]:-outpad_rights[0]].cpu().data.numpy(),
                        return_counts=True)
                else:
                    unique, counts = np.unique(
                        idx_vq[:, outpad_lefts[0]:].cpu().data.numpy(),
                        return_counts=True)
                logging.info("input vq")
                logging.info(dict(zip(unique, counts)))
                logging.info('input spkpost')
                if outpad_rights[0] > 0:
                    logging.info(torch.mean(F.softmax(
                        spk_logits[:, outpad_lefts[0]:-outpad_rights[0]], dim=-1), 1))
                else:
                    logging.info(torch.mean(F.softmax(
                        spk_logits[:, outpad_lefts[0]:], dim=-1), 1))
                # --- reconstruction decode (same speaker) ----------------
                src_code = (torch.ones((1, lat_src.shape[1])) * spk_idx).cuda().long()
                if config.ar_dec:
                    cvmcep_src, _, _ = model_decoder(src_code, lat_src, x_in=x_in)
                else:
                    cvmcep_src, _ = model_decoder(src_code, lat_src)
                # --- cyclic pass: re-encode excitation + reconstruction --
                if config.ar_enc:
                    spk_logits, lat_rec, _, _ = model_encoder(
                        torch.cat((F.pad(feat_excit.transpose(1, 2),
                                         (outpad_lefts[1], outpad_rights[1]),
                                         "replicate").transpose(1, 2),
                                   cvmcep_src), 2), yz_in=yz_in)
                else:
                    spk_logits, lat_rec, _ = model_encoder(
                        torch.cat((F.pad(feat_excit.transpose(1, 2),
                                         (outpad_lefts[1], outpad_rights[1]),
                                         "replicate").transpose(1, 2),
                                   cvmcep_src), 2))
                idx_vq = nn_search_batch(lat_rec, model_vq.weight)
                lat_rec = model_vq(idx_vq)
                if outpad_rights[2] > 0:
                    unique, counts = np.unique(
                        idx_vq[:, outpad_lefts[2]:-outpad_rights[2]].cpu().data.numpy(),
                        return_counts=True)
                else:
                    unique, counts = np.unique(
                        idx_vq[:, outpad_lefts[2]:].cpu().data.numpy(),
                        return_counts=True)
                logging.info("rec vq")
                logging.info(dict(zip(unique, counts)))
                logging.info('rec spkpost')
                if outpad_rights[2] > 0:
                    logging.info(torch.mean(F.softmax(
                        spk_logits[:, outpad_lefts[2]:-outpad_rights[2]], dim=-1), 1))
                else:
                    logging.info(torch.mean(F.softmax(
                        spk_logits[:, outpad_lefts[2]:], dim=-1), 1))
                src_code = (torch.ones((1, lat_rec.shape[1])) * spk_idx).cuda().long()
                if config.ar_dec:
                    cvmcep_cyc, _, _ = model_decoder(src_code, lat_rec, x_in=x_in)
                else:
                    cvmcep_cyc, _ = model_decoder(src_code, lat_rec)
                # Trim the residual padding off the reconstruction output
                # (the cyclic output consumed all of it through the chain).
                if outpad_rights[1] > 0:
                    cvmcep_src = cvmcep_src[:, outpad_lefts[1]:-outpad_rights[1]]
                else:
                    cvmcep_src = cvmcep_src[:, outpad_lefts[1]:]
                feat_rec = torch.cat((feat_excit, cvmcep_src), 2)[0].cpu().data.numpy()
                feat_cyc = torch.cat((feat_excit, cvmcep_cyc), 2)[0].cpu().data.numpy()
                cvmcep_src = np.array(cvmcep_src[0].cpu().data.numpy(), dtype=np.float64)
                cvmcep_cyc = np.array(cvmcep_cyc[0].cpu().data.numpy(), dtype=np.float64)
            logging.info(cvmcep_src.shape)
            logging.info(cvmcep_cyc.shape)
            # Mel-cepstral distortion on speech frames only, DTW-aligned;
            # "pow" includes the 0th (power) coefficient, plain mcd excludes it.
            spcidx = read_hdf5(feat_file, "/spcidx_range")[0]
            _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(
                np.array(cvmcep_src[np.array(spcidx), :], dtype=np.float64),
                np.array(mcep[np.array(spcidx), :], dtype=np.float64))
            _, _, _, mcd_arr = dtw.dtw_org_to_trg(
                np.array(cvmcep_src[np.array(spcidx), 1:], dtype=np.float64),
                np.array(mcep[np.array(spcidx), 1:], dtype=np.float64))
            mcdpow_mean = np.mean(mcdpow_arr)
            mcdpow_std = np.std(mcdpow_arr)
            mcd_mean = np.mean(mcd_arr)
            mcd_std = np.std(mcd_arr)
            logging.info("mcdpow_rec: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std))
            logging.info("mcd_rec: %.6f dB +- %.6f" % (mcd_mean, mcd_std))
            _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(
                np.array(cvmcep_cyc[np.array(spcidx), :], dtype=np.float64),
                np.array(mcep[np.array(spcidx), :], dtype=np.float64))
            _, _, _, mcd_arr = dtw.dtw_org_to_trg(
                np.array(cvmcep_cyc[np.array(spcidx), 1:], dtype=np.float64),
                np.array(mcep[np.array(spcidx), 1:], dtype=np.float64))
            mcdpow_mean_cyc = np.mean(mcdpow_arr)
            mcdpow_std_cyc = np.std(mcdpow_arr)
            mcd_mean_cyc = np.mean(mcd_arr)
            mcd_std_cyc = np.std(mcd_arr)
            logging.info("mcdpow_cyc: %.6f dB +- %.6f" % (mcdpow_mean_cyc, mcdpow_std_cyc))
            logging.info("mcd_cyc: %.6f dB +- %.6f" % (mcd_mean_cyc, mcd_std_cyc))
            # Route per-utterance stats to the train ('tr') or dev ('dv')
            # accumulators based on the dataset directory component.
            dataset = feat_file.split('/')[1].split('_')[0]
            if 'tr' in dataset:
                logging.info('trn')
                # Fixed: previous code appended mcdpow_mean (and the cyc
                # counterpart below) twice, desynchronizing these lists
                # from their matching *std lists.
                mcdpow_cvlist.append(mcdpow_mean)
                mcdpowstd_cvlist.append(mcdpow_std)
                mcd_cvlist.append(mcd_mean)
                mcdstd_cvlist.append(mcd_std)
                cvlist.append(np.var(cvmcep_src[:, 1:], axis=0))
                logging.info(len(cvlist))
                mcdpow_cvlist_cyc.append(mcdpow_mean_cyc)
                mcdpowstd_cvlist_cyc.append(mcdpow_std_cyc)
                mcd_cvlist_cyc.append(mcd_mean_cyc)
                mcdstd_cvlist_cyc.append(mcd_std_cyc)
                cvlist_cyc.append(np.var(cvmcep_cyc[:, 1:], axis=0))
            elif 'dv' in dataset:
                logging.info('dev')
                mcdpow_cvlist_dv.append(mcdpow_mean)
                mcdpowstd_cvlist_dv.append(mcdpow_std)
                mcd_cvlist_dv.append(mcd_mean)
                mcdstd_cvlist_dv.append(mcd_std)
                cvlist_dv.append(np.var(cvmcep_src[:, 1:], axis=0))
                logging.info(len(cvlist_dv))
                mcdpow_cvlist_cyc_dv.append(mcdpow_mean_cyc)
                mcdpowstd_cvlist_cyc_dv.append(mcdpow_std_cyc)
                mcd_cvlist_cyc_dv.append(mcd_mean_cyc)
                mcdstd_cvlist_cyc_dv.append(mcd_std_cyc)
                cvlist_cyc_dv.append(np.var(cvmcep_cyc[:, 1:], axis=0))
            logging.info('write rec to h5')
            outh5dir = os.path.join(
                os.path.dirname(os.path.dirname(feat_file)),
                args.spk + "-" + args.spk)
            if not os.path.exists(outh5dir):
                os.makedirs(outh5dir)
            feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
            logging.info(feat_file + ' ' + args.string_path)
            logging.info(feat_rec.shape)
            write_hdf5(feat_file, args.string_path, feat_rec)
            logging.info('write cyc to h5')
            # NOTE(review): feat_file has been rebased into the rec dir, so
            # this resolves to "<parent>/<spk>-<spk>-<spk>" — presumably
            # intentional (same layout as the sibling decode script).
            outh5dir = os.path.join(
                os.path.dirname(os.path.dirname(feat_file)),
                args.spk + "-" + args.spk + "-" + args.spk)
            if not os.path.exists(outh5dir):
                os.makedirs(outh5dir)
            feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
            logging.info(feat_file + ' ' + args.string_path)
            logging.info(feat_cyc.shape)
            write_hdf5(feat_file, args.string_path, feat_cyc)
            count += 1
def decode_RNN(feat_list, gpu, cvlist=None, mcd_cvlist_src=None, mcdstd_cvlist_src=None, mcdpow_cvlist_src=None, mcdpowstd_cvlist_src=None,\ mcd_cvlist_cyc=None, mcdstd_cvlist_cyc=None, mcdpow_cvlist_cyc=None, mcdpowstd_cvlist_cyc=None,\ mcd_cvlist=None, mcdstd_cvlist=None, mcdpow_cvlist=None, mcdpowstd_cvlist=None, \ lat_dist_rmse_list=None, lat_dist_cosim_list=None): with torch.cuda.device(gpu): # define model and load parameters with torch.no_grad(): model_encoder = GRU_VAE_ENCODER( in_dim=config.mcep_dim+config.excit_dim, n_spk=n_spk, lat_dim=config.lat_dim, hidden_layers=config.hidden_layers_enc, hidden_units=config.hidden_units_enc, kernel_size=config.kernel_size_enc, dilation_size=config.dilation_size_enc, causal_conv=config.causal_conv_enc, bi=False, ar=False, pad_first=True, right_size=config.right_size_enc) logging.info(model_encoder) model_decoder = GRU_SPEC_DECODER( feat_dim=config.lat_dim, out_dim=config.mcep_dim, n_spk=n_spk, hidden_layers=config.hidden_layers_dec, hidden_units=config.hidden_units_dec, kernel_size=config.kernel_size_dec, dilation_size=config.dilation_size_dec, causal_conv=config.causal_conv_dec, bi=False, ar=False, pad_first=True, right_size=config.right_size_dec) logging.info(model_decoder) model_post = GRU_POST_NET( spec_dim=config.mcep_dim, excit_dim=2, n_spk=n_spk, hidden_layers=config.hidden_layers_post, hidden_units=config.hidden_units_post, kernel_size=config.kernel_size_post, dilation_size=config.dilation_size_post, causal_conv=config.causal_conv_post, pad_first=True, right_size=config.right_size_post) #excit_dim=config.excit_dim, #excit_dim=None, logging.info(model_post) model_encoder.load_state_dict(torch.load(args.model)["model_encoder"]) model_decoder.load_state_dict(torch.load(args.model)["model_decoder"]) model_post.load_state_dict(torch.load(args.model)["model_post"]) model_encoder.remove_weight_norm() model_decoder.remove_weight_norm() model_post.remove_weight_norm() model_encoder.cuda() model_decoder.cuda() 
model_post.cuda() model_encoder.eval() model_decoder.eval() model_post.eval() for param in model_encoder.parameters(): param.requires_grad = False for param in model_decoder.parameters(): param.requires_grad = False for param in model_post.parameters(): param.requires_grad = False count = 0 pad_left = (model_encoder.pad_left + model_decoder.pad_left + model_post.pad_left)*2 pad_right = (model_encoder.pad_right + model_decoder.pad_right + model_post.pad_right)*2 outpad_lefts = [None]*5 outpad_rights = [None]*5 outpad_lefts[0] = pad_left-model_encoder.pad_left outpad_rights[0] = pad_right-model_encoder.pad_right outpad_lefts[1] = outpad_lefts[0]-model_decoder.pad_left outpad_rights[1] = outpad_rights[0]-model_decoder.pad_right outpad_lefts[2] = outpad_lefts[1]-model_post.pad_left outpad_rights[2] = outpad_rights[1]-model_post.pad_right outpad_lefts[3] = outpad_lefts[2]-model_encoder.pad_left outpad_rights[3] = outpad_rights[2]-model_encoder.pad_right outpad_lefts[4] = outpad_lefts[3]-model_decoder.pad_left outpad_rights[4] = outpad_rights[3]-model_decoder.pad_right logging.info(f'{pad_left} {pad_right}') logging.info(outpad_lefts) logging.info(outpad_rights) for feat_file in feat_list: # convert mcep spk_src = os.path.basename(os.path.dirname(feat_file)) src_idx = spk_list.index(spk_src) logging.info('%s --> %s' % (spk_src, args.spk_trg)) file_trg = os.path.join(os.path.dirname(os.path.dirname(feat_file)), args.spk_trg, os.path.basename(feat_file)) trg_exist = False if os.path.exists(file_trg): logging.info('exist: %s' % (file_trg)) feat_trg = read_hdf5(file_trg, config.string_path) mcep_trg = feat_trg[:,-config.mcep_dim:] logging.info(mcep_trg.shape) trg_exist = True feat_org = read_hdf5(feat_file, config.string_path) mcep = np.array(feat_org[:,-config.mcep_dim:]) codeap = np.array(np.rint(feat_org[:,2:3])*(-np.exp(feat_org[:,3:config.excit_dim]))) sp = np.array(ps.mc2sp(mcep, args.mcep_alpha, args.fftl)) ap = pw.decode_aperiodicity(codeap, args.fs, args.fftl) 
feat_cvf0_lin = np.expand_dims(convert_f0(np.exp(feat_org[:,1]), src_f0_mean, src_f0_std, trg_f0_mean, trg_f0_std), axis=-1) feat_cv = np.c_[feat_org[:,:1], np.log(feat_cvf0_lin), feat_org[:,2:config.excit_dim]] logging.info("generate") with torch.no_grad(): feat = F.pad(torch.FloatTensor(feat_org).cuda().unsqueeze(0).transpose(1,2), (pad_left,pad_right), "replicate").transpose(1,2) feat_excit = torch.FloatTensor(feat_org[:,:config.excit_dim]).cuda().unsqueeze(0) feat_excit_cv = torch.FloatTensor(feat_cv).cuda().unsqueeze(0) spk_logits, _, lat_src, _ = model_encoder(feat, sampling=False) logging.info('input spkpost') if outpad_rights[0] > 0: logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[0]:-outpad_rights[0]], dim=-1), 1)) else: logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[0]:], dim=-1), 1)) if trg_exist: spk_trg_logits, _, lat_trg, _ = model_encoder(F.pad(torch.FloatTensor(feat_trg).cuda().unsqueeze(0).transpose(1,2), \ (model_encoder.pad_left,model_encoder.pad_right), "replicate").transpose(1,2), sampling=False) logging.info('target spkpost') logging.info(torch.mean(F.softmax(spk_trg_logits, dim=-1), 1)) cvmcep_src, _ = model_decoder((torch.ones((1, lat_src.shape[1]))*src_idx).cuda().long(), lat_src) cvmcep_src_post, _ = model_post(cvmcep_src, y=(torch.ones((1, cvmcep_src.shape[1]))*src_idx).cuda().long(), e=F.pad(feat_excit[:,:,:2].transpose(1,2), (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2)) #e=F.pad(feat_excit.transpose(1,2), (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2)) if model_post.pad_right > 0: spk_logits, _, lat_rec, _ = model_encoder(torch.cat((F.pad(feat_excit.transpose(1,2), \ (outpad_lefts[2],outpad_rights[2]), "replicate").transpose(1,2), cvmcep_src[:,model_post.pad_left:-model_post.pad_right]), 2), sampling=False) else: spk_logits, _, lat_rec, _ = model_encoder(torch.cat((F.pad(feat_excit.transpose(1,2), \ (outpad_lefts[2],outpad_rights[2]), "replicate").transpose(1,2), 
cvmcep_src[:,model_post.pad_left:]), 2), sampling=False) logging.info('rec spkpost') if outpad_rights[3] > 0: logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[3]:-outpad_rights[3]], dim=-1), 1)) else: logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[3]:], dim=-1), 1)) cvmcep, _ = model_decoder((torch.ones((1, lat_src.shape[1]))*trg_idx).cuda().long(), lat_src) cvmcep_post, _ = model_post(cvmcep, y=(torch.ones((1, cvmcep.shape[1]))*trg_idx).cuda().long(), e=F.pad(feat_excit_cv[:,:,:2].transpose(1,2), (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2)) #e=F.pad(feat_excit_cv.transpose(1,2), (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2)) if model_post.pad_right > 0: spk_logits, _, lat_cv, _ = model_encoder(torch.cat((F.pad(feat_excit_cv.transpose(1,2), \ (outpad_lefts[2],outpad_rights[2]), "replicate").transpose(1,2), cvmcep[:,model_post.pad_left:-model_post.pad_right]), 2), sampling=False) else: spk_logits, _, lat_cv, _ = model_encoder(torch.cat((F.pad(feat_excit_cv.transpose(1,2), \ (outpad_lefts[2],outpad_rights[2]), "replicate").transpose(1,2), cvmcep[:,model_post.pad_left:]), 2), sampling=False) logging.info('cv spkpost') if outpad_rights[3] > 0: logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[3]:-outpad_rights[3]], dim=-1), 1)) else: logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[3]:], dim=-1), 1)) cvmcep_cyc, _ = model_decoder((torch.ones((1, lat_cv.shape[1]))*src_idx).cuda().long(), lat_cv) cvmcep_cyc_post, _ = model_post(cvmcep_cyc, y=(torch.ones((1, cvmcep_cyc.shape[1]))*src_idx).cuda().long(), e=F.pad(feat_excit[:,:,:2].transpose(1,2), (outpad_lefts[4],outpad_rights[4]), "replicate").transpose(1,2)) #e=F.pad(feat_excit.transpose(1,2), (outpad_lefts[4],outpad_rights[4]), "replicate").transpose(1,2)) if outpad_rights[2] > 0: cvmcep_src = np.array(cvmcep_src_post[0,outpad_lefts[2]:-outpad_rights[2]].cpu().data.numpy(), dtype=np.float64) cvmcep = 
np.array(cvmcep_post[0,outpad_lefts[2]:-outpad_rights[2]].cpu().data.numpy(), dtype=np.float64) else: cvmcep_src = np.array(cvmcep_src_post[0,outpad_lefts[2]:].cpu().data.numpy(), dtype=np.float64) cvmcep = np.array(cvmcep_post[0,outpad_lefts[2]:].cpu().data.numpy(), dtype=np.float64) cvmcep_cyc = np.array(cvmcep_cyc_post[0].cpu().data.numpy(), dtype=np.float64) if trg_exist: if outpad_rights[0] > 0: lat_src = lat_src[:,outpad_lefts[0]:-outpad_rights[0]] else: lat_src = lat_src[:,outpad_lefts[0]:] logging.info(cvmcep_src.shape) logging.info(cvmcep.shape) logging.info(cvmcep_cyc.shape) if trg_exist: logging.info(lat_src.shape) logging.info(lat_trg.shape) cvlist.append(np.var(cvmcep[:,1:], axis=0)) logging.info("cvf0lin") f0_range = read_hdf5(feat_file, "/f0_range") cvf0_range_lin = convert_f0(f0_range, src_f0_mean, src_f0_std, trg_f0_mean, trg_f0_std) uv_range_lin, cont_f0_range_lin = convert_continuos_f0(np.array(cvf0_range_lin)) unique, counts = np.unique(uv_range_lin, return_counts=True) logging.info(dict(zip(unique, counts))) cont_f0_lpf_range_lin = \ low_pass_filter(cont_f0_range_lin, int(1.0 / (args.shiftms * 0.001)), cutoff=20) uv_range_lin = np.expand_dims(uv_range_lin, axis=-1) cont_f0_lpf_range_lin = np.expand_dims(cont_f0_lpf_range_lin, axis=-1) # plain converted feat for neural vocoder feat_cv = np.c_[uv_range_lin, np.log(cont_f0_lpf_range_lin), feat_cv[:,2:config.excit_dim], cvmcep] logging.info(feat_cv.shape) logging.info("mcd acc") spcidx = np.array(read_hdf5(feat_file, "/spcidx_range")[0]) _, mcdpow_arr = dtw.calc_mcd(np.array(mcep[spcidx], dtype=np.float64), np.array(cvmcep_src[spcidx], dtype=np.float64)) _, mcd_arr = dtw.calc_mcd(np.array(mcep[spcidx,1:], dtype=np.float64), np.array(cvmcep_src[spcidx,1:], dtype=np.float64)) mcdpow_mean = np.mean(mcdpow_arr) mcdpow_std = np.std(mcdpow_arr) mcd_mean = np.mean(mcd_arr) mcd_std = np.std(mcd_arr) logging.info("mcdpow_src_cv: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std)) logging.info("mcd_src_cv: %.6f dB 
+- %.6f" % (mcd_mean, mcd_std)) mcdpow_cvlist_src.append(mcdpow_mean) mcdpowstd_cvlist_src.append(mcdpow_std) mcd_cvlist_src.append(mcd_mean) mcdstd_cvlist_src.append(mcd_std) if trg_exist: spcidx_trg = np.array(read_hdf5(file_trg, "/spcidx_range")[0]) _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(np.array(cvmcep[spcidx], \ dtype=np.float64), np.array(mcep_trg[spcidx_trg], dtype=np.float64)) _, _, _, mcd_arr = dtw.dtw_org_to_trg(np.array(cvmcep[spcidx,1:], \ dtype=np.float64), np.array(mcep_trg[spcidx_trg,1:], dtype=np.float64)) mcdpow_mean = np.mean(mcdpow_arr) mcdpow_std = np.std(mcdpow_arr) mcd_mean = np.mean(mcd_arr) mcd_std = np.std(mcd_arr) logging.info("mcdpow_trg: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std)) logging.info("mcd_trg: %.6f dB +- %.6f" % (mcd_mean, mcd_std)) mcdpow_cvlist.append(mcdpow_mean) mcdpowstd_cvlist.append(mcdpow_std) mcd_cvlist.append(mcd_mean) mcdstd_cvlist.append(mcd_std) spcidx_src = torch.LongTensor(spcidx).cuda() spcidx_trg = torch.LongTensor(spcidx_trg).cuda() trj_lat_src = np.array(torch.index_select(lat_src[0],0,spcidx_src).cpu().data.numpy(), dtype=np.float64) trj_lat_trg = np.array(torch.index_select(lat_trg[0],0,spcidx_trg).cpu().data.numpy(), dtype=np.float64) aligned_lat_srctrg, _, _, _ = dtw.dtw_org_to_trg(trj_lat_src, trj_lat_trg) lat_dist_srctrg = np.mean(np.sqrt(np.mean((aligned_lat_srctrg-trj_lat_trg)**2, axis=0))) _, _, lat_cdist_srctrg, _ = dtw.dtw_org_to_trg(trj_lat_trg, trj_lat_src, mcd=0) aligned_lat_trgsrc, _, _, _ = dtw.dtw_org_to_trg(trj_lat_trg, trj_lat_src) lat_dist_trgsrc = np.mean(np.sqrt(np.mean((aligned_lat_trgsrc-trj_lat_src)**2, axis=0))) _, _, lat_cdist_trgsrc, _ = dtw.dtw_org_to_trg(trj_lat_src, trj_lat_trg, mcd=0) logging.info("%lf %lf %lf %lf" % (lat_dist_srctrg, lat_cdist_srctrg, lat_dist_trgsrc, lat_cdist_trgsrc)) lat_dist_rmse = (lat_dist_srctrg+lat_dist_trgsrc)/2 lat_dist_cosim = (lat_cdist_srctrg+lat_cdist_trgsrc)/2 lat_dist_rmse_list.append(lat_dist_rmse) 
lat_dist_cosim_list.append(lat_dist_cosim) logging.info("lat_dist: %.6f %.6f" % (lat_dist_rmse, lat_dist_cosim)) _, mcdpow_arr = dtw.calc_mcd(np.array(mcep[spcidx], dtype=np.float64), np.array(cvmcep_cyc[spcidx], dtype=np.float64)) _, mcd_arr = dtw.calc_mcd(np.array(mcep[spcidx,1:], dtype=np.float64), np.array(cvmcep_cyc[spcidx,1:], dtype=np.float64)) mcdpow_mean = np.mean(mcdpow_arr) mcdpow_std = np.std(mcdpow_arr) mcd_mean = np.mean(mcd_arr) mcd_std = np.std(mcd_arr) logging.info("mcdpow_cyc_cv: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std)) logging.info("mcd_cyc_cv: %.6f dB +- %.6f" % (mcd_mean, mcd_std)) mcdpow_cvlist_cyc.append(mcdpow_mean) mcdpowstd_cvlist_cyc.append(mcdpow_std) mcd_cvlist_cyc.append(mcd_mean) mcdstd_cvlist_cyc.append(mcd_std) logging.info("synth anasyn") wav = np.clip(pw.synthesize(f0_range, sp, ap, args.fs, frame_period=args.shiftms), -1, 1) wavpath = os.path.join(args.outdir,os.path.basename(feat_file).replace(".h5","_anasyn.wav")) sf.write(wavpath, wav, args.fs, 'PCM_16') logging.info(wavpath) logging.info("synth voco rec") cvsp_src = ps.mc2sp(cvmcep_src, args.mcep_alpha, args.fftl) logging.info(cvsp_src.shape) wav = np.clip(pw.synthesize(f0_range, cvsp_src, ap, args.fs, frame_period=args.shiftms), -1, 1) wavpath = os.path.join(args.outdir, os.path.basename(feat_file).replace(".h5", "_rec.wav")) sf.write(wavpath, wav, args.fs, 'PCM_16') logging.info(wavpath) logging.info("synth voco cv") cvsp = ps.mc2sp(cvmcep, args.mcep_alpha, args.fftl) logging.info(cvsp.shape) wav = np.clip(pw.synthesize(cvf0_range_lin, cvsp, ap, args.fs, frame_period=args.shiftms), -1, 1) wavpath = os.path.join(args.outdir, os.path.basename(feat_file).replace(".h5", "_cv.wav")) sf.write(wavpath, wav, args.fs, 'PCM_16') logging.info(wavpath) logging.info("synth voco cv GV") datamean = np.mean(cvmcep[:,1:], axis=0) cvmcep_gv = np.c_[cvmcep[:,0], args.gv_coeff*(np.sqrt(gv_mean_trg/cvgv_mean) * \ (cvmcep[:,1:]-datamean) + datamean) + (1-args.gv_coeff)*cvmcep[:,1:]] 
cvmcep_gv = mod_pow(cvmcep_gv, cvmcep, alpha=args.mcep_alpha, irlen=IRLEN) cvsp_gv = ps.mc2sp(cvmcep_gv, args.mcep_alpha, args.fftl) logging.info(cvsp_gv.shape) wav = np.clip(pw.synthesize(cvf0_range_lin, cvsp_gv, ap, args.fs, frame_period=args.shiftms), -1, 1) wavpath = os.path.join(args.outdir, os.path.basename(feat_file).replace(".h5", "_cvGV.wav")) sf.write(wavpath, wav, args.fs, 'PCM_16') logging.info(wavpath) #logging.info("synth diffGV") #shiftl = int(args.fs/1000*args.shiftms) #mc_cv_diff = cvmcep_gv-mcep #b = np.apply_along_axis(ps.mc2b, 1, mc_cv_diff, args.mcep_alpha) #logging.info(b.shape) #assert np.isfinite(b).all #mlsa_fil = ps.synthesis.Synthesizer(MLSADF(mcep_dim, alpha=args.mcep_alpha), shiftl) #x, fs_ = sf.read(os.path.join(os.path.dirname(feat_file).replace("hdf5", "wav_filtered"), os.path.basename(feat_file).replace(".h5", ".wav"))) #assert(fs_ == args.fs) #wav = mlsa_fil.synthesis(x, b) #wav = np.clip(wav, -1, 1) #wavpath = os.path.join(args.outdir, os.path.basename(feat_file).replace(".h5", "_DiffGV.wav")) #sf.write(wavpath, wav, args.fs, 'PCM_16') #logging.info(wavpath) #logging.info("synth diffGVF0") #time_axis = read_hdf5(feat_file, "/time_axis") #sp_diff = pw.cheaptrick(wav, f0_range, time_axis, args.fs, fft_size=args.fftl) #logging.info(sp_diff.shape) #ap_diff = pw.d4c(wav, f0_range, time_axis, args.fs, fft_size=args.fftl) #logging.info(ap_diff.shape) #wav = pw.synthesize(cvf0_range_lin, sp_diff, ap_diff, args.fs, frame_period=args.shiftms) #wav = np.clip(wav, -1, 1) #wavpath = os.path.join(args.outdir,os.path.basename(feat_file).replace(".h5", "_DiffGVF0.wav")) #sf.write(wavpath, wav, args.fs, 'PCM_16') #logging.info(wavpath) #logging.info("analysis diffGVF0") #sp_diff_anasyn = pw.cheaptrick(wav, cvf0_range_lin, time_axis, args.fs, fft_size=args.fftl) #logging.info(sp_diff_anasyn.shape) #mc_cv_diff_anasyn = ps.sp2mc(sp_diff_anasyn, mcep_dim, args.mcep_alpha) #ap_diff_anasyn = pw.d4c(wav, cvf0_range_lin, time_axis, args.fs, 
fft_size=args.fftl) #code_ap_diff_anasyn = pw.code_aperiodicity(ap_diff_anasyn, args.fs) ## convert to continouos codeap with uv #for i in range(code_ap_diff_anasyn.shape[-1]): # logging.info('codeap: %d' % (i+1)) # uv_codeap_i, cont_codeap_i = convert_continuos_codeap(np.array(code_ap_diff_anasyn[:,i])) # cont_codeap_i = np.log(-np.clip(cont_codeap_i, a_min=np.amin(cont_codeap_i), a_max=MAX_CODEAP)) # if i > 0: # cont_codeap = np.c_[cont_codeap, np.expand_dims(cont_codeap_i, axis=-1)] # else: # uv_codeap = np.expand_dims(uv_codeap_i, axis=-1) # cont_codeap = np.expand_dims(cont_codeap_i, axis=-1) # uv_codeap_i = np.expand_dims(uv_codeap_i, axis=-1) # unique, counts = np.unique(uv_codeap_i, return_counts=True) # logging.info(dict(zip(unique, counts))) ## postprocessed converted feat for neural vocoder #feat_diffgv_anasyn = np.c_[feat_cv[:,:2], uv_codeap, cont_codeap, mc_cv_diff_anasyn] #logging.info("write lat") #outTxtDir = os.path.join(args.outdir, os.path.basename(os.path.dirname(feat_file))) #if not os.path.exists(outTxtDir): # os.mkdir(outTxtDir) #outTxt = os.path.join(outTxtDir, os.path.basename(feat_file).replace(".wav", ".txt")) #logging.info(outTxt) #g = open(outTxt, "wt") #idx_frm = 0 #nfrm = trj_lat_src.shape[0] #dim = trj_lat_src.shape[1] #if not args.time_flag: ##if True: # while idx_frm < nfrm: # idx_elmt = 1 # for elmt in trj_lat_src[idx_frm]: # if idx_elmt < dim: # g.write("%lf " % (elmt)) # else: # g.write("%lf\n" % (elmt)) # idx_elmt += 1 # idx_frm += 1 #else: # while idx_frm < nfrm: # idx_elmt = 1 # for elmt in trj_lat_src[idx_frm]: # if idx_elmt < dim: # if idx_elmt > 1: # g.write("%lf " % (elmt)) # else: # g.write("%lf %lf " % (time_axis[idx_frm], elmt)) # else: # g.write("%lf\n" % (elmt)) # idx_elmt += 1 # idx_frm += 1 #g.close() logging.info('write to h5') outh5dir = os.path.join(os.path.dirname(os.path.dirname(feat_file)), spk_src+"-"+args.spk_trg) if not os.path.exists(outh5dir): os.makedirs(outh5dir) feat_file = os.path.join(outh5dir, 
os.path.basename(feat_file)) # cv write_path = args.string_path logging.info(feat_file + ' ' + write_path) logging.info(feat_cv.shape) write_hdf5(feat_file, write_path, feat_cv) ## diffGVF0 #write_path = args.string_path+"_diffgvf0" #logging.info(feat_file + ' ' + write_path) #logging.info(feat_diffgv_anasyn.shape) #write_hdf5(feat_file, write_path, feat_diffgv_anasyn) count += 1
def gpu_decode(feat_list, gpu, cvlist=None, mcdpow_cvlist=None, mcdpowstd_cvlist=None, mcd_cvlist=None,
               mcdstd_cvlist=None, cvlist_dv=None, mcdpow_cvlist_dv=None, mcdpowstd_cvlist_dv=None,
               mcd_cvlist_dv=None, mcdstd_cvlist_dv=None,
               cvlist_cyc=None, mcdpow_cvlist_cyc=None, mcdpowstd_cvlist_cyc=None, mcd_cvlist_cyc=None,
               mcdstd_cvlist_cyc=None, cvlist_cyc_dv=None, mcdpow_cvlist_cyc_dv=None, mcdpowstd_cvlist_cyc_dv=None,
               mcd_cvlist_cyc_dv=None, mcdstd_cvlist_cyc_dv=None):
    """Reconstruct mcep features for each file in ``feat_list`` on GPU ``gpu``.

    For every feature file this runs encoder -> decoder -> post-net to get a
    same-speaker reconstruction (``*_rec``) and a second encode/decode pass on
    that reconstruction for a cyclic output (``*_cyc``), logs mel-cepstral
    distortion (MCD) statistics against the original mcep, and writes both
    reconstructed and cyclic feature matrices back to HDF5.

    The ``*_cvlist*`` parameters are shared accumulator lists (e.g.
    multiprocessing manager lists) that collect per-utterance statistics:
    ``*_dv`` variants collect the dev set, ``*_cyc`` variants the cyclic pass.
    They are appended to in place; nothing is returned.

    NOTE(review): relies on module-level names not defined here — ``config``,
    ``args``, ``n_spk``, ``spk_idx``, the model classes, and
    ``read_hdf5``/``write_hdf5`` — presumably set up in ``main``; confirm
    before reuse.
    """
    with torch.cuda.device(gpu):
        # define model and load parameters
        with torch.no_grad():
            model_encoder = GRU_VAE_ENCODER(
                in_dim=config.mcep_dim + config.excit_dim,
                n_spk=n_spk,
                lat_dim=config.lat_dim,
                hidden_layers=config.hidden_layers_enc,
                hidden_units=config.hidden_units_enc,
                kernel_size=config.kernel_size_enc,
                dilation_size=config.dilation_size_enc,
                causal_conv=config.causal_conv_enc,
                bi=False,
                ar=False,
                pad_first=True,
                right_size=config.right_size_enc)
            logging.info(model_encoder)
            model_decoder = GRU_SPEC_DECODER(
                feat_dim=config.lat_dim,
                out_dim=config.mcep_dim,
                n_spk=n_spk,
                hidden_layers=config.hidden_layers_dec,
                hidden_units=config.hidden_units_dec,
                kernel_size=config.kernel_size_dec,
                dilation_size=config.dilation_size_dec,
                causal_conv=config.causal_conv_dec,
                bi=False,
                ar=False,
                pad_first=True,
                right_size=config.right_size_dec)
            logging.info(model_decoder)
            # Post-net refines decoder output conditioned on speaker id and the
            # first two excitation dims (U/V flag + log-F0, per the e= slices below).
            model_post = GRU_POST_NET(
                spec_dim=config.mcep_dim,
                excit_dim=2,
                n_spk=n_spk,
                hidden_layers=config.hidden_layers_post,
                hidden_units=config.hidden_units_post,
                kernel_size=config.kernel_size_post,
                dilation_size=config.dilation_size_post,
                causal_conv=config.causal_conv_post,
                pad_first=True,
                right_size=config.right_size_post)
                #excit_dim=config.excit_dim,
                #excit_dim=None,
            logging.info(model_post)
            # NOTE(review): the checkpoint is torch.load-ed once per model below;
            # loading it once into a local would avoid redundant disk reads.
            model_encoder.load_state_dict(
                torch.load(args.model)["model_encoder"])
            model_decoder.load_state_dict(
                torch.load(args.model)["model_decoder"])
            model_post.load_state_dict(
                torch.load(args.model)["model_post"])
            model_encoder.remove_weight_norm()
            model_decoder.remove_weight_norm()
            model_post.remove_weight_norm()
            model_encoder.cuda()
            model_decoder.cuda()
            model_post.cuda()
            model_encoder.eval()
            model_decoder.eval()
            model_post.eval()
            # Freeze all parameters: this function is inference-only.
            for param in model_encoder.parameters():
                param.requires_grad = False
            for param in model_decoder.parameters():
                param.requires_grad = False
            for param in model_post.parameters():
                param.requires_grad = False
        count = 0
        # Total replicate-padding applied to the input so that every stage of the
        # two passes (enc->dec->post, then enc->dec again) sees full context;
        # the *2 covers the second (cyclic) pass through encoder/decoder.
        pad_left = (model_encoder.pad_left + model_decoder.pad_left
                    + model_post.pad_left) * 2
        pad_right = (model_encoder.pad_right + model_decoder.pad_right
                     + model_post.pad_right) * 2
        # outpad_*[k] = padding still remaining after stage k has consumed its
        # own receptive field: 0=enc, 1=dec, 2=post, 3=enc(2nd), 4=dec(2nd).
        outpad_lefts = [None] * 5
        outpad_rights = [None] * 5
        outpad_lefts[0] = pad_left - model_encoder.pad_left
        outpad_rights[0] = pad_right - model_encoder.pad_right
        outpad_lefts[1] = outpad_lefts[0] - model_decoder.pad_left
        outpad_rights[1] = outpad_rights[0] - model_decoder.pad_right
        outpad_lefts[2] = outpad_lefts[1] - model_post.pad_left
        outpad_rights[2] = outpad_rights[1] - model_post.pad_right
        outpad_lefts[3] = outpad_lefts[2] - model_encoder.pad_left
        outpad_rights[3] = outpad_rights[2] - model_encoder.pad_right
        outpad_lefts[4] = outpad_lefts[3] - model_decoder.pad_left
        outpad_rights[4] = outpad_rights[3] - model_decoder.pad_right
        logging.info(f'{pad_left} {pad_right}')
        logging.info(outpad_lefts)
        logging.info(outpad_rights)
        for feat_file in feat_list:
            # convert mcep
            logging.info("recmcep " + feat_file)
            feat_org = read_hdf5(feat_file, "/feat_mceplf0cap")
            logging.info(feat_org.shape)
            # Last mcep_dim columns are the mel-cepstrum; leading columns are
            # the excitation features (U/V, log-F0, codeap).
            mcep = np.array(feat_org[:, -config.mcep_dim:])

            with torch.no_grad():
                # (1, T+pad, D) input, replicate-padded along time.
                feat = F.pad(torch.FloatTensor(feat_org).cuda().unsqueeze(0).transpose(1, 2),
                             (pad_left, pad_right), "replicate").transpose(1, 2)
                feat_excit = torch.FloatTensor(feat_org[:, :config.excit_dim]).cuda().unsqueeze(0)

                spk_logits, _, lat_src, _ = model_encoder(feat, sampling=False)
                logging.info('input spkpost')
                # Log mean speaker posterior over the valid (unpadded) frames.
                if outpad_rights[0] > 0:
                    logging.info(torch.mean(F.softmax(
                        spk_logits[:, outpad_lefts[0]:-outpad_rights[0]], dim=-1), 1))
                else:
                    logging.info(torch.mean(F.softmax(
                        spk_logits[:, outpad_lefts[0]:], dim=-1), 1))

                # Decode with the SAME speaker id (spk_idx) -> reconstruction.
                cvmcep_src, _ = model_decoder(
                    (torch.ones((1, lat_src.shape[1])) * spk_idx).cuda().long(), lat_src)
                cvmcep_src_post, _ = model_post(
                    cvmcep_src,
                    y=(torch.ones((1, cvmcep_src.shape[1])) * spk_idx).cuda().long(),
                    e=F.pad(feat_excit[:, :, :2].transpose(1, 2),
                            (outpad_lefts[1], outpad_rights[1]), "replicate").transpose(1, 2))
                    #e=F.pad(feat_excit.transpose(1,2), (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2))

                # Second encoder pass on [excitation ; pre-postnet mcep] for the
                # cyclic latent; cvmcep_src is trimmed by the post-net pads so
                # its time axis lines up with the re-padded excitation.
                if model_post.pad_right > 0:
                    spk_logits, _, lat_rec, _ = model_encoder(torch.cat((
                        F.pad(feat_excit.transpose(1, 2),
                              (outpad_lefts[2], outpad_rights[2]), "replicate").transpose(1, 2),
                        cvmcep_src[:, model_post.pad_left:-model_post.pad_right]), 2),
                        sampling=False)
                else:
                    spk_logits, _, lat_rec, _ = model_encoder(torch.cat((
                        F.pad(feat_excit.transpose(1, 2),
                              (outpad_lefts[2], outpad_rights[2]), "replicate").transpose(1, 2),
                        cvmcep_src[:, model_post.pad_left:]), 2),
                        sampling=False)
                logging.info('rec spkpost')
                if outpad_rights[3] > 0:
                    logging.info(torch.mean(F.softmax(
                        spk_logits[:, outpad_lefts[3]:-outpad_rights[3]], dim=-1), 1))
                else:
                    logging.info(torch.mean(F.softmax(
                        spk_logits[:, outpad_lefts[3]:], dim=-1), 1))

                # Cyclic decode from the reconstruction latent.
                cvmcep_cyc, _ = model_decoder(
                    (torch.ones((1, lat_rec.shape[1])) * spk_idx).cuda().long(), lat_rec)
                cvmcep_cyc_post, _ = model_post(
                    cvmcep_cyc,
                    y=(torch.ones((1, cvmcep_cyc.shape[1])) * spk_idx).cuda().long(),
                    e=F.pad(feat_excit[:, :, :2].transpose(1, 2),
                            (outpad_lefts[4], outpad_rights[4]), "replicate").transpose(1, 2))
                    #e=F.pad(feat_excit.transpose(1,2), (outpad_lefts[4],outpad_rights[4]), "replicate").transpose(1,2))

                # Assemble [excitation ; reconstructed mcep] matrices for HDF5.
                if outpad_rights[2] > 0:
                    feat_rec = torch.cat((feat_excit,
                                          cvmcep_src_post[:, outpad_lefts[2]:-outpad_rights[2]]),
                                         2)[0].cpu().data.numpy()
                else:
                    feat_rec = torch.cat((feat_excit,
                                          cvmcep_src_post[:, outpad_lefts[2]:]),
                                         2)[0].cpu().data.numpy()
                feat_cyc = torch.cat((feat_excit, cvmcep_cyc_post), 2)[0].cpu().data.numpy()
                if outpad_rights[2] > 0:
                    cvmcep_src = np.array(
                        cvmcep_src_post[:, outpad_lefts[2]:-outpad_rights[2]][0].cpu().data.numpy(),
                        dtype=np.float64)
                else:
                    cvmcep_src = np.array(
                        cvmcep_src_post[:, outpad_lefts[2]:][0].cpu().data.numpy(),
                        dtype=np.float64)
                cvmcep_cyc = np.array(cvmcep_cyc_post[0].cpu().data.numpy(), dtype=np.float64)

            logging.info(cvmcep_src.shape)
            logging.info(cvmcep_cyc.shape)

            # MCD over speech frames only (spcidx); first dim included for
            # "pow" variants, excluded (1:) for the power-independent ones.
            spcidx = read_hdf5(feat_file, "/spcidx_range")[0]
            _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(
                np.array(cvmcep_src[np.array(spcidx), :], dtype=np.float64),
                np.array(mcep[np.array(spcidx), :], dtype=np.float64))
            _, _, _, mcd_arr = dtw.dtw_org_to_trg(
                np.array(cvmcep_src[np.array(spcidx), 1:], dtype=np.float64),
                np.array(mcep[np.array(spcidx), 1:], dtype=np.float64))
            mcdpow_mean = np.mean(mcdpow_arr)
            mcdpow_std = np.std(mcdpow_arr)
            mcd_mean = np.mean(mcd_arr)
            mcd_std = np.std(mcd_arr)
            logging.info("mcdpow_rec: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std))
            logging.info("mcd_rec: %.6f dB +- %.6f" % (mcd_mean, mcd_std))
            _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(
                np.array(cvmcep_cyc[np.array(spcidx), :], dtype=np.float64),
                np.array(mcep[np.array(spcidx), :], dtype=np.float64))
            _, _, _, mcd_arr = dtw.dtw_org_to_trg(
                np.array(cvmcep_cyc[np.array(spcidx), 1:], dtype=np.float64),
                np.array(mcep[np.array(spcidx), 1:], dtype=np.float64))
            mcdpow_mean_cyc = np.mean(mcdpow_arr)
            mcdpow_std_cyc = np.std(mcdpow_arr)
            mcd_mean_cyc = np.mean(mcd_arr)
            mcd_std_cyc = np.std(mcd_arr)
            logging.info("mcdpow_cyc: %.6f dB +- %.6f" % (mcdpow_mean_cyc, mcdpow_std_cyc))
            logging.info("mcd_cyc: %.6f dB +- %.6f" % (mcd_mean_cyc, mcd_std_cyc))

            # Route statistics into train vs dev accumulators based on the
            # dataset token in the path (assumes layout like data/<set>_.../...;
            # NOTE(review): path-component index 1 is convention-dependent).
            dataset = feat_file.split('/')[1].split('_')[0]
            if 'tr' in dataset:
                logging.info('trn')
                mcdpow_cvlist.append(mcdpow_mean)
                mcdpowstd_cvlist.append(mcdpow_std)
                mcd_cvlist.append(mcd_mean)
                mcdstd_cvlist.append(mcd_std)
                cvlist.append(np.var(cvmcep_src[:, 1:], axis=0))
                logging.info(len(cvlist))
                mcdpow_cvlist_cyc.append(mcdpow_mean_cyc)
                mcdpowstd_cvlist_cyc.append(mcdpow_std_cyc)
                mcd_cvlist_cyc.append(mcd_mean_cyc)
                mcdstd_cvlist_cyc.append(mcd_std_cyc)
                cvlist_cyc.append(np.var(cvmcep_cyc[:, 1:], axis=0))
            elif 'dv' in dataset:
                logging.info('dev')
                mcdpow_cvlist_dv.append(mcdpow_mean)
                mcdpowstd_cvlist_dv.append(mcdpow_std)
                mcd_cvlist_dv.append(mcd_mean)
                mcdstd_cvlist_dv.append(mcd_std)
                cvlist_dv.append(np.var(cvmcep_src[:, 1:], axis=0))
                logging.info(len(cvlist_dv))
                mcdpow_cvlist_cyc_dv.append(mcdpow_mean_cyc)
                mcdpowstd_cvlist_cyc_dv.append(mcdpow_std_cyc)
                mcd_cvlist_cyc_dv.append(mcd_mean_cyc)
                mcdstd_cvlist_cyc_dv.append(mcd_std_cyc)
                cvlist_cyc_dv.append(np.var(cvmcep_cyc[:, 1:], axis=0))

            # Write reconstruction under <root>/<spk>-<spk>/<basename>.
            logging.info('write rec to h5')
            outh5dir = os.path.join(os.path.dirname(os.path.dirname(feat_file)),
                                    args.spk + "-" + args.spk)
            if not os.path.exists(outh5dir):
                os.makedirs(outh5dir)
            feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
            logging.info(feat_file + ' ' + args.string_path)
            logging.info(feat_rec.shape)
            write_hdf5(feat_file, args.string_path, feat_rec)
            # Write cyclic output under <root>/<spk>-<spk>-<spk>/<basename>.
            # (feat_file was rebound above, so dirname is taken from the rec path.)
            logging.info('write cyc to h5')
            outh5dir = os.path.join(os.path.dirname(os.path.dirname(feat_file)),
                                    args.spk + "-" + args.spk + "-" + args.spk)
            if not os.path.exists(outh5dir):
                os.makedirs(outh5dir)
            feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
            logging.info(feat_file + ' ' + args.string_path)
            logging.info(feat_cyc.shape)
            write_hdf5(feat_file, args.string_path, feat_cyc)
            count += 1
def gpu_decode(feat_list, gpu, cvlist=None,
               lsd_cvlist=None, lsdstd_cvlist=None,
               cvlist_dv=None, lsd_cvlist_dv=None, lsdstd_cvlist_dv=None,
               f0rmse_cvlist=None, f0corr_cvlist=None, caprmse_cvlist=None,
               f0rmse_cvlist_dv=None, f0corr_cvlist_dv=None, caprmse_cvlist_dv=None,
               cvlist_cyc=None, lsd_cvlist_cyc=None, lsdstd_cvlist_cyc=None,
               cvlist_cyc_dv=None, lsd_cvlist_cyc_dv=None, lsdstd_cvlist_cyc_dv=None,
               f0rmse_cvlist_cyc=None, f0corr_cvlist_cyc=None, caprmse_cvlist_cyc=None,
               f0rmse_cvlist_cyc_dv=None, f0corr_cvlist_cyc_dv=None, caprmse_cvlist_cyc_dv=None):
    """Reconstruct mel-spectrogram features for ``feat_list`` on GPU ``gpu``.

    Uses separate VAE encoders for spectrum and excitation, decodes F0/codeap
    with the excitation decoder and mel-spectra with the spectral decoder
    (conditioned on the concatenated latents), then re-encodes the
    reconstruction for a cyclic pass. Per-utterance log-spectral distortion
    (LSD), F0 RMSE/correlation, and code-aperiodicity RMSE are logged and
    appended to the shared ``*_cvlist*`` accumulators (train vs ``*_dv`` dev,
    direct vs ``*_cyc`` cyclic). Reconstructed and cyclic mel-spectra are
    written back to HDF5. Nothing is returned.

    NOTE(review): depends on module-level ``config``, ``args``, ``n_spk``,
    ``spk_idx``, the model classes, and ``read_hdf5``/``write_hdf5`` —
    presumably set up in ``main``; confirm before reuse.
    """
    with torch.cuda.device(gpu):
        # define model and load parameters
        with torch.no_grad():
            model_encoder_melsp = GRU_VAE_ENCODER(
                in_dim=config.mel_dim,
                n_spk=n_spk,
                lat_dim=config.lat_dim,
                hidden_layers=config.hidden_layers_enc,
                hidden_units=config.hidden_units_enc,
                kernel_size=config.kernel_size_enc,
                dilation_size=config.dilation_size_enc,
                causal_conv=config.causal_conv_enc,
                bi=False,
                ar=False,
                pad_first=True,
                right_size=config.right_size_enc)
            logging.info(model_encoder_melsp)
            # Spectral decoder consumes both latents (spec + excit) and is
            # additionally conditioned on decoded excitation features (e=...).
            model_decoder_melsp = GRU_SPEC_DECODER(
                feat_dim=config.lat_dim + config.lat_dim_e,
                excit_dim=config.excit_dim,
                out_dim=config.mel_dim,
                n_spk=n_spk,
                hidden_layers=config.hidden_layers_dec,
                hidden_units=config.hidden_units_dec,
                kernel_size=config.kernel_size_dec,
                dilation_size=config.dilation_size_dec,
                causal_conv=config.causal_conv_dec,
                bi=False,
                ar=False,
                pad_first=True,
                right_size=config.right_size_dec)
            logging.info(model_decoder_melsp)
            model_encoder_excit = GRU_VAE_ENCODER(
                in_dim=config.mel_dim,
                n_spk=n_spk,
                lat_dim=config.lat_dim_e,
                hidden_layers=config.hidden_layers_enc,
                hidden_units=config.hidden_units_enc,
                kernel_size=config.kernel_size_enc,
                dilation_size=config.dilation_size_enc,
                causal_conv=config.causal_conv_enc,
                bi=False,
                ar=False,
                pad_first=True,
                right_size=config.right_size_enc)
            logging.info(model_encoder_excit)
            model_decoder_excit = GRU_EXCIT_DECODER(
                feat_dim=config.lat_dim_e,
                cap_dim=config.cap_dim,
                n_spk=n_spk,
                hidden_layers=config.hidden_layers_lf0,
                hidden_units=config.hidden_units_lf0,
                kernel_size=config.kernel_size_lf0,
                dilation_size=config.dilation_size_lf0,
                causal_conv=config.causal_conv_lf0,
                bi=False,
                ar=False,
                pad_first=True,
                right_size=config.right_size_lf0)
            logging.info(model_decoder_excit)
            # Optional low-dim speaker-embedding transform.
            if (config.spkidtr_dim > 0):
                model_spkidtr = SPKID_TRANSFORM_LAYER(
                    n_spk=n_spk,
                    spkidtr_dim=config.spkidtr_dim)
                logging.info(model_spkidtr)
            model_encoder_melsp.load_state_dict(
                torch.load(args.model)["model_encoder_melsp"])
            model_decoder_melsp.load_state_dict(
                torch.load(args.model)["model_decoder_melsp"])
            model_encoder_excit.load_state_dict(
                torch.load(args.model)["model_encoder_excit"])
            model_decoder_excit.load_state_dict(
                torch.load(args.model)["model_decoder_excit"])
            if (config.spkidtr_dim > 0):
                model_spkidtr.load_state_dict(
                    torch.load(args.model)["model_spkidtr"])
            model_encoder_melsp.cuda()
            model_decoder_melsp.cuda()
            model_encoder_excit.cuda()
            model_decoder_excit.cuda()
            if (config.spkidtr_dim > 0):
                model_spkidtr.cuda()
            model_encoder_melsp.eval()
            model_decoder_melsp.eval()
            model_encoder_excit.eval()
            model_decoder_excit.eval()
            if (config.spkidtr_dim > 0):
                model_spkidtr.eval()
            # Freeze all parameters: inference only.
            for param in model_encoder_melsp.parameters():
                param.requires_grad = False
            for param in model_decoder_melsp.parameters():
                param.requires_grad = False
            for param in model_encoder_excit.parameters():
                param.requires_grad = False
            for param in model_decoder_excit.parameters():
                param.requires_grad = False
            if (config.spkidtr_dim > 0):
                for param in model_spkidtr.parameters():
                    param.requires_grad = False
        count = 0
        # Total input padding for the chain enc_melsp -> dec_excit -> dec_melsp,
        # doubled to also cover the second (cyclic) pass.
        pad_left = (model_encoder_melsp.pad_left + model_decoder_excit.pad_left
                    + model_decoder_melsp.pad_left) * 2
        pad_right = (model_encoder_melsp.pad_right + model_decoder_excit.pad_right
                     + model_decoder_melsp.pad_right) * 2
        # outpad_*[k] = padding remaining after stage k:
        # 0=enc_melsp, 1=dec_excit, 2=dec_melsp, 3=enc_melsp(2nd), 4=dec_excit(2nd).
        outpad_lefts = [None] * 5
        outpad_rights = [None] * 5
        outpad_lefts[0] = pad_left - model_encoder_melsp.pad_left
        outpad_rights[0] = pad_right - model_encoder_melsp.pad_right
        outpad_lefts[1] = outpad_lefts[0] - model_decoder_excit.pad_left
        outpad_rights[1] = outpad_rights[0] - model_decoder_excit.pad_right
        outpad_lefts[2] = outpad_lefts[1] - model_decoder_melsp.pad_left
        outpad_rights[2] = outpad_rights[1] - model_decoder_melsp.pad_right
        outpad_lefts[3] = outpad_lefts[2] - model_encoder_melsp.pad_left
        outpad_rights[3] = outpad_rights[2] - model_encoder_melsp.pad_right
        outpad_lefts[4] = outpad_lefts[3] - model_decoder_excit.pad_left
        outpad_rights[4] = outpad_rights[3] - model_decoder_excit.pad_right
        for feat_file in feat_list:
            # reconst. melsp
            logging.info("recmelsp " + feat_file)
            feat_org = read_hdf5(feat_file, "/log_1pmelmagsp")
            logging.info(feat_org.shape)

            with torch.no_grad():
                # (1, T+pad, mel_dim), replicate-padded along time.
                feat = F.pad(torch.FloatTensor(feat_org).cuda().unsqueeze(0).transpose(1, 2),
                             (pad_left, pad_right), "replicate").transpose(1, 2)

                # Both encoders read the same mel input; one yields the
                # spectral latent, the other the excitation latent.
                spk_logits, _, lat_src, _ = model_encoder_melsp(feat, sampling=False)
                spk_logits_e, _, lat_src_e, _ = model_encoder_excit(feat, sampling=False)
                logging.info('input spkpost')
                # Mean speaker posterior over valid (unpadded) frames.
                if outpad_rights[0] > 0:
                    logging.info(torch.mean(F.softmax(
                        spk_logits[:, outpad_lefts[0]:-outpad_rights[0]], dim=-1), 1))
                else:
                    logging.info(torch.mean(F.softmax(
                        spk_logits[:, outpad_lefts[0]:], dim=-1), 1))
                logging.info('input spkpost_e')
                if outpad_rights[0] > 0:
                    logging.info(torch.mean(F.softmax(
                        spk_logits_e[:, outpad_lefts[0]:-outpad_rights[0]], dim=-1), 1))
                else:
                    logging.info(torch.mean(F.softmax(
                        spk_logits_e[:, outpad_lefts[0]:], dim=-1), 1))

                # Same-speaker code (spk_idx), optionally via the embedding
                # transform; decode excitation (log-F0 / codeap) first.
                if config.spkidtr_dim > 0:
                    src_code = model_spkidtr((torch.ones(
                        (1, lat_src_e.shape[1])) * spk_idx).cuda().long())
                else:
                    src_code = (torch.ones(
                        (1, lat_src_e.shape[1])) * spk_idx).cuda().long()
                cvlf0_src, _ = model_decoder_excit(src_code, lat_src_e)

                # Concatenate [excit latent ; spec latent], trimmed by the
                # excitation decoder's pads so time axes align.
                if model_decoder_excit.pad_right > 0:
                    lat_cat = torch.cat(
                        (lat_src_e[:, model_decoder_excit.pad_left:-model_decoder_excit.pad_right],
                         lat_src[:, model_decoder_excit.pad_left:-model_decoder_excit.pad_right]), 2)
                else:
                    lat_cat = torch.cat(
                        (lat_src_e[:, model_decoder_excit.pad_left:],
                         lat_src[:, model_decoder_excit.pad_left:]), 2)
                if config.spkidtr_dim > 0:
                    src_code = model_spkidtr((torch.ones(
                        (1, lat_cat.shape[1])) * spk_idx).cuda().long())
                else:
                    src_code = (torch.ones(
                        (1, lat_cat.shape[1])) * spk_idx).cuda().long()
                cvmelsp_src, _ = model_decoder_melsp(
                    lat_cat, y=src_code, e=cvlf0_src[:, :, :config.excit_dim])

                # Cyclic pass: re-encode the reconstructed mel-spectrogram.
                spk_logits, _, lat_rec, _ = model_encoder_melsp(cvmelsp_src, sampling=False)
                spk_logits_e, _, lat_rec_e, _ = model_encoder_excit(cvmelsp_src, sampling=False)
                logging.info('rec spkpost')
                if outpad_rights[3] > 0:
                    logging.info(torch.mean(F.softmax(
                        spk_logits[:, outpad_lefts[3]:-outpad_rights[3]], dim=-1), 1))
                else:
                    logging.info(torch.mean(F.softmax(
                        spk_logits[:, outpad_lefts[3]:], dim=-1), 1))
                logging.info('rec spkpost_e')
                if outpad_rights[3] > 0:
                    logging.info(torch.mean(F.softmax(
                        spk_logits_e[:, outpad_lefts[3]:-outpad_rights[3]], dim=-1), 1))
                else:
                    logging.info(torch.mean(F.softmax(
                        spk_logits_e[:, outpad_lefts[3]:], dim=-1), 1))

                if config.spkidtr_dim > 0:
                    src_code = model_spkidtr((torch.ones(
                        (1, lat_rec_e.shape[1])) * spk_idx).cuda().long())
                else:
                    src_code = (torch.ones(
                        (1, lat_rec_e.shape[1])) * spk_idx).cuda().long()
                cvlf0_cyc, _ = model_decoder_excit(src_code, lat_rec_e)
                if model_decoder_excit.pad_right > 0:
                    lat_cat = torch.cat(
                        (lat_rec_e[:, model_decoder_excit.pad_left:-model_decoder_excit.pad_right],
                         lat_rec[:, model_decoder_excit.pad_left:-model_decoder_excit.pad_right]), 2)
                else:
                    lat_cat = torch.cat(
                        (lat_rec_e[:, model_decoder_excit.pad_left:],
                         lat_rec[:, model_decoder_excit.pad_left:]), 2)
                if config.spkidtr_dim > 0:
                    src_code = model_spkidtr((torch.ones(
                        (1, lat_cat.shape[1])) * spk_idx).cuda().long())
                else:
                    src_code = (torch.ones(
                        (1, lat_cat.shape[1])) * spk_idx).cuda().long()
                cvmelsp_cyc, _ = model_decoder_melsp(
                    lat_cat, y=src_code, e=cvlf0_cyc[:, :, :config.excit_dim])

                # Strip remaining padding from each output (cvmelsp_cyc needs
                # none: the cyclic chain consumed all of it).
                if outpad_rights[1] > 0:
                    cvlf0_src = cvlf0_src[:, outpad_lefts[1]:-outpad_rights[1]]
                else:
                    cvlf0_src = cvlf0_src[:, outpad_lefts[1]:]
                if outpad_rights[2] > 0:
                    cvmelsp_src = cvmelsp_src[:, outpad_lefts[2]:-outpad_rights[2]]
                else:
                    cvmelsp_src = cvmelsp_src[:, outpad_lefts[2]:]
                if outpad_rights[4] > 0:
                    cvlf0_cyc = cvlf0_cyc[:, outpad_lefts[4]:-outpad_rights[4]]
                else:
                    cvlf0_cyc = cvlf0_cyc[:, outpad_lefts[4]:]

                feat_rec = cvmelsp_src[0].cpu().data.numpy()
                feat_cyc = cvmelsp_cyc[0].cpu().data.numpy()

                cvmelsp_src = np.array(cvmelsp_src[0].cpu().data.numpy(), dtype=np.float64)
                cvlf0_src = np.array(cvlf0_src[0].cpu().data.numpy(), dtype=np.float64)
                cvmelsp_cyc = np.array(cvmelsp_cyc[0].cpu().data.numpy(), dtype=np.float64)
                cvlf0_cyc = np.array(cvlf0_cyc[0].cpu().data.numpy(), dtype=np.float64)

            logging.info(cvlf0_src.shape)
            logging.info(cvmelsp_src.shape)
            logging.info(cvlf0_cyc.shape)
            logging.info(cvmelsp_cyc.shape)

            melsp = np.array(feat_org)

            # Reference excitation from the WORLD feature file:
            # col 0 = U/V flag, col 1 = log-F0, cols 2.. = codeap (U/V + log mag).
            feat_world = read_hdf5(feat_file, "/feat_mceplf0cap")
            f0 = np.array(np.rint(feat_world[:, 0]) * np.exp(feat_world[:, 1]))
            codeap = np.array(np.rint(feat_world[:, 2:3])
                              * (-np.exp(feat_world[:, 3:config.full_excit_dim])))

            # Decoded excitation uses the same layout; unvoiced frames get 0
            # via the rounded U/V gate.
            cvf0_src = np.array(np.rint(cvlf0_src[:, 0]) * np.exp(cvlf0_src[:, 1]))
            cvcodeap_src = np.array(np.rint(cvlf0_src[:, 2:3]) * (-np.exp(cvlf0_src[:, 3:])))
            f0_rmse = np.sqrt(np.mean((cvf0_src - f0)**2))
            logging.info('F0_rmse_rec: %lf Hz' % (f0_rmse))
            cvf0_src_mean = np.mean(cvf0_src)
            f0_mean = np.mean(f0)
            # Pearson correlation between decoded and reference F0 tracks.
            f0_corr = np.sum((cvf0_src-cvf0_src_mean)*(f0-f0_mean))/\
                        (np.sqrt(np.sum((cvf0_src-cvf0_src_mean)**2))*np.sqrt(np.sum((f0-f0_mean)**2)))
            logging.info('F0_corr_rec: %lf' % (f0_corr))

            codeap_rmse = np.sqrt(np.mean((cvcodeap_src - codeap)**2, axis=0))
            for i in range(codeap_rmse.shape[-1]):
                logging.info('codeap-%d_rmse_rec: %lf dB' % (i + 1, codeap_rmse[i]))

            cvf0_cyc = np.array(np.rint(cvlf0_cyc[:, 0]) * np.exp(cvlf0_cyc[:, 1]))
            cvcodeap_cyc = np.array(np.rint(cvlf0_cyc[:, 2:3]) * (-np.exp(cvlf0_cyc[:, 3:])))
            f0_rmse_cyc = np.sqrt(np.mean((cvf0_cyc - f0)**2))
            logging.info('F0_rmse_cyc: %lf Hz' % (f0_rmse_cyc))
            cvf0_cyc_mean = np.mean(cvf0_cyc)
            f0_mean = np.mean(f0)
            f0_corr_cyc = np.sum((cvf0_cyc-cvf0_cyc_mean)*(f0-f0_mean))/\
                        (np.sqrt(np.sum((cvf0_cyc-cvf0_cyc_mean)**2))*np.sqrt(np.sum((f0-f0_mean)**2)))
            logging.info('F0_corr_cyc: %lf' % (f0_corr_cyc))

            codeap_rmse_cyc = np.sqrt(np.mean((cvcodeap_cyc - codeap)**2, axis=0))
            for i in range(codeap_rmse_cyc.shape[-1]):
                logging.info('codeap-%d_rmse_cyc: %lf dB' % (i + 1, codeap_rmse_cyc[i]))

            spcidx = np.array(read_hdf5(feat_file, "/spcidx_range")[0])

            # Invert the log(1 + 10000*m) compression before computing LSD
            # over speech frames only.
            melsp_rest = (np.exp(melsp) - 1) / 10000
            melsp_src_rest = (np.exp(cvmelsp_src) - 1) / 10000
            melsp_cyc_rest = (np.exp(cvmelsp_cyc) - 1) / 10000

            lsd_arr = np.sqrt(np.mean((20*(np.log10(np.clip(melsp_src_rest[spcidx], a_min=1e-16, a_max=None))\
                                            -np.log10(np.clip(melsp_rest[spcidx], a_min=1e-16, a_max=None))))**2, axis=-1))
            lsd_mean = np.mean(lsd_arr)
            lsd_std = np.std(lsd_arr)
            logging.info("lsd_rec: %.6f dB +- %.6f" % (lsd_mean, lsd_std))
            lsd_arr = np.sqrt(np.mean((20*(np.log10(np.clip(melsp_cyc_rest[spcidx], a_min=1e-16, a_max=None))\
                                            -np.log10(np.clip(melsp_rest[spcidx], a_min=1e-16, a_max=None))))**2, axis=-1))
            lsd_mean_cyc = np.mean(lsd_arr)
            lsd_std_cyc = np.std(lsd_arr)
            logging.info("lsd_cyc: %.6f dB +- %.6f" % (lsd_mean_cyc, lsd_std_cyc))

            # Spot-check a few frames of F0 and codeap for eyeballing logs.
            logging.info('org f0')
            logging.info(f0[10:15])
            logging.info('rec f0')
            logging.info(cvf0_src[10:15])
            logging.info('cyc f0')
            logging.info(cvf0_cyc[10:15])
            logging.info('org cap')
            logging.info(codeap[10:15])
            logging.info('rec cap')
            logging.info(cvcodeap_src[10:15])
            logging.info('cyc cap')
            logging.info(cvcodeap_cyc[10:15])

            # Route stats into train vs dev accumulators from the path token
            # (NOTE(review): index 1 assumes a data/<set>_... layout; verify).
            dataset = feat_file.split('/')[1].split('_')[0]
            if 'tr' in dataset:
                logging.info('trn')
                f0rmse_cvlist.append(f0_rmse)
                f0corr_cvlist.append(f0_corr)
                caprmse_cvlist.append(codeap_rmse)
                lsd_cvlist.append(lsd_mean)
                lsdstd_cvlist.append(lsd_std)
                cvlist.append(np.var(melsp_src_rest, axis=0))
                logging.info(len(cvlist))
                f0rmse_cvlist_cyc.append(f0_rmse_cyc)
                f0corr_cvlist_cyc.append(f0_corr_cyc)
                caprmse_cvlist_cyc.append(codeap_rmse_cyc)
                lsd_cvlist_cyc.append(lsd_mean_cyc)
                lsdstd_cvlist_cyc.append(lsd_std_cyc)
                cvlist_cyc.append(np.var(melsp_cyc_rest, axis=0))
            elif 'dv' in dataset:
                logging.info('dev')
                f0rmse_cvlist_dv.append(f0_rmse)
                f0corr_cvlist_dv.append(f0_corr)
                caprmse_cvlist_dv.append(codeap_rmse)
                lsd_cvlist_dv.append(lsd_mean)
                lsdstd_cvlist_dv.append(lsd_std)
                cvlist_dv.append(np.var(melsp_src_rest, axis=0))
                logging.info(len(cvlist_dv))
                f0rmse_cvlist_cyc_dv.append(f0_rmse_cyc)
                f0corr_cvlist_cyc_dv.append(f0_corr_cyc)
                caprmse_cvlist_cyc_dv.append(codeap_rmse_cyc)
                lsd_cvlist_cyc_dv.append(lsd_mean_cyc)
                lsdstd_cvlist_cyc_dv.append(lsd_std_cyc)
                cvlist_cyc_dv.append(np.var(melsp_cyc_rest, axis=0))

            # Write reconstruction under <root>/<spk>-<spk>/<basename>.
            logging.info('write rec to h5')
            outh5dir = os.path.join(os.path.dirname(os.path.dirname(feat_file)),
                                    args.spk + "-" + args.spk)
            if not os.path.exists(outh5dir):
                os.makedirs(outh5dir)
            feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
            logging.info(feat_file + ' ' + args.string_path)
            logging.info(feat_rec.shape)
            write_hdf5(feat_file, args.string_path, feat_rec)
            # Write cyclic output under <root>/<spk>-<spk>-<spk>/<basename>.
            # (feat_file was rebound above, so dirname comes from the rec path.)
            logging.info('write cyc to h5')
            outh5dir = os.path.join(os.path.dirname(os.path.dirname(feat_file)),
                                    args.spk + "-" + args.spk + "-" + args.spk)
            if not os.path.exists(outh5dir):
                os.makedirs(outh5dir)
            feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
            logging.info(feat_file + ' ' + args.string_path)
            logging.info(feat_cyc.shape)
            write_hdf5(feat_file, args.string_path, feat_cyc)
            count += 1
def _scatter_spk(z, spk_list, colors, path):
    """Scatter-plot the 2-D speaker codes in z[0], one annotated point per speaker."""
    x = z[0, :, 0]
    y = z[0, :, 1]
    fig, ax = plt.subplots()
    ax.scatter(x, y, s=40, c=colors)
    for i, txt in enumerate(spk_list):
        ax.annotate(txt, (x[i], y[i]))
    plt.savefig(path)
    plt.close()


def main():
    """Plot the 2-D speaker-embedding spaces of trained decoders.

    Loads the spectral (mcep) and excitation decoder weights on CPU, pushes
    every speaker id through each decoder's speaker-id transform conv, and
    scatter-plots the resulting 2-D codes (male=blue, female=red) to
    ``spect.png`` / ``excit.png`` under ``--outdir``.
    """
    parser = argparse.ArgumentParser()
    # decode setting
    parser.add_argument("--model", required=True, type=str,
                        help="GRU_RNN model file")
    parser.add_argument("--config", required=True, type=str,
                        help="GRU_RNN configure file")
    parser.add_argument("--outdir", required=True, type=str,
                        help="directory to save generated samples")
    # other setting
    parser.add_argument("--verbose", default=VERBOSE, type=int,
                        help="log level")
    args = parser.parse_args()

    # force CPU execution; GPU selection is deliberately disabled here
    os.environ["CUDA_VISIBLE_DEVICES"] = ""

    # check directory existence
    if not os.path.exists(args.outdir):
        os.makedirs(args.outdir)

    # set log level
    # FIX: the original tested `verbose > 0` before `verbose > 1`, which made
    # the DEBUG branch unreachable; test the stricter condition first.
    if args.verbose > 1:
        level = logging.DEBUG
    elif args.verbose > 0:
        level = logging.INFO
    else:
        level = logging.WARN
    logging.basicConfig(
        level=level,
        format='%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s',
        datefmt='%m/%d/%Y %I:%M:%S',
        filemode='w',
        filename=args.outdir + "/decode.log")
    logging.getLogger().addHandler(logging.StreamHandler())
    if args.verbose <= 0:
        # logging.warn() is a deprecated alias of warning()
        logging.warning("logging is disabled.")

    # load config
    config = torch.load(args.config)
    spk_list = config.spk_list.split('@')
    n_spk = len(spk_list)
    # checkpoint filenames look like "<name>-<epoch>.<ext>"
    model_epoch = os.path.basename(args.model).split('.')[0].split('-')[1]
    logging.info('epoch: ' + model_epoch)

    device = torch.device("cpu")
    # define model and load parameters
    with torch.no_grad():
        model_decoder_mcep = GRU_SPEC_DECODER(
            feat_dim=config.lat_dim,
            out_dim=config.mcep_dim,
            n_spk=n_spk,
            hidden_layers=config.hidden_layers_dec,
            hidden_units=config.hidden_units_dec,
            kernel_size=config.kernel_size_dec,
            dilation_size=config.dilation_size_dec,
            causal_conv=config.causal_conv_dec,
            bi=config.bi_dec,
            spkidtr_dim=config.spkidtr_dim,
            pad_first=True,
            ar=config.ar_dec)
        logging.info(model_decoder_mcep)
        model_decoder_excit = GRU_EXCIT_DECODER(
            feat_dim=config.lat_dim,
            cap_dim=config.cap_dim,
            n_spk=n_spk,
            hidden_layers=config.hidden_layers_dec,
            hidden_units=config.hidden_units_dec,
            kernel_size=config.kernel_size_dec,
            dilation_size=config.dilation_size_dec,
            causal_conv=config.causal_conv_dec,
            bi=config.bi_dec,
            spkidtr_dim=config.spkidtr_dim,
            pad_first=True,
            ar=config.ar_dec)
        logging.info(model_decoder_excit)
        # load the checkpoint once and reuse it for both decoders
        # (original deserialized the same file twice)
        checkpoint = torch.load(args.model, map_location=device)
        model_decoder_mcep.load_state_dict(checkpoint["model_decoder_mcep"])
        model_decoder_excit.load_state_dict(checkpoint["model_decoder_excit"])
        model_decoder_mcep.eval()
        model_decoder_excit.eval()
        for param in model_decoder_mcep.parameters():
            param.requires_grad = False
        for param in model_decoder_excit.parameters():
            param.requires_grad = False

        # one batch containing every speaker index
        feat = torch.LongTensor(np.arange(n_spk)).unsqueeze(0)
        logging.info(feat)
        logging.info(spk_list)

        colormap = np.array(['b', 'r'])  # index 0 -> male (blue), 1 -> female (red)
        male = ['bdl', 'p237', 'p245', 'p251', 'p252', 'p259', 'p274', 'p304',
                'p311', 'p326', 'p345', 'p360', 'p363',
                'SEM1', 'SEM2', 'TFM1', 'TGM1', 'TMM1', 'TEM1', 'TEM2',
                'VCC2SM1', 'VCC2SM2', 'VCC2SM3', 'VCC2TM1', 'VCC2TM2', 'VCC2SM4']
        female = ['slt', 'p231', 'p238', 'p248', 'p253', 'p264', 'p265', 'p266',
                  'p276', 'p305', 'p308', 'p318', 'p335',
                  'SEF1', 'SEF2', 'TEF1', 'TEF2', 'TFF1', 'TGF1', 'TMF1',
                  'VCC2SF1', 'VCC2SF2', 'VCC2SF3', 'VCC2TF1', 'VCC2TF2', 'VCC2SF4']
        gender = []
        for spk in spk_list:
            if spk in male:
                gender.append(0)
            elif spk in female:
                gender.append(1)
            else:
                logging.info('error %s not in gender list' % (spk))
                exit()

        # 2-D speaker codes of the spectral decoder
        z = model_decoder_mcep.spkidtr_conv(
            F.one_hot(feat, num_classes=n_spk).float().transpose(1, 2)).transpose(1, 2)
        logging.info(z)
        logging.info(args.outdir)
        plt.rcParams["figure.figsize"] = (11.25, 11.25)  # 1080x1080
        z = z.data.numpy()
        logging.info(z.shape)
        _scatter_spk(z, spk_list, colormap[gender],
                     os.path.join(args.outdir, 'spect.png'))

        # 2-D speaker codes of the excitation decoder
        z_e = model_decoder_excit.spkidtr_conv(
            F.one_hot(feat, num_classes=n_spk).float().transpose(1, 2)).transpose(1, 2)
        logging.info(z_e)
        z_e = z_e.data.numpy()
        _scatter_spk(z_e, spk_list, colormap[gender],
                     os.path.join(args.outdir, 'excit.png'))
def decode_RNN(feat_list, gpu, cvlist=None,
               lsd_cvlist_src=None, lsdstd_cvlist_src=None,
               lsd_cvlist_cyc=None, lsdstd_cvlist_cyc=None,
               lsd_cvlist=None, lsdstd_cvlist=None,
               lat_dist_rmse_list=None, lat_dist_cosim_list=None):
    """Convert every mel-spectrogram file in feat_list to speaker args.spk_trg.

    Per file: encodes spectral and excitation latents, decodes a source-code
    reconstruction, a target-code conversion and a cyclic reconstruction;
    accumulates LSD and latent-distance metrics into the given list arguments
    (presumably multiprocessing-shared lists — confirm at the caller);
    synthesizes Griffin-Lim waveforms; and writes converted features to HDF5.

    NOTE(review): relies on module-level globals (config, args, n_spk,
    src_idx, trg_idx, librosa, dtw, read_hdf5, write_hdf5, sf) that are set
    up elsewhere in this file — verify before reusing standalone.
    """
    with torch.cuda.device(gpu):
        # define model and load parameters
        with torch.no_grad():
            model_encoder_melsp = GRU_VAE_ENCODER(
                in_dim=config.mel_dim,
                n_spk=n_spk,
                lat_dim=config.lat_dim,
                hidden_layers=config.hidden_layers_enc,
                hidden_units=config.hidden_units_enc,
                kernel_size=config.kernel_size_enc,
                dilation_size=config.dilation_size_enc,
                causal_conv=config.causal_conv_enc,
                pad_first=True,
                right_size=config.right_size_enc)
            logging.info(model_encoder_melsp)
            # decoder consumes the concatenated excitation+spectral latents
            model_decoder_melsp = GRU_SPEC_DECODER(
                feat_dim=config.lat_dim + config.lat_dim_e,
                out_dim=config.mel_dim,
                n_spk=(config.emb_spk_dim // config.n_weight_emb) * config.n_weight_emb,
                hidden_layers=config.hidden_layers_dec,
                hidden_units=config.hidden_units_dec,
                kernel_size=config.kernel_size_dec,
                dilation_size=config.dilation_size_dec,
                causal_conv=config.causal_conv_dec,
                pad_first=True,
                right_size=config.right_size_dec,
                red_dim_upd=config.mel_dim,
                pdf_gauss=True)
            logging.info(model_decoder_melsp)
            model_encoder_excit = GRU_VAE_ENCODER(
                in_dim=config.mel_dim,
                n_spk=n_spk,
                lat_dim=config.lat_dim_e,
                hidden_layers=config.hidden_layers_enc,
                hidden_units=config.hidden_units_enc,
                kernel_size=config.kernel_size_enc,
                dilation_size=config.dilation_size_enc,
                causal_conv=config.causal_conv_enc,
                pad_first=True,
                right_size=config.right_size_enc)
            logging.info(model_encoder_excit)
            model_spkidtr = SPKID_TRANSFORM_LAYER(
                n_spk=n_spk,
                emb_dim=config.emb_spk_dim,
                n_weight_emb=config.n_weight_emb,
                conv_emb_flag=True,
                spkidtr_dim=config.spkidtr_dim)
            logging.info(model_spkidtr)
            model_encoder_melsp.load_state_dict(
                torch.load(args.model)["model_encoder_melsp"])
            model_decoder_melsp.load_state_dict(
                torch.load(args.model)["model_decoder_melsp"])
            model_encoder_excit.load_state_dict(
                torch.load(args.model)["model_encoder_excit"])
            model_spkidtr.load_state_dict(
                torch.load(args.model)["model_spkidtr"])
            model_encoder_melsp.cuda()
            model_decoder_melsp.cuda()
            model_encoder_excit.cuda()
            model_spkidtr.cuda()
            model_encoder_melsp.eval()
            model_decoder_melsp.eval()
            model_encoder_excit.eval()
            model_spkidtr.eval()
            # fold away the weight-norm reparameterization for inference
            model_encoder_melsp.remove_weight_norm()
            model_decoder_melsp.remove_weight_norm()
            model_encoder_excit.remove_weight_norm()
            model_spkidtr.remove_weight_norm()
            for param in model_encoder_melsp.parameters():
                param.requires_grad = False
            for param in model_decoder_melsp.parameters():
                param.requires_grad = False
            for param in model_encoder_excit.parameters():
                param.requires_grad = False
            for param in model_spkidtr.parameters():
                param.requires_grad = False
        count = 0
        # total time padding covers two encoder+decoder passes (cyclic flow)
        pad_left = (model_encoder_melsp.pad_left + model_decoder_melsp.pad_left) * 2
        pad_right = (model_encoder_melsp.pad_right + model_decoder_melsp.pad_right) * 2
        # leftover padding after stage 0 (1st enc), 1 (1st dec), 2 (2nd enc)
        outpad_lefts = [None] * 3
        outpad_rights = [None] * 3
        outpad_lefts[0] = pad_left - model_encoder_melsp.pad_left
        outpad_rights[0] = pad_right - model_encoder_melsp.pad_right
        outpad_lefts[1] = outpad_lefts[0] - model_decoder_melsp.pad_left
        outpad_rights[1] = outpad_rights[0] - model_decoder_melsp.pad_right
        outpad_lefts[2] = outpad_lefts[1] - model_encoder_melsp.pad_left
        outpad_rights[2] = outpad_rights[1] - model_encoder_melsp.pad_right
        # pseudo-inverse of the mel filterbank: mel magnitudes -> linear magnitudes
        melfb_t = np.linalg.pinv(
            librosa.filters.mel(args.fs, args.fftl, n_mels=config.mel_dim))
        temp = 0.675  # sampling temperature passed to the Gaussian decoder
        logging.info(f'temp: {temp}')
        for feat_file in feat_list:
            # convert melsp
            spk_src = os.path.basename(os.path.dirname(feat_file))
            logging.info('%s --> %s' % (spk_src, args.spk_trg))
            # parallel target-speaker utterance, if present, enables objective metrics
            file_trg = os.path.join(
                os.path.dirname(os.path.dirname(feat_file)),
                args.spk_trg, os.path.basename(feat_file))
            trg_exist = False
            if os.path.exists(file_trg):
                logging.info('exist: %s' % (file_trg))
                feat_trg = read_hdf5(file_trg, "/log_1pmelmagsp")
                logging.info(feat_trg.shape)
                trg_exist = True
            feat_org = read_hdf5(feat_file, "/log_1pmelmagsp")
            logging.info(feat_org.shape)
            logging.info("generate")
            with torch.no_grad():
                # replicate-pad along time so each stage keeps full context
                feat = F.pad(
                    torch.FloatTensor(feat_org).cuda().unsqueeze(
                        0).transpose(1, 2),
                    (pad_left, pad_right), "replicate").transpose(1, 2)
                spk_logits, _, lat_src, _ = model_encoder_melsp(
                    feat, sampling=False)
                spk_logits_e, _, lat_src_e, _ = model_encoder_excit(
                    feat, sampling=False)
                # mean speaker posterior of the (unpadded) input frames
                logging.info('input spkpost')
                if outpad_rights[0] > 0:
                    logging.info(
                        torch.mean(
                            F.softmax(spk_logits[:, outpad_lefts[0]:
                                                 -outpad_rights[0]],
                                      dim=-1), 1))
                else:
                    logging.info(
                        torch.mean(
                            F.softmax(spk_logits[:, outpad_lefts[0]:],
                                      dim=-1), 1))
                logging.info('input spkpost_e')
                if outpad_rights[0] > 0:
                    logging.info(
                        torch.mean(
                            F.softmax(spk_logits_e[:, outpad_lefts[0]:
                                                   -outpad_rights[0]],
                                      dim=-1), 1))
                else:
                    logging.info(
                        torch.mean(
                            F.softmax(spk_logits_e[:, outpad_lefts[0]:],
                                      dim=-1), 1))
                if trg_exist:
                    # target needs only a single encoder pass worth of padding
                    spk_trg_logits, _, lat_trg, _ = model_encoder_melsp(
                        F.pad(
                            torch.FloatTensor(feat_trg).cuda().unsqueeze(
                                0).transpose(1, 2),
                            (model_encoder_melsp.pad_left,
                             model_encoder_melsp.pad_right),
                            "replicate").transpose(1, 2),
                        sampling=False)
                    spk_trg_logits_e, _, lat_trg_e, _ = model_encoder_excit(
                        F.pad(
                            torch.FloatTensor(feat_trg).cuda().unsqueeze(
                                0).transpose(1, 2),
                            (model_encoder_excit.pad_left,
                             model_encoder_excit.pad_right),
                            "replicate").transpose(1, 2),
                        sampling=False)
                    logging.info('target spkpost')
                    logging.info(
                        torch.mean(F.softmax(spk_trg_logits, dim=-1), 1))
                    logging.info('target spkpost_e')
                    logging.info(
                        torch.mean(F.softmax(spk_trg_logits_e, dim=-1), 1))
                # per-frame speaker codes for source and target identities
                _, src_code = model_spkidtr((torch.ones(
                    (1, lat_src_e.shape[1])) * src_idx).cuda().long())
                _, trg_code = model_spkidtr((torch.ones(
                    (1, lat_src_e.shape[1])) * trg_idx).cuda().long())
                lat_cat = torch.cat((lat_src_e, lat_src), 2)
                # reconstruction (source code) and conversion (target code)
                _, cvmelsp_src, _ = model_decoder_melsp(lat_cat, y=src_code, temp=temp)
                _, cvmelsp, _ = model_decoder_melsp(lat_cat, y=trg_code, temp=temp)
                trj_lat_cat = lat_cat_src = lat_cat
                # re-encode the converted melsp for the cyclic pass
                spk_logits, _, lat_cv, _ = model_encoder_melsp(
                    cvmelsp, sampling=False)
                spk_logits_e, _, lat_cv_e, _ = model_encoder_excit(
                    cvmelsp, sampling=False)
                logging.info('cv spkpost')
                if outpad_rights[2] > 0:
                    logging.info(
                        torch.mean(
                            F.softmax(spk_logits[:, outpad_lefts[2]:
                                                 -outpad_rights[2]],
                                      dim=-1), 1))
                else:
                    logging.info(
                        torch.mean(
                            F.softmax(spk_logits[:, outpad_lefts[2]:],
                                      dim=-1), 1))
                logging.info('cv spkpost_e')
                if outpad_rights[2] > 0:
                    logging.info(
                        torch.mean(
                            F.softmax(spk_logits_e[:, outpad_lefts[2]:
                                                   -outpad_rights[2]],
                                      dim=-1), 1))
                else:
                    logging.info(
                        torch.mean(
                            F.softmax(spk_logits_e[:, outpad_lefts[2]:],
                                      dim=-1), 1))
                # cyclic reconstruction: decode cv latents back with source code
                _, src_code = model_spkidtr((torch.ones(
                    (1, lat_cv_e.shape[1])) * src_idx).cuda().long())
                lat_cat = torch.cat((lat_cv_e, lat_cv), 2)
                _, cvmelsp_cyc, _ = model_decoder_melsp(lat_cat, y=src_code, temp=temp)
                #if outpad_rights[0] > 0:
                #    trj_lat_cat = trj_lat_cat[:,outpad_lefts[0]:-outpad_rights[0]]
                #else:
                #    trj_lat_cat = trj_lat_cat[:,outpad_lefts[0]:-outpad_rights[0]]
                # trim stage-1 padding from the first-pass decoder outputs
                if outpad_rights[1] > 0:
                    cvmelsp_src = cvmelsp_src[:, outpad_lefts[1]:
                                              -outpad_rights[1]]
                    cvmelsp = cvmelsp[:, outpad_lefts[1]:-outpad_rights[1]]
                else:
                    cvmelsp_src = cvmelsp_src[:, outpad_lefts[1]:]
                    cvmelsp = cvmelsp[:, outpad_lefts[1]:]
                feat_cv = cvmelsp[0].cpu().data.numpy()
                #feat_lat = trj_lat_cat[0].cpu().data.numpy()
                cvmelsp_src = np.array(cvmelsp_src[0].cpu().data.numpy(),
                                       dtype=np.float64)
                cvmelsp = np.array(cvmelsp[0].cpu().data.numpy(),
                                   dtype=np.float64)
                cvmelsp_cyc = np.array(cvmelsp_cyc[0].cpu().data.numpy(),
                                       dtype=np.float64)
                if trg_exist:
                    # NOTE(review): condition tests outpad_rights[1] but the
                    # slice uses outpad_lefts/rights[0] — looks inconsistent;
                    # confirm the intended trim stage.
                    if outpad_rights[1] > 0:
                        lat_src = lat_cat_src[:, outpad_lefts[0]:
                                              -outpad_rights[0]]
                    else:
                        lat_src = lat_cat_src[:, outpad_lefts[0]:]
                    lat_trg = torch.cat((lat_trg_e, lat_trg), 2)
            logging.info(cvmelsp_src.shape)
            logging.info(cvmelsp.shape)
            logging.info(cvmelsp_cyc.shape)
            melsp = np.array(feat_org)
            if trg_exist:
                logging.info(lat_src.shape)
                logging.info(lat_trg.shape)
                melsp_trg = np.array(feat_trg)
            # speech-active frame indices for metric computation
            spcidx = np.array(read_hdf5(feat_file, "/spcidx_range")[0])
            # undo the log(1 + 10000*x) mel-magnitude compression
            melsp_rest = (np.exp(melsp) - 1) / 10000
            melsp_cv_rest = (np.exp(cvmelsp) - 1) / 10000
            melsp_src_rest = (np.exp(cvmelsp_src) - 1) / 10000
            melsp_cyc_rest = (np.exp(cvmelsp_cyc) - 1) / 10000
            cvlist.append(np.var(melsp_cv_rest, axis=0))
            # log-spectral distortion of the source reconstruction (dB)
            lsd_arr = np.sqrt(np.mean((20*(np.log10(np.clip(melsp_src_rest[spcidx], a_min=1e-16, a_max=None))\
                                -np.log10(np.clip(melsp_rest[spcidx], a_min=1e-16, a_max=None))))**2, axis=-1))
            lsd_mean = np.mean(lsd_arr)
            lsd_std = np.std(lsd_arr)
            logging.info("lsd_src_cv: %.6f dB +- %.6f" % (lsd_mean, lsd_std))
            lsd_cvlist_src.append(lsd_mean)
            lsdstd_cvlist_src.append(lsd_std)
            if trg_exist:
                melsp_trg_rest = (np.exp(melsp_trg) - 1) / 10000
                spcidx_trg = np.array(
                    read_hdf5(file_trg, "/spcidx_range")[0])
                # DTW-align converted frames to the parallel target utterance
                _, twf_melsp, _, _ = dtw.dtw_org_to_trg(np.array(melsp_cv_rest[spcidx], \
                                        dtype=np.float64), np.array(melsp_trg_rest[spcidx_trg], dtype=np.float64), mcd=-1)
                twf_melsp = np.array(twf_melsp[:, 0])
                # NOTE(review): compares warped conversion against
                # melsp_rest (source), not melsp_trg_rest — confirm whether
                # the target spectra were intended here.
                lsd_arr = np.sqrt(np.mean((20*(np.log10(np.clip(melsp_cv_rest[twf_melsp], a_min=1e-16, a_max=None))\
                                    -np.log10(np.clip(melsp_rest[twf_melsp], a_min=1e-16, a_max=None))))**2, axis=-1))
                lsd_mean = np.mean(lsd_arr)
                lsd_std = np.std(lsd_arr)
                logging.info("lsd_trg: %.6f dB +- %.6f" % (lsd_mean, lsd_std))
                lsd_cvlist.append(lsd_mean)
                lsdstd_cvlist.append(lsd_std)
                # latent-space distance between source and target utterances,
                # averaged over both DTW directions
                spcidx_src = torch.LongTensor(spcidx).cuda()
                spcidx_trg = torch.LongTensor(spcidx_trg).cuda()
                trj_lat_src = np.array(torch.index_select(
                    lat_src[0], 0, spcidx_src).cpu().data.numpy(),
                    dtype=np.float64)
                trj_lat_trg = np.array(torch.index_select(
                    lat_trg[0], 0, spcidx_trg).cpu().data.numpy(),
                    dtype=np.float64)
                aligned_lat_srctrg, _, _, _ = dtw.dtw_org_to_trg(
                    trj_lat_src, trj_lat_trg)
                lat_dist_srctrg = np.mean(
                    np.sqrt(
                        np.mean((aligned_lat_srctrg - trj_lat_trg)**2,
                                axis=0)))
                _, _, lat_cdist_srctrg, _ = dtw.dtw_org_to_trg(trj_lat_trg,
                                                               trj_lat_src,
                                                               mcd=0)
                aligned_lat_trgsrc, _, _, _ = dtw.dtw_org_to_trg(
                    trj_lat_trg, trj_lat_src)
                lat_dist_trgsrc = np.mean(
                    np.sqrt(
                        np.mean((aligned_lat_trgsrc - trj_lat_src)**2,
                                axis=0)))
                _, _, lat_cdist_trgsrc, _ = dtw.dtw_org_to_trg(trj_lat_src,
                                                               trj_lat_trg,
                                                               mcd=0)
                logging.info("%lf %lf %lf %lf" % (lat_dist_srctrg,
                                                  lat_cdist_srctrg,
                                                  lat_dist_trgsrc,
                                                  lat_cdist_trgsrc))
                lat_dist_rmse = (lat_dist_srctrg + lat_dist_trgsrc) / 2
                lat_dist_cosim = (lat_cdist_srctrg + lat_cdist_trgsrc) / 2
                lat_dist_rmse_list.append(lat_dist_rmse)
                lat_dist_cosim_list.append(lat_dist_cosim)
                logging.info("lat_dist: %.6f %.6f" % (lat_dist_rmse,
                                                      lat_dist_cosim))
            # cyclic-reconstruction distortion against the source input
            lsd_arr = np.sqrt(np.mean((20*(np.log10(np.clip(melsp_cyc_rest[spcidx], a_min=1e-16, a_max=None))\
                                -np.log10(np.clip(melsp_rest[spcidx], a_min=1e-16, a_max=None))))**2, axis=-1))
            lsd_mean_cyc = np.mean(lsd_arr)
            lsd_std_cyc = np.std(lsd_arr)
            logging.info("lsd_cyc: %.6f dB +- %.6f" % (lsd_mean_cyc,
                                                       lsd_std_cyc))
            lsd_cvlist_cyc.append(lsd_mean_cyc)
            lsdstd_cvlist_cyc.append(lsd_std_cyc)
            # Griffin-Lim resynthesis of the analysis features (sanity wav)
            logging.info("synth anasyn")
            magsp = np.matmul(melfb_t, melsp_rest.T)
            logging.info(magsp.shape)
            hop_length = int((args.fs / 1000) * args.shiftms)
            win_length = int((args.fs / 1000) * args.winms)
            # clip to just under int16 full-scale (32767/32768)
            wav = np.clip(
                librosa.core.griffinlim(magsp,
                                        hop_length=hop_length,
                                        win_length=win_length,
                                        window='hann'),
                -1, 0.999969482421875)
            wavpath = os.path.join(
                args.outdir,
                os.path.basename(feat_file).replace(".h5", "_anasyn.wav"))
            logging.info(wavpath)
            sf.write(wavpath, wav, args.fs, 'PCM_16')
            #if trg_exist:
            #    logging.info("synth anasyn_trg")
            #    wav = np.clip(pw.synthesize(f0_trg, sp_trg, ap_trg, fs, frame_period=args.shiftms), -1, 1)
            #    wavpath = os.path.join(args.outdir,os.path.basename(feat_file).replace(".h5","_anasyn_trg.wav"))
            #    sf.write(wavpath, wav, fs, 'PCM_16')
            #    logging.info(wavpath)
            # Griffin-Lim of the reconstruction
            logging.info("synth gf rec")
            recmagsp = np.matmul(melfb_t, melsp_src_rest.T)
            logging.info(recmagsp.shape)
            wav = np.clip(
                librosa.core.griffinlim(recmagsp,
                                        hop_length=hop_length,
                                        win_length=win_length,
                                        window='hann'),
                -1, 0.999969482421875)
            wavpath = os.path.join(
                args.outdir,
                os.path.basename(feat_file).replace(".h5", "_rec.wav"))
            logging.info(wavpath)
            sf.write(wavpath, wav, args.fs, 'PCM_16')
            # Griffin-Lim of the conversion
            logging.info("synth gf cv")
            cvmagsp = np.matmul(melfb_t, melsp_cv_rest.T)
            logging.info(cvmagsp.shape)
            wav = np.clip(
                librosa.core.griffinlim(cvmagsp,
                                        hop_length=hop_length,
                                        win_length=win_length,
                                        window='hann'),
                -1, 0.999969482421875)
            wavpath = os.path.join(
                args.outdir,
                os.path.basename(feat_file).replace(".h5", "_cv.wav"))
            logging.info(wavpath)
            sf.write(wavpath, wav, args.fs, 'PCM_16')
            # store converted features under "<src>-<trg>" sibling directory
            logging.info('write to h5')
            outh5dir = os.path.join(
                os.path.dirname(os.path.dirname(feat_file)),
                spk_src + "-" + args.spk_trg)
            if not os.path.exists(outh5dir):
                os.makedirs(outh5dir)
            feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
            # cv
            write_path = args.string_path
            logging.info(feat_file + ' ' + write_path)
            logging.info(feat_cv.shape)
            write_hdf5(feat_file, write_path, feat_cv)
            #logging.info('write lat to h5')
            #logging.info(feat_file + ' ' + args.string_path+'_lat')
            #logging.info(feat_lat.shape)
            #write_hdf5(feat_file, args.string_path+'_lat', feat_lat)
            count += 1