def gpu_decode(feat_list, gpu, cvlist=None, lsd_cvlist=None, lsdstd_cvlist=None,
               cvlist_dv=None, lsd_cvlist_dv=None, lsdstd_cvlist_dv=None,
               f0rmse_cvlist=None, f0corr_cvlist=None, caprmse_cvlist=None,
               f0rmse_cvlist_dv=None, f0corr_cvlist_dv=None, caprmse_cvlist_dv=None,
               cvlist_cyc=None, lsd_cvlist_cyc=None, lsdstd_cvlist_cyc=None,
               cvlist_cyc_dv=None, lsd_cvlist_cyc_dv=None, lsdstd_cvlist_cyc_dv=None,
               f0rmse_cvlist_cyc=None, f0corr_cvlist_cyc=None, caprmse_cvlist_cyc=None,
               f0rmse_cvlist_cyc_dv=None, f0corr_cvlist_cyc_dv=None, caprmse_cvlist_cyc_dv=None):
    with torch.cuda.device(gpu):
        # define model and load parameters
        with torch.no_grad():
            model_encoder_melsp = GRU_VAE_ENCODER(
                in_dim=config.mel_dim,
                n_spk=n_spk,
                lat_dim=config.lat_dim,
                hidden_layers=config.hidden_layers_enc,
                hidden_units=config.hidden_units_enc,
                kernel_size=config.kernel_size_enc,
                dilation_size=config.dilation_size_enc,
                causal_conv=config.causal_conv_enc,
                bi=False,
                ar=False,
                pad_first=True,
                right_size=config.right_size_enc)
            logging.info(model_encoder_melsp)
            model_decoder_melsp = GRU_SPEC_DECODER(
                feat_dim=config.lat_dim + config.lat_dim_e,
                excit_dim=config.excit_dim,
                out_dim=config.mel_dim,
                n_spk=n_spk,
                hidden_layers=config.hidden_layers_dec,
                hidden_units=config.hidden_units_dec,
                kernel_size=config.kernel_size_dec,
                dilation_size=config.dilation_size_dec,
                causal_conv=config.causal_conv_dec,
                bi=False,
                ar=False,
                pad_first=True,
                right_size=config.right_size_dec)
            logging.info(model_decoder_melsp)
            model_encoder_excit = GRU_VAE_ENCODER(
                in_dim=config.mel_dim,
                n_spk=n_spk,
                lat_dim=config.lat_dim_e,
                hidden_layers=config.hidden_layers_enc,
                hidden_units=config.hidden_units_enc,
                kernel_size=config.kernel_size_enc,
                dilation_size=config.dilation_size_enc,
                causal_conv=config.causal_conv_enc,
                bi=False,
                ar=False,
                pad_first=True,
                right_size=config.right_size_enc)
            logging.info(model_encoder_excit)
            model_decoder_excit = GRU_EXCIT_DECODER(
                feat_dim=config.lat_dim_e,
                cap_dim=config.cap_dim,
                n_spk=n_spk,
                hidden_layers=config.hidden_layers_lf0,
                hidden_units=config.hidden_units_lf0,
                kernel_size=config.kernel_size_lf0,
                dilation_size=config.dilation_size_lf0,
                causal_conv=config.causal_conv_lf0,
                bi=False,
                ar=False,
                pad_first=True,
                right_size=config.right_size_lf0)
            logging.info(model_decoder_excit)
            if (config.spkidtr_dim > 0):
                model_spkidtr = SPKID_TRANSFORM_LAYER(
                    n_spk=n_spk,
                    spkidtr_dim=config.spkidtr_dim)
                logging.info(model_spkidtr)
            model_encoder_melsp.load_state_dict(
                torch.load(args.model)["model_encoder_melsp"])
            model_decoder_melsp.load_state_dict(
                torch.load(args.model)["model_decoder_melsp"])
            model_encoder_excit.load_state_dict(
                torch.load(args.model)["model_encoder_excit"])
            model_decoder_excit.load_state_dict(
                torch.load(args.model)["model_decoder_excit"])
            if (config.spkidtr_dim > 0):
                model_spkidtr.load_state_dict(
                    torch.load(args.model)["model_spkidtr"])
            model_encoder_melsp.cuda()
            model_decoder_melsp.cuda()
            model_encoder_excit.cuda()
            model_decoder_excit.cuda()
            if (config.spkidtr_dim > 0):
                model_spkidtr.cuda()
            model_encoder_melsp.eval()
            model_decoder_melsp.eval()
            model_encoder_excit.eval()
            model_decoder_excit.eval()
            if (config.spkidtr_dim > 0):
                model_spkidtr.eval()
            for param in model_encoder_melsp.parameters():
                param.requires_grad = False
            for param in model_decoder_melsp.parameters():
                param.requires_grad = False
            for param in model_encoder_excit.parameters():
                param.requires_grad = False
            for param in model_decoder_excit.parameters():
                param.requires_grad = False
            if (config.spkidtr_dim > 0):
                for param in model_spkidtr.parameters():
                    param.requires_grad = False
        count = 0
        pad_left = (model_encoder_melsp.pad_left + model_decoder_excit.pad_left
                    + model_decoder_melsp.pad_left) * 2
        pad_right = (model_encoder_melsp.pad_right + model_decoder_excit.pad_right
                     + model_decoder_melsp.pad_right) * 2
        outpad_lefts = [None] * 5
        outpad_rights = [None] * 5
        outpad_lefts[0] = pad_left - model_encoder_melsp.pad_left
        outpad_rights[0] = pad_right - model_encoder_melsp.pad_right
        outpad_lefts[1] = outpad_lefts[0] - model_decoder_excit.pad_left
        outpad_rights[1] = outpad_rights[0] - model_decoder_excit.pad_right
        outpad_lefts[2] = outpad_lefts[1] - model_decoder_melsp.pad_left
        outpad_rights[2] = outpad_rights[1] - model_decoder_melsp.pad_right
        outpad_lefts[3] = outpad_lefts[2] - model_encoder_melsp.pad_left
        outpad_rights[3] = outpad_rights[2] - model_encoder_melsp.pad_right
        outpad_lefts[4] = outpad_lefts[3] - model_decoder_excit.pad_left
        outpad_rights[4] = outpad_rights[3] - model_decoder_excit.pad_right
        for feat_file in feat_list:
            # reconst. melsp
            logging.info("recmelsp " + feat_file)
            feat_org = read_hdf5(feat_file, "/log_1pmelmagsp")
            logging.info(feat_org.shape)
            with torch.no_grad():
                feat = F.pad(
                    torch.FloatTensor(feat_org).cuda().unsqueeze(0).transpose(1, 2),
                    (pad_left, pad_right), "replicate").transpose(1, 2)
                spk_logits, _, lat_src, _ = model_encoder_melsp(feat, sampling=False)
                spk_logits_e, _, lat_src_e, _ = model_encoder_excit(feat, sampling=False)
                logging.info('input spkpost')
                if outpad_rights[0] > 0:
                    logging.info(torch.mean(F.softmax(
                        spk_logits[:, outpad_lefts[0]:-outpad_rights[0]], dim=-1), 1))
                else:
                    logging.info(torch.mean(F.softmax(
                        spk_logits[:, outpad_lefts[0]:], dim=-1), 1))
                logging.info('input spkpost_e')
                if outpad_rights[0] > 0:
                    logging.info(torch.mean(F.softmax(
                        spk_logits_e[:, outpad_lefts[0]:-outpad_rights[0]], dim=-1), 1))
                else:
                    logging.info(torch.mean(F.softmax(
                        spk_logits_e[:, outpad_lefts[0]:], dim=-1), 1))
                if config.spkidtr_dim > 0:
                    src_code = model_spkidtr((torch.ones(
                        (1, lat_src_e.shape[1])) * spk_idx).cuda().long())
                else:
                    src_code = (torch.ones(
                        (1, lat_src_e.shape[1])) * spk_idx).cuda().long()
                cvlf0_src, _ = model_decoder_excit(src_code, lat_src_e)
                if model_decoder_excit.pad_right > 0:
                    lat_cat = torch.cat((
                        lat_src_e[:, model_decoder_excit.pad_left:-model_decoder_excit.pad_right],
                        lat_src[:, model_decoder_excit.pad_left:-model_decoder_excit.pad_right]), 2)
                else:
                    lat_cat = torch.cat(
                        (lat_src_e[:, model_decoder_excit.pad_left:],
                         lat_src[:, model_decoder_excit.pad_left:]), 2)
                if config.spkidtr_dim > 0:
                    src_code = model_spkidtr((torch.ones(
                        (1, lat_cat.shape[1])) * spk_idx).cuda().long())
                else:
                    src_code = (torch.ones(
                        (1, lat_cat.shape[1])) * spk_idx).cuda().long()
                cvmelsp_src, _ = model_decoder_melsp(
                    lat_cat, y=src_code, e=cvlf0_src[:, :, :config.excit_dim])
                spk_logits, _, lat_rec, _ = model_encoder_melsp(cvmelsp_src, sampling=False)
                spk_logits_e, _, lat_rec_e, _ = model_encoder_excit(cvmelsp_src, sampling=False)
                logging.info('rec spkpost')
                if outpad_rights[3] > 0:
                    logging.info(torch.mean(F.softmax(
                        spk_logits[:, outpad_lefts[3]:-outpad_rights[3]], dim=-1), 1))
                else:
                    logging.info(torch.mean(F.softmax(
                        spk_logits[:, outpad_lefts[3]:], dim=-1), 1))
                logging.info('rec spkpost_e')
                if outpad_rights[3] > 0:
                    logging.info(torch.mean(F.softmax(
                        spk_logits_e[:, outpad_lefts[3]:-outpad_rights[3]], dim=-1), 1))
                else:
                    logging.info(torch.mean(F.softmax(
                        spk_logits_e[:, outpad_lefts[3]:], dim=-1), 1))
                if config.spkidtr_dim > 0:
                    src_code = model_spkidtr((torch.ones(
                        (1, lat_rec_e.shape[1])) * spk_idx).cuda().long())
                else:
                    src_code = (torch.ones(
                        (1, lat_rec_e.shape[1])) * spk_idx).cuda().long()
                cvlf0_cyc, _ = model_decoder_excit(src_code, lat_rec_e)
                if model_decoder_excit.pad_right > 0:
                    lat_cat = torch.cat((
                        lat_rec_e[:, model_decoder_excit.pad_left:-model_decoder_excit.pad_right],
                        lat_rec[:, model_decoder_excit.pad_left:-model_decoder_excit.pad_right]), 2)
                else:
                    lat_cat = torch.cat(
                        (lat_rec_e[:, model_decoder_excit.pad_left:],
                         lat_rec[:, model_decoder_excit.pad_left:]), 2)
                if config.spkidtr_dim > 0:
                    src_code = model_spkidtr((torch.ones(
                        (1, lat_cat.shape[1])) * spk_idx).cuda().long())
                else:
                    src_code = (torch.ones(
                        (1, lat_cat.shape[1])) * spk_idx).cuda().long()
                cvmelsp_cyc, _ = model_decoder_melsp(
                    lat_cat, y=src_code, e=cvlf0_cyc[:, :, :config.excit_dim])
                if outpad_rights[1] > 0:
                    cvlf0_src = cvlf0_src[:, outpad_lefts[1]:-outpad_rights[1]]
                else:
                    cvlf0_src = cvlf0_src[:, outpad_lefts[1]:]
                if outpad_rights[2] > 0:
                    cvmelsp_src = cvmelsp_src[:, outpad_lefts[2]:-outpad_rights[2]]
                else:
                    cvmelsp_src = cvmelsp_src[:, outpad_lefts[2]:]
                if outpad_rights[4] > 0:
                    cvlf0_cyc = cvlf0_cyc[:, outpad_lefts[4]:-outpad_rights[4]]
                else:
                    cvlf0_cyc = cvlf0_cyc[:, outpad_lefts[4]:]
                feat_rec = cvmelsp_src[0].cpu().data.numpy()
                feat_cyc = cvmelsp_cyc[0].cpu().data.numpy()
                cvmelsp_src = np.array(cvmelsp_src[0].cpu().data.numpy(), dtype=np.float64)
                cvlf0_src = np.array(cvlf0_src[0].cpu().data.numpy(), dtype=np.float64)
                cvmelsp_cyc = np.array(cvmelsp_cyc[0].cpu().data.numpy(), dtype=np.float64)
                cvlf0_cyc = np.array(cvlf0_cyc[0].cpu().data.numpy(), dtype=np.float64)
                logging.info(cvlf0_src.shape)
                logging.info(cvmelsp_src.shape)
                logging.info(cvlf0_cyc.shape)
                logging.info(cvmelsp_cyc.shape)
            melsp = np.array(feat_org)
            feat_world = read_hdf5(feat_file, "/feat_mceplf0cap")
            f0 = np.array(np.rint(feat_world[:, 0]) * np.exp(feat_world[:, 1]))
            codeap = np.array(np.rint(feat_world[:, 2:3])
                              * (-np.exp(feat_world[:, 3:config.full_excit_dim])))
            cvf0_src = np.array(np.rint(cvlf0_src[:, 0]) * np.exp(cvlf0_src[:, 1]))
            cvcodeap_src = np.array(np.rint(cvlf0_src[:, 2:3]) * (-np.exp(cvlf0_src[:, 3:])))
            f0_rmse = np.sqrt(np.mean((cvf0_src - f0)**2))
            logging.info('F0_rmse_rec: %lf Hz' % (f0_rmse))
            cvf0_src_mean = np.mean(cvf0_src)
            f0_mean = np.mean(f0)
            f0_corr = (np.sum((cvf0_src - cvf0_src_mean) * (f0 - f0_mean))
                       / (np.sqrt(np.sum((cvf0_src - cvf0_src_mean)**2))
                          * np.sqrt(np.sum((f0 - f0_mean)**2))))
            logging.info('F0_corr_rec: %lf' % (f0_corr))
            codeap_rmse = np.sqrt(np.mean((cvcodeap_src - codeap)**2, axis=0))
            for i in range(codeap_rmse.shape[-1]):
                logging.info('codeap-%d_rmse_rec: %lf dB' % (i + 1, codeap_rmse[i]))
            cvf0_cyc = np.array(np.rint(cvlf0_cyc[:, 0]) * np.exp(cvlf0_cyc[:, 1]))
            cvcodeap_cyc = np.array(np.rint(cvlf0_cyc[:, 2:3]) * (-np.exp(cvlf0_cyc[:, 3:])))
            f0_rmse_cyc = np.sqrt(np.mean((cvf0_cyc - f0)**2))
            logging.info('F0_rmse_cyc: %lf Hz' % (f0_rmse_cyc))
            cvf0_cyc_mean = np.mean(cvf0_cyc)
            f0_mean = np.mean(f0)
            f0_corr_cyc = (np.sum((cvf0_cyc - cvf0_cyc_mean) * (f0 - f0_mean))
                           / (np.sqrt(np.sum((cvf0_cyc - cvf0_cyc_mean)**2))
                              * np.sqrt(np.sum((f0 - f0_mean)**2))))
            logging.info('F0_corr_cyc: %lf' % (f0_corr_cyc))
            codeap_rmse_cyc = np.sqrt(np.mean((cvcodeap_cyc - codeap)**2, axis=0))
            for i in range(codeap_rmse_cyc.shape[-1]):
                logging.info('codeap-%d_rmse_cyc: %lf dB' % (i + 1, codeap_rmse_cyc[i]))
            spcidx = np.array(read_hdf5(feat_file, "/spcidx_range")[0])
            melsp_rest = (np.exp(melsp) - 1) / 10000
            melsp_src_rest = (np.exp(cvmelsp_src) - 1) / 10000
            melsp_cyc_rest = (np.exp(cvmelsp_cyc) - 1) / 10000
            lsd_arr = np.sqrt(np.mean((20 * (np.log10(np.clip(melsp_src_rest[spcidx], a_min=1e-16, a_max=None))
                                             - np.log10(np.clip(melsp_rest[spcidx], a_min=1e-16, a_max=None))))**2, axis=-1))
            lsd_mean = np.mean(lsd_arr)
            lsd_std = np.std(lsd_arr)
            logging.info("lsd_rec: %.6f dB +- %.6f" % (lsd_mean, lsd_std))
            lsd_arr = np.sqrt(np.mean((20 * (np.log10(np.clip(melsp_cyc_rest[spcidx], a_min=1e-16, a_max=None))
                                             - np.log10(np.clip(melsp_rest[spcidx], a_min=1e-16, a_max=None))))**2, axis=-1))
            lsd_mean_cyc = np.mean(lsd_arr)
            lsd_std_cyc = np.std(lsd_arr)
            logging.info("lsd_cyc: %.6f dB +- %.6f" % (lsd_mean_cyc, lsd_std_cyc))
            logging.info('org f0')
            logging.info(f0[10:15])
            logging.info('rec f0')
            logging.info(cvf0_src[10:15])
            logging.info('cyc f0')
            logging.info(cvf0_cyc[10:15])
            logging.info('org cap')
            logging.info(codeap[10:15])
            logging.info('rec cap')
            logging.info(cvcodeap_src[10:15])
            logging.info('cyc cap')
            logging.info(cvcodeap_cyc[10:15])
            dataset = feat_file.split('/')[1].split('_')[0]
            if 'tr' in dataset:
                logging.info('trn')
                f0rmse_cvlist.append(f0_rmse)
                f0corr_cvlist.append(f0_corr)
                caprmse_cvlist.append(codeap_rmse)
                lsd_cvlist.append(lsd_mean)
                lsdstd_cvlist.append(lsd_std)
                cvlist.append(np.var(melsp_src_rest, axis=0))
                logging.info(len(cvlist))
                f0rmse_cvlist_cyc.append(f0_rmse_cyc)
                f0corr_cvlist_cyc.append(f0_corr_cyc)
                caprmse_cvlist_cyc.append(codeap_rmse_cyc)
                lsd_cvlist_cyc.append(lsd_mean_cyc)
                lsdstd_cvlist_cyc.append(lsd_std_cyc)
                cvlist_cyc.append(np.var(melsp_cyc_rest, axis=0))
            elif 'dv' in dataset:
                logging.info('dev')
                f0rmse_cvlist_dv.append(f0_rmse)
                f0corr_cvlist_dv.append(f0_corr)
                caprmse_cvlist_dv.append(codeap_rmse)
                lsd_cvlist_dv.append(lsd_mean)
                lsdstd_cvlist_dv.append(lsd_std)
                cvlist_dv.append(np.var(melsp_src_rest, axis=0))
                logging.info(len(cvlist_dv))
                f0rmse_cvlist_cyc_dv.append(f0_rmse_cyc)
                f0corr_cvlist_cyc_dv.append(f0_corr_cyc)
                caprmse_cvlist_cyc_dv.append(codeap_rmse_cyc)
                lsd_cvlist_cyc_dv.append(lsd_mean_cyc)
                lsdstd_cvlist_cyc_dv.append(lsd_std_cyc)
                cvlist_cyc_dv.append(np.var(melsp_cyc_rest, axis=0))
            logging.info('write rec to h5')
            outh5dir = os.path.join(os.path.dirname(os.path.dirname(feat_file)),
                                    args.spk + "-" + args.spk)
            if not os.path.exists(outh5dir):
                os.makedirs(outh5dir)
            feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
            logging.info(feat_file + ' ' + args.string_path)
            logging.info(feat_rec.shape)
            write_hdf5(feat_file, args.string_path, feat_rec)
            logging.info('write cyc to h5')
            outh5dir = os.path.join(os.path.dirname(os.path.dirname(feat_file)),
                                    args.spk + "-" + args.spk + "-" + args.spk)
            if not os.path.exists(outh5dir):
                os.makedirs(outh5dir)
            feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
            logging.info(feat_file + ' ' + args.string_path)
            logging.info(feat_cyc.shape)
            write_hdf5(feat_file, args.string_path, feat_cyc)
            count += 1
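
# NOTE (illustrative sketch, not part of the original script): gpu_decode appends its
# per-utterance metrics into the list arguments it receives, so a caller would typically
# split feat_list across the available GPUs and pass multiprocessing.Manager() lists so
# the results survive the worker processes. The helper below sketches such a dispatcher
# under that assumption; the name _run_gpu_decode_parallel and the n_gpus argument are
# hypothetical and only meant to show how the shared lists line up with the keyword
# parameters above.
def _run_gpu_decode_parallel(feat_list, n_gpus):
    import inspect
    import multiprocessing as mp
    manager = mp.Manager()
    # one shared list per keyword argument of gpu_decode (cvlist, lsd_cvlist, ...)
    shared = {name: manager.list()
              for name, p in inspect.signature(gpu_decode).parameters.items()
              if p.default is None}
    processes = []
    for gpu in range(n_gpus):
        # round-robin split of the feature files over the GPUs
        proc = mp.Process(target=gpu_decode,
                          args=(feat_list[gpu::n_gpus], gpu), kwargs=shared)
        proc.start()
        processes.append(proc)
    for proc in processes:
        proc.join()
    # return plain lists so the caller can summarize the collected statistics
    return {name: list(lst) for name, lst in shared.items()}
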
def decode_RNN(feat_list, gpu, cvlist=None,
               lsd_cvlist_src=None, lsdstd_cvlist_src=None,
               lsd_cvlist_cyc=None, lsdstd_cvlist_cyc=None,
               lsd_cvlist=None, lsdstd_cvlist=None,
               lat_dist_rmse_list=None, lat_dist_cosim_list=None):
    with torch.cuda.device(gpu):
        # define model and load parameters
        with torch.no_grad():
            model_encoder_melsp = GRU_VAE_ENCODER(
                in_dim=config.mel_dim,
                n_spk=n_spk,
                lat_dim=config.lat_dim,
                hidden_layers=config.hidden_layers_enc,
                hidden_units=config.hidden_units_enc,
                kernel_size=config.kernel_size_enc,
                dilation_size=config.dilation_size_enc,
                causal_conv=config.causal_conv_enc,
                pad_first=True,
                right_size=config.right_size_enc)
            logging.info(model_encoder_melsp)
            model_decoder_melsp = GRU_SPEC_DECODER(
                feat_dim=config.lat_dim + config.lat_dim_e,
                out_dim=config.mel_dim,
                n_spk=(config.emb_spk_dim // config.n_weight_emb) * config.n_weight_emb,
                hidden_layers=config.hidden_layers_dec,
                hidden_units=config.hidden_units_dec,
                kernel_size=config.kernel_size_dec,
                dilation_size=config.dilation_size_dec,
                causal_conv=config.causal_conv_dec,
                pad_first=True,
                right_size=config.right_size_dec,
                red_dim_upd=config.mel_dim,
                pdf_gauss=True)
            logging.info(model_decoder_melsp)
            model_encoder_excit = GRU_VAE_ENCODER(
                in_dim=config.mel_dim,
                n_spk=n_spk,
                lat_dim=config.lat_dim_e,
                hidden_layers=config.hidden_layers_enc,
                hidden_units=config.hidden_units_enc,
                kernel_size=config.kernel_size_enc,
                dilation_size=config.dilation_size_enc,
                causal_conv=config.causal_conv_enc,
                pad_first=True,
                right_size=config.right_size_enc)
            logging.info(model_encoder_excit)
            model_spkidtr = SPKID_TRANSFORM_LAYER(
                n_spk=n_spk,
                emb_dim=config.emb_spk_dim,
                n_weight_emb=config.n_weight_emb,
                conv_emb_flag=True,
                spkidtr_dim=config.spkidtr_dim)
            logging.info(model_spkidtr)
            model_encoder_melsp.load_state_dict(
                torch.load(args.model)["model_encoder_melsp"])
            model_decoder_melsp.load_state_dict(
                torch.load(args.model)["model_decoder_melsp"])
            model_encoder_excit.load_state_dict(
                torch.load(args.model)["model_encoder_excit"])
            model_spkidtr.load_state_dict(
                torch.load(args.model)["model_spkidtr"])
            model_encoder_melsp.cuda()
            model_decoder_melsp.cuda()
            model_encoder_excit.cuda()
            model_spkidtr.cuda()
            model_encoder_melsp.eval()
            model_decoder_melsp.eval()
            model_encoder_excit.eval()
            model_spkidtr.eval()
            model_encoder_melsp.remove_weight_norm()
            model_decoder_melsp.remove_weight_norm()
            model_encoder_excit.remove_weight_norm()
            model_spkidtr.remove_weight_norm()
            for param in model_encoder_melsp.parameters():
                param.requires_grad = False
            for param in model_decoder_melsp.parameters():
                param.requires_grad = False
            for param in model_encoder_excit.parameters():
                param.requires_grad = False
            for param in model_spkidtr.parameters():
                param.requires_grad = False
        count = 0
        pad_left = (model_encoder_melsp.pad_left + model_decoder_melsp.pad_left) * 2
        pad_right = (model_encoder_melsp.pad_right + model_decoder_melsp.pad_right) * 2
        outpad_lefts = [None] * 3
        outpad_rights = [None] * 3
        outpad_lefts[0] = pad_left - model_encoder_melsp.pad_left
        outpad_rights[0] = pad_right - model_encoder_melsp.pad_right
        outpad_lefts[1] = outpad_lefts[0] - model_decoder_melsp.pad_left
        outpad_rights[1] = outpad_rights[0] - model_decoder_melsp.pad_right
        outpad_lefts[2] = outpad_lefts[1] - model_encoder_melsp.pad_left
        outpad_rights[2] = outpad_rights[1] - model_encoder_melsp.pad_right
        melfb_t = np.linalg.pinv(
            librosa.filters.mel(args.fs, args.fftl, n_mels=config.mel_dim))
        temp = 0.675
        logging.info(f'temp: {temp}')
        for feat_file in feat_list:
            # convert melsp
            spk_src = os.path.basename(os.path.dirname(feat_file))
            logging.info('%s --> %s' % (spk_src, args.spk_trg))
            file_trg = os.path.join(os.path.dirname(os.path.dirname(feat_file)),
                                    args.spk_trg, os.path.basename(feat_file))
            trg_exist = False
            if os.path.exists(file_trg):
                logging.info('exist: %s' % (file_trg))
                feat_trg = read_hdf5(file_trg, "/log_1pmelmagsp")
                logging.info(feat_trg.shape)
                trg_exist = True
            feat_org = read_hdf5(feat_file, "/log_1pmelmagsp")
            logging.info(feat_org.shape)
            logging.info("generate")
            with torch.no_grad():
                feat = F.pad(
                    torch.FloatTensor(feat_org).cuda().unsqueeze(0).transpose(1, 2),
                    (pad_left, pad_right), "replicate").transpose(1, 2)
                spk_logits, _, lat_src, _ = model_encoder_melsp(feat, sampling=False)
                spk_logits_e, _, lat_src_e, _ = model_encoder_excit(feat, sampling=False)
                logging.info('input spkpost')
                if outpad_rights[0] > 0:
                    logging.info(torch.mean(F.softmax(
                        spk_logits[:, outpad_lefts[0]:-outpad_rights[0]], dim=-1), 1))
                else:
                    logging.info(torch.mean(F.softmax(
                        spk_logits[:, outpad_lefts[0]:], dim=-1), 1))
                logging.info('input spkpost_e')
                if outpad_rights[0] > 0:
                    logging.info(torch.mean(F.softmax(
                        spk_logits_e[:, outpad_lefts[0]:-outpad_rights[0]], dim=-1), 1))
                else:
                    logging.info(torch.mean(F.softmax(
                        spk_logits_e[:, outpad_lefts[0]:], dim=-1), 1))
                if trg_exist:
                    spk_trg_logits, _, lat_trg, _ = model_encoder_melsp(
                        F.pad(torch.FloatTensor(feat_trg).cuda().unsqueeze(0).transpose(1, 2),
                              (model_encoder_melsp.pad_left, model_encoder_melsp.pad_right),
                              "replicate").transpose(1, 2),
                        sampling=False)
                    spk_trg_logits_e, _, lat_trg_e, _ = model_encoder_excit(
                        F.pad(torch.FloatTensor(feat_trg).cuda().unsqueeze(0).transpose(1, 2),
                              (model_encoder_excit.pad_left, model_encoder_excit.pad_right),
                              "replicate").transpose(1, 2),
                        sampling=False)
                    logging.info('target spkpost')
                    logging.info(torch.mean(F.softmax(spk_trg_logits, dim=-1), 1))
                    logging.info('target spkpost_e')
                    logging.info(torch.mean(F.softmax(spk_trg_logits_e, dim=-1), 1))
                _, src_code = model_spkidtr((torch.ones(
                    (1, lat_src_e.shape[1])) * src_idx).cuda().long())
                _, trg_code = model_spkidtr((torch.ones(
                    (1, lat_src_e.shape[1])) * trg_idx).cuda().long())
                lat_cat = torch.cat((lat_src_e, lat_src), 2)
                _, cvmelsp_src, _ = model_decoder_melsp(lat_cat, y=src_code, temp=temp)
                _, cvmelsp, _ = model_decoder_melsp(lat_cat, y=trg_code, temp=temp)
                trj_lat_cat = lat_cat_src = lat_cat
                spk_logits, _, lat_cv, _ = model_encoder_melsp(cvmelsp, sampling=False)
                spk_logits_e, _, lat_cv_e, _ = model_encoder_excit(cvmelsp, sampling=False)
                logging.info('cv spkpost')
                if outpad_rights[2] > 0:
                    logging.info(torch.mean(F.softmax(
                        spk_logits[:, outpad_lefts[2]:-outpad_rights[2]], dim=-1), 1))
                else:
                    logging.info(torch.mean(F.softmax(
                        spk_logits[:, outpad_lefts[2]:], dim=-1), 1))
                logging.info('cv spkpost_e')
                if outpad_rights[2] > 0:
                    logging.info(torch.mean(F.softmax(
                        spk_logits_e[:, outpad_lefts[2]:-outpad_rights[2]], dim=-1), 1))
                else:
                    logging.info(torch.mean(F.softmax(
                        spk_logits_e[:, outpad_lefts[2]:], dim=-1), 1))
                _, src_code = model_spkidtr((torch.ones(
                    (1, lat_cv_e.shape[1])) * src_idx).cuda().long())
                lat_cat = torch.cat((lat_cv_e, lat_cv), 2)
                _, cvmelsp_cyc, _ = model_decoder_melsp(lat_cat, y=src_code, temp=temp)
                #if outpad_rights[0] > 0:
                #    trj_lat_cat = trj_lat_cat[:, outpad_lefts[0]:-outpad_rights[0]]
                #else:
                #    trj_lat_cat = trj_lat_cat[:, outpad_lefts[0]:-outpad_rights[0]]
                if outpad_rights[1] > 0:
                    cvmelsp_src = cvmelsp_src[:, outpad_lefts[1]:-outpad_rights[1]]
                    cvmelsp = cvmelsp[:, outpad_lefts[1]:-outpad_rights[1]]
                else:
                    cvmelsp_src = cvmelsp_src[:, outpad_lefts[1]:]
                    cvmelsp = cvmelsp[:, outpad_lefts[1]:]
                feat_cv = cvmelsp[0].cpu().data.numpy()
                #feat_lat = trj_lat_cat[0].cpu().data.numpy()
                cvmelsp_src = np.array(cvmelsp_src[0].cpu().data.numpy(), dtype=np.float64)
                cvmelsp = np.array(cvmelsp[0].cpu().data.numpy(), dtype=np.float64)
                cvmelsp_cyc = np.array(cvmelsp_cyc[0].cpu().data.numpy(), dtype=np.float64)
                if trg_exist:
                    if outpad_rights[1] > 0:
                        lat_src = lat_cat_src[:, outpad_lefts[0]:-outpad_rights[0]]
                    else:
                        lat_src = lat_cat_src[:, outpad_lefts[0]:]
                    lat_trg = torch.cat((lat_trg_e, lat_trg), 2)
            logging.info(cvmelsp_src.shape)
            logging.info(cvmelsp.shape)
            logging.info(cvmelsp_cyc.shape)
            melsp = np.array(feat_org)
            if trg_exist:
                logging.info(lat_src.shape)
                logging.info(lat_trg.shape)
                melsp_trg = np.array(feat_trg)
            spcidx = np.array(read_hdf5(feat_file, "/spcidx_range")[0])
            melsp_rest = (np.exp(melsp) - 1) / 10000
            melsp_cv_rest = (np.exp(cvmelsp) - 1) / 10000
            melsp_src_rest = (np.exp(cvmelsp_src) - 1) / 10000
            melsp_cyc_rest = (np.exp(cvmelsp_cyc) - 1) / 10000
            cvlist.append(np.var(melsp_cv_rest, axis=0))
            lsd_arr = np.sqrt(np.mean((20 * (np.log10(np.clip(melsp_src_rest[spcidx], a_min=1e-16, a_max=None))
                                             - np.log10(np.clip(melsp_rest[spcidx], a_min=1e-16, a_max=None))))**2, axis=-1))
            lsd_mean = np.mean(lsd_arr)
            lsd_std = np.std(lsd_arr)
            logging.info("lsd_src_cv: %.6f dB +- %.6f" % (lsd_mean, lsd_std))
            lsd_cvlist_src.append(lsd_mean)
            lsdstd_cvlist_src.append(lsd_std)
            if trg_exist:
                melsp_trg_rest = (np.exp(melsp_trg) - 1) / 10000
                spcidx_trg = np.array(read_hdf5(file_trg, "/spcidx_range")[0])
                _, twf_melsp, _, _ = dtw.dtw_org_to_trg(
                    np.array(melsp_cv_rest[spcidx], dtype=np.float64),
                    np.array(melsp_trg_rest[spcidx_trg], dtype=np.float64), mcd=-1)
                twf_melsp = np.array(twf_melsp[:, 0])
                lsd_arr = np.sqrt(np.mean((20 * (np.log10(np.clip(melsp_cv_rest[twf_melsp], a_min=1e-16, a_max=None))
                                                 - np.log10(np.clip(melsp_rest[twf_melsp], a_min=1e-16, a_max=None))))**2, axis=-1))
                lsd_mean = np.mean(lsd_arr)
                lsd_std = np.std(lsd_arr)
                logging.info("lsd_trg: %.6f dB +- %.6f" % (lsd_mean, lsd_std))
                lsd_cvlist.append(lsd_mean)
                lsdstd_cvlist.append(lsd_std)
                spcidx_src = torch.LongTensor(spcidx).cuda()
                spcidx_trg = torch.LongTensor(spcidx_trg).cuda()
                trj_lat_src = np.array(torch.index_select(
                    lat_src[0], 0, spcidx_src).cpu().data.numpy(), dtype=np.float64)
                trj_lat_trg = np.array(torch.index_select(
                    lat_trg[0], 0, spcidx_trg).cpu().data.numpy(), dtype=np.float64)
                aligned_lat_srctrg, _, _, _ = dtw.dtw_org_to_trg(trj_lat_src, trj_lat_trg)
                lat_dist_srctrg = np.mean(np.sqrt(
                    np.mean((aligned_lat_srctrg - trj_lat_trg)**2, axis=0)))
                _, _, lat_cdist_srctrg, _ = dtw.dtw_org_to_trg(trj_lat_trg, trj_lat_src, mcd=0)
                aligned_lat_trgsrc, _, _, _ = dtw.dtw_org_to_trg(trj_lat_trg, trj_lat_src)
                lat_dist_trgsrc = np.mean(np.sqrt(
                    np.mean((aligned_lat_trgsrc - trj_lat_src)**2, axis=0)))
                _, _, lat_cdist_trgsrc, _ = dtw.dtw_org_to_trg(trj_lat_src, trj_lat_trg, mcd=0)
                logging.info("%lf %lf %lf %lf" % (lat_dist_srctrg, lat_cdist_srctrg,
                                                  lat_dist_trgsrc, lat_cdist_trgsrc))
                lat_dist_rmse = (lat_dist_srctrg + lat_dist_trgsrc) / 2
                lat_dist_cosim = (lat_cdist_srctrg + lat_cdist_trgsrc) / 2
                lat_dist_rmse_list.append(lat_dist_rmse)
                lat_dist_cosim_list.append(lat_dist_cosim)
                logging.info("lat_dist: %.6f %.6f" % (lat_dist_rmse, lat_dist_cosim))
            lsd_arr = np.sqrt(np.mean((20 * (np.log10(np.clip(melsp_cyc_rest[spcidx], a_min=1e-16, a_max=None))
                                             - np.log10(np.clip(melsp_rest[spcidx], a_min=1e-16, a_max=None))))**2, axis=-1))
            lsd_mean_cyc = np.mean(lsd_arr)
            lsd_std_cyc = np.std(lsd_arr)
            logging.info("lsd_cyc: %.6f dB +- %.6f" % (lsd_mean_cyc, lsd_std_cyc))
            lsd_cvlist_cyc.append(lsd_mean_cyc)
            lsdstd_cvlist_cyc.append(lsd_std_cyc)
            logging.info("synth anasyn")
            magsp = np.matmul(melfb_t, melsp_rest.T)
            logging.info(magsp.shape)
            hop_length = int((args.fs / 1000) * args.shiftms)
            win_length = int((args.fs / 1000) * args.winms)
            wav = np.clip(librosa.core.griffinlim(magsp, hop_length=hop_length,
                                                  win_length=win_length, window='hann'),
                          -1, 0.999969482421875)
            wavpath = os.path.join(args.outdir,
                                   os.path.basename(feat_file).replace(".h5", "_anasyn.wav"))
            logging.info(wavpath)
            sf.write(wavpath, wav, args.fs, 'PCM_16')
            #if trg_exist:
            #    logging.info("synth anasyn_trg")
            #    wav = np.clip(pw.synthesize(f0_trg, sp_trg, ap_trg, fs, frame_period=args.shiftms), -1, 1)
            #    wavpath = os.path.join(args.outdir, os.path.basename(feat_file).replace(".h5", "_anasyn_trg.wav"))
            #    sf.write(wavpath, wav, fs, 'PCM_16')
            #    logging.info(wavpath)
            logging.info("synth gf rec")
            recmagsp = np.matmul(melfb_t, melsp_src_rest.T)
            logging.info(recmagsp.shape)
            wav = np.clip(librosa.core.griffinlim(recmagsp, hop_length=hop_length,
                                                  win_length=win_length, window='hann'),
                          -1, 0.999969482421875)
            wavpath = os.path.join(args.outdir,
                                   os.path.basename(feat_file).replace(".h5", "_rec.wav"))
            logging.info(wavpath)
            sf.write(wavpath, wav, args.fs, 'PCM_16')
            logging.info("synth gf cv")
            cvmagsp = np.matmul(melfb_t, melsp_cv_rest.T)
            logging.info(cvmagsp.shape)
            wav = np.clip(librosa.core.griffinlim(cvmagsp, hop_length=hop_length,
                                                  win_length=win_length, window='hann'),
                          -1, 0.999969482421875)
            wavpath = os.path.join(args.outdir,
                                   os.path.basename(feat_file).replace(".h5", "_cv.wav"))
            logging.info(wavpath)
            sf.write(wavpath, wav, args.fs, 'PCM_16')
            logging.info('write to h5')
            outh5dir = os.path.join(os.path.dirname(os.path.dirname(feat_file)),
                                    spk_src + "-" + args.spk_trg)
            if not os.path.exists(outh5dir):
                os.makedirs(outh5dir)
            feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
            # cv
            write_path = args.string_path
            logging.info(feat_file + ' ' + write_path)
            logging.info(feat_cv.shape)
            write_hdf5(feat_file, write_path, feat_cv)
            #logging.info('write lat to h5')
            #logging.info(feat_file + ' ' + args.string_path + '_lat')
            #logging.info(feat_lat.shape)
            #write_hdf5(feat_file, args.string_path + '_lat', feat_lat)
            count += 1