Esempio n. 1
0
    def decode_RNN(wav_list, gpu, cvlist=None, cvlist_src=None,
                   mcd_cvlist_src=None, mcdstd_cvlist_src=None,
                   mcdpow_cvlist_src=None, mcdpowstd_cvlist_src=None):
        """Convert every waveform in ``wav_list`` with the encoder/decoder on ``gpu``.

        For each file the function analyzes the waveform (F0, spectral
        envelope, aperiodicity), encodes the resulting features, and decodes
        them twice: once with the source speaker code (reconstruction) and
        once with the target speaker code (conversion).  It then

        * writes the converted feature matrix to an HDF5 file under
          ``args.h5outdir`` at ``'/feat_cvmcep_cycvae-' + model_epoch``,
        * appends per-file statistics to the accumulator lists, and
        * writes two waveforms to ``args.outdir``: ``*_cv.wav`` (converted
          F0 and spectrum) and ``*_anasyn.wav`` (analysis-synthesis of the
          unmodified input).

        ``args``, ``config``, ``n_spk``, ``src_code_idx``, ``trg_code_idx``,
        ``model_epoch`` and the ``f0_range_*`` statistics are read from the
        enclosing scope.

        Args:
            wav_list: Iterable of waveform file paths to process.
            gpu: Device handed to ``torch.cuda.device``.
            cvlist: Receives the per-dimension variance of the converted
                mel-cepstrum (power coefficient excluded), one entry per file.
            cvlist_src: Same as ``cvlist`` for the reconstructed mel-cepstrum.
            mcd_cvlist_src: Receives the mean MCD (without power) between the
                input and its reconstruction.
            mcdstd_cvlist_src: Receives the matching standard deviation.
            mcdpow_cvlist_src: Receives the mean MCD including power.
            mcdpowstd_cvlist_src: Receives the matching standard deviation.

        Any accumulator left as ``None`` is replaced by a local list whose
        contents are discarded.  NOTE(review): callers presumably pass shared
        lists (e.g. from a multiprocessing manager) -- confirm.
        """
        # Every accumulator is appended to below, so a None default used to
        # fail with AttributeError after the first file had been decoded.
        cvlist = [] if cvlist is None else cvlist
        cvlist_src = [] if cvlist_src is None else cvlist_src
        mcd_cvlist_src = [] if mcd_cvlist_src is None else mcd_cvlist_src
        mcdstd_cvlist_src = [] if mcdstd_cvlist_src is None else mcdstd_cvlist_src
        mcdpow_cvlist_src = [] if mcdpow_cvlist_src is None else mcdpow_cvlist_src
        mcdpowstd_cvlist_src = \
            [] if mcdpowstd_cvlist_src is None else mcdpowstd_cvlist_src

        # Loop-invariant rate handed to the low-pass filter (reciprocal of the
        # frame shift expressed in seconds).
        frame_rate = int(1.0 / (args.shiftms * 0.001))

        def _f0_features(f0_seq):
            """Return ``(uv, log cont. F0)`` as column vectors, logging their shapes."""
            uv_flags, cont_f0 = convert_continuos_f0(np.array(f0_seq))
            uv_flags = np.expand_dims(uv_flags, axis=-1)
            logging.info(uv_flags.shape)
            cont_f0_lpf = low_pass_filter(cont_f0, frame_rate, cutoff=LP_CUTOFF)
            log_cont_f0 = np.expand_dims(np.log(cont_f0_lpf), axis=-1)
            logging.info(log_cont_f0.shape)
            return uv_flags, log_cont_f0

        def _mcd_stats(ref, est):
            """Return the mean and standard deviation of the frame-wise MCD."""
            _, mcd_frames = dtw.calc_mcd(np.array(ref, dtype=np.float64),
                                         np.array(est, dtype=np.float64))
            return np.mean(mcd_frames), np.std(mcd_frames)

        with torch.cuda.device(gpu):
            mean_trg = torch.FloatTensor(
                read_hdf5(args.stats_jnt,
                          "/mean_feat_org_lf0")[config.stdim:]).cuda()
            std_trg = torch.FloatTensor(
                read_hdf5(args.stats_jnt,
                          "/scale_feat_org_lf0")[config.stdim:]).cuda()
            # define model and load parameters
            logging.info(config)
            logging.info("model")
            with torch.no_grad():
                model_encoder = GRU_RNN_STOCHASTIC(
                    in_dim=config.in_dim,
                    out_dim=config.lat_dim,
                    hidden_layers=config.hidden_layers,
                    hidden_units=config.hidden_units,
                    kernel_size=config.kernel_size_enc,
                    dilation_size=config.dilation_size_enc,
                    arparam=config.arparam,
                    spk_dim=n_spk,
                    causal_conv=config.causal_conv,
                    scale_out_flag=False)
                model_decoder = GRU_RNN(in_dim=config.lat_dim + n_spk,
                                        out_dim=config.out_dim,
                                        hidden_layers=config.hidden_layers,
                                        hidden_units=config.hidden_units,
                                        kernel_size=config.kernel_size_dec,
                                        dilation_size=config.dilation_size_dec,
                                        causal_conv=config.causal_conv,
                                        scale_in_flag=False)
                logging.info(model_encoder)
                logging.info(model_decoder)
                # Deserialize the checkpoint once instead of once per model,
                # and onto the CPU so that no memory is claimed on the device
                # the checkpoint was saved from; .cuda() below moves the
                # parameters to the selected GPU.
                checkpoint = torch.load(args.model, map_location="cpu")
                model_encoder.load_state_dict(checkpoint["model_encoder"])
                model_decoder.load_state_dict(checkpoint["model_decoder"])
                del checkpoint
                for model in (model_encoder, model_decoder):
                    model.cuda()
                    model.eval()
                    for param in model.parameters():
                        param.requires_grad = False
                if config.arparam:
                    init_pp = np.zeros((1, 1, config.lat_dim * 2 + n_spk))
                else:
                    init_pp = np.zeros((1, 1, config.lat_dim + n_spk))
                y_in_pp = torch.FloatTensor(init_pp).cuda()
                # Initial decoder input: an all-zero frame normalized with the
                # target statistics; shared by both decoding passes.
                y_in_src = y_in_trg = torch.unsqueeze(
                    torch.unsqueeze((0 - mean_trg) / std_trg, 0), 0)
            fft_size = args.fftl
            mcep_dim = model_decoder.out_dim - 1
            for wav_file in wav_list:
                # convert mcep
                utt_name = os.path.basename(wav_file)
                feat_file = os.path.join(args.h5outdir,
                                         utt_name.replace(".wav", ".h5"))
                logging.info("cvmcep " + feat_file + " " + wav_file)

                # The sampling rate reported by the file itself is the one
                # used for analysis and synthesis.
                fs, x = read_wav(wav_file, cutoff=70)

                _, f0, sp, ap = analyze_range(x, fs=fs, minf0=args.minf0,
                                              maxf0=args.maxf0,
                                              fperiod=args.shiftms,
                                              fftl=args.fftl)
                logging.info(sp.shape)

                mcep = ps.sp2mc(sp, mcep_dim, args.mcep_alpha)
                logging.info(mcep.shape)
                codeap = pw.code_aperiodicity(ap, fs)
                logging.info(codeap.shape)

                npow = spc2npow(sp)
                logging.info(npow.shape)
                _, spcidx = extfrm(mcep, npow, power_threshold=args.pow)
                spcidx = spcidx[0]
                logging.info(spcidx.shape)

                uv, logcontf0 = _f0_features(f0)
                feat = np.c_[uv, logcontf0, codeap, mcep]
                logging.info(feat.shape)

                logging.info("generate")
                with torch.no_grad():
                    lat_feat_src, _, _, _, _ = model_encoder(
                        torch.FloatTensor(feat).cuda(), y_in_pp, sampling=False)

                    n_frames = lat_feat_src.shape[0]

                    # One-hot speaker codes, repeated for every frame.
                    src_code = np.zeros((n_frames, n_spk))
                    src_code[:, src_code_idx] = 1
                    src_code = torch.FloatTensor(src_code).cuda()

                    trg_code = np.zeros((n_frames, n_spk))
                    trg_code[:, trg_code_idx] = 1
                    trg_code = torch.FloatTensor(trg_code).cuda()

                    # Reconstruction: decode with the source speaker code.
                    cvmcep_src, _, _ = model_decoder(
                        torch.cat((src_code, lat_feat_src), 1), y_in_src)
                    cvmcep_src = np.array(cvmcep_src.detach().cpu().numpy(),
                                          dtype=np.float64)

                    # Conversion: decode with the target speaker code.
                    cvmcep, _, _ = model_decoder(
                        torch.cat((trg_code, lat_feat_src), 1), y_in_trg)
                    cvmcep = np.array(cvmcep.detach().cpu().numpy(),
                                      dtype=np.float64)

                logging.info(lat_feat_src.shape)
                logging.info(cvmcep_src.shape)
                logging.info(cvmcep.shape)

                cvf0 = convert_f0(f0, f0_range_mean_src, f0_range_std_src,
                                  f0_range_mean_trg, f0_range_std_trg)
                uv_cv, logcontf0_cv = _f0_features(cvf0)
                feat_cv = np.c_[uv_cv, logcontf0_cv, codeap]
                logging.info(feat_cv.shape)

                feat_cvmcep = np.c_[feat_cv, cvmcep]
                logging.info(feat_cvmcep.shape)
                write_path = '/feat_cvmcep_cycvae-' + model_epoch
                logging.info(feat_file + ' ' + write_path)
                write_hdf5(feat_file, write_path, feat_cvmcep)
                cvlist.append(np.var(cvmcep[:, 1:], axis=0))

                # Distortion between the input and its reconstruction,
                # restricted to the frames selected by extfrm; computed with
                # and without the first (power) coefficient.
                frames = np.asarray(spcidx)
                mcdpow_mean, mcdpow_std = _mcd_stats(mcep[frames, :],
                                                     cvmcep_src[frames, :])
                mcd_mean, mcd_std = _mcd_stats(mcep[frames, 1:],
                                               cvmcep_src[frames, 1:])
                logging.info("mcdpow_src_cv: %.6f dB +- %.6f" %
                             (mcdpow_mean, mcdpow_std))
                logging.info("mcd_src_cv: %.6f dB +- %.6f" %
                             (mcd_mean, mcd_std))
                mcdpow_cvlist_src.append(mcdpow_mean)
                mcdpowstd_cvlist_src.append(mcdpow_std)
                mcd_cvlist_src.append(mcd_mean)
                mcdstd_cvlist_src.append(mcd_std)
                cvlist_src.append(np.var(cvmcep_src[:, 1:], axis=0))

                logging.info("synth voco")
                cvsp = ps.mc2sp(np.ascontiguousarray(cvmcep), args.mcep_alpha,
                                fft_size)
                logging.info(cvsp.shape)
                wav = np.clip(
                    pw.synthesize(cvf0,
                                  cvsp,
                                  ap,
                                  fs,
                                  frame_period=args.shiftms), -1, 1)
                wavpath = os.path.join(args.outdir,
                                       utt_name.replace(".wav", "_cv.wav"))
                sf.write(wavpath, wav, fs, 'PCM_16')
                logging.info(wavpath)

                logging.info("synth anasyn")
                wav = np.clip(
                    pw.synthesize(f0, sp, ap, fs, frame_period=args.shiftms),
                    -1, 1)
                wavpath = os.path.join(args.outdir,
                                       utt_name.replace(".wav", "_anasyn.wav"))
                sf.write(wavpath, wav, fs, 'PCM_16')
                logging.info(wavpath)
Esempio n. 2
0
    def decode_RNN(feat_list, gpu, cvlist=None,
            mcd_cvlist_src=None, mcdstd_cvlist_src=None, mcdpow_cvlist_src=None, mcdpowstd_cvlist_src=None,\
            mcd_cvlist_cyc=None, mcdstd_cvlist_cyc=None, mcdpow_cvlist_cyc=None, mcdpowstd_cvlist_cyc=None,\
            mcd_cvlist=None, mcdstd_cvlist=None, mcdpow_cvlist=None, mcdpowstd_cvlist=None, \
            lat_dist_rmse_list=None, lat_dist_cosim_list=None):
        with torch.cuda.device(gpu):
            # define model and load parameters
            with torch.no_grad():
                model_encoder = GRU_VAE_ENCODER(
                    in_dim=config.mcep_dim+config.excit_dim,
                    n_spk=n_spk,
                    lat_dim=config.lat_dim,
                    hidden_layers=config.hidden_layers_enc,
                    hidden_units=config.hidden_units_enc,
                    kernel_size=config.kernel_size_enc,
                    dilation_size=config.dilation_size_enc,
                    causal_conv=config.causal_conv_enc,
                    bi=False,
                    ar=False,
                    pad_first=True,
                    right_size=config.right_size_enc)
                logging.info(model_encoder)
                model_decoder = GRU_SPEC_DECODER(
                    feat_dim=config.lat_dim,
                    out_dim=config.mcep_dim,
                    n_spk=n_spk,
                    hidden_layers=config.hidden_layers_dec,
                    hidden_units=config.hidden_units_dec,
                    kernel_size=config.kernel_size_dec,
                    dilation_size=config.dilation_size_dec,
                    causal_conv=config.causal_conv_dec,
                    bi=False,
                    ar=False,
                    pad_first=True,
                    right_size=config.right_size_dec)
                logging.info(model_decoder)
                model_post = GRU_POST_NET(
                    spec_dim=config.mcep_dim,
                    excit_dim=2,
                    n_spk=n_spk,
                    hidden_layers=config.hidden_layers_post,
                    hidden_units=config.hidden_units_post,
                    kernel_size=config.kernel_size_post,
                    dilation_size=config.dilation_size_post,
                    causal_conv=config.causal_conv_post,
                    pad_first=True,
                    right_size=config.right_size_post)
                    #excit_dim=config.excit_dim,
                    #excit_dim=None,
                logging.info(model_post)
                model_encoder.load_state_dict(torch.load(args.model)["model_encoder"])
                model_decoder.load_state_dict(torch.load(args.model)["model_decoder"])
                model_post.load_state_dict(torch.load(args.model)["model_post"])
                model_encoder.remove_weight_norm()
                model_decoder.remove_weight_norm()
                model_post.remove_weight_norm()
                model_encoder.cuda()
                model_decoder.cuda()
                model_post.cuda()
                model_encoder.eval()
                model_decoder.eval()
                model_post.eval()
                for param in model_encoder.parameters():
                    param.requires_grad = False
                for param in model_decoder.parameters():
                    param.requires_grad = False
                for param in model_post.parameters():
                    param.requires_grad = False
            count = 0
            pad_left = (model_encoder.pad_left + model_decoder.pad_left + model_post.pad_left)*2
            pad_right = (model_encoder.pad_right + model_decoder.pad_right + model_post.pad_right)*2
            outpad_lefts = [None]*5
            outpad_rights = [None]*5
            outpad_lefts[0] = pad_left-model_encoder.pad_left
            outpad_rights[0] = pad_right-model_encoder.pad_right
            outpad_lefts[1] = outpad_lefts[0]-model_decoder.pad_left
            outpad_rights[1] = outpad_rights[0]-model_decoder.pad_right
            outpad_lefts[2] = outpad_lefts[1]-model_post.pad_left
            outpad_rights[2] = outpad_rights[1]-model_post.pad_right
            outpad_lefts[3] = outpad_lefts[2]-model_encoder.pad_left
            outpad_rights[3] = outpad_rights[2]-model_encoder.pad_right
            outpad_lefts[4] = outpad_lefts[3]-model_decoder.pad_left
            outpad_rights[4] = outpad_rights[3]-model_decoder.pad_right
            logging.info(f'{pad_left} {pad_right}')
            logging.info(outpad_lefts)
            logging.info(outpad_rights)
            for feat_file in feat_list:
                # convert mcep
                spk_src = os.path.basename(os.path.dirname(feat_file))
                src_idx = spk_list.index(spk_src)
                logging.info('%s --> %s' % (spk_src, args.spk_trg))

                file_trg = os.path.join(os.path.dirname(os.path.dirname(feat_file)), args.spk_trg, os.path.basename(feat_file))
                trg_exist = False
                if os.path.exists(file_trg):
                    logging.info('exist: %s' % (file_trg))
                    feat_trg = read_hdf5(file_trg, config.string_path)
                    mcep_trg = feat_trg[:,-config.mcep_dim:]
                    logging.info(mcep_trg.shape)
                    trg_exist = True

                feat_org = read_hdf5(feat_file, config.string_path)
                mcep = np.array(feat_org[:,-config.mcep_dim:])
                codeap = np.array(np.rint(feat_org[:,2:3])*(-np.exp(feat_org[:,3:config.excit_dim])))
                sp = np.array(ps.mc2sp(mcep, args.mcep_alpha, args.fftl))
                ap = pw.decode_aperiodicity(codeap, args.fs, args.fftl)
                feat_cvf0_lin = np.expand_dims(convert_f0(np.exp(feat_org[:,1]), src_f0_mean, src_f0_std, trg_f0_mean, trg_f0_std), axis=-1)
                feat_cv = np.c_[feat_org[:,:1], np.log(feat_cvf0_lin), feat_org[:,2:config.excit_dim]]

                logging.info("generate")
                with torch.no_grad():
                    feat = F.pad(torch.FloatTensor(feat_org).cuda().unsqueeze(0).transpose(1,2), (pad_left,pad_right), "replicate").transpose(1,2)
                    feat_excit = torch.FloatTensor(feat_org[:,:config.excit_dim]).cuda().unsqueeze(0)
                    feat_excit_cv = torch.FloatTensor(feat_cv).cuda().unsqueeze(0)

                    spk_logits, _, lat_src, _ = model_encoder(feat, sampling=False)
                    logging.info('input spkpost')
                    if outpad_rights[0] > 0:
                        logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[0]:-outpad_rights[0]], dim=-1), 1))
                    else:
                        logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[0]:], dim=-1), 1))

                    if trg_exist:
                        spk_trg_logits, _, lat_trg, _ = model_encoder(F.pad(torch.FloatTensor(feat_trg).cuda().unsqueeze(0).transpose(1,2), \
                                                            (model_encoder.pad_left,model_encoder.pad_right), "replicate").transpose(1,2), sampling=False)
                        logging.info('target spkpost')
                        logging.info(torch.mean(F.softmax(spk_trg_logits, dim=-1), 1))

                    cvmcep_src, _ = model_decoder((torch.ones((1, lat_src.shape[1]))*src_idx).cuda().long(), lat_src)
                    cvmcep_src_post, _ = model_post(cvmcep_src, y=(torch.ones((1, cvmcep_src.shape[1]))*src_idx).cuda().long(),
                                        e=F.pad(feat_excit[:,:,:2].transpose(1,2), (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2))
                                        #e=F.pad(feat_excit.transpose(1,2), (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2))
                    if model_post.pad_right > 0:
                        spk_logits, _, lat_rec, _ = model_encoder(torch.cat((F.pad(feat_excit.transpose(1,2), \
                                            (outpad_lefts[2],outpad_rights[2]), "replicate").transpose(1,2), cvmcep_src[:,model_post.pad_left:-model_post.pad_right]), 2), 
                                                            sampling=False)
                    else:
                        spk_logits, _, lat_rec, _ = model_encoder(torch.cat((F.pad(feat_excit.transpose(1,2), \
                                            (outpad_lefts[2],outpad_rights[2]), "replicate").transpose(1,2), cvmcep_src[:,model_post.pad_left:]), 2), 
                                                            sampling=False)
                    logging.info('rec spkpost')
                    if outpad_rights[3] > 0:
                        logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[3]:-outpad_rights[3]], dim=-1), 1))
                    else:
                        logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[3]:], dim=-1), 1))

                    cvmcep, _ = model_decoder((torch.ones((1, lat_src.shape[1]))*trg_idx).cuda().long(), lat_src)
                    cvmcep_post, _ = model_post(cvmcep, y=(torch.ones((1, cvmcep.shape[1]))*trg_idx).cuda().long(),
                                        e=F.pad(feat_excit_cv[:,:,:2].transpose(1,2), (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2))
                                        #e=F.pad(feat_excit_cv.transpose(1,2), (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2))
                    if model_post.pad_right > 0:
                        spk_logits, _, lat_cv, _ = model_encoder(torch.cat((F.pad(feat_excit_cv.transpose(1,2), \
                                            (outpad_lefts[2],outpad_rights[2]), "replicate").transpose(1,2), cvmcep[:,model_post.pad_left:-model_post.pad_right]), 2), 
                                                            sampling=False)
                    else:
                        spk_logits, _, lat_cv, _ = model_encoder(torch.cat((F.pad(feat_excit_cv.transpose(1,2), \
                                            (outpad_lefts[2],outpad_rights[2]), "replicate").transpose(1,2), cvmcep[:,model_post.pad_left:]), 2), 
                                                            sampling=False)
                    logging.info('cv spkpost')
                    if outpad_rights[3] > 0:
                        logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[3]:-outpad_rights[3]], dim=-1), 1))
                    else:
                        logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[3]:], dim=-1), 1))

                    cvmcep_cyc, _ = model_decoder((torch.ones((1, lat_cv.shape[1]))*src_idx).cuda().long(), lat_cv)
                    cvmcep_cyc_post, _ = model_post(cvmcep_cyc, y=(torch.ones((1, cvmcep_cyc.shape[1]))*src_idx).cuda().long(),
                                        e=F.pad(feat_excit[:,:,:2].transpose(1,2), (outpad_lefts[4],outpad_rights[4]), "replicate").transpose(1,2))
                                        #e=F.pad(feat_excit.transpose(1,2), (outpad_lefts[4],outpad_rights[4]), "replicate").transpose(1,2))

                    if outpad_rights[2] > 0:
                        cvmcep_src = np.array(cvmcep_src_post[0,outpad_lefts[2]:-outpad_rights[2]].cpu().data.numpy(), dtype=np.float64)
                        cvmcep = np.array(cvmcep_post[0,outpad_lefts[2]:-outpad_rights[2]].cpu().data.numpy(), dtype=np.float64)
                    else:
                        cvmcep_src = np.array(cvmcep_src_post[0,outpad_lefts[2]:].cpu().data.numpy(), dtype=np.float64)
                        cvmcep = np.array(cvmcep_post[0,outpad_lefts[2]:].cpu().data.numpy(), dtype=np.float64)
                    cvmcep_cyc = np.array(cvmcep_cyc_post[0].cpu().data.numpy(), dtype=np.float64)

                    if trg_exist:
                        if outpad_rights[0] > 0:
                            lat_src = lat_src[:,outpad_lefts[0]:-outpad_rights[0]]
                        else:
                            lat_src = lat_src[:,outpad_lefts[0]:]

                logging.info(cvmcep_src.shape)
                logging.info(cvmcep.shape)
                logging.info(cvmcep_cyc.shape)

                if trg_exist:
                    logging.info(lat_src.shape)
                    logging.info(lat_trg.shape)
 
                cvlist.append(np.var(cvmcep[:,1:], axis=0))

                logging.info("cvf0lin")
                f0_range = read_hdf5(feat_file, "/f0_range")
                cvf0_range_lin = convert_f0(f0_range, src_f0_mean, src_f0_std, trg_f0_mean, trg_f0_std)
                uv_range_lin, cont_f0_range_lin = convert_continuos_f0(np.array(cvf0_range_lin))
                unique, counts = np.unique(uv_range_lin, return_counts=True)
                logging.info(dict(zip(unique, counts)))
                cont_f0_lpf_range_lin = \
                    low_pass_filter(cont_f0_range_lin, int(1.0 / (args.shiftms * 0.001)), cutoff=20)
                uv_range_lin = np.expand_dims(uv_range_lin, axis=-1)
                cont_f0_lpf_range_lin = np.expand_dims(cont_f0_lpf_range_lin, axis=-1)
                # plain converted feat for neural vocoder
                feat_cv = np.c_[uv_range_lin, np.log(cont_f0_lpf_range_lin), feat_cv[:,2:config.excit_dim], cvmcep]
                logging.info(feat_cv.shape)

                logging.info("mcd acc")
                spcidx = np.array(read_hdf5(feat_file, "/spcidx_range")[0])
                _, mcdpow_arr = dtw.calc_mcd(np.array(mcep[spcidx], dtype=np.float64), np.array(cvmcep_src[spcidx], dtype=np.float64))
                _, mcd_arr = dtw.calc_mcd(np.array(mcep[spcidx,1:], dtype=np.float64), np.array(cvmcep_src[spcidx,1:], dtype=np.float64))
                mcdpow_mean = np.mean(mcdpow_arr)
                mcdpow_std = np.std(mcdpow_arr)
                mcd_mean = np.mean(mcd_arr)
                mcd_std = np.std(mcd_arr)
                logging.info("mcdpow_src_cv: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std))
                logging.info("mcd_src_cv: %.6f dB +- %.6f" % (mcd_mean, mcd_std))
                mcdpow_cvlist_src.append(mcdpow_mean)
                mcdpowstd_cvlist_src.append(mcdpow_std)
                mcd_cvlist_src.append(mcd_mean)
                mcdstd_cvlist_src.append(mcd_std)
                if trg_exist:
                    spcidx_trg = np.array(read_hdf5(file_trg, "/spcidx_range")[0])
                    _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(np.array(cvmcep[spcidx], \
                                                dtype=np.float64), np.array(mcep_trg[spcidx_trg], dtype=np.float64))
                    _, _, _, mcd_arr = dtw.dtw_org_to_trg(np.array(cvmcep[spcidx,1:], \
                                                dtype=np.float64), np.array(mcep_trg[spcidx_trg,1:], dtype=np.float64))
                    mcdpow_mean = np.mean(mcdpow_arr)
                    mcdpow_std = np.std(mcdpow_arr)
                    mcd_mean = np.mean(mcd_arr)
                    mcd_std = np.std(mcd_arr)
                    logging.info("mcdpow_trg: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std))
                    logging.info("mcd_trg: %.6f dB +- %.6f" % (mcd_mean, mcd_std))
                    mcdpow_cvlist.append(mcdpow_mean)
                    mcdpowstd_cvlist.append(mcdpow_std)
                    mcd_cvlist.append(mcd_mean)
                    mcdstd_cvlist.append(mcd_std)
                    spcidx_src = torch.LongTensor(spcidx).cuda()
                    spcidx_trg = torch.LongTensor(spcidx_trg).cuda()
                    trj_lat_src = np.array(torch.index_select(lat_src[0],0,spcidx_src).cpu().data.numpy(), dtype=np.float64)
                    trj_lat_trg = np.array(torch.index_select(lat_trg[0],0,spcidx_trg).cpu().data.numpy(), dtype=np.float64)
                    aligned_lat_srctrg, _, _, _ = dtw.dtw_org_to_trg(trj_lat_src, trj_lat_trg)
                    lat_dist_srctrg = np.mean(np.sqrt(np.mean((aligned_lat_srctrg-trj_lat_trg)**2, axis=0)))
                    _, _, lat_cdist_srctrg, _ = dtw.dtw_org_to_trg(trj_lat_trg, trj_lat_src, mcd=0)
                    aligned_lat_trgsrc, _, _, _ = dtw.dtw_org_to_trg(trj_lat_trg, trj_lat_src)
                    lat_dist_trgsrc = np.mean(np.sqrt(np.mean((aligned_lat_trgsrc-trj_lat_src)**2, axis=0)))
                    _, _, lat_cdist_trgsrc, _ = dtw.dtw_org_to_trg(trj_lat_src, trj_lat_trg, mcd=0)
                    logging.info("%lf %lf %lf %lf" % (lat_dist_srctrg, lat_cdist_srctrg, lat_dist_trgsrc, lat_cdist_trgsrc))
                    lat_dist_rmse = (lat_dist_srctrg+lat_dist_trgsrc)/2
                    lat_dist_cosim = (lat_cdist_srctrg+lat_cdist_trgsrc)/2
                    lat_dist_rmse_list.append(lat_dist_rmse)
                    lat_dist_cosim_list.append(lat_dist_cosim)
                    logging.info("lat_dist: %.6f %.6f" % (lat_dist_rmse, lat_dist_cosim))
                _, mcdpow_arr = dtw.calc_mcd(np.array(mcep[spcidx], dtype=np.float64), np.array(cvmcep_cyc[spcidx], dtype=np.float64))
                _, mcd_arr = dtw.calc_mcd(np.array(mcep[spcidx,1:], dtype=np.float64), np.array(cvmcep_cyc[spcidx,1:], dtype=np.float64))
                mcdpow_mean = np.mean(mcdpow_arr)
                mcdpow_std = np.std(mcdpow_arr)
                mcd_mean = np.mean(mcd_arr)
                mcd_std = np.std(mcd_arr)
                logging.info("mcdpow_cyc_cv: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std))
                logging.info("mcd_cyc_cv: %.6f dB +- %.6f" % (mcd_mean, mcd_std))
                mcdpow_cvlist_cyc.append(mcdpow_mean)
                mcdpowstd_cvlist_cyc.append(mcdpow_std)
                mcd_cvlist_cyc.append(mcd_mean)
                mcdstd_cvlist_cyc.append(mcd_std)

                logging.info("synth anasyn")
                wav = np.clip(pw.synthesize(f0_range, sp, ap, args.fs, frame_period=args.shiftms), -1, 1)
                wavpath = os.path.join(args.outdir,os.path.basename(feat_file).replace(".h5","_anasyn.wav"))
                sf.write(wavpath, wav, args.fs, 'PCM_16')
                logging.info(wavpath)

                logging.info("synth voco rec")
                cvsp_src = ps.mc2sp(cvmcep_src, args.mcep_alpha, args.fftl)
                logging.info(cvsp_src.shape)
                wav = np.clip(pw.synthesize(f0_range, cvsp_src, ap, args.fs, frame_period=args.shiftms), -1, 1)
                wavpath = os.path.join(args.outdir, os.path.basename(feat_file).replace(".h5", "_rec.wav"))
                sf.write(wavpath, wav, args.fs, 'PCM_16')
                logging.info(wavpath)

                logging.info("synth voco cv")
                cvsp = ps.mc2sp(cvmcep, args.mcep_alpha, args.fftl)
                logging.info(cvsp.shape)
                wav = np.clip(pw.synthesize(cvf0_range_lin, cvsp, ap, args.fs, frame_period=args.shiftms), -1, 1)
                wavpath = os.path.join(args.outdir, os.path.basename(feat_file).replace(".h5", "_cv.wav"))
                sf.write(wavpath, wav, args.fs, 'PCM_16')
                logging.info(wavpath)

                logging.info("synth voco cv GV")
                datamean = np.mean(cvmcep[:,1:], axis=0)
                cvmcep_gv =  np.c_[cvmcep[:,0], args.gv_coeff*(np.sqrt(gv_mean_trg/cvgv_mean) * \
                                    (cvmcep[:,1:]-datamean) + datamean) + (1-args.gv_coeff)*cvmcep[:,1:]]
                cvmcep_gv = mod_pow(cvmcep_gv, cvmcep, alpha=args.mcep_alpha, irlen=IRLEN)
                cvsp_gv = ps.mc2sp(cvmcep_gv, args.mcep_alpha, args.fftl)
                logging.info(cvsp_gv.shape)
                wav = np.clip(pw.synthesize(cvf0_range_lin, cvsp_gv, ap, args.fs, frame_period=args.shiftms), -1, 1)
                wavpath = os.path.join(args.outdir, os.path.basename(feat_file).replace(".h5", "_cvGV.wav"))
                sf.write(wavpath, wav, args.fs, 'PCM_16')
                logging.info(wavpath)

                #logging.info("synth diffGV")
                #shiftl = int(args.fs/1000*args.shiftms)
                #mc_cv_diff = cvmcep_gv-mcep
                #b = np.apply_along_axis(ps.mc2b, 1, mc_cv_diff, args.mcep_alpha)
                #logging.info(b.shape)
                #assert np.isfinite(b).all
                #mlsa_fil = ps.synthesis.Synthesizer(MLSADF(mcep_dim, alpha=args.mcep_alpha), shiftl)
                #x, fs_ = sf.read(os.path.join(os.path.dirname(feat_file).replace("hdf5", "wav_filtered"), os.path.basename(feat_file).replace(".h5", ".wav")))
                #assert(fs_ == args.fs)
                #wav = mlsa_fil.synthesis(x, b)
                #wav = np.clip(wav, -1, 1)
                #wavpath = os.path.join(args.outdir, os.path.basename(feat_file).replace(".h5", "_DiffGV.wav"))
                #sf.write(wavpath, wav, args.fs, 'PCM_16')
                #logging.info(wavpath)

                #logging.info("synth diffGVF0")
                #time_axis = read_hdf5(feat_file, "/time_axis")
                #sp_diff = pw.cheaptrick(wav, f0_range, time_axis, args.fs, fft_size=args.fftl)
                #logging.info(sp_diff.shape)
                #ap_diff = pw.d4c(wav, f0_range, time_axis, args.fs, fft_size=args.fftl)
                #logging.info(ap_diff.shape)
                #wav = pw.synthesize(cvf0_range_lin, sp_diff, ap_diff, args.fs, frame_period=args.shiftms)
                #wav = np.clip(wav, -1, 1)
                #wavpath = os.path.join(args.outdir,os.path.basename(feat_file).replace(".h5", "_DiffGVF0.wav"))
                #sf.write(wavpath, wav, args.fs, 'PCM_16')
                #logging.info(wavpath)

                #logging.info("analysis diffGVF0")
                #sp_diff_anasyn = pw.cheaptrick(wav, cvf0_range_lin, time_axis, args.fs, fft_size=args.fftl)
                #logging.info(sp_diff_anasyn.shape)
                #mc_cv_diff_anasyn = ps.sp2mc(sp_diff_anasyn, mcep_dim, args.mcep_alpha)
                #ap_diff_anasyn = pw.d4c(wav, cvf0_range_lin, time_axis, args.fs, fft_size=args.fftl)
                #code_ap_diff_anasyn = pw.code_aperiodicity(ap_diff_anasyn, args.fs)
                ## convert to continouos codeap with uv
                #for i in range(code_ap_diff_anasyn.shape[-1]):
                #    logging.info('codeap: %d' % (i+1))
                #    uv_codeap_i, cont_codeap_i = convert_continuos_codeap(np.array(code_ap_diff_anasyn[:,i]))
                #    cont_codeap_i = np.log(-np.clip(cont_codeap_i, a_min=np.amin(cont_codeap_i), a_max=MAX_CODEAP))
                #    if i > 0:
                #        cont_codeap = np.c_[cont_codeap, np.expand_dims(cont_codeap_i, axis=-1)]
                #    else:
                #        uv_codeap = np.expand_dims(uv_codeap_i, axis=-1)
                #        cont_codeap = np.expand_dims(cont_codeap_i, axis=-1)
                #    uv_codeap_i = np.expand_dims(uv_codeap_i, axis=-1)
                #    unique, counts = np.unique(uv_codeap_i, return_counts=True)
                #    logging.info(dict(zip(unique, counts)))
                ## postprocessed converted feat for neural vocoder
                #feat_diffgv_anasyn = np.c_[feat_cv[:,:2], uv_codeap, cont_codeap, mc_cv_diff_anasyn]

                #logging.info("write lat")
                #outTxtDir = os.path.join(args.outdir, os.path.basename(os.path.dirname(feat_file)))
                #if not os.path.exists(outTxtDir):
                #    os.mkdir(outTxtDir)
                #outTxt = os.path.join(outTxtDir, os.path.basename(feat_file).replace(".wav", ".txt"))
                #logging.info(outTxt)
                #g = open(outTxt, "wt")
                #idx_frm = 0 
                #nfrm = trj_lat_src.shape[0]
                #dim = trj_lat_src.shape[1]
                #if not args.time_flag:
                ##if True:
                #    while idx_frm < nfrm:
                #        idx_elmt = 1 
                #        for elmt in trj_lat_src[idx_frm]:
                #            if idx_elmt < dim:
                #                g.write("%lf " % (elmt))
                #            else:
                #                g.write("%lf\n" % (elmt))
                #            idx_elmt += 1
                #        idx_frm += 1
                #else:
                #    while idx_frm < nfrm:
                #        idx_elmt = 1 
                #        for elmt in trj_lat_src[idx_frm]:
                #            if idx_elmt < dim:
                #                if idx_elmt > 1:
                #                    g.write("%lf " % (elmt))
                #                else:
                #                    g.write("%lf %lf " % (time_axis[idx_frm], elmt))
                #            else:
                #                g.write("%lf\n" % (elmt))
                #            idx_elmt += 1
                #        idx_frm += 1
                #g.close()

                logging.info('write to h5')
                outh5dir = os.path.join(os.path.dirname(os.path.dirname(feat_file)), spk_src+"-"+args.spk_trg)
                if not os.path.exists(outh5dir):
                    os.makedirs(outh5dir)
                feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
                # cv
                write_path = args.string_path
                logging.info(feat_file + ' ' + write_path)
                logging.info(feat_cv.shape)
                write_hdf5(feat_file, write_path, feat_cv)
                ## diffGVF0
                #write_path = args.string_path+"_diffgvf0"
                #logging.info(feat_file + ' ' + write_path)
                #logging.info(feat_diffgv_anasyn.shape)
                #write_hdf5(feat_file, write_path, feat_diffgv_anasyn)

                count += 1
Esempio n. 3
0
    def gpu_decode(feat_list, gpu, cvlist=None, mcdpow_cvlist=None, mcdpowstd_cvlist=None, mcd_cvlist=None,
                   mcdstd_cvlist=None, cvlist_dv=None, mcdpow_cvlist_dv=None, mcdpowstd_cvlist_dv=None,
                   mcd_cvlist_dv=None, mcdstd_cvlist_dv=None):
        """Cyclically reconstruct the mcep of every file in `feat_list` on one GPU.

        For each feature file the function:
          1. converts the F0 contour with `convert_f0` and rebuilds the
             U/V + log-F0 columns of the feature matrix,
          2. encodes the original features and decodes them with the
             `src_code_idx` speaker code,
          3. re-encodes the converted features and decodes them with the
             `trg_code_idx` speaker code,
          4. measures the distortion between the file's own mcep and the
             cyclically reconstructed mcep with `dtw.calc_mcd`,
          5. writes the reconstructed features to
             `<parent>/<spk_trg>-<spk_src>-<spk_trg>/<basename>` under the
             HDF5 path `/feat_recmcep_cycvae-<model_epoch>`.

        Args:
            feat_list: iterable of HDF5 feature file paths. Each file is read
                at `/feat_org_lf0`, `/f0_range` and `/spcidx_range`.
            gpu: CUDA device handed to `torch.cuda.device`.
            cvlist, mcdpow_cvlist, mcdpowstd_cvlist, mcd_cvlist, mcdstd_cvlist:
                accumulators that receive one entry per file whose dataset
                token contains 'tr'. They only need to support `.append`.
            cvlist_dv, mcdpow_cvlist_dv, mcdpowstd_cvlist_dv, mcd_cvlist_dv,
            mcdstd_cvlist_dv: the same accumulators for files whose dataset
                token contains 'dv'.
                The `None` defaults are placeholders only: the matching
                accumulators must be supplied if such files are present.

        Raises:
            ValueError: if the low-pass filtered F0 contour of a file holds
                non-positive samples but no positive sample to repair them
                from.
        """

        def _fill_nonpositive(contour):
            """Overwrite, in place, every non-positive sample of `contour`.

            Samples are visited front to back, so an already repaired sample
            acts as the left neighbour of the next one. A sample with positive
            neighbours on both sides becomes their mean; when only one side
            has a positive sample that value is copied.
            """
            length = len(contour)
            for i in range(length):
                if contour[i] <= 0:
                    # Reset on every sample so a failed search can never fall
                    # back on a value left over from an earlier frame or file.
                    left_val = right_val = None
                    for j in range(i - 1, -1, -1):
                        if contour[j] > 0:
                            left_val = contour[j]
                            break
                    for j in range(i + 1, length):
                        if contour[j] > 0:
                            right_val = contour[j]
                            break
                    if left_val is None and right_val is None:
                        raise ValueError(
                            "F0 contour has no positive sample to repair from")
                    if left_val is None:
                        contour[i] = right_val
                    elif right_val is None:
                        contour[i] = left_val
                    else:
                        contour[i] = (left_val + right_val) / 2

        with torch.cuda.device(gpu):
            # Normalisation statistics of the columns from `config.stdim` on
            # (the same columns that are treated as mcep further below).
            mean_jnt = torch.FloatTensor(
                read_hdf5(args.stats_jnt,
                          "/mean_feat_org_lf0")[config.stdim:]).cuda()
            std_jnt = torch.FloatTensor(
                read_hdf5(args.stats_jnt,
                          "/scale_feat_org_lf0")[config.stdim:]).cuda()
            # define model and load parameters
            logging.info("model")
            logging.info(config)
            with torch.no_grad():
                model_encoder = GRU_RNN_STOCHASTIC(
                    in_dim=config.in_dim,
                    out_dim=config.lat_dim,
                    hidden_layers=config.hidden_layers,
                    hidden_units=config.hidden_units,
                    kernel_size=config.kernel_size,
                    dilation_size=config.dilation_size,
                    spk_dim=n_spk,
                    scale_out_flag=False)
                model_decoder = GRU_RNN(in_dim=config.lat_dim + n_spk,
                                        out_dim=config.out_dim,
                                        hidden_layers=config.hidden_layers,
                                        hidden_units=config.hidden_units,
                                        kernel_size=config.kernel_size,
                                        dilation_size=config.dilation_size,
                                        scale_in_flag=False)
                logging.info(model_encoder)
                logging.info(model_decoder)
                # Deserialise the checkpoint once and share it between both
                # models instead of reading the file twice.
                checkpoint = torch.load(args.model)
                model_encoder.load_state_dict(checkpoint["model_encoder"])
                model_decoder.load_state_dict(checkpoint["model_decoder"])
                # The state dicts have been copied into the models; drop the
                # loaded copy so it does not stay alive during decoding.
                del checkpoint
                model_encoder.cuda()
                model_decoder.cuda()
                model_encoder.eval()
                model_decoder.eval()
                for param in model_encoder.parameters():
                    param.requires_grad = False
                for param in model_decoder.parameters():
                    param.requires_grad = False
                # All-zero initial previous-output input for the encoder.
                init_pp = np.zeros((1, 1, config.lat_dim * 2 + n_spk))
                y_in_pp = torch.FloatTensor(init_pp).cuda()
                # Initial previous-output input for the decoder: a zero vector
                # expressed in the normalised domain.
                y_in_src = y_in_trg = torch.unsqueeze(
                    torch.unsqueeze((0 - mean_jnt) / std_jnt, 0), 0)
            for feat_file in feat_list:
                # convert mcep
                logging.info("recmcep " + feat_file)

                feat = read_hdf5(feat_file, "/feat_org_lf0")
                logging.info(feat.shape)
                f0 = read_hdf5(feat_file, "/f0_range")
                cvf0 = convert_f0(f0, f0_range_mean_trg, f0_range_std_trg,
                                  f0_range_mean_src, f0_range_std_src)
                cvuv, cont_f0 = convert_continuos_f0(cvf0)
                cvuv = np.expand_dims(cvuv, axis=-1)
                # Second argument is the frame rate implied by the frame
                # shift given in milliseconds.
                cont_f0_lpf = low_pass_filter(cont_f0,
                                              int(1.0 /
                                                  (args.shiftms * 0.001)),
                                              cutoff=LP_CUTOFF)
                # The logarithm below needs strictly positive values.
                if np.min(cont_f0_lpf) <= 0:
                    _fill_nonpositive(cont_f0_lpf)
                cvlogf0fil = np.expand_dims(np.log(cont_f0_lpf), axis=-1)
                # Same layout as `feat[:, :config.stdim]` with the first two
                # columns replaced by the converted U/V flag and log-F0.
                feat_cv = np.c_[cvuv, cvlogf0fil, feat[:, 2:config.stdim]]
                with torch.no_grad():
                    # First half of the cycle: encode the original features
                    # and decode with the `src_code_idx` one-hot code.
                    lat_feat, _, _, _, _ = model_encoder(
                        torch.FloatTensor(feat).cuda(), y_in_pp,
                        sampling=False)
                    src_code = np.zeros((lat_feat.shape[0], n_spk))
                    src_code[:, src_code_idx] = 1
                    src_code = torch.FloatTensor(src_code).cuda()
                    cvmcep, _, _ = model_decoder(
                        torch.cat((src_code, lat_feat), 1), y_in_src)
                    # Second half: re-encode the converted features and
                    # decode with the `trg_code_idx` one-hot code.
                    lat_feat, _, _, _, _ = model_encoder(
                        torch.cat(
                            (torch.FloatTensor(feat_cv).cuda(), cvmcep), 1),
                        y_in_pp, sampling=False)
                    trg_code = np.zeros((lat_feat.shape[0], n_spk))
                    trg_code[:, trg_code_idx] = 1
                    trg_code = torch.FloatTensor(trg_code).cuda()
                    cvmcep, _, _ = model_decoder(
                        torch.cat((trg_code, lat_feat), 1), y_in_trg)

                cvmcep = np.array(cvmcep.cpu().data.numpy(), dtype=np.float64)
                logging.info(cvmcep.shape)

                mcep = feat[:, config.stdim:]
                # Frame indices the distortion is restricted to.
                spcidx = read_hdf5(feat_file, "/spcidx_range")[0]
                # "pow" variant keeps column 0, the plain variant drops it.
                _, mcdpow_arr = dtw.calc_mcd(
                    np.array(mcep[np.array(spcidx), :], dtype=np.float64),
                    np.array(cvmcep[np.array(spcidx), :], dtype=np.float64))
                _, mcd_arr = dtw.calc_mcd(
                    np.array(mcep[np.array(spcidx), 1:], dtype=np.float64),
                    np.array(cvmcep[np.array(spcidx), 1:], dtype=np.float64))
                mcdpow_mean = np.mean(mcdpow_arr)
                mcdpow_std = np.std(mcdpow_arr)
                mcd_mean = np.mean(mcd_arr)
                mcd_std = np.std(mcd_arr)
                # Dataset token: second '/'-separated path component, text
                # before its first underscore.
                dataset = feat_file.split('/')[1].split('_')[0]
                if 'tr' in dataset:
                    logging.info('trn')
                    mcdpow_cvlist.append(mcdpow_mean)
                    mcdpowstd_cvlist.append(mcdpow_std)
                    mcd_cvlist.append(mcd_mean)
                    mcdstd_cvlist.append(mcd_std)
                    cvlist.append(np.var(cvmcep[:, 1:], axis=0))
                    logging.info(len(cvlist))
                elif 'dv' in dataset:
                    logging.info('dev')
                    mcdpow_cvlist_dv.append(mcdpow_mean)
                    mcdpowstd_cvlist_dv.append(mcdpow_std)
                    mcd_cvlist_dv.append(mcd_mean)
                    mcdstd_cvlist_dv.append(mcd_std)
                    cvlist_dv.append(np.var(cvmcep[:, 1:], axis=0))
                    logging.info(len(cvlist_dv))
                logging.info("mcdpow_rec: %.6f dB +- %.6f" %
                             (mcdpow_mean, mcdpow_std))
                logging.info("mcd_rec: %.6f dB +- %.6f" % (mcd_mean, mcd_std))

                feat_cvmcep = np.c_[feat[:, :config.stdim], cvmcep]
                logging.info(feat_cvmcep.shape)
                write_path = '/feat_recmcep_cycvae-' + model_epoch
                outh5dir = os.path.join(
                    os.path.dirname(os.path.dirname(feat_file)),
                    spk_trg + "-" + spk_src + "-" + spk_trg)
                # exist_ok avoids the check-then-create race when several
                # workers target the same output directory.
                os.makedirs(outh5dir, exist_ok=True)
                feat_file_cyc = os.path.join(outh5dir,
                                             os.path.basename(feat_file))
                logging.info(feat_file_cyc + ' ' + write_path)
                write_hdf5(feat_file_cyc, write_path, feat_cvmcep)
    def gpu_decode(feat_list, gpu, cvlist=None, mcdpow_cvlist=None, mcdpowstd_cvlist=None, mcd_cvlist=None,
                   mcdstd_cvlist=None, cvlist_dv=None, mcdpow_cvlist_dv=None, mcdpowstd_cvlist_dv=None,
                   mcd_cvlist_dv=None, mcdstd_cvlist_dv=None):
        """Reconstruct the mcep of every file in `feat_list` on one GPU.

        Each file is encoded, decoded again with the `spk_code_idx` one-hot
        speaker code, compared against its own mcep with `dtw.calc_mcd`,
        passed through `mod_pow`, and written to
        `<parent>/<spk>-<spk>/<basename>` under the HDF5 path
        `/feat_recmcep_cycvae-<model_epoch>`.

        Args:
            feat_list: iterable of HDF5 feature file paths. Each file is read
                at `/feat_org_lf0` and `/spcidx_range`.
            gpu: CUDA device handed to `torch.cuda.device`.
            cvlist, mcdpow_cvlist, mcdpowstd_cvlist, mcd_cvlist, mcdstd_cvlist:
                accumulators that receive one entry per file whose dataset
                token contains 'tr'. They only need to support `.append`.
            cvlist_dv, mcdpow_cvlist_dv, mcdpowstd_cvlist_dv, mcd_cvlist_dv,
            mcdstd_cvlist_dv: the same accumulators for files whose dataset
                token contains 'dv'.
                The `None` defaults are placeholders only: the matching
                accumulators must be supplied if such files are present.
        """
        with torch.cuda.device(gpu):
            # Normalisation statistics of the columns from `config.stdim` on
            # (the same columns that are treated as mcep further below).
            mean_jnt = torch.FloatTensor(
                read_hdf5(args.stats_jnt,
                          "/mean_feat_org_lf0")[config.stdim:]).cuda()
            std_jnt = torch.FloatTensor(
                read_hdf5(args.stats_jnt,
                          "/scale_feat_org_lf0")[config.stdim:]).cuda()
            # define model and load parameters
            logging.info("model")
            logging.info(config)
            with torch.no_grad():
                model_encoder = GRU_RNN_STOCHASTIC(
                    in_dim=config.in_dim,
                    out_dim=config.lat_dim,
                    hidden_layers=config.hidden_layers,
                    hidden_units=config.hidden_units,
                    kernel_size=config.kernel_size,
                    dilation_size=config.dilation_size,
                    spk_dim=n_spk,
                    scale_out_flag=False)
                model_decoder = GRU_RNN(in_dim=config.lat_dim + n_spk,
                                        out_dim=config.out_dim,
                                        hidden_layers=config.hidden_layers,
                                        hidden_units=config.hidden_units,
                                        kernel_size=config.kernel_size,
                                        dilation_size=config.dilation_size,
                                        scale_in_flag=False)
                logging.info(model_encoder)
                logging.info(model_decoder)
                # Deserialise the checkpoint once and share it between both
                # models instead of reading the file twice.
                checkpoint = torch.load(args.model)
                model_encoder.load_state_dict(checkpoint["model_encoder"])
                model_decoder.load_state_dict(checkpoint["model_decoder"])
                # The state dicts have been copied into the models; drop the
                # loaded copy so it does not stay alive during decoding.
                del checkpoint
                model_encoder.cuda()
                model_decoder.cuda()
                model_encoder.eval()
                model_decoder.eval()
                for param in model_encoder.parameters():
                    param.requires_grad = False
                for param in model_decoder.parameters():
                    param.requires_grad = False
                # All-zero initial previous-output input for the encoder.
                init_pp = np.zeros((1, 1, config.lat_dim * 2 + n_spk))
                y_in_pp = torch.FloatTensor(init_pp).cuda()
                # Initial previous-output input for the decoder: a zero vector
                # expressed in the normalised domain.
                y_in_src = torch.unsqueeze(
                    torch.unsqueeze((0 - mean_jnt) / std_jnt, 0), 0)
            for feat_file in feat_list:
                # convert mcep
                logging.info("recmcep " + feat_file)

                feat = read_hdf5(feat_file, "/feat_org_lf0")
                logging.info(feat.shape)
                with torch.no_grad():
                    lat_feat, _, _, _, _ = model_encoder(
                        torch.FloatTensor(feat).cuda(), y_in_pp,
                        sampling=False)
                    spk_code = np.zeros((lat_feat.shape[0], n_spk))
                    spk_code[:, spk_code_idx] = 1
                    spk_code = torch.FloatTensor(spk_code).cuda()
                    cvmcep, _, _ = model_decoder(
                        torch.cat((spk_code, lat_feat), 1), y_in_src)

                cvmcep = np.array(cvmcep.cpu().data.numpy(), dtype=np.float64)
                logging.info(cvmcep.shape)

                mcep = feat[:, config.stdim:]
                # Frame indices the distortion is restricted to.
                spcidx = read_hdf5(feat_file, "/spcidx_range")[0]
                # "pow" variant keeps column 0, the plain variant drops it.
                _, mcdpow_arr = dtw.calc_mcd(
                    np.array(mcep[np.array(spcidx), :], dtype=np.float64),
                    np.array(cvmcep[np.array(spcidx), :], dtype=np.float64))
                _, mcd_arr = dtw.calc_mcd(
                    np.array(mcep[np.array(spcidx), 1:], dtype=np.float64),
                    np.array(cvmcep[np.array(spcidx), 1:], dtype=np.float64))
                mcdpow_mean = np.mean(mcdpow_arr)
                mcdpow_std = np.std(mcdpow_arr)
                mcd_mean = np.mean(mcd_arr)
                mcd_std = np.std(mcd_arr)
                # Dataset token: second '/'-separated path component, text
                # before its first underscore.
                dataset = feat_file.split('/')[1].split('_')[0]
                if 'tr' in dataset:
                    logging.info('trn')
                    mcdpow_cvlist.append(mcdpow_mean)
                    mcdpowstd_cvlist.append(mcdpow_std)
                    mcd_cvlist.append(mcd_mean)
                    mcdstd_cvlist.append(mcd_std)
                    cvlist.append(np.var(cvmcep[:, 1:], axis=0))
                    logging.info(len(cvlist))
                elif 'dv' in dataset:
                    logging.info('dev')
                    mcdpow_cvlist_dv.append(mcdpow_mean)
                    mcdpowstd_cvlist_dv.append(mcdpow_std)
                    mcd_cvlist_dv.append(mcd_mean)
                    mcdstd_cvlist_dv.append(mcd_std)
                    cvlist_dv.append(np.var(cvmcep[:, 1:], axis=0))
                    logging.info(len(cvlist_dv))
                logging.info("mcdpow_rec: %.6f dB +- %.6f" %
                             (mcdpow_mean, mcdpow_std))
                logging.info("mcd_rec: %.6f dB +- %.6f" % (mcd_mean, mcd_std))

                # The statistics above are taken before this step, so they
                # describe the raw decoder output.
                logging.info("mod_pow")
                cvmcep = mod_pow(cvmcep,
                                 mcep,
                                 alpha=args.mcep_alpha,
                                 irlen=IRLEN)
                logging.info(cvmcep.shape)
                feat_cvmcep = np.c_[feat[:, :config.stdim], cvmcep]
                logging.info(feat_cvmcep.shape)
                write_path = '/feat_recmcep_cycvae-' + model_epoch
                outh5dir = os.path.join(
                    os.path.dirname(os.path.dirname(feat_file)),
                    spk + "-" + spk)
                # exist_ok avoids the check-then-create race when several
                # workers target the same output directory.
                os.makedirs(outh5dir, exist_ok=True)
                # Separate name for the output path so the loop variable is
                # not overwritten.
                out_file = os.path.join(outh5dir, os.path.basename(feat_file))
                logging.info(out_file + ' ' + write_path)
                write_hdf5(out_file, write_path, feat_cvmcep)
Esempio n. 5
0
    def gpu_decode(feat_list,
                   feat_trg_list,
                   gpu,
                   cvlist=None,
                   mcdlist=None,
                   mcdstdlist=None,
                   mcdpowlist=None,
                   mcdpowstdlist=None,
                   cvlist_src=None,
                   mcdlist_src=None,
                   mcdstdlist_src=None,
                   mcdpowlist_src=None,
                   mcdpowstdlist_src=None,
                   cvlist_trg=None,
                   mcdlist_trg=None,
                   mcdstdlist_trg=None,
                   mcdpowlist_trg=None,
                   mcdpowstdlist_trg=None,
                   lat_dist_rmse_enc_list=None,
                   lat_dist_cosim_enc_list=None,
                   lat_dist_rmse_pri_list=None,
                   lat_dist_cosim_pri_list=None):
        with torch.cuda.device(gpu):
            mean_jnt = torch.FloatTensor(
                read_hdf5(args.stats_jnt,
                          "/mean_feat_org_lf0_jnt")[config.stdim:]).cuda()
            std_jnt = torch.FloatTensor(
                read_hdf5(args.stats_jnt,
                          "/scale_feat_org_lf0_jnt")[config.stdim:]).cuda()
            # define model and load parameters
            logging.info("model")
            logging.info(config)
            with torch.no_grad():
                model_encoder = GRU_RNN(in_dim=config.in_dim,
                                        out_dim=config.lat_dim * 2,
                                        hidden_layers=config.hidden_layers,
                                        hidden_units=config.hidden_units,
                                        kernel_size=config.kernel_size,
                                        dilation_size=config.dilation_size,
                                        scale_out_flag=False)
                model_decoder = GRU_RNN(in_dim=config.lat_dim + 2,
                                        out_dim=config.out_dim,
                                        hidden_layers=config.hidden_layers,
                                        hidden_units=config.hidden_units,
                                        kernel_size=config.kernel_size,
                                        dilation_size=config.dilation_size,
                                        scale_in_flag=False)
                model_encoder.load_state_dict(
                    torch.load(args.model)["model_encoder"])
                model_decoder.load_state_dict(
                    torch.load(args.model)["model_decoder"])
                model_encoder.cuda()
                model_decoder.cuda()
                model_encoder.eval()
                model_decoder.eval()
                for param in model_encoder.parameters():
                    param.requires_grad = False
                for param in model_decoder.parameters():
                    param.requires_grad = False
                logging.info(model_encoder)
                logging.info(model_decoder)
                init_pp = np.zeros((1, 1, config.lat_dim * 2))
                y_in_pp = torch.FloatTensor(init_pp).cuda()
                y_in_src = y_in_trg = torch.unsqueeze(
                    torch.unsqueeze((0 - mean_jnt) / std_jnt, 0), 0)
            for feat_file, feat_trg_file in zip(feat_list, feat_trg_list):
                # convert mcep
                logging.info("cvmcep " + feat_file + " " + feat_trg_file)

                feat = read_hdf5(feat_file, "/feat_org_lf0")
                feat_trg = read_hdf5(feat_trg_file, "/feat_org_lf0")
                logging.info(feat.shape)
                logging.info(feat_trg.shape)
                with torch.no_grad():
                    lat_src, _, _ = model_encoder(
                        torch.FloatTensor(feat).cuda(),
                        y_in_pp,
                        clamp_vae=True,
                        lat_dim=config.lat_dim)
                    lat_feat = sampling_vae_batch(lat_src.unsqueeze(0).repeat(
                        args.n_smpl_dec, 1, 1),
                                                  lat_dim=config.lat_dim)
                    lat_feat = torch.mean(lat_feat, 0)
                    lat_trg, _, _ = model_encoder(
                        torch.FloatTensor(feat_trg).cuda(),
                        y_in_pp,
                        clamp_vae=True,
                        lat_dim=config.lat_dim)
                    lat_feat_trg = sampling_vae_batch(
                        lat_trg.unsqueeze(0).repeat(args.n_smpl_dec, 1, 1),
                        lat_dim=config.lat_dim)
                    lat_feat_trg = torch.mean(lat_feat_trg, 0)
                    src_code = np.zeros((lat_feat.shape[0], 2))
                    trg_code = np.zeros((lat_feat.shape[0], 2))
                    trg_trg_code = np.zeros((lat_feat_trg.shape[0], 2))
                    src_code[:, 0] = 1
                    trg_code[:, 1] = 1
                    trg_trg_code[:, 1] = 1
                    src_code = torch.FloatTensor(src_code).cuda()
                    trg_code = torch.FloatTensor(trg_code).cuda()
                    trg_trg_code = torch.FloatTensor(trg_trg_code).cuda()
                    cvmcep, _, _ = model_decoder(
                        torch.cat((trg_code, lat_feat), 1), y_in_trg)
                    cvmcep = np.array(cvmcep.cpu().data.numpy(),
                                      dtype=np.float64)
                    cvmcep_src, _, _ = model_decoder(
                        torch.cat((src_code, lat_feat), 1), y_in_src)
                    cvmcep_src = np.array(cvmcep_src.cpu().data.numpy(),
                                          dtype=np.float64)
                    cvmcep_trg, _, _ = model_decoder(
                        torch.cat((trg_trg_code, lat_feat_trg), 1), y_in_trg)
                    cvmcep_trg = np.array(cvmcep_trg.cpu().data.numpy(),
                                          dtype=np.float64)

                logging.info(cvmcep.shape)
                logging.info(cvmcep_trg.shape)
                cvlist.append(np.var(cvmcep[:, 1:], axis=0))
                cvlist_src.append(np.var(cvmcep_src[:, 1:], axis=0))
                cvlist_trg.append(np.var(cvmcep_trg[:, 1:], axis=0))
                logging.info(len(cvlist))

                spcidx_src = read_hdf5(feat_file, "/spcidx_range")[0]
                mcep_trg = read_hdf5(feat_trg_file, "/mcepspc_range")
                _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(
                    np.array(cvmcep[np.array(spcidx_src), :],
                             dtype=np.float64),
                    np.array(mcep_trg[:, :], dtype=np.float64))
                _, _, _, mcd_arr = dtw.dtw_org_to_trg(
                    np.array(cvmcep[np.array(spcidx_src), 1:],
                             dtype=np.float64),
                    np.array(mcep_trg[:, 1:], dtype=np.float64))
                mcdpow_mean = np.mean(mcdpow_arr)
                mcdpow_std = np.std(mcdpow_arr)
                mcd_mean = np.mean(mcd_arr)
                mcd_std = np.std(mcd_arr)
                logging.info("mcdpow: %.6f dB +- %.6f" %
                             (mcdpow_mean, mcdpow_std))
                logging.info("mcd: %.6f dB +- %.6f" % (mcd_mean, mcd_std))
                mcdpowlist.append(mcdpow_mean)
                mcdpowstdlist.append(mcdpow_std)
                mcdlist.append(mcd_mean)
                mcdstdlist.append(mcd_std)

                mcep_src = read_hdf5(feat_file, "/mcepspc_range")
                _, mcdpow_arr = dtw.calc_mcd(
                    np.array(mcep_src[:, :], dtype=np.float64),
                    np.array(cvmcep_src[np.array(spcidx_src), :],
                             dtype=np.float64))
                _, mcd_arr = dtw.calc_mcd(
                    np.array(mcep_src[:, 1:], dtype=np.float64),
                    np.array(cvmcep_src[np.array(spcidx_src), 1:],
                             dtype=np.float64))
                mcdpow_mean = np.mean(mcdpow_arr)
                mcdpow_std = np.std(mcdpow_arr)
                mcd_mean = np.mean(mcd_arr)
                mcd_std = np.std(mcd_arr)
                logging.info("mcdpow_src: %.6f dB +- %.6f" %
                             (mcdpow_mean, mcdpow_std))
                logging.info("mcd_src: %.6f dB +- %.6f" % (mcd_mean, mcd_std))
                mcdpowlist_src.append(mcdpow_mean)
                mcdpowstdlist_src.append(mcdpow_std)
                mcdlist_src.append(mcd_mean)
                mcdstdlist_src.append(mcd_std)

                spcidx_trg = read_hdf5(feat_trg_file, "/spcidx_range")[0]
                _, mcdpow_arr = dtw.calc_mcd(
                    np.array(mcep_trg[:, :], dtype=np.float64),
                    np.array(cvmcep_trg[np.array(spcidx_trg), :],
                             dtype=np.float64))
                _, mcd_arr = dtw.calc_mcd(
                    np.array(mcep_trg[:, 1:], dtype=np.float64),
                    np.array(cvmcep_trg[np.array(spcidx_trg), 1:],
                             dtype=np.float64))
                mcdpow_mean = np.mean(mcdpow_arr)
                mcdpow_std = np.std(mcdpow_arr)
                mcd_mean = np.mean(mcd_arr)
                mcd_std = np.std(mcd_arr)
                logging.info("mcdpow_trg: %.6f dB +- %.6f" %
                             (mcdpow_mean, mcdpow_std))
                logging.info("mcd_trg: %.6f dB +- %.6f" % (mcd_mean, mcd_std))
                mcdpowlist_trg.append(mcdpow_mean)
                mcdpowstdlist_trg.append(mcdpow_std)
                mcdlist_trg.append(mcd_mean)
                mcdstdlist_trg.append(mcd_std)

                with torch.no_grad():
                    spcidx_src = torch.LongTensor(spcidx_src).cuda()
                    spcidx_trg = torch.LongTensor(spcidx_trg).cuda()

                trj_lat_src = np.array(torch.index_select(
                    lat_src, 0, spcidx_src).cpu().data.numpy(),
                                       dtype=np.float64)
                trj_lat_trg = np.array(torch.index_select(
                    lat_trg, 0, spcidx_trg).cpu().data.numpy(),
                                       dtype=np.float64)
                aligned_lat_srctrg, _, _, _ = dtw.dtw_org_to_trg(
                    trj_lat_src, trj_lat_trg)
                lat_dist_srctrg = np.mean(
                    np.sqrt(
                        np.mean((aligned_lat_srctrg - trj_lat_trg)**2,
                                axis=0)))
                _, _, lat_cdist_srctrg, _ = dtw.dtw_org_to_trg(trj_lat_trg,
                                                               trj_lat_src,
                                                               mcd=0)
                aligned_lat_trgsrc, _, _, _ = dtw.dtw_org_to_trg(
                    trj_lat_trg, trj_lat_src)
                lat_dist_trgsrc = np.mean(
                    np.sqrt(
                        np.mean((aligned_lat_trgsrc - trj_lat_src)**2,
                                axis=0)))
                _, _, lat_cdist_trgsrc, _ = dtw.dtw_org_to_trg(trj_lat_src,
                                                               trj_lat_trg,
                                                               mcd=0)
                logging.info("%lf %lf %lf %lf" %
                             (lat_dist_srctrg, lat_cdist_srctrg,
                              lat_dist_trgsrc, lat_cdist_trgsrc))
                lat_dist_rmse = (lat_dist_srctrg + lat_dist_trgsrc) / 2
                lat_dist_cosim = (lat_cdist_srctrg + lat_cdist_trgsrc) / 2
                lat_dist_rmse_enc_list.append(lat_dist_rmse)
                lat_dist_cosim_enc_list.append(lat_dist_cosim)
                logging.info("lat_dist_enc: %.6f %.6f" %
                             (lat_dist_rmse, lat_dist_cosim))

                trj_lat_src = np.array(torch.index_select(
                    lat_feat, 0, spcidx_src).cpu().data.numpy(),
                                       dtype=np.float64)
                trj_lat_trg = np.array(torch.index_select(
                    lat_feat_trg, 0, spcidx_trg).cpu().data.numpy(),
                                       dtype=np.float64)
                aligned_lat_srctrg, _, _, _ = dtw.dtw_org_to_trg(
                    trj_lat_src, trj_lat_trg)
                lat_dist_srctrg = np.mean(
                    np.sqrt(
                        np.mean((aligned_lat_srctrg - trj_lat_trg)**2,
                                axis=0)))
                _, _, lat_cdist_srctrg, _ = dtw.dtw_org_to_trg(trj_lat_trg,
                                                               trj_lat_src,
                                                               mcd=0)
                aligned_lat_trgsrc, _, _, _ = dtw.dtw_org_to_trg(
                    trj_lat_trg, trj_lat_src)
                lat_dist_trgsrc = np.mean(
                    np.sqrt(
                        np.mean((aligned_lat_trgsrc - trj_lat_src)**2,
                                axis=0)))
                _, _, lat_cdist_trgsrc, _ = dtw.dtw_org_to_trg(trj_lat_src,
                                                               trj_lat_trg,
                                                               mcd=0)
                logging.info("%lf %lf %lf %lf" %
                             (lat_dist_srctrg, lat_cdist_srctrg,
                              lat_dist_trgsrc, lat_cdist_trgsrc))
                lat_dist_rmse = (lat_dist_srctrg + lat_dist_trgsrc) / 2
                lat_dist_cosim = (lat_cdist_srctrg + lat_cdist_trgsrc) / 2
                lat_dist_rmse_pri_list.append(lat_dist_rmse)
                lat_dist_cosim_pri_list.append(lat_dist_cosim)
                logging.info("lat_dist_pri: %.6f %.6f" %
                             (lat_dist_rmse, lat_dist_cosim))