Code example #1
File: twf.py Project: dasanurag/Diff-Gmm-based-VC
def estimate_twf(orgdata, tardata, distance='melcd', fast=True, otflag=None):
    """time warping function estimator

    Parameters
    ---------
    orgdata : array, shape(`T_org`, `dim`)
        Array of source feature
    tardata : array, shape(`T_tar`, `dim`)
        Array of target feature
    distance : str, optional
        distance function
        `melcd` : mel-cepstrum distortion
    fast : bool, optional
        Use fastdtw instead of dtw
        Default set to `True`
    otflag : str,
        Perform alignment into either original or target length
        `org` : align into original length
        `tar` : align into target length
        Default set to None

    Returns
    ---------
    twf : array, shape(`2`, `T`)
        Time warping function between original and target
    """

    if distance == 'melcd':

        def distance_func(x, y):
            return melcd(x, y)
    else:
        raise ValueError('distance metrics other than melcd are not supported.')

    if otflag is None:
        # use dtw or fastdtw
        if fast:
            _, path = fastdtw(orgdata, tardata, dist=distance_func)
            twf = np.array(path).T
        else:
            _, _, _, twf = dtw(orgdata, tardata, distance_func)
    else:
        # use dtw_c to align target/original feature vector
        ldim = orgdata.shape[1] - 1
        if otflag == 'org':
            _, twf, _, _ = dtw_c.dtw_org_to_trg(tardata, orgdata, 0, ldim, 5.0,
                                                100.0, 100.0)
        else:
            _, twf, _, _ = dtw_c.dtw_org_to_trg(orgdata, tardata, 0, ldim, 5.0,
                                                100.0, 100.0)
        twf[:, 1] = np.array(range(
            twf.shape[0]))  # replace target index by frame number
        twf = twf.T
        if otflag == 'org':
            twf = twf[::-1, :]  # swap rows so that row 0 is org and row 1 is tar
            assert twf.shape[1] == orgdata.shape[0]
        else:
            assert twf.shape[1] == tardata.shape[0]

    return twf
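
Below is a minimal usage sketch (not part of the project above), assuming `estimate_twf` and its helpers (`melcd`, `fastdtw`) are importable from the same module; the feature arrays are random placeholders just to show the call and the returned shape.

import numpy as np

# dummy mel-cepstrum sequences of different lengths: (T_org, dim) and (T_tar, dim)
orgdata = np.random.randn(100, 25)
tardata = np.random.randn(120, 25)

# default mode: fastdtw with mel-cepstral distortion as the local distance
twf = estimate_twf(orgdata, tardata, distance='melcd', fast=True)
print(twf.shape)  # (2, T) warping path: row 0 = original frame indices, row 1 = target frame indices

# aligned feature pairs can then be gathered by indexing with the warping path
org_aligned = orgdata[twf[0]]
tar_aligned = tardata[twf[1]]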
Code example #2
    def gpu_decode(feat_list, gpu, cvlist=None, mcdpow_cvlist=None, mcdpowstd_cvlist=None, mcd_cvlist=None, \
                    mcdstd_cvlist=None, cvlist_dv=None, mcdpow_cvlist_dv=None, mcdpowstd_cvlist_dv=None, \
                    mcd_cvlist_dv=None, mcdstd_cvlist_dv=None, \
                    cvlist_cyc=None, mcdpow_cvlist_cyc=None, mcdpowstd_cvlist_cyc=None, mcd_cvlist_cyc=None, \
                    mcdstd_cvlist_cyc=None, cvlist_cyc_dv=None, mcdpow_cvlist_cyc_dv=None, mcdpowstd_cvlist_cyc_dv=None, \
                    mcd_cvlist_cyc_dv=None, mcdstd_cvlist_cyc_dv=None):
        with torch.cuda.device(gpu):
            # define model and load parameters
            with torch.no_grad():
                model_encoder = GRU_VAE_ENCODER(
                    in_dim=config.mcep_dim + config.excit_dim,
                    n_spk=n_spk,
                    lat_dim=config.lat_dim,
                    hidden_layers=config.hidden_layers_enc,
                    hidden_units=config.hidden_units_enc,
                    kernel_size=config.kernel_size_enc,
                    dilation_size=config.dilation_size_enc,
                    causal_conv=config.causal_conv_enc,
                    bi=config.bi_enc,
                    cont=False,
                    pad_first=True,
                    right_size=config.right_size,
                    ar=config.ar_enc)
                logging.info(model_encoder)
                model_decoder = GRU_SPEC_DECODER(
                    feat_dim=config.lat_dim,
                    out_dim=config.mcep_dim,
                    n_spk=n_spk,
                    hidden_layers=config.hidden_layers_dec,
                    hidden_units=config.hidden_units_dec,
                    kernel_size=config.kernel_size_dec,
                    dilation_size=config.dilation_size_dec,
                    causal_conv=config.causal_conv_dec,
                    bi=config.bi_dec,
                    pad_first=True,
                    ar=config.ar_dec)
                logging.info(model_decoder)
                model_vq = torch.nn.Embedding(config.ctr_size, config.lat_dim)
                logging.info(model_vq)
                model_encoder.load_state_dict(
                    torch.load(args.model)["model_encoder"])
                model_decoder.load_state_dict(
                    torch.load(args.model)["model_decoder"])
                model_vq.load_state_dict(torch.load(args.model)["model_vq"])
                model_encoder.cuda()
                model_decoder.cuda()
                model_vq.cuda()
                model_encoder.eval()
                model_decoder.eval()
                model_vq.eval()
                for param in model_encoder.parameters():
                    param.requires_grad = False
                for param in model_decoder.parameters():
                    param.requires_grad = False
                for param in model_vq.parameters():
                    param.requires_grad = False
                if config.ar_enc:
                    yz_in = torch.zeros((1, 1, n_spk + config.lat_dim)).cuda()
                if config.ar_dec:
                    mean_stats = torch.FloatTensor(
                        read_hdf5(
                            config.stats,
                            "/mean_" + config.string_path.replace("/", "")))
                    scale_stats = torch.FloatTensor(
                        read_hdf5(
                            config.stats,
                            "/scale_" + config.string_path.replace("/", "")))
                    x_in = ((torch.zeros((1, 1, config.mcep_dim)) -
                             mean_stats[config.excit_dim:]) /
                            scale_stats[config.excit_dim:]).cuda()
            count = 0
            pad_left = (model_encoder.pad_left + model_decoder.pad_left) * 2
            pad_right = (model_encoder.pad_right + model_decoder.pad_right) * 2
            outpad_lefts = [None] * 3
            outpad_rights = [None] * 3
            outpad_lefts[0] = pad_left - model_encoder.pad_left
            outpad_rights[0] = pad_right - model_encoder.pad_right
            outpad_lefts[1] = outpad_lefts[0] - model_decoder.pad_left
            outpad_rights[1] = outpad_rights[0] - model_decoder.pad_right
            outpad_lefts[2] = outpad_lefts[1] - model_encoder.pad_left
            outpad_rights[2] = outpad_rights[1] - model_encoder.pad_right
            for feat_file in feat_list:
                # convert mcep
                logging.info("recmcep " + feat_file)

                feat_org = read_hdf5(feat_file, "/feat_mceplf0cap")
                logging.info(feat_org.shape)
                mcep = np.array(feat_org[:, -model_decoder.out_dim:])

                with torch.no_grad():
                    feat = torch.FloatTensor(feat_org).cuda().unsqueeze(0)
                    feat_excit = feat[:, :, :config.excit_dim]

                    if config.ar_enc:
                        spk_logits, lat_src, _, _ = model_encoder(F.pad(feat.transpose(1,2), (pad_left,pad_right), "replicate").transpose(1,2), \
                                                            yz_in=yz_in)
                    else:
                        spk_logits, lat_src, _ = model_encoder(
                            F.pad(feat.transpose(1, 2), (pad_left, pad_right),
                                  "replicate").transpose(1, 2))
                    idx_vq = nn_search_batch(lat_src, model_vq.weight)
                    lat_src = model_vq(idx_vq)
                    if outpad_rights[0] > 0:
                        unique, counts = np.unique(
                            idx_vq[:, outpad_lefts[0]:-outpad_rights[0]].cpu(
                            ).data.numpy(),
                            return_counts=True)
                    else:
                        unique, counts = np.unique(
                            idx_vq[:, outpad_lefts[0]:].cpu().data.numpy(),
                            return_counts=True)
                    logging.info("input vq")
                    logging.info(dict(zip(unique, counts)))
                    logging.info('input spkpost')
                    if outpad_rights[0] > 0:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits[:, outpad_lefts[0]:
                                                     -outpad_rights[0]],
                                          dim=-1), 1))
                    else:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits[:, outpad_lefts[0]:],
                                          dim=-1), 1))

                    src_code = (torch.ones(
                        (1, lat_src.shape[1])) * spk_idx).cuda().long()
                    if config.ar_dec:
                        cvmcep_src, _, _ = model_decoder(src_code,
                                                         lat_src,
                                                         x_in=x_in)
                    else:
                        cvmcep_src, _ = model_decoder(src_code, lat_src)

                    if config.ar_enc:
                        spk_logits, lat_rec, _, _ = model_encoder(torch.cat((F.pad(feat_excit.transpose(1,2), \
                                            (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2), cvmcep_src), 2),
                                                            yz_in=yz_in)
                    else:
                        spk_logits, lat_rec, _ = model_encoder(torch.cat((F.pad(feat_excit.transpose(1,2), \
                                            (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2), cvmcep_src), 2))
                    idx_vq = nn_search_batch(lat_rec, model_vq.weight)
                    lat_rec = model_vq(idx_vq)
                    if outpad_rights[2] > 0:
                        unique, counts = np.unique(
                            idx_vq[:, outpad_lefts[2]:-outpad_rights[2]].cpu(
                            ).data.numpy(),
                            return_counts=True)
                    else:
                        unique, counts = np.unique(
                            idx_vq[:, outpad_lefts[2]:].cpu().data.numpy(),
                            return_counts=True)
                    logging.info("rec vq")
                    logging.info(dict(zip(unique, counts)))
                    logging.info('rec spkpost')
                    if outpad_rights[2] > 0:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits[:, outpad_lefts[2]:
                                                     -outpad_rights[2]],
                                          dim=-1), 1))
                    else:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits[:, outpad_lefts[2]:],
                                          dim=-1), 1))

                    src_code = (torch.ones(
                        (1, lat_rec.shape[1])) * spk_idx).cuda().long()
                    if config.ar_dec:
                        cvmcep_cyc, _, _ = model_decoder(src_code,
                                                         lat_rec,
                                                         x_in=x_in)
                    else:
                        cvmcep_cyc, _ = model_decoder(src_code, lat_rec)

                    if outpad_rights[1] > 0:
                        cvmcep_src = cvmcep_src[:, outpad_lefts[1]:
                                                -outpad_rights[1]]
                    else:
                        cvmcep_src = cvmcep_src[:, outpad_lefts[1]:]

                    feat_rec = torch.cat((feat_excit, cvmcep_src),
                                         2)[0].cpu().data.numpy()
                    feat_cyc = torch.cat((feat_excit, cvmcep_cyc),
                                         2)[0].cpu().data.numpy()

                    cvmcep_src = np.array(cvmcep_src[0].cpu().data.numpy(),
                                          dtype=np.float64)
                    cvmcep_cyc = np.array(cvmcep_cyc[0].cpu().data.numpy(),
                                          dtype=np.float64)

                logging.info(cvmcep_src.shape)
                logging.info(cvmcep_cyc.shape)

                spcidx = read_hdf5(feat_file, "/spcidx_range")[0]

                _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(np.array(cvmcep_src[np.array(spcidx),:], \
                                            dtype=np.float64), np.array(mcep[np.array(spcidx),:], dtype=np.float64))
                _, _, _, mcd_arr = dtw.dtw_org_to_trg(np.array(cvmcep_src[np.array(spcidx),1:], \
                                            dtype=np.float64), np.array(mcep[np.array(spcidx),1:], dtype=np.float64))
                mcdpow_mean = np.mean(mcdpow_arr)
                mcdpow_std = np.std(mcdpow_arr)
                mcd_mean = np.mean(mcd_arr)
                mcd_std = np.std(mcd_arr)
                logging.info("mcdpow_rec: %.6f dB +- %.6f" %
                             (mcdpow_mean, mcdpow_std))
                logging.info("mcd_rec: %.6f dB +- %.6f" % (mcd_mean, mcd_std))

                _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(np.array(cvmcep_cyc[np.array(spcidx),:], \
                                            dtype=np.float64), np.array(mcep[np.array(spcidx),:], dtype=np.float64))
                _, _, _, mcd_arr = dtw.dtw_org_to_trg(np.array(cvmcep_cyc[np.array(spcidx),1:], \
                                            dtype=np.float64), np.array(mcep[np.array(spcidx),1:], dtype=np.float64))
                mcdpow_mean_cyc = np.mean(mcdpow_arr)
                mcdpow_std_cyc = np.std(mcdpow_arr)
                mcd_mean_cyc = np.mean(mcd_arr)
                mcd_std_cyc = np.std(mcd_arr)
                logging.info("mcdpow_cyc: %.6f dB +- %.6f" %
                             (mcdpow_mean_cyc, mcdpow_std_cyc))
                logging.info("mcd_cyc: %.6f dB +- %.6f" %
                             (mcd_mean_cyc, mcd_std_cyc))

                dataset = feat_file.split('/')[1].split('_')[0]
                if 'tr' in dataset:
                    logging.info('trn')
                    mcdpow_cvlist.append(mcdpow_mean)
                    mcdpowstd_cvlist.append(mcdpow_std)
                    mcd_cvlist.append(mcd_mean)
                    mcdstd_cvlist.append(mcd_std)
                    cvlist.append(np.var(cvmcep_src[:, 1:], axis=0))
                    logging.info(len(cvlist))
                    mcdpow_cvlist_cyc.append(mcdpow_mean_cyc)
                    mcdpowstd_cvlist_cyc.append(mcdpow_std_cyc)
                    mcd_cvlist_cyc.append(mcd_mean_cyc)
                    mcdstd_cvlist_cyc.append(mcd_std_cyc)
                    cvlist_cyc.append(np.var(cvmcep_cyc[:, 1:], axis=0))
                elif 'dv' in dataset:
                    logging.info('dev')
                    mcdpow_cvlist_dv.append(mcdpow_mean)
                    mcdpowstd_cvlist_dv.append(mcdpow_std)
                    mcd_cvlist_dv.append(mcd_mean)
                    mcdstd_cvlist_dv.append(mcd_std)
                    cvlist_dv.append(np.var(cvmcep_src[:, 1:], axis=0))
                    logging.info(len(cvlist_dv))
                    mcdpow_cvlist_cyc_dv.append(mcdpow_mean_cyc)
                    mcdpowstd_cvlist_cyc_dv.append(mcdpow_std_cyc)
                    mcd_cvlist_cyc_dv.append(mcd_mean_cyc)
                    mcdstd_cvlist_cyc_dv.append(mcd_std_cyc)
                    cvlist_cyc_dv.append(np.var(cvmcep_cyc[:, 1:], axis=0))

                logging.info('write rec to h5')
                outh5dir = os.path.join(
                    os.path.dirname(os.path.dirname(feat_file)),
                    args.spk + "-" + args.spk)
                if not os.path.exists(outh5dir):
                    os.makedirs(outh5dir)
                feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
                logging.info(feat_file + ' ' + args.string_path)
                logging.info(feat_rec.shape)
                write_hdf5(feat_file, args.string_path, feat_rec)

                logging.info('write cyc to h5')
                outh5dir = os.path.join(
                    os.path.dirname(os.path.dirname(feat_file)),
                    args.spk + "-" + args.spk + "-" + args.spk)
                if not os.path.exists(outh5dir):
                    os.makedirs(outh5dir)
                feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
                logging.info(feat_file + ' ' + args.string_path)
                logging.info(feat_cyc.shape)
                write_hdf5(feat_file, args.string_path, feat_cyc)

                count += 1
Code example #3
    def gpu_decode(feat_list, gpu, cvlist=None, mcdpow_cvlist=None, mcdpowstd_cvlist=None, mcd_cvlist=None, \
                    mcdstd_cvlist=None, cvlist_dv=None, mcdpow_cvlist_dv=None, mcdpowstd_cvlist_dv=None, \
                    mcd_cvlist_dv=None, mcdstd_cvlist_dv=None, \
                    f0rmse_cvlist=None, f0corr_cvlist=None, caprmse_cvlist=None, \
                    f0rmse_cvlist_dv=None, f0corr_cvlist_dv=None, caprmse_cvlist_dv=None, \
                    cvlist_cyc=None, mcdpow_cvlist_cyc=None, mcdpowstd_cvlist_cyc=None, mcd_cvlist_cyc=None, \
                    mcdstd_cvlist_cyc=None, cvlist_cyc_dv=None, mcdpow_cvlist_cyc_dv=None, mcdpowstd_cvlist_cyc_dv=None, \
                    mcd_cvlist_cyc_dv=None, mcdstd_cvlist_cyc_dv=None, \
                    f0rmse_cvlist_cyc=None, f0corr_cvlist_cyc=None, caprmse_cvlist_cyc=None, \
                    f0rmse_cvlist_cyc_dv=None, f0corr_cvlist_cyc_dv=None, caprmse_cvlist_cyc_dv=None):
        with torch.cuda.device(gpu):
            # define model and load parameters
            with torch.no_grad():
                model_encoder_mcep = GRU_VAE_ENCODER(
                    in_dim=config.mcep_dim+config.excit_dim,
                    n_spk=n_spk,
                    lat_dim=config.lat_dim,
                    hidden_layers=config.hidden_layers_enc,
                    hidden_units=config.hidden_units_enc,
                    kernel_size=config.kernel_size_enc,
                    dilation_size=config.dilation_size_enc,
                    causal_conv=config.causal_conv_enc,
                    bi=config.bi_enc,
                    cont=False,
                    pad_first=True,
                    right_size=config.right_size,
                    ar=config.ar_enc)
                logging.info(model_encoder_mcep)
                model_decoder_mcep = GRU_SPEC_DECODER(
                    feat_dim=config.lat_dim,
                    out_dim=config.mcep_dim,
                    n_spk=n_spk,
                    hidden_layers=config.hidden_layers_dec,
                    hidden_units=config.hidden_units_dec,
                    kernel_size=config.kernel_size_dec,
                    dilation_size=config.dilation_size_dec,
                    causal_conv=config.causal_conv_dec,
                    bi=config.bi_dec,
                    spkidtr_dim=config.spkidtr_dim,
                    pad_first=True,
                    ar=config.ar_dec)
                logging.info(model_decoder_mcep)
                model_encoder_excit = GRU_VAE_ENCODER(
                    in_dim=config.mcep_dim+config.excit_dim,
                    n_spk=n_spk,
                    lat_dim=config.lat_dim_e,
                    hidden_layers=config.hidden_layers_enc,
                    hidden_units=config.hidden_units_enc,
                    kernel_size=config.kernel_size_enc,
                    dilation_size=config.dilation_size_enc,
                    causal_conv=config.causal_conv_enc,
                    bi=config.bi_enc,
                    cont=False,
                    pad_first=True,
                    right_size=config.right_size,
                    ar=config.ar_enc)
                logging.info(model_encoder_excit)
                model_decoder_excit = GRU_EXCIT_DECODER(
                    feat_dim=config.lat_dim_e,
                    cap_dim=config.cap_dim,
                    n_spk=n_spk,
                    hidden_layers=config.hidden_layers_lf0,
                    hidden_units=config.hidden_units_lf0,
                    kernel_size=config.kernel_size_lf0,
                    dilation_size=config.dilation_size_lf0,
                    causal_conv=config.causal_conv_lf0,
                    bi=config.bi_lf0,
                    spkidtr_dim=config.spkidtr_dim,
                    pad_first=True,
                    ar=config.ar_f0)
                logging.info(model_decoder_excit)
                model_vq = torch.nn.Embedding(config.ctr_size, config.lat_dim)
                logging.info(model_vq)
                model_encoder_mcep.load_state_dict(torch.load(args.model)["model_encoder_mcep"])
                model_decoder_mcep.load_state_dict(torch.load(args.model)["model_decoder_mcep"])
                model_encoder_excit.load_state_dict(torch.load(args.model)["model_encoder_excit"])
                model_decoder_excit.load_state_dict(torch.load(args.model)["model_decoder_excit"])
                model_vq.load_state_dict(torch.load(args.model)["model_vq"])
                model_encoder_mcep.cuda()
                model_decoder_mcep.cuda()
                model_encoder_excit.cuda()
                model_decoder_excit.cuda()
                model_vq.cuda()
                model_encoder_mcep.eval()
                model_decoder_mcep.eval()
                model_encoder_excit.eval()
                model_decoder_excit.eval()
                model_vq.eval()
                for param in model_encoder_mcep.parameters():
                    param.requires_grad = False
                for param in model_decoder_mcep.parameters():
                    param.requires_grad = False
                for param in model_encoder_excit.parameters():
                    param.requires_grad = False
                for param in model_decoder_excit.parameters():
                    param.requires_grad = False
                for param in model_vq.parameters():
                    param.requires_grad = False
                if config.ar_enc:
                    yz_in = torch.zeros((1, 1, n_spk+config.lat_dim)).cuda()
                    yz_in_e = torch.zeros((1, 1, n_spk+config.lat_dim_e)).cuda()
                if config.ar_dec or config.ar_f0:
                    mean_stats = torch.FloatTensor(read_hdf5(config.stats, "/mean_"+config.string_path.replace("/","")))
                    scale_stats = torch.FloatTensor(read_hdf5(config.stats, "/scale_"+config.string_path.replace("/","")))
                if config.ar_dec:
                    x_in = ((torch.zeros((1, 1, config.mcep_dim))-mean_stats[config.excit_dim:])/scale_stats[config.excit_dim:]).cuda()
                if config.ar_f0:
                    e_in = torch.cat((torch.zeros(1,1,1), (torch.zeros(1,1,1)-mean_stats[1:2])/scale_stats[1:2], \
                                    torch.zeros(1,1,1), (torch.zeros(1,1,config.cap_dim)-mean_stats[3:config.excit_dim])/scale_stats[3:config.excit_dim]), 2).cuda()
            count = 0
            pad_left = (model_encoder_mcep.pad_left + model_decoder_mcep.pad_left)*2
            pad_right = (model_encoder_mcep.pad_right + model_decoder_mcep.pad_right)*2
            outpad_lefts = [None]*3
            outpad_rights = [None]*3
            outpad_lefts[0] = pad_left-model_encoder_mcep.pad_left
            outpad_rights[0] = pad_right-model_encoder_mcep.pad_right
            outpad_lefts[1] = outpad_lefts[0]-model_decoder_mcep.pad_left
            outpad_rights[1] = outpad_rights[0]-model_decoder_mcep.pad_right
            outpad_lefts[2] = outpad_lefts[1]-model_encoder_mcep.pad_left
            outpad_rights[2] = outpad_rights[1]-model_encoder_mcep.pad_right
            for feat_file in feat_list:
                # convert mcep
                logging.info("recmcep " + feat_file)

                feat_org = read_hdf5(feat_file, "/feat_mceplf0cap")
                logging.info(feat_org.shape)

                with torch.no_grad():
                    feat = F.pad(torch.FloatTensor(feat_org).cuda().unsqueeze(0).transpose(1,2), (pad_left,pad_right), "replicate").transpose(1,2)

                    if config.ar_enc:
                        spk_logits, lat_src, _, _ = model_encoder_mcep(feat, yz_in=yz_in)
                        spk_logits_e, lat_src_e, _, _ = model_encoder_excit(feat, yz_in=yz_in)
                    else:
                        spk_logits, lat_src, _ = model_encoder_mcep(feat)
                        spk_logits_e, lat_src_e, _ = model_encoder_excit(feat)
                    idx_vq = nn_search_batch(lat_src, model_vq.weight)
                    lat_src = model_vq(idx_vq)
                    if outpad_rights[0] > 0:
                        unique, counts = np.unique(idx_vq[:,outpad_lefts[0]:-outpad_rights[0]].cpu().data.numpy(), return_counts=True)
                    else:
                        unique, counts = np.unique(idx_vq[:,outpad_lefts[0]:].cpu().data.numpy(), return_counts=True)
                    logging.info("input vq")
                    logging.info(dict(zip(unique, counts)))
                    idx_vq_e = nn_search_batch(lat_src_e, model_vq.weight)
                    lat_src_e = model_vq(idx_vq_e)
                    if outpad_rights[0] > 0:
                        unique, counts = np.unique(idx_vq_e[:,outpad_lefts[0]:-outpad_rights[0]].cpu().data.numpy(), return_counts=True)
                    else:
                        unique, counts = np.unique(idx_vq_e[:,outpad_lefts[0]:].cpu().data.numpy(), return_counts=True)
                    logging.info("input vq_e")
                    logging.info(dict(zip(unique, counts)))
                    logging.info('input spkpost')
                    if outpad_rights[0] > 0:
                        logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[0]:-outpad_rights[0]], dim=-1), 1))
                    else:
                        logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[0]:], dim=-1), 1))
                    logging.info('input spkpost_e')
                    if outpad_rights[0] > 0:
                        logging.info(torch.mean(F.softmax(spk_logits_e[:,outpad_lefts[0]:-outpad_rights[0]], dim=-1), 1))
                    else:
                        logging.info(torch.mean(F.softmax(spk_logits_e[:,outpad_lefts[0]:], dim=-1), 1))

                    src_code = (torch.ones((1, lat_src.shape[1]))*spk_idx).cuda().long()

                    if config.ar_dec:
                        cvmcep_src, _, _ = model_decoder_mcep(src_code, lat_src, x_in=x_in)
                    else:
                        cvmcep_src, _ = model_decoder_mcep(src_code, lat_src)
                    if config.ar_f0:
                        cvlf0_src, _, _ = model_decoder_excit(src_code, lat_src_e, e_in=e_in)
                    else:
                        cvlf0_src, _ = model_decoder_excit(src_code, lat_src_e)

                    cv_feat = torch.cat((cvlf0_src, cvmcep_src), 2)
                    if config.ar_enc:
                        spk_logits, lat_rec, _, _ = model_encoder_mcep(cv_feat, yz_in=yz_in)
                        spk_logits_e, lat_rec_e, _, _ = model_encoder_excit(cv_feat, yz_in=yz_in)
                    else:
                        spk_logits, lat_rec, _ = model_encoder_mcep(cv_feat)
                        spk_logits_e, lat_rec_e, _ = model_encoder_excit(cv_feat)
                    idx_vq = nn_search_batch(lat_rec, model_vq.weight)
                    lat_rec = model_vq(idx_vq)
                    if outpad_rights[2] > 0:
                        unique, counts = np.unique(idx_vq[:,outpad_lefts[2]:-outpad_rights[2]].cpu().data.numpy(), return_counts=True)
                    else:
                        unique, counts = np.unique(idx_vq[:,outpad_lefts[2]:].cpu().data.numpy(), return_counts=True)
                    logging.info("input vq")
                    logging.info(dict(zip(unique, counts)))
                    idx_vq_e = nn_search_batch(lat_rec_e, model_vq.weight)
                    lat_rec_e = model_vq(idx_vq_e)
                    if outpad_rights[2] > 0:
                        unique, counts = np.unique(idx_vq_e[:,outpad_lefts[2]:-outpad_rights[2]].cpu().data.numpy(), return_counts=True)
                    else:
                        unique, counts = np.unique(idx_vq_e[:,outpad_lefts[2]:].cpu().data.numpy(), return_counts=True)
                    logging.info("input vq_e")
                    logging.info(dict(zip(unique, counts)))
                    logging.info('rec spkpost')
                    if outpad_rights[2] > 0:
                        logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[2]:-outpad_rights[2]], dim=-1), 1))
                    else:
                        logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[2]:], dim=-1), 1))
                    logging.info('rec spkpost_e')
                    if outpad_rights[2] > 0:
                        logging.info(torch.mean(F.softmax(spk_logits_e[:,outpad_lefts[2]:-outpad_rights[2]], dim=-1), 1))
                    else:
                        logging.info(torch.mean(F.softmax(spk_logits_e[:,outpad_lefts[2]:], dim=-1), 1))

                    src_code = (torch.ones((1, lat_rec.shape[1]))*spk_idx).cuda().long()

                    if config.ar_dec:
                        cvmcep_cyc, _, _ = model_decoder_mcep(src_code, lat_rec, x_in=x_in)
                    else:
                        cvmcep_cyc, _ = model_decoder_mcep(src_code, lat_rec)
                    if config.ar_f0:
                        cvlf0_cyc, _, _ = model_decoder_excit(src_code, lat_rec_e, e_in=e_in)
                    else:
                        cvlf0_cyc, _ = model_decoder_excit(src_code, lat_rec_e)

                    if outpad_rights[1] > 0:
                        cvmcep_src = cvmcep_src[:,outpad_lefts[1]:-outpad_rights[1]]
                        cvlf0_src = cvlf0_src[:,outpad_lefts[1]:-outpad_rights[1]]
                    else:
                        cvmcep_src = cvmcep_src[:,outpad_lefts[1]:]
                        cvlf0_src = cvlf0_src[:,outpad_lefts[1]:]

                    feat_rec = torch.cat((torch.round(cvlf0_src[:,:,:1]), cvlf0_src[:,:,1:2], \
                                            torch.round(cvlf0_src[:,:,2:3]), cvlf0_src[:,:,3:], cvmcep_src), \
                                                2)[0].cpu().data.numpy()
                    feat_cyc = torch.cat((torch.round(cvlf0_cyc[:,:,:1]), cvlf0_cyc[:,:,1:2], \
                                            torch.round(cvlf0_cyc[:,:,2:3]), cvlf0_cyc[:,:,3:], cvmcep_cyc), \
                                                2)[0].cpu().data.numpy()

                    cvmcep_src = np.array(cvmcep_src[0].cpu().data.numpy(), dtype=np.float64)
                    cvlf0_src = np.array(cvlf0_src[0].cpu().data.numpy(), dtype=np.float64)

                    cvmcep_cyc = np.array(cvmcep_cyc[0].cpu().data.numpy(), dtype=np.float64)
                    cvlf0_cyc = np.array(cvlf0_cyc[0].cpu().data.numpy(), dtype=np.float64)

                logging.info(cvlf0_src.shape)
                logging.info(cvmcep_src.shape)

                logging.info(cvlf0_cyc.shape)
                logging.info(cvmcep_cyc.shape)

                mcep = np.array(feat_org[:,-model_decoder_mcep.out_dim:])
                f0 = np.array(np.rint(feat_org[:,0])*np.exp(feat_org[:,1]))
                codeap = np.array(np.rint(feat_org[:,2:3])*(-np.exp(feat_org[:,3:feat_org.shape[-1]-model_decoder_mcep.out_dim])))
 
                cvf0_src = np.array(np.rint(cvlf0_src[:,0])*np.exp(cvlf0_src[:,1]))
                cvcodeap_src = np.array(np.rint(cvlf0_src[:,2:3])*(-np.exp(cvlf0_src[:,3:])))
                f0_rmse = np.sqrt(np.mean((cvf0_src-f0)**2))
                logging.info('F0_rmse_rec: %lf Hz' % (f0_rmse))
                cvf0_src_mean = np.mean(cvf0_src)
                f0_mean = np.mean(f0)
                f0_corr = np.sum((cvf0_src-cvf0_src_mean)*(f0-f0_mean))/\
                            (np.sqrt(np.sum((cvf0_src-cvf0_src_mean)**2))*np.sqrt(np.sum((f0-f0_mean)**2)))
                logging.info('F0_corr_rec: %lf' % (f0_corr))

                codeap_rmse = np.sqrt(np.mean((cvcodeap_src-codeap)**2, axis=0))
                for i in range(codeap_rmse.shape[-1]):
                    logging.info('codeap-%d_rmse_rec: %lf dB' % (i+1, codeap_rmse[i]))

                cvf0_cyc = np.array(np.rint(cvlf0_cyc[:,0])*np.exp(cvlf0_cyc[:,1]))
                cvcodeap_cyc = np.array(np.rint(cvlf0_cyc[:,2:3])*(-np.exp(cvlf0_cyc[:,3:])))
                f0_rmse_cyc = np.sqrt(np.mean((cvf0_cyc-f0)**2))
                logging.info('F0_rmse_cyc: %lf Hz' % (f0_rmse_cyc))
                cvf0_cyc_mean = np.mean(cvf0_cyc)
                f0_mean = np.mean(f0)
                f0_corr_cyc = np.sum((cvf0_cyc-cvf0_cyc_mean)*(f0-f0_mean))/\
                            (np.sqrt(np.sum((cvf0_cyc-cvf0_cyc_mean)**2))*np.sqrt(np.sum((f0-f0_mean)**2)))
                logging.info('F0_corr_cyc: %lf' % (f0_corr_cyc))

                codeap_rmse_cyc = np.sqrt(np.mean((cvcodeap_cyc-codeap)**2, axis=0))
                for i in range(codeap_rmse_cyc.shape[-1]):
                    logging.info('codeap-%d_rmse_cyc: %lf dB' % (i+1, codeap_rmse_cyc[i]))

                spcidx = read_hdf5(feat_file, "/spcidx_range")[0]

                _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(np.array(cvmcep_src[np.array(spcidx),:], \
                                            dtype=np.float64), np.array(mcep[np.array(spcidx),:], dtype=np.float64))
                _, _, _, mcd_arr = dtw.dtw_org_to_trg(np.array(cvmcep_src[np.array(spcidx),1:], \
                                            dtype=np.float64), np.array(mcep[np.array(spcidx),1:], dtype=np.float64))
                mcdpow_mean = np.mean(mcdpow_arr)
                mcdpow_std = np.std(mcdpow_arr)
                mcd_mean = np.mean(mcd_arr)
                mcd_std = np.std(mcd_arr)
                logging.info("mcdpow_rec: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std))
                logging.info("mcd_rec: %.6f dB +- %.6f" % (mcd_mean, mcd_std))

                _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(np.array(cvmcep_cyc[np.array(spcidx),:], \
                                            dtype=np.float64), np.array(mcep[np.array(spcidx),:], dtype=np.float64))
                _, _, _, mcd_arr = dtw.dtw_org_to_trg(np.array(cvmcep_cyc[np.array(spcidx),1:], \
                                            dtype=np.float64), np.array(mcep[np.array(spcidx),1:], dtype=np.float64))
                mcdpow_mean_cyc = np.mean(mcdpow_arr)
                mcdpow_std_cyc = np.std(mcdpow_arr)
                mcd_mean_cyc = np.mean(mcd_arr)
                mcd_std_cyc = np.std(mcd_arr)
                logging.info("mcdpow_cyc: %.6f dB +- %.6f" % (mcdpow_mean_cyc, mcdpow_std_cyc))
                logging.info("mcd_cyc: %.6f dB +- %.6f" % (mcd_mean_cyc, mcd_std_cyc))
            
                logging.info('org f0')
                logging.info(f0[10:15])
                logging.info('rec f0')
                logging.info(cvf0_src[10:15])
                logging.info('cyc f0')
                logging.info(cvf0_cyc[10:15])
                logging.info('org cap')
                logging.info(codeap[10:15])
                logging.info('rec cap')
                logging.info(cvcodeap_src[10:15])
                logging.info('cyc cap')
                logging.info(cvcodeap_cyc[10:15])

                dataset = feat_file.split('/')[1].split('_')[0]
                if 'tr' in dataset:
                    logging.info('trn')
                    f0rmse_cvlist.append(f0_rmse)
                    f0corr_cvlist.append(f0_corr)
                    caprmse_cvlist.append(codeap_rmse)
                    mcdpow_cvlist.append(mcdpow_mean)
                    mcdpowstd_cvlist.append(mcdpow_std)
                    mcd_cvlist.append(mcd_mean)
                    mcdstd_cvlist.append(mcd_std)
                    cvlist.append(np.var(cvmcep_src[:,1:], axis=0))
                    logging.info(len(cvlist))
                    f0rmse_cvlist_cyc.append(f0_rmse_cyc)
                    f0corr_cvlist_cyc.append(f0_corr_cyc)
                    caprmse_cvlist_cyc.append(codeap_rmse_cyc)
                    mcdpow_cvlist_cyc.append(mcdpow_mean_cyc)
                    mcdpowstd_cvlist_cyc.append(mcdpow_std_cyc)
                    mcd_cvlist_cyc.append(mcd_mean_cyc)
                    mcdstd_cvlist_cyc.append(mcd_std_cyc)
                    cvlist_cyc.append(np.var(cvmcep_cyc[:,1:], axis=0))
                elif 'dv' in dataset:
                    logging.info('dev')
                    f0rmse_cvlist_dv.append(f0_rmse)
                    f0corr_cvlist_dv.append(f0_corr)
                    caprmse_cvlist_dv.append(codeap_rmse)
                    mcdpow_cvlist_dv.append(mcdpow_mean)
                    mcdpowstd_cvlist_dv.append(mcdpow_std)
                    mcd_cvlist_dv.append(mcd_mean)
                    mcdstd_cvlist_dv.append(mcd_std)
                    cvlist_dv.append(np.var(cvmcep_src[:,1:], axis=0))
                    logging.info(len(cvlist_dv))
                    f0rmse_cvlist_cyc_dv.append(f0_rmse_cyc)
                    f0corr_cvlist_cyc_dv.append(f0_corr_cyc)
                    caprmse_cvlist_cyc_dv.append(codeap_rmse_cyc)
                    mcdpow_cvlist_cyc_dv.append(mcdpow_mean_cyc)
                    mcdpowstd_cvlist_cyc_dv.append(mcdpow_std_cyc)
                    mcd_cvlist_cyc_dv.append(mcd_mean_cyc)
                    mcdstd_cvlist_cyc_dv.append(mcd_std_cyc)
                    cvlist_cyc_dv.append(np.var(cvmcep_cyc[:,1:], axis=0))

                logging.info('write rec to h5')
                outh5dir = os.path.join(os.path.dirname(os.path.dirname(feat_file)), args.spk+"-"+args.spk)
                if not os.path.exists(outh5dir):
                    os.makedirs(outh5dir)
                feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
                logging.info(feat_file + ' ' + args.string_path)
                logging.info(feat_rec.shape)
                write_hdf5(feat_file, args.string_path, feat_rec)

                logging.info('write cyc to h5')
                outh5dir = os.path.join(os.path.dirname(os.path.dirname(feat_file)), args.spk+"-"+args.spk+"-"+args.spk)
                if not os.path.exists(outh5dir):
                    os.makedirs(outh5dir)
                feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
                logging.info(feat_file + ' ' + args.string_path)
                logging.info(feat_cyc.shape)
                write_hdf5(feat_file, args.string_path, feat_cyc)

                count += 1
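
The mcdpow/mcd statistics logged above are produced by the project's own `dtw.dtw_org_to_trg` helper, which is not shown here. As a rough reference only, here is a minimal sketch of the standard frame-wise mel-cepstral distortion in dB, assuming the two mel-cepstrum sequences are already time-aligned; `melcd_frames` is a hypothetical helper name, not part of the project.

import numpy as np

def melcd_frames(cv_mcep, org_mcep):
    # cv_mcep, org_mcep: (T, dim) arrays of aligned mel-cepstra
    # returns a (T,) array; its mean/std correspond to the "... dB +- ..." logs above
    diff = cv_mcep - org_mcep
    return (10.0 / np.log(10.0)) * np.sqrt(2.0 * np.sum(diff ** 2, axis=1))

# power-included vs power-excluded (drop the 0th coefficient), as in the logs:
# mcdpow_arr = melcd_frames(cvmcep_src[spcidx], mcep[spcidx])
# mcd_arr    = melcd_frames(cvmcep_src[spcidx, 1:], mcep[spcidx, 1:])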
Code example #4
    def decode_RNN(feat_list, gpu, cvlist=None,
            mcd_cvlist_src=None, mcdstd_cvlist_src=None, mcdpow_cvlist_src=None, mcdpowstd_cvlist_src=None,\
            mcd_cvlist_cyc=None, mcdstd_cvlist_cyc=None, mcdpow_cvlist_cyc=None, mcdpowstd_cvlist_cyc=None,\
            mcd_cvlist=None, mcdstd_cvlist=None, mcdpow_cvlist=None, mcdpowstd_cvlist=None, \
            lat_dist_rmse_list=None, lat_dist_cosim_list=None):
        with torch.cuda.device(gpu):
            # define model and load parameters
            with torch.no_grad():
                model_encoder = GRU_VAE_ENCODER(
                    in_dim=config.mcep_dim+config.excit_dim,
                    n_spk=n_spk,
                    lat_dim=config.lat_dim,
                    hidden_layers=config.hidden_layers_enc,
                    hidden_units=config.hidden_units_enc,
                    kernel_size=config.kernel_size_enc,
                    dilation_size=config.dilation_size_enc,
                    causal_conv=config.causal_conv_enc,
                    bi=False,
                    ar=False,
                    pad_first=True,
                    right_size=config.right_size_enc)
                logging.info(model_encoder)
                model_decoder = GRU_SPEC_DECODER(
                    feat_dim=config.lat_dim,
                    out_dim=config.mcep_dim,
                    n_spk=n_spk,
                    hidden_layers=config.hidden_layers_dec,
                    hidden_units=config.hidden_units_dec,
                    kernel_size=config.kernel_size_dec,
                    dilation_size=config.dilation_size_dec,
                    causal_conv=config.causal_conv_dec,
                    bi=False,
                    ar=False,
                    pad_first=True,
                    right_size=config.right_size_dec)
                logging.info(model_decoder)
                model_post = GRU_POST_NET(
                    spec_dim=config.mcep_dim,
                    excit_dim=2,
                    n_spk=n_spk,
                    hidden_layers=config.hidden_layers_post,
                    hidden_units=config.hidden_units_post,
                    kernel_size=config.kernel_size_post,
                    dilation_size=config.dilation_size_post,
                    causal_conv=config.causal_conv_post,
                    pad_first=True,
                    right_size=config.right_size_post)
                    #excit_dim=config.excit_dim,
                    #excit_dim=None,
                logging.info(model_post)
                model_encoder.load_state_dict(torch.load(args.model)["model_encoder"])
                model_decoder.load_state_dict(torch.load(args.model)["model_decoder"])
                model_post.load_state_dict(torch.load(args.model)["model_post"])
                model_encoder.remove_weight_norm()
                model_decoder.remove_weight_norm()
                model_post.remove_weight_norm()
                model_encoder.cuda()
                model_decoder.cuda()
                model_post.cuda()
                model_encoder.eval()
                model_decoder.eval()
                model_post.eval()
                for param in model_encoder.parameters():
                    param.requires_grad = False
                for param in model_decoder.parameters():
                    param.requires_grad = False
                for param in model_post.parameters():
                    param.requires_grad = False
            count = 0
            pad_left = (model_encoder.pad_left + model_decoder.pad_left + model_post.pad_left)*2
            pad_right = (model_encoder.pad_right + model_decoder.pad_right + model_post.pad_right)*2
            outpad_lefts = [None]*5
            outpad_rights = [None]*5
            outpad_lefts[0] = pad_left-model_encoder.pad_left
            outpad_rights[0] = pad_right-model_encoder.pad_right
            outpad_lefts[1] = outpad_lefts[0]-model_decoder.pad_left
            outpad_rights[1] = outpad_rights[0]-model_decoder.pad_right
            outpad_lefts[2] = outpad_lefts[1]-model_post.pad_left
            outpad_rights[2] = outpad_rights[1]-model_post.pad_right
            outpad_lefts[3] = outpad_lefts[2]-model_encoder.pad_left
            outpad_rights[3] = outpad_rights[2]-model_encoder.pad_right
            outpad_lefts[4] = outpad_lefts[3]-model_decoder.pad_left
            outpad_rights[4] = outpad_rights[3]-model_decoder.pad_right
            logging.info(f'{pad_left} {pad_right}')
            logging.info(outpad_lefts)
            logging.info(outpad_rights)
            for feat_file in feat_list:
                # convert mcep
                spk_src = os.path.basename(os.path.dirname(feat_file))
                src_idx = spk_list.index(spk_src)
                logging.info('%s --> %s' % (spk_src, args.spk_trg))

                file_trg = os.path.join(os.path.dirname(os.path.dirname(feat_file)), args.spk_trg, os.path.basename(feat_file))
                trg_exist = False
                if os.path.exists(file_trg):
                    logging.info('exist: %s' % (file_trg))
                    feat_trg = read_hdf5(file_trg, config.string_path)
                    mcep_trg = feat_trg[:,-config.mcep_dim:]
                    logging.info(mcep_trg.shape)
                    trg_exist = True

                feat_org = read_hdf5(feat_file, config.string_path)
                mcep = np.array(feat_org[:,-config.mcep_dim:])
                codeap = np.array(np.rint(feat_org[:,2:3])*(-np.exp(feat_org[:,3:config.excit_dim])))
                sp = np.array(ps.mc2sp(mcep, args.mcep_alpha, args.fftl))
                ap = pw.decode_aperiodicity(codeap, args.fs, args.fftl)
                feat_cvf0_lin = np.expand_dims(convert_f0(np.exp(feat_org[:,1]), src_f0_mean, src_f0_std, trg_f0_mean, trg_f0_std), axis=-1)
                feat_cv = np.c_[feat_org[:,:1], np.log(feat_cvf0_lin), feat_org[:,2:config.excit_dim]]

                logging.info("generate")
                with torch.no_grad():
                    feat = F.pad(torch.FloatTensor(feat_org).cuda().unsqueeze(0).transpose(1,2), (pad_left,pad_right), "replicate").transpose(1,2)
                    feat_excit = torch.FloatTensor(feat_org[:,:config.excit_dim]).cuda().unsqueeze(0)
                    feat_excit_cv = torch.FloatTensor(feat_cv).cuda().unsqueeze(0)

                    spk_logits, _, lat_src, _ = model_encoder(feat, sampling=False)
                    logging.info('input spkpost')
                    if outpad_rights[0] > 0:
                        logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[0]:-outpad_rights[0]], dim=-1), 1))
                    else:
                        logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[0]:], dim=-1), 1))

                    if trg_exist:
                        spk_trg_logits, _, lat_trg, _ = model_encoder(F.pad(torch.FloatTensor(feat_trg).cuda().unsqueeze(0).transpose(1,2), \
                                                            (model_encoder.pad_left,model_encoder.pad_right), "replicate").transpose(1,2), sampling=False)
                        logging.info('target spkpost')
                        logging.info(torch.mean(F.softmax(spk_trg_logits, dim=-1), 1))

                    cvmcep_src, _ = model_decoder((torch.ones((1, lat_src.shape[1]))*src_idx).cuda().long(), lat_src)
                    cvmcep_src_post, _ = model_post(cvmcep_src, y=(torch.ones((1, cvmcep_src.shape[1]))*src_idx).cuda().long(),
                                        e=F.pad(feat_excit[:,:,:2].transpose(1,2), (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2))
                                        #e=F.pad(feat_excit.transpose(1,2), (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2))
                    if model_post.pad_right > 0:
                        spk_logits, _, lat_rec, _ = model_encoder(torch.cat((F.pad(feat_excit.transpose(1,2), \
                                            (outpad_lefts[2],outpad_rights[2]), "replicate").transpose(1,2), cvmcep_src[:,model_post.pad_left:-model_post.pad_right]), 2), 
                                                            sampling=False)
                    else:
                        spk_logits, _, lat_rec, _ = model_encoder(torch.cat((F.pad(feat_excit.transpose(1,2), \
                                            (outpad_lefts[2],outpad_rights[2]), "replicate").transpose(1,2), cvmcep_src[:,model_post.pad_left:]), 2), 
                                                            sampling=False)
                    logging.info('rec spkpost')
                    if outpad_rights[3] > 0:
                        logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[3]:-outpad_rights[3]], dim=-1), 1))
                    else:
                        logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[3]:], dim=-1), 1))

                    cvmcep, _ = model_decoder((torch.ones((1, lat_src.shape[1]))*trg_idx).cuda().long(), lat_src)
                    cvmcep_post, _ = model_post(cvmcep, y=(torch.ones((1, cvmcep.shape[1]))*trg_idx).cuda().long(),
                                        e=F.pad(feat_excit_cv[:,:,:2].transpose(1,2), (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2))
                                        #e=F.pad(feat_excit_cv.transpose(1,2), (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2))
                    if model_post.pad_right > 0:
                        spk_logits, _, lat_cv, _ = model_encoder(torch.cat((F.pad(feat_excit_cv.transpose(1,2), \
                                            (outpad_lefts[2],outpad_rights[2]), "replicate").transpose(1,2), cvmcep[:,model_post.pad_left:-model_post.pad_right]), 2), 
                                                            sampling=False)
                    else:
                        spk_logits, _, lat_cv, _ = model_encoder(torch.cat((F.pad(feat_excit_cv.transpose(1,2), \
                                            (outpad_lefts[2],outpad_rights[2]), "replicate").transpose(1,2), cvmcep[:,model_post.pad_left:]), 2), 
                                                            sampling=False)
                    logging.info('cv spkpost')
                    if outpad_rights[3] > 0:
                        logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[3]:-outpad_rights[3]], dim=-1), 1))
                    else:
                        logging.info(torch.mean(F.softmax(spk_logits[:,outpad_lefts[3]:], dim=-1), 1))

                    cvmcep_cyc, _ = model_decoder((torch.ones((1, lat_cv.shape[1]))*src_idx).cuda().long(), lat_cv)
                    cvmcep_cyc_post, _ = model_post(cvmcep_cyc, y=(torch.ones((1, cvmcep_cyc.shape[1]))*src_idx).cuda().long(),
                                        e=F.pad(feat_excit[:,:,:2].transpose(1,2), (outpad_lefts[4],outpad_rights[4]), "replicate").transpose(1,2))
                                        #e=F.pad(feat_excit.transpose(1,2), (outpad_lefts[4],outpad_rights[4]), "replicate").transpose(1,2))

                    if outpad_rights[2] > 0:
                        cvmcep_src = np.array(cvmcep_src_post[0,outpad_lefts[2]:-outpad_rights[2]].cpu().data.numpy(), dtype=np.float64)
                        cvmcep = np.array(cvmcep_post[0,outpad_lefts[2]:-outpad_rights[2]].cpu().data.numpy(), dtype=np.float64)
                    else:
                        cvmcep_src = np.array(cvmcep_src_post[0,outpad_lefts[2]:].cpu().data.numpy(), dtype=np.float64)
                        cvmcep = np.array(cvmcep_post[0,outpad_lefts[2]:].cpu().data.numpy(), dtype=np.float64)
                    cvmcep_cyc = np.array(cvmcep_cyc_post[0].cpu().data.numpy(), dtype=np.float64)

                    if trg_exist:
                        if outpad_rights[0] > 0:
                            lat_src = lat_src[:,outpad_lefts[0]:-outpad_rights[0]]
                        else:
                            lat_src = lat_src[:,outpad_lefts[0]:]

                logging.info(cvmcep_src.shape)
                logging.info(cvmcep.shape)
                logging.info(cvmcep_cyc.shape)

                if trg_exist:
                    logging.info(lat_src.shape)
                    logging.info(lat_trg.shape)
 
                cvlist.append(np.var(cvmcep[:,1:], axis=0))

                logging.info("cvf0lin")
                f0_range = read_hdf5(feat_file, "/f0_range")
                cvf0_range_lin = convert_f0(f0_range, src_f0_mean, src_f0_std, trg_f0_mean, trg_f0_std)
                uv_range_lin, cont_f0_range_lin = convert_continuos_f0(np.array(cvf0_range_lin))
                unique, counts = np.unique(uv_range_lin, return_counts=True)
                logging.info(dict(zip(unique, counts)))
                cont_f0_lpf_range_lin = \
                    low_pass_filter(cont_f0_range_lin, int(1.0 / (args.shiftms * 0.001)), cutoff=20)
                uv_range_lin = np.expand_dims(uv_range_lin, axis=-1)
                cont_f0_lpf_range_lin = np.expand_dims(cont_f0_lpf_range_lin, axis=-1)
                # plain converted feat for neural vocoder
                feat_cv = np.c_[uv_range_lin, np.log(cont_f0_lpf_range_lin), feat_cv[:,2:config.excit_dim], cvmcep]
                logging.info(feat_cv.shape)

                logging.info("mcd acc")
                spcidx = np.array(read_hdf5(feat_file, "/spcidx_range")[0])
                _, mcdpow_arr = dtw.calc_mcd(np.array(mcep[spcidx], dtype=np.float64), np.array(cvmcep_src[spcidx], dtype=np.float64))
                _, mcd_arr = dtw.calc_mcd(np.array(mcep[spcidx,1:], dtype=np.float64), np.array(cvmcep_src[spcidx,1:], dtype=np.float64))
                mcdpow_mean = np.mean(mcdpow_arr)
                mcdpow_std = np.std(mcdpow_arr)
                mcd_mean = np.mean(mcd_arr)
                mcd_std = np.std(mcd_arr)
                logging.info("mcdpow_src_cv: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std))
                logging.info("mcd_src_cv: %.6f dB +- %.6f" % (mcd_mean, mcd_std))
                mcdpow_cvlist_src.append(mcdpow_mean)
                mcdpowstd_cvlist_src.append(mcdpow_std)
                mcd_cvlist_src.append(mcd_mean)
                mcdstd_cvlist_src.append(mcd_std)
                if trg_exist:
                    spcidx_trg = np.array(read_hdf5(file_trg, "/spcidx_range")[0])
                    _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(np.array(cvmcep[spcidx], \
                                                dtype=np.float64), np.array(mcep_trg[spcidx_trg], dtype=np.float64))
                    _, _, _, mcd_arr = dtw.dtw_org_to_trg(np.array(cvmcep[spcidx,1:], \
                                                dtype=np.float64), np.array(mcep_trg[spcidx_trg,1:], dtype=np.float64))
                    mcdpow_mean = np.mean(mcdpow_arr)
                    mcdpow_std = np.std(mcdpow_arr)
                    mcd_mean = np.mean(mcd_arr)
                    mcd_std = np.std(mcd_arr)
                    logging.info("mcdpow_trg: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std))
                    logging.info("mcd_trg: %.6f dB +- %.6f" % (mcd_mean, mcd_std))
                    mcdpow_cvlist.append(mcdpow_mean)
                    mcdpowstd_cvlist.append(mcdpow_std)
                    mcd_cvlist.append(mcd_mean)
                    mcdstd_cvlist.append(mcd_std)
                    spcidx_src = torch.LongTensor(spcidx).cuda()
                    spcidx_trg = torch.LongTensor(spcidx_trg).cuda()
                    trj_lat_src = np.array(torch.index_select(lat_src[0],0,spcidx_src).cpu().data.numpy(), dtype=np.float64)
                    trj_lat_trg = np.array(torch.index_select(lat_trg[0],0,spcidx_trg).cpu().data.numpy(), dtype=np.float64)
                    aligned_lat_srctrg, _, _, _ = dtw.dtw_org_to_trg(trj_lat_src, trj_lat_trg)
                    lat_dist_srctrg = np.mean(np.sqrt(np.mean((aligned_lat_srctrg-trj_lat_trg)**2, axis=0)))
                    _, _, lat_cdist_srctrg, _ = dtw.dtw_org_to_trg(trj_lat_trg, trj_lat_src, mcd=0)
                    aligned_lat_trgsrc, _, _, _ = dtw.dtw_org_to_trg(trj_lat_trg, trj_lat_src)
                    lat_dist_trgsrc = np.mean(np.sqrt(np.mean((aligned_lat_trgsrc-trj_lat_src)**2, axis=0)))
                    _, _, lat_cdist_trgsrc, _ = dtw.dtw_org_to_trg(trj_lat_src, trj_lat_trg, mcd=0)
                    logging.info("%lf %lf %lf %lf" % (lat_dist_srctrg, lat_cdist_srctrg, lat_dist_trgsrc, lat_cdist_trgsrc))
                    lat_dist_rmse = (lat_dist_srctrg+lat_dist_trgsrc)/2
                    lat_dist_cosim = (lat_cdist_srctrg+lat_cdist_trgsrc)/2
                    lat_dist_rmse_list.append(lat_dist_rmse)
                    lat_dist_cosim_list.append(lat_dist_cosim)
                    logging.info("lat_dist: %.6f %.6f" % (lat_dist_rmse, lat_dist_cosim))
                _, mcdpow_arr = dtw.calc_mcd(np.array(mcep[spcidx], dtype=np.float64), np.array(cvmcep_cyc[spcidx], dtype=np.float64))
                _, mcd_arr = dtw.calc_mcd(np.array(mcep[spcidx,1:], dtype=np.float64), np.array(cvmcep_cyc[spcidx,1:], dtype=np.float64))
                mcdpow_mean = np.mean(mcdpow_arr)
                mcdpow_std = np.std(mcdpow_arr)
                mcd_mean = np.mean(mcd_arr)
                mcd_std = np.std(mcd_arr)
                logging.info("mcdpow_cyc_cv: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std))
                logging.info("mcd_cyc_cv: %.6f dB +- %.6f" % (mcd_mean, mcd_std))
                mcdpow_cvlist_cyc.append(mcdpow_mean)
                mcdpowstd_cvlist_cyc.append(mcdpow_std)
                mcd_cvlist_cyc.append(mcd_mean)
                mcdstd_cvlist_cyc.append(mcd_std)

                logging.info("synth anasyn")
                wav = np.clip(pw.synthesize(f0_range, sp, ap, args.fs, frame_period=args.shiftms), -1, 1)
                wavpath = os.path.join(args.outdir,os.path.basename(feat_file).replace(".h5","_anasyn.wav"))
                sf.write(wavpath, wav, args.fs, 'PCM_16')
                logging.info(wavpath)

                logging.info("synth voco rec")
                cvsp_src = ps.mc2sp(cvmcep_src, args.mcep_alpha, args.fftl)
                logging.info(cvsp_src.shape)
                wav = np.clip(pw.synthesize(f0_range, cvsp_src, ap, args.fs, frame_period=args.shiftms), -1, 1)
                wavpath = os.path.join(args.outdir, os.path.basename(feat_file).replace(".h5", "_rec.wav"))
                sf.write(wavpath, wav, args.fs, 'PCM_16')
                logging.info(wavpath)

                logging.info("synth voco cv")
                cvsp = ps.mc2sp(cvmcep, args.mcep_alpha, args.fftl)
                logging.info(cvsp.shape)
                wav = np.clip(pw.synthesize(cvf0_range_lin, cvsp, ap, args.fs, frame_period=args.shiftms), -1, 1)
                wavpath = os.path.join(args.outdir, os.path.basename(feat_file).replace(".h5", "_cv.wav"))
                sf.write(wavpath, wav, args.fs, 'PCM_16')
                logging.info(wavpath)

                logging.info("synth voco cv GV")
                datamean = np.mean(cvmcep[:,1:], axis=0)
                cvmcep_gv =  np.c_[cvmcep[:,0], args.gv_coeff*(np.sqrt(gv_mean_trg/cvgv_mean) * \
                                    (cvmcep[:,1:]-datamean) + datamean) + (1-args.gv_coeff)*cvmcep[:,1:]]
                cvmcep_gv = mod_pow(cvmcep_gv, cvmcep, alpha=args.mcep_alpha, irlen=IRLEN)
                cvsp_gv = ps.mc2sp(cvmcep_gv, args.mcep_alpha, args.fftl)
                logging.info(cvsp_gv.shape)
                wav = np.clip(pw.synthesize(cvf0_range_lin, cvsp_gv, ap, args.fs, frame_period=args.shiftms), -1, 1)
                wavpath = os.path.join(args.outdir, os.path.basename(feat_file).replace(".h5", "_cvGV.wav"))
                sf.write(wavpath, wav, args.fs, 'PCM_16')
                logging.info(wavpath)

                #logging.info("synth diffGV")
                #shiftl = int(args.fs/1000*args.shiftms)
                #mc_cv_diff = cvmcep_gv-mcep
                #b = np.apply_along_axis(ps.mc2b, 1, mc_cv_diff, args.mcep_alpha)
                #logging.info(b.shape)
                #assert np.isfinite(b).all()
                #mlsa_fil = ps.synthesis.Synthesizer(MLSADF(mcep_dim, alpha=args.mcep_alpha), shiftl)
                #x, fs_ = sf.read(os.path.join(os.path.dirname(feat_file).replace("hdf5", "wav_filtered"), os.path.basename(feat_file).replace(".h5", ".wav")))
                #assert(fs_ == args.fs)
                #wav = mlsa_fil.synthesis(x, b)
                #wav = np.clip(wav, -1, 1)
                #wavpath = os.path.join(args.outdir, os.path.basename(feat_file).replace(".h5", "_DiffGV.wav"))
                #sf.write(wavpath, wav, args.fs, 'PCM_16')
                #logging.info(wavpath)

                #logging.info("synth diffGVF0")
                #time_axis = read_hdf5(feat_file, "/time_axis")
                #sp_diff = pw.cheaptrick(wav, f0_range, time_axis, args.fs, fft_size=args.fftl)
                #logging.info(sp_diff.shape)
                #ap_diff = pw.d4c(wav, f0_range, time_axis, args.fs, fft_size=args.fftl)
                #logging.info(ap_diff.shape)
                #wav = pw.synthesize(cvf0_range_lin, sp_diff, ap_diff, args.fs, frame_period=args.shiftms)
                #wav = np.clip(wav, -1, 1)
                #wavpath = os.path.join(args.outdir,os.path.basename(feat_file).replace(".h5", "_DiffGVF0.wav"))
                #sf.write(wavpath, wav, args.fs, 'PCM_16')
                #logging.info(wavpath)

                #logging.info("analysis diffGVF0")
                #sp_diff_anasyn = pw.cheaptrick(wav, cvf0_range_lin, time_axis, args.fs, fft_size=args.fftl)
                #logging.info(sp_diff_anasyn.shape)
                #mc_cv_diff_anasyn = ps.sp2mc(sp_diff_anasyn, mcep_dim, args.mcep_alpha)
                #ap_diff_anasyn = pw.d4c(wav, cvf0_range_lin, time_axis, args.fs, fft_size=args.fftl)
                #code_ap_diff_anasyn = pw.code_aperiodicity(ap_diff_anasyn, args.fs)
                ## convert to continuous codeap with uv
                #for i in range(code_ap_diff_anasyn.shape[-1]):
                #    logging.info('codeap: %d' % (i+1))
                #    uv_codeap_i, cont_codeap_i = convert_continuos_codeap(np.array(code_ap_diff_anasyn[:,i]))
                #    cont_codeap_i = np.log(-np.clip(cont_codeap_i, a_min=np.amin(cont_codeap_i), a_max=MAX_CODEAP))
                #    if i > 0:
                #        cont_codeap = np.c_[cont_codeap, np.expand_dims(cont_codeap_i, axis=-1)]
                #    else:
                #        uv_codeap = np.expand_dims(uv_codeap_i, axis=-1)
                #        cont_codeap = np.expand_dims(cont_codeap_i, axis=-1)
                #    uv_codeap_i = np.expand_dims(uv_codeap_i, axis=-1)
                #    unique, counts = np.unique(uv_codeap_i, return_counts=True)
                #    logging.info(dict(zip(unique, counts)))
                ## postprocessed converted feat for neural vocoder
                #feat_diffgv_anasyn = np.c_[feat_cv[:,:2], uv_codeap, cont_codeap, mc_cv_diff_anasyn]

                #logging.info("write lat")
                #outTxtDir = os.path.join(args.outdir, os.path.basename(os.path.dirname(feat_file)))
                #if not os.path.exists(outTxtDir):
                #    os.mkdir(outTxtDir)
                #outTxt = os.path.join(outTxtDir, os.path.basename(feat_file).replace(".h5", ".txt"))
                #logging.info(outTxt)
                #g = open(outTxt, "wt")
                #idx_frm = 0 
                #nfrm = trj_lat_src.shape[0]
                #dim = trj_lat_src.shape[1]
                #if not args.time_flag:
                ##if True:
                #    while idx_frm < nfrm:
                #        idx_elmt = 1 
                #        for elmt in trj_lat_src[idx_frm]:
                #            if idx_elmt < dim:
                #                g.write("%lf " % (elmt))
                #            else:
                #                g.write("%lf\n" % (elmt))
                #            idx_elmt += 1
                #        idx_frm += 1
                #else:
                #    while idx_frm < nfrm:
                #        idx_elmt = 1 
                #        for elmt in trj_lat_src[idx_frm]:
                #            if idx_elmt < dim:
                #                if idx_elmt > 1:
                #                    g.write("%lf " % (elmt))
                #                else:
                #                    g.write("%lf %lf " % (time_axis[idx_frm], elmt))
                #            else:
                #                g.write("%lf\n" % (elmt))
                #            idx_elmt += 1
                #        idx_frm += 1
                #g.close()

                logging.info('write to h5')
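                # save the converted features under a "<source>-<target>" sibling directory, keyed by args.string_path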
                outh5dir = os.path.join(os.path.dirname(os.path.dirname(feat_file)), spk_src+"-"+args.spk_trg)
                if not os.path.exists(outh5dir):
                    os.makedirs(outh5dir)
                feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
                # cv
                write_path = args.string_path
                logging.info(feat_file + ' ' + write_path)
                logging.info(feat_cv.shape)
                write_hdf5(feat_file, write_path, feat_cv)
                ## diffGVF0
                #write_path = args.string_path+"_diffgvf0"
                #logging.info(feat_file + ' ' + write_path)
                #logging.info(feat_diffgv_anasyn.shape)
                #write_hdf5(feat_file, write_path, feat_diffgv_anasyn)

                count += 1
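
Note: the mcdpow / mcd figures logged throughout these examples are mel-cepstral distortions in dB, computed with (mcdpow) and without (mcd) the 0th power coefficient. The helpers dtw.calc_mcd and dtw.dtw_org_to_trg are project-specific, so the snippet below is only a minimal sketch of the standard frame-wise MCD they are assumed to report once two mcep sequences have been time-aligned; mcd_frame_db is a hypothetical name introduced here for illustration.

import numpy as np

def mcd_frame_db(mcep_x, mcep_y):
    # standard frame-wise mel-cepstral distortion in dB:
    # (10 / ln 10) * sqrt(2 * sum_d (x_d - y_d)^2)
    diff = np.asarray(mcep_x, dtype=np.float64) - np.asarray(mcep_y, dtype=np.float64)
    return (10.0 / np.log(10.0)) * np.sqrt(2.0 * np.sum(diff ** 2, axis=-1))

# on two already-aligned (T, dim) mcep matrices: keep column 0 for "mcdpow", drop it for "mcd"
# mcd_arr = mcd_frame_db(aligned_cv[:, 1:], aligned_trg[:, 1:])
# logging.info("mcd: %.6f dB +- %.6f" % (np.mean(mcd_arr), np.std(mcd_arr)))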
コード例 #5
    def gpu_decode(feat_list, gpu, cvlist=None, mcdpow_cvlist=None, mcdpowstd_cvlist=None, mcd_cvlist=None, \
                    mcdstd_cvlist=None, cvlist_dv=None, mcdpow_cvlist_dv=None, mcdpowstd_cvlist_dv=None, \
                    mcd_cvlist_dv=None, mcdstd_cvlist_dv=None, \
                    cvlist_cyc=None, mcdpow_cvlist_cyc=None, mcdpowstd_cvlist_cyc=None, mcd_cvlist_cyc=None, \
                    mcdstd_cvlist_cyc=None, cvlist_cyc_dv=None, mcdpow_cvlist_cyc_dv=None, mcdpowstd_cvlist_cyc_dv=None, \
                    mcd_cvlist_cyc_dv=None, mcdstd_cvlist_cyc_dv=None):
        with torch.cuda.device(gpu):
            # define model and load parameters
            with torch.no_grad():
                model_encoder = GRU_VAE_ENCODER(
                    in_dim=config.mcep_dim + config.excit_dim,
                    n_spk=n_spk,
                    lat_dim=config.lat_dim,
                    hidden_layers=config.hidden_layers_enc,
                    hidden_units=config.hidden_units_enc,
                    kernel_size=config.kernel_size_enc,
                    dilation_size=config.dilation_size_enc,
                    causal_conv=config.causal_conv_enc,
                    bi=False,
                    ar=False,
                    pad_first=True,
                    right_size=config.right_size_enc)
                logging.info(model_encoder)
                model_decoder = GRU_SPEC_DECODER(
                    feat_dim=config.lat_dim,
                    out_dim=config.mcep_dim,
                    n_spk=n_spk,
                    hidden_layers=config.hidden_layers_dec,
                    hidden_units=config.hidden_units_dec,
                    kernel_size=config.kernel_size_dec,
                    dilation_size=config.dilation_size_dec,
                    causal_conv=config.causal_conv_dec,
                    bi=False,
                    ar=False,
                    pad_first=True,
                    right_size=config.right_size_dec)
                logging.info(model_decoder)
                model_post = GRU_POST_NET(
                    spec_dim=config.mcep_dim,
                    excit_dim=2,
                    n_spk=n_spk,
                    hidden_layers=config.hidden_layers_post,
                    hidden_units=config.hidden_units_post,
                    kernel_size=config.kernel_size_post,
                    dilation_size=config.dilation_size_post,
                    causal_conv=config.causal_conv_post,
                    pad_first=True,
                    right_size=config.right_size_post)
                #excit_dim=config.excit_dim,
                #excit_dim=None,
                logging.info(model_post)
                model_encoder.load_state_dict(
                    torch.load(args.model)["model_encoder"])
                model_decoder.load_state_dict(
                    torch.load(args.model)["model_decoder"])
                model_post.load_state_dict(
                    torch.load(args.model)["model_post"])
                model_encoder.remove_weight_norm()
                model_decoder.remove_weight_norm()
                model_post.remove_weight_norm()
                model_encoder.cuda()
                model_decoder.cuda()
                model_post.cuda()
                model_encoder.eval()
                model_decoder.eval()
                model_post.eval()
                for param in model_encoder.parameters():
                    param.requires_grad = False
                for param in model_decoder.parameters():
                    param.requires_grad = False
                for param in model_post.parameters():
                    param.requires_grad = False
            count = 0
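            # the "replicate" padding covers two passes through encoder -> decoder -> postnet (source pass + cyclic pass);
            # outpad_lefts/outpad_rights[i] record how much of that padding is still present after stage i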
            pad_left = (model_encoder.pad_left + model_decoder.pad_left +
                        model_post.pad_left) * 2
            pad_right = (model_encoder.pad_right + model_decoder.pad_right +
                         model_post.pad_right) * 2
            outpad_lefts = [None] * 5
            outpad_rights = [None] * 5
            outpad_lefts[0] = pad_left - model_encoder.pad_left
            outpad_rights[0] = pad_right - model_encoder.pad_right
            outpad_lefts[1] = outpad_lefts[0] - model_decoder.pad_left
            outpad_rights[1] = outpad_rights[0] - model_decoder.pad_right
            outpad_lefts[2] = outpad_lefts[1] - model_post.pad_left
            outpad_rights[2] = outpad_rights[1] - model_post.pad_right
            outpad_lefts[3] = outpad_lefts[2] - model_encoder.pad_left
            outpad_rights[3] = outpad_rights[2] - model_encoder.pad_right
            outpad_lefts[4] = outpad_lefts[3] - model_decoder.pad_left
            outpad_rights[4] = outpad_rights[3] - model_decoder.pad_right
            logging.info(f'{pad_left} {pad_right}')
            logging.info(outpad_lefts)
            logging.info(outpad_rights)
            for feat_file in feat_list:
                # convert mcep
                logging.info("recmcep " + feat_file)

                feat_org = read_hdf5(feat_file, "/feat_mceplf0cap")
                logging.info(feat_org.shape)
                mcep = np.array(feat_org[:, -config.mcep_dim:])

                with torch.no_grad():
                    feat = F.pad(
                        torch.FloatTensor(feat_org).cuda().unsqueeze(
                            0).transpose(1, 2), (pad_left, pad_right),
                        "replicate").transpose(1, 2)
                    feat_excit = torch.FloatTensor(
                        feat_org[:, :config.excit_dim]).cuda().unsqueeze(0)

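                    # encode the padded features; spk_logits are per-frame speaker posteriors (logged below),
                    # lat_src is the latent trajectory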
                    spk_logits, _, lat_src, _ = model_encoder(feat,
                                                              sampling=False)
                    logging.info('input spkpost')
                    if outpad_rights[0] > 0:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits[:, outpad_lefts[0]:
                                                     -outpad_rights[0]],
                                          dim=-1), 1))
                    else:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits[:, outpad_lefts[0]:],
                                          dim=-1), 1))

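                    # decode the latent with this speaker's own index (spk_idx) into a reconstructed mcep,
                    # then refine it with the postnet conditioned on the first two excitation channels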
                    cvmcep_src, _ = model_decoder((torch.ones(
                        (1, lat_src.shape[1])) * spk_idx).cuda().long(),
                                                  lat_src)
                    cvmcep_src_post, _ = model_post(
                        cvmcep_src,
                        y=(torch.ones(
                            (1, cvmcep_src.shape[1])) * spk_idx).cuda().long(),
                        e=F.pad(feat_excit[:, :, :2].transpose(1, 2),
                                (outpad_lefts[1], outpad_rights[1]),
                                "replicate").transpose(1, 2))
                    #e=F.pad(feat_excit.transpose(1,2), (outpad_lefts[1],outpad_rights[1]), "replicate").transpose(1,2))
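                    # cyclic pass: re-encode the decoder output (before the postnet) together with the
                    # original excitation, then decode and apply the postnet again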
                    if model_post.pad_right > 0:
                        spk_logits, _, lat_rec, _ = model_encoder(torch.cat((F.pad(feat_excit.transpose(1,2), \
                                            (outpad_lefts[2],outpad_rights[2]), "replicate").transpose(1,2), cvmcep_src[:,model_post.pad_left:-model_post.pad_right]), 2),
                                                            sampling=False)
                    else:
                        spk_logits, _, lat_rec, _ = model_encoder(torch.cat((F.pad(feat_excit.transpose(1,2), \
                                            (outpad_lefts[2],outpad_rights[2]), "replicate").transpose(1,2), cvmcep_src[:,model_post.pad_left:]), 2),
                                                            sampling=False)
                    logging.info('rec spkpost')
                    if outpad_rights[3] > 0:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits[:, outpad_lefts[3]:
                                                     -outpad_rights[3]],
                                          dim=-1), 1))
                    else:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits[:, outpad_lefts[3]:],
                                          dim=-1), 1))

                    cvmcep_cyc, _ = model_decoder((torch.ones(
                        (1, lat_rec.shape[1])) * spk_idx).cuda().long(),
                                                  lat_rec)
                    cvmcep_cyc_post, _ = model_post(
                        cvmcep_cyc,
                        y=(torch.ones(
                            (1, cvmcep_cyc.shape[1])) * spk_idx).cuda().long(),
                        e=F.pad(feat_excit[:, :, :2].transpose(1, 2),
                                (outpad_lefts[4], outpad_rights[4]),
                                "replicate").transpose(1, 2))
                    #e=F.pad(feat_excit.transpose(1,2), (outpad_lefts[4],outpad_rights[4]), "replicate").transpose(1,2))

                    if outpad_rights[2] > 0:
                        feat_rec = torch.cat(
                            (feat_excit,
                             cvmcep_src_post[:,
                                             outpad_lefts[2]:-outpad_rights[2]]
                             ), 2)[0].cpu().data.numpy()
                    else:
                        feat_rec = torch.cat(
                            (feat_excit, cvmcep_src_post[:, outpad_lefts[2]:]),
                            2)[0].cpu().data.numpy()
                    feat_cyc = torch.cat((feat_excit, cvmcep_cyc_post),
                                         2)[0].cpu().data.numpy()

                    if outpad_rights[2] > 0:
                        cvmcep_src = np.array(
                            cvmcep_src_post[:,
                                            outpad_lefts[2]:-outpad_rights[2]]
                            [0].cpu().data.numpy(),
                            dtype=np.float64)
                    else:
                        cvmcep_src = np.array(cvmcep_src_post[:,
                                                              outpad_lefts[2]:]
                                              [0].cpu().data.numpy(),
                                              dtype=np.float64)
                    cvmcep_cyc = np.array(
                        cvmcep_cyc_post[0].cpu().data.numpy(),
                        dtype=np.float64)

                logging.info(cvmcep_src.shape)
                logging.info(cvmcep_cyc.shape)

                spcidx = read_hdf5(feat_file, "/spcidx_range")[0]

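                # MCD of the reconstruction against the input mcep on speech frames; the cyclic reconstruction follows below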
                _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(np.array(cvmcep_src[np.array(spcidx),:], \
                                            dtype=np.float64), np.array(mcep[np.array(spcidx),:], dtype=np.float64))
                _, _, _, mcd_arr = dtw.dtw_org_to_trg(np.array(cvmcep_src[np.array(spcidx),1:], \
                                            dtype=np.float64), np.array(mcep[np.array(spcidx),1:], dtype=np.float64))
                mcdpow_mean = np.mean(mcdpow_arr)
                mcdpow_std = np.std(mcdpow_arr)
                mcd_mean = np.mean(mcd_arr)
                mcd_std = np.std(mcd_arr)
                logging.info("mcdpow_rec: %.6f dB +- %.6f" %
                             (mcdpow_mean, mcdpow_std))
                logging.info("mcd_rec: %.6f dB +- %.6f" % (mcd_mean, mcd_std))

                _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(np.array(cvmcep_cyc[np.array(spcidx),:], \
                                            dtype=np.float64), np.array(mcep[np.array(spcidx),:], dtype=np.float64))
                _, _, _, mcd_arr = dtw.dtw_org_to_trg(np.array(cvmcep_cyc[np.array(spcidx),1:], \
                                            dtype=np.float64), np.array(mcep[np.array(spcidx),1:], dtype=np.float64))
                mcdpow_mean_cyc = np.mean(mcdpow_arr)
                mcdpow_std_cyc = np.std(mcdpow_arr)
                mcd_mean_cyc = np.mean(mcd_arr)
                mcd_std_cyc = np.std(mcd_arr)
                logging.info("mcdpow_cyc: %.6f dB +- %.6f" %
                             (mcdpow_mean_cyc, mcdpow_std_cyc))
                logging.info("mcd_cyc: %.6f dB +- %.6f" %
                             (mcd_mean_cyc, mcd_std_cyc))

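                # route the per-utterance statistics to the training ('tr') or development ('dv') lists based on the path tag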
                dataset = feat_file.split('/')[1].split('_')[0]
                if 'tr' in dataset:
                    logging.info('trn')
                    mcdpow_cvlist.append(mcdpow_mean)
                    mcdpowstd_cvlist.append(mcdpow_std)
                    mcd_cvlist.append(mcd_mean)
                    mcdstd_cvlist.append(mcd_std)
                    cvlist.append(np.var(cvmcep_src[:, 1:], axis=0))
                    logging.info(len(cvlist))
                    mcdpow_cvlist_cyc.append(mcdpow_mean_cyc)
                    mcdpowstd_cvlist_cyc.append(mcdpow_std_cyc)
                    mcd_cvlist_cyc.append(mcd_mean_cyc)
                    mcdstd_cvlist_cyc.append(mcd_std_cyc)
                    cvlist_cyc.append(np.var(cvmcep_cyc[:, 1:], axis=0))
                elif 'dv' in dataset:
                    logging.info('dev')
                    mcdpow_cvlist_dv.append(mcdpow_mean)
                    mcdpowstd_cvlist_dv.append(mcdpow_std)
                    mcd_cvlist_dv.append(mcd_mean)
                    mcdstd_cvlist_dv.append(mcd_std)
                    cvlist_dv.append(np.var(cvmcep_src[:, 1:], axis=0))
                    logging.info(len(cvlist_dv))
                    mcdpow_cvlist_cyc_dv.append(mcdpow_mean_cyc)
                    mcdpowstd_cvlist_cyc_dv.append(mcdpow_std_cyc)
                    mcd_cvlist_cyc_dv.append(mcd_mean_cyc)
                    mcdstd_cvlist_cyc_dv.append(mcd_std_cyc)
                    cvlist_cyc_dv.append(np.var(cvmcep_cyc[:, 1:], axis=0))

                logging.info('write rec to h5')
                outh5dir = os.path.join(
                    os.path.dirname(os.path.dirname(feat_file)),
                    args.spk + "-" + args.spk)
                if not os.path.exists(outh5dir):
                    os.makedirs(outh5dir)
                feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
                logging.info(feat_file + ' ' + args.string_path)
                logging.info(feat_rec.shape)
                write_hdf5(feat_file, args.string_path, feat_rec)

                logging.info('write cyc to h5')
                outh5dir = os.path.join(
                    os.path.dirname(os.path.dirname(feat_file)),
                    args.spk + "-" + args.spk + "-" + args.spk)
                if not os.path.exists(outh5dir):
                    os.makedirs(outh5dir)
                feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
                logging.info(feat_file + ' ' + args.string_path)
                logging.info(feat_cyc.shape)
                write_hdf5(feat_file, args.string_path, feat_cyc)

                count += 1
コード例 #6
    def gpu_decode(feat_list,
                   feat_trg_list,
                   gpu,
                   cvlist=None,
                   mcdlist=None,
                   mcdstdlist=None,
                   mcdpowlist=None,
                   mcdpowstdlist=None,
                   cvlist_src=None,
                   mcdlist_src=None,
                   mcdstdlist_src=None,
                   mcdpowlist_src=None,
                   mcdpowstdlist_src=None,
                   cvlist_trg=None,
                   mcdlist_trg=None,
                   mcdstdlist_trg=None,
                   mcdpowlist_trg=None,
                   mcdpowstdlist_trg=None,
                   lat_dist_rmse_enc_list=None,
                   lat_dist_cosim_enc_list=None,
                   lat_dist_rmse_pri_list=None,
                   lat_dist_cosim_pri_list=None):
        with torch.cuda.device(gpu):
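            # joint-feature statistics, used to express an all-zero frame in normalized space for the decoder inputs y_in_src / y_in_trg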
            mean_jnt = torch.FloatTensor(
                read_hdf5(args.stats_jnt,
                          "/mean_feat_org_lf0_jnt")[config.stdim:]).cuda()
            std_jnt = torch.FloatTensor(
                read_hdf5(args.stats_jnt,
                          "/scale_feat_org_lf0_jnt")[config.stdim:]).cuda()
            # define model and load parameters
            logging.info("model")
            logging.info(config)
            with torch.no_grad():
                model_encoder = GRU_RNN(in_dim=config.in_dim,
                                        out_dim=config.lat_dim * 2,
                                        hidden_layers=config.hidden_layers,
                                        hidden_units=config.hidden_units,
                                        kernel_size=config.kernel_size,
                                        dilation_size=config.dilation_size,
                                        scale_out_flag=False)
                model_decoder = GRU_RNN(in_dim=config.lat_dim + 2,
                                        out_dim=config.out_dim,
                                        hidden_layers=config.hidden_layers,
                                        hidden_units=config.hidden_units,
                                        kernel_size=config.kernel_size,
                                        dilation_size=config.dilation_size,
                                        scale_in_flag=False)
                model_encoder.load_state_dict(
                    torch.load(args.model)["model_encoder"])
                model_decoder.load_state_dict(
                    torch.load(args.model)["model_decoder"])
                model_encoder.cuda()
                model_decoder.cuda()
                model_encoder.eval()
                model_decoder.eval()
                for param in model_encoder.parameters():
                    param.requires_grad = False
                for param in model_decoder.parameters():
                    param.requires_grad = False
                logging.info(model_encoder)
                logging.info(model_decoder)
                init_pp = np.zeros((1, 1, config.lat_dim * 2))
                y_in_pp = torch.FloatTensor(init_pp).cuda()
                y_in_src = y_in_trg = torch.unsqueeze(
                    torch.unsqueeze((0 - mean_jnt) / std_jnt, 0), 0)
            for feat_file, feat_trg_file in zip(feat_list, feat_trg_list):
                # convert mcep
                logging.info("cvmcep " + feat_file + " " + feat_trg_file)

                feat = read_hdf5(feat_file, "/feat_org_lf0")
                feat_trg = read_hdf5(feat_trg_file, "/feat_org_lf0")
                logging.info(feat.shape)
                logging.info(feat_trg.shape)
                with torch.no_grad():
                    lat_src, _, _ = model_encoder(
                        torch.FloatTensor(feat).cuda(),
                        y_in_pp,
                        clamp_vae=True,
                        lat_dim=config.lat_dim)
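                    # draw args.n_smpl_dec samples from the encoder's latent distribution and average them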
                    lat_feat = sampling_vae_batch(lat_src.unsqueeze(0).repeat(
                        args.n_smpl_dec, 1, 1),
                                                  lat_dim=config.lat_dim)
                    lat_feat = torch.mean(lat_feat, 0)
                    lat_trg, _, _ = model_encoder(
                        torch.FloatTensor(feat_trg).cuda(),
                        y_in_pp,
                        clamp_vae=True,
                        lat_dim=config.lat_dim)
                    lat_feat_trg = sampling_vae_batch(
                        lat_trg.unsqueeze(0).repeat(args.n_smpl_dec, 1, 1),
                        lat_dim=config.lat_dim)
                    lat_feat_trg = torch.mean(lat_feat_trg, 0)
                    src_code = np.zeros((lat_feat.shape[0], 2))
                    trg_code = np.zeros((lat_feat.shape[0], 2))
                    trg_trg_code = np.zeros((lat_feat_trg.shape[0], 2))
                    src_code[:, 0] = 1
                    trg_code[:, 1] = 1
                    trg_trg_code[:, 1] = 1
                    src_code = torch.FloatTensor(src_code).cuda()
                    trg_code = torch.FloatTensor(trg_code).cuda()
                    trg_trg_code = torch.FloatTensor(trg_trg_code).cuda()
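                    # decode three trajectories: source-to-target conversion, source reconstruction, and target reconstruction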
                    cvmcep, _, _ = model_decoder(
                        torch.cat((trg_code, lat_feat), 1), y_in_trg)
                    cvmcep = np.array(cvmcep.cpu().data.numpy(),
                                      dtype=np.float64)
                    cvmcep_src, _, _ = model_decoder(
                        torch.cat((src_code, lat_feat), 1), y_in_src)
                    cvmcep_src = np.array(cvmcep_src.cpu().data.numpy(),
                                          dtype=np.float64)
                    cvmcep_trg, _, _ = model_decoder(
                        torch.cat((trg_trg_code, lat_feat_trg), 1), y_in_trg)
                    cvmcep_trg = np.array(cvmcep_trg.cpu().data.numpy(),
                                          dtype=np.float64)

                logging.info(cvmcep.shape)
                logging.info(cvmcep_trg.shape)
                cvlist.append(np.var(cvmcep[:, 1:], axis=0))
                cvlist_src.append(np.var(cvmcep_src[:, 1:], axis=0))
                cvlist_trg.append(np.var(cvmcep_trg[:, 1:], axis=0))
                logging.info(len(cvlist))

                spcidx_src = read_hdf5(feat_file, "/spcidx_range")[0]
                mcep_trg = read_hdf5(feat_trg_file, "/mcepspc_range")
                _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(
                    np.array(cvmcep[np.array(spcidx_src), :],
                             dtype=np.float64),
                    np.array(mcep_trg[:, :], dtype=np.float64))
                _, _, _, mcd_arr = dtw.dtw_org_to_trg(
                    np.array(cvmcep[np.array(spcidx_src), 1:],
                             dtype=np.float64),
                    np.array(mcep_trg[:, 1:], dtype=np.float64))
                mcdpow_mean = np.mean(mcdpow_arr)
                mcdpow_std = np.std(mcdpow_arr)
                mcd_mean = np.mean(mcd_arr)
                mcd_std = np.std(mcd_arr)
                logging.info("mcdpow: %.6f dB +- %.6f" %
                             (mcdpow_mean, mcdpow_std))
                logging.info("mcd: %.6f dB +- %.6f" % (mcd_mean, mcd_std))
                mcdpowlist.append(mcdpow_mean)
                mcdpowstdlist.append(mcdpow_std)
                mcdlist.append(mcd_mean)
                mcdstdlist.append(mcd_std)

                mcep_src = read_hdf5(feat_file, "/mcepspc_range")
                _, mcdpow_arr = dtw.calc_mcd(
                    np.array(mcep_src[:, :], dtype=np.float64),
                    np.array(cvmcep_src[np.array(spcidx_src), :],
                             dtype=np.float64))
                _, mcd_arr = dtw.calc_mcd(
                    np.array(mcep_src[:, 1:], dtype=np.float64),
                    np.array(cvmcep_src[np.array(spcidx_src), 1:],
                             dtype=np.float64))
                mcdpow_mean = np.mean(mcdpow_arr)
                mcdpow_std = np.std(mcdpow_arr)
                mcd_mean = np.mean(mcd_arr)
                mcd_std = np.std(mcd_arr)
                logging.info("mcdpow_src: %.6f dB +- %.6f" %
                             (mcdpow_mean, mcdpow_std))
                logging.info("mcd_src: %.6f dB +- %.6f" % (mcd_mean, mcd_std))
                mcdpowlist_src.append(mcdpow_mean)
                mcdpowstdlist_src.append(mcdpow_std)
                mcdlist_src.append(mcd_mean)
                mcdstdlist_src.append(mcd_std)

                spcidx_trg = read_hdf5(feat_trg_file, "/spcidx_range")[0]
                _, mcdpow_arr = dtw.calc_mcd(
                    np.array(mcep_trg[:, :], dtype=np.float64),
                    np.array(cvmcep_trg[np.array(spcidx_trg), :],
                             dtype=np.float64))
                _, mcd_arr = dtw.calc_mcd(
                    np.array(mcep_trg[:, 1:], dtype=np.float64),
                    np.array(cvmcep_trg[np.array(spcidx_trg), 1:],
                             dtype=np.float64))
                mcdpow_mean = np.mean(mcdpow_arr)
                mcdpow_std = np.std(mcdpow_arr)
                mcd_mean = np.mean(mcd_arr)
                mcd_std = np.std(mcd_arr)
                logging.info("mcdpow_trg: %.6f dB +- %.6f" %
                             (mcdpow_mean, mcdpow_std))
                logging.info("mcd_trg: %.6f dB +- %.6f" % (mcd_mean, mcd_std))
                mcdpowlist_trg.append(mcdpow_mean)
                mcdpowstdlist_trg.append(mcdpow_std)
                mcdlist_trg.append(mcd_mean)
                mcdstdlist_trg.append(mcd_std)

                with torch.no_grad():
                    spcidx_src = torch.LongTensor(spcidx_src).cuda()
                    spcidx_trg = torch.LongTensor(spcidx_trg).cuda()

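                # latent distance computed on the raw encoder outputs (logged as lat_dist_enc)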
                trj_lat_src = np.array(torch.index_select(
                    lat_src, 0, spcidx_src).cpu().data.numpy(),
                                       dtype=np.float64)
                trj_lat_trg = np.array(torch.index_select(
                    lat_trg, 0, spcidx_trg).cpu().data.numpy(),
                                       dtype=np.float64)
                aligned_lat_srctrg, _, _, _ = dtw.dtw_org_to_trg(
                    trj_lat_src, trj_lat_trg)
                lat_dist_srctrg = np.mean(
                    np.sqrt(
                        np.mean((aligned_lat_srctrg - trj_lat_trg)**2,
                                axis=0)))
                _, _, lat_cdist_srctrg, _ = dtw.dtw_org_to_trg(trj_lat_trg,
                                                               trj_lat_src,
                                                               mcd=0)
                aligned_lat_trgsrc, _, _, _ = dtw.dtw_org_to_trg(
                    trj_lat_trg, trj_lat_src)
                lat_dist_trgsrc = np.mean(
                    np.sqrt(
                        np.mean((aligned_lat_trgsrc - trj_lat_src)**2,
                                axis=0)))
                _, _, lat_cdist_trgsrc, _ = dtw.dtw_org_to_trg(trj_lat_src,
                                                               trj_lat_trg,
                                                               mcd=0)
                logging.info("%lf %lf %lf %lf" %
                             (lat_dist_srctrg, lat_cdist_srctrg,
                              lat_dist_trgsrc, lat_cdist_trgsrc))
                lat_dist_rmse = (lat_dist_srctrg + lat_dist_trgsrc) / 2
                lat_dist_cosim = (lat_cdist_srctrg + lat_cdist_trgsrc) / 2
                lat_dist_rmse_enc_list.append(lat_dist_rmse)
                lat_dist_cosim_enc_list.append(lat_dist_cosim)
                logging.info("lat_dist_enc: %.6f %.6f" %
                             (lat_dist_rmse, lat_dist_cosim))

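                # latent distance computed on the averaged sampled latents (logged as lat_dist_pri)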
                trj_lat_src = np.array(torch.index_select(
                    lat_feat, 0, spcidx_src).cpu().data.numpy(),
                                       dtype=np.float64)
                trj_lat_trg = np.array(torch.index_select(
                    lat_feat_trg, 0, spcidx_trg).cpu().data.numpy(),
                                       dtype=np.float64)
                aligned_lat_srctrg, _, _, _ = dtw.dtw_org_to_trg(
                    trj_lat_src, trj_lat_trg)
                lat_dist_srctrg = np.mean(
                    np.sqrt(
                        np.mean((aligned_lat_srctrg - trj_lat_trg)**2,
                                axis=0)))
                _, _, lat_cdist_srctrg, _ = dtw.dtw_org_to_trg(trj_lat_trg,
                                                               trj_lat_src,
                                                               mcd=0)
                aligned_lat_trgsrc, _, _, _ = dtw.dtw_org_to_trg(
                    trj_lat_trg, trj_lat_src)
                lat_dist_trgsrc = np.mean(
                    np.sqrt(
                        np.mean((aligned_lat_trgsrc - trj_lat_src)**2,
                                axis=0)))
                _, _, lat_cdist_trgsrc, _ = dtw.dtw_org_to_trg(trj_lat_src,
                                                               trj_lat_trg,
                                                               mcd=0)
                logging.info("%lf %lf %lf %lf" %
                             (lat_dist_srctrg, lat_cdist_srctrg,
                              lat_dist_trgsrc, lat_cdist_trgsrc))
                lat_dist_rmse = (lat_dist_srctrg + lat_dist_trgsrc) / 2
                lat_dist_cosim = (lat_cdist_srctrg + lat_cdist_trgsrc) / 2
                lat_dist_rmse_pri_list.append(lat_dist_rmse)
                lat_dist_cosim_pri_list.append(lat_dist_cosim)
                logging.info("lat_dist_pri: %.6f %.6f" %
                             (lat_dist_rmse, lat_dist_cosim))
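
Note: the lat_dist_rmse / lat_dist_cosim values above are averaged over both DTW directions (source-to-target and target-to-source). Below is a minimal sketch of the RMSE half of that computation, assuming the aligned trajectories have already been produced by dtw.dtw_org_to_trg; symmetric_latent_rmse is a hypothetical helper name used only for illustration.

import numpy as np

def symmetric_latent_rmse(aligned_srctrg, trj_trg, aligned_trgsrc, trj_src):
    # per-dimension RMSE over frames, averaged across dimensions, then over the two alignment directions
    d_srctrg = np.mean(np.sqrt(np.mean((aligned_srctrg - trj_trg) ** 2, axis=0)))
    d_trgsrc = np.mean(np.sqrt(np.mean((aligned_trgsrc - trj_src) ** 2, axis=0)))
    return (d_srctrg + d_trgsrc) / 2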
コード例 #7
    def decode_RNN(feat_list,
                   gpu,
                   cvlist=None,
                   lsd_cvlist_src=None,
                   lsdstd_cvlist_src=None,
                   lsd_cvlist_cyc=None,
                   lsdstd_cvlist_cyc=None,
                   lsd_cvlist=None,
                   lsdstd_cvlist=None,
                   lat_dist_rmse_list=None,
                   lat_dist_cosim_list=None):
        with torch.cuda.device(gpu):
            # define model and load parameters
            with torch.no_grad():
                model_encoder_melsp = GRU_VAE_ENCODER(
                    in_dim=config.mel_dim,
                    n_spk=n_spk,
                    lat_dim=config.lat_dim,
                    hidden_layers=config.hidden_layers_enc,
                    hidden_units=config.hidden_units_enc,
                    kernel_size=config.kernel_size_enc,
                    dilation_size=config.dilation_size_enc,
                    causal_conv=config.causal_conv_enc,
                    pad_first=True,
                    right_size=config.right_size_enc)
                logging.info(model_encoder_melsp)
                model_decoder_melsp = GRU_SPEC_DECODER(
                    feat_dim=config.lat_dim + config.lat_dim_e,
                    out_dim=config.mel_dim,
                    n_spk=(config.emb_spk_dim // config.n_weight_emb) *
                    config.n_weight_emb,
                    hidden_layers=config.hidden_layers_dec,
                    hidden_units=config.hidden_units_dec,
                    kernel_size=config.kernel_size_dec,
                    dilation_size=config.dilation_size_dec,
                    causal_conv=config.causal_conv_dec,
                    pad_first=True,
                    right_size=config.right_size_dec,
                    red_dim_upd=config.mel_dim,
                    pdf_gauss=True)
                logging.info(model_decoder_melsp)
                model_encoder_excit = GRU_VAE_ENCODER(
                    in_dim=config.mel_dim,
                    n_spk=n_spk,
                    lat_dim=config.lat_dim_e,
                    hidden_layers=config.hidden_layers_enc,
                    hidden_units=config.hidden_units_enc,
                    kernel_size=config.kernel_size_enc,
                    dilation_size=config.dilation_size_enc,
                    causal_conv=config.causal_conv_enc,
                    pad_first=True,
                    right_size=config.right_size_enc)
                logging.info(model_encoder_excit)
                model_spkidtr = SPKID_TRANSFORM_LAYER(
                    n_spk=n_spk,
                    emb_dim=config.emb_spk_dim,
                    n_weight_emb=config.n_weight_emb,
                    conv_emb_flag=True,
                    spkidtr_dim=config.spkidtr_dim)
                logging.info(model_spkidtr)
                model_encoder_melsp.load_state_dict(
                    torch.load(args.model)["model_encoder_melsp"])
                model_decoder_melsp.load_state_dict(
                    torch.load(args.model)["model_decoder_melsp"])
                model_encoder_excit.load_state_dict(
                    torch.load(args.model)["model_encoder_excit"])
                model_spkidtr.load_state_dict(
                    torch.load(args.model)["model_spkidtr"])
                model_encoder_melsp.cuda()
                model_decoder_melsp.cuda()
                model_encoder_excit.cuda()
                model_spkidtr.cuda()
                model_encoder_melsp.eval()
                model_decoder_melsp.eval()
                model_encoder_excit.eval()
                model_spkidtr.eval()
                model_encoder_melsp.remove_weight_norm()
                model_decoder_melsp.remove_weight_norm()
                model_encoder_excit.remove_weight_norm()
                model_spkidtr.remove_weight_norm()
                for param in model_encoder_melsp.parameters():
                    param.requires_grad = False
                for param in model_decoder_melsp.parameters():
                    param.requires_grad = False
                for param in model_encoder_excit.parameters():
                    param.requires_grad = False
                for param in model_spkidtr.parameters():
                    param.requires_grad = False
            count = 0
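            # the "replicate" padding covers both the source pass and the cyclic pass through encoder -> decoder;
            # outpad_lefts/outpad_rights[i] record how much of that padding remains after stage i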
            pad_left = (model_encoder_melsp.pad_left +
                        model_decoder_melsp.pad_left) * 2
            pad_right = (model_encoder_melsp.pad_right +
                         model_decoder_melsp.pad_right) * 2
            outpad_lefts = [None] * 3
            outpad_rights = [None] * 3
            outpad_lefts[0] = pad_left - model_encoder_melsp.pad_left
            outpad_rights[0] = pad_right - model_encoder_melsp.pad_right
            outpad_lefts[1] = outpad_lefts[0] - model_decoder_melsp.pad_left
            outpad_rights[1] = outpad_rights[0] - model_decoder_melsp.pad_right
            outpad_lefts[2] = outpad_lefts[1] - model_encoder_melsp.pad_left
            outpad_rights[2] = outpad_rights[1] - model_encoder_melsp.pad_right
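            # pseudo-inverse mel filterbank: maps mel spectra back to linear magnitude spectra for Griffin-Lim synthesis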
            melfb_t = np.linalg.pinv(
                librosa.filters.mel(args.fs, args.fftl, n_mels=config.mel_dim))
            temp = 0.675
            logging.info(f'temp: {temp}')
            for feat_file in feat_list:
                # convert melsp
                spk_src = os.path.basename(os.path.dirname(feat_file))
                logging.info('%s --> %s' % (spk_src, args.spk_trg))

                file_trg = os.path.join(
                    os.path.dirname(os.path.dirname(feat_file)), args.spk_trg,
                    os.path.basename(feat_file))
                trg_exist = False
                if os.path.exists(file_trg):
                    logging.info('exist: %s' % (file_trg))
                    feat_trg = read_hdf5(file_trg, "/log_1pmelmagsp")
                    logging.info(feat_trg.shape)
                    trg_exist = True

                feat_org = read_hdf5(feat_file, "/log_1pmelmagsp")
                logging.info(feat_org.shape)

                logging.info("generate")
                with torch.no_grad():
                    feat = F.pad(
                        torch.FloatTensor(feat_org).cuda().unsqueeze(
                            0).transpose(1, 2), (pad_left, pad_right),
                        "replicate").transpose(1, 2)

                    spk_logits, _, lat_src, _ = model_encoder_melsp(
                        feat, sampling=False)
                    spk_logits_e, _, lat_src_e, _ = model_encoder_excit(
                        feat, sampling=False)
                    logging.info('input spkpost')
                    if outpad_rights[0] > 0:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits[:, outpad_lefts[0]:
                                                     -outpad_rights[0]],
                                          dim=-1), 1))
                    else:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits[:, outpad_lefts[0]:],
                                          dim=-1), 1))
                    logging.info('input spkpost_e')
                    if outpad_rights[0] > 0:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits_e[:, outpad_lefts[0]:
                                                       -outpad_rights[0]],
                                          dim=-1), 1))
                    else:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits_e[:, outpad_lefts[0]:],
                                          dim=-1), 1))

                    if trg_exist:
                        spk_trg_logits, _, lat_trg, _ = model_encoder_melsp(
                            F.pad(
                                torch.FloatTensor(feat_trg).cuda().unsqueeze(
                                    0).transpose(1, 2),
                                (model_encoder_melsp.pad_left,
                                 model_encoder_melsp.pad_right),
                                "replicate").transpose(1, 2),
                            sampling=False)
                        spk_trg_logits_e, _, lat_trg_e, _ = model_encoder_excit(
                            F.pad(
                                torch.FloatTensor(feat_trg).cuda().unsqueeze(
                                    0).transpose(1, 2),
                                (model_encoder_excit.pad_left,
                                 model_encoder_excit.pad_right),
                                "replicate").transpose(1, 2),
                            sampling=False)
                        logging.info('target spkpost')
                        logging.info(
                            torch.mean(F.softmax(spk_trg_logits, dim=-1), 1))
                        logging.info('target spkpost_e')
                        logging.info(
                            torch.mean(F.softmax(spk_trg_logits_e, dim=-1), 1))

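                    # map speaker indices to embedding codes, fuse the excitation and spectral latents,
                    # and decode both the source reconstruction (src_code) and the conversion (trg_code)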
                    _, src_code = model_spkidtr((torch.ones(
                        (1, lat_src_e.shape[1])) * src_idx).cuda().long())
                    _, trg_code = model_spkidtr((torch.ones(
                        (1, lat_src_e.shape[1])) * trg_idx).cuda().long())
                    lat_cat = torch.cat((lat_src_e, lat_src), 2)

                    _, cvmelsp_src, _ = model_decoder_melsp(lat_cat,
                                                            y=src_code,
                                                            temp=temp)
                    _, cvmelsp, _ = model_decoder_melsp(lat_cat,
                                                        y=trg_code,
                                                        temp=temp)
                    trj_lat_cat = lat_cat_src = lat_cat

                    spk_logits, _, lat_cv, _ = model_encoder_melsp(
                        cvmelsp, sampling=False)
                    spk_logits_e, _, lat_cv_e, _ = model_encoder_excit(
                        cvmelsp, sampling=False)
                    logging.info('cv spkpost')
                    if outpad_rights[2] > 0:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits[:, outpad_lefts[2]:
                                                     -outpad_rights[2]],
                                          dim=-1), 1))
                    else:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits[:, outpad_lefts[2]:],
                                          dim=-1), 1))
                    logging.info('cv spkpost_e')
                    if outpad_rights[2] > 0:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits_e[:, outpad_lefts[2]:
                                                       -outpad_rights[2]],
                                          dim=-1), 1))
                    else:
                        logging.info(
                            torch.mean(
                                F.softmax(spk_logits_e[:, outpad_lefts[2]:],
                                          dim=-1), 1))

                    _, src_code = model_spkidtr((torch.ones(
                        (1, lat_cv_e.shape[1])) * src_idx).cuda().long())
                    lat_cat = torch.cat((lat_cv_e, lat_cv), 2)

                    _, cvmelsp_cyc, _ = model_decoder_melsp(lat_cat,
                                                            y=src_code,
                                                            temp=temp)

                    #if outpad_rights[0] > 0:
                    #    trj_lat_cat = trj_lat_cat[:,outpad_lefts[0]:-outpad_rights[0]]
                    #else:
                    #    trj_lat_cat = trj_lat_cat[:,outpad_lefts[0]:]
                    if outpad_rights[1] > 0:
                        cvmelsp_src = cvmelsp_src[:, outpad_lefts[1]:
                                                  -outpad_rights[1]]
                        cvmelsp = cvmelsp[:, outpad_lefts[1]:-outpad_rights[1]]
                    else:
                        cvmelsp_src = cvmelsp_src[:, outpad_lefts[1]:]
                        cvmelsp = cvmelsp[:, outpad_lefts[1]:]

                    feat_cv = cvmelsp[0].cpu().data.numpy()
                    #feat_lat = trj_lat_cat[0].cpu().data.numpy()

                    cvmelsp_src = np.array(cvmelsp_src[0].cpu().data.numpy(),
                                           dtype=np.float64)
                    cvmelsp = np.array(cvmelsp[0].cpu().data.numpy(),
                                       dtype=np.float64)
                    cvmelsp_cyc = np.array(cvmelsp_cyc[0].cpu().data.numpy(),
                                           dtype=np.float64)

                    if trg_exist:
                        if outpad_rights[0] > 0:  # guard matches the pad index used in the slice below
                            lat_src = lat_cat_src[:, outpad_lefts[0]:
                                                  -outpad_rights[0]]
                        else:
                            lat_src = lat_cat_src[:, outpad_lefts[0]:]
                        lat_trg = torch.cat((lat_trg_e, lat_trg), 2)

                logging.info(cvmelsp_src.shape)
                logging.info(cvmelsp.shape)
                logging.info(cvmelsp_cyc.shape)

                melsp = np.array(feat_org)

                if trg_exist:
                    logging.info(lat_src.shape)
                    logging.info(lat_trg.shape)
                    melsp_trg = np.array(feat_trg)

                spcidx = np.array(read_hdf5(feat_file, "/spcidx_range")[0])

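                # undo the log(1 + 10000 * x) compression of the stored mel spectra, recovering linear-domain magnitudes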
                melsp_rest = (np.exp(melsp) - 1) / 10000
                melsp_cv_rest = (np.exp(cvmelsp) - 1) / 10000
                melsp_src_rest = (np.exp(cvmelsp_src) - 1) / 10000
                melsp_cyc_rest = (np.exp(cvmelsp_cyc) - 1) / 10000

                cvlist.append(np.var(melsp_cv_rest, axis=0))

                lsd_arr = np.sqrt(np.mean((20*(np.log10(np.clip(melsp_src_rest[spcidx], a_min=1e-16, a_max=None))\
                                                         -np.log10(np.clip(melsp_rest[spcidx], a_min=1e-16, a_max=None))))**2, axis=-1))
                lsd_mean = np.mean(lsd_arr)
                lsd_std = np.std(lsd_arr)
                logging.info("lsd_src_cv: %.6f dB +- %.6f" %
                             (lsd_mean, lsd_std))
                lsd_cvlist_src.append(lsd_mean)
                lsdstd_cvlist_src.append(lsd_std)

                if trg_exist:
                    melsp_trg_rest = (np.exp(melsp_trg) - 1) / 10000

                    spcidx_trg = np.array(
                        read_hdf5(file_trg, "/spcidx_range")[0])

                    _, twf_melsp, _, _ = dtw.dtw_org_to_trg(np.array(melsp_cv_rest[spcidx], \
                                                dtype=np.float64), np.array(melsp_trg_rest[spcidx_trg], dtype=np.float64), mcd=-1)
                    twf_melsp = np.array(twf_melsp[:, 0])
                    lsd_arr = np.sqrt(np.mean((20*(np.log10(np.clip(melsp_cv_rest[twf_melsp], a_min=1e-16, a_max=None))\
                                                             -np.log10(np.clip(melsp_rest[twf_melsp], a_min=1e-16, a_max=None))))**2, axis=-1))
                    lsd_mean = np.mean(lsd_arr)
                    lsd_std = np.std(lsd_arr)
                    logging.info("lsd_trg: %.6f dB +- %.6f" %
                                 (lsd_mean, lsd_std))
                    lsd_cvlist.append(lsd_mean)
                    lsdstd_cvlist.append(lsd_std)

                    spcidx_src = torch.LongTensor(spcidx).cuda()
                    spcidx_trg = torch.LongTensor(spcidx_trg).cuda()

                    trj_lat_src = np.array(torch.index_select(
                        lat_src[0], 0, spcidx_src).cpu().data.numpy(),
                                           dtype=np.float64)
                    trj_lat_trg = np.array(torch.index_select(
                        lat_trg[0], 0, spcidx_trg).cpu().data.numpy(),
                                           dtype=np.float64)
                    aligned_lat_srctrg, _, _, _ = dtw.dtw_org_to_trg(
                        trj_lat_src, trj_lat_trg)
                    lat_dist_srctrg = np.mean(
                        np.sqrt(
                            np.mean((aligned_lat_srctrg - trj_lat_trg)**2,
                                    axis=0)))
                    _, _, lat_cdist_srctrg, _ = dtw.dtw_org_to_trg(trj_lat_trg,
                                                                   trj_lat_src,
                                                                   mcd=0)
                    aligned_lat_trgsrc, _, _, _ = dtw.dtw_org_to_trg(
                        trj_lat_trg, trj_lat_src)
                    lat_dist_trgsrc = np.mean(
                        np.sqrt(
                            np.mean((aligned_lat_trgsrc - trj_lat_src)**2,
                                    axis=0)))
                    _, _, lat_cdist_trgsrc, _ = dtw.dtw_org_to_trg(trj_lat_src,
                                                                   trj_lat_trg,
                                                                   mcd=0)
                    logging.info("%lf %lf %lf %lf" %
                                 (lat_dist_srctrg, lat_cdist_srctrg,
                                  lat_dist_trgsrc, lat_cdist_trgsrc))
                    lat_dist_rmse = (lat_dist_srctrg + lat_dist_trgsrc) / 2
                    lat_dist_cosim = (lat_cdist_srctrg + lat_cdist_trgsrc) / 2
                    lat_dist_rmse_list.append(lat_dist_rmse)
                    lat_dist_cosim_list.append(lat_dist_cosim)
                    logging.info("lat_dist: %.6f %.6f" %
                                 (lat_dist_rmse, lat_dist_cosim))

                lsd_arr = np.sqrt(np.mean((20*(np.log10(np.clip(melsp_cyc_rest[spcidx], a_min=1e-16, a_max=None))\
                                                         -np.log10(np.clip(melsp_rest[spcidx], a_min=1e-16, a_max=None))))**2, axis=-1))
                lsd_mean_cyc = np.mean(lsd_arr)
                lsd_std_cyc = np.std(lsd_arr)
                logging.info("lsd_cyc: %.6f dB +- %.6f" %
                             (lsd_mean_cyc, lsd_std_cyc))
                lsd_cvlist_cyc.append(lsd_mean_cyc)
                lsdstd_cvlist_cyc.append(lsd_std_cyc)

                logging.info("synth anasyn")
                magsp = np.matmul(melfb_t, melsp_rest.T)
                logging.info(magsp.shape)
                hop_length = int((args.fs / 1000) * args.shiftms)
                win_length = int((args.fs / 1000) * args.winms)
                wav = np.clip(
                    librosa.core.griffinlim(magsp,
                                            hop_length=hop_length,
                                            win_length=win_length,
                                            window='hann'), -1,
                    0.999969482421875)
                wavpath = os.path.join(
                    args.outdir,
                    os.path.basename(feat_file).replace(".h5", "_anasyn.wav"))
                logging.info(wavpath)
                sf.write(wavpath, wav, args.fs, 'PCM_16')

                #if trg_exist:
                #    logging.info("synth anasyn_trg")
                #    wav = np.clip(pw.synthesize(f0_trg, sp_trg, ap_trg, fs, frame_period=args.shiftms), -1, 1)
                #    wavpath = os.path.join(args.outdir,os.path.basename(feat_file).replace(".h5","_anasyn_trg.wav"))
                #    sf.write(wavpath, wav, fs, 'PCM_16')
                #    logging.info(wavpath)

                logging.info("synth gf rec")
                recmagsp = np.matmul(melfb_t, melsp_src_rest.T)
                logging.info(recmagsp.shape)
                wav = np.clip(
                    librosa.core.griffinlim(recmagsp,
                                            hop_length=hop_length,
                                            win_length=win_length,
                                            window='hann'), -1,
                    0.999969482421875)
                wavpath = os.path.join(
                    args.outdir,
                    os.path.basename(feat_file).replace(".h5", "_rec.wav"))
                logging.info(wavpath)
                sf.write(wavpath, wav, args.fs, 'PCM_16')

                logging.info("synth gf cv")
                cvmagsp = np.matmul(melfb_t, melsp_cv_rest.T)
                logging.info(cvmagsp.shape)
                wav = np.clip(
                    librosa.core.griffinlim(cvmagsp,
                                            hop_length=hop_length,
                                            win_length=win_length,
                                            window='hann'), -1,
                    0.999969482421875)
                wavpath = os.path.join(
                    args.outdir,
                    os.path.basename(feat_file).replace(".h5", "_cv.wav"))
                logging.info(wavpath)
                sf.write(wavpath, wav, args.fs, 'PCM_16')

                logging.info('write to h5')
                outh5dir = os.path.join(
                    os.path.dirname(os.path.dirname(feat_file)),
                    spk_src + "-" + args.spk_trg)
                if not os.path.exists(outh5dir):
                    os.makedirs(outh5dir)
                feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
                # cv
                write_path = args.string_path
                logging.info(feat_file + ' ' + write_path)
                logging.info(feat_cv.shape)
                write_hdf5(feat_file, write_path, feat_cv)

                #logging.info('write lat to h5')
                #logging.info(feat_file + ' ' + args.string_path+'_lat')
                #logging.info(feat_lat.shape)
                #write_hdf5(feat_file, args.string_path+'_lat', feat_lat)

                count += 1