Exemple #1
0
def gen_waveform(y_predicted,
                 Y_mean,
                 Y_std,
                 post_filter=False,
                 coef=1.4,
                 fs=16000,
                 mge_training=True):
    """Synthesize a waveform from predicted acoustic features with WORLD.

    Args:
        y_predicted: Predicted acoustic features, forwarded to
            ``gen_parameters``.
        Y_mean: Forwarded to ``gen_parameters`` (presumably the feature mean
            used for de-normalization -- confirm against that helper).
        Y_std: Forwarded to ``gen_parameters`` (presumably the feature
            standard deviation -- confirm against that helper).
        post_filter (bool): Apply ``merlin_post_filter`` to the mgc stream.
        coef (float): Coefficient forwarded to ``merlin_post_filter``.
        fs (int): Sampling rate in Hz.
        mge_training (bool): Forwarded to ``gen_parameters``.

    Returns:
        tuple: ``(generated_waveform, mgc, lf0, vuv, bap)``. The waveform is
        peak-normalized to the int16 range (+/-32767) but is not cast to an
        integer dtype. A silent or empty waveform is returned unscaled.
    """
    alpha = pysptk.util.mcepalpha(fs)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    frame_period = hp_acoustic.frame_period

    # Generate parameters and split streams
    mgc, lf0, vuv, bap = gen_parameters(y_predicted, Y_mean, Y_std,
                                        mge_training)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha, coef=coef)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), fs,
                                               fftlen)

    # Zero out unvoiced frames, then move the remaining log-F0 values to Hz.
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    voiced = np.nonzero(f0)
    f0[voiced] = np.exp(f0[voiced])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            fs, frame_period)

    # Scale to the int16 range. Skip the division for a silent (all-zero) or
    # empty signal, which would otherwise produce NaNs or raise.
    peak = np.max(np.abs(generated_waveform)) if generated_waveform.size else 0
    if peak > 0:
        generated_waveform = generated_waveform / peak * 32767

    # Return the features as well, to compare natural/generated later.
    return generated_waveform, mgc, lf0, vuv, bap
Exemple #2
0
def world2wav(feature, frame_period):
    """Synthesize a waveform from a stacked WORLD feature matrix.

    The columns of ``feature`` are read as consecutive streams in the order
    mgc, lf0, vuv, bap, with widths taken from the module-level ``hp``
    (``num_mgc``, ``num_lf0``, ``num_vuv``, ``num_bap``).

    Args:
        feature (ndarray): 2-D array of frames by stacked stream columns.
            It is not modified.
        frame_period: Frame period forwarded to ``pyworld.synthesize``.

    Returns:
        The value returned by ``pyworld.synthesize``.
    """
    hparams = hp
    mgc_idx = 0
    lf0_idx = mgc_idx + hparams.num_mgc
    vuv_idx = lf0_idx + hparams.num_lf0
    bap_idx = vuv_idx + hparams.num_vuv

    mgc = feature[:, mgc_idx:mgc_idx + hparams.num_mgc]
    lf0 = feature[:, lf0_idx:lf0_idx + hparams.num_lf0]
    vuv = feature[:, vuv_idx:vuv_idx + hparams.num_vuv]
    # Copy: the slice is a view, and the unvoiced frames are overwritten
    # below. Without the copy the caller's ``feature`` would be altered.
    bap = feature[:, bap_idx:bap_idx + hparams.num_bap].copy()

    fs = hparams.sample_rate
    alpha = pysptk.util.mcepalpha(fs)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)

    # Reset the coded aperiodicity of unvoiced frames to zero.
    unvoiced = (vuv < 0.5).flatten()
    bap[unvoiced] = 0.0

    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), fs,
                                               fftlen)

    # Zero out unvoiced frames, then move the remaining log-F0 values to Hz.
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    voiced = np.nonzero(f0)
    f0[voiced] = np.exp(f0[voiced])

    return pyworld.synthesize(f0.flatten().astype(np.float64),
                              spectrogram.astype(np.float64),
                              aperiodicity.astype(np.float64), fs,
                              frame_period)
def vizualize_hardcoded(x, mgc, lf0, f0, vuv, fs, timeaxis):
    """Plot waveform, log-spectrum, F0, V/UV flag and aperiodicity.

    Draws five stacked subplots and calls ``plt.show()``. Besides its
    arguments, the function reads ``alpha``, ``fftlen``, ``hop_length`` and
    ``bap`` from the enclosing (module) scope. ``lf0`` and ``timeaxis`` are
    accepted but not used.

    Args:
        x: Waveform samples, plotted in the first subplot.
        mgc: Mel-cepstrum array; only its first 60 columns are used.
        lf0: Unused.
        f0: F0 contour plotted in the third subplot.
        vuv: Voiced/unvoiced flags plotted in the fourth subplot.
        fs: Sampling rate, passed to ``specshow`` and the aperiodicity decoder.
        timeaxis: Unused.
    """
    plt.subplot(5, 1, 1)
    plt.plot(x, label="Wav")
    plt.xlim(0, len(x))
    # Spec
    plt.subplot(5, 1, 2)
    sp = pysptk.mc2sp(mgc[:, :60], alpha=alpha, fftlen=fftlen)
    logsp = np.log(sp)
    librosa.display.specshow(logsp.T,
                             sr=fs,
                             hop_length=hop_length,
                             x_axis="time",
                             y_axis="linear")
    # F0 and V/UV
    plt.subplot(5, 1, 3)
    plt.plot(f0, linewidth=2, label="Continuous log-f0")
    plt.xlim(0, len(f0))
    plt.subplot(5, 1, 4)
    plt.plot(vuv, linewidth=2, label="Voiced/unvoiced flag")
    plt.xlim(0, len(vuv))
    plt.legend(prop={"size": 14}, loc="upper right")
    # Aperiodicity. The sliced array must not be bound to the name ``bap``:
    # assigning to ``bap`` inside this function makes it a local, so reading
    # the outer ``bap`` on the same line raises UnboundLocalError.
    plt.subplot(5, 1, 5)
    coded_ap = np.ascontiguousarray(bap[:, :2]).astype(np.float64)
    aperiodicity = pyworld.decode_aperiodicity(coded_ap, fs, fftlen)
    librosa.display.specshow(aperiodicity.T,
                             sr=fs,
                             hop_length=hop_length,
                             x_axis="time",
                             y_axis="linear")
    plt.show()
Exemple #4
0
def synthesis():
    """Resynthesize ``p225_001`` from its lf0/bap/mgc files with WORLD.

    Reads the three hard-coded feature files through ``read_binfile``,
    synthesizes at 48 kHz with a 5 ms frame period and an FFT length of 4096,
    and writes the result to ``resynthesis.wav`` as 16-bit PCM.
    """
    lf0_file = "p225_001.lf0"
    bap_file_name = "p225_001.bap"
    mgc_file_name = "p225_001.mgc"
    fl = 4096
    sr = 48000

    # Frames equal to -1e10 are mapped to F0 = 0; all others are exp(lf0).
    lf0 = read_binfile(lf0_file, dim=1, dtype=np.float32)
    unvoiced = lf0 == -1E+10
    f0 = lf0.copy()
    f0[unvoiced] = 0
    f0[~unvoiced] = np.exp(lf0[~unvoiced])
    f0 = f0.astype(np.float64)

    bap_dim = 5
    bap = read_binfile(bap_file_name, dim=bap_dim, dtype=np.float32)
    ap = pyworld.decode_aperiodicity(
        bap.astype(np.float64).reshape(-1, bap_dim), sr, fl)

    mc = read_binfile(mgc_file_name, dim=60, dtype=np.float32)
    alpha = pysptk.util.mcepalpha(sr)
    sp = pysptk.mc2sp(mc.astype(np.float64), fftlen=fl, alpha=alpha)

    wav = pyworld.synthesize(f0, sp, ap, sr, 5)

    # Clip before the int16 cast: a sample at or above 1.0 scales to 32768 or
    # more, which does not fit in int16 and would wrap to a negative value.
    x2 = np.clip(wav * 32768, -32768, 32767).astype(np.int16)
    scipy.io.wavfile.write("resynthesis.wav", sr, x2)
Exemple #5
0
def gen_world_params(mgc, lf0, vuv, bap, sample_rate, vuv_threshold=0.3):
    """Convert mgc/lf0/vuv/bap streams into WORLD synthesis inputs.

    Args:
        mgc (ndarray): mel-generalized cepstrum stream
        lf0 (ndarray): log-F0 stream
        vuv (ndarray): voiced/unvoiced stream
        bap (ndarray): coded (band) aperiodicity stream
        sample_rate (int): sampling rate
        vuv_threshold (float): frames whose vuv is below this are unvoiced

    Returns:
        tuple: ``(f0, spectrogram, aperiodicity)``, all float64; ``f0`` is
        flattened to one dimension.
    """
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    envelope = pysptk.mc2sp(np.ascontiguousarray(mgc), fftlen=fftlen, alpha=alpha)

    coded_ap = np.ascontiguousarray(bap).astype(np.float64)
    ap = pyworld.decode_aperiodicity(coded_ap, sample_rate, fftlen)

    # Unvoiced frames get an aperiodicity of one in every bin.
    unvoiced_frames = vuv.reshape(-1) < vuv_threshold
    ap[unvoiced_frames, :] = 1.0
    # WORLD fails catastrophically for out-of-range aperiodicity.
    ap = np.clip(ap, 0.0, 1.0)

    # exp() is applied to non-zero entries first; unvoiced frames are then
    # forced to zero.
    hz = lf0.copy()
    nonzero = np.nonzero(hz)
    hz[nonzero] = np.exp(hz[nonzero])
    hz[vuv < vuv_threshold] = 0

    return (
        hz.flatten().astype(np.float64),
        envelope.astype(np.float64),
        ap.astype(np.float64),
    )
    def gen_waveform(self, feature):
        """Synthesize a waveform from one stacked feature matrix with WORLD.

        Columns of ``feature`` are read as: ``mcep_order + 1`` mgc columns,
        one lf0 column, one vuv column, and the remainder as bap. Settings
        come from ``self.config`` (``mcep_order``, ``fft_size``,
        ``sampling_rate``, ``hop_size_in_ms``).

        Returns:
            The value returned by ``pyworld.synthesize``.
        """
        cfg = self.config
        n_mgc = cfg['mcep_order'] + 1
        lf0_col = n_mgc
        vuv_col = n_mgc + 1
        bap_start = n_mgc + 2

        mgc = feature[:, :n_mgc]
        lf0 = feature[:, lf0_col:lf0_col + 1]
        vuv = feature[:, vuv_col:vuv_col + 1]
        bap = feature[:, bap_start:]

        envelope = pysptk.mc2sp(
            mgc,
            fftlen=cfg['fft_size'],
            alpha=pysptk.util.mcepalpha(cfg['sampling_rate']),
        )
        ap = pyworld.decode_aperiodicity(
            bap.astype(np.float64), cfg['sampling_rate'], cfg['fft_size'])

        # Zero out unvoiced frames, then exponentiate the remaining log-F0.
        f0 = lf0.copy()
        f0[vuv < 0.5] = 0
        voiced = np.nonzero(f0)
        f0[voiced] = np.exp(f0[voiced])

        return pyworld.synthesize(
            f0.flatten().astype(np.float64),
            envelope.astype(np.float64),
            ap.astype(np.float64),
            cfg['sampling_rate'],
            cfg['hop_size_in_ms'],
        )
Exemple #7
0
    def generate(self, parm_var, do_postfilter=True):
        """Synthesize one wav file per entry of ``self.paths``.

        For each path the parameters come from ``self._generate_parameters``,
        are optionally post-filtered, synthesized with WORLD using the
        settings in ``self.analysis_config``, and written to
        ``<self.out_dir>/<file_id>.wav``. Progress is printed to stdout.

        Args:
            parm_var: Forwarded to ``self._generate_parameters``.
            do_postfilter (bool): Apply ``merlin_post_filter`` to the mgc.
        """
        cfg = self.analysis_config

        for path in self.paths:
            file_id = splitext(basename(path))[0]
            print('Synthesizing %s ... ' % (file_id), end='')

            mgc, lf0, vuv, bap = self._generate_parameters(path, parm_var)
            if do_postfilter:
                mgc = merlin_post_filter(mgc, cfg.alpha)

            envelope = pysptk.mc2sp(mgc, fftlen=cfg.fft_length, alpha=cfg.alpha)
            aperiodicity = pyworld.decode_aperiodicity(
                bap.astype(np.float64), cfg.sampling_rate, cfg.fft_length)
            f0 = self._lf0_to_f0(lf0, vuv)

            generated = pyworld.synthesize(
                f0.flatten().astype(np.float64),
                envelope.astype(np.float64),
                aperiodicity.astype(np.float64),
                cfg.sampling_rate,
                cfg.frame_period,
            )

            out_path = join(self.out_dir, file_id + '.wav')
            with open(out_path, 'wb') as f:
                f.write(Audio(generated, rate=cfg.sampling_rate).data)
            print('done!')
Exemple #8
0
def gen_waveform(y_predicted, Y_mean, Y_std, post_filter=False, coef=1.4,
                 fs=16000, mge_training=True):
    """Synthesize a waveform from predicted acoustic features with WORLD.

    Args:
        y_predicted: Predicted acoustic features, forwarded to
            ``gen_parameters``.
        Y_mean: Forwarded to ``gen_parameters`` (presumably the feature mean
            used for de-normalization -- confirm against that helper).
        Y_std: Forwarded to ``gen_parameters`` (presumably the feature
            standard deviation -- confirm against that helper).
        post_filter (bool): Apply ``merlin_post_filter`` to the mgc stream.
        coef (float): Coefficient forwarded to ``merlin_post_filter``.
        fs (int): Sampling rate in Hz.
        mge_training (bool): Forwarded to ``gen_parameters``.

    Returns:
        tuple: ``(generated_waveform, mgc, lf0, vuv, bap)``. The waveform is
        peak-normalized to the int16 range (+/-32767) but is not cast to an
        integer dtype. A silent or empty waveform is returned unscaled.
    """
    alpha = pysptk.util.mcepalpha(fs)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    frame_period = hp_acoustic.frame_period

    # Generate parameters and split streams
    mgc, lf0, vuv, bap = gen_parameters(y_predicted, Y_mean, Y_std, mge_training)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha, coef=coef)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), fs, fftlen)

    # Zero out unvoiced frames, then move the remaining log-F0 values to Hz.
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    voiced = np.nonzero(f0)
    f0[voiced] = np.exp(f0[voiced])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            fs, frame_period)

    # Scale to the int16 range. Skip the division for a silent (all-zero) or
    # empty signal, which would otherwise produce NaNs or raise.
    peak = np.max(np.abs(generated_waveform)) if generated_waveform.size else 0
    if peak > 0:
        generated_waveform = generated_waveform / peak * 32767

    # Return the features as well, to compare natural/generated later.
    return generated_waveform, mgc, lf0, vuv, bap
Exemple #9
0
def gen_waveform(labels, acoustic_features, acoustic_out_scaler,
        binary_dict, continuous_dict, stream_sizes, has_dynamic_features,
        subphone_features="coarse_coding", log_f0_conditioning=True, pitch_idx=None,
        num_windows=3, post_filter=True, sample_rate=48000, frame_period=5,
        relative_f0=True):
    """Generate a waveform from multi-stream acoustic features with WORLD.

    Args:
        labels: Labels forwarded to ``fe.linguistic_features`` (only used
            when ``relative_f0`` is true).
        acoustic_features (ndarray): Stacked acoustic feature streams.
        acoustic_out_scaler: Object whose ``var_`` attribute is passed to
            ``multi_stream_mlpg``.
        binary_dict: Forwarded to ``fe.linguistic_features``.
        continuous_dict: Forwarded to ``fe.linguistic_features``.
        stream_sizes: Sizes of the streams in ``acoustic_features``.
        has_dynamic_features: Per-stream flags; MLPG runs if any is true.
        subphone_features (str): Forwarded to ``fe.linguistic_features``.
        log_f0_conditioning (bool): Accepted but not used by this function.
        pitch_idx: Forwarded to ``_midi_to_hz``.
        num_windows (int): Forwarded to ``get_windows``.
        post_filter (bool): Apply ``merlin_post_filter`` to the mgc stream.
        sample_rate (int): Sampling rate in Hz.
        frame_period: Frame period forwarded to ``pyworld.synthesize``.
        relative_f0 (bool): If true, the F0 stream is treated as an offset
            that is added to the log-F0 derived from the score.

    Returns:
        The waveform returned by ``pyworld.synthesize``.
    """
    windows = get_windows(num_windows)

    # Apply MLPG if necessary
    if np.any(has_dynamic_features):
        acoustic_features = multi_stream_mlpg(
            acoustic_features, acoustic_out_scaler.var_, windows, stream_sizes,
            has_dynamic_features)
        static_stream_sizes = get_static_stream_sizes(
            stream_sizes, has_dynamic_features, len(windows))
    else:
        static_stream_sizes = stream_sizes

    # Split multi-stream features
    mgc, target_f0, vuv, bap = split_streams(acoustic_features, static_stream_sizes)

    # Generate the waveform with the WORLD vocoder
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), sample_rate, fftlen)

    ### F0 ###
    if relative_f0:
        diff_lf0 = target_f0
        # need to extract pitch sequence from the musical score
        linguistic_features = fe.linguistic_features(labels,
                                                    binary_dict, continuous_dict,
                                                    add_frame_features=True,
                                                    subphone_features=subphone_features)
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")

        f0 = diff_lf0 + lf0_score
    else:
        # NOTE(review): assumes the F0 stream is log-F0 in this mode too, as
        # it is in the relative branch -- confirm against the training setup.
        # Copy so the masking below does not write into the split stream.
        f0 = target_f0.copy()

    # Both modes need V/UV masking and conversion from log-F0 to Hz before
    # WORLD synthesis; previously the non-relative branch skipped both.
    f0[vuv < 0.5] = 0
    voiced = np.nonzero(f0)
    f0[voiced] = np.exp(f0[voiced])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            sample_rate, frame_period)

    return generated_waveform
Exemple #10
0
def inv_world_spectrogram(f0, sp, ap, sr=_sr, **kwargs):
    """Convert coded WORLD vocoder features back into speech.

    ``sp`` and ``ap`` are decoded with ``pw.decode_spectral_envelope`` and
    ``pw.decode_aperiodicity`` and then synthesized.

    Optional keyword arguments: ``frame_period`` (default
    ``pw.default_frame_period``), ``f0_floor`` (default
    ``pw.default_f0_floor``) and ``fft_size`` (default
    ``pw.get_cheaptrick_fft_size(sr, f0_floor)``).
    """
    period = kwargs.get("frame_period", pw.default_frame_period)
    floor = kwargs.get("f0_floor", pw.default_f0_floor)
    default_fft_size = pw.get_cheaptrick_fft_size(sr, floor)
    n_fft = kwargs.get("fft_size", default_fft_size)

    envelope = pw.decode_spectral_envelope(sp, sr, fft_size=n_fft)
    aperiodicity = pw.decode_aperiodicity(ap, sr, fft_size=n_fft)
    return pw.synthesize(f0, envelope, aperiodicity, sr, frame_period=period)
Exemple #11
0
def synthesize(lf0, mgc, bap, hp):
    """Synthesize a waveform from normalized lf0, mgc and bap features.

    Values of ``lf0`` below 1 are replaced by 0.0 before the features are
    passed through ``f0_denormalize``, ``sp_denormalize`` and
    ``ap_denormalize``. The FFT size is derived from the width of the
    denormalized spectrum. The dtypes of the synthesis inputs are printed.

    Returns:
        The waveform returned by ``vocoder.synthesize``.
    """
    gated_lf0 = np.where(lf0 < 1, 0.0, lf0)
    f0 = f0_denormalize(gated_lf0)
    sp = sp_denormalize(mgc, hp)
    coded_ap = ap_denormalize(bap, gated_lf0)

    n_fft = (sp.shape[1] - 1) * 2
    dec_ap = vocoder.decode_aperiodicity(coded_ap, hp.sample_rate, fft_size=n_fft)

    print(f0.dtype, sp.dtype, dec_ap.dtype, flush=True)
    return vocoder.synthesize(f0, sp, dec_ap, hp.sample_rate)
def load_timbre(path, m_type, mx, mn):
    """Load a normalized timbre array from ``path`` and decode it.

    The stored values are mapped back with ``(v + 0.5) * (mx - mn) + mn``.

    Args:
        path: File passed to ``np.load``.
        m_type: ``1`` decodes the data as coded aperiodicity (at 32000 Hz);
            any other value decodes it with ``decode_harmonic``.
        mx: Upper bound used for de-normalization.
        mn: Lower bound used for de-normalization.

    Returns:
        The decoded array.
    """
    load_t = np.load(path).astype(np.double)
    load_t = (load_t + 0.5) * (mx - mn) + mn

    # Decode only the representation that was requested. The harmonic decode
    # used to run unconditionally, with its result thrown away for m_type 1.
    if m_type == 1:
        return pw.decode_aperiodicity(load_t, 32000, fft_size)
    return decode_harmonic(load_t, fft_size)
Exemple #13
0
 def _resample_down_aperiodicity(cls, feature, fs, new_fs,
                                 new_spectrum_len):
     """Re-decode an aperiodicity array for a lower sampling rate.

     ``feature`` is coded at ``fs``; if that yields more bands than
     ``cls._get_aperiodicity_num(new_fs)``, the extra trailing bands are
     dropped. The result is decoded at ``new_fs`` with an FFT size of
     ``(new_spectrum_len - 1) * 2``.
     """
     coded = pyworld.code_aperiodicity(np.ascontiguousarray(feature), fs)
     band_limit = cls._get_aperiodicity_num(new_fs)
     if coded.shape[1] > band_limit:
         coded = np.ascontiguousarray(coded[:, :band_limit])
     fft_size = (new_spectrum_len - 1) * 2
     return pyworld.decode_aperiodicity(coded, new_fs, fft_size)
Exemple #14
0
def world_speech_synthesis(queue, wav_list, config):
    """WORLD speech synthesis

    For each wav name, the matching feature file is loaded, F0 and
    aperiodicity are taken from dedicated datasets when present (otherwise
    rebuilt from the ``/world`` matrix), and the restored waveform is
    written as 16-bit PCM. ``'Finish'`` is put on ``queue`` at the end.
    The process exits with status 1 if a feature file lacks ``/world``.

    Args:
        queue (multiprocessing.Queue): the queue to store the file name of utterance
        wav_list (list): list of the wav files
        config (dict): feature extraction config

    """
    # define synthesizer
    synthesizer = Synthesizer(fs=config['sampling_rate'],
                              fftl=config['fft_size'],
                              shiftms=config['shiftms'])
    # synthesis
    for i, wav_name in enumerate(wav_list):
        logging.info("now processing %s (%d/%d)",
                     wav_name, i + 1, len(wav_list))

        # load acoustic features
        feat_name = path_replace(wav_name,
                                 config['indir'],
                                 config['outdir'],
                                 extname=config['feature_format'])
        if not check_hdf5(feat_name, "/world"):
            logging.error("%s is not existed.", feat_name)
            sys.exit(1)
        h = read_hdf5(feat_name, "/world")

        if check_hdf5(feat_name, "/f0"):
            f0 = read_hdf5(feat_name, "/f0")
        else:
            # Rebuild F0 from the continuous contour, zeroing frames whose
            # U/V flag is 0.
            uv = h[:, config['uv_dim_idx']].copy(order='C')
            f0 = h[:, config['f0_dim_idx']].copy(order='C')  # cont_f0_lpf
            f0[np.where(uv == 0.0)] = 0.0

        if check_hdf5(feat_name, "/ap"):
            ap = read_hdf5(feat_name, "/ap")
        else:
            codeap = h[:, config['ap_dim_start']:config['ap_dim_end']].copy(
                order='C')
            ap = pyworld.decode_aperiodicity(codeap, config['sampling_rate'],
                                             config['fft_size'])
        mcep = h[:, config['mcep_dim_start']:config['mcep_dim_end']].copy(
            order='C')

        # waveform synthesis
        wav = synthesizer.synthesis(f0, mcep, ap, alpha=config['mcep_alpha'])
        # Clip first, then cast. Casting to int16 first lets out-of-range
        # samples wrap around, after which the clip can no longer help.
        wav = np.clip(wav, -32768, 32767).astype(np.int16)

        # save restored wav
        restored_name = path_replace(wav_name, "wav", "world", extname="wav")
        wavfile.write(restored_name, config['sampling_rate'], wav)

    queue.put('Finish')
def synthesis_from_mcep(f0, mcep, ap, sr, fftsize, shiftms, alpha, rmcep=None):
    """Synthesize a waveform from F0, mel-cepstrum and aperiodicity.

    If ``rmcep`` is given, ``mcep`` is first passed through ``mod_power``
    with it. ``ap`` is decoded with ``pw.decode_aperiodicity`` when it has
    fewer than ``fftsize // 2 + 1`` columns; otherwise it is used as is.

    Returns:
        The waveform returned by ``pw.synthesize``.
    """
    if rmcep is not None:
        mcep = mod_power(mcep, rmcep, alpha=alpha)

    n_bins = fftsize // 2 + 1
    needs_decoding = ap.shape[1] < n_bins
    if needs_decoding:
        ap = pw.decode_aperiodicity(ap, sr, fftsize)

    envelope = pysptk.mc2sp(mcep, alpha, fftsize)
    return pw.synthesize(f0, envelope, ap, sr, frame_period=shiftms)
 def gen_wav(self, f0, mgc, bap):
     """Synthesize a wav file from F0, mgc and bap and return its bytes.

     The waveform is synthesized with WORLD using ``self.fftlen``,
     ``self.alpha``, ``self.sr`` and ``self.frame_period``, peak-normalized,
     written to ``gen.wav`` in the working directory as 16-bit PCM and read
     back.

     Returns:
         tuple: ``(contents, intensity)`` where ``contents`` is the raw
         bytes of ``gen.wav`` and ``intensity`` is
         ``10 * log10(sum(spectrogram ** 2))`` per frame.
     """
     spectrogram = pysptk.mc2sp(mgc, fftlen=self.fftlen, alpha=self.alpha)
     aperiodicity = pyworld.decode_aperiodicity(
         bap.astype(np.float64), self.sr, self.fftlen)
     generated_waveform = pyworld.synthesize(
         f0.flatten().astype(np.float64),
         spectrogram.astype(np.float64),
         aperiodicity.astype(np.float64),
         self.sr, self.frame_period)

     # Normalize by the absolute peak and scale to 32767. Dividing by the
     # signed maximum and scaling by 32768 pushed the peak sample to 32768,
     # which wraps in int16, and ignored larger negative excursions.
     peak = np.max(np.abs(generated_waveform)) if generated_waveform.size else 0
     if peak > 0:
         scaled = generated_waveform / peak * 32767
     else:
         # Silent or empty signal: nothing to scale, avoid dividing by zero.
         scaled = generated_waveform
     x2 = scaled.astype(np.int16)

     wavfile.write("gen.wav", self.sr, x2)
     with open("gen.wav", 'rb') as fd:
         contents = fd.read()
     intensity = 10 * np.log10(np.sum(spectrogram**2, axis=1))
     return contents, intensity
Exemple #17
0
 def _resample_up_aperiodicity(cls, feature, fs, new_fs, new_spectrum_len):
     """Re-decode an aperiodicity array for a higher sampling rate.

     ``feature`` is coded at ``fs``. When ``new_fs`` needs more bands than
     that produces, one extra column holding ``-cls.SAFE_GUARD_MINIMUM`` is
     appended at position ``new_fs / 2 / cls.FREQUENCY_INTERVAL - 1`` and the
     missing bands are filled by linear interpolation along the band axis.
     The result is decoded at ``new_fs`` with an FFT size of
     ``(new_spectrum_len - 1) * 2``.
     """
     coded = pyworld.code_aperiodicity(np.ascontiguousarray(feature), fs)
     wanted_bands = cls._get_aperiodicity_num(new_fs)

     if wanted_bands > coded.shape[1]:
         n_frames, n_bands = coded.shape[0], coded.shape[1]
         last_position = new_fs / 2 / cls.FREQUENCY_INTERVAL - 1
         positions = np.hstack((np.arange(n_bands), last_position))
         guard_column = np.full((n_frames, 1), -cls.SAFE_GUARD_MINIMUM)
         padded = np.hstack((coded, guard_column))
         interpolate = scipy.interpolate.interp1d(positions, padded, axis=1)
         coded = np.ascontiguousarray(interpolate(np.arange(wanted_bands)))

     fft_size = (new_spectrum_len - 1) * 2
     return pyworld.decode_aperiodicity(coded, new_fs, fft_size)
    def test_synthesis_from_codeap(self):
        """Synthesis works when aperiodicity is restored from its coded form."""
        wav_path = dirpath + '/data/test16000.wav'
        fs, x = wavfile.read(wav_path)

        extractor = FeatureExtractor(analyzer='world', fs=fs, shiftms=5)
        f0, spc, ap = extractor.analyze(x)
        codeap = extractor.codeap()

        # Sanity checks on the analysis output.
        assert len(np.nonzero(f0)[0]) > 0
        assert spc.shape == ap.shape
        assert pyworld.get_num_aperiodicities(fs) == codeap.shape[-1]

        # Decode the coded aperiodicity and synthesize from it.
        fft_size = 1024
        ap = pyworld.decode_aperiodicity(codeap, fs, fft_size)
        synth = Synthesizer(fs=fs, fftl=fft_size, shiftms=5)
        nun_check(synth.synthesis_spc(f0, spc, ap))
Exemple #19
0
    def test_synthesis_from_codeap(self):
        """Synthesis works when aperiodicity is restored from its coded form."""
        wav_path = dirpath + '/data/test16000.wav'
        fs, x = wavfile.read(wav_path)

        extractor = FeatureExtractor(analyzer='world', fs=fs, shiftms=5)
        f0, spc, ap = extractor.analyze(x)
        codeap = extractor.codeap()

        # Sanity checks on the analysis output.
        assert len(np.nonzero(f0)[0]) > 0
        assert spc.shape == ap.shape
        assert pyworld.get_num_aperiodicities(fs) == codeap.shape[-1]

        # Decode the coded aperiodicity and synthesize from it.
        fft_size = 1024
        ap = pyworld.decode_aperiodicity(codeap, fs, fft_size)
        synth = Synthesizer(fs=fs, fftl=fft_size, shiftms=5)
        nun_check(synth.synthesis_spc(f0, spc, ap))
def generate_timbre(m_type, mx, mn, condition, cat_input=None):
    """Generate a timbre with the latest saved model and decode it.

    The model is loaded from ``snapshots/aperiodic`` when ``m_type`` is 1 and
    from ``snapshots/harmonic`` otherwise. Its output is mapped back with
    ``(v + 0.5) * (mx - mn) + mn`` before decoding.

    Returns:
        tuple: ``(decode_sp, raw_gen)``. ``decode_sp`` is the harmonic decode
        for ``m_type == 0``, the aperiodicity decode (at 32000 Hz) for
        ``m_type == 1`` and ``None`` for any other value. ``raw_gen`` is the
        untouched model output.
    """
    snapshot_dir = 'snapshots/aperiodic' if m_type == 1 else 'snapshots/harmonic'
    model = load_latest_model_from(m_type, snapshot_dir)
    raw_gen = model.generate(condition, cat_input)

    normalized = raw_gen.transpose(0, 1).cpu().numpy().astype(np.double)
    sample = (normalized + 0.5) * (mx - mn) + mn

    if m_type == 0:
        decode_sp = decode_harmonic(sample, fft_size)
    elif m_type == 1:
        decode_sp = pw.decode_aperiodicity(np.ascontiguousarray(sample), 32000,
                                           fft_size)
    else:
        decode_sp = None

    return decode_sp, raw_gen
Exemple #21
0
def gen_waveform(y_predicted, do_postfilter=False):
    """Synthesize a waveform from predicted acoustic features with WORLD.

    Reads ``alpha``, ``fftlen``, ``fs`` and ``frame_period`` from the
    enclosing (module) scope.

    Args:
        y_predicted: Predicted features; passed through ``trim_zeros_frames``
            and then ``gen_parameters``.
        do_postfilter (bool): Apply ``merlin_post_filter`` to the mgc stream.

    Returns:
        The waveform returned by ``pyworld.synthesize``.
    """
    trimmed = trim_zeros_frames(y_predicted)
    # Generate parameters and split streams
    mgc, lf0, vuv, bap = gen_parameters(trimmed)
    if do_postfilter:
        mgc = merlin_post_filter(mgc, alpha)

    envelope = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    ap = pyworld.decode_aperiodicity(bap.astype(np.float64), fs, fftlen)

    # Zero out unvoiced frames, then exponentiate the remaining log-F0.
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    voiced = np.nonzero(f0)
    f0[voiced] = np.exp(f0[voiced])

    return pyworld.synthesize(
        f0.flatten().astype(np.float64),
        envelope.astype(np.float64),
        ap.astype(np.float64),
        fs,
        frame_period,
    )
Exemple #22
0
    def synthesis(self, f0, mcep, ap, rmcep=None, alpha=0.42):
        """Generate a waveform from F0, mel-cepstrum and aperiodicity.

        Parameters
        ----------
        f0 : array, shape (`T`, `1`)
            F0 sequence
        mcep : array, shape (`T`, `dim`)
            Mel-cepstrum sequence
        ap : array, shape (`T`, `fftlen / 2 + 1`) or (`T`, `dim_codeap`)
            Aperiodicity, or coded aperiodicity. It is decoded when it has
            fewer than `self.fftl // 2 + 1` columns.
        rmcep : array, optional, shape (`T`, `dim`)
            Reference mel-cepstrum sequence. When given, `mcep` is passed
            through `mod_power` with it. Default is None.
        alpha : float, optional
            All-pass constant passed to `mod_power` and `pysptk.mc2sp`.
            Default is 0.42.

        Returns
        ----------
        wav : array
            Synthesized waveform

        """
        # Optional power modification against the reference mel-cepstrum.
        if rmcep is not None:
            mcep = mod_power(mcep, rmcep, alpha=alpha)

        # Coded aperiodicity is narrower than a full spectrum: decode it.
        n_bins = self.fftl // 2 + 1
        if ap.shape[1] < n_bins:
            ap = pyworld.decode_aperiodicity(ap, self.fs, self.fftl)

        # Mel-cepstrum to spectral envelope, then WORLD synthesis.
        spc = pysptk.mc2sp(mcep, alpha, self.fftl)
        return pyworld.synthesize(f0, spc, ap, self.fs,
                                  frame_period=self.shiftms)
def world2wav(
        clf0, vuv, cap, fs, fbin,
        mcep=None, sp=None, frame_period=None, mcep_postfilter=False):
    """Synthesize a waveform from continuous log-F0, V/UV and coded aperiodicity.

    Args:
        clf0 (ndarray): Continuous log-F0.
        vuv (ndarray): Voiced/unvoiced values; entries above 0.5 are voiced.
        cap (ndarray): Coded aperiodicity. A non-2-D input gets an axis
            inserted at position 1.
        fs (int): Sampling rate in Hz.
        fbin (int): Number of frequency bins; the FFT length is
            ``fbin * 2 - 2``.
        mcep (ndarray, optional): Mel-cepstrum, used only when ``sp`` is None.
        sp (ndarray, optional): Spectral envelope; takes precedence over
            ``mcep``.
        frame_period (float, optional): Defaults to
            ``pyworld.default_frame_period``.
        mcep_postfilter (bool): Apply ``merlin_post_filter`` to ``mcep``.

    Returns:
        ndarray: The waveform, rescaled to a peak of 0.99 when its absolute
        peak exceeds 0.99.

    Raises:
        ValueError: If neither ``sp`` nor ``mcep`` is given.
    """
    # Fail fast, with a message, before any decoding work is done.
    if sp is None and mcep is None:
        raise ValueError("either `sp` or `mcep` must be provided")

    # setup
    frame_period = pyworld.default_frame_period \
        if frame_period is None else frame_period

    clf0 = np.ascontiguousarray(clf0.astype('float64'))
    vuv = np.ascontiguousarray(vuv > 0.5).astype('int')
    cap = np.ascontiguousarray(cap.astype('float64'))
    fft_len = fbin * 2 - 2
    alpha = pysptk.util.mcepalpha(fs)

    # clf0 -> f0: exponentiate, then zero the unvoiced frames.
    f0 = np.squeeze(np.exp(clf0)) * np.squeeze(vuv)

    # cap -> ap
    if cap.ndim != 2:
        cap = np.expand_dims(cap, 1)
    ap = pyworld.decode_aperiodicity(cap, fs, fft_len)

    # mcep -> sp
    if sp is None:
        mcep = np.ascontiguousarray(mcep.astype('float64'))
        if mcep_postfilter:
            mcep = merlin_post_filter(mcep, alpha)
        sp = pysptk.mgc2sp(mcep, alpha=alpha, fftlen=fft_len)
        sp = np.abs(np.exp(sp)) ** 2
    else:
        sp = np.ascontiguousarray(sp)

    wave = pyworld.synthesize(f0, sp, ap, fs, frame_period=frame_period)

    # Only attenuate: signals already within +/-0.99 are left untouched.
    scale = np.abs(wave).max()
    if scale > 0.99:
        wave = wave / scale * 0.99

    return wave
    def synthesis(self, f0, mcep, ap, rmcep=None, alpha=0.42):
        """Generate a waveform from F0, mel-cepstrum and aperiodicity.

        Parameters
        ----------
        f0 : array, shape (`T`, `1`)
            F0 sequence
        mcep : array, shape (`T`, `dim`)
            Mel-cepstrum sequence
        ap : array, shape (`T`, `fftlen / 2 + 1`) or (`T`, `dim_codeap`)
            Aperiodicity, or coded aperiodicity. It is decoded when it has
            fewer than `self.fftl // 2 + 1` columns.
        rmcep : array, optional, shape (`T`, `dim`)
            Reference mel-cepstrum sequence. When given, `mcep` is passed
            through `mod_power` with it. Default is None.
        alpha : float, optional
            All-pass constant passed to `mod_power` and `pysptk.mc2sp`.
            Default is 0.42.

        Returns
        ----------
        wav : array
            Synthesized waveform

        """
        # Optional power modification against the reference mel-cepstrum.
        power_matched = mcep if rmcep is None else mod_power(
            mcep, rmcep, alpha=alpha)

        # Coded aperiodicity is narrower than a full spectrum: decode it.
        full_width = self.fftl // 2 + 1
        if ap.shape[1] < full_width:
            ap = pyworld.decode_aperiodicity(ap, self.fs, self.fftl)

        # Mel-cepstrum to spectral envelope, then WORLD synthesis.
        envelope = pysptk.mc2sp(power_matched, alpha, self.fftl)
        wav = pyworld.synthesize(f0, envelope, ap,
                                 self.fs, frame_period=self.shiftms)
        return wav
Exemple #25
0
def synth(f0_dir, ap_dir, mfsc_dir):
    """Synthesize audio for every feature file listed in ``f0_dir``.

    For each file name found in ``f0_dir``, the same-named ``.npy`` arrays
    are loaded from the three directories, the aperiodicity is decoded at
    32000 Hz with an FFT size of 2048, the MFSC is converted with
    ``mfsc_to_sp`` and the result is handed to ``_synth``.

    Args:
        f0_dir: Directory with F0 arrays; its listing drives the loop.
        ap_dir: Directory with coded aperiodicity arrays.
        mfsc_dir: Directory with MFSC arrays.
    """
    for file in os.listdir(f0_dir):
        # Join paths properly so directory arguments work with or without a
        # trailing separator (plain concatenation needed one).
        f0 = np.load(os.path.join(f0_dir, file))
        mfsc = np.load(os.path.join(mfsc_dir, file))
        ap = np.load(os.path.join(ap_dir, file))

        ap = pw.decode_aperiodicity(ap, 32000, 2048)

        # Convert MFSC to SP
        sp = mfsc_to_sp(mfsc)
        # Synthesize the audio
        _synth(file, f0, ap, sp)

    print('Finished synthesis')
Exemple #26
0
    def run_world_synth(self, synth_output, hparams):
        """Run the WORLD synthesize method.

        Each entry of ``synth_output`` is split into WORLD features,
        synthesized at ``hparams.synth_fs`` and saved as a wav file. If
        ``hparams.synth_ext`` is not ``wav``, the file is converted with
        pydub and the intermediate wav is removed.

        Files go to ``hparams.synth_dir``, falling back to
        ``hparams.out_dir`` and then to the current directory.

        Args:
            synth_output (dict): Maps an id name to its network output.
            hparams: Hyper-parameter object; the attributes read here are
                ``synth_fs``, ``synth_dir``, ``out_dir``, ``num_coded_sps``,
                ``model_name``, ``synth_file_suffix`` and ``synth_ext``.
        """
        fft_size = pyworld.get_cheaptrick_fft_size(hparams.synth_fs)

        if hparams.synth_dir is not None:
            save_dir = hparams.synth_dir
        elif hparams.out_dir is not None:
            save_dir = hparams.out_dir
        else:
            save_dir = os.path.curdir

        model_suffix = "_" + hparams.model_name if hparams.model_name is not None else ""

        for id_name, output in synth_output.items():
            logging.info("Synthesise {} with the WORLD vocoder.".format(id_name))

            coded_sp, lf0, vuv, bap = WorldFeatLabelGen.convert_to_world_features(
                output, contains_deltas=False, num_coded_sps=hparams.num_coded_sps)
            ln_sp = pysptk.mgc2sp(np.ascontiguousarray(coded_sp, dtype=np.float64),
                                  alpha=WorldFeatLabelGen.mgc_alpha,
                                  gamma=0.0,
                                  fftlen=fft_size)
            sp = np.exp(ln_sp.real)
            sp = np.power(sp.real / 32768.0, 2)

            f0 = np.exp(lf0, dtype=np.float64)
            vuv[f0 < WorldFeatLabelGen.f0_silence_threshold] = 0  # WORLD throws an error for too small f0 values.
            f0[vuv == 0] = 0.0
            ap = pyworld.decode_aperiodicity(
                np.ascontiguousarray(bap.reshape(-1, 1), np.float64),
                hparams.synth_fs, fft_size)

            waveform = pyworld.synthesize(f0, sp, ap, hparams.synth_fs)
            waveform = waveform.astype(np.float32, copy=False)  # Does inplace conversion, if possible.

            # Always save as wav file first and convert afterwards if necessary.
            # NOTE(review): the original format call also passed "_WORLD" and
            # ".wav" as extra arguments, which the three-placeholder template
            # ignored. The resulting file name is kept unchanged here; confirm
            # whether a "_WORLD" suffix was intended.
            wav_file_path = os.path.join(save_dir, "{}{}{}.wav".format(os.path.basename(id_name),
                                                                       model_suffix,
                                                                       hparams.synth_file_suffix))
            # Use the resolved save_dir everywhere: hparams.synth_dir may be
            # None, which is exactly the case the fallback above handles.
            makedirs_safe(save_dir)
            soundfile.write(wav_file_path, waveform, hparams.synth_fs)

            # Use PyDub for special audio formats.
            if hparams.synth_ext.lower() != 'wav':
                as_wave = pydub.AudioSegment.from_wav(wav_file_path)
                as_wave.export(os.path.join(save_dir, id_name + "." + hparams.synth_ext), format=hparams.synth_ext)
                os.remove(wav_file_path)
Exemple #27
0
def decode_envelopes(spectral_coded, aperiodic_coded, sample_rate, vocal_name):
    """Decode normalized spectral and aperiodic codes into envelopes.

    Both inputs are de-normalized with the min/max values stored in
    ``<params.training_dir>/<vocal_name>/min_max.npy``. The spectral code is
    taken through an inverse real FFT, truncated to ``params.mcep_order``
    coefficients with the first and last one halved, and converted with
    ``pysptk.mgc2sp``. The aperiodic code is decoded with
    ``pyworld.decode_aperiodicity``.

    Returns:
        tuple: ``(spectral_env, aperiodic_env)``.
    """
    n_fft = params.fft_size
    n_coeffs = params.mcep_order
    offset = params.coding_const
    mcep_gamma = params.mcep_gamma
    mcep_alpha = params.mcep_alpha

    stats_path = params.training_dir + '/' + vocal_name + '/' + "min_max.npy"
    min_spec, max_spec, min_ap, max_ap = np.load(stats_path, allow_pickle=True)

    # Spectral envelope: undo the scaling, undo the mirrored FFT coding, and
    # halve the first and last kept coefficients.
    spec_range = max_spec - min_spec
    denorm_spec = (spectral_coded + offset) * spec_range + min_spec
    cepstrum = np.fft.irfft(denorm_spec)[:, :n_coeffs]
    cepstrum[:, 0] /= 2
    cepstrum[:, -1] /= 2

    log_spectrum = np.apply_along_axis(pysptk.mgc2sp,
                                       1,
                                       cepstrum,
                                       mcep_alpha,
                                       mcep_gamma,
                                       fftlen=n_fft)
    spectral_env = np.exp(log_spectrum.real)

    # Aperiodic envelope: undo the scaling and decode from a C-ordered copy.
    ap_range = max_ap - min_ap
    denorm_ap = (aperiodic_coded + offset) * ap_range + min_ap
    denorm_ap = np.array(denorm_ap, order='C')
    aperiodic_env = pyworld.decode_aperiodicity(denorm_ap, sample_rate, n_fft)

    return spectral_env, aperiodic_env
Exemple #28
0
def bap2ap(bap, fs, fftlen):
    """Decode ``bap`` with ``pw.decode_aperiodicity`` at ``fs`` and ``fftlen``."""
    return pw.decode_aperiodicity(bap, fs, fftlen)
def main(args):
    """Run several WORLD analysis/synthesis variants on one utterance.

    Recreates the ``test`` directory, analyses ``utterance/p226_002.wav``
    in several ways, writes each resynthesis into ``test`` and saves
    comparison plots with ``savefig``.

    Args:
        args: Object providing ``frame_period`` and ``speed``.
    """
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    x, fs = sf.read('utterance/p226_002.wav')

    # 1. The convenient way: one-call analysis with default options.
    #    The results are not used further; later steps rebind the features.
    default_f0, default_sp, default_ap = pw.wav2world(x, fs)
    pw.synthesize(default_f0, default_sp, default_ap, fs,
                  pw.default_frame_period)

    # 2. Step by step
    dio_options = dict(f0_floor=50.0,
                       f0_ceil=600.0,
                       channels_in_octave=2,
                       speed=args.speed)

    # 2-1 Without F0 refinement
    _f0, t = pw.dio(x, fs, frame_period=args.frame_period, **dio_options)
    _sp = pw.cheaptrick(x, _f0, t, fs)
    _ap = pw.d4c(x, _f0, t, fs)
    _y = pw.synthesize(_f0, _sp, _ap, fs, args.frame_period)
    sf.write('test/y_without_f0_refinement.wav', _y, fs)

    # 2-2 DIO with F0 refinement (using Stonemask)
    f0 = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (using Stonemask)
    harvest_f0, t_h = pw.harvest(x, fs)
    f0_h = pw.stonemask(x, harvest_f0, t_h, fs)
    sp_h = pw.cheaptrick(x, f0_h, t_h, fs)
    ap_h = pw.d4c(x, f0_h, t_h, fs)
    y_h = pw.synthesize(f0_h, sp_h, ap_h, fs, pw.default_frame_period)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs)

    # 2-4 DIO with F0 refinement (using Stonemask). Code and restore sp, ap.
    code_sp = pw.code_spectral_envelope(sp, fs, 80)
    code_ap = pw.code_aperiodicity(ap, fs)
    fft_size = (sp.shape[1] - 1) * 2
    rest_sp = pw.decode_spectral_envelope(code_sp, fs, fft_size)
    rest_ap = pw.decode_aperiodicity(code_ap, fs, fft_size)
    y_r = pw.synthesize(f0, rest_sp, rest_ap, fs, args.frame_period)
    sf.write('test/y_with_f0_refinement_code_and_restore.wav', y_r, fs)
    print("fft size: {:d}".format(fft_size))
    print("coded sp shape: ({:d}, {:d})".format(code_sp.shape[0],
                                                code_sp.shape[1]))
    print("coded ap shape: ({:d}, {:d})".format(code_ap.shape[0],
                                                code_ap.shape[1]))

    # 2-5 Same as 2-4, but analysed and synthesized with a 12.5 ms frame
    #     period.
    long_period = 12.5
    f0_xx, t_xx = pw.dio(x, fs, frame_period=long_period, **dio_options)
    f0_xx = pw.stonemask(x, f0_xx, t_xx, fs)
    sp_xx = pw.cheaptrick(x, f0_xx, t_xx, fs)
    ap_xx = pw.d4c(x, f0_xx, t_xx, fs)
    code_sp_xx = pw.code_spectral_envelope(sp_xx, fs, 80)
    code_ap_xx = pw.code_aperiodicity(ap_xx, fs)
    fft_size = (sp_xx.shape[1] - 1) * 2
    rest_sp_xx = pw.decode_spectral_envelope(code_sp_xx, fs, fft_size)
    rest_ap_xx = pw.decode_aperiodicity(code_ap_xx, fs, fft_size)
    y_r_xx = pw.synthesize(f0_xx, rest_sp_xx, rest_ap_xx, fs, long_period)
    sf.write(
        'test/y_with_f0_refinement_code_and_restore_frame_period_12.5.wav',
        y_r_xx, fs)
    print("coded sp_xx shape: ({:d}, {:d})".format(code_sp_xx.shape[0],
                                                   code_sp_xx.shape[1]))
    print("coded ap_xx shape: ({:d}, {:d})".format(code_ap_xx.shape[0],
                                                   code_ap_xx.shape[1]))

    # Comparison
    waveforms = [x, _y, y, y_h, y_r, y_r_xx]
    envelopes = [_sp, sp, sp_h, rest_sp, rest_sp_xx]
    aperiodicities = [_ap, ap, ap_h, rest_ap, rest_ap_xx]
    contours = [_f0, f0, f0_h, f0_xx]
    savefig('test/wavform.png', waveforms)
    savefig('test/sp.png', envelopes)
    savefig('test/ap.png', aperiodicities, log=False)
    savefig('test/f0.png', contours)

    print('Please check "test" directory for output files')
 def decode_ap(ap: numpy.ndarray, sampling_rate: int):
     """Decode coded aperiodicity at ``sampling_rate``.

     ``ap`` is cast to float64 and the FFT size is taken from
     ``pyworld.get_cheaptrick_fft_size(sampling_rate)``.
     """
     coded = ap.astype(numpy.float64)
     fft_size = pyworld.get_cheaptrick_fft_size(sampling_rate)
     return pyworld.decode_aperiodicity(coded, sampling_rate, fft_size)
Exemple #31
0
# Report how long the preceding step took; `start` and `timer` are defined
# earlier in the script (outside this section).
end = timer()
print('Feature Extraction:', end - start, 'seconds')

# Build an edited F0 contour, f0_new, from the extracted f0.
from copy import deepcopy  # work on a copy so f0 itself is left untouched
f0_new = deepcopy(f0)  # frame ranges noted by the author: 1-58 59-138 139-198 // 269-360 // 429-522
f0_new[1:198] = np.flip(f0_new[1:198], 0)  # reverse the pitch contour of this span
f0_new[269:360] = f0_new[269:360] + 62  # E (330 Hz) -> G (392 Hz)
f0_new[429:522] = f0_new[429:522] + 193  # E (330 Hz) -> 523 Hz; NOTE(review): the original comment called this "G", but 523 Hz is C5 -- confirm intent

#%% Reduce the dimension of the spectral envelope and aperiodicity, then
# decode them back to the original FFT size.
enc_sp = pw.code_spectral_envelope(sp, fs, number_of_dimensions=32)
dec_sp = pw.decode_spectral_envelope(enc_sp,
                                     fs,
                                     fft_size=(sp.shape[1] - 1) * 2)

enc_ap = pw.code_aperiodicity(ap, fs)
# NOTE(review): dec_ap is not used by any synthesis call in this section.
dec_ap = pw.decode_aperiodicity(enc_ap, fs, fft_size=(ap.shape[1] - 1) * 2)

#%% Resynthesis from the unmodified features.
y = pw.synthesize(f0, sp, ap, fs)
librosa.output.write_wav('y_EyesNose_short_resynthesis.wav', y, fs)
#%% Resynthesis with the coded-and-decoded (32-dimensional) spectral envelope.
y = pw.synthesize(f0, dec_sp, ap, fs)
librosa.output.write_wav('y_EyesNose_short_resynthesis_sp_decode_32.wav', y,
                         fs)

#%% Synthesis using the new f0.
# NOTE(review): the output name says "sp_decode_32" but the original `sp` is
# used here, not `dec_sp` -- confirm which one was intended.
y = pw.synthesize(f0_new, sp, ap, fs)
librosa.output.write_wav('y_EyesNose_short_new_F0_sp_decode_32.wav', y, fs)
Exemple #32
0
    # Change following 3 lines to specify directories
    fs = 32000
    fft_size = 2048
    f0_dir = './f0'
    ap_dir = './ap'
    mfsc_dir = './mfsc'

    f0 = np.load(
        'C:/Users/Murali/EE599/project/NIT/NIT/f0ref/nitech_jp_song070_f001_016.npy'
    )
    ap = np.load(
        'C:/Users/Murali/EE599/project/NIT/NIT/ap/nitech_jp_song070_f001_016.npy'
    )
    # mfsc_og = np.load('C:/Users/Murali/EE599/project/NIT/NIT/mfsc/nitech_jp_song070_f001_016.npy')

    ap = pw.decode_aperiodicity(ap, fs, fft_size)
    # ap = ap[0:1300]
    # f0 = f0[0:1300]
    # mfsc_og = mfsc_og[0:620]
    # np.save("mfsc016_.npy", mfsc_og)

    mfsc = np.load(
        'C:/Users/Murali/2018_synth_sing/tensorflow-wavenet/generated_016_new.npy'
    )
    mfsc = mfsc[20:]
    #    pos_idx = np.where(mfsc > 0)
    #    mfsc[pos_idx] = 0
    # mfsc = (-1) * np.abs(mfsc)
    # sp_og = mfsc_to_sp(mfsc_og)
    sp = mfsc_to_sp(mfsc)
Exemple #33
0
    def decode_RNN(feat_list, gpu, cvlist=None,
            mcd_cvlist_src=None, mcdstd_cvlist_src=None, mcdpow_cvlist_src=None, mcdpowstd_cvlist_src=None,
            mcd_cvlist_cyc=None, mcdstd_cvlist_cyc=None, mcdpow_cvlist_cyc=None, mcdpowstd_cvlist_cyc=None,
            mcd_cvlist=None, mcdstd_cvlist=None, mcdpow_cvlist=None, mcdpowstd_cvlist=None,
            lat_dist_rmse_list=None, lat_dist_cosim_list=None):
        """Convert every feature file in ``feat_list`` and collect statistics.

        For each file the function runs encoder -> decoder -> post-net three
        times (source reconstruction, conversion to the target speaker, and a
        cyclic pass back to the source speaker), logs mel-cepstral distortion
        figures, synthesizes four waveforms with WORLD (``_anasyn``, ``_rec``,
        ``_cv``, ``_cvGV``) into ``args.outdir`` and writes the converted
        feature matrix to an HDF5 file in a ``<src>-<trg>`` directory next to
        the source speaker directory.

        Names such as ``args``, ``config``, ``n_spk``, ``spk_list``,
        ``trg_idx``, ``src_f0_mean``/``src_f0_std``, ``trg_f0_mean``/
        ``trg_f0_std``, ``gv_mean_trg``, ``cvgv_mean`` and ``IRLEN`` are read
        from the enclosing scope.

        Args:
            feat_list: Iterable of source feature HDF5 paths; the parent
                directory name of each path is used as the source speaker.
            gpu: Device passed to ``torch.cuda.device``.
            cvlist: List-like; receives the per-file variance of the converted
                mel-cepstrum (0th coefficient excluded).
            mcd_cvlist_src, mcdstd_cvlist_src, mcdpow_cvlist_src,
            mcdpowstd_cvlist_src: List-likes; receive per-file mean/std of the
                distortion between input and reconstructed mel-cepstrum
                (without / with the 0th coefficient).
            mcd_cvlist_cyc, mcdstd_cvlist_cyc, mcdpow_cvlist_cyc,
            mcdpowstd_cvlist_cyc: Same, for the cyclic reconstruction.
            mcd_cvlist, mcdstd_cvlist, mcdpow_cvlist, mcdpowstd_cvlist: Same,
                for converted vs. target; appended to only when a target file
                with the same basename exists.
            lat_dist_rmse_list, lat_dist_cosim_list: List-likes; receive the
                averaged source/target latent distances, again only when a
                target file exists.

        Note:
            The accumulator arguments default to ``None`` but ``cvlist`` and
            the ``*_src`` / ``*_cyc`` lists are appended to unconditionally,
            so callers must supply them.
        """

        def _pad_frames(x, left, right):
            """Replicate-pad a (batch, frames, dim) tensor along the frame axis."""
            return F.pad(x.transpose(1,2), (left,right), "replicate").transpose(1,2)

        def _trim_frames(x, left, right):
            """Drop `left` leading and `right` trailing entries along axis 1."""
            # A plain [left:-right] slice would be empty when right == 0.
            return x[:,left:-right] if right > 0 else x[:,left:]

        def _spk_code(n_frames, idx):
            """Return a (1, n_frames) CUDA long tensor filled with speaker index `idx`."""
            return (torch.ones((1, n_frames))*idx).cuda().long()

        def _synth_and_save(f0, spec, aperiodicity, src_file, suffix):
            """WORLD-synthesize, clip to [-1, 1] and write a 16-bit WAV into args.outdir."""
            wav = np.clip(pw.synthesize(f0, spec, aperiodicity, args.fs, frame_period=args.shiftms), -1, 1)
            wavpath = os.path.join(args.outdir, os.path.basename(src_file).replace(".h5", suffix))
            sf.write(wavpath, wav, args.fs, 'PCM_16')
            logging.info(wavpath)

        with torch.cuda.device(gpu):
            # define model and load parameters
            with torch.no_grad():
                model_encoder = GRU_VAE_ENCODER(
                    in_dim=config.mcep_dim+config.excit_dim,
                    n_spk=n_spk,
                    lat_dim=config.lat_dim,
                    hidden_layers=config.hidden_layers_enc,
                    hidden_units=config.hidden_units_enc,
                    kernel_size=config.kernel_size_enc,
                    dilation_size=config.dilation_size_enc,
                    causal_conv=config.causal_conv_enc,
                    bi=False,
                    ar=False,
                    pad_first=True,
                    right_size=config.right_size_enc)
                logging.info(model_encoder)
                model_decoder = GRU_SPEC_DECODER(
                    feat_dim=config.lat_dim,
                    out_dim=config.mcep_dim,
                    n_spk=n_spk,
                    hidden_layers=config.hidden_layers_dec,
                    hidden_units=config.hidden_units_dec,
                    kernel_size=config.kernel_size_dec,
                    dilation_size=config.dilation_size_dec,
                    causal_conv=config.causal_conv_dec,
                    bi=False,
                    ar=False,
                    pad_first=True,
                    right_size=config.right_size_dec)
                logging.info(model_decoder)
                # The post-net is built with a fixed excitation width of 2; the
                # calls below feed it only the first two excitation columns.
                model_post = GRU_POST_NET(
                    spec_dim=config.mcep_dim,
                    excit_dim=2,
                    n_spk=n_spk,
                    hidden_layers=config.hidden_layers_post,
                    hidden_units=config.hidden_units_post,
                    kernel_size=config.kernel_size_post,
                    dilation_size=config.dilation_size_post,
                    causal_conv=config.causal_conv_post,
                    pad_first=True,
                    right_size=config.right_size_post)
                logging.info(model_post)

                # Read the checkpoint from disk once and share it between the
                # three sub-models instead of deserialising it three times.
                checkpoint = torch.load(args.model)
                model_encoder.load_state_dict(checkpoint["model_encoder"])
                model_decoder.load_state_dict(checkpoint["model_decoder"])
                model_post.load_state_dict(checkpoint["model_post"])
                del checkpoint

                for model in (model_encoder, model_decoder, model_post):
                    model.remove_weight_norm()
                    model.cuda()
                    model.eval()
                    for param in model.parameters():
                        param.requires_grad = False

            count = 0

            # Total context padding: twice the summed padding of the three
            # models, i.e. enough for the six model applications of the
            # longest (cyclic) path below.
            pad_left = (model_encoder.pad_left + model_decoder.pad_left + model_post.pad_left)*2
            pad_right = (model_encoder.pad_right + model_decoder.pad_right + model_post.pad_right)*2
            # outpad_*[k] is the padding still remaining after stage k of the
            # chain encoder -> decoder -> post -> encoder -> decoder.
            # NOTE(review): this presumes each model shortens its output by its
            # own pad_left/pad_right — confirm against the model classes.
            outpad_lefts = []
            outpad_rights = []
            remain_left, remain_right = pad_left, pad_right
            for stage in (model_encoder, model_decoder, model_post, model_encoder, model_decoder):
                remain_left = remain_left-stage.pad_left
                remain_right = remain_right-stage.pad_right
                outpad_lefts.append(remain_left)
                outpad_rights.append(remain_right)
            logging.info(f'{pad_left} {pad_right}')
            logging.info(outpad_lefts)
            logging.info(outpad_rights)

            for feat_file in feat_list:
                # convert mcep
                spk_src = os.path.basename(os.path.dirname(feat_file))
                src_idx = spk_list.index(spk_src)
                logging.info('%s --> %s' % (spk_src, args.spk_trg))

                # A file with the same basename under the target speaker's
                # directory enables the target-referenced metrics below.
                file_trg = os.path.join(os.path.dirname(os.path.dirname(feat_file)), args.spk_trg, os.path.basename(feat_file))
                trg_exist = False
                if os.path.exists(file_trg):
                    logging.info('exist: %s' % (file_trg))
                    feat_trg = read_hdf5(file_trg, config.string_path)
                    mcep_trg = feat_trg[:,-config.mcep_dim:]
                    logging.info(mcep_trg.shape)
                    trg_exist = True

                # Feature layout as used here: the last mcep_dim columns are
                # the mel-cepstrum; column 1 is exponentiated as F0; column 2
                # is rounded and multiplied with -exp(columns 3..excit_dim) to
                # rebuild the coded aperiodicity.
                # NOTE(review): presumably col 0 = U/V flag, col 1 = log-F0,
                # col 2 = codeap U/V, cols 3.. = log(-codeap) — confirm against
                # the feature extractor.
                feat_org = read_hdf5(feat_file, config.string_path)
                mcep = np.array(feat_org[:,-config.mcep_dim:])
                codeap = np.array(np.rint(feat_org[:,2:3])*(-np.exp(feat_org[:,3:config.excit_dim])))
                sp = np.array(ps.mc2sp(mcep, args.mcep_alpha, args.fftl))
                ap = pw.decode_aperiodicity(codeap, args.fs, args.fftl)
                feat_cvf0_lin = np.expand_dims(convert_f0(np.exp(feat_org[:,1]), src_f0_mean, src_f0_std, trg_f0_mean, trg_f0_std), axis=-1)
                feat_cv = np.c_[feat_org[:,:1], np.log(feat_cvf0_lin), feat_org[:,2:config.excit_dim]]

                logging.info("generate")
                with torch.no_grad():
                    feat = _pad_frames(torch.FloatTensor(feat_org).cuda().unsqueeze(0), pad_left, pad_right)
                    feat_excit = torch.FloatTensor(feat_org[:,:config.excit_dim]).cuda().unsqueeze(0)
                    feat_excit_cv = torch.FloatTensor(feat_cv).cuda().unsqueeze(0)

                    spk_logits, _, lat_src, _ = model_encoder(feat, sampling=False)
                    logging.info('input spkpost')
                    logging.info(torch.mean(F.softmax(_trim_frames(spk_logits, outpad_lefts[0], outpad_rights[0]), dim=-1), 1))

                    if trg_exist:
                        # The target utterance only passes through the encoder,
                        # so it is padded with the encoder's own context only.
                        spk_trg_logits, _, lat_trg, _ = model_encoder(
                            _pad_frames(torch.FloatTensor(feat_trg).cuda().unsqueeze(0),
                                        model_encoder.pad_left, model_encoder.pad_right),
                            sampling=False)
                        logging.info('target spkpost')
                        logging.info(torch.mean(F.softmax(spk_trg_logits, dim=-1), 1))

                    # --- reconstruction with the source speaker code ---
                    cvmcep_src, _ = model_decoder(_spk_code(lat_src.shape[1], src_idx), lat_src)
                    cvmcep_src_post, _ = model_post(cvmcep_src, y=_spk_code(cvmcep_src.shape[1], src_idx),
                                        e=_pad_frames(feat_excit[:,:,:2], outpad_lefts[1], outpad_rights[1]))
                    # Re-encode the excitation together with the decoder output
                    # (not the post-net output), trimmed by the post-net padding.
                    spk_logits, _, lat_rec, _ = model_encoder(
                        torch.cat((_pad_frames(feat_excit, outpad_lefts[2], outpad_rights[2]),
                                   _trim_frames(cvmcep_src, model_post.pad_left, model_post.pad_right)), 2),
                        sampling=False)
                    logging.info('rec spkpost')
                    logging.info(torch.mean(F.softmax(_trim_frames(spk_logits, outpad_lefts[3], outpad_rights[3]), dim=-1), 1))

                    # --- conversion with the target speaker code ---
                    cvmcep, _ = model_decoder(_spk_code(lat_src.shape[1], trg_idx), lat_src)
                    cvmcep_post, _ = model_post(cvmcep, y=_spk_code(cvmcep.shape[1], trg_idx),
                                        e=_pad_frames(feat_excit_cv[:,:,:2], outpad_lefts[1], outpad_rights[1]))
                    spk_logits, _, lat_cv, _ = model_encoder(
                        torch.cat((_pad_frames(feat_excit_cv, outpad_lefts[2], outpad_rights[2]),
                                   _trim_frames(cvmcep, model_post.pad_left, model_post.pad_right)), 2),
                        sampling=False)
                    logging.info('cv spkpost')
                    logging.info(torch.mean(F.softmax(_trim_frames(spk_logits, outpad_lefts[3], outpad_rights[3]), dim=-1), 1))

                    # --- cyclic pass: converted latent back to the source speaker ---
                    cvmcep_cyc, _ = model_decoder(_spk_code(lat_cv.shape[1], src_idx), lat_cv)
                    cvmcep_cyc_post, _ = model_post(cvmcep_cyc, y=_spk_code(cvmcep_cyc.shape[1], src_idx),
                                        e=_pad_frames(feat_excit[:,:,:2], outpad_lefts[4], outpad_rights[4]))

                    # From here on the names hold the post-net outputs as
                    # float64 numpy arrays with the leftover padding removed.
                    cvmcep_src = np.array(_trim_frames(cvmcep_src_post, outpad_lefts[2], outpad_rights[2])[0].cpu().data.numpy(), dtype=np.float64)
                    cvmcep = np.array(_trim_frames(cvmcep_post, outpad_lefts[2], outpad_rights[2])[0].cpu().data.numpy(), dtype=np.float64)
                    cvmcep_cyc = np.array(cvmcep_cyc_post[0].cpu().data.numpy(), dtype=np.float64)

                    if trg_exist:
                        lat_src = _trim_frames(lat_src, outpad_lefts[0], outpad_rights[0])

                logging.info(cvmcep_src.shape)
                logging.info(cvmcep.shape)
                logging.info(cvmcep_cyc.shape)

                if trg_exist:
                    logging.info(lat_src.shape)
                    logging.info(lat_trg.shape)

                cvlist.append(np.var(cvmcep[:,1:], axis=0))

                logging.info("cvf0lin")
                f0_range = read_hdf5(feat_file, "/f0_range")
                cvf0_range_lin = convert_f0(f0_range, src_f0_mean, src_f0_std, trg_f0_mean, trg_f0_std)
                uv_range_lin, cont_f0_range_lin = convert_continuos_f0(np.array(cvf0_range_lin))
                unique, counts = np.unique(uv_range_lin, return_counts=True)
                logging.info(dict(zip(unique, counts)))
                # The second argument is the frame rate in Hz implied by shiftms.
                cont_f0_lpf_range_lin = \
                    low_pass_filter(cont_f0_range_lin, int(1.0 / (args.shiftms * 0.001)), cutoff=20)
                uv_range_lin = np.expand_dims(uv_range_lin, axis=-1)
                cont_f0_lpf_range_lin = np.expand_dims(cont_f0_lpf_range_lin, axis=-1)
                # plain converted feat for neural vocoder
                feat_cv = np.c_[uv_range_lin, np.log(cont_f0_lpf_range_lin), feat_cv[:,2:config.excit_dim], cvmcep]
                logging.info(feat_cv.shape)

                logging.info("mcd acc")
                # Metrics are computed only on the frames listed in /spcidx_range.
                spcidx = np.array(read_hdf5(feat_file, "/spcidx_range")[0])
                _, mcdpow_arr = dtw.calc_mcd(np.array(mcep[spcidx], dtype=np.float64), np.array(cvmcep_src[spcidx], dtype=np.float64))
                _, mcd_arr = dtw.calc_mcd(np.array(mcep[spcidx,1:], dtype=np.float64), np.array(cvmcep_src[spcidx,1:], dtype=np.float64))
                mcdpow_mean = np.mean(mcdpow_arr)
                mcdpow_std = np.std(mcdpow_arr)
                mcd_mean = np.mean(mcd_arr)
                mcd_std = np.std(mcd_arr)
                logging.info("mcdpow_src_cv: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std))
                logging.info("mcd_src_cv: %.6f dB +- %.6f" % (mcd_mean, mcd_std))
                mcdpow_cvlist_src.append(mcdpow_mean)
                mcdpowstd_cvlist_src.append(mcdpow_std)
                mcd_cvlist_src.append(mcd_mean)
                mcdstd_cvlist_src.append(mcd_std)

                if trg_exist:
                    # Converted vs. target: the utterances differ in length, so
                    # they are aligned with DTW before measuring distortion.
                    spcidx_trg = np.array(read_hdf5(file_trg, "/spcidx_range")[0])
                    _, _, _, mcdpow_arr = dtw.dtw_org_to_trg(np.array(cvmcep[spcidx],
                                                dtype=np.float64), np.array(mcep_trg[spcidx_trg], dtype=np.float64))
                    _, _, _, mcd_arr = dtw.dtw_org_to_trg(np.array(cvmcep[spcidx,1:],
                                                dtype=np.float64), np.array(mcep_trg[spcidx_trg,1:], dtype=np.float64))
                    mcdpow_mean = np.mean(mcdpow_arr)
                    mcdpow_std = np.std(mcdpow_arr)
                    mcd_mean = np.mean(mcd_arr)
                    mcd_std = np.std(mcd_arr)
                    logging.info("mcdpow_trg: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std))
                    logging.info("mcd_trg: %.6f dB +- %.6f" % (mcd_mean, mcd_std))
                    mcdpow_cvlist.append(mcdpow_mean)
                    mcdpowstd_cvlist.append(mcdpow_std)
                    mcd_cvlist.append(mcd_mean)
                    mcdstd_cvlist.append(mcd_std)

                    # Latent-space distance between source and target, measured
                    # in both alignment directions and averaged.
                    spcidx_src = torch.LongTensor(spcidx).cuda()
                    spcidx_trg = torch.LongTensor(spcidx_trg).cuda()
                    trj_lat_src = np.array(torch.index_select(lat_src[0],0,spcidx_src).cpu().data.numpy(), dtype=np.float64)
                    trj_lat_trg = np.array(torch.index_select(lat_trg[0],0,spcidx_trg).cpu().data.numpy(), dtype=np.float64)
                    aligned_lat_srctrg, _, _, _ = dtw.dtw_org_to_trg(trj_lat_src, trj_lat_trg)
                    lat_dist_srctrg = np.mean(np.sqrt(np.mean((aligned_lat_srctrg-trj_lat_trg)**2, axis=0)))
                    _, _, lat_cdist_srctrg, _ = dtw.dtw_org_to_trg(trj_lat_trg, trj_lat_src, mcd=0)
                    aligned_lat_trgsrc, _, _, _ = dtw.dtw_org_to_trg(trj_lat_trg, trj_lat_src)
                    lat_dist_trgsrc = np.mean(np.sqrt(np.mean((aligned_lat_trgsrc-trj_lat_src)**2, axis=0)))
                    _, _, lat_cdist_trgsrc, _ = dtw.dtw_org_to_trg(trj_lat_src, trj_lat_trg, mcd=0)
                    logging.info("%lf %lf %lf %lf" % (lat_dist_srctrg, lat_cdist_srctrg, lat_dist_trgsrc, lat_cdist_trgsrc))
                    lat_dist_rmse = (lat_dist_srctrg+lat_dist_trgsrc)/2
                    lat_dist_cosim = (lat_cdist_srctrg+lat_cdist_trgsrc)/2
                    lat_dist_rmse_list.append(lat_dist_rmse)
                    lat_dist_cosim_list.append(lat_dist_cosim)
                    logging.info("lat_dist: %.6f %.6f" % (lat_dist_rmse, lat_dist_cosim))

                # Input vs. cyclic reconstruction.
                _, mcdpow_arr = dtw.calc_mcd(np.array(mcep[spcidx], dtype=np.float64), np.array(cvmcep_cyc[spcidx], dtype=np.float64))
                _, mcd_arr = dtw.calc_mcd(np.array(mcep[spcidx,1:], dtype=np.float64), np.array(cvmcep_cyc[spcidx,1:], dtype=np.float64))
                mcdpow_mean = np.mean(mcdpow_arr)
                mcdpow_std = np.std(mcdpow_arr)
                mcd_mean = np.mean(mcd_arr)
                mcd_std = np.std(mcd_arr)
                logging.info("mcdpow_cyc_cv: %.6f dB +- %.6f" % (mcdpow_mean, mcdpow_std))
                logging.info("mcd_cyc_cv: %.6f dB +- %.6f" % (mcd_mean, mcd_std))
                mcdpow_cvlist_cyc.append(mcdpow_mean)
                mcdpowstd_cvlist_cyc.append(mcdpow_std)
                mcd_cvlist_cyc.append(mcd_mean)
                mcdstd_cvlist_cyc.append(mcd_std)

                # All four waveforms reuse the source aperiodicity `ap`.
                logging.info("synth anasyn")
                _synth_and_save(f0_range, sp, ap, feat_file, "_anasyn.wav")

                logging.info("synth voco rec")
                cvsp_src = ps.mc2sp(cvmcep_src, args.mcep_alpha, args.fftl)
                logging.info(cvsp_src.shape)
                _synth_and_save(f0_range, cvsp_src, ap, feat_file, "_rec.wav")

                logging.info("synth voco cv")
                cvsp = ps.mc2sp(cvmcep, args.mcep_alpha, args.fftl)
                logging.info(cvsp.shape)
                _synth_and_save(cvf0_range_lin, cvsp, ap, feat_file, "_cv.wav")

                logging.info("synth voco cv GV")
                # Scale the deviation from the utterance mean by
                # sqrt(gv_mean_trg / cvgv_mean), then blend with the unscaled
                # coefficients using args.gv_coeff; the 0th coefficient is
                # kept as is.
                datamean = np.mean(cvmcep[:,1:], axis=0)
                gv_scaled = np.sqrt(gv_mean_trg/cvgv_mean) * (cvmcep[:,1:]-datamean) + datamean
                cvmcep_gv = np.c_[cvmcep[:,0], args.gv_coeff*gv_scaled + (1-args.gv_coeff)*cvmcep[:,1:]]
                cvmcep_gv = mod_pow(cvmcep_gv, cvmcep, alpha=args.mcep_alpha, irlen=IRLEN)
                cvsp_gv = ps.mc2sp(cvmcep_gv, args.mcep_alpha, args.fftl)
                logging.info(cvsp_gv.shape)
                _synth_and_save(cvf0_range_lin, cvsp_gv, ap, feat_file, "_cvGV.wav")

                # NOTE: earlier revisions carried disabled (commented-out) code
                # here for differential-spectrum synthesis ("diffGV",
                # "diffGVF0"), its re-analysis into vocoder features, and a
                # plain-text dump of the latent trajectory. It was dead code
                # and has been dropped from this block.

                logging.info('write to h5')
                outh5dir = os.path.join(os.path.dirname(os.path.dirname(feat_file)), spk_src+"-"+args.spk_trg)
                # exist_ok avoids the check-then-create race when several
                # workers target the same output directory.
                os.makedirs(outh5dir, exist_ok=True)
                feat_file = os.path.join(outh5dir, os.path.basename(feat_file))
                # cv
                # NOTE(review): features are read with config.string_path but
                # written with args.string_path — confirm both are intended.
                write_path = args.string_path
                logging.info(feat_file + ' ' + write_path)
                logging.info(feat_cv.shape)
                write_hdf5(feat_file, write_path, feat_cv)

                count += 1