Beispiel #1
0
    def collect_features(self, wav_path, label_path):
        """Build the acoustic feature matrix for one utterance.

        Reads the wav file, runs WORLD analysis (DIO + StoneMask, CheapTrick,
        D4C), derives mel-cepstrum, interpolated log-F0, a voiced/unvoiced
        flag and coded aperiodicity, appends delta features, and removes the
        frames that the HTS label marks as silence.

        Args:
            wav_path: Path to the wav file to analyse.
            label_path: Path to the HTS label file for the same utterance.

        Returns:
            float32 array with one row per kept frame; the columns are the
            ``mgc``, ``lf0``, ``vuv`` and ``bap`` streams in that order.
        """
        sample_rate, signal = wavfile.read(wav_path)
        signal = signal.astype(np.float64)

        # WORLD analysis: coarse F0 -> refined F0 -> envelope / aperiodicity.
        coarse_f0, frame_times = pyworld.dio(signal, sample_rate,
                                             frame_period=frame_period)
        refined_f0 = pyworld.stonemask(signal, coarse_f0, frame_times,
                                       sample_rate)
        envelope = pyworld.cheaptrick(signal, refined_f0, frame_times,
                                      sample_rate)
        aperiodicity = pyworld.d4c(signal, refined_f0, frame_times,
                                   sample_rate)

        band_ap = pyworld.code_aperiodicity(aperiodicity, sample_rate)
        mel_cepstrum = pysptk.sp2mc(envelope,
                                    order=order,
                                    alpha=pysptk.util.mcepalpha(sample_rate))

        # Log-F0 on voiced frames only; the V/UV flag is taken before the
        # unvoiced gaps are filled by interpolation.
        f0_column = refined_f0[:, None]
        log_f0 = f0_column.copy()
        voiced_at = np.nonzero(f0_column)
        log_f0[voiced_at] = np.log(f0_column[voiced_at])
        voiced_flag = (log_f0 != 0).astype(np.float32)
        log_f0 = interp1d(log_f0, kind="slinear")

        stacked = np.hstack((
            apply_delta_windows(mel_cepstrum, windows),
            apply_delta_windows(log_f0, windows),
            voiced_flag,
            apply_delta_windows(band_ap, windows),
        ))

        # Trim to the label length, then drop the silence frames.
        alignment = hts.load(label_path)
        stacked = stacked[:alignment.num_frames()]
        stacked = np.delete(stacked, alignment.silence_frame_indices(), axis=0)

        return stacked.astype(np.float32)
    def extract(cls, wave: Wave, frame_period, f0_floor, f0_ceil, fft_length, order, alpha, dtype):
        """Analyse ``wave`` and pack the results into an ``AcousticFeature``.

        F0 comes from ``cls.extract_f0``; the spectral envelope and the
        aperiodicity come from WORLD's CheapTrick and D4C.  The envelope is
        additionally converted to mel-cepstrum and the aperiodicity to its
        coded form.

        Args:
            wave: Object exposing ``wave`` (samples) and ``sampling_rate``.
            frame_period: Forwarded to ``cls.extract_f0``.
            f0_floor: Forwarded to ``cls.extract_f0``.
            f0_ceil: Forwarded to ``cls.extract_f0``.
            fft_length: FFT size passed to CheapTrick and D4C.
            order: Mel-cepstrum order for ``pysptk.sp2mc``.
            alpha: All-pass constant for ``pysptk.sp2mc``.
            dtype: Passed to ``AcousticFeature.astype_only_float``.

        Returns:
            The ``AcousticFeature`` after ``astype_only_float(dtype)``.
        """
        samples = wave.wave.astype(numpy.float64)
        rate = wave.sampling_rate

        f0, times = cls.extract_f0(x=samples, fs=rate, frame_period=frame_period,
                                   f0_floor=f0_floor, f0_ceil=f0_ceil)
        envelope = pyworld.cheaptrick(samples, f0, times, rate, fft_size=fft_length)
        aperiodicity = pyworld.d4c(samples, f0, times, rate, fft_size=fft_length)

        mel_cepstrum = pysptk.sp2mc(envelope, order=order, alpha=alpha)
        coded_aperiodicity = pyworld.code_aperiodicity(aperiodicity, rate)
        voiced: numpy.ndarray = f0 != 0

        # When the sample count is not a multiple of fft_length, the final
        # frame of every track is discarded.
        if len(samples) % fft_length > 0:
            f0, envelope, aperiodicity, mel_cepstrum, coded_aperiodicity, voiced = (
                track[:-1]
                for track in (f0, envelope, aperiodicity, mel_cepstrum,
                              coded_aperiodicity, voiced)
            )

        return AcousticFeature(
            f0=f0[:, None],
            sp=envelope,
            ap=aperiodicity,
            coded_ap=coded_aperiodicity,
            mc=mel_cepstrum,
            voiced=voiced[:, None],
        ).astype_only_float(dtype)
Beispiel #3
0
    def extract(wave: Wave, frame_period, f0_floor, f0_ceil, fft_length, order,
                alpha, dtype):
        """Run Harvest-based WORLD analysis and return an ``AcousticFeature``.

        F0 is estimated with Harvest and refined with StoneMask; CheapTrick
        and D4C provide the spectral envelope and aperiodicity, which are
        also converted to mel-cepstrum and coded aperiodicity.

        Args:
            wave: Object exposing ``wave`` (samples) and ``sampling_rate``.
            frame_period: Frame period passed to Harvest.
            f0_floor: Lower F0 bound passed to Harvest.
            f0_ceil: Upper F0 bound passed to Harvest.
            fft_length: FFT size passed to CheapTrick and D4C.
            order: Mel-cepstrum order for ``pysptk.sp2mc``.
            alpha: All-pass constant for ``pysptk.sp2mc``.
            dtype: Passed to ``AcousticFeature.astype_only_float``.

        Returns:
            The converted ``AcousticFeature`` after ``validate()`` was called
            on it.
        """
        samples = wave.wave.astype(numpy.float64)
        rate = wave.sampling_rate

        harvest_options = dict(frame_period=frame_period,
                               f0_floor=f0_floor,
                               f0_ceil=f0_ceil)
        raw_f0, times = pyworld.harvest(samples, rate, **harvest_options)
        f0 = pyworld.stonemask(samples, raw_f0, times, rate)

        envelope = pyworld.cheaptrick(samples, f0, times, rate, fft_size=fft_length)
        aperiodicity = pyworld.d4c(samples, f0, times, rate, fft_size=fft_length)

        mel_cepstrum = pysptk.sp2mc(envelope, order=order, alpha=alpha)
        coded_aperiodicity = pyworld.code_aperiodicity(aperiodicity, rate)
        voiced = f0 != 0  # type: numpy.ndarray

        result = AcousticFeature(
            f0=f0[:, None],
            sp=envelope,
            ap=aperiodicity,
            coded_ap=coded_aperiodicity,
            mc=mel_cepstrum,
            voiced=voiced[:, None],
        ).astype_only_float(dtype)
        result.validate()
        return result
Beispiel #4
0
def wav2world(wavfile, frame_period):
    """Load an audio file and return its stacked WORLD features.

    The audio is loaded at ``hp.sample_rate``.  F0 is estimated with Harvest
    when ``hp.use_harvest`` is set, otherwise with DIO followed by StoneMask.
    As a side effect ``hp.num_bap`` is set to the number of coded
    aperiodicity columns.

    Args:
        wavfile: Path of the audio file.
        frame_period: Frame period handed to the F0 estimator.

    Returns:
        float32 array whose columns are mel-cepstrum, log-F0, the
        voiced/unvoiced flag and coded aperiodicity, in that order.
    """
    signal, sample_rate = librosa.load(wavfile, sr=hp.sample_rate, dtype=np.float64)

    if not hp.use_harvest:
        pitch, frame_times = pyworld.dio(signal, sample_rate, frame_period=frame_period)
        pitch = pyworld.stonemask(signal, pitch, frame_times, sample_rate)
    else:
        pitch, frame_times = pyworld.harvest(signal, sample_rate, frame_period=frame_period)

    envelope = pyworld.cheaptrick(signal, pitch, frame_times, sample_rate)
    aperiodicity = pyworld.d4c(signal, pitch, frame_times, sample_rate)
    band_ap = pyworld.code_aperiodicity(aperiodicity, sample_rate)
    hp.num_bap = band_ap.shape[1]

    mel_cepstrum = pysptk.sp2mc(envelope,
                                order=hp.num_mgc - 1,
                                alpha=pysptk.util.mcepalpha(sample_rate))

    # Log-F0 on voiced frames; unvoiced frames stay at 0.
    pitch = pitch[:, None]
    log_pitch = pitch.copy()
    voiced_at = np.nonzero(pitch)
    log_pitch[voiced_at] = np.log(pitch[voiced_at])

    if not hp.use_harvest:
        voiced_flag = (log_pitch != 0).astype(np.float32)
    else:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        voiced_flag = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]

    stacked = np.hstack((mel_cepstrum, log_pitch, voiced_flag, band_ap))
    return stacked.astype(np.float32)
 def collect_features(self, wav_path):
     """Extract WORLD-based acoustic features from one audio file.

     The audio is loaded as mono float64 through ``librosa.load`` with
     ``sr=self.target_sr`` and analysed with DIO + StoneMask, CheapTrick and
     D4C.  The voiced/unvoiced flag is derived from the refined F0 *before*
     log-F0 is interpolated, because interpolation fills the unvoiced gaps
     and would otherwise erase the information the flag is meant to carry.

     Args:
         wav_path: Path of the audio file to analyse.

     Returns:
         float32 array with one row per frame.  Columns, in order: F0,
         interpolated log-F0, voiced/unvoiced flag, coded aperiodicity,
         mel-cepstrum (order 59) and the CheapTrick spectral envelope.
     """
     # x: raw audio samples; fs: the sampling rate librosa actually used.
     x, fs = librosa.load(wav_path, sr=self.target_sr, mono=True, dtype=np.float64)

     # Use the returned rate everywhere so DIO agrees with the later stages
     # (and still works if self.target_sr is None).
     f0, timeaxis = pyworld.dio(x, fs, frame_period=self.hop_sz_in_ms)
     f0 = pyworld.stonemask(x, f0, timeaxis, fs)

     # Voiced/unvoiced decision must precede the interpolation below.
     voiced = np.nonzero(f0)
     vuv = (f0 != 0).astype(np.float32)

     # Log-F0 on voiced frames, then interpolate across the unvoiced gaps.
     lf0 = f0.copy()
     lf0[voiced] = np.log(f0[voiced])
     lf0 = interp1d(lf0, kind="slinear")

     # Spectral envelope, coded aperiodicity and mel-cepstrum.
     spec = pyworld.cheaptrick(x, f0, timeaxis, fs)
     aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
     bap = pyworld.code_aperiodicity(aperiodicity, fs)
     mgc = pysptk.sp2mc(spec, order=59, alpha=pysptk.util.mcepalpha(fs))

     # Column-wise stack; the three 1-D tracks are promoted to columns first.
     features = np.hstack((f0[:, None], lf0[:, None], vuv[:, None], bap, mgc, spec))
     return features.astype(np.float32)
Beispiel #6
0
def world_spectrogram(wav, sr=_sr, dim_num=32, **kwargs):
    """Convert a waveform to WORLD vocoder features.

    Args:
        wav: Waveform samples; converted to double precision before analysis.
        sr: Sampling rate of ``wav``.
        dim_num: Number of dimensions of the coded spectral envelope.
        **kwargs: Optional analysis settings:
            frame_period: Frame period (default ``pw.default_frame_period``).
            f0_floor: Lower F0 bound (default ``pw.default_f0_floor``).
            f0_ceil: Upper F0 bound (default ``pw.default_f0_ceil``).
            fft_size: FFT size (default from ``pw.get_cheaptrick_fft_size``).
            ap_threshold: D4C threshold (default 0.85).
            f0_extractor: ``"dio"``, ``"harvest"`` or a callable with the
                same signature as ``pw.harvest`` (default ``"dio"``).

    Returns:
        Tuple ``(f0, sp_enc, ap_enc)``: F0 track, coded spectral envelope and
        coded aperiodicity.
    """
    # Collect the analysis parameters.
    frame_period = kwargs.get("frame_period", pw.default_frame_period)
    f0_floor = kwargs.get("f0_floor", pw.default_f0_floor)
    f0_ceil = kwargs.get("f0_ceil", pw.default_f0_ceil)
    fft_size = kwargs.get("fft_size", pw.get_cheaptrick_fft_size(sr, f0_floor))
    ap_threshold = kwargs.get("ap_threshold", 0.85)
    f0_extractor = kwargs.get("f0_extractor", "dio")
    x = wav.astype(np.double)

    # Estimate F0.  Every extractor receives frame_period so that a
    # caller-supplied value is honoured regardless of the algorithm chosen.
    if f0_extractor == "dio":
        f0, t = pw.dio(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil, frame_period=frame_period)
    elif f0_extractor == "harvest":
        f0, t = pw.harvest(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil, frame_period=frame_period)
    else:
        f0, t = f0_extractor(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil, frame_period=frame_period)

    # Spectral envelope via CheapTrick, then dimensionality reduction.
    sp = pw.cheaptrick(x, f0, t, sr, f0_floor=f0_floor, fft_size=fft_size)
    sp_enc = pw.code_spectral_envelope(sp, sr, number_of_dimensions=dim_num)

    # Aperiodicity via D4C, then dimensionality reduction.
    ap = pw.d4c(x, f0, t, sr, threshold=ap_threshold, fft_size=fft_size)
    ap_enc = pw.code_aperiodicity(ap, sr)
    return f0, sp_enc, ap_enc
Beispiel #7
0
def process_wav(wav_path):
    """Analyse a raw 48 kHz PCM file with WORLD at 32 kHz.

    The file is read as headerless 16-bit little-endian mono PCM, resampled
    to 32 kHz when needed, and analysed with Harvest + StoneMask, CheapTrick
    and D4C.  Array shapes are printed along the way.

    Args:
        wav_path: Path of the raw PCM file.

    Returns:
        Tuple ``(_f0, _sp, code_sp, _ap, code_ap)``: refined F0, spectral
        envelope, its ``code_harmonic`` encoding, aperiodicity and coded
        aperiodicity.
    """
    y, osr = sf.read(wav_path, subtype='PCM_16', channels=1, samplerate=48000,
                    endian='LITTLE') #, start=56640, stop=262560)

    sr = 32000
    if osr != sr:
        # Keyword arguments: recent librosa releases reject positional rates.
        y = librosa.resample(y, orig_sr=osr, target_sr=sr)

    # Estimate F0 with Harvest and refine it with StoneMask.
    _f0, t = pw.harvest(y, sr, f0_floor=f0_min, f0_ceil=f0_max, frame_period=pw.default_frame_period)
    _f0 = pw.stonemask(y, _f0, t, sr)
    print(_f0.shape)

    # Spectral envelope via CheapTrick.
    _sp = pw.cheaptrick(y, _f0, t, sr)

    code_sp = code_harmonic(_sp, 60)
    print(_sp.shape, code_sp.shape)

    # Aperiodicity via D4C.
    _ap = pw.d4c(y, _f0, t, sr)

    code_ap = pw.code_aperiodicity(_ap, sr)
    print(_ap.shape, code_ap.shape)

    return _f0, _sp, code_sp, _ap, code_ap
def _process_feature(out_dir, index, wav_path, label_path):
    """Extract, trim and store the acoustic features of one utterance.

    The wav file is analysed with WORLD, delta features are appended, silence
    frames given by the HTS label are removed, and the result is written to
    ``out_dir`` as ``arctic_<index>.npy``.  The file stem is appended to the
    module-level ``dataset_ids`` list, which is then pickled to
    ``dataset_ids.pkl`` in the parent directory of ``out_dir``.

    Args:
        out_dir: Directory that receives the ``.npy`` file.
        index: Integer used in the output file name.
        wav_path: Path of the wav file.
        label_path: Path of the HTS label file.

    Returns:
        Tuple ``(acoustic_filename, n_frames, voiced_frames)`` where
        ``n_frames`` is the frame count before trimming and ``voiced_frames``
        the row count after the silence frames were removed.
    """
    # Sanity check: the wav directory must list a '.wav' file first.
    wav_listing = os.listdir(os.path.dirname(wav_path))
    assert wav_listing and wav_listing[0].endswith('.wav'), "no wav files found!"

    sample_rate, signal = wavfile.read(wav_path)
    signal = signal.astype(np.float64)

    pitch, frame_times = pyworld.dio(signal, sample_rate, frame_period=frame_period)
    n_frames = len(pitch)
    pitch = pyworld.stonemask(signal, pitch, frame_times, sample_rate)
    envelope = pyworld.cheaptrick(signal, pitch, frame_times, sample_rate)
    aperiodicity = pyworld.d4c(signal, pitch, frame_times, sample_rate)

    band_ap = pyworld.code_aperiodicity(aperiodicity, sample_rate)
    mel_cepstrum = pysptk.sp2mc(envelope,
                                order=order,
                                alpha=pysptk.util.mcepalpha(sample_rate))

    # Log-F0 on voiced frames; V/UV flag is taken before interpolation.
    pitch = pitch[:, None]
    log_pitch = pitch.copy()
    voiced_at = np.nonzero(pitch)
    log_pitch[voiced_at] = np.log(pitch[voiced_at])
    voiced_flag = (log_pitch != 0).astype(np.float32)
    log_pitch = interp1d(log_pitch, kind="slinear")

    stacked = np.hstack((
        apply_delta_windows(mel_cepstrum, windows),
        apply_delta_windows(log_pitch, windows),
        voiced_flag,
        apply_delta_windows(band_ap, windows),
    ))

    # Sanity check: the label directory must list a '.lab' file first.
    lab_listing = os.listdir(os.path.dirname(label_path))
    assert lab_listing and lab_listing[0].endswith('.lab'), "no lab files found!"

    # Trim to the label length, then drop the silence frames.
    alignment = hts.load(label_path)
    stacked = stacked[:alignment.num_frames()]
    stacked = np.delete(stacked, alignment.silence_frame_indices(), axis=0)
    voiced_frames = stacked.shape[0]

    # Persist the features.
    acoustic_filename = 'arctic_%05d.npy' % index
    target_path = os.path.join(out_dir, acoustic_filename)
    np.save(target_path, stacked.astype(np.float32), allow_pickle=False)

    # Record the id and rewrite the pickled id list.
    dataset_ids.append(acoustic_filename[:-4])
    ids_path = os.path.join(os.path.dirname(out_dir), 'dataset_ids.pkl')
    with open(ids_path, 'wb') as ids_file:
        pickle.dump(dataset_ids, ids_file)

    return (acoustic_filename, n_frames, voiced_frames)
Beispiel #9
0
 def _resample_down_aperiodicity(cls, feature, fs, new_fs,
                                 new_spectrum_len):
     """Re-express an aperiodicity matrix for a lower sampling rate.

     The aperiodicity is coded at ``fs``, the coded columns beyond the count
     reported by ``cls._get_aperiodicity_num(new_fs)`` are dropped, and the
     remainder is decoded at ``new_fs``.

     Args:
         feature: Aperiodicity array to convert.
         fs: Sampling rate ``feature`` was computed at.
         new_fs: Target sampling rate.
         new_spectrum_len: Target spectrum length; the decoder FFT size is
             ``(new_spectrum_len - 1) * 2``.

     Returns:
         The result of ``pyworld.decode_aperiodicity`` at ``new_fs``.
     """
     band_ap = pyworld.code_aperiodicity(np.ascontiguousarray(feature), fs)
     target_bands = cls._get_aperiodicity_num(new_fs)
     if band_ap.shape[1] > target_bands:
         # Slicing columns breaks contiguity, so make a contiguous copy.
         band_ap = np.ascontiguousarray(band_ap[:, :target_bands])
     fft_size = (new_spectrum_len - 1) * 2
     return pyworld.decode_aperiodicity(band_ap, new_fs, fft_size)
Beispiel #10
0
def get_feature(wav_path, preprocessing=False, getsize=False):
    """Extract WORLD acoustic features (with deltas) from a wav file.

    F0 is estimated with Harvest when ``audio_world_config.use_harvest`` is
    set, otherwise with DIO + StoneMask.  Mel-cepstrum, interpolated log-F0
    and coded aperiodicity get delta features appended; mel-cepstrum is
    optionally smoothed in the modulation-spectrum domain first.

    Args:
        wav_path: Path of the wav file.
        preprocessing: When true, save the features with ``np.save`` to a
            path derived from ``wav_path`` (``".wav"`` removed, remaining
            ``"wav"`` replaced by ``"world"``) and return nothing.
        getsize: When true (and ``preprocessing`` is false), also return the
            first-axis sizes of the three delta streams.

    Returns:
        ``None`` if ``preprocessing``; the tuple
        ``(features, mgc.shape[0], lf0.shape[0], bap.shape[0])`` if
        ``getsize``; otherwise the stacked feature matrix.
    """
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    if audio_world_config.use_harvest:
        f0, timeaxis = pyworld.harvest(
            x,
            fs,
            frame_period=audio_world_config.frame_period,
            f0_floor=audio_world_config.f0_floor,
            f0_ceil=audio_world_config.f0_ceil)
    else:
        f0, timeaxis = pyworld.dio(
            x,
            fs,
            frame_period=audio_world_config.frame_period,
            f0_floor=audio_world_config.f0_floor,
            f0_ceil=audio_world_config.f0_ceil)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)

    alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram,
                       order=audio_world_config.mgc_dim,
                       alpha=alpha)

    # Log-F0 on voiced frames; unvoiced frames stay at 0 until interpolation.
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if audio_world_config.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    lf0 = P.interp1d(lf0, kind=audio_world_config.f0_interpolation_kind)

    # Parameter trajectory smoothing
    if audio_world_config.mod_spec_smoothing:
        hop_length = int(fs * (audio_world_config.frame_period * 0.001))
        modfs = fs / hop_length
        mgc = P.modspec_smoothing(
            mgc, modfs, cutoff=audio_world_config.mod_spec_smoothing_cutoff)

    mgc = P.delta_features(mgc, audio_world_config.windows)
    lf0 = P.delta_features(lf0, audio_world_config.windows)
    bap = P.delta_features(bap, audio_world_config.windows)

    features = np.hstack((mgc, lf0, vuv, bap))
    if preprocessing:
        out_path = wav_path.replace(".wav", "").replace("wav", "world")
        np.save(out_path, features)
        return None
    if getsize:
        # Previously this expression referenced the undefined name `feature`
        # and was never returned, so the branch always raised NameError.
        # NOTE(review): shape[0] is kept as written; if per-stream widths
        # were intended this should be shape[1] -- confirm with callers.
        return features, mgc.shape[0], lf0.shape[0], bap.shape[0]
    return features
Beispiel #11
0
def analyze(x, fs, f0_floor, f0_ceil, frame_period=20.0, pitchshift=None):
    """Run WORLD analysis and return F0, mel-cepstrum and coded aperiodicity.

    When ``pitchshift`` is given, ``analyze_world`` is called with the
    sampling rate multiplied and the frame period divided by that factor.
    The aperiodicity is always coded with the unmodified ``fs``.

    Args:
        x: Waveform handed to ``analyze_world``.
        fs: Sampling rate of ``x``.
        f0_floor: Lower F0 bound.
        f0_ceil: Upper F0 bound.
        frame_period: Frame period (default 20.0).
        pitchshift: Optional scaling factor; ``None`` disables the scaling.

    Returns:
        Tuple ``(f0, mcep, codeap)``.
    """
    if pitchshift is None:
        analysis_fs, analysis_period = fs, frame_period
    else:
        analysis_fs, analysis_period = fs * pitchshift, frame_period / pitchshift

    f0, envelope, aperiodicity = analyze_world(x, analysis_fs, f0_floor, f0_ceil,
                                               analysis_period)
    # Order-24 mel-cepstrum with a fixed all-pass constant of 0.410.
    mcep = pysptk.sp2mc(envelope, 24, 0.410)
    codeap = pyworld.code_aperiodicity(aperiodicity, fs)

    return f0, mcep, codeap
Beispiel #12
0
def get_features(x, fs):
    """Return F0, mel-cepstrum and coded aperiodicity for a waveform.

    Args:
        x: Waveform samples.
        fs: Sampling rate of ``x``.

    Returns:
        Tuple ``(f0, mcep, bap)``.  ``mcep`` is computed from the spectral
        envelope after ``trim_zeros_frames`` was applied to it; ``f0`` and
        ``bap`` are not trimmed.
    """
    # F0: DIO estimate refined by StoneMask.
    coarse_f0, times = pw.dio(x, fs)
    f0 = pw.stonemask(x, coarse_f0, times, fs)

    # Mel-cepstrum (order 24) from the trimmed CheapTrick envelope.
    envelope = pw.cheaptrick(x, f0, times, fs)
    mcep = pysptk.sp2mc(trim_zeros_frames(envelope),
                        order=24,
                        alpha=pysptk.util.mcepalpha(fs))

    # Coded aperiodicity from D4C.
    bap = pw.code_aperiodicity(pw.d4c(x, f0, times, fs), fs)
    return f0, mcep, bap
Beispiel #13
0
def _process_utterance(out_dir, index, wav_path, text, phone):
    '''Preprocess a single utterance and write its features to disk.

    The linear spectrogram is always written.  Depending on
    ``hparams.vocoder`` the second file holds either WORLD features
    (F0, coded spectral envelope, coded aperiodicity) or a mel spectrogram.

    Args:
      out_dir: The directory to write the feature files into
      index: The numeric index to use in the output filenames
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file (returned unchanged)
      phone: Passed through unchanged into the returned tuple

    Returns:
      A (spectrogram_filename, encoded_filename, n_frames, text, phone) tuple
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        wav = wav / np.abs(wav).max() * hparams.rescaling_max

    # The linear-scale spectrogram is needed by both vocoder settings.
    linear_spec = audio.spectrogram(wav).astype(np.float32)
    spectrogram_filename = 'synpaflex-spec-%05d.npy' % index

    if hparams.vocoder=="world":
        f0, envelope, aperiodicity = pw.wav2world(wav.astype(np.double), hparams.sample_rate)
        coded_ap = pw.code_aperiodicity(aperiodicity, hparams.sample_rate)
        coded_env = pw.code_spectral_envelope(envelope, hparams.sample_rate, hparams.coded_env_dim)

        encoded = np.hstack([f0[:, np.newaxis], coded_env, coded_ap])
        n_frames = encoded.shape[0]
        encoded_filename = 'synpaflex-world-%05d.npy' % index
    else:
        n_frames = linear_spec.shape[1]
        # Mel-scale spectrogram, stored transposed like the linear one.
        encoded = audio.melspectrogram(wav).astype(np.float32).T
        encoded_filename = 'synpaflex-mel-%05d.npy' % index

    # Write both arrays to disk:
    np.save(os.path.join(out_dir, spectrogram_filename), linear_spec.T, allow_pickle=False)
    np.save(os.path.join(out_dir, encoded_filename), encoded, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, encoded_filename, n_frames, text, phone)
def process(filename):
    '''
    Decompose a wav file into log-F0, mel-cepstral coefficients and coded
    aperiodicity, and write each stream as a float32 binary file.

    F0 comes from ``pysptk.rapt``; ``pyworld.harvest`` is run only to obtain
    the time axis used by CheapTrick and D4C.

    :param filename: path to wav file
    :return: None; writes ``<id>.lf0``, ``<id>.mgc`` and ``<id>.bap`` into
        ``lf0_dir``, ``mgc_dir`` and ``bap_dir``
    '''
    # The id is everything before the first dot of the base name.
    file_id = os.path.basename(filename).split(".")[0]
    print('\n' + file_id)

    ### WORLD ANALYSIS -- extract vocoder parameters ###
    fs, x = wavfile.read(filename)
    # The all-pass constant depends on the sampling rate.
    alpha = pysptk.util.mcepalpha(fs)
    hop_size = int(0.005 * fs)

    pitch = pysptk.rapt(x.astype(np.float32),
                        fs=fs,
                        hopsize=hop_size,
                        min=60,
                        max=600,
                        voice_bias=0.0,
                        otype=1).astype(np.float64)

    # Scale the samples by 1 / 2**15 for the WORLD analysis.
    x = x.astype(np.float64) / (2**15)
    _, frame_times = pyworld.harvest(x,
                                     fs,
                                     frame_period=5,
                                     f0_floor=60.0,
                                     f0_ceil=600)
    envelope = pyworld.cheaptrick(x, pitch, frame_times, fs)
    aperiodicity = pyworld.d4c(x, pitch, frame_times, fs)

    # Log-F0: natural log on voiced frames, -1e10 marker on unvoiced ones.
    pitch = pitch[:, None]
    log_pitch = pitch.astype(np.float32)
    is_voiced = pitch != 0
    log_pitch[is_voiced] = np.log(pitch[is_voiced])
    log_pitch[~is_voiced] = -1.0E+10
    write_binfile(log_pitch,
                  os.path.join(lf0_dir, file_id + '.lf0'),
                  dtype=np.float32)

    mel_cepstrum = pysptk.sp2mc(envelope, mcsize, alpha=alpha).astype(np.float32)
    write_binfile(mel_cepstrum,
                  os.path.join(mgc_dir, file_id + '.mgc'),
                  dtype=np.float32)

    band_ap = pyworld.code_aperiodicity(aperiodicity, fs).astype(np.float32)
    write_binfile(band_ap,
                  os.path.join(bap_dir, file_id + '.bap'),
                  dtype=np.float32)
Beispiel #15
0
    def feature_extract(wav_list, arr):
        """Extract WORLD features for each wav file and store them as HDF5.

        For every file the aperiodicity, F0, log continuous F0 (low-pass
        filtered at 20 Hz), U/V flags, coded aperiodicity and mel-cepstrum
        are written to ``<args.hdf5dir>/<basename>.h5``.  The process exits
        with status 1 if a file's sampling rate differs from ``args.fs``.

        Args:
            wav_list: Iterable of wav file paths.
            arr: Not used by this function body; kept for the caller's
                signature.
        """
        n_sample = 0
        # `count` is the 1-based position of the file; it was previously
        # initialised to 1 and never advanced, so every log line said "1".
        for count, wav_name in enumerate(wav_list, start=1):
            # load wavfile and apply low cut filter
            fs, x = read_wav(wav_name, cutoff=70)
            n_sample += x.shape[0]
            logging.info(wav_name + " " + str(x.shape[0]) + " " +
                         str(n_sample) + " " + str(count))

            # check sampling frequency; this is fatal, so log it as an error
            if not fs == args.fs:
                logging.error("ERROR: sampling frequency is not matched.")
                sys.exit(1)

            hdf5name = args.hdf5dir + "/" + os.path.basename(wav_name).replace(
                ".wav", ".h5")

            # estimate f0 and ap
            time_axis, f0, spc, ap = analyze_range(x,
                                                   fs=args.fs,
                                                   minf0=minf0,
                                                   maxf0=maxf0,
                                                   fperiod=args.shiftms,
                                                   fftl=args.fftl)
            write_hdf5(hdf5name, '/ap', ap)
            write_hdf5(hdf5name, "/f0", f0)

            # convert to continuous f0 and low-pass filter
            uv, cont_f0 = convert_continuos_f0(np.array(f0))
            cont_f0_lpf = low_pass_filter(cont_f0,
                                          int(1.0 / (args.shiftms * 0.001)),
                                          cutoff=20)

            # store both tracks as column vectors
            cont_f0_lpf = np.expand_dims(cont_f0_lpf, axis=-1)
            uv = np.expand_dims(uv, axis=-1)

            write_hdf5(hdf5name, "/lcf0", np.log(cont_f0_lpf))
            write_hdf5(hdf5name, "/uv", uv)

            # estimate codeap
            codeap = pw.code_aperiodicity(ap, args.fs)
            if codeap.ndim == 1:
                # when fs == 16000
                codeap = np.expand_dims(codeap, axis=-1)
            write_hdf5(hdf5name, "/codeap", codeap)

            # mcep
            mcep = ps.sp2mc(spc, args.mcep_dim, mcep_alpha)
            write_hdf5(hdf5name, "/mcep", mcep)
Beispiel #16
0
 def _resample_up_aperiodicity(cls, feature, fs, new_fs, new_spectrum_len):
     """Re-express an aperiodicity matrix for a higher sampling rate.

     The aperiodicity is coded at ``fs``.  If the target rate needs more
     coded columns than are available, one extra column filled with
     ``-cls.SAFE_GUARD_MINIMUM`` is appended at position
     ``new_fs / 2 / cls.FREQUENCY_INTERVAL - 1`` and the missing columns are
     obtained by interpolating along the column axis.  The result is decoded
     at ``new_fs``.

     Args:
         feature: Aperiodicity array to convert.
         fs: Sampling rate ``feature`` was computed at.
         new_fs: Target sampling rate.
         new_spectrum_len: Target spectrum length; the decoder FFT size is
             ``(new_spectrum_len - 1) * 2``.

     Returns:
         The result of ``pyworld.decode_aperiodicity`` at ``new_fs``.
     """
     band_ap = pyworld.code_aperiodicity(np.ascontiguousarray(feature), fs)
     target_bands = cls._get_aperiodicity_num(new_fs)
     current_bands = band_ap.shape[1]

     if target_bands > current_bands:
         # Positions of the existing columns plus one closing anchor.
         last_position = new_fs / 2 / cls.FREQUENCY_INTERVAL - 1
         positions = np.hstack((np.arange(current_bands), last_position))

         anchor_column = np.full((band_ap.shape[0], 1), -cls.SAFE_GUARD_MINIMUM)
         anchored = np.hstack((band_ap, anchor_column))

         interpolate = scipy.interpolate.interp1d(positions, anchored, axis=1)
         band_ap = np.ascontiguousarray(interpolate(np.arange(target_bands)))

     fft_size = (new_spectrum_len - 1) * 2
     return pyworld.decode_aperiodicity(band_ap, new_fs, fft_size)
Beispiel #17
0
    def collect_features(self, wav_path, label_path):
        """Build the acoustic feature matrix for one utterance.

        F0 is estimated with Harvest when ``hp_acoustic.use_harvest`` is set,
        otherwise with DIO + StoneMask.  ``self.alpha`` is filled in from the
        file's sampling rate the first time it is found to be ``None``.
        Mel-cepstrum is optionally smoothed in the modulation-spectrum
        domain, delta features are appended, and silence frames given by the
        HTS label are removed.

        Args:
            wav_path: Path to the wav file to analyse.
            label_path: Path to the HTS label file for the same utterance.

        Returns:
            float32 array with one row per kept frame; the columns are the
            ``mgc``, ``lf0``, ``vuv`` and ``bap`` streams in that order.
        """
        sample_rate, signal = wavfile.read(wav_path)
        signal = signal.astype(np.float64)

        if not hp_acoustic.use_harvest:
            pitch, frame_times = pyworld.dio(
                signal, sample_rate, frame_period=hp_acoustic.frame_period,
                f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
            pitch = pyworld.stonemask(signal, pitch, frame_times, sample_rate)
        else:
            pitch, frame_times = pyworld.harvest(
                signal, sample_rate, frame_period=hp_acoustic.frame_period,
                f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)

        envelope = pyworld.cheaptrick(signal, pitch, frame_times, sample_rate)
        aperiodicity = pyworld.d4c(signal, pitch, frame_times, sample_rate)
        band_ap = pyworld.code_aperiodicity(aperiodicity, sample_rate)

        # Lazily derive the all-pass constant from the first file seen.
        if self.alpha is None:
            self.alpha = pysptk.util.mcepalpha(sample_rate)
        mel_cepstrum = pysptk.sp2mc(envelope, order=hp_acoustic.order,
                                    alpha=self.alpha)

        # Log-F0 on voiced frames; unvoiced frames stay at 0 for now.
        pitch = pitch[:, None]
        log_pitch = pitch.copy()
        voiced_at = np.nonzero(pitch)
        log_pitch[voiced_at] = np.log(pitch[voiced_at])

        if not hp_acoustic.use_harvest:
            voiced_flag = (log_pitch != 0).astype(np.float32)
        else:
            # https://github.com/mmorise/World/issues/35#issuecomment-306521887
            voiced_flag = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
        log_pitch = P.interp1d(log_pitch,
                               kind=hp_acoustic.f0_interpolation_kind)

        # Parameter trajectory smoothing
        if hp_acoustic.mod_spec_smoothing:
            hop_length = int(sample_rate * (hp_acoustic.frame_period * 0.001))
            modulation_fs = sample_rate / hop_length
            mel_cepstrum = P.modspec_smoothing(
                mel_cepstrum, modulation_fs,
                cutoff=hp_acoustic.mod_spec_smoothing_cutoff)

        stacked = np.hstack((
            P.delta_features(mel_cepstrum, hp_acoustic.windows),
            P.delta_features(log_pitch, hp_acoustic.windows),
            voiced_flag,
            P.delta_features(band_ap, hp_acoustic.windows),
        ))

        # Trim to the label length, then drop the silence frames.
        alignment = hts.load(label_path)
        stacked = stacked[:alignment.num_frames()]
        stacked = np.delete(stacked, alignment.silence_frame_indices(), axis=0)

        return stacked.astype(np.float32)
Beispiel #18
0
def get_acoustic_feature(lab_path, wav_path, sampling_rate, hop_size_in_ms,
                         mcep_order, windows):
    """Extract silence-trimmed WORLD features for one utterance.

    Args:
        lab_path: Path of the HTS label file.
        wav_path: Path of the wav file.
        sampling_rate: Rate the analysis runs at; the audio is resampled to
            it when the file's rate differs.
        hop_size_in_ms: Frame period handed to DIO.
        mcep_order: Mel-cepstrum order for ``pysptk.sp2mc``.
        windows: Delta windows for ``apply_delta_windows``.

    Returns:
        float32 array with the ``mgc``, ``lf0``, ``vuv`` and ``bap`` streams
        stacked column-wise, or ``None`` when the label describes more
        frames than were extracted.
    """
    fs, audio = wavfile.read(wav_path)
    audio = audio.astype(np.float64) / 2**15
    if fs != sampling_rate:
        audio = audio.astype(np.float32)
        # Keyword arguments: recent librosa releases reject positional rates.
        audio = librosa.resample(audio, orig_sr=fs, target_sr=sampling_rate)
        # NOTE(review): only this branch scales back by 2**15; without
        # resampling the signal stays divided by 2**15. Confirm which
        # amplitude scale the downstream model expects.
        audio = (audio * 2**15).astype(np.float64)
    # extract f0
    f0, timeaxis = pyworld.dio(audio,
                               sampling_rate,
                               frame_period=hop_size_in_ms)
    # refine f0
    f0 = pyworld.stonemask(audio, f0, timeaxis, sampling_rate)
    # voiced/unvoiced flag
    vuv = (f0 > 0)[:, None].astype(np.float32)
    # calculate log f0
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    # interpolate f0 in log-domain
    lf0 = interp1d(lf0, kind='slinear')[:, None]

    # calculate mel-cepstrum
    spectrogram = pyworld.cheaptrick(audio, f0, timeaxis, sampling_rate)
    mgc = pysptk.sp2mc(spectrogram,
                       order=mcep_order,
                       alpha=pysptk.util.mcepalpha(sampling_rate))
    # calculate aperiodicity parameter
    aperiodicity = pyworld.d4c(audio, f0, timeaxis, sampling_rate)
    bap = pyworld.code_aperiodicity(aperiodicity, sampling_rate)

    # calculate dynamic features
    mgc = apply_delta_windows(mgc, windows)
    lf0 = apply_delta_windows(lf0, windows)
    bap = apply_delta_windows(bap, windows)

    feature = np.hstack((mgc, lf0, vuv, bap))

    # cut silence frames by HTS alignment
    labels = hts.load(lab_path)
    # Check the lengths before truncating: a label longer than the audio
    # means the alignment cannot be applied.
    if labels.num_frames() > len(feature):
        return None
    feature = feature[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    feature = np.delete(feature, indices, axis=0)

    return feature.astype(np.float32)
Beispiel #19
0
    def codeap(self):
        """Encode the stored aperiodicity with WORLD's band coding

        ``self._analyzed_check()`` is called first.

        Returns
        -------
        codeap : array, shape (`T`, `dim`)
            Coded aperiodicity of the waveform.
            `dim` depends on the sampling rate `fs`:
            fs = `16000` : `1`
            fs = `22050` : `2`
            fs = `44100` : `5`
            fs = `48000` : `5`
        """
        self._analyzed_check()
        coded = pyworld.code_aperiodicity(self._ap, self.fs)
        return coded
Beispiel #20
0
def _extract_static_feats(wav, sr):
    """Compute static WORLD features and the width of each stream.

    F0 comes from DIO with a 5 ms frame period (no StoneMask refinement).

    Args:
        wav: Waveform samples.
        sr: Sampling rate of ``wav``.

    Returns:
        Tuple ``(feats, stream_sizes)``: ``feats`` is the float32 matrix of
        mel-cepstrum (order 59), interpolated log-F0, voiced/unvoiced flag
        and coded aperiodicity; ``stream_sizes`` lists the column count of
        each of those four streams in the same order.
    """
    pitch, frame_times = pyworld.dio(wav, sr, frame_period=5)
    envelope = pyworld.cheaptrick(wav, pitch, frame_times, sr)
    aperiodicity = pyworld.d4c(wav, pitch, frame_times, sr)

    mel_cepstrum = pysptk.sp2mc(envelope, order=59, alpha=pysptk.util.mcepalpha(sr))

    # Log-F0 on voiced frames; the V/UV flag is taken before interpolation.
    pitch = pitch[:, None]
    log_pitch = pitch.copy()
    voiced_at = np.nonzero(pitch)
    log_pitch[voiced_at] = np.log(pitch[voiced_at])
    voiced_flag = (log_pitch != 0).astype(np.float32)
    log_pitch = interp1d(log_pitch, kind="slinear")

    band_ap = pyworld.code_aperiodicity(aperiodicity, sr)

    streams = (mel_cepstrum, log_pitch, voiced_flag, band_ap)
    feats = np.hstack(streams).astype(np.float32)
    stream_sizes = [stream.shape[1] for stream in streams]

    return feats, stream_sizes
Beispiel #21
0
    def collect_features(self, wav_path, label_path):
        """Build the acoustic feature matrix for one utterance.

        The file is read with ``wavio``; for multi-channel data only the
        first channel is analysed.  The sampling rate and path are printed.
        WORLD analysis, delta features and HTS-based silence removal follow.

        Args:
            wav_path: Path to the wav file to analyse.
            label_path: Path to the HTS label file for the same utterance.

        Returns:
            float32 array with one row per kept frame; the columns are the
            ``mgc``, ``lf0``, ``vuv`` and ``bap`` streams in that order.
        """
        recording = wavio.read(wav_path)
        sample_rate, signal = recording.rate, recording.data
        print(sample_rate, wav_path)

        # Keep only the first channel of multi-channel recordings.
        if signal.ndim > 1:
            signal = signal[:, 0]
        signal = signal.astype(np.float64)

        pitch, frame_times = pyworld.dio(signal, sample_rate,
                                         frame_period=frame_period)
        pitch = pyworld.stonemask(signal, pitch, frame_times, sample_rate)
        envelope = pyworld.cheaptrick(signal, pitch, frame_times, sample_rate)
        aperiodicity = pyworld.d4c(signal, pitch, frame_times, sample_rate)

        band_ap = pyworld.code_aperiodicity(aperiodicity, sample_rate)
        mel_cepstrum = pysptk.sp2mc(envelope,
                                    order=order,
                                    alpha=pysptk.util.mcepalpha(sample_rate))

        # Log-F0 on voiced frames; the V/UV flag is taken before interpolation.
        pitch = pitch[:, None]
        log_pitch = pitch.copy()
        voiced_at = np.nonzero(pitch)
        log_pitch[voiced_at] = np.log(pitch[voiced_at])
        voiced_flag = (log_pitch != 0).astype(np.float32)
        log_pitch = interp1d(log_pitch, kind="slinear")

        stacked = np.hstack((
            apply_delta_windows(mel_cepstrum, windows),
            apply_delta_windows(log_pitch, windows),
            voiced_flag,
            apply_delta_windows(band_ap, windows),
        ))

        # Trim to the label length, then drop silence frames if there are any.
        alignment = hts.load(label_path)
        stacked = stacked[:alignment.num_frames()]
        silence = alignment.silence_frame_indices()
        if len(silence) > 0:
            stacked = np.delete(stacked, silence, axis=0)

        return stacked.astype(np.float32)
Beispiel #22
0
def extract_timbre_data(args):
    """Compute spectral and aperiodic timbre features for one recording.

    Args:
        args: 4-tuple ``(audio_data, frequency, timing, sample_rate)`` as
            expected by ``pyworld.cheaptrick`` / ``pyworld.d4c`` (waveform,
            F0 track, frame times, sampling rate).

    Returns:
        Tuple ``(spectral_data, aperiodic_data)`` of pandas DataFrames: the
        real part of the FFT of the mirrored mel-cepstrum, and the coded
        aperiodicity.
    """
    waveform, pitch, frame_times, sample_rate = args

    envelope = pyworld.cheaptrick(waveform, pitch, frame_times, sample_rate)
    aperiodicity = pyworld.d4c(waveform, pitch, frame_times, sample_rate)

    # Step 1: mel-cepstral analysis of every frame (row) of the envelope,
    # with an amplitude floor of -80 dB.
    floor = 10**(-80 / 20)
    mel_cepstrum = np.apply_along_axis(pysptk.mcep,
                                       1,
                                       envelope,
                                       params.mcep_order - 1,
                                       params.mcep_alpha,
                                       itype=params.mcep_input,
                                       threshold=floor)

    # Step 2: double the first and the last coefficient of each frame.
    scaled = copy.copy(mel_cepstrum)
    scaled[:, 0] *= 2
    scaled[:, -1] *= 2

    # Step 3: mirror each row -- all columns but the last, followed by the
    # columns from the last down to index 1 in reverse order.
    mirrored = np.hstack([scaled[:, :-1], scaled[:, -1:0:-1]])

    # Step 4: the real part of the FFT of the mirrored cepstrum; presumably
    # the mel-frequency spectral coefficients the original author describes.
    spectral_frame = pd.DataFrame(np.fft.rfft(mirrored).real)

    aperiodic_frame = pd.DataFrame(
        pyworld.code_aperiodicity(aperiodicity, sample_rate))

    return spectral_frame, aperiodic_frame
Beispiel #23
0
    def collect_features(self, wav_path, label_path):
        """Build the acoustic feature matrix for one utterance.

        WORLD analysis (DIO + StoneMask, CheapTrick, D4C) is followed by
        modulation-spectrum smoothing of the mel-cepstrum with a fixed
        cutoff of 50, delta features and HTS-based silence removal.
        ``self.alpha`` is filled in from the file's sampling rate the first
        time it is found to be ``None``.

        Args:
            wav_path: Path to the wav file to analyse.
            label_path: Path to the HTS label file for the same utterance.

        Returns:
            float32 array with one row per kept frame; the columns are the
            ``mgc``, ``lf0``, ``vuv`` and ``bap`` streams in that order.
        """
        sample_rate, signal = wavfile.read(wav_path)
        signal = signal.astype(np.float64)

        pitch, frame_times = pyworld.dio(signal,
                                         sample_rate,
                                         frame_period=hp_acoustic.frame_period)
        pitch = pyworld.stonemask(signal, pitch, frame_times, sample_rate)
        envelope = pyworld.cheaptrick(signal, pitch, frame_times, sample_rate)
        aperiodicity = pyworld.d4c(signal, pitch, frame_times, sample_rate)

        band_ap = pyworld.code_aperiodicity(aperiodicity, sample_rate)

        # Lazily derive the all-pass constant from the first file seen.
        if self.alpha is None:
            self.alpha = pysptk.util.mcepalpha(sample_rate)
        mel_cepstrum = pysptk.sp2mc(envelope,
                                    order=hp_acoustic.order,
                                    alpha=self.alpha)

        # Log-F0 on voiced frames; the V/UV flag is taken before interpolation.
        pitch = pitch[:, None]
        log_pitch = pitch.copy()
        voiced_at = np.nonzero(pitch)
        log_pitch[voiced_at] = np.log(pitch[voiced_at])
        voiced_flag = (log_pitch != 0).astype(np.float32)
        log_pitch = P.interp1d(log_pitch,
                               kind=hp_acoustic.f0_interpolation_kind)

        # 50hz parameter trajectory smoothing
        hop_length = int(sample_rate * (hp_acoustic.frame_period * 0.001))
        modulation_fs = sample_rate / hop_length
        mel_cepstrum = P.modspec_smoothing(mel_cepstrum, modulation_fs,
                                           cutoff=50)

        stacked = np.hstack((
            P.delta_features(mel_cepstrum, hp_acoustic.windows),
            P.delta_features(log_pitch, hp_acoustic.windows),
            voiced_flag,
            P.delta_features(band_ap, hp_acoustic.windows),
        ))

        # Trim to the label length, then drop the silence frames.
        alignment = hts.load(label_path)
        stacked = stacked[:alignment.num_frames()]
        stacked = np.delete(stacked, alignment.silence_frame_indices(), axis=0)

        return stacked.astype(np.float32)
Beispiel #24
0
def process_wav(wav_path):
    """Analyse one recording with WORLD, save features, and write a test resynthesis.

    The file is read with explicit raw-format parameters (16-bit PCM, mono,
    48 kHz, little-endian), resampled to 32 kHz, and analysed with DIO,
    CheapTrick and D4C.  F0 and the coded aperiodicity are saved under
    ``data/prepared_data/`` and a resynthesis with F0 lowered by 200 Hz is
    written to ``./data/gen_wav/test-200.wav``.

    Args:
        wav_path: path of the audio file handed to ``sf.read``.

    Returns:
        None.  Results are written to disk and shapes are printed.
    """
    y, osr = sf.read(wav_path,
                     subtype='PCM_16',
                     channels=1,
                     samplerate=48000,
                     endian='LITTLE')  # , start=56640, stop=262560)

    sr = 32000
    # Sample rates are passed by keyword: librosa >= 0.10 made them
    # keyword-only, and the same names are accepted by older releases.
    y = librosa.resample(y, orig_sr=osr, target_sr=sr)

    # Estimate the fundamental frequency (F0) with the DIO algorithm.
    _f0, t = pw.dio(y,
                    sr,
                    f0_floor=50.0,
                    f0_ceil=800.0,
                    channels_in_octave=2,
                    frame_period=pw.default_frame_period)
    print(_f0.shape)

    # Estimate the spectral envelope with the CheapTrick algorithm.
    _sp = pw.cheaptrick(y, _f0, t, sr)

    code_sp = pw.code_spectral_envelope(_sp, sr, 60)
    print(_sp.shape, code_sp.shape)
    # Estimate the aperiodicity with D4C.
    _ap = pw.d4c(y, _f0, t, sr)

    code_ap = pw.code_aperiodicity(_ap, sr)
    print(_ap.shape, code_ap.shape)

    np.save('data/prepared_data/f0', _f0)
    np.save('data/prepared_data/ap', code_ap)

    # Resynthesise with every F0 value lowered by 200 Hz.
    # NOTE(review): the shift also makes frames with f0 == 0 (and any frame
    # below 200 Hz) negative -- confirm this is the intended experiment.
    synthesized = pw.synthesize(_f0 - 200, _sp, _ap, sr,
                                pw.default_frame_period)
    # Write the resynthesised audio.
    sf.write('./data/gen_wav/test-200.wav', synthesized, sr)
Beispiel #25
0
    def collect_features(self, wav_path):
        '''
        Extract WORLD features from the first 8 seconds of a wav file.

        Args:
            wav_path: str
                - path to the wav file; when the file has several channels
                  only the first one is analysed

        Returns:
            x: np.ndarray (T,)  - time domain audio signal (float64, first
                                  channel, at most fs * 8 samples)
            mgc: np.ndarray     - output of pysptk.sp2mc after
                                  apply_delta_windows
            lf0: np.ndarray     - log-F0 column (0 where f0 is 0) after
                                  apply_delta_windows
            f0: np.ndarray      - StoneMask-refined F0 as a column vector
            bap: np.ndarray     - coded aperiodicity after
                                  apply_delta_windows
            vuv: np.ndarray     - float32 flag, 1.0 where lf0 != 0
            fs: int             - sample rate returned by wavfile.read
            timeaxis: np.ndarray - frame times returned by pyworld.dio
        '''
        fs, x = wavfile.read(wav_path)
        # Use the first channel; mono files are already one-dimensional
        # (the previous `g, f = x.T` unpacking only worked for stereo).
        first_channel = x if x.ndim == 1 else x[:, 0]
        x = first_channel[:fs * 8].astype(np.float64)

        f0, timeaxis = pyworld.dio(x, fs, frame_period=self.frame_period)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
        spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
        aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

        bap = pyworld.code_aperiodicity(aperiodicity, fs)
        mgc = pysptk.sp2mc(spectrogram,
                           order=self.order,
                           alpha=pysptk.util.mcepalpha(fs))
        f0 = f0[:, None]
        lf0 = f0.copy()
        nonzero_indices = np.nonzero(f0)
        lf0[nonzero_indices] = np.log(f0[nonzero_indices])
        vuv = (lf0 != 0).astype(np.float32)

        mgc = apply_delta_windows(mgc, self.windows)
        lf0 = apply_delta_windows(lf0, self.windows)
        bap = apply_delta_windows(bap, self.windows)

        # The streams are returned separately; no stacked matrix is built.
        return x, mgc, lf0, f0, bap, vuv, fs, timeaxis
Beispiel #26
0
def process_utterance(wav: np.ndarray,
                      text: str,
                      out_dir: Path,
                      basename: str,
                      skip_existing: bool,
                      hparams,
                      random_uttBasename_forSpkEmbedding=None):
    """Extract WORLD vocoder features for one utterance and save them to disk.

    The waveform is silence-trimmed, analysed with ``pw.wav2world`` and the
    normalised streams are concatenated per frame as ``[lf0, mgc, bap]``.
    That matrix is written to ``<out_dir>/mels/mel-<basename>.npy`` and the
    trimmed float64 waveform to ``<out_dir>/audio/audio-<basename>.npy``.
    (The "mel" naming is kept although the content is WORLD features.)

    Args:
        wav: utterance samples.
        text: transcript, passed through to the returned tuple.
        out_dir: root directory containing the ``mels`` and ``audio`` folders.
        basename: utterance identifier used in the output file names.
        skip_existing: when true, return early if both outputs already exist.
        hparams: must provide ``utterance_min_duration``, ``sample_rate``,
            ``vad_window_length``, ``max_mel_frames`` and ``clip_mels_length``.
        random_uttBasename_forSpkEmbedding: if not None, the embedding file
            name in the result refers to this utterance instead of `basename`.

    Returns:
        ``None`` when the utterance is skipped (already processed, shorter
        than the minimum duration, or too long while clipping is enabled);
        otherwise the tuple ``(audio file name, feature file name, embed file
        name, number of samples, number of frames, text)``.
    """
    mel_fpath = out_dir / "mels" / ("mel-%s.npy" % basename)
    wav_fpath = out_dir / "audio" / ("audio-%s.npy" % basename)

    # Nothing to do when both outputs are already on disk.
    if skip_existing and mel_fpath.exists() and wav_fpath.exists():
        return None

    # Reject utterances below the minimum duration (checked before trimming).
    min_samples = hparams.utterance_min_duration * hparams.sample_rate
    if len(wav) < min_samples:
        return None

    trimmed = trim_long_silences(wav, hparams.vad_window_length,
                                 hparams.sample_rate)
    wav = trimmed.astype(np.float64)

    # WORLD analysis followed by coding of the aperiodicity.
    f0, sp, ap = pw.wav2world(wav, hparams.sample_rate)
    n_frames = len(f0)
    coded_ap = pw.code_aperiodicity(ap, hparams.sample_rate)

    # Per-stream normalisation.
    lf0 = audio.f0_normalize(f0)
    mgc = audio.sp_normalize(sp, hparams)
    bap = audio.ap_normalize(coded_ap)

    # One row per frame: lf0 column first, then mgc, then bap.
    lf0_column = np.reshape(lf0, [n_frames, 1])
    mel_spectrogram = np.concatenate((lf0_column, mgc, bap), axis=-1)

    # Reject over-long utterances when clipping is enabled.
    if n_frames > hparams.max_mel_frames and hparams.clip_mels_length:
        return None

    np.save(mel_fpath, mel_spectrogram, allow_pickle=False)
    np.save(wav_fpath, wav, allow_pickle=False)

    if random_uttBasename_forSpkEmbedding is None:
        embed_basename = basename
    else:
        embed_basename = random_uttBasename_forSpkEmbedding
    embed_name = "embed-%s.npy" % embed_basename

    return wav_fpath.name, mel_fpath.name, embed_name, len(wav), n_frames, text
Beispiel #27
0
    def collect_features(self, wav_path, label_path):
        """Build frame-level acoustic targets and the matching waveform.

        Linguistic features are derived from the HTS label file to obtain the
        note pitches of the score; the F0 search range is set one whole tone
        (2 semitones) beyond the lowest and highest note.  The wav file is
        analysed with WORLD and the targets are stacked as
        ``[mgc, f0_target, vuv, bap]`` with delta windows applied to mgc,
        f0_target and bap.

        Args:
            wav_path: path handed to ``wavfile.read``.
            label_path: path handed to ``hts.load``.

        Returns:
            (features, wave): ``features`` is a float32 matrix with one row
            per frame; ``wave`` is the float32 waveform divided by 2**15 and
            padded/cut to ``int(n_frames * fs * frame_period / 1000)`` samples.

        Notes:
            * With ``self.use_harvest`` F0 comes from Harvest and the voicing
              flag from the first aperiodicity band; otherwise DIO + StoneMask
              is used and voicing is ``lf0 != 0``.
            * With ``self.relative_f0`` the F0 target is the log-F0 difference
              to the score F0, clipped to [log 0.5, log 2].
            * The 2**15 scaling presumably assumes 16-bit PCM input -- TODO
              confirm.
        """
        import warnings

        labels = hts.load(label_path)
        l_features = fe.linguistic_features(
            labels, self.binary_dict, self.continuous_dict,
            add_frame_features=True,
            subphone_features="coarse_coding")

        f0_score = _midi_to_hz(l_features, self.pitch_idx, False)
        notes = l_features[:, self.pitch_idx]
        notes = notes[notes > 0]
        # allow 1-tone upper/lower
        min_f0 = librosa.midi_to_hz(min(notes) - 2)
        max_f0 = librosa.midi_to_hz(max(notes) + 2)
        assert max_f0 > min_f0

        fs, x = wavfile.read(wav_path)
        x = x.astype(np.float64)

        if self.use_harvest:
            f0, timeaxis = pyworld.harvest(x, fs, frame_period=self.frame_period,
                                           f0_floor=min_f0, f0_ceil=max_f0)
        else:
            # Use the instance frame period here as well, so both F0
            # extractors and the waveform alignment below agree.
            f0, timeaxis = pyworld.dio(x, fs, frame_period=self.frame_period,
                                       f0_floor=min_f0, f0_ceil=max_f0)
            f0 = pyworld.stonemask(x, f0, timeaxis, fs)
        spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs, f0_floor=self.f0_floor)
        aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

        bap = pyworld.code_aperiodicity(aperiodicity, fs)
        mgc = pysptk.sp2mc(spectrogram, order=self.mgc_order,
                           alpha=pysptk.util.mcepalpha(fs))
        # F0 of speech
        f0 = f0[:, None]
        lf0 = f0.copy()
        nonzero_indices = np.nonzero(f0)
        lf0[nonzero_indices] = np.log(f0[nonzero_indices])
        if self.use_harvest:
            # https://github.com/mmorise/World/issues/35#issuecomment-306521887
            vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
        else:
            vuv = (lf0 != 0).astype(np.float32)
        lf0 = interp1d(lf0, kind="slinear")

        # Adjust lengths
        num_frames = labels.num_frames()
        mgc = mgc[:num_frames]
        lf0 = lf0[:num_frames]
        vuv = vuv[:num_frames]
        bap = bap[:num_frames]

        if self.relative_f0:
            # F0 derived from the musical score
            f0_score = f0_score[:, None]
            lf0_score = f0_score.copy()
            nonzero_indices = np.nonzero(f0_score)
            lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
            lf0_score = interp1d(lf0_score, kind="slinear")
            # relative f0, limited to one octave either side of the score
            diff_lf0 = lf0 - lf0_score
            diff_lf0 = np.clip(diff_lf0, np.log(0.5), np.log(2.0))

            f0_target = diff_lf0
        else:
            f0_target = lf0

        mgc = apply_delta_windows(mgc, self.windows)
        f0_target = apply_delta_windows(f0_target, self.windows)
        bap = apply_delta_windows(bap, self.windows)

        features = np.hstack((mgc, f0_target, vuv, bap)).astype(np.float32)

        # Align waveform and features
        wave = x.astype(np.float32) / 2**15
        T = int(features.shape[0] * (fs * self.frame_period / 1000))
        if len(wave) < T:
            shortfall = T - len(wave)
            if shortfall > 100:
                # Previously this dropped into an ipdb breakpoint, which
                # hangs unattended runs; report the mismatch and pad instead.
                warnings.warn(
                    "Unexpected input: waveform is %d samples shorter than "
                    "the %d samples implied by the features (have %d); "
                    "padding with zeros." % (shortfall, T, len(wave)))
            wave = np.pad(wave, (0, shortfall), mode="constant")
        assert wave.shape[0] >= T
        wave = wave[:T]

        return features, wave
Beispiel #28
0
def ap2bap(ap, fs):
    """Return ``pw.code_aperiodicity(ap, fs)``: the coded form of aperiodicity `ap`.

    Args:
        ap: aperiodicity to code.
        fs: sample rate passed through to ``pw.code_aperiodicity``.
    """
    return pw.code_aperiodicity(ap, fs)
Beispiel #29
0
    def decode_RNN(wav_list, gpu, cvlist=None, cvlist_src=None, \
        mcd_cvlist_src=None, mcdstd_cvlist_src=None, mcdpow_cvlist_src=None, mcdpowstd_cvlist_src=None):
        """Convert every file in `wav_list` on one GPU and record statistics.

        For each wav the function extracts WORLD features, encodes them to a
        latent with the encoder, decodes the latent once with the source
        speaker code (reconstruction) and once with the target speaker code
        (conversion), writes the converted feature matrix to the utterance's
        HDF5 file and synthesises two waveforms into ``args.outdir``:
        ``*_cv.wav`` (converted) and ``*_anasyn.wav`` (analysis-synthesis of
        the unmodified features).

        Args:
            wav_list: iterable of wav file paths.
            gpu: device index passed to ``torch.cuda.device``.
            cvlist: receives the per-dimension variance of the converted
                mel-cepstrum (first coefficient excluded).
            cvlist_src: same for the source-code reconstruction.
            mcd_cvlist_src, mcdstd_cvlist_src: receive mean / std of the
                distortion between source and reconstruction without the
                first coefficient.
            mcdpow_cvlist_src, mcdpowstd_cvlist_src: same including the first
                coefficient.

        Note:
            The list arguments are appended to unconditionally for every
            file, so callers must pass list-like objects; the ``None``
            defaults only work with an empty `wav_list`.  ``args``,
            ``config``, ``n_spk``, the speaker code indices, the F0 range
            statistics, ``model_epoch`` and ``LP_CUTOFF`` are read from the
            enclosing scope.
        """
        with torch.cuda.device(gpu):
            mean_trg = torch.FloatTensor(
                read_hdf5(args.stats_jnt,
                          "/mean_feat_org_lf0")[config.stdim:]).cuda()
            std_trg = torch.FloatTensor(
                read_hdf5(args.stats_jnt,
                          "/scale_feat_org_lf0")[config.stdim:]).cuda()
            # define model and load parameters
            logging.info(config)
            logging.info("model")
            with torch.no_grad():
                model_encoder = GRU_RNN_STOCHASTIC(
                    in_dim=config.in_dim,
                    out_dim=config.lat_dim,
                    hidden_layers=config.hidden_layers,
                    hidden_units=config.hidden_units,
                    kernel_size=config.kernel_size_enc,
                    dilation_size=config.dilation_size_enc,
                    arparam=config.arparam,
                    spk_dim=n_spk,
                    causal_conv=config.causal_conv,
                    scale_out_flag=False)
                model_decoder = GRU_RNN(in_dim=config.lat_dim + n_spk,
                                        out_dim=config.out_dim,
                                        hidden_layers=config.hidden_layers,
                                        hidden_units=config.hidden_units,
                                        kernel_size=config.kernel_size_dec,
                                        dilation_size=config.dilation_size_dec,
                                        causal_conv=config.causal_conv,
                                        scale_in_flag=False)
                logging.info(model_encoder)
                logging.info(model_decoder)
                # Deserialize the checkpoint once and take both state dicts
                # from it instead of reading the file twice.
                checkpoint = torch.load(args.model)
                model_encoder.load_state_dict(checkpoint["model_encoder"])
                model_decoder.load_state_dict(checkpoint["model_decoder"])
                model_encoder.cuda()
                model_decoder.cuda()
                model_encoder.eval()
                model_decoder.eval()
                for param in model_encoder.parameters():
                    param.requires_grad = False
                for param in model_decoder.parameters():
                    param.requires_grad = False
                # Initial input for the encoder; its width depends on
                # whether config.arparam is set.
                if config.arparam:
                    init_pp = np.zeros((1, 1, config.lat_dim * 2 + n_spk))
                else:
                    init_pp = np.zeros((1, 1, config.lat_dim + n_spk))
                y_in_pp = torch.FloatTensor(init_pp).cuda()
                # Initial decoder input: a zero vector expressed in the
                # normalised target feature space.
                y_in_src = y_in_trg = torch.unsqueeze(
                    torch.unsqueeze((0 - mean_trg) / std_trg, 0), 0)
            fs = args.fs
            fft_size = args.fftl
            mcep_dim = model_decoder.out_dim - 1
            for wav_file in wav_list:
                # convert mcep
                feat_file = os.path.join(
                    args.h5outdir,
                    os.path.basename(wav_file).replace(".wav", ".h5"))
                logging.info("cvmcep " + feat_file + " " + wav_file)

                # From here on fs is the rate reported by read_wav.
                fs, x = read_wav(wav_file, cutoff=70)

                time_axis, f0, sp, ap = analyze_range(x, fs=fs, minf0=args.minf0, maxf0=args.maxf0, \
                                                        fperiod=args.shiftms, fftl=args.fftl)
                logging.info(sp.shape)

                mcep = ps.sp2mc(sp, mcep_dim, args.mcep_alpha)
                logging.info(mcep.shape)
                codeap = pw.code_aperiodicity(ap, fs)
                logging.info(codeap.shape)

                # Frame indices selected by the power threshold; used below
                # to restrict the distortion measurement.
                npow = spc2npow(sp)
                logging.info(npow.shape)
                _, spcidx = extfrm(mcep, npow, power_threshold=args.pow)
                spcidx = spcidx[0]
                logging.info(spcidx.shape)

                uv, contf0 = convert_continuos_f0(np.array(f0))
                uv = np.expand_dims(uv, axis=-1)
                logging.info(uv.shape)
                # Low-pass the continuous F0 at the frame rate (1 / shift).
                cont_f0_lpf = low_pass_filter(contf0,
                                              int(1.0 /
                                                  (args.shiftms * 0.001)),
                                              cutoff=LP_CUTOFF)
                logcontf0 = np.expand_dims(np.log(cont_f0_lpf), axis=-1)
                logging.info(logcontf0.shape)
                # Encoder input columns: uv, log-F0, coded ap, mel-cepstrum.
                feat = np.c_[uv, logcontf0, codeap, mcep]
                logging.info(feat.shape)

                logging.info("generate")
                with torch.no_grad():
                    lat_feat_src, _, _, _, _ = \
                        model_encoder(torch.FloatTensor(feat).cuda(), y_in_pp, sampling=False)

                    # One-hot speaker codes, one row per latent frame.
                    src_code = np.zeros((lat_feat_src.shape[0], n_spk))
                    src_code[:, src_code_idx] = 1
                    src_code = torch.FloatTensor(src_code).cuda()

                    trg_code = np.zeros((lat_feat_src.shape[0], n_spk))
                    trg_code[:, trg_code_idx] = 1
                    trg_code = torch.FloatTensor(trg_code).cuda()

                    # Decode with the source code (reconstruction).
                    cvmcep_src, _, _ = model_decoder(
                        torch.cat((src_code, lat_feat_src), 1), y_in_src)
                    cvmcep_src = np.array(cvmcep_src.cpu().data.numpy(),
                                          dtype=np.float64)

                    # Decode with the target code (conversion).
                    cvmcep, _, _ = model_decoder(
                        torch.cat((trg_code, lat_feat_src), 1), y_in_trg)
                    cvmcep = np.array(cvmcep.cpu().data.numpy(),
                                      dtype=np.float64)

                logging.info(lat_feat_src.shape)
                logging.info(cvmcep_src.shape)
                logging.info(cvmcep.shape)

                # F0 conversion from the source to the target range statistics.
                cvf0 = convert_f0(f0, f0_range_mean_src, f0_range_std_src,
                                  f0_range_mean_trg, f0_range_std_trg)
                uv_cv, contf0_cv = convert_continuos_f0(np.array(cvf0))
                uv_cv = np.expand_dims(uv_cv, axis=-1)
                logging.info(uv_cv.shape)
                cont_f0_lpf_cv = low_pass_filter(contf0_cv,
                                                 int(1.0 /
                                                     (args.shiftms * 0.001)),
                                                 cutoff=LP_CUTOFF)
                logcontf0_cv = np.expand_dims(np.log(cont_f0_lpf_cv), axis=-1)
                logging.info(logcontf0_cv.shape)
                feat_cv = np.c_[uv_cv, logcontf0_cv, codeap]
                logging.info(feat_cv.shape)

                # Converted feature matrix stored next to the originals.
                feat_cvmcep = np.c_[feat_cv, cvmcep]
                logging.info(feat_cvmcep.shape)
                write_path = '/feat_cvmcep_cycvae-' + model_epoch
                logging.info(feat_file + ' ' + write_path)
                write_hdf5(feat_file, write_path, feat_cvmcep)
                cvlist.append(np.var(cvmcep[:, 1:], axis=0))

                # Distortion between source and its reconstruction on the
                # selected frames: with and without the first coefficient.
                _, mcdpow_arr = dtw.calc_mcd(np.array(mcep[np.array(spcidx),:], dtype=np.float64), \
                                                np.array(cvmcep_src[np.array(spcidx),:], dtype=np.float64))
                _, mcd_arr = dtw.calc_mcd(np.array(mcep[np.array(spcidx),1:], dtype=np.float64), \
                                            np.array(cvmcep_src[np.array(spcidx),1:], dtype=np.float64))
                mcdpow_mean = np.mean(mcdpow_arr)
                mcdpow_std = np.std(mcdpow_arr)
                mcd_mean = np.mean(mcd_arr)
                mcd_std = np.std(mcd_arr)
                logging.info("mcdpow_src_cv: %.6f dB +- %.6f" %
                             (mcdpow_mean, mcdpow_std))
                logging.info("mcd_src_cv: %.6f dB +- %.6f" %
                             (mcd_mean, mcd_std))
                mcdpow_cvlist_src.append(mcdpow_mean)
                mcdpowstd_cvlist_src.append(mcdpow_std)
                mcd_cvlist_src.append(mcd_mean)
                mcdstd_cvlist_src.append(mcd_std)
                cvlist_src.append(np.var(cvmcep_src[:, 1:], axis=0))

                # Vocoder synthesis from converted F0 and mel-cepstrum with
                # the original aperiodicity.
                logging.info("synth voco")
                cvsp = ps.mc2sp(np.ascontiguousarray(cvmcep), args.mcep_alpha,
                                fft_size)
                logging.info(cvsp.shape)
                wav = np.clip(
                    pw.synthesize(cvf0,
                                  cvsp,
                                  ap,
                                  fs,
                                  frame_period=args.shiftms), -1, 1)
                wavpath = os.path.join(
                    args.outdir,
                    os.path.basename(wav_file).replace(".wav", "_cv.wav"))
                sf.write(wavpath, wav, fs, 'PCM_16')
                logging.info(wavpath)

                # Analysis-synthesis of the unmodified features as reference.
                logging.info("synth anasyn")
                wav = np.clip(
                    pw.synthesize(f0, sp, ap, fs, frame_period=args.shiftms),
                    -1, 1)
                wavpath = os.path.join(
                    args.outdir,
                    os.path.basename(wav_file).replace(".wav", "_anasyn.wav"))
                sf.write(wavpath, wav, fs, 'PCM_16')
                logging.info(wavpath)
def main(args):
    """Run the WORLD analysis/synthesis demo and write the results to ``test/``.

    The ``test`` directory is recreated from scratch.  One utterance is
    analysed and resynthesised in several ways (DIO only, DIO + StoneMask,
    Harvest + StoneMask, coded-and-restored envelopes at the requested frame
    period and at 12.5 ms); each result is written as a wav file and
    comparison figures are saved through ``savefig``.

    Args:
        args: object providing ``frame_period`` and ``speed``.
    """
    out_dir = 'test'
    if os.path.isdir(out_dir):
        rmtree(out_dir)
    os.mkdir(out_dir)

    x, fs = sf.read('utterance/p226_002.wav')

    # 1. A convenient way: one-call analysis with default options.  The
    #    result is not written anywhere.
    f0_default, sp_default, ap_default = pw.wav2world(x, fs)
    pw.synthesize(f0_default, sp_default, ap_default, fs,
                  pw.default_frame_period)

    # DIO settings shared by steps 2-1 and 2-5.
    dio_options = dict(f0_floor=50.0,
                       f0_ceil=600.0,
                       channels_in_octave=2,
                       speed=args.speed)

    # 2. Step by step
    # 2-1 Without F0 refinement
    raw_f0, t = pw.dio(x, fs, frame_period=args.frame_period, **dio_options)
    raw_sp = pw.cheaptrick(x, raw_f0, t, fs)
    raw_ap = pw.d4c(x, raw_f0, t, fs)
    raw_y = pw.synthesize(raw_f0, raw_sp, raw_ap, fs, args.frame_period)
    sf.write('test/y_without_f0_refinement.wav', raw_y, fs)

    # 2-2 DIO with F0 refinement (using Stonemask)
    f0 = pw.stonemask(x, raw_f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (using Stonemask)
    harvest_f0, harvest_t = pw.harvest(x, fs)
    f0_h = pw.stonemask(x, harvest_f0, harvest_t, fs)
    sp_h = pw.cheaptrick(x, f0_h, harvest_t, fs)
    ap_h = pw.d4c(x, f0_h, harvest_t, fs)
    y_h = pw.synthesize(f0_h, sp_h, ap_h, fs, pw.default_frame_period)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs)

    # 2-4 DIO with F0 refinement (using Stonemask). Code and restore sp, ap.
    code_sp = pw.code_spectral_envelope(sp, fs, 80)
    code_ap = pw.code_aperiodicity(ap, fs)
    fft_size = (sp.shape[1] - 1) * 2
    rest_sp = pw.decode_spectral_envelope(code_sp, fs, fft_size)
    rest_ap = pw.decode_aperiodicity(code_ap, fs, fft_size)
    y_r = pw.synthesize(f0, rest_sp, rest_ap, fs, args.frame_period)
    sf.write('test/y_with_f0_refinement_code_and_restore.wav', y_r, fs)
    print("fft size: {:d}".format(fft_size))
    print("coded sp shape: ({:d}, {:d})".format(*code_sp.shape))
    print("coded ap shape: ({:d}, {:d})".format(*code_ap.shape))

    # 2-5 Same as 2-4 but with a fixed 12.5 ms frame period.
    f0_xx, t_xx = pw.dio(x, fs, frame_period=12.5, **dio_options)
    f0_xx = pw.stonemask(x, f0_xx, t_xx, fs)
    sp_xx = pw.cheaptrick(x, f0_xx, t_xx, fs)
    ap_xx = pw.d4c(x, f0_xx, t_xx, fs)
    code_sp_xx = pw.code_spectral_envelope(sp_xx, fs, 80)
    code_ap_xx = pw.code_aperiodicity(ap_xx, fs)
    fft_size = (sp_xx.shape[1] - 1) * 2
    rest_sp_xx = pw.decode_spectral_envelope(code_sp_xx, fs, fft_size)
    rest_ap_xx = pw.decode_aperiodicity(code_ap_xx, fs, fft_size)
    y_r_xx = pw.synthesize(f0_xx, rest_sp_xx, rest_ap_xx, fs, 12.5)
    sf.write(
        'test/y_with_f0_refinement_code_and_restore_frame_period_12.5.wav',
        y_r_xx, fs)
    print("coded sp_xx shape: ({:d}, {:d})".format(*code_sp_xx.shape))
    print("coded ap_xx shape: ({:d}, {:d})".format(*code_ap_xx.shape))

    # Comparison figures across all variants.
    savefig('test/wavform.png', [x, raw_y, y, y_h, y_r, y_r_xx])
    savefig('test/sp.png', [raw_sp, sp, sp_h, rest_sp, rest_sp_xx])
    savefig('test/ap.png', [raw_ap, ap, ap_h, rest_ap, rest_ap_xx], log=False)
    savefig('test/f0.png', [raw_f0, f0, f0_h, f0_xx])

    print('Please check "test" directory for output files')