Example 1
def mcep_dir(srcroot, tgtroot, n_mcep=40, alpha=0.42):
    src = pathlib.Path(srcroot)
    tgt = pathlib.Path(tgtroot)
    if not src.exists():
        raise ValueError('src does not exist: {}'.format(src))

    for p in sorted(src.glob('**/*.wav')):
        print(p)
        tgt_dir = tgt / p.parent.relative_to(src)
        tgt_stem = (tgt_dir / p.name).with_suffix('')
        tgt_dir.mkdir(parents=True, exist_ok=True)
        mcep_path = tgt_stem.with_suffix('.mcep.npy')
        c0_path = tgt_stem.with_suffix('.c0.npy')
        f0_path = tgt_stem.with_suffix('.f0.npy')
        ap_path = tgt_stem.with_suffix('.ap.npy')
        if mcep_path.exists() and c0_path.exists() and f0_path.exists(
        ) and ap_path.exists():
            print('skip')
            continue

        sr, wav = wavfile.read(p)
        x = (wav / 32768.0).astype(np.float64)
        f0, sp, ap = pyworld.wav2world(x.astype(np.float64), sr)
        mfcc = pysptk.sp2mc(sp, n_mcep, alpha)
        f0, mfcc, ap = f0.astype(np.float32), mfcc.T.astype(
            np.float32), ap.T.astype(np.float32)
        c0 = mfcc[0, :]
        mfcc = np.ascontiguousarray(mfcc[1:, :])
        ap = ap[192, :]

        np.save(mcep_path, mfcc)
        np.save(c0_path, c0)
        np.save(f0_path, f0)
        np.save(ap_path, ap)
        print(tgt_stem, flush=True)
Example 2
def world_extract(x, fs, f0min, f0max):
    # scale from [-1, 1] to [-32768, 32767]
    x = x * np.iinfo(np.int16).max

    x = np.array(x, dtype=np.float64)
    x = low_cut_filter(x, fs)

    # extract features
    f0, time_axis = pw.harvest(x,
                               fs,
                               f0_floor=f0min,
                               f0_ceil=f0max,
                               frame_period=MCEP_SHIFT)
    sp = pw.cheaptrick(x, f0, time_axis, fs, fft_size=MCEP_FFTL)
    ap = pw.d4c(x, f0, time_axis, fs, fft_size=MCEP_FFTL)
    mcep = pysptk.sp2mc(sp, MCEP_DIM, MCEP_ALPHA)
    npow = spc2npow(sp)

    return {
        "sp": sp,
        "mcep": mcep,
        "ap": ap,
        "f0": f0,
        "npow": npow,
    }
Example 3
 def __test(order, alpha, fftlen):
     np.random.seed(98765)
     sp = np.random.rand(int(fftlen // 2 + 1))
     mc = pysptk.sp2mc(sp, order, alpha)
     approx_sp = pysptk.mc2sp(mc, alpha, fftlen)
     # TODO: tolerance should be more carefully chosen
     assert np.allclose(sp, approx_sp, atol=0.9)
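
The assertion above checks that pysptk.mc2sp approximately inverts pysptk.sp2mc. Below is a minimal standalone sketch of the same round trip; the FFT length, order, and sampling rate are illustrative assumptions, not values taken from the test:

import numpy as np
import pysptk

fftlen = 1024
order = 25
alpha = pysptk.util.mcepalpha(16000)      # frequency-warping factor for 16 kHz audio

sp = np.random.rand(fftlen // 2 + 1)      # one spectral-envelope frame
mc = pysptk.sp2mc(sp, order, alpha)       # (order + 1,) mel-cepstral coefficients
sp_hat = pysptk.mc2sp(mc, alpha, fftlen)  # approximate reconstruction of sp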
Example 4
    def extract(wave: Wave, frame_period, f0_floor, f0_ceil, fft_length, order,
                alpha, dtype):
        x = wave.wave.astype(numpy.float64)
        fs = wave.sampling_rate

        f0, t = pyworld.harvest(
            x,
            fs,
            frame_period=frame_period,
            f0_floor=f0_floor,
            f0_ceil=f0_ceil,
        )

        f0 = pyworld.stonemask(x, f0, t, fs)
        sp = pyworld.cheaptrick(x, f0, t, fs, fft_size=fft_length)
        ap = pyworld.d4c(x, f0, t, fs, fft_size=fft_length)

        mc = pysptk.sp2mc(sp, order=order, alpha=alpha)
        coded_ap = pyworld.code_aperiodicity(ap, fs)
        voiced = ~(f0 == 0)  # type: numpy.ndarray

        feature = AcousticFeature(
            f0=f0[:, None],
            sp=sp,
            ap=ap,
            coded_ap=coded_ap,
            mc=mc,
            voiced=voiced[:, None],
        )
        feature = feature.astype_only_float(dtype)
        feature.validate()
        return feature
Example 5
 def collect_features(self, path):
     fs, x = wavfile.read(path)
     x = x.astype(np.float64)
     f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
     spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
     mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
     return mc
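
frame_period, order, and alpha in the snippet above are module-level settings in the source project; a hedged sketch of plausible values (assumptions for illustration, not the project's actual configuration):

frame_period = 5                      # WORLD analysis shift in milliseconds
order = 24                            # mel-cepstrum order (25 coefficients per frame)
alpha = pysptk.util.mcepalpha(16000)  # about 0.41 for 16 kHz audio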
Example 6
def generate_changed_voice(model, input_path):

    fs, x = wavfile.read(input_path)
    x = x.astype(np.float64)
    if len(x.shape) > 1:
        x = x.mean(axis=1)

    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]

    mc = P.modspec_smoothing(mc, FS / HOP_LENGHT, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    gen_data = model.predict(mc)

    gen_data = np.hstack([c0.reshape((-1, 1)), gen_data])

    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    spectrogram = pysptk.mc2sp(
        gen_data.astype(np.float64), alpha=alpha, fftlen=fftlen)
    waveform = pyworld.synthesize(
        f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform
Example 7
 def collect_features(self, wav_path):
     
     # x: Raw audio, (Sample_length, )
     x, fs = librosa.load(wav_path, sr=self.target_sr, mono=True, dtype=np.float64)
     
     
     # f0: F0, (Frame_length, )
     # lf0: log(f0), interpolated with interp1d, (Frame_length, )
     # vuv: voiced/unvoiced flag, (Frame_length, )
     f0, timeaxis = pyworld.dio(x, self.target_sr, frame_period=self.hop_sz_in_ms)
     f0 = pyworld.stonemask(x, f0, timeaxis, fs)
     lf0 = f0.copy()
     lf0[np.nonzero(f0)] = np.log(f0[np.nonzero(f0)])
     # compute vuv before interpolation, otherwise every frame would look voiced
     vuv = (lf0 != 0).astype(np.float32)
     lf0 = interp1d(lf0, kind="slinear")
     
     
     # spec: Spectrogram, (Frame_length x Dim), Dim = 513
     # bap: coded aperiodicity, (Frame_length, )
     # mgc: mel-cepstrum, (Frame_length x Dim), Dim = 60
     spec = pyworld.cheaptrick(x, f0, timeaxis, fs)
     aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)        
     bap = pyworld.code_aperiodicity(aperiodicity, fs)
     mgc = pysptk.sp2mc(spec, order=59, alpha=pysptk.util.mcepalpha(fs))
     
     
     # Stacking Features: total dimension = 64
     features = np.hstack((f0[:,None], lf0[:,None], vuv[:,None], bap, mgc, spec))
     return features.astype(np.float32)
Example 8
    def extract(cls, wave: Wave, frame_period, f0_floor, f0_ceil, fft_length, order, alpha, dtype):
        x = wave.wave.astype(numpy.float64)
        fs = wave.sampling_rate

        f0, t = cls.extract_f0(x=x, fs=fs, frame_period=frame_period, f0_floor=f0_floor, f0_ceil=f0_ceil)
        sp = pyworld.cheaptrick(x, f0, t, fs, fft_size=fft_length)
        ap = pyworld.d4c(x, f0, t, fs, fft_size=fft_length)

        mc = pysptk.sp2mc(sp, order=order, alpha=alpha)
        coded_ap = pyworld.code_aperiodicity(ap, fs)
        voiced: numpy.ndarray = ~(f0 == 0)

        if len(x) % fft_length > 0:
            f0 = f0[:-1]
            t = t[:-1]
            sp = sp[:-1]
            ap = ap[:-1]
            mc = mc[:-1]
            coded_ap = coded_ap[:-1]
            voiced = voiced[:-1]

        feature = AcousticFeature(
            f0=f0[:, None],
            sp=sp,
            ap=ap,
            coded_ap=coded_ap,
            mc=mc,
            voiced=voiced[:, None],
        )
        feature = feature.astype_only_float(dtype)
        return feature
Example 9
    def collect_features(self, wav_path, label_path):
        fs, x = wavfile.read(wav_path)
        x = x.astype(np.float64)
        f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
        spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
        aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

        bap = pyworld.code_aperiodicity(aperiodicity, fs)
        mgc = pysptk.sp2mc(spectrogram,
                           order=order,
                           alpha=pysptk.util.mcepalpha(fs))
        f0 = f0[:, None]
        lf0 = f0.copy()
        nonzero_indices = np.nonzero(f0)
        lf0[nonzero_indices] = np.log(f0[nonzero_indices])
        vuv = (lf0 != 0).astype(np.float32)
        lf0 = interp1d(lf0, kind="slinear")

        mgc = apply_delta_windows(mgc, windows)
        lf0 = apply_delta_windows(lf0, windows)
        bap = apply_delta_windows(bap, windows)

        features = np.hstack((mgc, lf0, vuv, bap))

        # Cut silence frames by HTS alignment
        labels = hts.load(label_path)
        features = features[:labels.num_frames()]
        indices = labels.silence_frame_indices()
        features = np.delete(features, indices, axis=0)

        return features.astype(np.float32)
Example 10
def wav2world(wavfile, frame_period):
    wav, fs = librosa.load(wavfile, sr=hp.sample_rate, dtype=np.float64)
    if hp.use_harvest:
        f0, timeaxis = pyworld.harvest(wav, fs, frame_period=frame_period)
    else:
        f0, timeaxis = pyworld.dio(wav, fs, frame_period=frame_period)
        f0 = pyworld.stonemask(wav, f0, timeaxis, fs)

    spectrogram = pyworld.cheaptrick(wav, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(wav, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    hp.num_bap = bap.shape[1]
    alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=hp.num_mgc - 1, alpha=alpha)
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if hp.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    #print(mgc.shape,lf0.shape,vuv.shape,bap.shape)
    features = np.hstack((mgc, lf0, vuv, bap))
    return features.astype(np.float32)
Example 11
def get_features(filename, *, winlen, winstep, n_mcep, mcep_alpha, minf0,
                 maxf0, type):
    wav, sr = load(filename, sr=None)

    # get f0
    x = wav.astype(float)
    _f0, t = world.harvest(x,
                           sr,
                           f0_floor=minf0,
                           f0_ceil=maxf0,
                           frame_period=winstep * 1000)
    f0 = world.stonemask(x, _f0, t, sr)

    window_size = int(sr * winlen)
    hop_size = int(sr * winstep)

    # get mel
    if type == 'mcc':
        spec = world.cheaptrick(x, f0, t, sr, f0_floor=minf0)
        h = sptk.sp2mc(spec, n_mcep - 1, mcep_alpha).T
    else:
        h = mfcc(x, sr, n_mfcc=n_mcep, n_fft=window_size, hop_length=hop_size)
    h = np.vstack((h, f0))
    maxlen = len(x) // hop_size + 2
    h = repeat_last_padding(h, maxlen)
    id = os.path.basename(filename).replace(".wav", "")
    return (id, x, h)
Example 12
def collect_features(x, fs):
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    return mc
Example 13
def _process_feature(out_dir, index, wav_path, label_path):

    # get list of wav files
    wav_files = os.listdir(os.path.dirname(wav_path))
    # check wav_file
    assert len(
        wav_files) != 0 and wav_files[0][-4:] == '.wav', "no wav files found!"

    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    n_frames = len(f0)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spectrogram,
                       order=order,
                       alpha=pysptk.util.mcepalpha(fs))
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")

    mgc = apply_delta_windows(mgc, windows)
    lf0 = apply_delta_windows(lf0, windows)
    bap = apply_delta_windows(bap, windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    # get list of lab files
    lab_files = os.listdir(os.path.dirname(label_path))
    # check wav_file
    assert len(
        lab_files) != 0 and lab_files[0][-4:] == '.lab', "no lab files found!"

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)
    voiced_frames = features.shape[0]

    # Write the acoustic to disk:
    acoustic_filename = 'arctic_%05d.npy' % index
    np.save(os.path.join(out_dir, acoustic_filename),
            features.astype(np.float32),
            allow_pickle=False)

    dataset_ids.append(acoustic_filename[:-4])
    with open(os.path.join(os.path.dirname(out_dir), 'dataset_ids.pkl'),
              'wb') as pklFile:
        pickle.dump(dataset_ids, pklFile)

    # Return a tuple describing this training example:
    return (acoustic_filename, n_frames, voiced_frames)
Example 14
def get_features(wav_path):
    x, fs = librosa.load(wav_path, sr=config.fs)
    x = x.astype(np.float64)
    f0, time_axis = pyworld.dio(x, fs, frame_period=config.frame_period)
    f0 = pyworld.stonemask(x, f0, time_axis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, time_axis, fs)
    aperiodicity = pyworld.d4c(x, f0, time_axis, fs)
    mc = pysptk.sp2mc(spectrogram, order=config.order, alpha=config.alpha)
    return mc, aperiodicity, f0
Example 15
 def collect_features(self, path):
     fs, x = wavfile.read(path)
     x = x.astype(np.float64)
     f0, timeaxis = pyworld.dio(x, fs, frame_period=5)
     f0 = pyworld.stonemask(x, f0, timeaxis, fs)
     spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
     spectrogram = trim_zeros_frames(spectrogram)
     mc = pysptk.sp2mc(spectrogram, order=24, alpha=self.alpha)
     return mc.astype(np.float32)
Example 16
 def collect_features(self, path):
     x, fs = librosa.load(path, sr=config.fs)
     x = x.astype(np.float64)
     f0, timeaxis = pyworld.dio(x, fs, frame_period=config.frame_period)
     f0 = pyworld.stonemask(x, f0, timeaxis, fs)
     spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
     spectrogram = trim_zeros_frames(spectrogram)
     mc = pysptk.sp2mc(spectrogram, order=config.order, alpha=config.alpha)
     return mc
Example 17
    def get_MCEPs(self, wav):
        wav = np.float64(wav)
        _f0_h, t_h = pw.harvest(wav, self.sr)
        f0_h = pw.stonemask(wav, _f0_h, t_h, self.sr)
        sp_h = pw.cheaptrick(wav, f0_h, t_h, self.sr)
        ap_h = pw.d4c(wav, f0_h, t_h, self.sr)
        mc = pysptk.sp2mc(sp_h, order=self.order, alpha=self.alpha)

        return mc, f0_h, ap_h
Example 18
def spec_to_waveform(spectrogram, x, order, fs, frame_period):
    # x: source waveform fed to the MLSA synthesis filter
    alpha = pysptk.util.mcepalpha(fs)
    hop_length = int(fs * (frame_period * 0.001))
    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    engine = Synthesizer(MLSADF(order=order, alpha=alpha), hopsize=hop_length)
    b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
    waveform = engine.synthesis(x, b)

    return waveform
Example 19
def _get_mcep(x, fs, frame_period=5, order=24):
    alpha = pysptk.util.mcepalpha(fs)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    spectrogram = trim_zeros_frames(spectrogram)
    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    return mc
Example 20
def get_feature(wav_path, preprocessing=False, getsize=False):
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    if audio_world_config.use_harvest:
        f0, timeaxis = pyworld.harvest(
            x,
            fs,
            frame_period=audio_world_config.frame_period,
            f0_floor=audio_world_config.f0_floor,
            f0_ceil=audio_world_config.f0_ceil)
    else:
        f0, timeaxis = pyworld.dio(
            x,
            fs,
            frame_period=audio_world_config.frame_period,
            f0_floor=audio_world_config.f0_floor,
            f0_ceil=audio_world_config.f0_ceil)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)

    alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram,
                       order=audio_world_config.mgc_dim,
                       alpha=alpha)
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if audio_world_config.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    lf0 = P.interp1d(lf0, kind=audio_world_config.f0_interpolation_kind)

    # Parameter trajectory smoothing
    if audio_world_config.mod_spec_smoothing:
        hop_length = int(fs * (audio_world_config.frame_period * 0.001))
        modfs = fs / hop_length
        mgc = P.modspec_smoothing(
            mgc, modfs, cutoff=audio_world_config.mod_spec_smoothing_cutoff)

    mgc = P.delta_features(mgc, audio_world_config.windows)
    lf0 = P.delta_features(lf0, audio_world_config.windows)
    bap = P.delta_features(bap, audio_world_config.windows)

    features = np.hstack((mgc, lf0, vuv, bap))
    if preprocessing:
        out_path = wav_path.replace(".wav", "").replace("wav", "world")
        np.save(out_path, features)
    elif getsize:
        return features, mgc.shape[0], lf0.shape[0], bap.shape[0]
    else:
        return features
Example 21
def analyze(x, fs, f0_floor, f0_ceil, frame_period=20.0, pitchshift=None):
    if pitchshift is not None:
        f0, spc, ap = analyze_world(x, fs * pitchshift, f0_floor, f0_ceil, frame_period / pitchshift)
    else:
        f0, spc, ap = analyze_world(x, fs, f0_floor, f0_ceil, frame_period)
    mcep = pysptk.sp2mc(spc, 24, 0.410)
    codeap = pyworld.code_aperiodicity(ap, fs)
    
    #return x, fs, f0, time_axis, spc, ap, mcep, codeap
    return f0, mcep, codeap
Example 22
def load_wav_extract_mcep(converted_dir, sent):
    if '.wav' in os.path.join(converted_dir, sent):
        wav, _ = librosa.load(os.path.join(converted_dir, sent),
                              sr=22050,
                              mono=True)
        _, _, sp, _ = world_decompose(wav=wav, fs=22050, frame_period=5.0)
        mcep = pysptk.sp2mc(sp, 36 - 1, 0.455)
        return mcep
    else:
        print('{}: not wav'.format(os.path.join(converted_dir, sent)))
Example 23
File: mcd.py Project: yjchoe/jejueo
def get_mc(wav):
    y, sr = sf.read(wav)
    y = y.astype(np.float64)
    f0, timeaxis = pyworld.dio(y, sr, frame_period=5)
    f0 = pyworld.stonemask(y, f0, timeaxis, sr)
    spectrogram = pyworld.cheaptrick(y, f0, timeaxis, sr)
    mc = pysptk.sp2mc(spectrogram, order=24, alpha=0.41)
    mc = mc.astype(np.float32)

    return mc
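
Since this example comes from an mcd.py script, the returned mel-cepstra are presumably compared with mel-cepstral distortion. A small helper sketching the standard MCD formula; it is not part of the source file and assumes two time-aligned mc arrays of equal length:

def melcd(mc_ref, mc_est):
    # mean mel-cepstral distortion in dB, excluding the 0th (energy) coefficient
    diff = mc_ref[:, 1:] - mc_est[:, 1:]
    return (10.0 / np.log(10)) * np.mean(np.sqrt(2.0 * np.sum(diff ** 2, axis=1)))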
Example 24
def gaussian_voice_conversion(model,
                              audio_path,
                              windows=default_windows,
                              frame_period=default_frame_period,
                              order=default_order,
                              alpha=default_alpha,
                              hop_length=default_hop_length):

    paramgen = utilities.math.MLPG(model, windows=windows, diff=True)

    sampling_rate, audio_data = scipy.io.wavfile.read(audio_path)

    audio_data = audio_data.astype(numpy.float64)

    #
    fundamental_frequency, time_axis = pyworld.dio(audio_data,
                                                   sampling_rate,
                                                   frame_period=frame_period)

    fundamental_frequency = pyworld.stonemask(audio_data,
                                              fundamental_frequency, time_axis,
                                              sampling_rate)

    spectrogram = pyworld.cheaptrick(audio_data, fundamental_frequency,
                                     time_axis, sampling_rate)

    aperiodicity = pyworld.d4c(audio_data, fundamental_frequency, time_axis,
                               sampling_rate)

    #
    mel_coefficients = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)

    c0, mel_coefficients = mel_coefficients[:, 0], mel_coefficients[:, 1:]

    mel_coefficients = utilities.math.apply_delta(mel_coefficients, windows)

    mel_coefficients = paramgen.transform(mel_coefficients)

    mel_coefficients = numpy.hstack((c0[:, None], mel_coefficients))

    #
    mel_coefficients[:, 0] = 0

    engine = pysptk.synthesis.Synthesizer(pysptk.synthesis.MLSADF(order=order,
                                                                  alpha=alpha),
                                          hopsize=hop_length)

    mlsa_coefficients = pysptk.mc2b(mel_coefficients.astype(numpy.float64),
                                    alpha=alpha)

    waveform = engine.synthesis(audio_data, mlsa_coefficients)

    # Casting to numpy.int16 is important; otherwise scipy would
    # write nonsensical wav files when the result is saved
    return numpy.asarray(waveform, dtype=numpy.int16)
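
A brief usage note on the int16 comment above: the returned waveform can be written directly with scipy.io.wavfile.write. The paths and the 16000 Hz rate below are illustrative assumptions:

converted = gaussian_voice_conversion(model, "input.wav")
scipy.io.wavfile.write("converted.wav", 16000, converted)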
Example 25
def test_vc_from_path(model, path, data_mean, data_std, diffvc=True):
    model.eval()

    fs, x = wavfile.read(path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]

    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    # Apply model
    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)
    y_hat, y_hat_static = model(mc_scaled, R)
    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(mc_static_pred, data_mean[:static_dim],
                                 data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(mc.astype(np.float64),
                                   alpha=alpha,
                                   fftlen=fftlen)
        waveform = pyworld.synthesize(f0, spectrogram, aperiodicity, fs,
                                      hp.frame_period)

    return waveform, inputs, outputs
Example 26
def get_features(x, fs):
    # f0 calculate
    _f0, t = pw.dio(x, fs)
    f0 = pw.stonemask(x, _f0, t, fs)
    # mcep calculate
    sp = trim_zeros_frames(pw.cheaptrick(x, f0, t, fs))
    mcep = pysptk.sp2mc(sp, order=24, alpha=pysptk.util.mcepalpha(fs))
    # bap calculate
    ap = pw.d4c(x, f0, t, fs)
    bap = pw.code_aperiodicity(ap, fs)
    return f0, mcep, bap
Example 27
def wav2mcep(WAV_FILE, dim):
    fs, data = wavfile.read(WAV_FILE)

    # WORLD can only handle floating-point input
    data = data.astype(np.float64)

    _f0, _time = pw.dio(data, fs)    # F0 estimation; pw.dio measures F0 every 0.005 s and returns a NumPy array
    f0 = pw.stonemask(data, _f0, _time, fs)  # F0 refinement
    sp = pw.cheaptrick(data, f0, _time, fs)  # spectral envelope extraction
    ap = pw.d4c(data, f0, _time, fs)         # aperiodicity extraction
    mcep = pysptk.sp2mc(sp, dim, 0.42)
    return torch.Tensor(mcep)
Example 28
def mgc_lf0_vuv(f0, sp, ap, fs=22050, order=13, alpha=None):
    if alpha is None:
        alpha=pysptk.util.mcepalpha(fs)
    #  https://github.com/r9y9/gantts/blob/master/prepare_features_tts.py
    mgc = pysptk.sp2mc(sp, order=order, alpha=alpha)
    # f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (ap[:, 0] < 0.5).astype(np.float32)[:, None]

    return mgc, lf0[:, None], vuv
Example 29
    def feature_extract(wav_list, arr):
        n_sample = 0
        n_frame = 0
        max_frame = 0
        count = 1
        coeff = np.array([-0.5, 0.5, 0.0])
        for wav_name in wav_list:
            # load wavfile and apply low cut filter
            fs, x = read_wav(wav_name, cutoff=70)
            n_sample += x.shape[0]
            logging.info(wav_name + " " + str(x.shape[0]) + " " +
                         str(n_sample) + " " + str(count))

            # check sampling frequency
            if not fs == args.fs:
                logging.debug("ERROR: sampling frequency is not matched.")
                sys.exit(1)

            hdf5name = args.hdf5dir + "/" + os.path.basename(wav_name).replace(
                ".wav", ".h5")

            # estimate f0 and ap
            time_axis, f0, spc, ap = analyze_range(x,
                                                   fs=args.fs,
                                                   minf0=minf0,
                                                   maxf0=maxf0,
                                                   fperiod=args.shiftms,
                                                   fftl=args.fftl)
            write_hdf5(hdf5name, '/ap', ap)
            write_hdf5(hdf5name, "/f0", f0)

            # convert to continuous f0 and low-pass filter
            uv, cont_f0 = convert_continuos_f0(np.array(f0))
            cont_f0_lpf = low_pass_filter(cont_f0,
                                          int(1.0 / (args.shiftms * 0.001)),
                                          cutoff=20)

            cont_f0_lpf = np.expand_dims(cont_f0_lpf, axis=-1)
            uv = np.expand_dims(uv, axis=-1)

            write_hdf5(hdf5name, "/lcf0", np.log(cont_f0_lpf))
            write_hdf5(hdf5name, "/uv", uv)

            # estimate codeap
            codeap = pw.code_aperiodicity(ap, args.fs)
            if codeap.ndim == 1:
                # when fs == 16000
                codeap = np.expand_dims(codeap, axis=-1)
            write_hdf5(hdf5name, "/codeap", codeap)

            # mcep
            mcep = ps.sp2mc(spc, args.mcep_dim, mcep_alpha)
            write_hdf5(hdf5name, "/mcep", mcep)
def process(filename):
    '''
    The function decomposes a wav file into F0, mel-cepstral coefficients, and aperiodicity
    :param filename: path to wav file
    :return: None; the corresponding .lf0, .mgc and .bap files are written to disk
    '''
    # pdb.set_trace()
    file_id = os.path.basename(filename).split(".")[0]
    print('\n' + file_id)

    ### WORLD ANALYSIS -- extract vocoder parameters ###
    # x, fs = librosa.core.load(filename, sr=16000)
    fs, x = wavfile.read(filename)
    # warning: this parameter is important
    alpha = pysptk.util.mcepalpha(fs)
    hopesize = int(0.005 * fs)
    # pdb.set_trace()
    f0 = pysptk.rapt(x.astype(np.float32),
                     fs=fs,
                     hopsize=hopesize,
                     min=60,
                     max=600,
                     voice_bias=0.0,
                     otype=1)
    f0 = f0.astype(np.float64)
    x = x.astype(np.float64) / (2**15)
    _, timeaxis = pyworld.harvest(x,
                                  fs,
                                  frame_period=5,
                                  f0_floor=60.0,
                                  f0_ceil=600)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    f0 = f0[:, None]
    lf0 = f0.copy()
    lf0 = lf0.astype(np.float32)
    nonzero_indices = np.where(f0 != 0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    zero_indices = np.where(f0 == 0)
    lf0[zero_indices] = -1.0E+10
    write_binfile(lf0,
                  os.path.join(lf0_dir, file_id + '.lf0'),
                  dtype=np.float32)
    mc = pysptk.sp2mc(spectrogram, mcsize, alpha=alpha)
    mc = mc.astype(np.float32)
    write_binfile(mc,
                  os.path.join(mgc_dir, file_id + '.mgc'),
                  dtype=np.float32)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    bap = bap.astype(np.float32)
    write_binfile(bap,
                  os.path.join(bap_dir, file_id + '.bap'),
                  dtype=np.float32)
Example 31
    def collect_features(self, wav_path, label_path):
        fs, x = wavfile.read(wav_path)
        x = x.astype(np.float64)
        if hp_acoustic.use_harvest:
            f0, timeaxis = pyworld.harvest(
                x, fs, frame_period=hp_acoustic.frame_period,
                f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
        else:
            f0, timeaxis = pyworld.dio(
                x, fs, frame_period=hp_acoustic.frame_period,
                f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
            f0 = pyworld.stonemask(x, f0, timeaxis, fs)
        spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
        aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

        bap = pyworld.code_aperiodicity(aperiodicity, fs)
        if self.alpha is None:
            self.alpha = pysptk.util.mcepalpha(fs)
        mgc = pysptk.sp2mc(spectrogram, order=hp_acoustic.order, alpha=self.alpha)
        f0 = f0[:, None]
        lf0 = f0.copy()
        nonzero_indices = np.nonzero(f0)
        lf0[nonzero_indices] = np.log(f0[nonzero_indices])
        if hp_acoustic.use_harvest:
            # https://github.com/mmorise/World/issues/35#issuecomment-306521887
            vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
        else:
            vuv = (lf0 != 0).astype(np.float32)
        lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)

        # Parameter trajectory smoothing
        if hp_acoustic.mod_spec_smoothing:
            hop_length = int(fs * (hp_acoustic.frame_period * 0.001))
            modfs = fs / hop_length
            mgc = P.modspec_smoothing(
                mgc, modfs, cutoff=hp_acoustic.mod_spec_smoothing_cutoff)

        mgc = P.delta_features(mgc, hp_acoustic.windows)
        lf0 = P.delta_features(lf0, hp_acoustic.windows)
        bap = P.delta_features(bap, hp_acoustic.windows)

        features = np.hstack((mgc, lf0, vuv, bap))

        # Cut silence frames by HTS alignment
        labels = hts.load(label_path)
        features = features[:labels.num_frames()]
        indices = labels.silence_frame_indices()
        features = np.delete(features, indices, axis=0)

        return features.astype(np.float32)
Example 32
 def collect_features(self, wav_path):
     fs, x = wavfile.read(wav_path)
     x = x.astype(np.float64)
     f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
     f0 = pyworld.stonemask(x, f0, timeaxis, fs)
     spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
     spectrogram = P.trim_zeros_frames(spectrogram)
     if self.alpha is None:
         self.alpha = pysptk.util.mcepalpha(fs)
     mgc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=self.alpha)
     # Drop 0-th coefficient
     mgc = mgc[:, 1:]
     # 50Hz cut-off MS smoothing
     hop_length = int(fs * (hp.frame_period * 0.001))
     modfs = fs / hop_length
     mgc = P.modspec_smoothing(mgc, modfs, cutoff=50)
     # Add delta
     mgc = P.delta_features(mgc, hp.windows)
     return mgc.astype(np.float32)
Example 33
    def __call__(self, data: Wave, test=None):
        x = data.wave.astype(numpy.float64)
        fs = data.sampling_rate

        if self._f0_estimating_method == 'dio':
            _f0, t = pyworld.dio(
                x,
                fs,
                frame_period=self._frame_period,
                f0_floor=self._f0_floor,
                f0_ceil=self._f0_ceil,
            )
        else:
            from world4py.np import apis
            _f0, t = apis.harvest(
                x,
                fs,
                frame_period=self._frame_period,
                f0_floor=self._f0_floor,
                f0_ceil=self._f0_ceil,
            )
        f0 = pyworld.stonemask(x, _f0, t, fs)
        spectrogram = pyworld.cheaptrick(x, f0, t, fs)
        aperiodicity = pyworld.d4c(x, f0, t, fs)

        mfcc = pysptk.sp2mc(spectrogram, order=self._order, alpha=self._alpha)
        voiced = ~(f0 == 0)  # type: numpy.ndarray

        feature = AcousticFeature(
            f0=f0[:, None].astype(self._dtype),
            spectrogram=spectrogram.astype(self._dtype),
            aperiodicity=aperiodicity.astype(self._dtype),
            mfcc=mfcc.astype(self._dtype),
            voiced=voiced[:, None],
        )
        feature.validate()
        return feature
Example 34
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]

    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    lengths = [len(mc_scaled)]

    # Add batch axis
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # For MLPG
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)

    # Apply model
    if model.include_parameter_generation():
        # Case: the model includes parameter generation in itself
        # Multi-stream features cannot be used in this case
        y_hat, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(
        mc_static_pred, data_mean[:static_dim], data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs
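
A hedged usage sketch for test_vc_from_path above, assuming scipy.io.wavfile is imported as wavfile; the model, statistics, and output path are placeholders:

waveform, inputs, outputs = test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True)
wavfile.write("converted.wav", fs, waveform.astype(np.int16))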