Ejemplo n.º 1
0
def wav2pw(wavfile, sr=SR, fft_size=FFT_SIZE, frame_period=FRAME_PERIOD):
    """Load a wav file and decompose it into WORLD features (f0, sp, ap)."""
    # WORLD requires a float64 mono signal.
    signal, _ = librosa.load(wavfile, sr=sr, mono=True, dtype=np.float64)
    # Harvest gives a coarse f0 track; StoneMask refines it.
    raw_f0, times = pw.harvest(signal, sr, frame_period=frame_period)
    refined_f0 = pw.stonemask(signal, raw_f0, times, sr)
    # Spectral envelope and aperiodicity share the same FFT size.
    envelope = pw.cheaptrick(signal, refined_f0, times, sr, fft_size=fft_size)
    aperiodicity = pw.d4c(signal, refined_f0, times, sr, fft_size=fft_size)
    return refined_f0, envelope, aperiodicity
Ejemplo n.º 2
0
def get_f0(audio, sample_rate, frame_period=5, method='dio'):
    """Extract an f0 contour and a voiced/unvoiced mask from a waveform.

    Returns (f0, vuv) as torch tensors; vuv is 1 where f0 is nonzero.
    Raises ValueError for an unknown `method`.
    """
    # Accept torch input: keep the first channel, then move to numpy.
    if isinstance(audio, torch.Tensor):
        if audio.ndim > 1:
            audio = audio[0]
        audio = audio.numpy()

    # Validate first so a bad method name always raises ValueError.
    if method not in ('dio', 'harvest', 'swipe', 'rapt'):
        raise ValueError(f'No such f0 extract method, {method}.')

    hop_size = int(frame_period * sample_rate / 1000)
    samples = audio.astype(np.double)

    if method == 'dio':
        f0, _ = pw.dio(samples, sample_rate, frame_period=frame_period)
    elif method == 'harvest':
        f0, _ = pw.harvest(samples, sample_rate, frame_period=frame_period)
    elif method == 'swipe':
        f0 = pysptk.sptk.swipe(samples, sample_rate, hopsize=hop_size)
    else:  # 'rapt'
        f0 = pysptk.sptk.rapt(samples, sample_rate, hopsize=hop_size)

    f0 = torch.from_numpy(f0)
    # Voiced/unvoiced flag: frames with nonzero f0 count as voiced.
    vuv = 1 * (f0 != 0.0)

    return f0, vuv
Ejemplo n.º 3
0
def world_extract(x, fs, f0min, f0max):
    """Extract WORLD features from a waveform in [-1, 1].

    Returns a dict with keys "sp", "mcep", "ap", "f0", "npow".
    """
    # Rescale from [-1, 1] to the 16-bit range expected downstream.
    x = x * np.iinfo(np.int16).max
    x = np.array(x, dtype=np.float64)
    x = low_cut_filter(x, fs)

    # f0 via Harvest bounded to [f0min, f0max]; envelope/aperiodicity
    # computed at the same frame times.
    f0, time_axis = pw.harvest(x,
                               fs,
                               f0_floor=f0min,
                               f0_ceil=f0max,
                               frame_period=MCEP_SHIFT)
    sp = pw.cheaptrick(x, f0, time_axis, fs, fft_size=MCEP_FFTL)
    ap = pw.d4c(x, f0, time_axis, fs, fft_size=MCEP_FFTL)
    # Mel-cepstrum and per-frame power derived from the spectral envelope.
    mcep = pysptk.sp2mc(sp, MCEP_DIM, MCEP_ALPHA)
    npow = spc2npow(sp)

    return {"sp": sp, "mcep": mcep, "ap": ap, "f0": f0, "npow": npow}
Ejemplo n.º 4
0
def wav2world(wavfile, frame_period):
    """Extract stacked WORLD features (mgc, lf0, vuv, bap) from a wav file.

    Side effect: stores the band-aperiodicity dimension count in hp.num_bap.
    """
    wav, fs = librosa.load(wavfile, sr=hp.sample_rate, dtype=np.float64)

    # f0: Harvest alone, or DIO refined by StoneMask.
    if hp.use_harvest:
        f0, timeaxis = pyworld.harvest(wav, fs, frame_period=frame_period)
    else:
        f0, timeaxis = pyworld.dio(wav, fs, frame_period=frame_period)
        f0 = pyworld.stonemask(wav, f0, timeaxis, fs)

    spectrogram = pyworld.cheaptrick(wav, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(wav, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    hp.num_bap = bap.shape[1]  # consumed elsewhere in the project

    # Mel-generalized cepstrum from the spectral envelope.
    alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=hp.num_mgc - 1, alpha=alpha)

    # log-f0 with unvoiced frames (f0 == 0) left at zero.
    f0 = f0[:, None]
    lf0 = f0.copy()
    voiced_idx = np.nonzero(f0)
    lf0[voiced_idx] = np.log(f0[voiced_idx])

    if hp.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)

    return np.hstack((mgc, lf0, vuv, bap)).astype(np.float32)
Ejemplo n.º 5
0
def data_extraction(np_data, rate):
    """Decompose a waveform into WORLD features.

    Parameters
    ----------
    np_data : np.ndarray
        Mono waveform samples.
    rate : int
        Sampling rate in Hz.

    Returns
    -------
    (f0, sp, ap) : StoneMask-refined f0, spectral envelope, aperiodicity.
    """
    # Fix: np.float was removed in NumPy 1.24 (deprecated since 1.20) and
    # raises AttributeError there; WORLD requires float64 input anyway.
    np_data = np_data.astype(np.float64)
    _f0, t = pw.harvest(np_data, rate)
    f0 = pw.stonemask(np_data, _f0, t, rate)
    sp = pw.cheaptrick(np_data, f0, t, rate)
    ap = pw.d4c(np_data, f0, t, rate)
    return f0, sp, ap
Ejemplo n.º 6
0
    def Conversion(self):
        """Convert the recorded audio with WORLD.

        Pitch-shifts f0 by self.f0_rate, formant-shifts the spectral
        envelope by self.sp_rate, resynthesizes into self.synth, and
        updates the plot and status bar.
        """
        print(f">Conversion")
        print(f"f0_rate:{self.f0_rate}, sp_rate:{self.sp_rate}")
        self.statusBar().showMessage(f'Start conversion')
        wavdata = self.streamer.get_all()
        # No input recorded yet: tell the user how to provide audio and bail.
        if (len(self.wav) <= 0):
            reply = QMessageBox.information(
                self, "声変換",
                "変換前の音声データがありません\nStartボタンを押して録音するか\nファイル>変換前の音声データの読み込み から音声データを読み込んでください"
            )
            return
        self.saveconvAction.setEnabled(True)
        # Capture buffer is raw 16-bit PCM; WORLD needs float64.
        wavdata = np.frombuffer(wavdata, dtype='int16').astype(np.float64)
        f0, t = pw.harvest(wavdata, self.RATE)  # fundamental frequency
        sp = pw.cheaptrick(wavdata, f0, t, self.RATE)  # spectral envelope
        ap = pw.d4c(wavdata, f0, t, self.RATE)  # aperiodicity

        # Pitch shift: scale the whole f0 track uniformly.
        "ピッチシフト"
        modified_f0 = self.f0_rate * f0
        # Formant shift: uniform stretch/compression of the frequency axis.
        "フォルマントシフト(周波数軸の一様な伸縮)"
        modified_sp = np.zeros_like(sp)
        sp_range = int(modified_sp.shape[1] * self.sp_rate)
        for f in range(modified_sp.shape[1]):
            if (f < sp_range):
                # Below the stretched band edge, remap envelope bins.
                # NOTE(review): the sp_rate < 1.0 branch indexes with
                # int(sp_rate * f) rather than int(f / sp_rate) — confirm
                # this asymmetry is intended.
                if self.sp_rate >= 1.0:
                    modified_sp[:, f] = sp[:, int(f / self.sp_rate)]
                else:
                    modified_sp[:, f] = sp[:, int(self.sp_rate * f)]
            else:
                # Above the band edge, keep the original bins.
                modified_sp[:, f] = sp[:, f]

        self.synth = pw.synthesize(modified_f0, modified_sp, ap, self.RATE)
        # Scale to [-1, 1] for display (16-bit full scale).
        self.curve2.setData(self.synth / 32767.0)
        print(len(self.synth))
        self.statusBar().showMessage(f'Finish conversion')
Ejemplo n.º 7
0
def get_features(filename, *, winlen, winstep, n_mcep, mcep_alpha, minf0,
                 maxf0, type):
    """Compute an f0-augmented feature matrix (MCC or MFCC) for one wav file.

    Returns (utterance id, waveform, feature matrix).
    """
    wav, sr = load(filename, sr=None)
    x = wav.astype(float)

    # Harvest + StoneMask f0 at a hop of `winstep` seconds.
    raw_f0, t = world.harvest(x,
                              sr,
                              f0_floor=minf0,
                              f0_ceil=maxf0,
                              frame_period=winstep * 1000)
    f0 = world.stonemask(x, raw_f0, t, sr)

    window_size = int(sr * winlen)
    hop_size = int(sr * winstep)

    # Spectral features: WORLD mel-cepstrum, or librosa MFCC otherwise.
    if type == 'mcc':
        spec = world.cheaptrick(x, f0, t, sr, f0_floor=minf0)
        h = sptk.sp2mc(spec, n_mcep - 1, mcep_alpha).T
    else:
        h = mfcc(x, sr, n_mfcc=n_mcep, n_fft=window_size, hop_length=hop_size)

    # Stack f0 as an extra row and pad to a fixed frame count.
    h = np.vstack((h, f0))
    maxlen = len(x) // hop_size + 2
    h = repeat_last_padding(h, maxlen)
    utt_id = os.path.basename(filename).replace(".wav", "")
    return (utt_id, x, h)
Ejemplo n.º 8
0
def word_synthesis(word, file_name):
    """Re-synthesize a processed word with WORLD at several frame periods.

    Analyzes <file_name>_<word>.wav once, then synthesizes and saves a wav
    and a comparison plot for the default, 3 ms, and 20 ms frame periods.
    Does nothing when the source file is missing.
    """
    source = f'{PROCESSED_WORDS_DIRECTORY}/{file_name}_{word}.wav'
    if not os.path.exists(source):
        return
    data, samplerate = sf.read(source)

    # WORLD analysis: refined f0, spectral envelope, aperiodicity.
    f0, timeaxis = pw.harvest(data, samplerate)
    f0_mask = pw.stonemask(data, f0, timeaxis, samplerate)
    spectral_envelop = pw.cheaptrick(data, f0_mask, timeaxis, samplerate)
    aperiodicity = pw.d4c(data, f0_mask, timeaxis, samplerate)

    # The original repeated this synth/write/plot section three times
    # verbatim; one loop over (frame period, filename suffix) is equivalent.
    for frame_period, suffix in ((pw.default_frame_period, 'default'),
                                 (3.0, '3'),
                                 (20.0, '20')):
        synthesized_word = pw.synthesize(f0_mask, spectral_envelop,
                                         aperiodicity, samplerate,
                                         frame_period)
        sf.write(f'{SYNTHESIS_WORDS_DIRECTORY}/{file_name}_{word}_{suffix}.wav',
                 synthesized_word, samplerate)
        savefig(
            f'{PLOTS_SYNTHESIS_WORDS_DIRECTORY}/{file_name}_{word}_{suffix}.png',
            [data, synthesized_word], word)
Ejemplo n.º 9
0
def process_acoustic_parameters(sound, sound_position, word, file_name):
    """Extract and plot f0 (four estimators), spectral envelope and
    aperiodicity for one segmented sound file.

    Fixes:
      * pyworld.harvest was called as harvest(data, fs, frame_period): the
        third positional parameter of harvest is f0_floor, so the 5 ms frame
        period was silently interpreted as a 5 Hz f0 floor. Passed by
        keyword now.
      * arange(len(f0), 0.005) is arange(start, stop) and yields an empty
        array; the intended time axis is arange(len(f0)) * 0.005 seconds.
    """
    file = f'{PROCESSED_SOUNDS_DIRECTORY}/{sound_position}/{file_name}_{word}_{sound}.wav'
    if os.path.exists(file):
        data, samplerate = sf.read(file)

        frame_period = 5  # ms
        hop_length = int(0.001 * samplerate * frame_period)

        # f0 from four estimators for the comparison plot.
        f0_dio, timeaxis_dio = pw.dio(data, samplerate)
        f0, timeaxis = pw.harvest(data, samplerate,
                                  frame_period=frame_period)
        f0_mask = pw.stonemask(data, f0, timeaxis, samplerate)
        spectral_envelop = pw.cheaptrick(data, f0_mask, timeaxis, samplerate)
        aperiodicity = pw.d4c(data, f0_mask, timeaxis, samplerate)

        f0_rapt = pysptk.sptk.rapt(
            data.astype(np.float32), samplerate, hop_length)

        f0_swipe = pysptk.sptk.swipe(
            data, samplerate, hop_length)

        plot_f0(sound,
                sound_position,
                [timeaxis, f0],
                [timeaxis_dio, f0_dio],
                [arange(len(f0_rapt)) * 0.005, f0_rapt],
                [arange(len(f0_swipe)) * 0.005, f0_swipe],
                f0_mask,
                f'{PLOTS_SOUNDS_DIRECTORY}/{sound_position}/f0/{file_name}_{word}_{sound}.png')
        savefig(
            f'{PLOTS_SOUNDS_DIRECTORY}/{sound_position}/spectral_envelop/{file_name}_{word}_{sound}.png', [spectral_envelop], sound)
        savefig(
            f'{PLOTS_SOUNDS_DIRECTORY}/{sound_position}/aperiodicity/{file_name}_{word}_{sound}.png', [aperiodicity], sound, log=False)
Ejemplo n.º 10
0
def estimate_word(word, name):
    """Analyze a word recording with WORLD and resynthesize it at several
    frame periods, saving a wav and a comparison plot for each.

    Does nothing when the source file is missing.
    """
    path = f'rijeci_wav/{word}_{name}.wav'
    if not os.path.exists(path):
        return
    f_bef, fs = sf.read(path)

    # WORLD analysis: refined f0, spectral envelope, aperiodicity.
    f0, timeaxis = pw.harvest(f_bef, fs)
    f0_mask = pw.stonemask(f_bef, f0, timeaxis, fs)
    sp = pw.cheaptrick(f_bef, f0_mask, timeaxis, fs)
    ap = pw.d4c(f_bef, f0_mask, timeaxis, fs)

    # The original repeated this synth/write/plot section three times
    # verbatim; loop over (frame period, filename suffix) instead.
    # Suffixes match the original outputs exactly: '-def', '', '-20'.
    for frame_period, suffix in ((pw.default_frame_period, '-def'),
                                 (3.0, ''),
                                 (20.0, '-20')):
        y = pw.synthesize(f0_mask, sp, ap, fs, frame_period)
        sf.write(f'rijeci_after_sint/{word}_after_sint_{name}{suffix}.wav',
                 y, fs)
        savefig(f'slike_rijeci_after_sint/{word}_after_sint_{name}{suffix}.png',
                [f_bef, y], word)
Ejemplo n.º 11
0
def world_decompose(wav, fs, frame_period=5.0):
    """Split a speech waveform into (f0, timeaxis, sp, ap) with WORLD."""
    wav = wav.astype(np.float64)  # WORLD requires float64 input

    # Harvest f0 restricted to a 71-800 Hz search range.
    f0, timeaxis = pyworld.harvest(wav,
                                   fs,
                                   frame_period=frame_period,
                                   f0_floor=71.0,
                                   f0_ceil=800.0)

    # Spectral envelope (CheapTrick) and aperiodicity (D4C) at those frames.
    sp = pyworld.cheaptrick(wav, f0, timeaxis, fs)
    ap = pyworld.d4c(wav, f0, timeaxis, fs)

    return f0, timeaxis, sp, ap
Ejemplo n.º 12
0
def worldDecompose(
        wave: np.ndarray,
        fs: int = SAMPLE_RATE,
        frame_period: float = 5.) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    '''
    Decompose a waveform into f0, spectral envelope and aperiodicity
    using the WORLD vocoder.

    Parameters
    ----------
    wave: np.ndarray
        Waveform samples.
    fs: int, default SAMPLE_RATE
        Sampling rate in Hz.
    frame_period: float, default 5.
        Frame interval in milliseconds.

    Returns
    -------
    f0: np.ndarray
        Per-frame fundamental frequency [Hz].
    sp: np.ndarray
        Spectral envelope.
    ap: np.ndarray
        Aperiodicity.
    '''
    samples = wave.astype(np.float64)  # WORLD expects float64
    f0, times = pyworld.harvest(samples,
                                fs,
                                frame_period=frame_period,
                                f0_floor=71.,
                                f0_ceil=800.)
    sp = pyworld.cheaptrick(samples, f0, times, fs)
    ap = pyworld.d4c(samples, f0, times, fs)
    return f0, sp, ap
Ejemplo n.º 13
0
def world_spectrogram(wav, sr=_sr, dim_num=32, **kwargs):
    """Encode a waveform into WORLD vocoder features.

    Keyword options: frame_period, f0_floor, f0_ceil, fft_size, ap_threshold,
    f0_extractor ("dio", "harvest", or a callable with the harvest signature).

    Returns (f0, coded spectral envelope, coded aperiodicity).
    """
    # Analysis parameters, all overridable through kwargs.
    frame_period = kwargs.get("frame_period", pw.default_frame_period)
    f0_floor = kwargs.get("f0_floor", pw.default_f0_floor)
    f0_ceil = kwargs.get("f0_ceil", pw.default_f0_ceil)
    fft_size = kwargs.get("fft_size", pw.get_cheaptrick_fft_size(sr, f0_floor))
    ap_threshold = kwargs.get("ap_threshold", 0.85)
    f0_extractor = kwargs.get("f0_extractor", "dio")
    x = wav.astype(np.double)
    if f0_extractor == "dio":
        # Fix: forward frame_period here too so a caller-specified value is
        # honored by every extractor (previously only harvest/custom got it).
        # DIO's default frame_period equals pw.default_frame_period, so the
        # default behavior is unchanged.
        f0, t = pw.dio(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil,
                       frame_period=frame_period)
    elif f0_extractor == "harvest":
        f0, t = pw.harvest(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil, frame_period=frame_period)
    else:
        # Caller-supplied extractor with a harvest-compatible signature.
        f0, t = f0_extractor(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil, frame_period=frame_period)

    # Spectral envelope via CheapTrick, then dimensionality reduction.
    sp = pw.cheaptrick(x, f0, t, sr, f0_floor=f0_floor, fft_size=fft_size)
    sp_enc = pw.code_spectral_envelope(sp, sr, number_of_dimensions=dim_num)

    # Aperiodicity via D4C, then band-aperiodicity coding.
    ap = pw.d4c(x, f0, t, sr, threshold=ap_threshold, fft_size=fft_size)
    ap_enc = pw.code_aperiodicity(ap, sr)
    return f0, sp_enc, ap_enc
Ejemplo n.º 14
0
    def analyze(self, x):
        """Analyze acoustic features based on WORLD.

        Estimates F0 (Harvest, bounded to [self.minf0, self.maxf0]),
        spectral envelope (CheapTrick) and aperiodicity (D4C).

        Parameters
        ----------
        x : array, shape (`T`)
            monaural speech signal in time domain

        Returns
        -------
        f0 : array, shape (`T`,)
            F0 sequence
        spc : array, shape (`T`, `fftl / 2 + 1`)
            Spectral envelope sequence
        ap : array, shape (`T`, `fftl / 2 + 1`)
            Aperiodicity sequence
        """
        f0, times = pyworld.harvest(x,
                                    self.fs,
                                    f0_floor=self.minf0,
                                    f0_ceil=self.maxf0,
                                    frame_period=self.shiftms)
        spc = pyworld.cheaptrick(x, f0, times, self.fs, fft_size=self.fftl)
        ap = pyworld.d4c(x, f0, times, self.fs, fft_size=self.fftl)

        # Envelope and aperiodicity must be frame- and bin-aligned.
        assert spc.shape == ap.shape
        return f0, spc, ap
Ejemplo n.º 15
0
def process_wav(wav_path):
    """Extract WORLD features from an audio file.

    Returns (f0, spectral envelope, coded envelope, aperiodicity,
    coded aperiodicity).

    NOTE(review): soundfile honors subtype/channels/samplerate/endian only
    for headerless RAW input -- presumably these files are RAW; confirm,
    since soundfile rejects those arguments for regular WAV files.
    """
    y, osr = sf.read(wav_path, subtype='PCM_16', channels=1, samplerate=48000,
                    endian='LITTLE') #, start=56640, stop=262560)

    # Resample to the 32 kHz analysis rate if needed.
    # NOTE(review): positional librosa.resample(y, orig_sr, target_sr) is the
    # pre-0.10 API; newer librosa requires keyword arguments -- confirm version.
    sr = 32000
    if osr != sr:
        y = librosa.resample(y, osr, sr)

    # f0 via the Harvest algorithm, refined with StoneMask.
    _f0, t = pw.harvest(y, sr, f0_floor=f0_min, f0_ceil=f0_max, frame_period=pw.default_frame_period)
    _f0 = pw.stonemask(y, _f0, t, sr)
    print(_f0.shape)

    # Spectral envelope via the CheapTrick algorithm.
    _sp = pw.cheaptrick(y, _f0, t, sr)

    # Reduce the envelope to 60 harmonic coefficients.
    code_sp = code_harmonic(_sp, 60)
    print(_sp.shape, code_sp.shape)
    # Aperiodicity via D4C, then band-aperiodicity coding.
    _ap = pw.d4c(y, _f0, t, sr)

    code_ap = pw.code_aperiodicity(_ap, sr)
    print(_ap.shape, code_ap.shape)

    return _f0, _sp, code_sp, _ap, code_ap
Ejemplo n.º 16
0
def harvest(cmd):
    """Run WORLD Harvest f0 analysis for a recording and attach the result.

    cmd["id"] names the recording. Writes "<time> <f0>" lines to a temp
    file, attaches it, records the hash under the recording's "harvest"
    meta key, and returns {"harvest": <hash>} — or an error dict when the
    analysis produced no output.
    """
    docid = cmd["id"]

    meta = rec_set.get_meta(docid)

    x, fs = librosa.load(os.path.join(get_attachpath(), meta["path"]), sr=None)
    print("SYSTEM: harvesting...")
    hv_start = time.time()
    f0, timeaxis = pyworld.harvest(x.astype(np.float64), fs)
    print(f"SYSTEM: finished harvesting! (took {time.time() - hv_start:.2f}s)")

    with tempfile.NamedTemporaryFile(suffix=".txt", delete=False,
                                     mode="w") as harvest_fp:
        for t, f in zip(timeaxis, f0):
            harvest_fp.write(f'{t} {f}\n')

    # Fix: the original re-opened the temp file without closing it,
    # leaking a file handle; use a context manager for the sanity check.
    with open(harvest_fp.name) as check_fp:
        if len(check_fp.read().strip()) == 0:
            return {"error": "Harvest computation failed"}

    # XXX: frozen attachdir
    harvesthash = guts.attach(harvest_fp.name, get_attachpath())

    guts.bschange(
        rec_set.dbs[docid],
        {
            "type": "set",
            "id": "meta",
            "key": "harvest",
            "val": harvesthash
        },
    )

    return {"harvest": harvesthash}
Ejemplo n.º 17
0
def analyze_range(wav,
                  fs=FS,
                  minf0=MINF0,
                  maxf0=MAXF0,
                  fperiod=SHIFTMS,
                  fftl=FFTL,
                  f0=None,
                  time_axis=None):
    """WORLD analysis with a bounded f0 search range.

    When f0/time_axis are not supplied, f0 is estimated with Harvest
    (restricted to [minf0, maxf0]) and refined with StoneMask.

    Returns (time_axis, f0, spectral envelope, aperiodicity).
    """
    if f0 is None or time_axis is None:
        coarse_f0, time_axis = pw.harvest(wav,
                                          fs,
                                          f0_floor=minf0,
                                          f0_ceil=maxf0,
                                          frame_period=fperiod)
        f0 = pw.stonemask(wav, coarse_f0, time_axis, fs)
    sp = pw.cheaptrick(wav, f0, time_axis, fs, fft_size=fftl)
    ap = pw.d4c(wav, f0, time_axis, fs, fft_size=fftl)

    return time_axis, f0, sp, ap
def analyze(wav,
            fs=FS,
            minf0=MINF0,
            maxf0=MAXF0,
            fperiod=SHIFTMS,
            fftl=FFTL,
            f0=None,
            time_axis=None):
    """
    f0 estimation w/o f0_floor & f0_ceil
    Args:
        minf0: Never used
        maxf0: Never used
    Returns:
        (time_axis, fundamental frequency, spectral envelope, aperiodicity)
    """
    # Estimate f0 only when the caller did not supply one; the search floor
    # is fixed at 60 Hz and no ceiling is applied.
    if f0 is None or time_axis is None:
        rough_f0, time_axis = pw.harvest(wav,
                                         fs,
                                         f0_floor=60.0,
                                         frame_period=fperiod)
        f0 = pw.stonemask(wav, rough_f0, time_axis, fs)
    sp = pw.cheaptrick(wav, f0, time_axis, fs, fft_size=fftl)
    ap = pw.d4c(wav, f0, time_axis, fs, fft_size=fftl)

    return time_axis, f0, sp, ap
Ejemplo n.º 19
0
def world_decompose(wav, fs, frame_period=5.0):
    """Decompose a waveform into (f0, timeaxis, sp, ap) with WORLD."""
    samples = wav.astype(np.float64)  # WORLD needs float64
    # Harvest f0 bounded to the 71-800 Hz range.
    f0, timeaxis = pyworld.harvest(samples, fs, frame_period=frame_period,
                                   f0_floor=71.0, f0_ceil=800.0)
    sp = pyworld.cheaptrick(samples, f0, timeaxis, fs)  # spectral envelope
    ap = pyworld.d4c(samples, f0, timeaxis, fs)         # aperiodicity
    return f0, timeaxis, sp, ap
Ejemplo n.º 20
0
def pre_process(file_name, training_dir):
    """Prepare training features for one utterance.

    Loads <training_dir>/<file_name>.wav and its transcript from
    <training_dir>/Transcripts/, resamples to params.sample_rate, extracts
    a StoneMask-refined Harvest f0 track, and returns
    [spectral_data, aperiodic_data, label_data, frequency].
    """

    audio_file_name = training_dir + file_name + '.wav'
    lyrics_file_name = training_dir + 'Transcripts/' + file_name + '.txt'

    audio_data, sample_rate = soundfile.read(audio_file_name)
    # NOTE(review): positional librosa.resample(y, orig_sr, target_sr) is the
    # pre-0.10 API; newer librosa requires keyword arguments -- confirm version.
    audio_data = librosa.resample(audio_data, sample_rate, params.sample_rate)
    sample_rate = params.sample_rate

    # Harvest f0 bounded to [params.min_freq, params.max_freq],
    # then refined with StoneMask.
    harvest_frequency, timing = pyworld.harvest(
        audio_data,
        sample_rate,
        f0_floor=params.min_freq,
        f0_ceil=params.max_freq,
        frame_period=params.frame_period)
    frequency = pyworld.stonemask(audio_data, harvest_frequency, timing,
                                  sample_rate)
    # One phoneme-label frame per f0 analysis frame.
    audio_length = len(frequency)

    phoneme_data = extract_phoneme_data(
        [audio_file_name, lyrics_file_name, audio_length])

    frequency_data = process_frequency(frequency)

    # Frame-aligned labels: phoneme columns next to frequency columns.
    label_data = pd.concat([phoneme_data, frequency_data], axis=1)

    spectral_data, aperiodic_data = extract_timbre_data(
        [audio_data, frequency, timing, sample_rate])

    return [spectral_data, aperiodic_data, label_data, frequency]
Ejemplo n.º 21
0
def wav2mcep(filepath):
    '''
    Compute WORLD features and a coded spectral envelope for one wav file.

    return:
      f0: shape [ T, ]
      ap: shape [ T, n_fft/2 + 1 ]
      sp: shape [ T, n_fft/2 + 1 ]
      mcep (coded sp): shape [ n_mels, T ]
    '''
    # librosa.load resamples to `sampling_rate`, so sr == sampling_rate below.
    y, sr = librosa.load(filepath, sr=sampling_rate)
    y, _ = librosa.effects.trim(y)  # strip leading/trailing silence
    y = np.asarray(y, dtype=np.double)

    f0, timeaxis = pyworld.harvest(y, sr)
    # Fix: use `sr` consistently. The original mixed `sr` (harvest) with the
    # module-level `sampling_rate` (cheaptrick/d4c/coding); they are equal
    # after resampling, but mixing the two invites drift if either changes.
    sp = pyworld.cheaptrick(y, f0, timeaxis, sr, fft_size=n_fft)
    ap = pyworld.d4c(y, f0, timeaxis, sr, fft_size=n_fft)
    mcep = pyworld.code_spectral_envelope(sp, sr, n_mels)
    mcep = mcep.T  # dim x n

    f0 = f0.astype(np.float64)
    sp = sp.astype(np.float64)
    ap = ap.astype(np.float64)
    mcep = mcep.astype(np.float64)
    return f0, ap, sp, mcep
Ejemplo n.º 22
0
def world_features(wav, sr, fft_size, dim):
    """WORLD analysis plus a dim-dimensional coded spectral envelope.

    Returns (f0, timeaxis, sp, ap, coded_sp).
    """
    f0, timeaxis = pyworld.harvest(wav, sr)
    envelope = pyworld.cheaptrick(wav, f0, timeaxis, sr, fft_size=fft_size)
    aperiodicity = pyworld.d4c(wav, f0, timeaxis, sr, fft_size=fft_size)
    compressed = pyworld.code_spectral_envelope(envelope, sr, dim)
    return f0, timeaxis, envelope, aperiodicity, compressed
Ejemplo n.º 23
0
def convertWavIntoF0seqMCEPseq(wav, fs, frame_period=5.0, MCEPdim=24):
    """
    Extract an F0 sequence and a MCEP sequence from a single waveform.

    Args:
        wav (np.ndarray(1,T)): waveform
        fs: sampling rate
        frame_period (float): frame interval [ms]
        MCEPdim (int): dimension of the mel-cepstral analysis

    Returns:
        tuple: f0 sequence and a float32 (MCEPdim, n_frames) MCEP sequence
    """
    wav = wav.astype(np.float64)  # WORLD expects float64
    # Harvest f0 bounded to 71-800 Hz.
    f0seq, times = pyworld.harvest(wav,
                                   fs,
                                   frame_period=frame_period,
                                   f0_floor=71.0,
                                   f0_ceil=800.0)
    spectrogram = pyworld.cheaptrick(wav, f0seq, times, fs)
    MCEPseq = pyworld.code_spectral_envelope(spectrogram, fs, MCEPdim)
    print(
        f"F0&MCEP-nized! {wav.shape[0] / fs} [sec] wav => {f0seq.shape}, {MCEPseq.shape}"
    )
    return f0seq, MCEPseq.T.astype(np.float32)
Ejemplo n.º 24
0
def cal_mcep(wav_ori, fs=SAMPLE_RATE, ispad=False, frame_period=0.005, dim=FEATURE_DIM, fft_size=FFTSIZE):
    '''Compute WORLD features and a reduced spectral envelope for a waveform.

    `frame_period` is used only by pad_wav_to_get_fixed_frames.
    Returns a dict with keys 'f0', 'ap', 'sp', 'coded_sp'.
    '''
    # Optionally pad so the analysis yields a fixed frame count.
    if ispad:
        wav, _pad_length = pad_wav_to_get_fixed_frames(
            wav_ori, frames=FRAMES, frame_period=frame_period, sr=fs)
    else:
        wav = wav_ori

    # Harvest f0, CheapTrick envelope, D4C aperiodicity.
    f0, timeaxis = pyworld.harvest(wav, fs)
    sp = pyworld.cheaptrick(wav, f0, timeaxis, fs, fft_size=fft_size)
    ap = pyworld.d4c(wav, f0, timeaxis, fs, fft_size=fft_size)

    # Reduce the envelope to `dim` coefficients, transposed to dim x n.
    coded_sp = pyworld.code_spectral_envelope(sp, fs, dim).T

    return {
        'f0': f0,              # n
        'ap': ap,              # n x (fftsize//2 + 1)
        'sp': sp,              # n x (fftsize//2 + 1)
        'coded_sp': coded_sp,  # dim x n
    }
def world_decompose(wav, fs):
    """WORLD analysis with a 71-800 Hz f0 range at the hp.duration period."""
    samples = wav.astype(np.float64)
    # NOTE: hp.duration is passed as frame_period (ms) — presumably a
    # project-specific name for the hop; confirm against hp's definition.
    f0, timeaxis = pyworld.harvest(samples, fs, f0_floor=71.0, f0_ceil=800.0,
                                   frame_period=hp.duration)
    sp = pyworld.cheaptrick(samples, f0, timeaxis, fs)
    ap = pyworld.d4c(samples, f0, timeaxis, fs)

    return f0, timeaxis, sp, ap
Ejemplo n.º 26
0
    def extract(wave: Wave, frame_period, f0_floor, f0_ceil, fft_length, order,
                alpha, dtype):
        """Build an AcousticFeature (f0, sp, ap, coded ap, mc, voiced mask)
        from a Wave, cast to `dtype` and validated."""
        signal = wave.wave.astype(numpy.float64)
        rate = wave.sampling_rate

        # Harvest f0, then StoneMask refinement.
        f0, t = pyworld.harvest(
            signal,
            rate,
            frame_period=frame_period,
            f0_floor=f0_floor,
            f0_ceil=f0_ceil,
        )
        f0 = pyworld.stonemask(signal, f0, t, rate)

        # Envelope and aperiodicity at the same FFT length.
        sp = pyworld.cheaptrick(signal, f0, t, rate, fft_size=fft_length)
        ap = pyworld.d4c(signal, f0, t, rate, fft_size=fft_length)

        mc = pysptk.sp2mc(sp, order=order, alpha=alpha)
        coded_ap = pyworld.code_aperiodicity(ap, rate)
        voiced = f0 != 0  # type: numpy.ndarray

        feature = AcousticFeature(
            f0=f0[:, None],
            sp=sp,
            ap=ap,
            coded_ap=coded_ap,
            mc=mc,
            voiced=voiced[:, None],
        )
        feature = feature.astype_only_float(dtype)
        feature.validate()
        return feature
def analyze_range(wav,
                  fs=FS,
                  minf0=MINF0,
                  maxf0=MAXF0,
                  fperiod=SHIFTMS,
                  fftl=FFTL,
                  f0=None,
                  time_axis=None):
    """
    f0 estimation w/ f0_floor & f0_ceil.

    Args:
        f0: Given f0. If not provided, estimated by WORLD harvest/stonemask
            from the waveform.
    Returns:
        (time_axis, fundamental frequency, spectral envelope, aperiodicity)
    """
    # Only estimate fo when the caller did not supply one.
    if f0 is None or time_axis is None:
        estimated_f0, time_axis = pw.harvest(wav,
                                             fs,
                                             f0_floor=minf0,
                                             f0_ceil=maxf0,
                                             frame_period=fperiod)
        f0 = pw.stonemask(wav, estimated_f0, time_axis, fs)  # refine fo
    envelope = pw.cheaptrick(wav, f0, time_axis, fs, fft_size=fftl)
    aperiodicity = pw.d4c(wav, f0, time_axis, fs, fft_size=fftl)

    return time_axis, f0, envelope, aperiodicity
Ejemplo n.º 28
0
def synthesis(ori_path, aim_sp, aim_spkid):
    """Resynthesize the utterance at ori_path with a converted spectral
    envelope (aim_sp, coded) while keeping its original f0 and aperiodicity,
    and write the result to ./convert_to_<aim_spkid>_test1.wav.
    """
    print('synthesizing ...')
    wav, _ = librosa.load(ori_path, sr=hp.SR, mono=True, dtype=np.float64)
    # WORLD analysis of the source utterance: f0 and aperiodicity are reused.
    f0, timeaxis = pw.harvest(wav, hp.SR)
    sp_per_timeaxis_before = pw.cheaptrick(wav,
                                           f0,
                                           timeaxis,
                                           hp.SR,
                                           fft_size=hp.N_FFT)  # compressed from 1024 to 513 bins

    # ori_decoded_sp = pw.decode_spectral_envelope(ori_sp, hp.SR, fft_size=hp.N_FFT)

    # print('f0.shape = ')
    # print(f0)

    ap = pw.d4c(wav, f0, timeaxis, hp.SR, fft_size=hp.N_FFT)
    # Decode the converted (coded) envelope back to linear-frequency bins.
    aim_decoded_sp = pw.decode_spectral_envelope(
        aim_sp, hp.SR, fft_size=hp.N_FFT)  # converted/decoded sp
    print('解码后的513维度的aim_decoded_sp = ')
    print(aim_decoded_sp.shape)
    print(aim_decoded_sp[399][:])

    # Source f0/ap + target envelope -> converted waveform.
    synwav = pw.synthesize(f0, aim_decoded_sp, ap, hp.SR)
    print(f'synthesize done. path : ./convert_to_{aim_spkid}_test1.wav')
    # NOTE(review): librosa.output.write_wav was removed in librosa >= 0.8 --
    # presumably this project pins an older librosa; confirm, or switch to
    # soundfile.write.
    librosa.output.write_wav(f'./convert_to_{aim_spkid}_test1.wav',
                             synwav,
                             sr=hp.SR)
Ejemplo n.º 29
0
def world_decompose(wav, fs=16000, frame_period = 5.0):
    # Decompose speech signal into f0, spectral envelope and aperiodicity using WORLD
    # NOTE(review): sp and ap are computed but discarded -- the trailing return
    # comment suggests this once returned (f0, time, sp, ap). Confirm callers
    # expect a bare f0 before restoring the fuller return value.
    wav = wav.astype(np.float64)
    f0, timeaxis = pyworld.harvest(wav, fs, frame_period = frame_period, f0_floor = 71.0, f0_ceil = 800.0)    # f0_floor / f0_ceil bound the f0 search range
    #   frame_period is the interval between consecutive frames (ms)
    sp = pyworld.cheaptrick(wav, f0, timeaxis, fs)
    ap = pyworld.d4c(wav, f0, timeaxis, fs)
    return f0     # returns only the fundamental frequency (see note above)
Ejemplo n.º 30
0
def eval_rmse_f0(x_r, x_s, sr, frame_len='5', method='swipe', tone_shift=None):
    """Compare the f0 tracks of a reference signal x_r and a synthesized
    signal x_s.

    Returns (f0 RMSE in cents over frames voiced in both signals,
    voiced/unvoiced accuracy, voiced/unvoiced precision).

    NOTE(review): `frame_len` is currently unused -- the WORLD extractors use
    a hard-coded frame_period of 50 ms and swipe/rapt a hop of 128 samples.
    """
    # TODO: make the frame length (ms) / hop_size configurable
    if method == 'harvest':
        f0_r, t = pw.harvest(x_r.astype(np.double), sr, frame_period=50)
        f0_s, t = pw.harvest(x_s.astype(np.double), sr, frame_period=50)
    elif method == 'dio':
        f0_r, t = pw.dio(x_r.astype(np.double), sr, frame_period=50)
        f0_s, t = pw.dio(x_s.astype(np.double), sr, frame_period=50)
    elif method == 'swipe':
        f0_r = pysptk.sptk.swipe(x_r.astype(np.double), sr, hopsize=128)
        f0_s = pysptk.sptk.swipe(x_s.astype(np.double), sr, hopsize=128)
    elif method == 'rapt':
        f0_r = pysptk.sptk.rapt(x_r.astype(np.double), sr, hopsize=128)
        f0_s = pysptk.sptk.rapt(x_s.astype(np.double), sr, hopsize=128)
    else:
        raise ValueError('no such f0 exract method')

    # length align
    f0_s = pad_to(f0_s, len(f0_r))

    # make unvoiced / voiced frame masks (a frame is voiced iff f0 != 0)
    f0_r_uv = (f0_r == 0) * 1
    f0_r_v = 1 - f0_r_uv
    f0_s_uv = (f0_s == 0) * 1
    f0_s_v = 1 - f0_s_uv

    # confusion-matrix masks: voiced-in-both, unvoiced-in-both, etc.
    tp_mask = f0_r_v * f0_s_v
    tn_mask = f0_r_uv * f0_s_uv
    fp_mask = f0_r_uv * f0_s_v
    fn_mask = f0_r_v * f0_s_uv

    # Applying the tone shift after the masks is safe: scaling by a nonzero
    # factor keeps zero frames zero, so voicing is unchanged.
    if tone_shift is not None:
        shift_scale = 2**(tone_shift / 12)
        f0_r = f0_r * shift_scale

    # only calculate f0 error for voiced frames; adding the uv mask (1 at
    # unvoiced frames) avoids log2(0) there, and those frames are zeroed
    # out by tp_mask anyway. 1200 * |log2 ratio| converts to cents.
    y = 1200 * np.abs(np.log2(f0_r + f0_r_uv) - np.log2(f0_s + f0_s_uv))
    y = y * tp_mask
    # NOTE(review): tp_mask.sum() can be 0 when no frame is voiced in both
    # signals, making this a division by zero.
    f0_rmse_mean = y.sum() / tp_mask.sum()

    # voiced/unvoiced accuracy and precision from the masks above
    vuv_precision = tp_mask.sum() / (tp_mask.sum() + fp_mask.sum())
    vuv_accuracy = (tp_mask.sum() + tn_mask.sum()) / len(y)

    return f0_rmse_mean, vuv_accuracy, vuv_precision
Ejemplo n.º 31
0
def world_decompose(wav, fs, frame_period=5.0):
    """WORLD decomposition: returns (f0, timeaxis, sp, ap)."""
    # float64 input, Harvest f0 bounded to 71-800 Hz.
    signal = wav.astype(np.float64)
    f0, timeaxis = pyworld.harvest(signal, fs, frame_period=frame_period,
                                   f0_floor=71.0, f0_ceil=800.0)
    # Spectral envelope and aperiodicity over the same frames.
    sp = pyworld.cheaptrick(signal, f0, timeaxis, fs)
    ap = pyworld.d4c(signal, f0, timeaxis, fs)

    return f0, timeaxis, sp, ap
Ejemplo n.º 32
0
    def collect_features(self, wav_path, label_path):
        """Build the stacked acoustic feature matrix (mgc, lf0, vuv, bap
        with delta features) for one utterance, with silence frames removed
        using the HTS label alignment.
        """
        fs, x = wavfile.read(wav_path)
        x = x.astype(np.float64)
        # f0: Harvest alone, or DIO refined by StoneMask.
        if hp_acoustic.use_harvest:
            f0, timeaxis = pyworld.harvest(
                x, fs, frame_period=hp_acoustic.frame_period,
                f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
        else:
            f0, timeaxis = pyworld.dio(
                x, fs, frame_period=hp_acoustic.frame_period,
                f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
            f0 = pyworld.stonemask(x, f0, timeaxis, fs)
        spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
        aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

        # Band-aperiodicity and mel-generalized cepstrum.
        bap = pyworld.code_aperiodicity(aperiodicity, fs)
        if self.alpha is None:
            # All-pass constant chosen from the sampling rate, cached on self.
            self.alpha = pysptk.util.mcepalpha(fs)
        mgc = pysptk.sp2mc(spectrogram, order=hp_acoustic.order, alpha=self.alpha)
        # log-f0 with unvoiced frames (f0 == 0) left at zero for now.
        f0 = f0[:, None]
        lf0 = f0.copy()
        nonzero_indices = np.nonzero(f0)
        lf0[nonzero_indices] = np.log(f0[nonzero_indices])
        if hp_acoustic.use_harvest:
            # https://github.com/mmorise/World/issues/35#issuecomment-306521887
            vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
        else:
            vuv = (lf0 != 0).astype(np.float32)
        # Interpolate lf0 through unvoiced regions for continuous modeling.
        lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)

        # Parameter trajectory smoothing
        if hp_acoustic.mod_spec_smoothing:
            hop_length = int(fs * (hp_acoustic.frame_period * 0.001))
            modfs = fs / hop_length
            mgc = P.modspec_smoothing(
                mgc, modfs, cutoff=hp_acoustic.mod_spec_smoothing_cutoff)

        # Append delta features per stream before stacking.
        mgc = P.delta_features(mgc, hp_acoustic.windows)
        lf0 = P.delta_features(lf0, hp_acoustic.windows)
        bap = P.delta_features(bap, hp_acoustic.windows)

        features = np.hstack((mgc, lf0, vuv, bap))

        # Cut silence frames by HTS alignment
        labels = hts.load(label_path)
        features = features[:labels.num_frames()]
        indices = labels.silence_frame_indices()
        features = np.delete(features, indices, axis=0)

        return features.astype(np.float32)
def main(args):
    """WORLD demo: several analysis/synthesis pipelines over one utterance,
    writing wavs and comparison plots into a fresh ./test directory."""
    # Recreate the output directory from scratch.
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    x, fs = sf.read('utterance/vaiueo2d.wav')

    # 1. The convenient one-call API with default options.
    conv_f0, conv_sp, conv_ap = pw.wav2world(x, fs)
    conv_y = pw.synthesize(conv_f0, conv_sp, conv_ap, fs,
                           pw.default_frame_period)

    # 2-1. Step by step: DIO f0 without refinement.
    raw_f0, t = pw.dio(x, fs, f0_floor=50.0, f0_ceil=600.0,
                       channels_in_octave=2,
                       frame_period=args.frame_period,
                       speed=args.speed)
    raw_sp = pw.cheaptrick(x, raw_f0, t, fs)
    raw_ap = pw.d4c(x, raw_f0, t, fs)
    raw_y = pw.synthesize(raw_f0, raw_sp, raw_ap, fs, args.frame_period)
    sf.write('test/y_without_f0_refinement.wav', raw_y, fs)

    # 2-2. DIO f0 refined with StoneMask.
    ref_f0 = pw.stonemask(x, raw_f0, t, fs)
    ref_sp = pw.cheaptrick(x, ref_f0, t, fs)
    ref_ap = pw.d4c(x, ref_f0, t, fs)
    ref_y = pw.synthesize(ref_f0, ref_sp, ref_ap, fs, args.frame_period)
    sf.write('test/y_with_f0_refinement.wav', ref_y, fs)

    # 2-3. Harvest f0 refined with StoneMask.
    hv_f0_raw, t_h = pw.harvest(x, fs)
    hv_f0 = pw.stonemask(x, hv_f0_raw, t_h, fs)
    hv_sp = pw.cheaptrick(x, hv_f0, t_h, fs)
    hv_ap = pw.d4c(x, hv_f0, t_h, fs)
    hv_y = pw.synthesize(hv_f0, hv_sp, hv_ap, fs, pw.default_frame_period)
    sf.write('test/y_harvest_with_f0_refinement.wav', hv_y, fs)

    # Side-by-side comparison plots (raw DIO vs refined).
    savefig('test/wavform.png', [x, raw_y, ref_y])
    savefig('test/sp.png', [raw_sp, ref_sp])
    savefig('test/ap.png', [raw_ap, ref_ap], log=False)
    savefig('test/f0.png', [raw_f0, ref_f0])

    print('Please check "test" directory for output files')