Esempio n. 1
0
def test_world_array_order():
    """Verify that pyworld rejects ``[::2]``-sliced input arrays.

    The whole analysis/synthesis chain is first run on the loaded waveform
    as-is. Each entry point is then called again with ``[::2]`` slices and
    must raise ``ValueError`` carrying exactly the expected message.
    """
    wav = kwiiyatta.load_wav(dataset.CLB_WAV)

    # Reference pass on the unsliced data; its outputs feed the checks below.
    f0, timeaxis = pyworld.dio(wav.data, wav.fs)
    f0 = pyworld.stonemask(wav.data, f0, timeaxis, wav.fs)
    spec = pyworld.cheaptrick(wav.data, f0, timeaxis, wav.fs)
    ape = pyworld.d4c(wav.data, f0, timeaxis, wav.fs)
    pyworld.synthesize(f0, spec, ape, wav.fs)

    expected_msg = 'ndarray is not C-contiguous'

    def assert_rejected(call):
        # The call must raise ValueError with exactly the expected message.
        with pytest.raises(ValueError) as excinfo:
            call()
        assert expected_msg == str(excinfo.value)

    data = wav.data[::2]  # every other sample

    assert_rejected(lambda: pyworld.dio(data, wav.fs))
    assert_rejected(lambda: pyworld.stonemask(data, f0, timeaxis, wav.fs))
    assert_rejected(lambda: pyworld.cheaptrick(data, f0, timeaxis, wav.fs))
    assert_rejected(lambda: pyworld.d4c(data, f0, timeaxis, wav.fs))
    assert_rejected(
        lambda: pyworld.synthesize(f0[::2], spec[::2], ape[::2], wav.fs))
Esempio n. 2
0
    def main(args):
        """Run the WORLD analysis/synthesis demo and write results to ``test/``.

        The ``test`` directory is recreated from scratch. The input file is
        analysed three ways (DIO only, DIO + StoneMask, Harvest + StoneMask),
        each variant is resynthesised to a WAV file, and comparison figures
        are saved.

        ``args`` must provide ``frame_period`` and ``speed``.
        """
        # Start from an empty output directory.
        if os.path.isdir('test'):
            rmtree('test')
        os.mkdir('test')

        signal, rate = sf.read("./{}.wav".format(edited_files["Thinking_Out_Loud"]))

        # 1. Convenient one-call analysis with default options.
        f0, sp, ap = pw.wav2world(signal, rate)
        y = pw.synthesize(f0, sp, ap, rate, pw.default_frame_period)

        # 2. Step by step
        # 2-1 DIO without F0 refinement
        dio_options = dict(
            f0_floor=50.0,
            f0_ceil=600.0,
            channels_in_octave=2,
            frame_period=args.frame_period,
            speed=args.speed,
        )
        raw_f0, time_axis = pw.dio(signal, rate, **dio_options)
        raw_sp = pw.cheaptrick(signal, raw_f0, time_axis, rate)
        raw_ap = pw.d4c(signal, raw_f0, time_axis, rate)
        raw_y = pw.synthesize(raw_f0, raw_sp, raw_ap, rate, args.frame_period)
        sf.write('test/y_without_f0_refinement.wav', raw_y, rate)

        # 2-2 DIO with F0 refinement (StoneMask)
        f0 = pw.stonemask(signal, raw_f0, time_axis, rate)
        sp = pw.cheaptrick(signal, f0, time_axis, rate)
        ap = pw.d4c(signal, f0, time_axis, rate)
        y = pw.synthesize(f0, sp, ap, rate, args.frame_period)
        sf.write('test/y_with_f0_refinement.wav', y, rate)

        # 2-3 Harvest with F0 refinement (StoneMask)
        harvest_f0, harvest_t = pw.harvest(signal, rate)
        harvest_f0 = pw.stonemask(signal, harvest_f0, harvest_t, rate)
        harvest_sp = pw.cheaptrick(signal, harvest_f0, harvest_t, rate)
        harvest_ap = pw.d4c(signal, harvest_f0, harvest_t, rate)
        harvest_y = pw.synthesize(harvest_f0, harvest_sp, harvest_ap, rate,
                                  pw.default_frame_period)
        sf.write('test/y_harvest_with_f0_refinement.wav', harvest_y, rate)

        # Comparison figures: unrefined vs. refined DIO results.
        savefig('test/wavform.png', [signal, raw_y, y])
        savefig('test/sp.png', [raw_sp, sp])
        savefig('test/ap.png', [raw_ap, ap], log=False)
        savefig('test/f0.png', [raw_f0, f0])

        print('Please check "test" directory for output files')
def main(args):
    """Analyse ``utterance/vaiueo2d.wav`` with WORLD and dump results to ``test/``.

    The ``test`` directory is wiped and recreated, the utterance is analysed
    with DIO (raw and StoneMask-refined) and with Harvest + StoneMask, every
    variant is resynthesised to a WAV file, and comparison plots are saved.

    ``args`` must provide ``frame_period`` and ``speed``.
    """
    # Start from an empty output directory.
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    wave, sample_rate = sf.read('utterance/vaiueo2d.wav')

    # 1. Convenient one-call analysis with default options.
    f0, sp, ap = pw.wav2world(wave, sample_rate)
    y = pw.synthesize(f0, sp, ap, sample_rate, pw.default_frame_period)

    # 2. Step by step
    # 2-1 DIO without F0 refinement
    f0_dio, t_dio = pw.dio(
        wave,
        sample_rate,
        f0_floor=50.0,
        f0_ceil=600.0,
        channels_in_octave=2,
        frame_period=args.frame_period,
        speed=args.speed,
    )
    sp_dio = pw.cheaptrick(wave, f0_dio, t_dio, sample_rate)
    ap_dio = pw.d4c(wave, f0_dio, t_dio, sample_rate)
    y_dio = pw.synthesize(f0_dio, sp_dio, ap_dio, sample_rate,
                          args.frame_period)
    sf.write('test/y_without_f0_refinement.wav', y_dio, sample_rate)

    # 2-2 DIO with F0 refinement (StoneMask)
    f0 = pw.stonemask(wave, f0_dio, t_dio, sample_rate)
    sp = pw.cheaptrick(wave, f0, t_dio, sample_rate)
    ap = pw.d4c(wave, f0, t_dio, sample_rate)
    y = pw.synthesize(f0, sp, ap, sample_rate, args.frame_period)
    sf.write('test/y_with_f0_refinement.wav', y, sample_rate)

    # 2-3 Harvest with F0 refinement (StoneMask)
    f0_hv, t_hv = pw.harvest(wave, sample_rate)
    f0_hv = pw.stonemask(wave, f0_hv, t_hv, sample_rate)
    sp_hv = pw.cheaptrick(wave, f0_hv, t_hv, sample_rate)
    ap_hv = pw.d4c(wave, f0_hv, t_hv, sample_rate)
    y_hv = pw.synthesize(f0_hv, sp_hv, ap_hv, sample_rate,
                         pw.default_frame_period)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_hv, sample_rate)

    # Comparison figures: unrefined vs. refined DIO results.
    savefig('test/wavform.png', [wave, y_dio, y])
    savefig('test/sp.png', [sp_dio, sp])
    savefig('test/ap.png', [ap_dio, ap], log=False)
    savefig('test/f0.png', [f0_dio, f0])

    print('Please check "test" directory for output files')
 def extractparam(self):
     """Load the globally selected audio file and store its WORLD parameters.

     Reads the module-level ``audio_file_name`` and writes the waveform,
     sample rate, DIO F0, CheapTrick envelope and D4C aperiodicity into the
     module-level names ``x``, ``fs``, ``_f0``, ``_sp`` and ``_ap``.
     """
     global audio_file_name, x, fs, _f0, _sp, _ap
     x, fs = sf.read(audio_file_name)
     # The DIO time axis is only needed locally by the two estimators below.
     _f0, frame_times = pw.dio(x, fs, f0_floor=50.0, f0_ceil=600.0)
     _sp = pw.cheaptrick(x, _f0, frame_times, fs)
     _ap = pw.d4c(x, _f0, frame_times, fs)
     print("done")
Esempio n. 5
0
def process_wav(wav_path):
    """Extract WORLD features from an audio file after resampling to 32 kHz.

    The file is read with explicit raw-format options (16-bit PCM, mono,
    48 kHz, little endian), resampled to 32 kHz if needed, and analysed with
    Harvest + StoneMask (F0), CheapTrick (spectral envelope) and D4C
    (aperiodicity). Array shapes are printed as progress information.

    Parameters
    ----------
    wav_path : str
        Path of the audio file to read.

    Returns
    -------
    tuple
        ``(_f0, _sp, code_sp, _ap, code_ap)``: refined F0, spectral envelope,
        the envelope coded by ``code_harmonic(_sp, 60)``, aperiodicity, and
        the aperiodicity coded by ``pw.code_aperiodicity``.
    """
    y, osr = sf.read(wav_path, subtype='PCM_16', channels=1, samplerate=48000,
                    endian='LITTLE')

    sr = 32000
    if osr != sr:
        # Sample rates are passed by keyword: librosa >= 0.10 made them
        # keyword-only, and older releases accept the same names.
        y = librosa.resample(y, orig_sr=osr, target_sr=sr)

    # Estimate the fundamental frequency (F0) with Harvest, then refine it.
    _f0, t = pw.harvest(y, sr, f0_floor=f0_min, f0_ceil=f0_max, frame_period=pw.default_frame_period)
    _f0 = pw.stonemask(y, _f0, t, sr)
    print(_f0.shape)

    # Compute the spectral envelope with CheapTrick.
    _sp = pw.cheaptrick(y, _f0, t, sr)

    code_sp = code_harmonic(_sp, 60)
    print(_sp.shape, code_sp.shape)

    # Compute the aperiodicity with D4C.
    _ap = pw.d4c(y, _f0, t, sr)

    code_ap = pw.code_aperiodicity(_ap, sr)
    print(_ap.shape, code_ap.shape)

    return _f0, _sp, code_sp, _ap, code_ap
Esempio n. 6
0
def pyworld_featurize(audiofile):
    """Compute summary statistics of WORLD features for one audio file.

    The file is read with ``wav.read``, reduced to its first channel when it
    has more than one, converted to a contiguous float64 array and analysed
    with DIO, StoneMask, CheapTrick and D4C. ``stats`` is then applied to the
    raw pitch, the refined pitch, the spectral envelope and the aperiodicity.

    Parameters
    ----------
    audiofile : str
        Path of the WAV file to featurize.

    Returns
    -------
    tuple
        ``(features, labels)``: the concatenated feature values and their
        labels, in the order pitch, pitch_refinement, smoothed_spectrogram,
        aperiodicity.
    """
    fs, x = wav.read(audiofile)
    print(x)
    print(fs)
    # Multi-channel audio: keep the first channel only.
    if x.ndim > 1:
        x = x[:, 0]
    x = np.ascontiguousarray(x, dtype=np.double)
    print(fs)
    print(x)

    _f0, t = pw.dio(x, fs)  # raw pitch extractor
    f0 = pw.stonemask(x, _f0, t, fs)  # pitch refinement
    sp = pw.cheaptrick(x, f0, t, fs)  # extract smoothed spectrogram
    ap = pw.d4c(x, f0, t, fs)  # extract aperiodicity

    features_0, labels_0 = stats(_f0, 'pitch')
    # The refined contour (f0), not the raw one, backs 'pitch_refinement'.
    features_1, labels_1 = stats(f0, 'pitch_refinement')
    features_2, labels_2 = stats(sp, 'smoothed_spectrogram')
    features_3, labels_3 = stats(ap, 'aperiodicity')

    features = (list(features_0) + list(features_1)
                + list(features_2) + list(features_3))
    labels = labels_0 + labels_1 + labels_2 + labels_3

    return features, labels
Esempio n. 7
0
    def analyze(self, x):
        """Extract WORLD acoustic features from a waveform.

        F0 is estimated with Harvest using this object's ``fs``, ``minf0``,
        ``maxf0`` and ``shiftms``; the spectral envelope (CheapTrick) and
        aperiodicity (D4C) are then computed with FFT size ``fftl``.

        Parameters
        ----------
        x : array
            Speech signal in the time domain.

        Returns
        -------
        f0 : array
            F0 sequence returned by Harvest.
        spc : array
            Spectral envelope sequence.
        ap : array
            Aperiodicity sequence, same shape as ``spc``.

        """
        harvest_options = dict(
            f0_floor=self.minf0,
            f0_ceil=self.maxf0,
            frame_period=self.shiftms,
        )
        pitch, frame_times = pyworld.harvest(x, self.fs, **harvest_options)

        envelope = pyworld.cheaptrick(
            x, pitch, frame_times, self.fs, fft_size=self.fftl)
        aperiodicity = pyworld.d4c(
            x, pitch, frame_times, self.fs, fft_size=self.fftl)

        # Both estimators share the FFT size, so their outputs must line up.
        assert envelope.shape == aperiodicity.shape
        return pitch, envelope, aperiodicity
Esempio n. 8
0
def anonymization(fs, waveNDArray, f0Value = 0, sp_strechRatio = np.random.uniform(0.6, 2, size=1), gaussian_s = 3):
    """
    Re-synthesize a waveform with its F0 replaced by a constant value.

    The waveform is analysed with WORLD (DIO, StoneMask, CheapTrick, D4C) and
    synthesized again with every frame's F0 set to ``f0Value``. A stretched,
    noised and Gaussian-smoothed copy of the envelopes is also computed.

    NOTE(review): the default ``sp_strechRatio`` is sampled once, when the
    function is defined, so every call that omits it shares the same ratio.
    Confirm whether a fresh ratio per call was intended.

    :param fs: sampling rate of ``waveNDArray``
    :param waveNDArray: waveform samples (converted to float64 internally)
    :param f0Value: constant F0 assigned to every frame
    :param sp_strechRatio: frequency-axis stretch ratio (scalar or size-1 array)
    :param gaussian_s: sigma passed to the Gaussian filter
    :return: the synthesized waveform
    """
    # np.float was removed in NumPy 1.24; float64 is the dtype it aliased.
    waveNDArray = waveNDArray.astype(np.float64)
    _f0, t = pw.dio(waveNDArray, fs)  # fundamental frequency extraction
    f0 = pw.stonemask(waveNDArray, _f0, t, fs)  # fundamental frequency refinement
    sp = pw.cheaptrick(waveNDArray, f0, t, fs)  # spectral envelope extraction
    ap = pw.d4c(waveNDArray, f0, t, fs)  # aperiodicity extraction
    f0_fixed0 = np.ones(f0.shape) * f0Value
    sp_median = np.median(sp)
    ap_median = np.median(ap)

    # Stretch the spectral envelope along the frequency axis. The ratio is
    # unwrapped to a Python scalar first: int() on a size-1 array is
    # deprecated in recent NumPy releases.
    stretch = np.asarray(sp_strechRatio).item()
    n_bins = sp.shape[1]
    sp2 = np.ones_like(sp)*np.min(sp)
    for f in range(n_bins):
        src = int(f / stretch)
        if src >= n_bins:
            break
        sp2[:, f] = sp[:, src]

    # Add normally distributed noise to the envelope and the aperiodicity.
    sp_noised = sp2 + np.random.normal(sp_median,sp_median/10,sp2.shape)
    ap_noised = ap + np.random.normal(ap_median,ap_median/10,ap.shape)

    # Gaussian smoothing (scipy.ndimage.filters is a deprecated namespace).
    sp_gaussian = scipy.ndimage.gaussian_filter(sp_noised,gaussian_s)
    ap_gaussian = scipy.ndimage.gaussian_filter(ap_noised,gaussian_s)

    # Re-synthesis.
    # NOTE(review): the original sp/ap are used here, so sp_gaussian and
    # ap_gaussian never reach the output. Confirm whether the processed
    # envelopes were meant to be synthesized instead.
    synthesized = pw.synthesize(f0_fixed0, sp, ap, fs)
    return synthesized
Esempio n. 9
0
def get_conversion_data(audiodata, fs, refine_f0):
    """
    Extract WORLD conversion features (sp, ap, f0) for each waveform.

    :param audiodata: iterable of waveforms to analyse
    :param fs: sampling rate shared by all waveforms
    :param refine_f0: when true, refine the DIO F0 with StoneMask
    :return: list with one dict per waveform holding the keys
             'sp', 'ap', 'f0', 'fs' and 'sr' (the last two both hold ``fs``)
    """
    logging.info("Start building speaker A dictionary: Extracting feature for conversion (sp, ap, f0)")

    collected = []
    for waveform in tqdm(audiodata):
        raw_f0, time_axis = pw.dio(waveform, fs)  # raw pitch extractor

        # Optional pitch refinement
        f0 = pw.stonemask(waveform, raw_f0, time_axis, fs) if refine_f0 else raw_f0

        envelope = pw.cheaptrick(waveform, f0, time_axis, fs)  # smoothed spectrogram
        aperiodicity = pw.d4c(waveform, f0, time_axis, fs)

        collected.append(dict(sp=envelope, ap=aperiodicity, f0=f0, fs=fs, sr=fs))

    return collected
Esempio n. 10
0
 def extract_spectrum(self, x, sample_rate):
     """Return ``(sp, ap, f0)`` WORLD features of ``x`` at a 12.5 ms frame period.

     F0 comes from DIO refined by StoneMask; ``sp`` is the CheapTrick
     spectral envelope and ``ap`` the D4C aperiodicity.
     """
     samples = np.asarray(x)
     raw_f0, frame_times = pw.dio(samples, sample_rate, frame_period=12.5)
     f0 = pw.stonemask(samples, raw_f0, frame_times, sample_rate)
     envelope = pw.cheaptrick(samples, f0, frame_times, sample_rate)
     aperiodicity = pw.d4c(samples, f0, frame_times, sample_rate)
     return envelope, aperiodicity, f0
Esempio n. 11
0
def worldDecompose(
        wave: np.ndarray,
        fs: int = SAMPLE_RATE,
        frame_period: float = 5.) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    '''
    Decompose a waveform into F0, spectral envelope and aperiodicity with WORLD.

    Parameters
    ----------
    wave: np.ndarray
        Waveform samples.
    fs: int, default SAMPLE_RATE
        Sampling frequency.
    frame_period: float, default 5.
        Interval between analysis frames, passed to Harvest.

    Returns
    -------
    f0: np.ndarray
        Fundamental frequency estimated by Harvest (search range 71-800).
    sp: np.ndarray
        Spectral envelope (CheapTrick).
    ap: np.ndarray
        Aperiodicity (D4C).
    '''
    samples = wave.astype(np.float64)
    pitch, frame_times = pyworld.harvest(
        samples, fs, frame_period=frame_period, f0_floor=71., f0_ceil=800.)
    envelope = pyworld.cheaptrick(samples, pitch, frame_times, fs)
    aperiodicity = pyworld.d4c(samples, pitch, frame_times, fs)
    return pitch, envelope, aperiodicity
Esempio n. 12
0
def collect_features(x, fs):
    """Return mel-cepstral coefficients of waveform ``x`` sampled at ``fs``.

    F0 is estimated with DIO + StoneMask and the CheapTrick envelope is
    converted with ``pysptk.sp2mc``. ``frame_period``, ``order`` and
    ``alpha`` are read from module scope.
    """
    samples = x.astype(np.float64)
    raw_f0, frame_times = pyworld.dio(samples, fs, frame_period=frame_period)
    refined_f0 = pyworld.stonemask(samples, raw_f0, frame_times, fs)
    envelope = pyworld.cheaptrick(samples, refined_f0, frame_times, fs)
    return pysptk.sp2mc(envelope, order=order, alpha=alpha)
Esempio n. 13
0
def world_decompose(wav, fs, frame_period = 5.0):
    """Split a speech signal into WORLD parameters.

    Returns ``(f0, timeaxis, sp, ap)``: Harvest F0 (search range 71-800) with
    its time axis, the CheapTrick spectral envelope and the D4C aperiodicity.
    """
    samples = wav.astype(np.float64)
    harvest_options = dict(frame_period=frame_period, f0_floor=71.0, f0_ceil=800.0)
    f0, timeaxis = pyworld.harvest(samples, fs, **harvest_options)
    envelope = pyworld.cheaptrick(samples, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(samples, f0, timeaxis, fs)
    return f0, timeaxis, envelope, aperiodicity
Esempio n. 14
0
def extract_sp_world(x, f0, sr, hoptime, fft_size=None):
    """Compute the WORLD CheapTrick spectral envelope of ``x`` in decibels.

    Parameters
    ----------
    x : array
        Waveform handed to ``pyworld.cheaptrick``.
    f0 : array
        F0 contour; squeezed and converted to a C-contiguous float64 array.
    sr : int
        Sampling rate of ``x``.
    hoptime : float
        Spacing between frames; frame ``i`` is placed at ``i * hoptime``.
    fft_size : int, optional
        FFT size forwarded to CheapTrick. ``None`` keeps pyworld's default.

    Returns
    -------
    array
        ``10 * log10`` of the CheapTrick power spectral envelope.

    Raises
    ------
    ValueError
        If the CheapTrick output contains non-finite values.
    """
    # NOTE:
    # F0 values too low for the used FFT size are handled as unvoiced in
    # CheapTrick.

    # pyworld requires a C-contiguous float64 array
    f0 = np.ascontiguousarray(f0.squeeze(), dtype=np.float64)

    n_frames = f0.shape[0]
    t = np.arange(n_frames) * hoptime

    # Forward fft_size: it used to be accepted by this function but ignored.
    sp = pyworld.cheaptrick(x, f0, t, sr, fft_size=fft_size)

    if not np.all(np.isfinite(sp)):
        # NOTE: This seems to happen occasionally, e.g. taking 16kHz TIMIT
        # audio, upsampling it to 32kHz and performing WORLD analysis
        raise ValueError(
            'Configuration or input signal caused NaNs in WORLD CheapTrick analysis'
        )

    return 10 * np.log10(sp)  # power to decibels
Esempio n. 15
0
def world_spectrogram(wav, sr=_sr, dim_num=32, **kwargs):
    """Convert a waveform into coded WORLD vocoder features.

    Parameters
    ----------
    wav : array
        Waveform; converted to double precision before analysis.
    sr : int
        Sampling rate.
    dim_num : int
        Number of dimensions of the coded spectral envelope.
    **kwargs
        Optional analysis settings: ``frame_period``, ``f0_floor``,
        ``f0_ceil``, ``fft_size``, ``ap_threshold`` (default 0.85) and
        ``f0_extractor`` (``"dio"`` by default, ``"harvest"``, or a callable
        taking ``(x, sr, f0_floor=..., f0_ceil=..., frame_period=...)`` and
        returning ``(f0, t)``).

    Returns
    -------
    tuple
        ``(f0, sp_enc, ap_enc)``: F0, coded spectral envelope and coded
        aperiodicity.
    """
    # Analysis parameters
    frame_period = kwargs.get("frame_period", pw.default_frame_period)
    f0_floor = kwargs.get("f0_floor", pw.default_f0_floor)
    f0_ceil = kwargs.get("f0_ceil", pw.default_f0_ceil)
    fft_size = kwargs.get("fft_size", pw.get_cheaptrick_fft_size(sr, f0_floor))
    ap_threshold = kwargs.get("ap_threshold", 0.85)
    f0_extractor = kwargs.get("f0_extractor", "dio")
    x = wav.astype(np.double)

    # Estimate F0. frame_period is forwarded on every branch so that a
    # caller-supplied value is honoured for DIO as well as for Harvest.
    if f0_extractor == "dio":
        f0, t = pw.dio(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil, frame_period=frame_period)
    elif f0_extractor == "harvest":
        f0, t = pw.harvest(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil, frame_period=frame_period)
    else:
        f0, t = f0_extractor(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil, frame_period=frame_period)

    # Spectral envelope (CheapTrick), then dimensionality reduction
    sp = pw.cheaptrick(x, f0, t, sr, f0_floor=f0_floor, fft_size=fft_size)
    sp_enc = pw.code_spectral_envelope(sp, sr, number_of_dimensions=dim_num)

    # Aperiodicity (D4C), then dimensionality reduction
    ap = pw.d4c(x, f0, t, sr, threshold=ap_threshold, fft_size=fft_size)
    ap_enc = pw.code_aperiodicity(ap, sr)
    return f0, sp_enc, ap_enc
Esempio n. 16
0
def synthesis(ori_path, aim_sp, aim_spkid):
    """Synthesize a converted utterance and write it to disk.

    F0 (Harvest) and aperiodicity (D4C) are taken from the audio at
    ``ori_path``; the spectral envelope is decoded from ``aim_sp``. The
    result is written to ``./convert_to_{aim_spkid}_test1.wav``.

    Parameters
    ----------
    ori_path : str
        Path of the source audio, loaded at ``hp.SR`` as mono float64.
    aim_sp : array
        Coded spectral envelope, decoded with ``hp.N_FFT``.
    aim_spkid
        Identifier interpolated into the output file name.
    """
    print('synthesizing ...')
    wav, _ = librosa.load(ori_path, sr=hp.SR, mono=True, dtype=np.float64)
    f0, timeaxis = pw.harvest(wav, hp.SR)

    ap = pw.d4c(wav, f0, timeaxis, hp.SR, fft_size=hp.N_FFT)
    # Decode the converted envelope back to the full spectral resolution.
    aim_decoded_sp = pw.decode_spectral_envelope(
        aim_sp, hp.SR, fft_size=hp.N_FFT)
    print('解码后的513维度的aim_decoded_sp = ')
    print(aim_decoded_sp.shape)
    # Debug dump of one frame; skipped for short inputs, where indexing
    # frame 399 unconditionally would raise IndexError.
    debug_frame = 399
    if aim_decoded_sp.shape[0] > debug_frame:
        print(aim_decoded_sp[debug_frame][:])

    synwav = pw.synthesize(f0, aim_decoded_sp, ap, hp.SR)
    print(f'synthesize done. path : ./convert_to_{aim_spkid}_test1.wav')
    # NOTE(review): librosa.output was removed in librosa 0.8; this call
    # needs an older librosa or a different writer.
    librosa.output.write_wav(f'./convert_to_{aim_spkid}_test1.wav',
                             synwav,
                             sr=hp.SR)
Esempio n. 17
0
 def collect_features(self, path):
     """Read the WAV file at ``path`` and return its mel-cepstrum.

     F0 comes from DIO (no refinement) and the envelope from CheapTrick.
     ``frame_period``, ``order`` and ``alpha`` are read from module scope.
     """
     fs, samples = wavfile.read(path)
     samples = samples.astype(np.float64)
     pitch, frame_times = pyworld.dio(samples, fs, frame_period=frame_period)
     envelope = pyworld.cheaptrick(samples, pitch, frame_times, fs)
     return pysptk.sp2mc(envelope, order=order, alpha=alpha)
Esempio n. 18
0
def cal_mcep(wav_ori, fs=SAMPLE_RATE, ispad=False, frame_period=0.005, dim=FEATURE_DIM, fft_size=FFTSIZE):
    '''Compute WORLD features and the coded spectral envelope of a waveform.

    ``frame_period`` is only forwarded to ``pad_wav_to_get_fixed_frames``
    (used when ``ispad`` is true); Harvest runs with its own default.

    Returns a dict with keys ``'f0'``, ``'ap'``, ``'sp'`` and ``'coded_sp'``,
    where ``'coded_sp'`` is the transposed coded spectral envelope.
    '''
    wav = wav_ori
    if ispad:
        # Only the padded waveform is needed; the reported pad length is not.
        wav, _pad_length = pad_wav_to_get_fixed_frames(
            wav_ori, frames=FRAMES, frame_period=frame_period, sr=fs)

    # Harvest F0 extraction.
    pitch, frame_times = pyworld.harvest(wav, fs)
    # CheapTrick harmonic spectral envelope estimation.
    envelope = pyworld.cheaptrick(wav, pitch, frame_times, fs, fft_size=fft_size)
    # D4C aperiodicity estimation.
    aperiodicity = pyworld.d4c(wav, pitch, frame_times, fs, fft_size=fft_size)
    # Feature reduction, then transposed for the caller.
    coded = pyworld.code_spectral_envelope(envelope, fs, dim).T

    return dict(f0=pitch, ap=aperiodicity, sp=envelope, coded_sp=coded)
Esempio n. 19
0
def get_para(data, fs):
    """Return ``(fo, sp, ap)`` WORLD parameters of ``data`` sampled at ``fs``.

    Step-by-step analysis: DIO, StoneMask, CheapTrick, D4C.
    """
    raw_fo, frame_times = pw.dio(data, fs)                 # F0 extraction
    fo = pw.stonemask(data, raw_fo, frame_times, fs)       # F0 refinement
    envelope = pw.cheaptrick(data, fo, frame_times, fs)    # spectral envelope
    aperiodicity = pw.d4c(data, fo, frame_times, fs)       # aperiodicity
    return fo, envelope, aperiodicity
    def formant(self, val, f0_v):
        '''
            Change formant.

            The audio held in ``self.audio`` is analysed with WORLD, its
            spectral envelope is rescaled along the frequency axis by
            ``val``, its F0 is multiplied by ``f0_v``, and the re-synthesized
            result replaces ``self.audio``.

            val : formant rate
            f0_v: f0 rate
            Returns ``self`` so calls can be chained.
        '''
        f_rate = self.audio.frame_rate
        # pydub samples --> np.array(float64)
        np_arr = np.array(self.audio.get_array_of_samples(),
                          dtype=np.float64)
        _f0_val, _time = pyworld.dio(np_arr, f_rate)  # fundamental frequency
        spct = pyworld.cheaptrick(np_arr, _f0_val, _time, f_rate)  # spectral envelope
        aper = pyworld.d4c(np_arr, _f0_val, _time, f_rate)  # aperiodicity

        n_bins = spct.shape[1]
        spct_b = np.zeros_like(spct)
        for i in range(n_bins):
            # Clamp the source bin: for val < 1 the scaled index would run
            # past the last bin and raise IndexError.
            spct_b[:, i] = spct[:, min(int(i / val), n_bins - 1)]

        ef_audio = pyworld.synthesize(_f0_val * f0_v, spct_b, aper, f_rate)
        # Saturate to the int16 range before casting; a bare astype would
        # wrap out-of-range samples around.
        int16_range = np.iinfo(np.int16)
        ef_audio = np.clip(ef_audio, int16_range.min, int16_range.max)
        ef_audio = ef_audio.astype(np.int16).tobytes()

        new_audio = AudioSegment(
            ef_audio,
            sample_width=self.audio.sample_width,
            frame_rate=f_rate,
            channels=self.audio.channels,
        )
        self.audio = new_audio
        return self
Esempio n. 21
0
    def extract(cls, wave: Wave, frame_period, f0_floor, f0_ceil, fft_length, order, alpha, dtype):
        """Build an ``AcousticFeature`` from ``wave`` using WORLD and pysptk.

        F0 comes from ``cls.extract_f0``; the spectral envelope and
        aperiodicity are computed with FFT size ``fft_length``; the
        mel-cepstrum uses ``order`` and ``alpha``. When the sample count is
        not a multiple of ``fft_length`` the last frame of every feature is
        dropped. Float features are cast via ``astype_only_float(dtype)``.
        """
        fs = wave.sampling_rate
        x = wave.wave.astype(numpy.float64)

        f0, t = cls.extract_f0(
            x=x, fs=fs, frame_period=frame_period,
            f0_floor=f0_floor, f0_ceil=f0_ceil)
        sp = pyworld.cheaptrick(x, f0, t, fs, fft_size=fft_length)
        ap = pyworld.d4c(x, f0, t, fs, fft_size=fft_length)

        mc = pysptk.sp2mc(sp, order=order, alpha=alpha)
        coded_ap = pyworld.code_aperiodicity(ap, fs)
        voiced: numpy.ndarray = numpy.logical_not(f0 == 0)

        if len(x) % fft_length > 0:
            # Drop the trailing frame from every per-frame array.
            f0, t, sp, ap, mc, coded_ap, voiced = (
                arr[:-1] for arr in (f0, t, sp, ap, mc, coded_ap, voiced))

        return AcousticFeature(
            f0=f0[:, None],
            sp=sp,
            ap=ap,
            coded_ap=coded_ap,
            mc=mc,
            voiced=voiced[:, None],
        ).astype_only_float(dtype)
Esempio n. 22
0
def wave2world(data):
    """
    Analyse ``data`` with WORLD at a fixed 44100 Hz rate and 10 ms frames.

    Parameters
    ----------
    data : array
        Waveform handed to pyworld (the analysis assumes 44100 Hz).
    Returns
    -------
    _f0 : array
        DIO F0 refined by StoneMask.
    _cepstrum : float32 array
        CheapTrick envelope mapped through ``(log(sp) + 7) / 9`` and clipped
        to [-1.0, 1.0].
    _aperiodicity : array
        D4C aperiodicity.
    """
    sampling_rate = 44100
    raw_f0, frame_times = pw.dio(data, sampling_rate, frame_period=10)
    pitch = pw.stonemask(data, raw_f0, frame_times, sampling_rate)

    envelope = pw.cheaptrick(data, pitch, frame_times, sampling_rate)
    # Log-compress and rescale, then clamp to [-1, 1].
    scaled = np.clip((np.log(envelope) + 7) / 9, -1.0, 1.0)

    aperiodicity = pw.d4c(data, pitch, frame_times, sampling_rate)
    return pitch, scaled.astype(np.float32), aperiodicity
Esempio n. 23
0
def world_features(wav, sr, fft_size, dim):
    """Return ``(f0, timeaxis, sp, ap, coded_sp)`` for ``wav``.

    Harvest provides F0 and the time axis; CheapTrick and D4C run with
    ``fft_size``; the envelope is coded down to ``dim`` dimensions.
    """
    pitch, frame_times = pyworld.harvest(wav, sr)
    envelope = pyworld.cheaptrick(wav, pitch, frame_times, sr, fft_size=fft_size)
    aperiodicity = pyworld.d4c(wav, pitch, frame_times, sr, fft_size=fft_size)
    coded_envelope = pyworld.code_spectral_envelope(envelope, sr, dim)
    return pitch, frame_times, envelope, aperiodicity, coded_envelope
def world_decompose(wav, fs):
    """Decompose ``wav`` into ``(f0, timeaxis, sp, ap)`` with WORLD.

    Harvest runs with an F0 search range of 71-800 and a frame period of
    ``hp.duration``.
    """
    samples = wav.astype(np.float64)
    pitch, frame_times = pyworld.harvest(
        samples, fs, f0_floor=71.0, f0_ceil=800.0, frame_period=hp.duration)
    envelope = pyworld.cheaptrick(samples, pitch, frame_times, fs)
    aperiodicity = pyworld.d4c(samples, pitch, frame_times, fs)
    return pitch, frame_times, envelope, aperiodicity
Esempio n. 25
0
def convertWavIntoF0seqMCEPseq(wav, fs, frame_period=5.0, MCEPdim=24):
    """
    Extract an F0 sequence and a coded spectral envelope from one waveform.

    Args:
        wav (np.ndarray): waveform
        fs: sampling rate
        frame_period (float): frame period passed to Harvest
        MCEPdim (int): number of dimensions of the coded spectral envelope

    Returns:
        tuple: the Harvest F0 sequence, and the coded envelope transposed and
        cast to float32
    """
    wav = wav.astype(np.float64)  # pyworld is given float64 samples
    harvest_options = dict(frame_period=frame_period, f0_floor=71.0, f0_ceil=800.0)
    f0seq, frame_times = pyworld.harvest(wav, fs, **harvest_options)
    envelope = pyworld.cheaptrick(wav, f0seq, frame_times, fs)
    MCEPseq = pyworld.code_spectral_envelope(envelope, fs, MCEPdim)
    print(
        f"F0&MCEP-nized! {wav.shape[0] / fs} [sec] wav => {f0seq.shape}, {MCEPseq.shape}"
    )
    return f0seq, MCEPseq.T.astype(np.float32)
Esempio n. 26
0
def wavfile2pw(filename, f0_ceil=F0_CEIL, fs=FS, fft_size=FFT_SIZE):
    """Analyse a WAV file with PyWorld and return energy-normalized features.

    The file is loaded with librosa at ``fs`` as mono float64. F0 is
    estimated with DIO (upper bound ``f0_ceil``) and refined by StoneMask.
    The spectral envelope is divided by its per-frame energy and returned on
    a log10 scale.

    Args:
      filename: the wav file
      f0_ceil: maximum F0 passed to DIO
      fs: sampling frequency used when loading the file
      fft_size: FFT size for CheapTrick and D4C

    Returns:
      f0: refined fundamental frequency
      sp: log10 of the energy-normalized spectral envelope
      ap: aperiodicity
      en: per-frame energy, the sum over bins of ``sp + EPSILON``
    """
    samples, _ = librosa.load(filename, sr=fs, mono=True, dtype=np.float64)
    raw_f0, frame_times = pw.dio(samples, fs, f0_ceil=f0_ceil)
    f0 = pw.stonemask(samples, raw_f0, frame_times, fs)
    envelope = pw.cheaptrick(samples, f0, frame_times, fs, fft_size=fft_size)
    ap = pw.d4c(samples, f0, frame_times, fs, fft_size=fft_size)
    # Per-frame energy, kept 2-D so it broadcasts in the division below.
    en = np.sum(envelope + EPSILON, axis=1, keepdims=True)
    return f0, np.log10(envelope / en), ap, en
Esempio n. 27
0
def generate_changed_voice(model, input_path):
    """Convert the voice in ``input_path`` with ``model`` and return the waveform.

    The file is analysed with WORLD, its mel-cepstrum (minus the 0-th
    coefficient) is smoothed, extended with delta features and passed to
    ``model.predict``. The prediction is recombined with the original 0-th
    coefficient, turned back into a spectrogram and synthesized with the
    original F0 and aperiodicity.
    """
    fs, samples = wavfile.read(input_path)
    samples = samples.astype(np.float64)
    if samples.ndim > 1:
        # Average the channels down to mono.
        samples = samples.mean(axis=1)

    # WORLD analysis
    pitch, frame_times = pyworld.dio(samples, fs, frame_period=hp.frame_period)
    pitch = pyworld.stonemask(samples, pitch, frame_times, fs)
    envelope = pyworld.cheaptrick(samples, pitch, frame_times, fs)
    aperiodicity = pyworld.d4c(samples, pitch, frame_times, fs)

    # Mel-cepstrum, with the 0-th coefficient set aside
    alpha = pysptk.util.mcepalpha(fs)
    mcep = pysptk.sp2mc(envelope, order=hp.order, alpha=alpha)
    c0, mcep = mcep[:, 0], mcep[:, 1:]

    # NOTE(review): the modulation frequency comes from the module constants
    # FS / HOP_LENGHT rather than from this file's fs - confirm they agree.
    mcep = P.modspec_smoothing(mcep, FS / HOP_LENGHT, cutoff=50)
    mcep = P.delta_features(mcep, hp.windows).astype(np.float32)

    predicted = model.predict(mcep)
    predicted = np.hstack([c0[:, None], predicted])

    # Back to a spectrogram, then to a waveform
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    converted_envelope = pysptk.mc2sp(
        predicted.astype(np.float64), alpha=alpha, fftlen=fftlen)
    return pyworld.synthesize(
        pitch, converted_envelope, aperiodicity, fs, hp.frame_period)
Esempio n. 28
0
    def collect_features(self, wav_path, label_path):
        """Return float32 acoustic features for one utterance.

        WORLD analysis yields F0, spectral envelope and aperiodicity. These
        become mel-cepstrum, interpolated log-F0, a voiced/unvoiced flag and
        coded aperiodicity, which are stacked in that order after delta
        windows are applied. Frames beyond the label length and frames the
        HTS labels mark as silence are removed.

        ``frame_period``, ``order`` and ``windows`` are read from module scope.
        """
        fs, samples = wavfile.read(wav_path)
        samples = samples.astype(np.float64)

        # WORLD analysis
        f0, frame_times = pyworld.dio(samples, fs, frame_period=frame_period)
        f0 = pyworld.stonemask(samples, f0, frame_times, fs)
        envelope = pyworld.cheaptrick(samples, f0, frame_times, fs)
        aperiodicity = pyworld.d4c(samples, f0, frame_times, fs)

        bap = pyworld.code_aperiodicity(aperiodicity, fs)
        mgc = pysptk.sp2mc(
            envelope, order=order, alpha=pysptk.util.mcepalpha(fs))

        # Log-F0 on frames with non-zero F0; other frames stay at zero so
        # the voicing flag can be derived before interpolation.
        f0 = f0[:, None]
        lf0 = f0.copy()
        has_pitch = f0 != 0
        lf0[has_pitch] = np.log(f0[has_pitch])
        vuv = (lf0 != 0).astype(np.float32)
        lf0 = interp1d(lf0, kind="slinear")

        # Delta windows are applied to every stream except the voicing flag.
        mgc, lf0, bap = (
            apply_delta_windows(stream, windows) for stream in (mgc, lf0, bap))

        stacked = np.hstack((mgc, lf0, vuv, bap))

        # Cut silence frames by HTS alignment
        labels = hts.load(label_path)
        stacked = stacked[:labels.num_frames()]
        silent = labels.silence_frame_indices()
        stacked = np.delete(stacked, silent, axis=0)

        return stacked.astype(np.float32)
def raw2WORLDfeatures(signal, fs=16000, fft_size=1024):
    """Return ``(f0, spectra, aperiodicity)`` WORLD features of ``signal``.

    F0 is estimated by DIO with an upper bound of 500 and refined with
    StoneMask; CheapTrick and D4C both use ``fft_size``.
    """
    raw_f0, frame_times = pw.dio(signal, fs, f0_ceil=500)  # raw pitch contour
    pitch = pw.stonemask(signal, raw_f0, frame_times, fs)  # pitch refinement
    envelope = pw.cheaptrick(signal, pitch, frame_times, fs, fft_size=fft_size)
    aperiodic = pw.d4c(signal, pitch, frame_times, fs, fft_size=fft_size)
    return pitch, envelope, aperiodic
Esempio n. 30
0
def _resample_bins(frames, n_out):
    """Linearly resample every row of ``frames`` to ``n_out`` points."""
    n_bins = frames.shape[1]
    positions = np.linspace(0, n_bins, n_out)
    bins = np.arange(n_bins)
    return np.concatenate(
        [np.interp(positions, bins, row)[np.newaxis, :] for row in frames],
        axis=0)


def get_target(x, fs, n_ap_channels, n_sp_channels, f0):
    """Build aperiodicity/envelope targets from ``x`` and an external F0 track.

    DIO (F0 range 75-1000, 8 ms frames) supplies the time axis and the
    voicing decision: frames where DIO reports F0 below 1.0 are zeroed in the
    supplied ``f0``. CheapTrick and D4C run on that F0, then each frame is
    linearly resampled along the frequency axis.

    Parameters
    ----------
    x : array
        Waveform handed to pyworld.
    fs : int
        Sampling rate.
    n_ap_channels : int
        Number of output channels for the aperiodicity.
    n_sp_channels : int
        Number of output channels for the log spectral envelope.
    f0 : array
        External F0 track; only its first ``len(dio_f0)`` values are used.
        The caller's array is not modified.

    Returns
    -------
    tuple
        ``(_ap, _sp, f0_herz)``: resampled ``ap * 20 - 18``, resampled
        ``log(sp)``, and the F0 track with DIO-unvoiced frames set to zero.
    """
    _f0, t = pw.dio(x, fs, f0_floor=75.0, f0_ceil=1000.0, frame_period=8.0)
    # Copy before masking: a plain slice is a view, so zeroing unvoiced
    # frames would silently modify the caller's array.
    f0_herz = f0[:_f0.shape[0]].copy()
    f0_herz[_f0 < 1.0] = 0.0
    sp = pw.cheaptrick(x, f0_herz, t, fs)
    ap = pw.d4c(x, f0_herz, t, fs)

    # The bin count is taken from the data instead of a hard-coded 1025, so
    # other FFT sizes work too; results for 1025 bins are unchanged.
    _ap = _resample_bins(ap * 20 - 18, n_ap_channels)
    _sp = _resample_bins(np.log(sp), n_sp_channels)

    return _ap, _sp, f0_herz
Esempio n. 31
0
    def extract(wave: Wave, frame_period, f0_floor, f0_ceil, fft_length, order,
                alpha, dtype):
        """Build a validated ``AcousticFeature`` from ``wave``.

        F0 is estimated with Harvest and refined by StoneMask; the spectral
        envelope and aperiodicity use FFT size ``fft_length``; the
        mel-cepstrum uses ``order`` and ``alpha``. Float features are cast
        via ``astype_only_float(dtype)`` and ``validate()`` is called before
        returning.
        """
        fs = wave.sampling_rate
        x = wave.wave.astype(numpy.float64)

        harvest_options = dict(
            frame_period=frame_period,
            f0_floor=f0_floor,
            f0_ceil=f0_ceil,
        )
        raw_f0, t = pyworld.harvest(x, fs, **harvest_options)
        f0 = pyworld.stonemask(x, raw_f0, t, fs)

        sp = pyworld.cheaptrick(x, f0, t, fs, fft_size=fft_length)
        ap = pyworld.d4c(x, f0, t, fs, fft_size=fft_length)
        mc = pysptk.sp2mc(sp, order=order, alpha=alpha)
        coded_ap = pyworld.code_aperiodicity(ap, fs)
        voiced = numpy.logical_not(f0 == 0)  # type: numpy.ndarray

        feature = AcousticFeature(
            f0=f0[:, None],
            sp=sp,
            ap=ap,
            coded_ap=coded_ap,
            mc=mc,
            voiced=voiced[:, None],
        ).astype_only_float(dtype)
        feature.validate()
        return feature
def world_decompose(wav, fs, frame_period = 5.0):
    """Decompose a speech signal with WORLD.

    Returns ``(f0, timeaxis, sp, ap)``: the Harvest F0 (search range 71-800)
    and its time axis, the CheapTrick spectral envelope and the D4C
    aperiodicity.
    """
    signal = wav.astype(np.float64)
    pitch, frame_times = pyworld.harvest(
        signal, fs, frame_period=frame_period, f0_floor=71.0, f0_ceil=800.0)
    spectral_envelope = pyworld.cheaptrick(signal, pitch, frame_times, fs)
    aperiodicity = pyworld.d4c(signal, pitch, frame_times, fs)
    return pitch, frame_times, spectral_envelope, aperiodicity
Esempio n. 33
0
def wav2pw(x, fs=16000, fft_size=FFT_SIZE):
    '''Extract WORLD features from a waveform.

    Returns a dict with keys ``'f0'``, ``'sp'`` and ``'ap'``. The DIO upper
    F0 bound is read from the module-level ``args.f0_ceil``.
    '''
    raw_f0, frame_times = pw.dio(x, fs, f0_ceil=args.f0_ceil)  # raw pitch
    pitch = pw.stonemask(x, raw_f0, frame_times, fs)           # refinement
    envelope = pw.cheaptrick(x, pitch, frame_times, fs, fft_size=fft_size)
    aperiodicity = pw.d4c(x, pitch, frame_times, fs, fft_size=fft_size)
    return dict(f0=pitch, sp=envelope, ap=aperiodicity)
Esempio n. 34
0
    def collect_features(self, wav_path, label_path):
        """Return float32 acoustic features for one utterance.

        The WAV file is analysed with WORLD using the settings in
        ``hp_acoustic``. Mel-cepstrum, interpolated log-F0 and coded
        aperiodicity get delta features and are stacked with the
        voiced/unvoiced flag as ``(mgc, lf0, vuv, bap)``. Frames beyond the
        label length and frames the HTS labels mark as silence are removed.

        ``self.alpha`` is filled in from the file's sampling rate on first
        use and reused afterwards.
        """
        fs, x = wavfile.read(wav_path)
        x = x.astype(np.float64)
        # F0: Harvest as-is, or DIO followed by StoneMask refinement.
        if hp_acoustic.use_harvest:
            f0, timeaxis = pyworld.harvest(
                x, fs, frame_period=hp_acoustic.frame_period,
                f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
        else:
            f0, timeaxis = pyworld.dio(
                x, fs, frame_period=hp_acoustic.frame_period,
                f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
            f0 = pyworld.stonemask(x, f0, timeaxis, fs)
        spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
        aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

        bap = pyworld.code_aperiodicity(aperiodicity, fs)
        # Cache the all-pass constant derived from the sampling rate.
        if self.alpha is None:
            self.alpha = pysptk.util.mcepalpha(fs)
        mgc = pysptk.sp2mc(spectrogram, order=hp_acoustic.order, alpha=self.alpha)
        # Log-F0 on frames with non-zero F0; other frames stay at zero.
        f0 = f0[:, None]
        lf0 = f0.copy()
        nonzero_indices = np.nonzero(f0)
        lf0[nonzero_indices] = np.log(f0[nonzero_indices])
        # The voicing flag must be computed before lf0 is interpolated below.
        if hp_acoustic.use_harvest:
            # https://github.com/mmorise/World/issues/35#issuecomment-306521887
            vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
        else:
            vuv = (lf0 != 0).astype(np.float32)
        lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)

        # Parameter trajectory smoothing
        if hp_acoustic.mod_spec_smoothing:
            # Frame shift in samples, and the resulting frame rate.
            hop_length = int(fs * (hp_acoustic.frame_period * 0.001))
            modfs = fs / hop_length
            mgc = P.modspec_smoothing(
                mgc, modfs, cutoff=hp_acoustic.mod_spec_smoothing_cutoff)

        # Delta features for every stream except the voicing flag.
        mgc = P.delta_features(mgc, hp_acoustic.windows)
        lf0 = P.delta_features(lf0, hp_acoustic.windows)
        bap = P.delta_features(bap, hp_acoustic.windows)

        features = np.hstack((mgc, lf0, vuv, bap))

        # Cut silence frames by HTS alignment
        labels = hts.load(label_path)
        features = features[:labels.num_frames()]
        indices = labels.silence_frame_indices()
        features = np.delete(features, indices, axis=0)

        return features.astype(np.float32)
Esempio n. 35
0
 def collect_features(self, wav_path):
     """Return float32 mel-cepstral features with deltas for one WAV file.

     The CheapTrick envelope (F0 from DIO + StoneMask) is passed through
     ``P.trim_zeros_frames`` and converted to a mel-cepstrum. The 0-th
     coefficient is dropped, the trajectory is smoothed with a 50 Hz
     modulation-spectrum cutoff and delta features are appended.
     ``self.alpha`` is filled in from the sampling rate on first use.
     """
     fs, samples = wavfile.read(wav_path)
     samples = samples.astype(np.float64)

     pitch, frame_times = pyworld.dio(samples, fs, frame_period=hp.frame_period)
     pitch = pyworld.stonemask(samples, pitch, frame_times, fs)
     envelope = P.trim_zeros_frames(
         pyworld.cheaptrick(samples, pitch, frame_times, fs))

     if self.alpha is None:
         self.alpha = pysptk.util.mcepalpha(fs)
     # Mel-cepstrum without its 0-th coefficient
     mgc = pysptk.sp2mc(envelope, order=hp.order, alpha=self.alpha)[:, 1:]

     # 50Hz cut-off MS smoothing at the frame rate
     frame_shift = int(fs * (hp.frame_period * 0.001))
     mgc = P.modspec_smoothing(mgc, fs / frame_shift, cutoff=50)

     # Add delta
     return P.delta_features(mgc, hp.windows).astype(np.float32)
Esempio n. 36
0
    def __call__(self, data: Wave, test=None):
        """Convert ``data`` into a validated ``AcousticFeature``.

        F0 is estimated by ``pyworld.dio`` when the configured method is
        ``'dio'`` and by ``world4py``'s Harvest otherwise, then refined with
        StoneMask. The spectrogram, aperiodicity and mel-cepstrum follow, and
        all float features are cast to the configured dtype. ``test`` is
        accepted but not used.
        """
        x = data.wave.astype(numpy.float64)
        fs = data.sampling_rate

        # Pick the F0 estimator; world4py is imported only when needed.
        if self._f0_estimating_method == 'dio':
            estimate_f0 = pyworld.dio
        else:
            from world4py.np import apis
            estimate_f0 = apis.harvest

        raw_f0, t = estimate_f0(
            x,
            fs,
            frame_period=self._frame_period,
            f0_floor=self._f0_floor,
            f0_ceil=self._f0_ceil,
        )
        f0 = pyworld.stonemask(x, raw_f0, t, fs)
        spectrogram = pyworld.cheaptrick(x, f0, t, fs)
        aperiodicity = pyworld.d4c(x, f0, t, fs)

        mfcc = pysptk.sp2mc(spectrogram, order=self._order, alpha=self._alpha)
        voiced = numpy.logical_not(f0 == 0)  # type: numpy.ndarray

        out_dtype = self._dtype
        feature = AcousticFeature(
            f0=f0[:, None].astype(out_dtype),
            spectrogram=spectrogram.astype(out_dtype),
            aperiodicity=aperiodicity.astype(out_dtype),
            mfcc=mfcc.astype(out_dtype),
            voiced=voiced[:, None],
        )
        feature.validate()
        return feature
Esempio n. 37
0
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Convert a waveform with a trained model and resynthesize it.

    The waveform is analysed with WORLD (dio + stonemask, cheaptrick, d4c),
    converted to mel-cepstrum, smoothed, extended with delta features,
    normalized with ``data_mean`` / ``data_std`` and fed to ``model``. The
    predicted static features are denormalized and turned back into a
    waveform.

    Args:
        model: network to apply; switched to eval mode here. It must provide
            ``include_parameter_generation()``.
        x: input waveform samples (cast to float64 internally).
        fs: sampling rate of ``x``.
        data_mean: per-dimension mean used for (de)normalization.
        data_std: per-dimension scale used for (de)normalization.
        diffvc: if True, synthesize by filtering ``x`` with the differential
            mel-cepstrum through an MLSA filter; otherwise rebuild the
            spectral envelope and synthesize with ``pyworld.synthesize``.

    Returns:
        Tuple ``(waveform, inputs, outputs)``: the synthesized waveform, a
        copy of the first ``static_dim`` columns of the source features, and
        a copy of the denormalized predicted static features.
    """
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))
    wav = x.astype(np.float64)

    # WORLD analysis: refined F0, spectral envelope and aperiodicity.
    coarse_f0, timeaxis = pyworld.dio(wav, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(wav, coarse_f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(wav, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(wav, f0, timeaxis, fs)

    # Mel-cepstrum; the 0-th coefficient is kept aside and re-attached later.
    alpha = pysptk.util.mcepalpha(fs)
    full_mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0 = full_mc[:, 0]
    static_mc = full_mc[:, 1:]
    static_dim = static_mc.shape[-1]

    # Modulation-spectrum smoothing (cutoff=50) followed by delta features.
    modfs = fs / hop_length
    smoothed = P.modspec_smoothing(static_mc, modfs, cutoff=50)
    feats = P.delta_features(smoothed, hp.windows).astype(np.float32)

    n_frames = feats.shape[0]
    source_static = feats[:, :static_dim]
    inputs = source_static.copy()

    # Normalize and wrap for the model, adding a leading batch axis.
    net_in = Variable(torch.from_numpy(P.scale(feats, data_mean, data_std)))
    lengths = [len(net_in)]
    net_in = net_in.view(1, -1, net_in.size(-1))

    # Matrix used for MLPG.
    R = torch.from_numpy(unit_variance_mlpg_matrix(hp.windows, n_frames))

    if not model.include_parameter_generation():
        # Generic models (possibly sequence models): run MLPG afterwards.
        assert hp.has_dynamic_features is not None
        y_hat = model(net_in, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)
    else:
        # The model performs parameter generation itself; multistream
        # features cannot be used in this case.
        y_hat, y_hat_static = model(net_in, R, lengths=lengths)

    predicted = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Undo the normalization on the static dimensions only.
    predicted = P.inv_scale(
        predicted, data_mean[:static_dim], data_std[:static_dim])
    outputs = predicted.copy()

    if diffvc:
        # Differential VC: filter the input waveform with the difference
        # between predicted and source mel-cepstrum.
        diff_mc = np.hstack((c0[:, None], predicted - source_static))
        diff_mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(diff_mc.astype(np.float64), alpha=alpha)
        return engine.synthesis(wav, b), inputs, outputs

    # Vocoder path: rebuild the spectral envelope and synthesize with WORLD.
    converted_mc = np.hstack((c0[:, None], predicted))
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    converted_sp = pysptk.mc2sp(
        converted_mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
    waveform = pyworld.synthesize(
        f0, converted_sp, aperiodicity, fs, hp.frame_period)
    return waveform, inputs, outputs