Example #1
def test_world_array_order():
    wav = kwiiyatta.load_wav(dataset.CLB_WAV)

    f0, timeaxis = pyworld.dio(wav.data, wav.fs)
    f0 = pyworld.stonemask(wav.data, f0, timeaxis, wav.fs)
    spec = pyworld.cheaptrick(wav.data, f0, timeaxis, wav.fs)
    ape = pyworld.d4c(wav.data, f0, timeaxis, wav.fs)
    pyworld.synthesize(f0, spec, ape, wav.fs)

    data = wav.data[::2]

    expected_msg = 'ndarray is not C-contiguous'
    with pytest.raises(ValueError) as e:
        f0, timeaxis = pyworld.dio(data, wav.fs)
    assert expected_msg == str(e.value)

    with pytest.raises(ValueError) as e:
        f0 = pyworld.stonemask(data, f0, timeaxis, wav.fs)
    assert expected_msg == str(e.value)

    with pytest.raises(ValueError) as e:
        pyworld.cheaptrick(data, f0, timeaxis, wav.fs)
    assert expected_msg == str(e.value)

    with pytest.raises(ValueError) as e:
        pyworld.d4c(data, f0, timeaxis, wav.fs)
    assert expected_msg == str(e.value)

    with pytest.raises(ValueError) as e:
        pyworld.synthesize(f0[::2], spec[::2], ape[::2], wav.fs)
    assert expected_msg == str(e.value)
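Example #1 is a regression test: pyworld's Cython bindings require C-contiguous float64 input, and a strided slice such as wav.data[::2] is a non-contiguous view. A minimal sketch of the usual fix, assuming only numpy and pyworld:

import numpy as np
import pyworld

def as_world_input(x):
    # np.ascontiguousarray copies only when x is not already C-contiguous
    return np.ascontiguousarray(x, dtype=np.float64)

# f0, timeaxis = pyworld.dio(as_world_input(data), fs)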
Example #2
def estimate_word(word, name):
    if os.path.exists(f'rijeci_wav/{word}_{name}.wav'):
        f_bef, fs = sf.read(f'rijeci_wav/{word}_{name}.wav')

        f0, timeaxis = pw.harvest(f_bef, fs)
        f0_mask = pw.stonemask(f_bef, f0, timeaxis, fs)
        sp = pw.cheaptrick(f_bef, f0_mask, timeaxis, fs)
        ap = pw.d4c(f_bef, f0_mask, timeaxis, fs)
        y = pw.synthesize(f0_mask, sp, ap, fs, pw.default_frame_period)

        sf.write(f'rijeci_after_sint/{word}_after_sint_{name}-def.wav', y, fs)
        savefig(f'slike_rijeci_after_sint/{word}_after_sint_{name}-def.png',
                [f_bef, y], word)

        y = pw.synthesize(f0_mask, sp, ap, fs, 3.0)

        sf.write(f'rijeci_after_sint/{word}_after_sint_{name}.wav', y, fs)
        savefig(f'slike_rijeci_after_sint/{word}_after_sint_{name}.png',
                [f_bef, y], word)

        y = pw.synthesize(f0_mask, sp, ap, fs, 20.0)

        sf.write(f'rijeci_after_sint/{word}_after_sint_{name}-20.wav', y, fs)
        savefig(f'slike_rijeci_after_sint/{word}_after_sint_{name}-20.png',
                [f_bef, y], word)
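The three pw.synthesize calls above differ only in their final argument, the frame period in milliseconds. Synthesizing with a period that differs from the analysis hop time-stretches the output; a self-contained sketch (random noise stands in for real audio):

import numpy as np
import pyworld as pw

fs = 16000
x = np.random.randn(fs)                  # 1 s stand-in signal (float64)
f0, t = pw.harvest(x, fs)                # analysed at the default 5 ms hop
sp = pw.cheaptrick(x, f0, t, fs)
ap = pw.d4c(x, f0, t, fs)
y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)  # ~1 s
y_slow = pw.synthesize(f0, sp, ap, fs, 20.0)                # ~4x the duration
print(len(y) / fs, len(y_slow) / fs)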
Example #3
def pw2wav(features, feat_dim=513, fs=16000):
    ''' NOTE: Use `order='C'` to ensure Cython compatibility '''
    if isinstance(features, dict):
        # dict input: separate energy stream and log10 spectrum
        en = np.reshape(features['en'], [-1, 1])
        sp = np.power(10., features['sp'])
        sp = en * sp
        return pw.synthesize(
            features['f0'].astype(np.float64).copy(order='C'),
            sp.astype(np.float64).copy(order='C'),
            features['ap'].astype(np.float64).copy(order='C'),
            fs,
        )
    # array input: [sp | ap | f0 | en] concatenated along the feature axis
    features = features.astype(np.float64)
    sp = features[:, :feat_dim]
    ap = features[:, feat_dim:feat_dim*2]
    f0 = features[:, feat_dim*2]
    en = features[:, feat_dim*2 + 1]
    en = np.reshape(en, [-1, 1])
    sp = np.power(10., sp)
    sp = en * sp
    return pw.synthesize(
        f0.copy(order='C'),
        sp.copy(order='C'),
        ap.copy(order='C'),
        fs
    )
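A hypothetical round trip through pw2wav for the dict branch. The snippet does not show how features['sp'] and features['en'] were produced, so the per-frame energy definition below is an assumption chosen to make the maths invert cleanly:

import numpy as np
import pyworld as pw

fs = 16000
x = np.random.randn(fs)                         # stand-in signal
f0, sp, ap = pw.wav2world(x, fs)
en = np.sum(sp, axis=1, keepdims=True)          # assumed energy stream
features = {'f0': f0, 'sp': np.log10(sp / en), 'ap': ap, 'en': en}
wav = pw2wav(features, feat_dim=sp.shape[1], fs=fs)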
Example #5
def word_synthesis(word, file_name):
    if os.path.exists(f'{PROCESSED_WORDS_DIRECTORY}/{file_name}_{word}.wav'):
        data, samplerate = sf.read(
            f'{PROCESSED_WORDS_DIRECTORY}/{file_name}_{word}.wav')

        f0, timeaxis = pw.harvest(data, samplerate)
        f0_mask = pw.stonemask(data, f0, timeaxis, samplerate)
        spectral_envelop = pw.cheaptrick(data, f0_mask, timeaxis, samplerate)
        aperiodicity = pw.d4c(data, f0_mask, timeaxis, samplerate)

        synthesized_word = pw.synthesize(f0_mask, spectral_envelop,
                                         aperiodicity, samplerate,
                                         pw.default_frame_period)

        sf.write(f'{SYNTHESIS_WORDS_DIRECTORY}/{file_name}_{word}_default.wav',
                 synthesized_word, samplerate)
        savefig(
            f'{PLOTS_SYNTHESIS_WORDS_DIRECTORY}/{file_name}_{word}_default.png',
            [data, synthesized_word], word)

        synthesized_word = pw.synthesize(f0_mask, spectral_envelop,
                                         aperiodicity, samplerate, 3.0)

        sf.write(f'{SYNTHESIS_WORDS_DIRECTORY}/{file_name}_{word}_3.wav',
                 synthesized_word, samplerate)
        savefig(f'{PLOTS_SYNTHESIS_WORDS_DIRECTORY}/{file_name}_{word}_3.png',
                [data, synthesized_word], word)

        synthesized_word = pw.synthesize(f0_mask, spectral_envelop,
                                         aperiodicity, samplerate, 20.0)

        sf.write(f'{SYNTHESIS_WORDS_DIRECTORY}/{file_name}_{word}_20.wav',
                 synthesized_word, samplerate)
        savefig(f'{PLOTS_SYNTHESIS_WORDS_DIRECTORY}/{file_name}_{word}_20.png',
                [data, synthesized_word], word)
Example #6
def synthesis(resyn=False):
    results_dir = config.test_results_dir
    predicted_mceps = load_pkl(results_dir + '/predicted_mceps.pkl')
    predicted_mceps = data_merge(predicted_mceps)
    uttids = load_pkl(results_dir + '/test_uttids.pkl')
    lengths = load_pkl(results_dir + '/test_lengths.pkl')
    scp_dict = scp2dict()
    data_size = 10  # len(lengths)
    # get f0 range
    if 'bdl' in uttids[0]:
        f0_floor = 30.0
        f0_ceil = 300.0
    elif 'rms' in uttids[0]:
        f0_floor = 30.0
        f0_ceil = 300.0
    elif 'slt' in uttids[0]:
        f0_floor = 70.0
        f0_ceil = 500.0
    elif 'clb' in uttids[0]:
        f0_floor = 70.0
        f0_ceil = 500.0
    else:
        print('Unknown speaker! Check if something is wrong!')
        f0_floor = 40.0
        f0_ceil = 600.0
    src_spk = uttids[0].split('_')[0]
    tgt_spk = config.tgt_data_dir.split('/')[-1]
    for i in range(data_size):
        uttid = uttids[i]
        utt_len = lengths[i]
        sp_predict = mceps2sp(predicted_mceps[i][:utt_len, :])
        wav_arr, sr = librosa.load(scp_dict[uttid], sr=None, dtype=np.float64)
        _, t = pw.harvest(wav_arr, sr, f0_floor, f0_ceil)
        f0_raw = read_f0_via_id(uttid, utt_len)
        ap = pw.d4c(wav_arr, f0_raw, t, sr)
        if src_spk != tgt_spk:
            f0_t = f0_transform(f0_raw)
        else:
            f0_t = f0_raw
        y_predict = pw.synthesize(f0_t, sp_predict, ap[:utt_len, :], sr,
                                  pw.default_frame_period)
        y_predict = y_predict.astype(np.float32)
        librosa.output.write_wav(results_dir + '/' + uttid + '_predict.wav',
                                 y_predict, sr)
        if resyn:
            sp = pw.cheaptrick(wav_arr, f0_raw, t, sr)
            y_resyn = pw.synthesize(f0_raw, sp, ap, sr,
                                    pw.default_frame_period)
            y_resyn = y_resyn.astype(np.float32)
            librosa.output.write_wav(results_dir + '/' + uttid + '_resyn.wav',
                                     y_resyn, sr)
            print('Resynthesized %s groundtruth wav files!' % (i + 1))
        print('Synthesized %s wav files!' % (i + 1))
Example #7
    def main(args):
        if os.path.isdir('test'):
            rmtree('test')
        os.mkdir('test')

        x, fs = sf.read("./{}.wav".format(edited_files["Thinking_Out_Loud"]))
        # x, fs = librosa.load('utterance/vaiueo2d.wav', dtype=np.float64)

        # 1. A convenient way
        f0, sp, ap = pw.wav2world(x, fs)  # use default options
        y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

        # 2. Step by step
        # 2-1 Without F0 refinement
        _f0, t = pw.dio(x,
                        fs,
                        f0_floor=50.0,
                        f0_ceil=600.0,
                        channels_in_octave=2,
                        frame_period=args.frame_period,
                        speed=args.speed)
        _sp = pw.cheaptrick(x, _f0, t, fs)
        _ap = pw.d4c(x, _f0, t, fs)
        _y = pw.synthesize(_f0, _sp, _ap, fs, args.frame_period)
        # librosa.output.write_wav('test/y_without_f0_refinement.wav', _y, fs)
        sf.write('test/y_without_f0_refinement.wav', _y, fs)

        # 2-2 DIO with F0 refinement (using Stonemask)
        f0 = pw.stonemask(x, _f0, t, fs)
        sp = pw.cheaptrick(x, f0, t, fs)
        ap = pw.d4c(x, f0, t, fs)
        y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
        # librosa.output.write_wav('test/y_with_f0_refinement.wav', y, fs)
        sf.write('test/y_with_f0_refinement.wav', y, fs)

        # 2-3 Harvest with F0 refinement (using Stonemask)
        _f0_h, t_h = pw.harvest(x, fs)
        f0_h = pw.stonemask(x, _f0_h, t_h, fs)
        sp_h = pw.cheaptrick(x, f0_h, t_h, fs)
        ap_h = pw.d4c(x, f0_h, t_h, fs)
        y_h = pw.synthesize(f0_h, sp_h, ap_h, fs, pw.default_frame_period)
        # librosa.output.write_wav('test/y_harvest_with_f0_refinement.wav', y_h, fs)
        sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs)

        # Comparison
        savefig('test/wavform.png', [x, _y, y])
        savefig('test/sp.png', [_sp, sp])
        savefig('test/ap.png', [_ap, ap], log=False)
        savefig('test/f0.png', [_f0, f0])

        print('Please check "test" directory for output files')
Example #8
def main(args):
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    x, fs = sf.read('utterance/vaiueo2d.wav')
    # x, fs = librosa.load('utterance/vaiueo2d.wav', dtype=np.float64)

    # 1. A convenient way
    f0, sp, ap = pw.wav2world(x, fs)    # use default options
    y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

    # 2. Step by step
    # 2-1 Without F0 refinement
    _f0, t = pw.dio(x, fs, f0_floor=50.0, f0_ceil=600.0,
                    channels_in_octave=2,
                    frame_period=args.frame_period,
                    speed=args.speed)
    _sp = pw.cheaptrick(x, _f0, t, fs)
    _ap = pw.d4c(x, _f0, t, fs)
    _y = pw.synthesize(_f0, _sp, _ap, fs, args.frame_period)
    # librosa.output.write_wav('test/y_without_f0_refinement.wav', _y, fs)
    sf.write('test/y_without_f0_refinement.wav', _y, fs)

    # 2-2 DIO with F0 refinement (using Stonemask)
    f0 = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
    # librosa.output.write_wav('test/y_with_f0_refinement.wav', y, fs)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (using Stonemask)
    _f0_h, t_h = pw.harvest(x, fs)
    f0_h = pw.stonemask(x, _f0_h, t_h, fs)
    sp_h = pw.cheaptrick(x, f0_h, t_h, fs)
    ap_h = pw.d4c(x, f0_h, t_h, fs)
    y_h = pw.synthesize(f0_h, sp_h, ap_h, fs, pw.default_frame_period)
    # librosa.output.write_wav('test/y_harvest_with_f0_refinement.wav', y_h, fs)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs)

    # Comparison
    savefig('test/wavform.png', [x, _y, y])
    savefig('test/sp.png', [_sp, sp])
    savefig('test/ap.png', [_ap, ap], log=False)
    savefig('test/f0.png', [_f0, f0])

    print('Please check "test" directory for output files')
Example #9
def anonymization(fs, waveNDArray, f0Value=0,
                  sp_strechRatio=np.random.uniform(0.6, 2, size=1),
                  gaussian_s=3):
    """
    Create WAV audio data with the speaker identity removed.
    Used to build input audio from label audio.
    :param fs:
    :param waveNDArray:
    :param f0Value:
    :param sp_strechRatio:
    :return:
    """
    waveNDArray = waveNDArray.astype(np.float64)
    _f0, t = pw.dio(waveNDArray, fs)  # estimate fundamental frequency
    f0 = pw.stonemask(waveNDArray, _f0, t, fs)  # refine F0
    sp = pw.cheaptrick(waveNDArray, f0, t, fs)  # extract spectral envelope
    ap = pw.d4c(waveNDArray, f0, t, fs)  # extract aperiodicity
    f0_fixed0 = np.ones(f0.shape) * f0Value
    f0_median = np.median(f0)
    sp_median = np.median(sp)
    ap_median = np.median(ap)
    # stretch SP along the frequency axis
    sp2 = np.ones_like(sp) * np.min(sp)
    for f in range(sp2.shape[1]):
        if int(f / sp_strechRatio) >= sp.shape[1]:
            break
        sp2[:, f] = sp[:, int(f / sp_strechRatio)]
    # add Gaussian noise to SP/AP
    sp_noised = sp2 + np.random.normal(sp_median, sp_median / 10, sp2.shape)
    ap_noised = ap + np.random.normal(ap_median, ap_median / 10, ap.shape)
    # Gaussian filter
    sp_gaussian = scipy.ndimage.gaussian_filter(sp_noised, gaussian_s)
    ap_gaussian = scipy.ndimage.gaussian_filter(ap_noised, gaussian_s)
    # resynthesize from the anonymised envelope and aperiodicity
    synthesized = pw.synthesize(f0_fixed0, sp_gaussian, ap_gaussian, fs)
    return synthesized
Example #10
def worldSpeechSynthesis(f0: np.ndarray,
                         decoded_sp: np.ndarray,
                         ap: np.ndarray,
                         fs: int = SAMPLE_RATE,
                         frame_period: float = 5.) -> np.ndarray:
    '''
    Synthesize a waveform with WORLD.

    Parameters
    ----------
    f0: np.ndarray
        frame-wise fundamental frequency [Hz]
    decoded_sp: np.ndarray
        spectral envelope
    ap: np.ndarray
        aperiodicity
    fs: int, default SAMPLE_RATE
        sampling frequency
    frame_period: float, default 5.
        frame interval [ms]

    Returns
    -------
    wave: np.ndarray
        synthesized waveform
    '''
    wave = pyworld.synthesize(f0, decoded_sp, ap, fs, frame_period)
    wave = wave.astype(np.float32)
    return wave
Example #11
    def synthesis_proc(self):
        global audio_file_name, x, fs, _f0, _sp, _ap, _y, audio_out
        print(self.fund.get())
        print(self.form1.get())
        print(self.form2.get())
        print(self.form3.get())
        perc_inc = self.fund.get()
        new_f0 = _f0 + ((perc_inc / 100) * _f0)
        fm, ft = cam_formants(x, fs)
        nos_of_peaks = self.form4.get()
        shiftconst_test = [self.form1.get(), self.form2.get(), self.form3.get()]
        shifted_sp = shift_formants(_sp, ft, fm, fs, nos_of_peaks,
                                    shiftconst_test)
        new_y = pw.synthesize(new_f0[0:len(_f0) - 1], shifted_sp,
                              _ap[0:len(_f0) - 1], fs)
        audio_out = ('testaudio/audio-out' + '_' + str(self.fund.get()) +
                     '_' + str(self.form1.get()) + '_' + str(self.form2.get()) +
                     '_' + str(self.form3.get()) + '.wav')
        wav.write(audio_out, fs, new_y)
        wav.write('testaudio/origfile.wav', fs, x)
        print('done')
        plt.figure()
        plt.subplot(4, 1, 1)
        plt.title('Waveform')
        plt.plot(x)
        plt.plot(new_y)
        plt.subplot(4, 1, 2)
        plt.plot(_f0)
        plt.plot(new_f0)
        plt.subplot(4, 1, 3)
        plt.imshow(shifted_sp.transpose(), origin='lower',
                   interpolation='none', aspect='auto',
                   extent=(0, _sp.shape[0], 0, _sp.shape[1]))
        plt.subplot(4, 1, 4)
        plt.imshow(_ap.transpose(), origin='lower', interpolation='none',
                   aspect='auto', extent=(0, _ap.shape[0], 0, _ap.shape[1]))
        plt.savefig(audio_out + '.png')
        print('done')
Example #12
    def generate(self, parm_var, do_postfilter=True):
        config = self.analysis_config

        for path in self.paths:
            file_id = splitext(basename(path))[0]
            print('Synthesizing %s ... ' % (file_id), end='')
            mgc, lf0, vuv, bap = self._generate_parameters(path, parm_var)

            if do_postfilter:
                mgc = merlin_post_filter(mgc, config.alpha)

            sp = pysptk.mc2sp(mgc,
                              fftlen=config.fft_length,
                              alpha=config.alpha)
            ap = pyworld.decode_aperiodicity(bap.astype(np.float64),
                                             config.sampling_rate,
                                             config.fft_length)
            f0 = self._lf0_to_f0(lf0, vuv)
            generated = pyworld.synthesize(f0.flatten().astype(np.float64),
                                           sp.astype(np.float64),
                                           ap.astype(np.float64),
                                           config.sampling_rate,
                                           config.frame_period)
            with open(join(self.out_dir, file_id + '.wav'), 'wb') as f:
                f.write(Audio(generated, rate=config.sampling_rate).data)
            print('done!')
Example #13
def analysis_resynthesis(signal):

    # extract acoustic features
    f0, t = pw.dio(signal, sample_rate)  # estimate fundamental frequency
    f0 = pw.stonemask(signal, f0, t, sample_rate)  # refinement
    sp = pw.cheaptrick(signal, f0, t, sample_rate)  # extract spectral envelope
    ap = pw.d4c(signal, f0, t, sample_rate)  # extract aperiodicity

    # pitch shift
    modified_f0 = f0_rate * f0

    # formant shift (uniform stretching of the frequency axis)
    modified_sp = np.zeros_like(sp)
    sp_range = int(modified_sp.shape[1] * sp_rate)
    for f in range(modified_sp.shape[1]):
        if (f < sp_range):
            if sp_rate >= 1.0:
                modified_sp[:, f] = sp[:, int(f / sp_rate)]
            else:
                modified_sp[:, f] = sp[:, int(sp_rate * f)]
        else:
            modified_sp[:, f] = sp[:, f]

    # resynthesis
    synth = pw.synthesize(modified_f0, modified_sp, ap, sample_rate)

    return synth
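The per-bin loop above recurs almost verbatim in Examples #16 and #20. For reference, a vectorised sketch that computes the same mapping with one fancy-indexing operation (same truncation semantics as int()):

import numpy as np

def stretch_sp(sp, sp_rate):
    n = sp.shape[1]
    f = np.arange(n)
    sp_range = int(n * sp_rate)
    if sp_rate >= 1.0:
        src = (f / sp_rate).astype(int)   # compress toward low frequencies
    else:
        src = (sp_rate * f).astype(int)   # expand toward high frequencies
    src = np.where(f < sp_range, src, f)  # copy the tail unchanged
    return sp[:, src]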
Example #14
def synthesis():
    # pdb.set_trace()
    lf0_file = "p225_001.lf0"
    bap_file_name="p225_001.bap"
    mgc_file_name="p225_001.mgc"
    fl=4096
    sr=48000
    # pdb.set_trace()
    lf0 = read_binfile(lf0_file, dim=1, dtype=np.float32)
    zeros_index = np.where(lf0 == -1E+10)
    nonzeros_index = np.where(lf0 != -1E+10)
    f0 = lf0.copy()
    f0[zeros_index] = 0
    f0[nonzeros_index] = np.exp(lf0[nonzeros_index])
    f0 = f0.astype(np.float64)
    bap_dim = 5
    bap = read_binfile(bap_file_name, dim=bap_dim, dtype=np.float32)
    ap = pyworld.decode_aperiodicity(bap.astype(np.float64).reshape(-1, bap_dim), sr, fl)
    mc = read_binfile(mgc_file_name, dim=60, dtype=np.float32)
    alpha = pysptk.util.mcepalpha(sr)
    sp = pysptk.mc2sp(mc.astype(np.float64), fftlen=fl, alpha=alpha)
    wav = pyworld.synthesize(f0, sp, ap, sr, 5)
    x2 = wav * 32768
    x2 = x2.astype(np.int16)
    scipy.io.wavfile.write("resynthesis.wav", sr, x2)
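The -1E+10 above is the conventional sentinel for unvoiced frames in .lf0 streams; voiced frames store natural-log F0. A small helper capturing the same conversion, assuming that convention:

import numpy as np

def lf0_to_f0(lf0, unvoiced=-1e10):
    # voiced frames carry log F0; the sentinel marks unvoiced frames as 0 Hz
    f0 = np.zeros_like(lf0, dtype=np.float64)
    voiced = lf0 != unvoiced
    f0[voiced] = np.exp(lf0[voiced])
    return f0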
Example #15
    def analysis_for_valid_batch(self,
                                 features,
                                 output_features,
                                 names,
                                 out_dir,
                                 sample_rate=16000,
                                 **kwargs):
        super(F0Model,
              self).analysis_for_valid_batch(features, output_features, names,
                                             out_dir, **kwargs)

        # Synthesise outputs using WORLD.
        synth_dir = os.path.join(out_dir, 'synth')
        os.makedirs(synth_dir, exist_ok=True)

        lf0 = output_features['lf0'].cpu().detach().numpy()

        vuv = features['vuv'].cpu().detach().numpy()
        sp = features['sp'].cpu().detach().numpy()
        ap = features['ap'].cpu().detach().numpy()

        n_frames = features['n_frames'].cpu().detach().numpy()
        for i, (n_frame, name) in enumerate(zip(n_frames, names)):

            f0_i = np.exp(lf0[i, :n_frame, 0])
            f0_i = savgol_filter(f0_i, 7, 1)
            f0_i = f0_i * vuv[i, :n_frame, 0]

            f0_i = f0_i.astype(np.float64)
            sp_i = sp[i, :n_frame].astype(np.float64)
            ap_i = ap[i, :n_frame].astype(np.float64)

            wav_path = os.path.join(synth_dir, '{}.wav'.format(name))
            wav = pyworld.synthesize(f0_i, sp_i, ap_i, sample_rate)
            tdt.file_io.save_wav(wav_path, wav, sample_rate=sample_rate)
Example #16
    def Conversion(self):
        print(f">Conversion")
        print(f"f0_rate:{self.f0_rate}, sp_rate:{self.sp_rate}")
        self.statusBar().showMessage(f'Start conversion')
        wavdata = self.streamer.get_all()
        if (len(self.wav) <= 0):
            reply = QMessageBox.information(
                self, "Voice Conversion",
                "There is no pre-conversion audio data.\n"
                "Press the Start button to record, or load audio data via\n"
                "File > Load pre-conversion audio data."
            )
            return
        self.saveconvAction.setEnabled(True)
        wavdata = np.frombuffer(wavdata, dtype='int16').astype(np.float64)
        f0, t = pw.harvest(wavdata, self.RATE)  # extract fundamental frequency
        sp = pw.cheaptrick(wavdata, f0, t, self.RATE)  # extract spectral envelope
        ap = pw.d4c(wavdata, f0, t, self.RATE)  # extract aperiodicity

        # pitch shift
        modified_f0 = self.f0_rate * f0
        # formant shift (uniform stretching of the frequency axis)
        modified_sp = np.zeros_like(sp)
        sp_range = int(modified_sp.shape[1] * self.sp_rate)
        for f in range(modified_sp.shape[1]):
            if (f < sp_range):
                if self.sp_rate >= 1.0:
                    modified_sp[:, f] = sp[:, int(f / self.sp_rate)]
                else:
                    modified_sp[:, f] = sp[:, int(self.sp_rate * f)]
            else:
                modified_sp[:, f] = sp[:, f]

        self.synth = pw.synthesize(modified_f0, modified_sp, ap, self.RATE)
        self.curve2.setData(self.synth / 32767.0)
        print(len(self.synth))
        self.statusBar().showMessage(f'Finish conversion')
Example #17
def generate_test(filename):

    [sp_min, sp_max, ap_min,
     ap_max] = np.load('data/timbre_model/min_max_record.npy')
    condi = get_condition(filename)
    # cat_input = get_ap_cat()
    # fist_input = get_first_input()

    sp, raw_sp = generate_timbre(0, sp_max, sp_min, condi, None)

    plt.imshow(np.log(np.transpose(sp)),
               aspect='auto',
               origin='lower',
               interpolation='none')
    plt.show()

    sp1 = load_timbre('data/timbre_model/test/sp/' + filename + '_sp.npy', 0,
                      sp_max, sp_min)

    plt.imshow(np.log(np.transpose(sp1)),
               aspect='auto',
               origin='lower',
               interpolation='none')
    plt.show()
    ####################################################################################################
    ap, raw_ap = generate_timbre(1, ap_max, ap_min, condi, raw_sp)

    plt.imshow(np.log(np.transpose(ap)),
               aspect='auto',
               origin='lower',
               interpolation='none')
    plt.show()

    ap1 = load_timbre('data/timbre_model/test/ap/' + filename + '_ap.npy', 1,
                      ap_max, ap_min)

    plt.imshow(np.log(np.transpose(ap1)),
               aspect='auto',
               origin='lower',
               interpolation='none')
    plt.show()

    #########################################################################################################
    # vuv_cat = get_vuv_cat()
    # gen_cat = torch.cat((raw_ap, raw_sp), 0)

    # vuv = generate_vuv(condi, vuv_cat)
    # plt.plot(vuv)
    # plt.show()
    #
    # vuv1 = np.load('data/timbre_model/test/vuv/nitech_jp_song070_f001_029_vuv.npy')
    # plt.plot(vuv1)
    # plt.show()

    path = 'data/raw/' + filename + '.raw'
    _f0, _sp, code_sp, _ap, code_ap = process_wav(path)
    # synthesize the original speech
    synthesized = pw.synthesize(_f0, sp, ap, 32000, pw.default_frame_period)
    # 1. write out the original speech
    sf.write('./data/gen_wav/' + filename + '.wav', synthesized, 32000)
Example #18
def save_world_wav(feats, model_name, filename):

    # feats = [f0, sp, ap, sp_coded, labels]

    if isinstance(feats[3], torch.Tensor):
        feats[3] = feats[3].cpu().numpy()
    if hp.normalise_mels:
        feats[3] = _unnormalise_coded_sp(feats[3])

    path = os.path.join(hp.sample_set_dir, model_name)

    if not os.path.exists(path):
        os.makedirs(path)

    path = os.path.join(path, filename)

    # print("Made path.")
    feats[3] = np.ascontiguousarray(feats[3], dtype=np.float64)
    # print("Made contiguous.")
    # print(feats[3].shape)
    decoded_sp = decode_spectral_envelope(feats[3], hp.sr, fft_size=hp.n_fft)
    # print("Decoded.")
    # f0_converted = norm.pitch_conversion(f0, speaker, target)
    wav = synthesize(feats[0], decoded_sp, feats[1], hp.sr)
    # Audio(wav,rate=hp.sr)
    # librosa.display.waveplot(y=wav, sr=hp.sr)
    # print("Sythesized wav.")
    save_wav(wav, path)
Example #19
    def __getitem__(self, key):
        key, pitch_aug_factor, time_aug_factor = key
        wav = self.data[key]
        if self.normalize:
            # soundfile.read normalizes data to [-1,1] if dtype is not given
            array, rate = soundfile.read(wav, always_2d=self.always_2d)
        else:
            array, rate = soundfile.read(wav,
                                         dtype=self.dtype,
                                         always_2d=self.always_2d)

        if pitch_aug_factor != 0:
            # Pitch augmentation
            ratio = pow(2, 1 / 12)
            import pyworld as pw

            f0_pw, sp, ap = pw.wav2world(array, rate)  # use default options
            array = pw.synthesize(
                f0_pw * (ratio**pitch_aug_factor),
                sp,
                ap,
                rate,
                pw.default_frame_period,
            )

        if time_aug_factor != 1:
            # Time augmentation
            array = tsm.wsola(array, time_aug_factor)

        return rate, array
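The ratio ** pitch_aug_factor term above is the equal-temperament pitch shift: each semitone multiplies F0 by 2**(1/12). As a standalone helper:

def shift_semitones(f0, n):
    # shift F0 by n semitones; n may be negative or fractional
    return f0 * 2.0 ** (n / 12.0)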
Example #20
def convert(signal):
    f0_rate = 2.4
    sp_rate = 0.78
    sample_rate = 16000

    f0, t = pyworld.dio(signal, sample_rate)
    f0 = pyworld.stonemask(signal, f0, t, sample_rate)
    sp = pyworld.cheaptrick(signal, f0, t, sample_rate)
    ap = pyworld.d4c(signal, f0, t, sample_rate)

    modified_f0 = f0_rate * f0

    # formant shift (uniform stretching of the frequency axis)
    modified_sp = np.zeros_like(sp)
    sp_range = int(modified_sp.shape[1] * sp_rate)
    for f in range(modified_sp.shape[1]):
        if (f < sp_range):
            if sp_rate >= 1.0:
                modified_sp[:, f] = sp[:, int(f / sp_rate)]
            else:
                modified_sp[:, f] = sp[:, int(sp_rate * f)]
        else:
            modified_sp[:, f] = sp[:, f]

    y = pyworld.synthesize(modified_f0, modified_sp, ap, sample_rate)

    return y
Example #21
def world2wav(feature, frame_period):
    hparams = hp
    mgc_idx = 0
    lf0_idx = mgc_idx + hparams.num_mgc
    vuv_idx = lf0_idx + hparams.num_lf0
    bap_idx = vuv_idx + hparams.num_vuv

    mgc = feature[:, mgc_idx:mgc_idx + hparams.num_mgc]
    lf0 = feature[:, lf0_idx:lf0_idx + hparams.num_lf0]
    vuv = feature[:, vuv_idx:vuv_idx + hparams.num_vuv]
    bap = feature[:, bap_idx:bap_idx + hparams.num_bap]

    fs = hparams.sample_rate
    alpha = pysptk.util.mcepalpha(fs)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    indexes = (vuv < 0.5).flatten()
    bap[indexes] = np.zeros(hparams.num_bap)

    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), fs,
                                               fftlen)
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    return pyworld.synthesize(f0.flatten().astype(np.float64),
                              spectrogram.astype(np.float64),
                              aperiodicity.astype(np.float64), fs,
                              frame_period)
Example #22
def world_speech_synthesis(f0, decoded_sp, ap, fs, frame_period):
    # decoded_sp = decoded_sp.astype(np.float64)
    wav = pyworld.synthesize(f0, decoded_sp, ap, fs, frame_period)
    # librosa cannot write the wav unless it is converted to float32
    wav = wav.astype(np.float32)

    return wav
Example #23
def gen_waveform(y_predicted, Y_mean, Y_std, post_filter=False, coef=1.4,
                 fs=16000, mge_training=True):
    alpha = pysptk.util.mcepalpha(fs)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    frame_period = hp_acoustic.frame_period

    # Generate parameters and split streams
    mgc, lf0, vuv, bap = gen_parameters(y_predicted, Y_mean, Y_std, mge_training)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha, coef=coef)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), fs, fftlen)
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            fs, frame_period)
    # Convert range to int16
    generated_waveform = generated_waveform / \
        np.max(np.abs(generated_waveform)) * 32767

    # return features as well to compare natural/generated later
    return generated_waveform, mgc, lf0, vuv, bap
Example #24
    def formant(self, val, f0_v):
        '''
            Change formant.
            val : formant rate
            f0_v: f0 rate
        '''
        f_rate = self.audio.frame_rate
        np_arr = np.array(self.audio.get_array_of_samples(),
                          dtype=np.float64)  # pydub --> np.array(float64)
        # print(np_arr, f_rate)
        _f0_val, _time = pyworld.dio(np_arr, f_rate)  # fundamental frequency
        spct = pyworld.cheaptrick(np_arr, _f0_val, _time, f_rate)  # spectral envelope
        aper = pyworld.d4c(np_arr, _f0_val, _time, f_rate)  # aperiodicity
        spct_b = np.zeros_like(spct)
        for i in range(spct_b.shape[1]):
            # clamp the source bin so val < 1 cannot index past the last bin
            spct_b[:, i] = spct[:, min(int(i / val), spct.shape[1] - 1)]
        ef_audio = pyworld.synthesize(_f0_val * f0_v, spct_b, aper, f_rate)
        ef_audio = ef_audio.astype(np.int16).tobytes()

        # print(ef_audio)
        # print(type(ef_audio))
        new_audio = AudioSegment(
            ef_audio,
            sample_width=self.audio.sample_width,
            frame_rate=f_rate,
            channels=self.audio.channels,
        )
        self.audio = new_audio
        return self
Example #25
def generate_changed_voice(model, input_path):

    fs, x = wavfile.read(input_path)
    x = x.astype(np.float64)
    if len(x.shape) > 1:
        x = x.mean(axis=1)

    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]

    mc = P.modspec_smoothing(mc, FS / HOP_LENGHT, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    gen_data = model.predict(mc)

    gen_data = np.hstack([c0.reshape((-1, 1)), gen_data])

    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    spectrogram = pysptk.mc2sp(
        gen_data.astype(np.float64), alpha=alpha, fftlen=fftlen)
    waveform = pyworld.synthesize(
        f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform
Example #26
def gen_waveform(y_predicted,
                 Y_mean,
                 Y_std,
                 post_filter=False,
                 coef=1.4,
                 fs=16000,
                 mge_training=True):
    alpha = pysptk.util.mcepalpha(fs)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    frame_period = hp_acoustic.frame_period

    # Generate parameters and split streams
    mgc, lf0, vuv, bap = gen_parameters(y_predicted, Y_mean, Y_std,
                                        mge_training)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha, coef=coef)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), fs,
                                               fftlen)
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            fs, frame_period)
    # Convert range to int16
    generated_waveform = generated_waveform / \
        np.max(np.abs(generated_waveform)) * 32767

    # return features as well to compare natural/generated later
    return generated_waveform, mgc, lf0, vuv, bap
Example #27
def synthesis(ori_path, aim_sp, aim_spkid):
    print('synthesizing ...')
    wav, _ = librosa.load(ori_path, sr=hp.SR, mono=True, dtype=np.float64)
    f0, timeaxis = pw.harvest(wav, hp.SR)
    sp_per_timeaxis_before = pw.cheaptrick(wav,
                                           f0,
                                           timeaxis,
                                           hp.SR,
                                           fft_size=hp.N_FFT)  # fft_size 1024 -> 513 bins

    # ori_decoded_sp = pw.decode_spectral_envelope(ori_sp, hp.SR, fft_size=hp.N_FFT)

    # print('f0.shape = ')
    # print(f0)

    ap = pw.d4c(wav, f0, timeaxis, hp.SR, fft_size=hp.N_FFT)
    aim_decoded_sp = pw.decode_spectral_envelope(
        aim_sp, hp.SR, fft_size=hp.N_FFT)  # sp after conversion/decoding
    print('513-dim aim_decoded_sp after decoding = ')
    print(aim_decoded_sp.shape)
    print(aim_decoded_sp[399][:])

    synwav = pw.synthesize(f0, aim_decoded_sp, ap, hp.SR)
    print(f'synthesize done. path : ./convert_to_{aim_spkid}_test1.wav')
    librosa.output.write_wav(f'./convert_to_{aim_spkid}_test1.wav',
                             synwav,
                             sr=hp.SR)
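Example #27 assumes aim_sp is a coded spectral envelope that pw.decode_spectral_envelope expands back to fft_size // 2 + 1 bins. A minimal round-trip sketch with stand-in data (the code/decode helpers exist in recent pyworld releases):

import numpy as np
import pyworld as pw

fs, fft_size = 16000, 1024
sp = np.abs(np.random.randn(100, fft_size // 2 + 1)) + 1e-6  # stand-in envelope
coded = pw.code_spectral_envelope(sp, fs, 36)                # (100, 36)
sp_hat = pw.decode_spectral_envelope(coded, fs, fft_size)    # (100, 513)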
Example #28
def feats_to_audio_test(in_feats,filename, fs=config.fs,  mode=config.comp_mode):
    harm = in_feats[:,:60]
    ap = in_feats[:,60:-2]
    f0 = in_feats[:,-2:]
    # convert MIDI note number to Hz (note 69 = A4 = 440 Hz)
    f0[:,0] = f0[:,0]-69
    f0[:,0] = f0[:,0]/12
    f0[:,0] = 2**f0[:,0]
    f0[:,0] = f0[:,0]*440


    f0 = f0[:,0]*(1-f0[:,1])  # second column appears to flag unvoiced frames


    if mode == 'mfsc':
        harm = mfsc_to_mgc(harm)
        ap = mfsc_to_mgc(ap)


    harm = mgc_to_sp(harm, 1025, 0.45)
    ap = mgc_to_sp(ap, 1025, 0.45)

    harm = 10**(harm/10)
    ap = 10**(ap/20)

    y=pw.synthesize(f0.astype('double'),harm.astype('double'),ap.astype('double'),fs,config.hoptime)
    sf.write('./medley_resynth_test/'+filename+'.wav',y,fs)
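The four in-place operations on f0[:,0] above implement the standard MIDI-note-to-Hertz mapping; the same thing in one expression:

import numpy as np

def midi_to_hertz(note):
    # MIDI note 69 = A4 = 440 Hz, 12 semitones per octave
    return 440.0 * 2.0 ** ((np.asarray(note) - 69.0) / 12.0)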
Example #29
def looptoweb(f1perc, f2perc):
    #table is the name of the output html
    table_file = open('Shiftmethod1.html', 'w')
    table_file.write('<!DOCTYPE html><html>' + styletext +
                     '<body><h1>Audio Files ' + path +
                     '</h1><table><tr><th>Form1/Form2</th>')
    for k in np.arange(0, len(f2perc)):
        table_file.write('<th>' + str(f2perc[k]) + '%' + '</th>')
    table_file.write('</tr>')
    for i in np.arange(0, len(f1perc)):
        table_file.write('<tr>')
        table_file.write('<td>' + str(f1perc[i]) + '%' + '</td>')
        for j in np.arange(0, len(f2perc)):
            audio_out = 'testaudio/' + 'testsmile' + '_' + str(
                f1perc[i]) + '_' + str(f2perc[j]) + '.wav'
            shifted_sp, maximas = shift_formants(sp, ft, fm, fs, 2,
                                                 [f1perc[i], f2perc[j]])
            #print('-------------------------'+[f1perc[i],f2perc[j]]+'----------------------------------------')
            new_y = pw.synthesize(f0[0:len(f0) - 1], shifted_sp,
                                  ap[0:len(f0) - 1], fs)
            wav.write(audio_out, fs, new_y)
            table_file.write('<td><audio controls>')
            #table_file.write(audio_out)
            table_file.write('<source src= ' + '"' + audio_out + '"' +
                             ' type="audio/mpeg">')
            table_file.write('</audio></td>')
        table_file.write('</tr>')
    table_file.write('</table></body></html>')
    table_file.close()
Example #30
def feats_to_audio(in_feats,filename, fs=config.fs,  mode=config.comp_mode):
    harm = in_feats[:,:60]
    ap = in_feats[:,60:-2]
    f0 = in_feats[:,-2:]
    # f0[:,0] = f0[:,0]-69
    # f0[:,0] = f0[:,0]/12
    # f0[:,0] = 2**f0[:,0]
    # f0[:,0] = f0[:,0]*440
    f0[:,0] = f0_to_hertz(f0[:,0])

    f0 = f0[:,0]*(1-f0[:,1])


    if mode == 'mfsc':
        harm = mfsc_to_mgc(harm)
        ap = mfsc_to_mgc(ap)


    harm = mgc_to_sp(harm, 1025, 0.45)
    ap = mgc_to_sp(ap, 1025, 0.45)

    harm = 10**(harm/10)
    ap = 10**(ap/20)


    y=pw.synthesize(f0.astype('double'),harm.astype('double'),ap.astype('double'),fs,config.hoptime*1000)
    sf.write(config.val_dir+filename+'.wav',y,int(fs))
Example #31
def feats_to_audio(in_feats,fs=config.fs):
    harm = in_feats[:,:60]
    ap = in_feats[:,60:-2]
    f0 = in_feats[:,-2:]
    f0[:,0] = f0_to_hertz(f0[:,0])

    f0 = f0[:,0]*(1-f0[:,1])


    # # if mode == 'mfsc':
    # harm = mfsc_to_mgc(harm)

    wraped_freq = get_warped_freqs(60, config.fs, 0.45)
    # import pdb;pdb.set_trace()
    harm = mfsc_to_sp(harm, wraped_freq, 1025, config.fs)
    # ap = mfsc_to_mgc(ap)


    # harm = mgc_to_sp(harm, 1025, 0.4)
    ap = wbap_to_ap(ap, 1025, config.fs)

    harm = np.ascontiguousarray(10**((harm - config.world_offset)/10))
    ap = np.ascontiguousarray(10**(ap/20))


    y=pw.synthesize(f0.astype('double'),harm.astype('double'),ap.astype('double'),fs,config.hoptime*1000)
    return y
Example #32
    def synthesis(self, feat, se_kind='sp'):
        batch_size = feat['ap'].size(0)
        device = feat['ap'].device

        audio = []
        for i in range(batch_size):
            ap = feat['ap'][i].detach().t().cpu().double().numpy()
            f0 = feat['f0'][i].detach().view(-1).cpu().double().numpy()
            if se_kind == 'mcc':
                mcc = feat['mcc'][i].detach().t().cpu().double().numpy()
                sp = pysptk.mc2sp(mcc.copy(order='C'), self.mcc_alpha,
                                  self.fft_size)
            else:
                sp = feat['sp'][i].detach().t().cpu().double().numpy()

            syn = pyworld.synthesize(f0.copy(order='C'),
                                     sp.copy(order='C'),
                                     ap.copy(order='C'),
                                     self.fs,
                                     frame_period=self.shiftms)
            audio.append(torch.from_numpy(syn).float().view(-1))

        audio = torch.cat([syn.unsqueeze(0) for syn in audio],
                          dim=0).to(device)

        return audio / MAX_WAV_VALUE
Example #33
    def gen_waveform(self, feature):
        mcep_dim = self.config['mcep_order'] + 1
        mgc = feature[:, :mcep_dim]
        lf0 = feature[:, mcep_dim:mcep_dim + 1]
        vuv = feature[:, mcep_dim + 1: mcep_dim + 2]
        bap = feature[:, mcep_dim + 2:]

        spectrogram = pysptk.mc2sp(
            mgc,
            fftlen=self.config['fft_size'],
            alpha=pysptk.util.mcepalpha(self.config['sampling_rate']),
        )
        aperiodicity = pyworld.decode_aperiodicity(
            bap.astype(np.float64),
            self.config['sampling_rate'],
            self.config['fft_size'],
        )
        f0 = lf0.copy()
        f0[vuv < 0.5] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

        waveform = pyworld.synthesize(
            f0.flatten().astype(np.float64),
            spectrogram.astype(np.float64),
            aperiodicity.astype(np.float64),
            self.config['sampling_rate'],
            self.config['hop_size_in_ms'],
        )
        return waveform
Example #35
    def decode(
            self,
            acoustic_feature: AcousticFeature,
    ):
        acoustic_feature = acoustic_feature.astype_only_float(numpy.float64)
        out = pyworld.synthesize(
            f0=acoustic_feature.f0.ravel(),
            spectrogram=acoustic_feature.spectrogram,
            aperiodicity=acoustic_feature.aperiodicity,
            fs=self.out_sampling_rate,
            frame_period=self.acoustic_feature_param.frame_period,
        )
        return Wave(out, sampling_rate=self.out_sampling_rate)
Example #36
    def convert_from_feature(self, input: AcousticFeature, out_sampling_rate: Optional[int] = None):
        if out_sampling_rate is None:
            out_sampling_rate = self.config.dataset.param.voice_param.sample_rate

        out = self.convert_to_feature(input=input, out_sampling_rate=out_sampling_rate)
        out = pyworld.synthesize(
            f0=out.f0.ravel(),
            spectrogram=out.spectrogram,
            aperiodicity=out.aperiodicity,
            fs=out_sampling_rate,
            frame_period=self._param.acoustic_feature_param.frame_period,
        )
        return Wave(out, sampling_rate=out_sampling_rate)
Example #37
    def convert_to_audio(
            self,
            input: numpy.ndarray,
            acoustic_feature: AcousticFeature,
            sampling_rate: int,
    ):
        acoustic_feature = acoustic_feature.astype_only_float(numpy.float64)
        out = pyworld.synthesize(
            f0=acoustic_feature.f0.ravel(),
            spectrogram=input.astype(numpy.float64),
            aperiodicity=acoustic_feature.aperiodicity,
            fs=sampling_rate,
            frame_period=self._param.acoustic_feature_param.frame_period,
        )
        return Wave(out, sampling_rate=sampling_rate)
Example #38
    def synthesis(self, f0, mcep, ap, rmcep=None, alpha=0.42):
        """synthesis generates waveform from F0, mcep, aperiodicity

        Parameters
        ----------
        f0 : array, shape (`T`, `1`)
            array of F0 sequence
        mcep : array, shape (`T`, `dim`)
            array of mel-cepstrum sequence
        ap : array, shape (`T`, `fftlen / 2 + 1`) or (`T`, `dim_codeap`)
            array of aperiodicity or code aperiodicity
        rmcep : array, optional, shape (`T`, `dim`)
            array of reference mel-cepstrum sequence
            Default set to None
        alpha : float, optional
            Parameter of the all-pass transfer function
            Default set to 0.42

        Returns
        ----------
        wav: array,
            Synthesized waveform

        """

        if rmcep is not None:
            # power modification
            mcep = mod_power(mcep, rmcep, alpha=alpha)

        if ap.shape[1] < self.fftl // 2 + 1:
            # decode codeap to ap
            ap = pyworld.decode_aperiodicity(ap, self.fs, self.fftl)

        # mcep into spc
        spc = pysptk.mc2sp(mcep, alpha, self.fftl)

        # generate waveform using world vocoder with f0, spc, ap
        wav = pyworld.synthesize(f0, spc, ap,
                                 self.fs, frame_period=self.shiftms)

        return wav
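The ap.shape[1] < self.fftl // 2 + 1 check above distinguishes coded ("band") aperiodicity from the full-resolution kind. A sketch of the matching encode/decode pair in pyworld, using zeros as stand-in data:

import numpy as np
import pyworld

fs, fftl = 16000, 1024
ap = np.zeros((100, fftl // 2 + 1))            # stand-in full aperiodicity
code_ap = pyworld.code_aperiodicity(ap, fs)    # a few bands per frame
ap_hat = pyworld.decode_aperiodicity(code_ap, fs, fftl)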
Example #39
    def synthesis_spc(self, f0, spc, ap):
        """synthesis generates waveform from F0, mcep, ap

        Parameters
        ----------
        f0 : array, shape (`T`, `1`)
          array of F0 sequence
        spc : array, shape (`T`, `fftl // 2 + 1`)
          array of spectral envelope
        ap : array, shape (`T`, `fftl // 2 + 1`)
          array of aperiodicity

        Returns
        -------
        wav: vector, shape (`samples`)
          Synthesized waveform

        """

        # generate waveform using world vocoder with f0, spc, ap
        wav = pyworld.synthesize(f0, spc, ap,
                                 self.fs, frame_period=self.shiftms)

        return wav
Example #40
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]

    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    lengths = [len(mc_scaled)]

    # Add batch axis
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # For MLPG
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)

    # Apply model
    if model.include_parameter_generation():
        # Case: model includes parameter generation in itself
        # Multistream features cannot be used in this case
        y_hat, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(
        mc_static_pred, data_mean[:static_dim], data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs