コード例 #1
0
ファイル: test_world.py プロジェクト: Iselix/kwiiyatta
def test_world_array_order():
    """Check that pyworld rejects the sliced (every-other-element) arrays.

    The full analysis/synthesis chain is first run on the loaded waveform
    as-is.  Each pyworld entry point is then called again with ``[::2]``
    slices and is expected to raise ``ValueError`` whose message is exactly
    ``'ndarray is not C-contiguous'``.
    """
    wav = kwiiyatta.load_wav(dataset.CLB_WAV)

    # Baseline pass on the unsliced data; its outputs feed the checks below.
    f0, timeaxis = pyworld.dio(wav.data, wav.fs)
    f0 = pyworld.stonemask(wav.data, f0, timeaxis, wav.fs)
    spec = pyworld.cheaptrick(wav.data, f0, timeaxis, wav.fs)
    ape = pyworld.d4c(wav.data, f0, timeaxis, wav.fs)
    pyworld.synthesize(f0, spec, ape, wav.fs)

    strided = wav.data[::2]
    expected_msg = 'ndarray is not C-contiguous'

    # Deferred calls so that each one is evaluated inside its own
    # ``pytest.raises`` context, in the same order as a hand-written sequence.
    rejected_calls = (
        lambda: pyworld.dio(strided, wav.fs),
        lambda: pyworld.stonemask(strided, f0, timeaxis, wav.fs),
        lambda: pyworld.cheaptrick(strided, f0, timeaxis, wav.fs),
        lambda: pyworld.d4c(strided, f0, timeaxis, wav.fs),
        lambda: pyworld.synthesize(f0[::2], spec[::2], ape[::2], wav.fs),
    )
    for call in rejected_calls:
        with pytest.raises(ValueError) as excinfo:
            call()
        assert expected_msg == str(excinfo.value)
コード例 #2
0
    def main(args):
        """Run the WORLD analysis/synthesis demo and write results to ``test/``.

        The ``test`` directory is deleted and recreated, the input wav named
        by ``edited_files["Thinking_Out_Loud"]`` is read, and three
        resyntheses (DIO, DIO + StoneMask, Harvest + StoneMask) are written
        as wav files together with comparison figures.

        Args:
            args: Object providing ``frame_period`` and ``speed``.
        """
        # Always start from an empty output directory.
        if os.path.isdir('test'):
            rmtree('test')
        os.mkdir('test')

        signal, rate = sf.read("./{}.wav".format(edited_files["Thinking_Out_Loud"]))

        # 1. The convenient way: one-call analysis with default options.
        f0, sp, ap = pw.wav2world(signal, rate)
        y = pw.synthesize(f0, sp, ap, rate, pw.default_frame_period)

        # 2. Step by step.
        # 2-1 DIO without F0 refinement.
        dio_options = dict(
            f0_floor=50.0,
            f0_ceil=600.0,
            channels_in_octave=2,
            frame_period=args.frame_period,
            speed=args.speed,
        )
        raw_f0, times = pw.dio(signal, rate, **dio_options)
        raw_sp = pw.cheaptrick(signal, raw_f0, times, rate)
        raw_ap = pw.d4c(signal, raw_f0, times, rate)
        raw_y = pw.synthesize(raw_f0, raw_sp, raw_ap, rate, args.frame_period)
        sf.write('test/y_without_f0_refinement.wav', raw_y, rate)

        # 2-2 DIO with F0 refinement (StoneMask).
        f0 = pw.stonemask(signal, raw_f0, times, rate)
        sp = pw.cheaptrick(signal, f0, times, rate)
        ap = pw.d4c(signal, f0, times, rate)
        y = pw.synthesize(f0, sp, ap, rate, args.frame_period)
        sf.write('test/y_with_f0_refinement.wav', y, rate)

        # 2-3 Harvest with F0 refinement (StoneMask).
        harvest_raw_f0, harvest_times = pw.harvest(signal, rate)
        harvest_f0 = pw.stonemask(signal, harvest_raw_f0, harvest_times, rate)
        harvest_sp = pw.cheaptrick(signal, harvest_f0, harvest_times, rate)
        harvest_ap = pw.d4c(signal, harvest_f0, harvest_times, rate)
        harvest_y = pw.synthesize(harvest_f0, harvest_sp, harvest_ap, rate,
                                  pw.default_frame_period)
        sf.write('test/y_harvest_with_f0_refinement.wav', harvest_y, rate)

        # Comparison figures: unrefined vs. refined DIO results.
        savefig('test/wavform.png', [signal, raw_y, y])
        savefig('test/sp.png', [raw_sp, sp])
        savefig('test/ap.png', [raw_ap, ap], log=False)
        savefig('test/f0.png', [raw_f0, f0])

        print('Please check "test" directory for output files')
コード例 #3
0
def main(args):
    """Analyse ``utterance/vaiueo2d.wav`` with WORLD and write results to ``test/``.

    The ``test`` directory is wiped and recreated.  Three resyntheses are
    written as wav files (DIO, DIO + StoneMask, Harvest + StoneMask) and
    four comparison figures are saved next to them.

    Args:
        args: Object providing ``frame_period`` and ``speed``.
    """
    out_dir = 'test'
    if os.path.isdir(out_dir):
        rmtree(out_dir)
    os.mkdir(out_dir)

    audio, sample_rate = sf.read('utterance/vaiueo2d.wav')

    # 1. The convenient way: a single call with default options.
    f0, sp, ap = pw.wav2world(audio, sample_rate)
    y = pw.synthesize(f0, sp, ap, sample_rate, pw.default_frame_period)

    # 2. Step by step.
    # 2-1 Plain DIO estimate, no refinement.
    f0_dio, t_dio = pw.dio(
        audio,
        sample_rate,
        f0_floor=50.0,
        f0_ceil=600.0,
        channels_in_octave=2,
        frame_period=args.frame_period,
        speed=args.speed,
    )
    sp_dio = pw.cheaptrick(audio, f0_dio, t_dio, sample_rate)
    ap_dio = pw.d4c(audio, f0_dio, t_dio, sample_rate)
    y_dio = pw.synthesize(f0_dio, sp_dio, ap_dio, sample_rate,
                          args.frame_period)
    sf.write('test/y_without_f0_refinement.wav', y_dio, sample_rate)

    # 2-2 DIO estimate refined by StoneMask.
    f0 = pw.stonemask(audio, f0_dio, t_dio, sample_rate)
    sp = pw.cheaptrick(audio, f0, t_dio, sample_rate)
    ap = pw.d4c(audio, f0, t_dio, sample_rate)
    y = pw.synthesize(f0, sp, ap, sample_rate, args.frame_period)
    sf.write('test/y_with_f0_refinement.wav', y, sample_rate)

    # 2-3 Harvest estimate refined by StoneMask.
    f0_harvest_raw, t_harvest = pw.harvest(audio, sample_rate)
    f0_harvest = pw.stonemask(audio, f0_harvest_raw, t_harvest, sample_rate)
    sp_harvest = pw.cheaptrick(audio, f0_harvest, t_harvest, sample_rate)
    ap_harvest = pw.d4c(audio, f0_harvest, t_harvest, sample_rate)
    y_harvest = pw.synthesize(f0_harvest, sp_harvest, ap_harvest, sample_rate,
                              pw.default_frame_period)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_harvest, sample_rate)

    # Comparison figures: unrefined vs. refined DIO results.
    savefig('test/wavform.png', [audio, y_dio, y])
    savefig('test/sp.png', [sp_dio, sp])
    savefig('test/ap.png', [ap_dio, ap], log=False)
    savefig('test/f0.png', [f0_dio, f0])

    print('Please check "test" directory for output files')
コード例 #4
0
ファイル: wav2png.py プロジェクト: takiteke/UtaHenkan
def collect_features(x, fs):
    """Return mel-cepstral coefficients extracted from waveform ``x``.

    F0 is estimated with DIO and refined with StoneMask, the spectral
    envelope is computed with CheapTrick, and the envelope is converted with
    ``pysptk.sp2mc``.  ``frame_period``, ``order`` and ``alpha`` are read
    from the enclosing (module) scope.

    Args:
        x: Waveform array; it is cast to ``float64`` before analysis.
        fs: Sampling rate passed to pyworld.

    Returns:
        The array returned by ``pysptk.sp2mc``.
    """
    samples = x.astype(np.float64)
    raw_f0, times = pyworld.dio(samples, fs, frame_period=frame_period)
    refined_f0 = pyworld.stonemask(samples, raw_f0, times, fs)
    envelope = pyworld.cheaptrick(samples, refined_f0, times, fs)
    return pysptk.sp2mc(envelope, order=order, alpha=alpha)
コード例 #5
0
def analysis_resynthesis(signal):
    """Analyse ``signal`` with WORLD, shift pitch and formants, resynthesise.

    ``sample_rate``, ``f0_rate`` and ``sp_rate`` are read from the enclosing
    (module) scope.  F0 is multiplied by ``f0_rate``; the spectral envelope
    is resampled along its frequency axis according to ``sp_rate``.

    Args:
        signal: Waveform handed to the pyworld analysis functions.

    Returns:
        The waveform returned by ``pw.synthesize``.
    """
    # Acoustic feature extraction.
    raw_f0, times = pw.dio(signal, sample_rate)            # fundamental frequency
    f0 = pw.stonemask(signal, raw_f0, times, sample_rate)  # refinement
    sp = pw.cheaptrick(signal, f0, times, sample_rate)     # spectral envelope
    ap = pw.d4c(signal, f0, times, sample_rate)            # aperiodicity

    # Pitch shift.
    modified_f0 = f0_rate * f0

    # Formant shift (uniform stretch/compression of the frequency axis).
    modified_sp = np.zeros_like(sp)
    n_bins = modified_sp.shape[1]
    sp_range = int(n_bins * sp_rate)
    for f in range(n_bins):
        if f >= sp_range:
            # Bins at or above the range boundary are copied unchanged.
            source_bin = f
        elif sp_rate >= 1.0:
            source_bin = int(f / sp_rate)
        else:
            source_bin = int(sp_rate * f)
        modified_sp[:, f] = sp[:, source_bin]

    # Resynthesis.
    return pw.synthesize(modified_f0, modified_sp, ap, sample_rate)
コード例 #6
0
def analyze(wav, fs=FS, minf0=MINF0, maxf0=MAXF0, fperiod=SHIFTMS, fftl=FFTL,
            f0=None, time_axis=None):
    """Extract WORLD features, estimating F0 with a fixed 60 Hz floor.

    When either ``f0`` or ``time_axis`` is missing, both are re-estimated
    with Harvest (``f0_floor=60.0``) followed by StoneMask; otherwise the
    supplied values are used as they are.

    Args:
        wav: Waveform passed to pyworld.
        fs: Sampling rate.
        minf0: Never used.
        maxf0: Never used.
        fperiod: Frame period given to Harvest.
        fftl: FFT size given to CheapTrick and D4C.
        f0: Optional precomputed F0 contour.
        time_axis: Optional time axis matching ``f0``.

    Returns:
        (time_axis, fundamental frequency, spectral envelope, aperiodicity)
    """
    estimate_f0 = f0 is None or time_axis is None
    if estimate_f0:
        raw_f0, time_axis = pw.harvest(wav, fs, f0_floor=60.0,
                                       frame_period=fperiod)
        f0 = pw.stonemask(wav, raw_f0, time_axis, fs)

    envelope = pw.cheaptrick(wav, f0, time_axis, fs, fft_size=fftl)
    aperiodicity = pw.d4c(wav, f0, time_axis, fs, fft_size=fftl)
    return time_axis, f0, envelope, aperiodicity
コード例 #7
0
def analyze_range(wav, fs=FS, minf0=MINF0, maxf0=MAXF0, fperiod=SHIFTMS,
                  fftl=FFTL, f0=None, time_axis=None):
    """Extract WORLD features, estimating F0 within ``[minf0, maxf0]``.

    Args:
        wav: Waveform passed to pyworld.
        fs: Sampling rate.
        minf0: F0 floor given to Harvest.
        maxf0: F0 ceiling given to Harvest.
        fperiod: Frame period given to Harvest.
        fftl: FFT size given to CheapTrick and D4C.
        f0: Given F0. If it or ``time_axis`` is not provided, both are
            estimated from the waveform by WORLD Harvest + StoneMask.
        time_axis: Time axis matching ``f0``.

    Returns:
        (time_axis, fundamental frequency, spectral envelope, aperiodicity)
    """
    have_f0 = f0 is not None and time_axis is not None
    if not have_f0:
        # Harvest gives the coarse F0 estimate ...
        coarse_f0, time_axis = pw.harvest(
            wav, fs, f0_floor=minf0, f0_ceil=maxf0, frame_period=fperiod)
        # ... and StoneMask refines it.
        f0 = pw.stonemask(wav, coarse_f0, time_axis, fs)

    # CheapTrick: spectral envelope.  D4C: aperiodicity.
    envelope = pw.cheaptrick(wav, f0, time_axis, fs, fft_size=fftl)
    aperiodicity = pw.d4c(wav, f0, time_axis, fs, fft_size=fftl)

    return time_axis, f0, envelope, aperiodicity
コード例 #8
0
def anonymization(fs, waveNDArray, f0Value=0, sp_strechRatio=None, gaussian_s=3):
    """Resynthesise a waveform with its F0 replaced by a constant value.

    Used to build "input" audio from "label" audio with speaker information
    removed.

    :param fs: sampling rate passed to pyworld.
    :param waveNDArray: waveform array; cast to ``float64`` before analysis.
    :param f0Value: constant F0 assigned to every frame of the output.
    :param sp_strechRatio: stretch ratio for the spectral envelope along the
        frequency axis.  When ``None`` a fresh value is drawn from
        ``uniform(0.6, 2)`` on every call.
    :param gaussian_s: sigma of the Gaussian filter applied to the noised
        envelope and aperiodicity.
    :return: the waveform returned by ``pw.synthesize``.
    """
    if sp_strechRatio is None:
        # Draw per call: a random default in the signature would be evaluated
        # only once, at definition time, and shared by every call.
        sp_strechRatio = np.random.uniform(0.6, 2)

    # ``np.float`` was only an alias of the builtin float (i.e. float64) and
    # no longer exists in current NumPy releases.
    waveNDArray = waveNDArray.astype(np.float64)
    _f0, t = pw.dio(waveNDArray, fs)            # F0 extraction
    f0 = pw.stonemask(waveNDArray, _f0, t, fs)  # F0 refinement
    sp = pw.cheaptrick(waveNDArray, f0, t, fs)  # spectral envelope
    ap = pw.d4c(waveNDArray, f0, t, fs)         # aperiodicity
    f0_fixed0 = np.ones(f0.shape) * f0Value
    sp_median = np.median(sp)
    ap_median = np.median(ap)

    # Stretch the spectral envelope along the frequency axis; bins whose
    # source index falls outside the envelope keep the minimum value.
    sp2 = np.ones_like(sp) * np.min(sp)
    n_bins = sp.shape[1]
    for f in range(sp2.shape[1]):
        source_bin = int(f / sp_strechRatio)
        if source_bin >= n_bins:
            break
        sp2[:, f] = sp[:, source_bin]

    # Add normally distributed noise to SP/AP, then smooth with a Gaussian.
    sp_noised = sp2 + np.random.normal(sp_median, sp_median / 10, sp2.shape)
    ap_noised = ap + np.random.normal(ap_median, ap_median / 10, ap.shape)
    sp_gaussian = scipy.ndimage.gaussian_filter(sp_noised, gaussian_s)
    ap_gaussian = scipy.ndimage.gaussian_filter(ap_noised, gaussian_s)

    # NOTE(review): synthesis uses the untouched ``sp``/``ap``; the stretched,
    # noised and smoothed ``sp_gaussian``/``ap_gaussian`` above are never
    # consumed.  Kept as-is to preserve the output - confirm whether the
    # processed features were meant to be passed here instead.
    synthesized = pw.synthesize(f0_fixed0, sp, ap, fs)
    return synthesized
コード例 #9
0
ファイル: data_preprocess.py プロジェクト: tuanvu92/VCC2020
def extract_f0(wav_dir, speaker_id_pos=-4):
    """Extract log2-F0 for every accepted wav file under ``wav_dir``.

    Per-speaker F0 limits are read from ``jvs_speaker_info.json``.  For each
    file, F0 is estimated with DIO + StoneMask (12.5 ms frame period),
    values below 1.0 are raised to 1.0, ``log2`` is taken and the result is
    saved as ``float32`` with ``np.save``.

    Args:
        wav_dir: Directory handed to ``get_list_of_files``.
        speaker_id_pos: Index of the speaker name among the ``/``-separated
            components of each file path.
    """
    candidates = get_list_of_files(wav_dir)
    wav_file_list = [path for path in candidates if file_filters(path)]
    with open("jvs_speaker_info.json", "r") as f:
        speaker_info = json.load(f)

    for fname in progressbar(wav_file_list, redirect_stdout=True):
        print(fname)
        speaker_name = fname.split("/")[speaker_id_pos]
        x, fs = librosa.load(fname, sr=None)
        x = x.astype(np.float64)

        f0_limits = speaker_info[speaker_name]
        raw_f0, t = pyworld.dio(x,
                                fs,
                                f0_floor=f0_limits["f0_min"],
                                f0_ceil=f0_limits["f0_max"],
                                frame_period=12.5)
        f0 = pyworld.stonemask(x, raw_f0, t, fs)
        # Clamp to 1.0 so that log2 of the zero-valued frames becomes 0.
        log_f0 = np.log2(np.maximum(f0, 1.0)).astype(np.float32)

        # Derive the output location from the input path.
        out_tokens = fname.replace("wav24", "f0_24k").replace(".wav", "").split('/')
        file_name = out_tokens[-1]
        output_dir = "/".join(out_tokens[:speaker_id_pos + 1])
        if not exists(output_dir):
            os.makedirs(output_dir)
        np.save(join(output_dir, file_name), log_f0)
    print("Finished!")
コード例 #10
0
def data_extraction(np_data, rate):
    """Extract WORLD features (F0, spectral envelope, aperiodicity).

    Args:
        np_data: Waveform array; cast to ``float64`` before analysis.
        rate: Sampling rate passed to pyworld.

    Returns:
        Tuple ``(f0, sp, ap)`` where ``f0`` is the Harvest estimate refined
        by StoneMask, ``sp`` comes from CheapTrick and ``ap`` from D4C.
    """
    # ``np.float`` was only an alias of the builtin float (i.e. float64) and
    # no longer exists in current NumPy releases.
    np_data = np_data.astype(np.float64)
    _f0, t = pw.harvest(np_data, rate)
    f0 = pw.stonemask(np_data, _f0, t, rate)
    sp = pw.cheaptrick(np_data, f0, t, rate)
    ap = pw.d4c(np_data, f0, t, rate)
    return f0, sp, ap
コード例 #11
0
 def collect_features(self, wav_path):
     """Load a wav file and return its stacked acoustic features.

     The returned matrix has one row per analysis frame and these columns,
     in order: F0, interpolated log-F0, voiced/unvoiced flag, coded
     aperiodicity, mel-cepstrum (order 59) and the spectral envelope.

     Args:
         wav_path: Path of the audio file; it is loaded as mono ``float64``
             at ``self.target_sr``.

     Returns:
         ``float32`` array produced by ``np.hstack`` of the features above.
     """
     # x: raw audio, fs: the sampling rate reported by librosa.
     x, fs = librosa.load(wav_path, sr=self.target_sr, mono=True, dtype=np.float64)

     # F0 (DIO + StoneMask).  ``fs`` is used for every pyworld call so that
     # all analysis steps agree on the sampling rate.
     f0, timeaxis = pyworld.dio(x, fs, frame_period=self.hop_sz_in_ms)
     f0 = pyworld.stonemask(x, f0, timeaxis, fs)

     # log-F0: the log is taken only where F0 is non-zero.
     lf0 = f0.copy()
     voiced_idx = np.nonzero(f0)
     lf0[voiced_idx] = np.log(f0[voiced_idx])

     # The voiced/unvoiced flag must be taken BEFORE interpolation.
     # NOTE(review): interp1d presumably fills the zero (unvoiced) frames,
     # after which a ``!= 0`` test no longer separates them - confirm.
     vuv = (lf0 != 0).astype(np.float32)
     lf0 = interp1d(lf0, kind="slinear")

     # Spectral envelope, coded aperiodicity and mel-cepstrum.
     spec = pyworld.cheaptrick(x, f0, timeaxis, fs)
     aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
     bap = pyworld.code_aperiodicity(aperiodicity, fs)
     mgc = pysptk.sp2mc(spec, order=59, alpha=pysptk.util.mcepalpha(fs))

     # Column-wise stacking; the 1-D tracks are lifted to column vectors.
     features = np.hstack((f0[:, None], lf0[:, None], vuv[:, None], bap, mgc, spec))
     return features.astype(np.float32)
コード例 #12
0
def wav2world(wavfile, frame_period):
    """Load ``wavfile`` and return stacked WORLD features as ``float32``.

    Columns of the result, in order: mel-cepstrum, log-F0, voiced/unvoiced
    flag, coded aperiodicity.  As a side effect ``hp.num_bap`` is set to the
    number of coded-aperiodicity columns.

    Args:
        wavfile: Path of the audio file, loaded at ``hp.sample_rate``.
        frame_period: Frame period passed to the F0 estimator.
    """
    wav, fs = librosa.load(wavfile, sr=hp.sample_rate, dtype=np.float64)

    # F0: Harvest alone, or DIO followed by StoneMask refinement.
    if not hp.use_harvest:
        f0, timeaxis = pyworld.dio(wav, fs, frame_period=frame_period)
        f0 = pyworld.stonemask(wav, f0, timeaxis, fs)
    else:
        f0, timeaxis = pyworld.harvest(wav, fs, frame_period=frame_period)

    spectrogram = pyworld.cheaptrick(wav, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(wav, f0, timeaxis, fs)

    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    hp.num_bap = bap.shape[1]

    alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=hp.num_mgc - 1, alpha=alpha)

    # log-F0 as a column vector; the log is taken only where F0 is non-zero.
    f0 = f0[:, None]
    lf0 = f0.copy()
    voiced = f0 != 0
    lf0[voiced] = np.log(f0[voiced])

    if hp.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)

    return np.hstack((mgc, lf0, vuv, bap)).astype(np.float32)
コード例 #13
0
def convert(signal):
    """Pitch- and formant-shift ``signal`` with fixed rates via WORLD.

    F0 is multiplied by 2.4 and the spectral envelope is resampled along its
    frequency axis with a rate of 0.78; analysis and synthesis use a
    sampling rate of 16000.

    Args:
        signal: Waveform handed to the pyworld analysis functions.

    Returns:
        The waveform returned by ``pyworld.synthesize``.
    """
    f0_rate = 2.4
    sp_rate = 0.78
    sample_rate = 16000

    raw_f0, times = pyworld.dio(signal, sample_rate)
    f0 = pyworld.stonemask(signal, raw_f0, times, sample_rate)
    sp = pyworld.cheaptrick(signal, f0, times, sample_rate)
    ap = pyworld.d4c(signal, f0, times, sample_rate)

    modified_f0 = f0_rate * f0

    # Formant shift (uniform stretch/compression of the frequency axis).
    modified_sp = np.zeros_like(sp)
    n_bins = modified_sp.shape[1]
    sp_range = int(n_bins * sp_rate)
    for f in range(n_bins):
        if f >= sp_range:
            # Bins at or above the range boundary are copied unchanged.
            source_bin = f
        elif sp_rate >= 1.0:
            source_bin = int(f / sp_rate)
        else:
            source_bin = int(sp_rate * f)
        modified_sp[:, f] = sp[:, source_bin]

    return pyworld.synthesize(modified_f0, modified_sp, ap, sample_rate)
コード例 #14
0
ファイル: vocoder.py プロジェクト: xzm2004260/TTS-Cube
 def extract_spectrum(self, x, sample_rate):
     """Return ``(sp, ap, f0)`` WORLD features for ``x``.

     ``x`` is converted with ``np.asarray``; F0 is estimated by DIO with a
     12.5 ms frame period and refined by StoneMask before CheapTrick and
     D4C are run.
     """
     samples = np.asarray(x)
     raw_f0, times = pw.dio(samples, sample_rate, frame_period=12.5)
     refined_f0 = pw.stonemask(samples, raw_f0, times, sample_rate)
     envelope = pw.cheaptrick(samples, refined_f0, times, sample_rate)
     aperiodicity = pw.d4c(samples, refined_f0, times, sample_rate)
     return envelope, aperiodicity, refined_f0
コード例 #15
0
 def extract_f0(self, **kwargs):
     """Return the F0 contour, computing and caching it on first use.

     When ``self._f0`` is already set it is returned untouched and
     ``kwargs`` are ignored.  Otherwise DIO is run on ``self.data`` (extra
     ``kwargs`` are forwarded to it), the result is refined by StoneMask,
     and both ``self._f0`` and ``self._timeaxis`` are stored.
     """
     if self._f0 is not None:
         return self._f0

     dio_result = pyworld.dio(
         self.data, self.fs, frame_period=self.frame_period, **kwargs)
     self._f0, self._timeaxis = dio_result
     self._f0 = pyworld.stonemask(
         self.data, self._f0, self._timeaxis, self.fs)
     return self._f0
コード例 #16
0
ファイル: 03_a_b_r.py プロジェクト: enamoria/exemplars_vc
def get_conversion_data(audiodata, fs, refine_f0):
    """Extract WORLD features (sp, ap, f0) for each waveform in ``audiodata``.

    :param audiodata: iterable of waveforms
    :param fs: sampling rate passed to pyworld
    :param refine_f0: when truthy, the DIO estimate is refined with StoneMask
    :return: list with one dict per waveform holding the keys ``sp``,
        ``ap``, ``f0``, ``fs`` and ``sr`` (the last two both hold ``fs``)
    """
    logging.info("Start building speaker A dictionary: Extracting feature for conversion (sp, ap, f0)")

    features = []
    for audio in tqdm(audiodata):
        raw_f0, t = pw.dio(audio, fs)  # raw pitch extractor
        # Optional pitch refinement.
        f0 = pw.stonemask(audio, raw_f0, t, fs) if refine_f0 else raw_f0

        envelope = pw.cheaptrick(audio, f0, t, fs)  # smoothed spectrogram
        aperiodicity = pw.d4c(audio, f0, t, fs)

        features.append(dict(sp=envelope, ap=aperiodicity, f0=f0, fs=fs, sr=fs))

    return features
コード例 #17
0
def wav2pw(wavfile, sr=SR, fft_size=FFT_SIZE, frame_period=FRAME_PERIOD):
    """Load ``wavfile`` and return its WORLD features ``(f0, sp, ap)``.

    The file is loaded as mono ``float64`` at ``sr``.  F0 comes from Harvest
    refined by StoneMask; CheapTrick and D4C both use ``fft_size``.
    """
    samples, _ = librosa.load(wavfile, sr=sr, mono=True, dtype=np.float64)
    raw_f0, times = pw.harvest(samples, sr, frame_period=frame_period)
    refined_f0 = pw.stonemask(samples, raw_f0, times, sr)
    envelope = pw.cheaptrick(samples, refined_f0, times, sr, fft_size=fft_size)
    aperiodicity = pw.d4c(samples, refined_f0, times, sr, fft_size=fft_size)
    return refined_f0, envelope, aperiodicity
コード例 #18
0
def get_features(filename, *, winlen, winstep, n_mcep, mcep_alpha, minf0,
                 maxf0, type):
    """Load a wav file and return ``(id, waveform, features)``.

    The feature matrix is either mel-cepstrum (``type == 'mcc'``) or the
    output of ``mfcc``, with the F0 contour stacked underneath as an extra
    row and then padded by ``repeat_last_padding``.

    Args:
        filename: Path of the wav file.
        winlen: Window length; multiplied by the sampling rate to get samples.
        winstep: Hop length; multiplied by the sampling rate to get samples,
            and by 1000 to get the frame period passed to Harvest.
        n_mcep: Number of cepstral coefficients.
        mcep_alpha: Alpha passed to ``sptk.sp2mc``.
        minf0: F0 floor.
        maxf0: F0 ceiling.
        type: ``'mcc'`` selects WORLD/SPTK mel-cepstrum; anything else
            selects ``mfcc``.

    Returns:
        Tuple of the file's base name without ``.wav``, the ``float``
        waveform, and the padded feature matrix.
    """
    wav, sr = load(filename, sr=None)
    x = wav.astype(float)

    # F0 via Harvest + StoneMask.
    raw_f0, t = world.harvest(
        x, sr, f0_floor=minf0, f0_ceil=maxf0, frame_period=winstep * 1000)
    f0 = world.stonemask(x, raw_f0, t, sr)

    window_size = int(sr * winlen)
    hop_size = int(sr * winstep)

    # Spectral features.
    if type != 'mcc':
        h = mfcc(x, sr, n_mfcc=n_mcep, n_fft=window_size, hop_length=hop_size)
    else:
        spec = world.cheaptrick(x, f0, t, sr, f0_floor=minf0)
        h = sptk.sp2mc(spec, n_mcep - 1, mcep_alpha).T

    # Append F0 as the last row, then pad to the expected frame count.
    h = np.vstack((h, f0))
    maxlen = len(x) // hop_size + 2
    h = repeat_last_padding(h, maxlen)

    utterance_id = os.path.basename(filename).replace(".wav", "")
    return (utterance_id, x, h)
コード例 #19
0
    def compute_f0(self, x: np.ndarray) -> np.ndarray:
        """Compute the pitch (f0) contour of a waveform with DIO + StoneMask.

        The frame period is derived from ``self.hop_length`` and
        ``self.sample_rate``; ``self.pitch_fmax`` is used as the F0 ceiling.

        Args:
            x (np.ndarray): Waveform.

        Returns:
            np.ndarray: Pitch.

        Raises:
            AssertionError: If ``self.pitch_fmax`` is ``None``.

        Examples:
            >>> WAV_FILE = filename = librosa.util.example_audio_file()
            >>> from TTS.config import BaseAudioConfig
            >>> from TTS.utils.audio import AudioProcessor
            >>> conf = BaseAudioConfig(pitch_fmax=8000)
            >>> ap = AudioProcessor(**conf)
            >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
            >>> pitch = ap.compute_f0(wav)
        """
        assert self.pitch_fmax is not None, " [!] Set `pitch_fmax` before caling `compute_f0`."

        # Align the F0 length to the spectrogram length: when the signal is
        # an exact multiple of the hop, extend it by half a hop (reflected).
        hop = self.hop_length
        if len(x) % hop == 0:
            x = np.pad(x, (0, hop // 2), mode="reflect")

        samples = x.astype(np.double)
        frame_period_ms = 1000 * hop / self.sample_rate
        raw_f0, times = pw.dio(
            samples,
            fs=self.sample_rate,
            f0_ceil=self.pitch_fmax,
            frame_period=frame_period_ms,
        )
        return pw.stonemask(samples, raw_f0, times, self.sample_rate)
コード例 #20
0
def raw2WORLDfeatures(signal, fs=16000, fft_size=1024):
    """Return ``(f0, spectra, aperiodicity)`` WORLD features of ``signal``.

    F0 is estimated by DIO with a 500 Hz ceiling and refined by StoneMask;
    CheapTrick and D4C both use ``fft_size``.
    """
    raw_contour, times = pw.dio(signal, fs, f0_ceil=500)
    f0 = pw.stonemask(signal, raw_contour, times, fs)
    analysis_args = (signal, f0, times, fs)
    spectra = pw.cheaptrick(*analysis_args, fft_size=fft_size)
    aperiodicity = pw.d4c(*analysis_args, fft_size=fft_size)
    return f0, spectra, aperiodicity
コード例 #21
0
def save_features_to_array(path = Data_Directory):
	"""Extract per-label F0 and MFCC features and save them as ``.npy`` files.

	For every label returned by ``get_labels(path)``, each file in the
	label's sub-directory (in sorted order) is read, its F0 contour is
	estimated with DIO + StoneMask and its MFCCs are computed with
	``wav2mfcc``.  The collected lists are written to
	``features/mfcc_<label>.npy`` and ``features/fundfreq_<label>.npy``.

	Args:
		path: Root directory containing one sub-directory per label.  It may
			be given with or without a trailing separator.
	"""
	labels, _, _ = get_labels(path)
	print(labels)
	for label in labels:
		fundfreq_vectors = []
		mfcc_vectors = []

		# Build the directory once with os.path.join so that listing and
		# reading use the same location regardless of a trailing separator.
		label_dir = os.path.join(path, label)
		wavfiles = [os.path.join(label_dir, wavfile) for wavfile in sorted(os.listdir(label_dir))]
		for wavfile in wavfiles:
			print(wavfile)
			x, fs = sf.read(wavfile)

			_f0, t = pw.dio(x, fs)
			f0 = pw.stonemask(x, _f0, t, fs)
			fundfreq_vectors.append(f0)

			mfcc = wav2mfcc(wavfile, max_pad_len=120)
			mfcc_vectors.append(mfcc)

		# NOTE(review): F0 contours of different lengths make
		# ``fundfreq_vectors`` ragged; confirm np.save accepts that with the
		# NumPy version in use.
		np.save('features/mfcc_' + label + '.npy', mfcc_vectors)
		np.save('features/fundfreq_' + label + '.npy', fundfreq_vectors)
0
def get_para(data, fs):
    """Return ``(fo, sp, ap)`` WORLD parameters for ``data``.

    Runs the analysis step by step: DIO for the raw F0, StoneMask to refine
    it, CheapTrick for the spectral envelope and D4C for the aperiodicity.
    """
    raw_fo, time_axis = pw.dio(data, fs)              # F0 extraction
    fo = pw.stonemask(data, raw_fo, time_axis, fs)    # F0 refinement
    analysis_args = (data, fo, time_axis, fs)
    sp = pw.cheaptrick(*analysis_args)                # spectral envelope
    ap = pw.d4c(*analysis_args)                       # aperiodicity
    return fo, sp, ap
コード例 #23
0
    def collect_features(self, wav_path, label_path):
        """Return acoustic features of one utterance with silence removed.

        Features are, column-wise: mel-cepstrum, interpolated log-F0 and
        coded aperiodicity (each after ``apply_delta_windows``), with the
        voiced/unvoiced flag between log-F0 and aperiodicity.  The matrix is
        truncated to the number of frames in the HTS label file and the
        frames the labels mark as silence are deleted.

        ``frame_period``, ``order`` and ``windows`` are read from the
        enclosing (module) scope.

        Args:
            wav_path: Path of the wav file.
            label_path: Path of the HTS label file.

        Returns:
            ``float32`` feature matrix.
        """
        fs, samples = wavfile.read(wav_path)
        samples = samples.astype(np.float64)

        # WORLD analysis.
        f0, timeaxis = pyworld.dio(samples, fs, frame_period=frame_period)
        f0 = pyworld.stonemask(samples, f0, timeaxis, fs)
        spectrogram = pyworld.cheaptrick(samples, f0, timeaxis, fs)
        aperiodicity = pyworld.d4c(samples, f0, timeaxis, fs)

        bap = pyworld.code_aperiodicity(aperiodicity, fs)
        mgc = pysptk.sp2mc(
            spectrogram, order=order, alpha=pysptk.util.mcepalpha(fs))

        # log-F0 as a column vector; the log is taken only where F0 != 0.
        f0 = f0[:, None]
        lf0 = f0.copy()
        voiced = f0 != 0
        lf0[voiced] = np.log(f0[voiced])
        # The V/UV flag is taken before log-F0 is interpolated.
        vuv = (lf0 != 0).astype(np.float32)
        lf0 = interp1d(lf0, kind="slinear")

        mgc, lf0, bap = (
            apply_delta_windows(stream, windows) for stream in (mgc, lf0, bap))

        features = np.hstack((mgc, lf0, vuv, bap))

        # Cut silence frames by HTS alignment.
        labels = hts.load(label_path)
        features = features[:labels.num_frames()]
        silence = labels.silence_frame_indices()
        features = np.delete(features, silence, axis=0)

        return features.astype(np.float32)
コード例 #24
0
ファイル: predict.py プロジェクト: tkm2261/dnn-voice-changer
def generate_changed_voice(model, input_path):
    """Convert the voice in ``input_path`` with ``model`` and resynthesise it.

    The mel-cepstrum (without its 0th coefficient) is smoothed, expanded
    with delta features and passed to ``model.predict``.  The prediction is
    recombined with the original 0th coefficient, turned back into a
    spectral envelope and synthesised with the original F0 and aperiodicity.

    Args:
        model: Object exposing ``predict``.
        input_path: Path of the wav file to convert.

    Returns:
        The waveform returned by ``pyworld.synthesize``.
    """
    fs, samples = wavfile.read(input_path)
    samples = samples.astype(np.float64)
    if samples.ndim > 1:
        # Average the channels down to a single one.
        samples = samples.mean(axis=1)

    # WORLD analysis.
    f0, timeaxis = pyworld.dio(samples, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(samples, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(samples, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(samples, f0, timeaxis, fs)

    alpha = pysptk.util.mcepalpha(fs)
    mc_full = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0 = mc_full[:, 0]
    mc = mc_full[:, 1:]

    mc = P.modspec_smoothing(mc, FS / HOP_LENGHT, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    predicted = model.predict(mc)

    # Put the untouched 0th coefficient back in front of the prediction.
    converted_mc = np.hstack([c0.reshape((-1, 1)), predicted])

    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    converted_sp = pysptk.mc2sp(
        converted_mc.astype(np.float64), alpha=alpha, fftlen=fftlen)

    return pyworld.synthesize(
        f0, converted_sp, aperiodicity, fs, hp.frame_period)
コード例 #25
0
def wave2world(data):
    """Analyse ``data`` with WORLD at a fixed 44100 Hz sampling rate.

    Parameters
    ----------
    data : float64
        Waveform handed to pyworld.
        NOTE(review): expected value range [-1.0, 1.0] and a length set by
        the config file - confirm against the caller.

    Returns
    -------
    _f0 : float64
        DIO estimate refined by StoneMask (10 ms frame period).
    _cepstrum : float32
        CheapTrick output mapped through ``(log(x) + 7) / 9`` and clipped to
        [-1.0, 1.0].
    _aperiodicity : float64
        D4C output.
    """
    sampling_rate = 44100

    raw_f0, times = pw.dio(data, sampling_rate, frame_period=10)
    _f0 = pw.stonemask(data, raw_f0, times, sampling_rate)

    envelope = pw.cheaptrick(data, _f0, times, sampling_rate)
    # Log-compress, rescale and clip into [-1, 1].
    _cepstrum = np.clip((np.log(envelope) + 7) / 9, -1.0, 1.0)

    _aperiodicity = pw.d4c(data, _f0, times, sampling_rate)
    return _f0, _cepstrum.astype(np.float32), _aperiodicity
コード例 #26
0
ファイル: acoustic_feature.py プロジェクト: BURI55/yukarin
    def extract(wave: Wave, frame_period, f0_floor, f0_ceil, fft_length, order,
                alpha, dtype):
        """Build an ``AcousticFeature`` from ``wave`` using WORLD and SPTK.

        F0 is estimated by Harvest and refined by StoneMask; the spectral
        envelope and aperiodicity use ``fft_length`` as FFT size.  The
        resulting feature is cast with ``astype_only_float(dtype)`` and
        validated before being returned.
        """
        samples = wave.wave.astype(numpy.float64)
        rate = wave.sampling_rate

        raw_f0, times = pyworld.harvest(
            samples,
            rate,
            frame_period=frame_period,
            f0_floor=f0_floor,
            f0_ceil=f0_ceil,
        )
        f0 = pyworld.stonemask(samples, raw_f0, times, rate)

        envelope = pyworld.cheaptrick(samples, f0, times, rate, fft_size=fft_length)
        aperiodicity = pyworld.d4c(samples, f0, times, rate, fft_size=fft_length)

        mel_cepstrum = pysptk.sp2mc(envelope, order=order, alpha=alpha)
        coded_ap = pyworld.code_aperiodicity(aperiodicity, rate)
        # Frames with a non-zero F0 are the voiced ones.
        voiced = f0 != 0  # type: numpy.ndarray

        feature = AcousticFeature(
            f0=f0[:, None],
            sp=envelope,
            ap=aperiodicity,
            coded_ap=coded_ap,
            mc=mel_cepstrum,
            voiced=voiced[:, None],
        ).astype_only_float(dtype)
        feature.validate()
        return feature
コード例 #27
0
def _resample_bins(frames, n_channels):
    """Linearly resample every row of ``frames`` to ``n_channels`` points."""
    n_bins = frames.shape[1]
    # Sample positions span [0, n_bins]; positions past the last bin index
    # take the last bin's value (np.interp clamps outside the known range).
    positions = np.linspace(0, n_bins, n_channels)
    bin_index = np.arange(n_bins)
    return np.stack([np.interp(positions, bin_index, row) for row in frames])


def get_target(x, fs, n_ap_channels, n_sp_channels):
    """Return reduced aperiodicity and log spectral envelope targets.

    F0 is estimated by DIO (120-750 Hz, 8 ms frame period) and refined by
    StoneMask.  Aperiodicity is mapped through ``ap * 20 - 18`` and the
    spectral envelope through ``log``; each frame of both is then linearly
    resampled along the frequency axis.  The number of frequency bins is
    taken from the analysis output, so any FFT size is supported.

    Args:
        x: Waveform passed to pyworld.
        fs: Sampling rate.
        n_ap_channels: Number of output channels for the aperiodicity.
        n_sp_channels: Number of output channels for the spectral envelope.

    Returns:
        Tuple ``(_ap, _sp)`` with one row per analysis frame and
        ``n_ap_channels`` / ``n_sp_channels`` columns respectively.
    """
    _f0, t = pw.dio(x, fs, f0_floor=120.0, f0_ceil=750.0,
                    frame_period=8.0)
    f0_herz = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0_herz, t, fs)
    ap = pw.d4c(x, f0_herz, t, fs)

    _ap = _resample_bins(ap * 20 - 18, n_ap_channels)
    _sp = _resample_bins(np.log(sp), n_sp_channels)

    return _ap, _sp
コード例 #28
0
def pre_process(file_name, training_dir):
    """Prepare the training data for one recording.

    The audio ``<training_dir><file_name>.wav`` is read and resampled to
    ``params.sample_rate``; F0 is estimated with Harvest + StoneMask.
    Phoneme data are derived from the transcript
    ``<training_dir>Transcripts/<file_name>.txt`` and joined column-wise
    with the processed frequency data to form the labels.

    Args:
        file_name: Base name of the recording (without extension).
        training_dir: Directory prefix; it is concatenated as-is, so it is
            expected to end with a path separator.

    Returns:
        ``[spectral_data, aperiodic_data, label_data, frequency]``.
    """
    audio_file_name = training_dir + file_name + '.wav'
    lyrics_file_name = training_dir + 'Transcripts/' + file_name + '.txt'

    audio_data, sample_rate = soundfile.read(audio_file_name)
    # Pass the rates by keyword: recent librosa releases accept ``orig_sr``
    # and ``target_sr`` as keyword-only arguments.
    audio_data = librosa.resample(audio_data,
                                  orig_sr=sample_rate,
                                  target_sr=params.sample_rate)
    sample_rate = params.sample_rate

    harvest_frequency, timing = pyworld.harvest(
        audio_data,
        sample_rate,
        f0_floor=params.min_freq,
        f0_ceil=params.max_freq,
        frame_period=params.frame_period)
    frequency = pyworld.stonemask(audio_data, harvest_frequency, timing,
                                  sample_rate)
    # Number of analysis frames.
    audio_length = len(frequency)

    phoneme_data = extract_phoneme_data(
        [audio_file_name, lyrics_file_name, audio_length])

    frequency_data = process_frequency(frequency)

    label_data = pd.concat([phoneme_data, frequency_data], axis=1)

    spectral_data, aperiodic_data = extract_timbre_data(
        [audio_data, frequency, timing, sample_rate])

    return [spectral_data, aperiodic_data, label_data, frequency]
コード例 #29
0
def wavfile2pw(filename, f0_ceil=F0_CEIL, fs=FS, fft_size=FFT_SIZE):
    """Speech analysis of a wav file with PyWorld.

    Follows the practice in https://github.com/JeremyCCHsu/vae-npvc.

    NOTE: the returned spectrum is normalised by the per-frame energy and
    transformed to log10 scale.

    Args:
        filename: the wav file.
        f0_ceil: maximum f0 given to DIO.
        fs: sampling frequency; librosa handles the conversion from the
            file's original rate.
        fft_size: FFT size used by CheapTrick and D4C.

    Returns:
        f0: the pitch / fundamental frequencies.
        sp: log10 of the energy-normalised spectrogram.
        ap: aperiodicity.
        en: per-frame energy (sum of ``sp + EPSILON`` over frequency).
    """
    samples, _ = librosa.load(filename, sr=fs, mono=True, dtype=np.float64)

    raw_f0, times = pw.dio(samples, fs, f0_ceil=f0_ceil)
    f0 = pw.stonemask(samples, raw_f0, times, fs)
    envelope = pw.cheaptrick(samples, f0, times, fs, fft_size=fft_size)
    ap = pw.d4c(samples, f0, times, fs, fft_size=fft_size)

    # Per-frame energy, kept as a column so it broadcasts over frequency.
    en = np.sum(envelope + EPSILON, axis=1, keepdims=True)
    log_sp = np.log10(envelope / en)
    return f0, log_sp, ap, en
コード例 #30
0
def process_wav(wav_path):
    """Read an audio file and extract WORLD features at 32 kHz.

    The file is read with explicit raw-format parameters (16-bit PCM, one
    channel, 48000 Hz, little endian) and resampled to 32000 Hz when the
    reported rate differs.  ``f0_min`` and ``f0_max`` are read from the
    enclosing (module) scope.

    Args:
        wav_path: Path of the audio file.

    Returns:
        Tuple ``(_f0, _sp, code_sp, _ap, code_ap)``: refined F0, spectral
        envelope, its ``code_harmonic`` coding (60), aperiodicity, and the
        coded aperiodicity.
    """
    y, osr = sf.read(wav_path, subtype='PCM_16', channels=1, samplerate=48000,
                     endian='LITTLE')

    sr = 32000
    if osr != sr:
        # Pass the rates by keyword: recent librosa releases accept
        # ``orig_sr`` and ``target_sr`` as keyword-only arguments.
        y = librosa.resample(y, orig_sr=osr, target_sr=sr)

    # Estimate the fundamental frequency (F0) with Harvest, then refine it.
    _f0, t = pw.harvest(y, sr, f0_floor=f0_min, f0_ceil=f0_max, frame_period=pw.default_frame_period)
    _f0 = pw.stonemask(y, _f0, t, sr)
    print(_f0.shape)

    # Spectral envelope via CheapTrick.
    _sp = pw.cheaptrick(y, _f0, t, sr)

    code_sp = code_harmonic(_sp, 60)
    print(_sp.shape, code_sp.shape)

    # Aperiodicity via D4C.
    _ap = pw.d4c(y, _f0, t, sr)

    code_ap = pw.code_aperiodicity(_ap, sr)
    print(_ap.shape, code_ap.shape)

    return _f0, _sp, code_sp, _ap, code_ap
コード例 #31
0
def pyworld_featurize(audiofile):
    """Compute summary statistics of WORLD features for one audio file.

    Statistics are computed with ``stats`` for, in order: the raw DIO pitch,
    the StoneMask-refined pitch, the smoothed spectrogram and the
    aperiodicity.  The four feature lists and the four label lists are
    concatenated in that order.

    Args:
        audiofile: Path of the wav file.

    Returns:
        Tuple ``(features, labels)``.
    """
    fs, x = wav.read(audiofile)
    print(x)
    print(fs)
    # Multi-channel audio: keep only the first channel.
    if x.ndim > 1:
        x = x[:, 0]
    # pyworld needs a C-contiguous double array.
    x = np.ascontiguousarray(x, dtype=np.double)
    print(fs)
    print(x)

    _f0, t = pw.dio(x, fs)  # raw pitch extractor
    f0 = pw.stonemask(x, _f0, t, fs)  # pitch refinement
    sp = pw.cheaptrick(x, f0, t, fs)  # extract smoothed spectrogram
    ap = pw.d4c(x, f0, t, fs)  # extract aperiodicity

    features_0, labels_0 = stats(_f0, 'pitch')
    # The refined contour (not the raw one) belongs to 'pitch_refinement'.
    features_1, labels_1 = stats(f0, 'pitch_refinement')
    features_2, labels_2 = stats(sp, 'smoothed_spectrogram')
    features_3, labels_3 = stats(ap, 'aperiodicity')

    features = (list(features_0) + list(features_1)
                + list(features_2) + list(features_3))
    labels = labels_0 + labels_1 + labels_2 + labels_3

    return features, labels
コード例 #32
0
ファイル: analyzer.py プロジェクト: QianQQ/Voice-Conversion
def wav2pw(x, fs=16000, fft_size=FFT_SIZE):
    """Analyse a waveform with WORLD.

    Returns a dict with the refined pitch under ``'f0'``, the CheapTrick
    spectral envelope under ``'sp'`` and the D4C aperiodicity under ``'ap'``.
    """
    # Coarse pitch track from DIO, capped at the configured F0 ceiling.
    raw_f0, time_axis = pw.dio(x, fs, f0_ceil=args.f0_ceil)
    # StoneMask refines the coarse estimate.
    refined_f0 = pw.stonemask(x, raw_f0, time_axis, fs)
    envelope = pw.cheaptrick(x, refined_f0, time_axis, fs, fft_size=fft_size)
    aperiodicity = pw.d4c(x, refined_f0, time_axis, fs, fft_size=fft_size)
    return dict(f0=refined_f0, sp=envelope, ap=aperiodicity)
コード例 #33
0
    def collect_features(self, wav_path, label_path):
        """Build the acoustic feature matrix for one utterance.

        WORLD analysis yields F0, spectral envelope and aperiodicity. These
        are turned into mel-cepstrum (mgc), interpolated log-F0 (lf0), a
        voiced/unvoiced flag (vuv) and coded aperiodicity (bap); mgc, lf0 and
        bap get delta features, everything is stacked column-wise, and the
        frames marked as silence by the HTS label file are removed.

        Args:
            wav_path: Path of the WAV file to analyse.
            label_path: Path of the HTS label file used for trimming.

        Returns:
            The feature matrix as float32.
        """
        fs, x = wavfile.read(wav_path)
        x = x.astype(np.float64)

        use_harvest = hp_acoustic.use_harvest
        f0_options = dict(
            frame_period=hp_acoustic.frame_period,
            f0_floor=hp_acoustic.f0_floor,
            f0_ceil=hp_acoustic.f0_ceil,
        )
        if use_harvest:
            f0, timeaxis = pyworld.harvest(x, fs, **f0_options)
        else:
            # Only the DIO estimate goes through StoneMask refinement.
            f0, timeaxis = pyworld.dio(x, fs, **f0_options)
            f0 = pyworld.stonemask(x, f0, timeaxis, fs)

        spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
        aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
        bap = pyworld.code_aperiodicity(aperiodicity, fs)

        # The all-pass constant is derived once from the first file's
        # sampling rate and cached on the instance.
        if self.alpha is None:
            self.alpha = pysptk.util.mcepalpha(fs)
        mgc = pysptk.sp2mc(
            spectrogram, order=hp_acoustic.order, alpha=self.alpha)

        # Log-F0 as a column vector; frames with F0 == 0 keep the value 0.
        f0 = f0[:, None]
        has_pitch = f0 != 0
        lf0 = f0.copy()
        lf0[has_pitch] = np.log(f0[has_pitch])

        if use_harvest:
            # https://github.com/mmorise/World/issues/35#issuecomment-306521887
            vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
        else:
            vuv = (lf0 != 0).astype(np.float32)
        # The voicing flag is taken before the zero frames are interpolated.
        lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)

        # Optional parameter trajectory smoothing of the mel-cepstrum.
        if hp_acoustic.mod_spec_smoothing:
            hop_length = int(fs * (hp_acoustic.frame_period * 0.001))
            mgc = P.modspec_smoothing(
                mgc, fs / hop_length,
                cutoff=hp_acoustic.mod_spec_smoothing_cutoff)

        # Append dynamic (delta) features to each continuous stream.
        mgc, lf0, bap = (
            P.delta_features(stream, hp_acoustic.windows)
            for stream in (mgc, lf0, bap)
        )

        stacked = np.hstack((mgc, lf0, vuv, bap))

        # Cut silence frames by HTS alignment.
        alignment = hts.load(label_path)
        stacked = stacked[:alignment.num_frames()]
        silence_rows = alignment.silence_frame_indices()
        stacked = np.delete(stacked, silence_rows, axis=0)

        return stacked.astype(np.float32)
コード例 #34
0
 def collect_features(self, wav_path):
     """Return smoothed mel-cepstrum features with deltas for one WAV file.

     The spectral envelope is estimated with WORLD (DIO + StoneMask +
     CheapTrick), converted to mel-cepstrum without its 0-th coefficient,
     smoothed in the modulation-spectrum domain and extended with delta
     features. The result is float32.
     """
     fs, samples = wavfile.read(wav_path)
     signal = samples.astype(np.float64)

     f0, timeaxis = pyworld.dio(signal, fs, frame_period=hp.frame_period)
     f0 = pyworld.stonemask(signal, f0, timeaxis, fs)
     envelope = P.trim_zeros_frames(
         pyworld.cheaptrick(signal, f0, timeaxis, fs))

     # The all-pass constant is computed lazily and cached on the instance.
     if self.alpha is None:
         self.alpha = pysptk.util.mcepalpha(fs)

     # Mel-cepstrum with the 0-th coefficient dropped.
     static = pysptk.sp2mc(envelope, order=hp.order, alpha=self.alpha)[:, 1:]

     # 50Hz cut-off modulation-spectrum smoothing.
     frame_shift = int(fs * (hp.frame_period * 0.001))
     smoothed = P.modspec_smoothing(static, fs / frame_shift, cutoff=50)

     # Append delta features and cast for storage.
     return P.delta_features(smoothed, hp.windows).astype(np.float32)
コード例 #35
0
    def __call__(self, data: Wave, test=None):
        """Extract an ``AcousticFeature`` from a wave.

        F0 is estimated with pyworld's DIO when the configured method is
        ``'dio'`` and with world4py's Harvest otherwise, then refined by
        StoneMask. Spectral envelope, aperiodicity and mel-cepstrum are
        derived from that F0. The ``test`` argument is accepted but not used.

        Returns:
            The validated ``AcousticFeature``.
        """
        fs = data.sampling_rate
        x = data.wave.astype(numpy.float64)

        if self._f0_estimating_method == 'dio':
            estimate_f0 = pyworld.dio
        else:
            # Imported lazily so world4py is only needed for Harvest.
            from world4py.np import apis
            estimate_f0 = apis.harvest

        raw_f0, t = estimate_f0(
            x,
            fs,
            frame_period=self._frame_period,
            f0_floor=self._f0_floor,
            f0_ceil=self._f0_ceil,
        )
        f0 = pyworld.stonemask(x, raw_f0, t, fs)
        envelope = pyworld.cheaptrick(x, f0, t, fs)
        ap = pyworld.d4c(x, f0, t, fs)

        cepstrum = pysptk.sp2mc(envelope, order=self._order, alpha=self._alpha)
        # A frame counts as voiced when its refined F0 is non-zero.
        is_voiced = f0 != 0

        out_dtype = self._dtype
        feature = AcousticFeature(
            f0=f0[:, None].astype(out_dtype),
            spectrogram=envelope.astype(out_dtype),
            aperiodicity=ap.astype(out_dtype),
            mfcc=cepstrum.astype(out_dtype),
            voiced=is_voiced[:, None],
        )
        feature.validate()
        return feature
コード例 #36
0
ファイル: evaluation_vc.py プロジェクト: shamanez/gantts
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Run voice conversion on one waveform and synthesise the result.

    The waveform is analysed with WORLD, its mel-cepstrum (with delta
    features) is normalised and fed to ``model``, and the predicted static
    features are turned back into a waveform.

    Args:
        model: Conversion model. It must provide ``eval()`` and
            ``include_parameter_generation()`` and is called either as
            ``model(features, R, lengths=...)`` or
            ``model(features, lengths=...)``.
        x: Source waveform samples; converted to float64 before analysis.
        fs: Sampling rate of ``x``.
        data_mean: Mean used by ``P.scale`` / ``P.inv_scale``; its first
            ``static_dim`` entries are used for denormalisation.
        data_std: Standard deviation used the same way as ``data_mean``.
        diffvc: If true, the source static features are subtracted from the
            prediction and the input waveform is filtered with an MLSA
            filter; otherwise the waveform is synthesised by WORLD from the
            predicted spectrum with the source F0 and aperiodicity.

    Returns:
        Tuple ``(waveform, inputs, outputs)``: the synthesised waveform, a
        copy of the source static features (0-th coefficient excluded,
        before normalisation) and a copy of the denormalised predicted
        static features (before any differential subtraction).
    """
    # NOTE(review): presumably a torch module being switched to inference
    # mode -- the model type is not visible here.
    model.eval()

    # Samples per frame; the 0.001 factor suggests frame_period is in
    # milliseconds, which is also what pyworld expects.
    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)
    # WORLD analysis: F0 (DIO + StoneMask), spectral envelope, aperiodicity.
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    # Keep the 0-th coefficient aside; only the remaining ones are converted.
    c0, mc = mc[:, 0], mc[:, 1:]
    # Width of the static part, recorded before delta features are appended.
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    # Number of frames.
    T = mc.shape[0]

    # Static source features, returned to the caller unchanged.
    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    lengths = [len(mc_scaled)]

    # Add batch axis
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # For MLPG
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)

    # Apply model
    if model.include_parameter_generation():
        # Case: models include parameter generation in itself
        # Multistream features cannot be used in this case
        y_hat, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    # Drop the batch axis: one row of static features per frame.
    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(
        mc_static_pred, data_mean[:static_dim], data_std[:static_dim])

    # Predicted static features, returned to the caller as-is.
    outputs = mc_static_pred.copy()

    if diffvc:
        # Differential features: prediction minus the source static features.
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    # Re-attach the source 0-th coefficient in front of the converted ones.
    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        # Filter the input waveform directly with the differential filter.
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        # Rebuild a spectrum from the mel-cepstrum and synthesise with WORLD,
        # reusing the source F0 and aperiodicity.
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs