Esempio n. 1
0
def wav2pw(x, fs=16000, fft_size=FFT_SIZE):
    """Compute WORLD vocoder features for a waveform.

    The raw pitch track comes from ``pw.dio`` (upper F0 bound read from the
    outer-scope ``args.f0_ceil``) and is refined with ``pw.stonemask``.
    ``pw.cheaptrick`` and ``pw.d4c`` are then run with ``fft_size``.

    Returns:
        dict with keys ``'f0'``, ``'sp'`` and ``'ap'`` (aperiodicity).
    """
    coarse_f0, frame_times = pw.dio(x, fs, f0_ceil=args.f0_ceil)
    refined_f0 = pw.stonemask(x, coarse_f0, frame_times, fs)

    features = {'f0': refined_f0}
    features['sp'] = pw.cheaptrick(
        x, refined_f0, frame_times, fs, fft_size=fft_size)
    features['ap'] = pw.d4c(
        x, refined_f0, frame_times, fs, fft_size=fft_size)
    return features
Esempio n. 2
0
def wav2pw(x, fs=16000, fft_size=FFT_SIZE):
    """Extract the WORLD features ``f0``, ``sp`` and ``ap`` from a waveform.

    Pitch is estimated with ``pw.dio`` (ceiling taken from the outer-scope
    ``args.f0_ceil``) and refined with ``pw.stonemask``; the refined pitch
    drives ``pw.cheaptrick`` and ``pw.d4c`` (aperiodicity), both using
    ``fft_size``.
    """
    raw_f0, t_axis = pw.dio(x, fs, f0_ceil=args.f0_ceil)
    pitch = pw.stonemask(x, raw_f0, t_axis, fs)
    envelope = pw.cheaptrick(x, pitch, t_axis, fs, fft_size=fft_size)
    aperiodicity = pw.d4c(x, pitch, t_axis, fs, fft_size=fft_size)
    return dict(f0=pitch, sp=envelope, ap=aperiodicity)
Esempio n. 3
0
def reconstruct_waveform(signal, spectrogram, aperiodicity, fs=16000, frame_period=5):
    """
    Reconstruct a waveform from a spectrogram and an aperiodicity matrix.

    F0 is estimated from ``signal`` (``pyworld.dio`` refined by
    ``pyworld.stonemask``) and handed to ``pyworld.synthesize`` together with
    the given ``spectrogram`` and ``aperiodicity``.

    Args:
        signal: Waveform samples used only for F0 estimation.
        spectrogram: Spectrogram passed through to the synthesizer.
        aperiodicity: Aperiodicity passed through to the synthesizer.
        fs: Sampling rate in Hz.
        frame_period: Frame period handed to ``dio`` and ``synthesize``.

    Returns:
        Whatever ``pyworld.synthesize`` returns for these inputs.
    """
    # The previous guard compared ``type(signal)`` with a *string*, which is
    # never equal, so the conversion always ran anyway. Convert explicitly
    # (no copy is made when the data is already contiguous float64).
    signal = np.ascontiguousarray(signal, dtype=np.float64)

    f0, timeaxis = pyworld.dio(signal, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(signal, f0, timeaxis, fs)

    return pyworld.synthesize(f0, spectrogram, aperiodicity, fs, frame_period)
Esempio n. 4
0
 def forward(self, x):
     """Estimate the F0 contour of ``x`` with the configured method.

     Args:
         x: Waveform samples as a NumPy array.

     Returns:
         The F0 track as a float64 array.

     Raises:
         ValueError: If ``self.method`` is neither ``"dio"`` nor ``"crepe"``
             (previously this surfaced as an UnboundLocalError).
     """
     # Hop expressed in milliseconds (truncated to an integer); both
     # back-ends receive it as their frame/step size.
     hop = int(1000 * self.block_size / self.sr)
     if self.method == "dio":
         f0 = dio(x.astype(np.float64),
                  self.sr,
                  frame_period=hop,
                  f0_floor=50,
                  f0_ceil=2000)[0]
     elif self.method == "crepe":
         f0 = crepe.predict(x, self.sr, step_size=hop, verbose=False)[1]
     else:
         raise ValueError(
             "Unknown F0 extraction method: {!r}".format(self.method))
     # ``np.float`` was removed in NumPy 1.24; it was an alias of the
     # builtin float, i.e. float64.
     return f0.astype(np.float64)
Esempio n. 5
0
def wav2pw(x, fs, fft_size=FFT_SIZE):
    """Extract WORLD features from a waveform with an explicit F0 range.

    ``pw.dio`` searches F0 between 71 Hz and 700 Hz using the project
    constant ``FRMAE_PERIOD`` (spelled as it is defined elsewhere) as frame
    period; ``pw.stonemask`` refines the result. The original author left a
    commented-out ``pw.harvest`` call with the same arguments as an
    alternative raw extractor.

    Returns:
        dict with keys ``'f0'``, ``'sp'`` and ``'ap'`` (aperiodicity).
    """
    raw_f0, times = pw.dio(
        x, fs, f0_ceil=700, f0_floor=71.0, frame_period=FRMAE_PERIOD)
    pitch = pw.stonemask(x, raw_f0, times, fs)
    return {
        'f0': pitch,
        'sp': pw.cheaptrick(x, pitch, times, fs, fft_size=fft_size),
        'ap': pw.d4c(x, pitch, times, fs, fft_size=fft_size),
    }
Esempio n. 6
0
def TestWORLD(waveform, sr, *args):
    """
    Round-trip a waveform through WORLD (pyworld) analysis and synthesis.

    Extra positional arguments are accepted and ignored.
    """
    # Analysis: raw pitch, refined pitch, smoothed spectrogram, aperiodicity.
    raw_pitch, frame_times = pw.dio(waveform, sr)
    pitch = pw.stonemask(waveform, raw_pitch, frame_times, sr)
    envelope = pw.cheaptrick(waveform, pitch, frame_times, sr)
    aperiodicity = pw.d4c(waveform, pitch, frame_times, sr)

    # Synthesis from the unmodified features.
    return pw.synthesize(pitch, envelope, aperiodicity, sr)
Esempio n. 7
0
 def audio_synthesize(self, y):
     """Re-synthesize ``y`` with its F0 scaled by ``self.f0_parameter``.

     The signal is analysed with WORLD (raw pitch, refined pitch, smoothed
     spectrogram, aperiodicity) at the module-level rate ``FS`` and
     synthesized again from the scaled pitch.

     Args:
         y: Input samples (anything ``np.asarray`` accepts).

     Returns:
         int16 array holding at most ``len(y)`` synthesized samples.
     """
     # ``np.float`` was removed in NumPy 1.24; float64 is what it aliased.
     y = np.asarray(y, dtype=np.float64)
     n_samples = y.shape[0]

     _f0, t = pw.dio(y, FS)  # raw pitch extractor
     f0 = pw.stonemask(y, _f0, t, FS)  # pitch refinement
     sp = pw.cheaptrick(y, f0, t, FS)  # extract smoothed spectrogram
     ap = pw.d4c(y, f0, t, FS)  # extract aperiodicity
     data = pw.synthesize(f0 * self.f0_parameter, sp, ap, FS)

     # Clip before the integer cast: out-of-range floats would otherwise
     # wrap around instead of saturating.
     int16 = np.iinfo(np.int16)
     return np.clip(data[:n_samples], int16.min, int16.max).astype(np.int16)
Esempio n. 8
0
 def PitchAnalyze(self):
     """Extract the F0 contour of ``self.wav_file`` with WORLD.

     The result is stored on ``self.f0`` and also returned.
     """
     # fs: sampling frequency (44,100 Hz is common in the music industry)
     # data: the audio samples as an array
     fs, data = wavfile.read(self.wav_file)

     # WORLD can only handle floating-point input. ``np.float`` was removed
     # in NumPy 1.24; float64 is the type it aliased.
     data = data.astype(np.float64)
     _f0, _time = pw.dio(data, fs)    # fundamental frequency extraction
     f0 = pw.stonemask(data, _f0, _time, fs)  # fundamental frequency refinement
     self.f0 = f0
     return f0
Esempio n. 9
0
def wav2mcep(WAV_FILE, dim):
    """Read a WAV file and return its mel-cepstrum as a ``torch.Tensor``.

    Args:
        WAV_FILE: Path of the WAV file to analyse.
        dim: Order passed to ``pysptk.sp2mc``.

    Returns:
        ``torch.Tensor`` built from the ``pysptk.sp2mc`` output (computed
        with the constant 0.42 as third argument).
    """
    fs, data = wavfile.read(WAV_FILE)

    # WORLD can only handle floating-point input. ``np.float`` was removed
    # in NumPy 1.24; float64 is the type it aliased.
    data = data.astype(np.float64)

    # Fundamental frequency extraction; per the original note, pw.dio
    # measures F0 every 0.005 s and returns it as a NumPy array.
    _f0, _time = pw.dio(data, fs)
    f0 = pw.stonemask(data, _f0, _time, fs)  # fundamental frequency refinement
    sp = pw.cheaptrick(data, f0, _time, fs)  # spectral envelope extraction
    # The aperiodicity (pw.d4c) was computed here before but never used, so
    # that call has been dropped.
    mcep = pysptk.sp2mc(sp, dim, 0.42)
    return torch.Tensor(mcep)
Esempio n. 10
0
def process_utterance(in_dir, out_dir, basename):
    """Extract and save alignment, F0, energy and mel features for one utterance.

    Reads ``<in_dir>/wavs/<basename>.wav`` and
    ``<out_dir>/TextGrid/<basename>.TextGrid``, trims the audio to the aligned
    span and writes four ``.npy`` files below ``out_dir``.

    Returns:
        ``None`` when the aligned span is empty (``start >= end``) or the mel
        spectrogram has ``hp.max_seq_len`` frames or more; otherwise a tuple
        ``("basename|text", max f0, min non-zero f0, max energy, min energy,
        number of mel frames)``.
    """
    wav_path = os.path.join(in_dir, 'wavs', '{}.wav'.format(basename))
    tg_path = os.path.join(out_dir, 'TextGrid', '{}.TextGrid'.format(basename))

    # Alignment from the 'phones' tier.
    phones_tier = tgt.io.read_textgrid(tg_path).get_tier_by_name('phones')
    phone, duration, start, end = get_alignment(phones_tier)

    # '{A}{B}{$}{C}' -> '{A}{B} {C}' -> '{A B} {C}'; '$' marks silent phones.
    text = '{' + '}{'.join(phone) + '}'
    text = text.replace('{$}', ' ').replace('}{', ' ')

    if start >= end:
        return None

    # Load the audio and keep only the aligned region.
    _, samples = read(wav_path)
    first = int(hp.sampling_rate*start)
    last = int(hp.sampling_rate*end)
    wav = samples[first:last].astype(np.float32)

    n_frames = sum(duration)

    # Fundamental frequency, one value per hop.
    f0, _ = pw.dio(wav.astype(np.float64), hp.sampling_rate,
                   frame_period=hp.hop_length/hp.sampling_rate*1000)
    f0 = f0[:n_frames]

    # Mel-scale spectrogram and energy, cut to the aligned frame count.
    mel_spectrogram, energy = Audio.tools.get_mel_from_wav(torch.FloatTensor(wav))
    mel_spectrogram = mel_spectrogram.numpy().astype(np.float32)[:, :n_frames]
    energy = energy.numpy().astype(np.float32)[:n_frames]
    n_mel_frames = mel_spectrogram.shape[1]
    if n_mel_frames >= hp.max_seq_len:
        return None

    # (sub-directory, file-name template, array) for every saved feature:
    # alignment, fundamental frequency, energy, spectrogram.
    outputs = (
        ('alignment', '{}-ali-{}.npy', duration),
        ('f0', '{}-f0-{}.npy', f0),
        ('energy', '{}-energy-{}.npy', energy),
        ('mel', '{}-mel-{}.npy', mel_spectrogram.T),
    )
    for subdir, template, array in outputs:
        target = os.path.join(out_dir, subdir,
                              template.format(hp.dataset, basename))
        np.save(target, array, allow_pickle=False)

    voiced_min = min(f for f in f0 if f != 0)
    return ('|'.join([basename, text]), max(f0), voiced_min,
            max(energy), min(energy), n_mel_frames)
Esempio n. 11
0
def get_aperiodicity(signal, fs=16000, frame_period=5):
    """
    Extract the aperiodicity of a signal.

    F0 is estimated with ``pyworld.dio``, refined with ``pyworld.stonemask``
    and then used by ``pyworld.d4c``.

    Args:
        signal: Waveform samples.
        fs: Sampling rate in Hz.
        frame_period: Frame period handed to ``pyworld.dio``.

    Returns:
        The output of ``pyworld.d4c`` for the signal.
    """
    # The previous guard compared ``type(signal)`` with a *string*, which is
    # never equal, so the conversion always ran anyway. Convert explicitly
    # (no copy is made when the data is already contiguous float64).
    signal = np.ascontiguousarray(signal, dtype=np.float64)

    f0, timeaxis = pyworld.dio(signal, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(signal, f0, timeaxis, fs)
    return pyworld.d4c(signal, f0, timeaxis, fs)
Esempio n. 12
0
def get_spectrogram(signal, fs=16000, frame_period=5):
    """
    Extract the spectrogram of a signal.

    F0 is estimated with ``pyworld.dio``, refined with ``pyworld.stonemask``
    and then used by ``pyworld.cheaptrick``.

    Args:
        signal: Waveform samples.
        fs: Sampling rate in Hz.
        frame_period: Frame period handed to ``pyworld.dio``.

    Returns:
        The output of ``pyworld.cheaptrick`` for the signal.
    """
    # The previous guard compared ``type(signal)`` with a *string*, which is
    # never equal, so the conversion always ran anyway. Convert explicitly
    # (no copy is made when the data is already contiguous float64).
    signal = np.ascontiguousarray(signal, dtype=np.float64)

    f0, timeaxis = pyworld.dio(signal, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(signal, f0, timeaxis, fs)
    return pyworld.cheaptrick(signal, f0, timeaxis, fs)
Esempio n. 13
0
def collect_features(x, fs, order=25, frame_period=5):
    """Compute mel-cepstral coefficients of a waveform via WORLD analysis.

    Args:
        x: Waveform samples as a NumPy array (converted to float64 here).
        fs: Sampling rate in Hz.
        order: Order passed to ``pysptk.sp2mc`` (default 25, the value that
            used to be hard-coded).
        frame_period: Frame period passed to ``pyworld.dio`` (default 5, the
            value that used to be hard-coded).

    Returns:
        The ``pysptk.sp2mc`` output for the ``pyworld.cheaptrick``
        spectrogram.
    """
    # The FFT length and hop length were computed here before but never
    # used; both have been removed.
    alpha = pysptk.util.mcepalpha(fs)

    x = x.astype(np.float64)
    _f0, _timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, _f0, _timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, _timeaxis, fs)
    return pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
Esempio n. 14
0
def pitch(y):
    """Extract pitch/F0 from a raw waveform using PyWORLD's ``dio``.

    The F0 search range is left at the defaults noted by the original
    author: ``f0_floor`` (lower F0 limit) 71.0 Hz and ``f0_ceil`` (upper F0
    limit) 800.0 Hz.

    Returns:
        One F0 value per frame, e.g. shape ``(654,)`` in the original
        author's example.
    """
    samples = y.astype(np.float64)
    # Per the original note: for hop size 256 the frame period is 11.6 ms.
    period_ms = hp.hop_length/hp.sample_rate*1000
    contour, _ = pw.dio(samples, hp.sample_rate, frame_period=period_ms)
    return contour
Esempio n. 15
0
def get_f0(wav: np.ndarray, hop_length: int, sr: int = 22050) -> np.ndarray:
    """
    Parse the f0 feature from a given wave using the WORLD vocoder
    :param wav: an array of wave samples
    :param hop_length: hop (stride) length in samples
    :param sr: sample rate of the wave
    :return: f0 feature as float32
    """
    # ``size`` is passed by keyword: it is keyword-only in recent librosa
    # releases and the keyword form is also accepted by older ones.
    # NOTE(review): the target size equals the current length, so this looks
    # like it adds no padding - confirm whether padding was intended.
    x = librosa.util.pad_center(
        wav, size=len(wav), mode='reflect').astype('double')
    _f0, t = pyworld.dio(x, sr, frame_period=hop_length / sr *
                         1e+3)  # raw pitch extractor
    f0 = pyworld.stonemask(x, _f0, t, sr)  # pitch refinement
    return f0.astype(np.float32)
Esempio n. 16
0
    def main(args):
        """Run the pyworld analysis/synthesis demo and write results to ``test/``.

        The ``test`` directory is recreated from scratch. The input file name
        comes from ``edited_files["Thinking_Out_Loud"]``; ``args`` supplies
        ``frame_period`` and ``speed`` for the step-by-step DIO analysis.
        """
        if os.path.isdir('test'):
            rmtree('test')
        os.mkdir('test')

        x, fs = sf.read("./{}.wav".format(edited_files["Thinking_Out_Loud"]))

        # 1. Convenience API with default options (results are superseded
        #    by the step-by-step analysis below).
        f0, sp, ap = pw.wav2world(x, fs)
        y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

        # 2-1. Step by step, without F0 refinement.
        raw_f0, times = pw.dio(
            x, fs,
            f0_floor=50.0,
            f0_ceil=600.0,
            channels_in_octave=2,
            frame_period=args.frame_period,
            speed=args.speed,
        )
        raw_sp = pw.cheaptrick(x, raw_f0, times, fs)
        raw_ap = pw.d4c(x, raw_f0, times, fs)
        raw_y = pw.synthesize(raw_f0, raw_sp, raw_ap, fs, args.frame_period)
        sf.write('test/y_without_f0_refinement.wav', raw_y, fs)

        # 2-2. DIO with F0 refinement (StoneMask).
        f0 = pw.stonemask(x, raw_f0, times, fs)
        sp = pw.cheaptrick(x, f0, times, fs)
        ap = pw.d4c(x, f0, times, fs)
        y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
        sf.write('test/y_with_f0_refinement.wav', y, fs)

        # 2-3. Harvest with F0 refinement (StoneMask).
        harvest_raw_f0, harvest_times = pw.harvest(x, fs)
        harvest_f0 = pw.stonemask(x, harvest_raw_f0, harvest_times, fs)
        harvest_sp = pw.cheaptrick(x, harvest_f0, harvest_times, fs)
        harvest_ap = pw.d4c(x, harvest_f0, harvest_times, fs)
        harvest_y = pw.synthesize(
            harvest_f0, harvest_sp, harvest_ap, fs, pw.default_frame_period)
        sf.write('test/y_harvest_with_f0_refinement.wav', harvest_y, fs)

        # Comparison plots: unrefined versus refined DIO results.
        savefig('test/wavform.png', [x, raw_y, y])
        savefig('test/sp.png', [raw_sp, sp])
        savefig('test/ap.png', [raw_ap, ap], log=False)
        savefig('test/f0.png', [raw_f0, f0])

        print('Please check "test" directory for output files')
Esempio n. 17
0
def test_diff_vibrato():
    """The F0 residual after smoothing has one value per F0 frame."""
    sr, x = wavfile.read(__test_wav_file)

    frame_period = 5
    samples_per_frame = int(frame_period * 0.001 * sr)
    f0_rate = int(sr / samples_per_frame)

    # Convert once; both pyworld calls take the float64 signal.
    x64 = x.astype(np.float64)
    raw_f0, timeaxis = pyworld.dio(x64, sr, frame_period=frame_period)
    f0 = pyworld.stonemask(x64, raw_f0, timeaxis, sr)

    vib = f0 - extract_smoothed_f0(f0, f0_rate, cutoff=8)

    assert vib.shape == (len(f0),)
Esempio n. 18
0
    def collect_features(self, wav_path, label_path):
        """Build the acoustic feature matrix for one utterance.

        Streams are stacked column-wise as ``(mgc, lf0, vuv, bap)``; mgc, lf0
        and bap go through ``P.delta_features`` first. Frames beyond the
        label's frame count and frames the HTS label marks as silence are
        removed.

        Args:
            wav_path: Path of the WAV file.
            label_path: Path of the HTS label file.

        Returns:
            float32 feature matrix.
        """
        fs, x = wavfile.read(wav_path)
        x = x.astype(np.float64)
        use_harvest = hp_acoustic.use_harvest

        # F0: Harvest as is, or DIO followed by StoneMask refinement.
        f0_options = dict(
            frame_period=hp_acoustic.frame_period,
            f0_floor=hp_acoustic.f0_floor,
            f0_ceil=hp_acoustic.f0_ceil,
        )
        if use_harvest:
            f0, timeaxis = pyworld.harvest(x, fs, **f0_options)
        else:
            raw_f0, timeaxis = pyworld.dio(x, fs, **f0_options)
            f0 = pyworld.stonemask(x, raw_f0, timeaxis, fs)
        spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
        aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

        bap = pyworld.code_aperiodicity(aperiodicity, fs)
        if self.alpha is None:
            self.alpha = pysptk.util.mcepalpha(fs)
        mgc = pysptk.sp2mc(spectrogram, order=hp_acoustic.order, alpha=self.alpha)

        # Log-F0 on non-zero frames only; zero frames stay zero until the
        # interpolation below.
        f0 = f0[:, None]
        voiced = np.nonzero(f0)
        lf0 = f0.copy()
        lf0[voiced] = np.log(f0[voiced])
        if use_harvest:
            # https://github.com/mmorise/World/issues/35#issuecomment-306521887
            vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
        else:
            vuv = (lf0 != 0).astype(np.float32)
        lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)

        # Parameter trajectory smoothing
        if hp_acoustic.mod_spec_smoothing:
            hop_length = int(fs * (hp_acoustic.frame_period * 0.001))
            mgc = P.modspec_smoothing(
                mgc, fs / hop_length,
                cutoff=hp_acoustic.mod_spec_smoothing_cutoff)

        mgc, lf0, bap = (
            P.delta_features(stream, hp_acoustic.windows)
            for stream in (mgc, lf0, bap)
        )

        features = np.hstack((mgc, lf0, vuv, bap))

        # Cut silence frames by HTS alignment
        labels = hts.load(label_path)
        features = features[:labels.num_frames()]
        features = np.delete(features, labels.silence_frame_indices(), axis=0)

        return features.astype(np.float32)
Esempio n. 19
0
def extract_pitch(waveform: torch.Tensor,
                  sample_rate: int,
                  output_path: Optional[Path] = None,
                  hop_length: int = 256,
                  log_scale: bool = True,
                  phoneme_durations: Optional[List[int]] = None):
    """Extract a pitch (F0) track from ``waveform`` with PyWORLD.

    Args:
        waveform: Audio tensor; ``squeeze(0)`` is applied before analysis.
        sample_rate: Sampling rate in Hz.
        output_path: If given, the result is saved there with ``np.save``
            instead of being returned. Nothing is computed when the file
            already exists.
        hop_length: Hop size in samples; converted to a frame period in ms.
        log_scale: If true, return ``log(pitch + 1)``.
        phoneme_durations: If given, the track is trimmed/padded to
            ``sum(phoneme_durations)`` frames, zero frames are filled by
            interpolation and the track is averaged per phoneme.

    Returns:
        The pitch array when ``output_path`` is ``None``; otherwise ``None``.

    Raises:
        ImportError: If PyWORLD (or SciPy, when ``phoneme_durations`` is
            given) is not installed.
    """
    if output_path is not None and output_path.is_file():
        return

    try:
        import pyworld
    except ImportError as err:
        raise ImportError(
            "Please install PyWORLD: pip install pyworld") from err

    _waveform = waveform.squeeze(0).double().numpy()
    pitch, t = pyworld.dio(_waveform,
                           sample_rate,
                           frame_period=hop_length / sample_rate * 1000)
    pitch = pyworld.stonemask(_waveform, pitch, t, sample_rate)

    if phoneme_durations is not None:
        pitch = trim_or_pad_to_target_length(pitch, sum(phoneme_durations))
        try:
            from scipy.interpolate import interp1d
        except ImportError as err:
            raise ImportError(
                "Please install SciPy: pip install scipy") from err
        nonzero_ids = np.where(pitch != 0)[0]
        if len(nonzero_ids) >= 2:
            # Fill zero frames linearly; outside the non-zero range the
            # first/last non-zero value is used.
            interp_fn = interp1d(
                nonzero_ids,
                pitch[nonzero_ids],
                fill_value=(pitch[nonzero_ids[0]], pitch[nonzero_ids[-1]]),
                bounds_error=False,
            )
            pitch = interp_fn(np.arange(0, len(pitch)))
        elif len(nonzero_ids) == 1:
            # interp1d needs at least two points; a single non-zero frame
            # is extended over the whole track instead.
            pitch = np.full_like(pitch, pitch[nonzero_ids[0]])
        # With no non-zero frame at all there is nothing to interpolate:
        # the all-zero track is kept (this used to raise IndexError).
        d_cumsum = np.cumsum(np.concatenate([np.array([0]),
                                             phoneme_durations]))
        pitch = np.array([
            np.mean(pitch[d_cumsum[i - 1]:d_cumsum[i]])
            for i in range(1, len(d_cumsum))
        ])
        assert len(pitch) == len(phoneme_durations)

    if log_scale:
        pitch = np.log(pitch + 1)

    if output_path is not None:
        np.save(output_path.as_posix(), pitch)
    else:
        return pitch
def pitch(wav, hparams, pitch_func="harvest"):
    """Extract an F0 track with PyWORLD's Harvest or DIO.

    Args:
        wav: Waveform samples; any array-like, converted to float64.
        hparams: Object providing ``hop_size`` and ``sample_rate``.
        pitch_func: ``"harvest"`` (default) or ``"dio"``.

    Returns:
        The F0 track after ``np.nan_to_num``.

    Raises:
        SystemExit: With code -1 (after printing a message) when
            ``pitch_func`` is not one of the supported names.
    """
    # Hop size expressed in milliseconds.
    frame_period = (hparams.hop_size / (0.001 * hparams.sample_rate))

    # Previously only float32 input was converted, so every other dtype (or
    # a plain list) reached pyworld unconverted.
    wav = np.ascontiguousarray(wav, dtype=np.double)

    if pitch_func == "harvest":
        f0, _ = pyworld.harvest(wav, hparams.sample_rate, frame_period=frame_period)
    elif pitch_func == "dio":
        f0, _ = pyworld.dio(wav, hparams.sample_rate, frame_period=frame_period)
    else:
        print("Invalid pitch function.")
        # Same exception and exit code as the former ``exit(-1)`` call,
        # without relying on the interactive ``exit`` helper.
        raise SystemExit(-1)

    return np.nan_to_num(f0)
Esempio n. 21
0
 def make_mel_f0(self, wav):
     """Compute the log-mel spectrogram and F0 track of a waveform.

     Args:
         wav: Waveform samples as a NumPy array.

     Returns:
         Tuple ``(wav, mel, f0)`` of float32 torch tensors: the waveform,
         the transposed log-mel spectrogram and the F0 track with a
         trailing singleton dimension.
     """
     # make mel-spectrogram
     # ``y``/``sr`` are passed by keyword: they are keyword-only in recent
     # librosa releases and the keyword form also works on older ones.
     mel = librosa.feature.melspectrogram(y=wav, sr=self.sampling_rate,
                                          **self.mel_config)
     mel = np.log(np.abs(mel).clip(1e-5, 10)).astype(np.float32)
     # make fundamental frequency
     # ``np.float`` was removed in NumPy 1.24; float64 is what it aliased.
     wav = wav.astype(np.float64)
     _f0, t = pw.dio(wav,
                     self.sampling_rate,
                     frame_period=self.f0_frame_period)
     f0 = pw.stonemask(wav, _f0, t, self.sampling_rate)
     wav = torch.from_numpy(wav.astype(np.float32))
     mel = torch.from_numpy(mel).T
     f0 = torch.from_numpy(f0.astype(np.float32)).unsqueeze(-1)
     return wav, mel, f0
def main(args):
    """Demonstrate pyworld analysis/synthesis on ``utterance/vaiueo2d.wav``.

    Recreates the ``test`` directory, then writes three resynthesized WAV
    files (DIO without refinement, DIO with StoneMask, Harvest with
    StoneMask) and four comparison figures into it. ``args`` supplies
    ``frame_period`` and ``speed`` for the DIO analysis.
    """
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    x, fs = sf.read('utterance/vaiueo2d.wav')

    def resynthesize(pitch, times, frame_period, wav_path):
        """Estimate sp/ap for ``pitch``, synthesize, write the WAV; return (sp, ap, y)."""
        envelope = pw.cheaptrick(x, pitch, times, fs)
        aperiodicity = pw.d4c(x, pitch, times, fs)
        audio = pw.synthesize(pitch, envelope, aperiodicity, fs, frame_period)
        sf.write(wav_path, audio, fs)
        return envelope, aperiodicity, audio

    # 1. The convenient way, with default options (superseded below).
    f0, sp, ap = pw.wav2world(x, fs)
    y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

    # 2. Step by step
    # 2-1 Without F0 refinement
    _f0, t = pw.dio(x, fs,
                    f0_floor=50.0,
                    f0_ceil=600.0,
                    channels_in_octave=2,
                    frame_period=args.frame_period,
                    speed=args.speed)
    _sp, _ap, _y = resynthesize(
        _f0, t, args.frame_period, 'test/y_without_f0_refinement.wav')

    # 2-2 DIO with F0 refinement (using Stonemask)
    f0 = pw.stonemask(x, _f0, t, fs)
    sp, ap, y = resynthesize(
        f0, t, args.frame_period, 'test/y_with_f0_refinement.wav')

    # 2-3 Harvest with F0 refinement (using Stonemask)
    _f0_h, t_h = pw.harvest(x, fs)
    f0_h = pw.stonemask(x, _f0_h, t_h, fs)
    resynthesize(f0_h, t_h, pw.default_frame_period,
                 'test/y_harvest_with_f0_refinement.wav')

    # Comparison
    figures = (
        ('test/wavform.png', [x, _y, y], {}),
        ('test/sp.png', [_sp, sp], {}),
        ('test/ap.png', [_ap, ap], {'log': False}),
        ('test/f0.png', [_f0, f0], {}),
    )
    for figure_path, curves, options in figures:
        savefig(figure_path, curves, **options)

    print('Please check "test" directory for output files')
Esempio n. 23
0
def get_acoustic_feature(lab_path, wav_path, sampling_rate, hop_size_in_ms,
                         mcep_order, windows):
    """Extract the acoustic feature matrix (mgc, lf0, vuv, bap) of an utterance.

    Args:
        lab_path: Path of the HTS label file.
        wav_path: Path of the WAV file.
        sampling_rate: Target sampling rate; the audio is resampled when the
            file's rate differs.
        hop_size_in_ms: Frame period passed to ``pyworld.dio``.
        mcep_order: Order passed to ``pysptk.sp2mc``.
        windows: Delta windows passed to ``apply_delta_windows``.

    Returns:
        float32 feature matrix with the label's silence frames removed, or
        ``None`` when the label has more frames than were extracted.
    """
    fs, audio = wavfile.read(wav_path)
    audio = audio.astype(np.float64) / 2**15
    if fs != sampling_rate:
        audio = audio.astype(np.float32)
        # Keyword arguments: the rates are keyword-only in recent librosa
        # releases and the keyword form also works on older ones.
        audio = librosa.resample(audio, orig_sr=fs, target_sr=sampling_rate)
        # NOTE(review): this path scales the signal back by 2**15 while the
        # non-resampled path stays divided by 2**15, so the two paths feed
        # differently scaled audio to the analysis - confirm which scale is
        # intended. Behavior is left unchanged here.
        audio = (audio * 2**15).astype(np.float64)
    # extract f0
    f0, timeaxis = pyworld.dio(audio,
                               sampling_rate,
                               frame_period=hop_size_in_ms)
    # modify f0
    f0 = pyworld.stonemask(audio, f0, timeaxis, sampling_rate)
    # voiced/unvoiced flag
    vuv = (f0 > 0)[:, None].astype(np.float32)
    # calculate log f0
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    # interpolate f0 in log-domain
    lf0 = interp1d(lf0, kind='slinear')[:, None]

    # calculate mel-cepstrum
    spectrogram = pyworld.cheaptrick(audio, f0, timeaxis, sampling_rate)
    mgc = pysptk.sp2mc(spectrogram,
                       order=mcep_order,
                       alpha=pysptk.util.mcepalpha(sampling_rate))
    # calculate aperiodicity parameter
    aperiodicity = pyworld.d4c(audio, f0, timeaxis, sampling_rate)
    bap = pyworld.code_aperiodicity(aperiodicity, sampling_rate)

    # calculate dynamic features
    mgc = apply_delta_windows(mgc, windows)
    lf0 = apply_delta_windows(lf0, windows)
    bap = apply_delta_windows(bap, windows)

    feature = np.hstack((mgc, lf0, vuv, bap))

    # cut silence frames by HTS alignment
    labels = hts.load(lab_path)
    num_frames = labels.num_frames()
    # Check the length before truncating (same outcome as the former
    # check-after-truncate, but the intent is visible).
    if num_frames > len(feature):
        return None
    feature = feature[:num_frames]
    indices = labels.silence_frame_indices()
    feature = np.delete(feature, indices, axis=0)

    return feature.astype(np.float32)
Esempio n. 24
0
 def _calculate_f0(self, input: torch.Tensor) -> torch.Tensor:
     """Compute the F0 track of ``input`` with DIO + StoneMask.

     Depending on ``self.use_continuous_f0`` and ``self.use_log_f0`` the
     track is passed through ``self._convert_to_continuous_f0`` and/or its
     non-zero values are replaced by their logarithm.

     Returns:
         Flattened float tensor created with ``input.new_tensor``.
     """
     samples = input.cpu().numpy().astype(np.double)
     dio_options = dict(
         f0_floor=self.f0min,
         f0_ceil=self.f0max,
         frame_period=self.frame_period,
     )
     raw_f0, timeaxis = pyworld.dio(samples, self.fs, **dio_options)
     f0 = pyworld.stonemask(samples, raw_f0, timeaxis, self.fs)

     if self.use_continuous_f0:
         f0 = self._convert_to_continuous_f0(f0)
     if self.use_log_f0:
         # Zero entries are left at zero.
         voiced = np.nonzero(f0)[0]
         f0[voiced] = np.log(f0[voiced])

     return input.new_tensor(f0.reshape(-1), dtype=torch.float)
Esempio n. 25
0
def test_trim_remove_zeros_frames():
    """Trimming/removing zero frames keeps the second dimension unchanged."""
    fs, x = wavfile.read(example_audio_file())
    x = x.astype(np.float64)

    f0, timeaxis = pyworld.dio(x, fs, frame_period=5)
    matrices = [
        pyworld.cheaptrick(x, f0, timeaxis, fs),
        pyworld.d4c(x, f0, timeaxis, fs),
    ]

    for strip in (trim_zeros_frames, remove_zeros_frames):
        for mat in matrices:
            assert strip(mat).shape[1] == mat.shape[1]
Esempio n. 26
0
def collect_features(emotion):
    """Collect padded mel-cepstra for every file of one emotion.

    Files are read from ``_<emotion>/NN.wav`` with ``NN`` running from 01 to
    ``num_files``. Each mel-cepstrum is turned into a list of frames and
    padded with the module-level ``vuoto`` up to 1000 frames.

    Returns:
        ``np.array`` built from the per-file frame lists.
    """
    collected = []
    for file_no in range(1, num_files + 1):
        # Single-digit numbers get a leading zero.
        path = '_' + str(emotion) + '/' + str(file_no).zfill(2) + '.wav'
        x, fs_ = sf.read(path)
        x = x.astype(np.float64)

        raw_f0, time_axis = pyworld.dio(x, fs_, frame_period=frame_period)
        f0 = pyworld.stonemask(x, raw_f0, time_axis, fs_)
        spectrogram = trim_zeros_frames(
            pyworld.cheaptrick(x, f0, time_axis, fs_))

        frames = pysptk.sp2mc(spectrogram, order=order, alpha=alpha).tolist()
        # A non-positive repeat count yields no padding.
        frames.extend([vuoto] * (1000 - len(frames)))
        collected.append(frames)
    return np.array(collected)
Esempio n. 27
0
def main(args):
    """Run the pyworld demo on ``args.input`` and store results in ``test/``.

    The ``test`` directory is recreated. Three resynthesized WAV files and
    four comparison images are written. ``args`` supplies ``input``,
    ``frame_period`` and ``speed``.
    """
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    # Read speech sample
    x, fs = sf.read(args.input)

    # 1. A convenient way: default options (superseded by step 2-2 below).
    f0, sp, ap = pw.wav2world(x, fs)
    y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

    # 2. Step by step
    # 2-1 Without F0 refinement
    dio_options = dict(
        f0_floor=50.0,
        f0_ceil=600.0,
        channels_in_octave=2,
        frame_period=args.frame_period,
        speed=args.speed,
    )
    coarse_f0, t = pw.dio(x, fs, **dio_options)
    coarse_sp = pw.cheaptrick(x, coarse_f0, t, fs)
    coarse_ap = pw.d4c(x, coarse_f0, t, fs)
    coarse_y = pw.synthesize(
        coarse_f0, coarse_sp, coarse_ap, fs, args.frame_period)
    sf.write('test/y_without_f0_refinement.wav', coarse_y, fs)

    # 2-2 DIO with F0 refinement (using Stonemask)
    f0 = pw.stonemask(x, coarse_f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (using Stonemask)
    harvest_coarse_f0, harvest_t = pw.harvest(x, fs)
    harvest_f0 = pw.stonemask(x, harvest_coarse_f0, harvest_t, fs)
    harvest_sp = pw.cheaptrick(x, harvest_f0, harvest_t, fs)
    harvest_ap = pw.d4c(x, harvest_f0, harvest_t, fs)
    harvest_y = pw.synthesize(
        harvest_f0, harvest_sp, harvest_ap, fs, pw.default_frame_period)
    sf.write('test/y_harvest_with_f0_refinement.wav', harvest_y, fs)

    # Comparison
    save_image('test/wavform.png', [x, coarse_y, y])
    save_image('test/sp.png', [coarse_sp, sp])
    save_image('test/ap.png', [coarse_ap, ap], log=False)
    save_image('test/f0.png', [coarse_f0, f0])
Esempio n. 28
0
 def formant(self, val, f0_v):
     '''
         Change formant.
         val : formant rate
         f0_v: f0 rate

         The spectral envelope is resampled along its second axis by
         ``val`` and F0 is multiplied by ``f0_v`` before re-synthesis.
         Returns ``self``.
     '''
     f_rate = self.audio.frame_rate
     # pydub -> numpy.array conversion. pyworld needs float64 samples, so
     # the integer sample array is converted explicitly.
     np_arr = numpy.array(self.audio.get_array_of_samples(),
                          dtype=numpy.float64)
     _f0_val, _time = pyworld.dio(np_arr, f_rate)  # fundamental frequency
     spct = pyworld.cheaptrick(np_arr, _f0_val, _time, f_rate)  # spectral envelope
     aper = pyworld.d4c(np_arr, _f0_val, _time, f_rate)  # aperiodicity
     spct_b = numpy.zeros_like(spct)
     last_bin = spct.shape[1] - 1
     for i in range(spct_b.shape[1]):
         # Clamp the source bin: with val < 1 the scaled index would run
         # past the last column and raise IndexError.
         spct_b[:, i] = spct[:, min(int(i / val), last_bin)]
     # NOTE(review): this replaces ``self.audio`` with the synthesized NumPy
     # array, while the first line above reads ``self.audio.frame_rate`` -
     # confirm that callers do not invoke this method twice in a row.
     self.audio = pyworld.synthesize(_f0_val * f0_v, spct_b, aper, f_rate)
     return self
Esempio n. 29
0
def _extract_static_feats(wav, sr):
    """Extract static WORLD-based features from a waveform.

    Returns:
        Tuple ``(feats, stream_sizes)``: the float32 column-wise stack of
        ``(mgc, lf0, vuv, bap)`` and the list of each stream's width.
    """
    raw_f0, timeaxis = pyworld.dio(wav, sr, frame_period=5)
    spectrogram = pyworld.cheaptrick(wav, raw_f0, timeaxis, sr)
    aperiodicity = pyworld.d4c(wav, raw_f0, timeaxis, sr)

    mgc = pysptk.sp2mc(spectrogram, order=59, alpha=pysptk.util.mcepalpha(sr))
    bap = pyworld.code_aperiodicity(aperiodicity, sr)

    # Log-F0 on non-zero frames; the voiced flag is taken before the
    # interpolation fills the remaining frames.
    f0_column = raw_f0[:, None]
    voiced = np.nonzero(f0_column)
    lf0 = f0_column.copy()
    lf0[voiced] = np.log(f0_column[voiced])
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")

    streams = (mgc, lf0, vuv, bap)
    feats = np.hstack(streams).astype(np.float32)
    stream_sizes = [stream.shape[1] for stream in streams]

    return feats, stream_sizes
Esempio n. 30
0
def shift_wav(in_file, shift=0):
    """Write a pitch-shifted copy of ``in_file`` next to it.

    F0 is multiplied by ``2**(shift / 12)`` (so ``shift`` is presumably in
    semitones) before WORLD re-synthesis. The output is named
    ``<stem>+<shift>.wav`` for non-negative shifts and ``<stem>-<n>.wav``
    for negative ones.
    """
    in_file = Path(in_file)
    x, fs = sf.read(str(in_file))

    raw_f0, times = pw.dio(x, fs)  # raw pitch extractor
    pitch = pw.stonemask(x, raw_f0, times, fs)  # pitch refinement
    envelope = pw.cheaptrick(x, pitch, times, fs)  # smoothed spectrogram
    aperiodicity = pw.d4c(x, pitch, times, fs)  # aperiodicity

    ratio = 2**(shift / 12)
    shifted = pw.synthesize(pitch * ratio, envelope, aperiodicity, fs)

    out_name = (f"{in_file.stem}+{shift}.wav" if shift >= 0
                else f"{in_file.stem}-{-shift}.wav")
    sf.write(str(in_file.parent / out_name), shifted, fs)
Esempio n. 31
0
def callback(in_data, frame_count, time_info, status):
    """Stream callback: re-synthesize one chunk with scaled pitch and envelope.

    The interleaved int16 input is mixed down from its first two channels,
    analysed with WORLD, its spectral envelope is resampled according to the
    global ``sp_scale`` and its F0 multiplied by the global ``f_scale``.
    The synthesized signal is copied to the first two output channels.

    ``frame_count``, ``time_info`` and ``status`` are accepted but unused.

    Returns:
        Tuple ``(out_data, pyaudio.paContinue)`` with ``out_data`` as int16
        bytes.
    """
    global f_scale
    global sp_scale
    # ``np.frombuffer`` replaces the deprecated binary mode of
    # ``np.fromstring``; it returns a read-only view, which is fine because
    # only derived arrays are modified below.
    np_data = np.frombuffer(in_data, dtype=np.int16)
    np_stereo_data = np.reshape(np_data, (chunk, ch))
    np_lr_data = np_stereo_data[:, 0] / 2 + np_stereo_data[:, 1] / 2
    np_mono_data = np_lr_data.astype(np.float64)

    of0, t = pw.dio(np_mono_data, rate, frame_period=frame_period)
    f0 = pw.stonemask(np_mono_data, of0, t, rate)
    sp = pw.cheaptrick(np_mono_data, f0, t, rate)
    ap = pw.d4c(np_mono_data, f0, t, rate)

    # Scales above 1 are inverted so the rate used for indexing is at most 1.
    sp_rate = 1.0 / sp_scale if sp_scale > 1.0 else sp_scale
    # Column f of the new envelope is column int(f * sp_rate) of the old one.
    source_bins = (np.arange(sp.shape[1]) * sp_rate).astype(np.int64)
    sp1 = sp[:, source_bins]

    np_synthesized = pw.synthesize(f0 * f_scale, sp1, ap, rate, frame_period)
    # The synthesized length is not necessarily equal to the input length,
    # hence the [:chunk] slices.

    np_out_data = np.empty((chunk, ch), dtype=np.float64)
    np_out_data[:, 0] = np_synthesized[:chunk]
    np_out_data[:, 1] = np_synthesized[:chunk]

    # Clip before the integer cast so loud samples saturate instead of
    # wrapping; ``tobytes`` replaces the removed ``tostring``.
    int16 = np.iinfo(np.int16)
    out_data = np.clip(
        np_out_data, int16.min, int16.max).astype(np.int16).tobytes()

    return (out_data, pyaudio.paContinue)
Esempio n. 32
0
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    """Compute and save the mel-cepstrum of one utterance.

    The mel-cepstrum is written to ``<out_dir>/<wav id>-mc.npy``, where the
    wav id is the file name up to its first dot. ``index`` is accepted but
    not used.

    Returns:
        Tuple ``(mc file name, number of frames, text, speaker_id, log-F0
        as list)`` describing this training example.
    """
    x, fs = librosa.load(wav_path, sr=config.fs)
    x = x.astype(np.float64)

    raw_f0, timeaxis = pyworld.dio(x, fs, frame_period=config.frame_period)
    f0 = pyworld.stonemask(x, raw_f0, timeaxis, fs)
    spectrogram = trim_zeros_frames(pyworld.cheaptrick(x, f0, timeaxis, fs))
    mc = pysptk.sp2mc(spectrogram, order=config.order, alpha=config.alpha)

    wav_id = wav_path.rsplit("/", 1)[-1].partition('.')[0]
    mc_name = '{}-mc.npy'.format(wav_id)
    np.save(os.path.join(out_dir, mc_name), mc, allow_pickle=False)

    # Log-F0 on non-zero frames; zero frames stay zero.
    voiced = np.nonzero(f0)
    lf0 = f0.copy()
    lf0[voiced] = np.log(f0[voiced])

    return mc_name, mc.shape[0], text, speaker_id, lf0.tolist()
Esempio n. 33
0
 def collect_features(self, wav_path):
     """Compute smoothed mel-cepstral features with deltas for one WAV file.

     ``self.alpha`` is initialised from the file's sampling rate on first
     use. The 0-th coefficient is dropped before smoothing.

     Returns:
         float32 matrix produced by ``P.delta_features``.
     """
     fs, x = wavfile.read(wav_path)
     samples = x.astype(np.float64)

     raw_f0, timeaxis = pyworld.dio(samples, fs, frame_period=hp.frame_period)
     f0 = pyworld.stonemask(samples, raw_f0, timeaxis, fs)
     spectrogram = P.trim_zeros_frames(
         pyworld.cheaptrick(samples, f0, timeaxis, fs))

     if self.alpha is None:
         self.alpha = pysptk.util.mcepalpha(fs)
     # Drop 0-th coefficient
     mgc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=self.alpha)[:, 1:]

     # 50Hz cut-off MS smoothing
     frame_rate = fs / int(fs * (hp.frame_period * 0.001))
     mgc = P.modspec_smoothing(mgc, frame_rate, cutoff=50)

     # Add delta
     return P.delta_features(mgc, hp.windows).astype(np.float32)
Esempio n. 34
0
    def __call__(self, data: Wave, test=None):
        """Convert a ``Wave`` into an ``AcousticFeature``.

        F0 is estimated with ``pyworld.dio`` when the configured method is
        ``'dio'`` and with ``world4py``'s harvest otherwise, then refined
        with StoneMask. ``test`` is accepted but not used.

        Returns:
            The validated ``AcousticFeature`` (f0, spectrogram,
            aperiodicity, mfcc cast to ``self._dtype``, plus a voiced mask).
        """
        samples = data.wave.astype(numpy.float64)
        rate = data.sampling_rate

        f0_options = dict(
            frame_period=self._frame_period,
            f0_floor=self._f0_floor,
            f0_ceil=self._f0_ceil,
        )
        if self._f0_estimating_method == 'dio':
            raw_f0, t = pyworld.dio(samples, rate, **f0_options)
        else:
            from world4py.np import apis
            raw_f0, t = apis.harvest(samples, rate, **f0_options)

        f0 = pyworld.stonemask(samples, raw_f0, t, rate)
        spectrogram = pyworld.cheaptrick(samples, f0, t, rate)
        aperiodicity = pyworld.d4c(samples, f0, t, rate)

        mfcc = pysptk.sp2mc(spectrogram, order=self._order, alpha=self._alpha)
        voiced = f0 != 0  # type: numpy.ndarray

        dtype = self._dtype
        feature = AcousticFeature(
            f0=f0[:, None].astype(dtype),
            spectrogram=spectrogram.astype(dtype),
            aperiodicity=aperiodicity.astype(dtype),
            mfcc=mfcc.astype(dtype),
            voiced=voiced[:, None],
        )
        feature.validate()
        return feature
Esempio n. 35
0
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Run voice conversion on waveform ``x`` with ``model``.

    Args:
        model: Conversion model; put into eval mode here.
        x: Source waveform as a NumPy array.
        fs: Sampling rate in Hz.
        data_mean: Mean used to normalize the input features and (its first
            ``static_dim`` entries) to denormalize the prediction.
        data_std: Standard deviation used the same way as ``data_mean``.
        diffvc: If true, the difference between predicted and source
            mel-cepstrum is applied to ``x`` through an MLSA filter;
            otherwise the waveform is synthesized with ``pyworld``.

    Returns:
        Tuple ``(waveform, inputs, outputs)``: the converted waveform, the
        source static mel-cepstrum (without the 0-th coefficient) and the
        denormalized predicted static mel-cepstrum.
    """
    model.eval()

    # Analysis: F0, spectrogram and aperiodicity of the source signal.
    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    # Split off the 0-th coefficient; it is re-attached after prediction.
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]

    # Static part of the source features, returned to the caller.
    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    lengths = [len(mc_scaled)]

    # Add batch axis
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # For MLPG
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)

    # Apply model
    if model.include_parameter_generation():
        # Case: models include parameter generation in itself
        # Multistream features cannot be used in this case
        y_hat, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(
        mc_static_pred, data_mean[:static_dim], data_std[:static_dim])

    # Predicted static features, returned to the caller.
    outputs = mc_static_pred.copy()

    if diffvc:
        # Differential features: predicted minus source static features.
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        # Filter the source waveform directly with the differential
        # mel-cepstrum.
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        # Vocoder path: convert the mel-cepstrum back to a spectrogram and
        # synthesize with the source F0 and aperiodicity.
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs