def wav2pw(x, fs=16000, fft_size=FFT_SIZE):
    ''' Extract WORLD features from a waveform '''
    _f0, t = pw.dio(x, fs, f0_ceil=args.f0_ceil)         # raw pitch extractor
    f0 = pw.stonemask(x, _f0, t, fs)                     # pitch refinement
    sp = pw.cheaptrick(x, f0, t, fs, fft_size=fft_size)  # extract smoothed spectrogram
    ap = pw.d4c(x, f0, t, fs, fft_size=fft_size)         # extract aperiodicity
    return {
        'f0': f0,
        'sp': sp,
        'ap': ap,
    }
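# Usage sketch (an illustration, not part of the original snippet): round-trip
# the features from wav2pw through WORLD's synthesizer. Assumes `pw` is pyworld,
# `sf` is soundfile, and that FFT_SIZE and args.f0_ceil are defined as in the
# snippet above; 'input.wav' is a placeholder path.
def _wav2pw_roundtrip_demo():
    x, fs = sf.read('input.wav')  # soundfile returns float64 audio by default
    feats = wav2pw(x, fs=fs)
    # pw.dio's default frame period is 5 ms, matching pw.synthesize's default
    y = pw.synthesize(feats['f0'], feats['sp'], feats['ap'], fs)
    sf.write('resynthesized.wav', y, fs)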
def reconstruct_waveform(signal, spectrogram, aperiodicity, fs=16000, frame_period=5):
    """ Reconstructs the waveform from the spectrogram and aperiodicity """
    if signal.dtype != np.float64:  # WORLD only accepts float64 input
        signal = np.float64(signal)
    f0, timeaxis = pyworld.dio(signal, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(signal, f0, timeaxis, fs)
    return pyworld.synthesize(f0, spectrogram, aperiodicity, fs, frame_period)
def forward(self, x):
    # Compute the hop length in milliseconds
    hop = int(1000 * self.block_size / self.sr)
    if self.method == "dio":
        f0 = dio(x.astype(np.float64), self.sr, frame_period=hop,
                 f0_floor=50, f0_ceil=2000)[0]
    elif self.method == "crepe":
        f0 = crepe.predict(x, self.sr, step_size=hop, verbose=False)[1]
    return f0.astype(float)  # np.float is removed in NumPy >= 1.24; float is float64
def wav2pw(x, fs, fft_size=FFT_SIZE):
    ''' Extract WORLD features from a waveform '''
    _f0, t = pw.dio(x, fs, f0_ceil=700, f0_floor=71.0,
                    frame_period=FRAME_PERIOD)  # raw pitch extractor
    # _f0, t = pw.harvest(x, fs, f0_ceil=700, f0_floor=71.0, frame_period=FRAME_PERIOD)
    f0 = pw.stonemask(x, _f0, t, fs)                     # pitch refinement
    sp = pw.cheaptrick(x, f0, t, fs, fft_size=fft_size)
    ap = pw.d4c(x, f0, t, fs, fft_size=fft_size)         # extract aperiodicity
    return {
        'f0': f0,
        'sp': sp,
        'ap': ap,
    }
def TestWORLD(waveform, sr, *args):
    """ Test performance of WORLD (pyworld) """
    # extract features
    _f0, t = pw.dio(waveform, sr)            # raw pitch extractor
    f0 = pw.stonemask(waveform, _f0, t, sr)  # pitch refinement
    sp = pw.cheaptrick(waveform, f0, t, sr)  # extract smoothed spectrogram
    ap = pw.d4c(waveform, f0, t, sr)         # extract aperiodicity
    # synthesize waveform
    generated_waveform = pw.synthesize(f0, sp, ap, sr)
    return generated_waveform
def audio_synthesize(self, y):
    y = np.asarray(y, dtype=np.float64)  # np.float is deprecated; WORLD needs float64
    before_shape = y.shape
    _f0, t = pw.dio(y, FS)            # raw pitch extractor
    f0 = pw.stonemask(y, _f0, t, FS)  # pitch refinement
    sp = pw.cheaptrick(y, f0, t, FS)  # extract smoothed spectrogram
    ap = pw.d4c(y, f0, t, FS)         # extract aperiodicity
    data = pw.synthesize(f0 * self.f0_parameter, sp, ap, FS)
    y_new = data[0:before_shape[0]]   # trim synthesis output to the input length
    y_new = np.asarray(y_new, dtype=np.int16)
    return y_new
def PitchAnalyze(self):
    # fs: sampling frequency (44,100 Hz is the norm in the music industry)
    # data: array of audio samples
    fs, data = wavfile.read(self.wav_file)
    # WORLD can only handle float64 data
    data = data.astype(np.float64)
    _f0, _time = pw.dio(data, fs)            # extract the raw fundamental frequency
    f0 = pw.stonemask(data, _f0, _time, fs)  # refine the fundamental frequency
    self.f0 = f0
    return f0
def wav2mcep(WAV_FILE, dim):
    fs, data = wavfile.read(WAV_FILE)
    # WORLD can only handle float64 data
    data = data.astype(np.float64)
    # pw.dio measures the fundamental frequency every 5 ms and returns a numpy array
    _f0, _time = pw.dio(data, fs)
    f0 = pw.stonemask(data, _f0, _time, fs)  # refine the fundamental frequency
    sp = pw.cheaptrick(data, f0, _time, fs)  # extract the spectral envelope
    ap = pw.d4c(data, f0, _time, fs)         # extract the aperiodicity
    mcep = pysptk.sp2mc(sp, dim, 0.42)
    return torch.Tensor(mcep)
def process_utterance(in_dir, out_dir, basename):
    wav_path = os.path.join(in_dir, 'wavs', '{}.wav'.format(basename))
    tg_path = os.path.join(out_dir, 'TextGrid', '{}.TextGrid'.format(basename))

    # Get alignments
    textgrid = tgt.io.read_textgrid(tg_path)
    phone, duration, start, end = get_alignment(
        textgrid.get_tier_by_name('phones'))
    # '{A}{B}{$}{C}', where $ represents silent phones
    text = '{' + '}{'.join(phone) + '}'
    text = text.replace('{$}', ' ')  # '{A}{B} {C}'
    text = text.replace('}{', ' ')   # '{A B} {C}'

    if start >= end:
        return None

    # Read and trim wav files
    _, wav = read(wav_path)
    wav = wav[int(hp.sampling_rate * start):int(hp.sampling_rate * end)].astype(np.float32)

    # Compute fundamental frequency
    f0, _ = pw.dio(wav.astype(np.float64), hp.sampling_rate,
                   frame_period=hp.hop_length / hp.sampling_rate * 1000)
    f0 = f0[:sum(duration)]

    # Compute mel-scale spectrogram and energy
    mel_spectrogram, energy = Audio.tools.get_mel_from_wav(torch.FloatTensor(wav))
    mel_spectrogram = mel_spectrogram.numpy().astype(np.float32)[:, :sum(duration)]
    energy = energy.numpy().astype(np.float32)[:sum(duration)]

    if mel_spectrogram.shape[1] >= hp.max_seq_len:
        return None

    # Save alignment
    ali_filename = '{}-ali-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'alignment', ali_filename), duration, allow_pickle=False)

    # Save fundamental frequency
    f0_filename = '{}-f0-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'f0', f0_filename), f0, allow_pickle=False)

    # Save energy
    energy_filename = '{}-energy-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'energy', energy_filename), energy, allow_pickle=False)

    # Save spectrogram
    mel_filename = '{}-mel-{}.npy'.format(hp.dataset, basename)
    np.save(os.path.join(out_dir, 'mel', mel_filename), mel_spectrogram.T, allow_pickle=False)

    return ('|'.join([basename, text]), max(f0), min([f for f in f0 if f != 0]),
            max(energy), min(energy), mel_spectrogram.shape[1])
def get_aperiodicity(signal, fs=16000, frame_period=5):
    """ Extract the aperiodicity of a signal """
    if signal.dtype != np.float64:  # WORLD only accepts float64 input
        signal = np.float64(signal)
    f0, timeaxis = pyworld.dio(signal, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(signal, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(signal, f0, timeaxis, fs)
    return aperiodicity
def get_spectrogram(signal, fs=16000, frame_period=5):
    """ Extracts the spectrogram from a signal """
    if signal.dtype != np.float64:  # WORLD only accepts float64 input
        signal = np.float64(signal)
    f0, timeaxis = pyworld.dio(signal, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(signal, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(signal, f0, timeaxis, fs)
    return spectrogram
def collect_features(x, fs):
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    alpha = pysptk.util.mcepalpha(fs)
    order = 25
    frame_period = 5
    hop_length = int(fs * (frame_period * 0.001))
    x = x.astype(np.float64)
    _f0, _timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, _f0, _timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, _timeaxis, fs)
    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    return mc
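# Hedged sketch (not from the original): the mel-cepstrum returned by
# collect_features can be mapped back to a spectral envelope with pysptk.mc2sp
# and resynthesized with pyworld, mirroring the analysis settings (alpha,
# fftlen, 5 ms frame period) used above.
def _mc_roundtrip_demo(x, fs):
    x = x.astype(np.float64)
    f0, t = pyworld.dio(x, fs, frame_period=5)
    f0 = pyworld.stonemask(x, f0, t, fs)
    ap = pyworld.d4c(x, f0, t, fs)
    mc = collect_features(x, fs)
    # invert the mel-cepstral analysis back to a linear spectral envelope
    sp = pysptk.mc2sp(mc.astype(np.float64),
                      alpha=pysptk.util.mcepalpha(fs),
                      fftlen=pyworld.get_cheaptrick_fft_size(fs))
    return pyworld.synthesize(f0, sp, ap, fs, 5)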
def pitch(y):
    # Extract pitch/f0 from the raw waveform using PyWORLD
    y = y.astype(np.float64)
    '''
    f0_floor : float
        Lower F0 limit in Hz. Default: 71.0
    f0_ceil : float
        Upper F0 limit in Hz. Default: 800.0
    '''
    # For a hop size of 256, the frame period is 11.6 ms
    f0, timeaxis = pw.dio(y, hp.sample_rate,
                          frame_period=hp.hop_length / hp.sample_rate * 1000)
    return f0  # shape: (number of frames,), e.g. (654,)
def get_f0(wav: np.ndarray, hop_length: int, sr: int = 22050):
    """
    Parse the f0 feature from a given wave using the WORLD vocoder.

    :param wav: an array of wave samples
    :param hop_length: hop (stride) length in samples
    :param sr: sample rate of the wave
    :return: f0 feature
    """
    x = librosa.util.pad_center(wav, size=len(wav), mode='reflect').astype('double')
    _f0, t = pyworld.dio(x, sr, frame_period=hop_length / sr * 1e+3)  # raw pitch extractor
    f0 = pyworld.stonemask(x, _f0, t, sr)                             # pitch refinement
    return f0.astype(np.float32)
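# Usage sketch (an assumption): with frame_period = hop_length / sr * 1000,
# the f0 track from get_f0 has roughly one value per STFT hop, so it can be
# paired frame-by-frame with a spectrogram computed at the same hop length.
# The file path and the 256-sample hop below are placeholders.
wav, sr = librosa.load('sample.wav', sr=22050)
f0 = get_f0(wav, hop_length=256, sr=sr)
print(f0.shape)  # roughly (len(wav) // 256 + 1,) frames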
def main(args):
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    x, fs = sf.read("./{}.wav".format(edited_files["Thinking_Out_Loud"]))
    # x, fs = librosa.load('utterance/vaiueo2d.wav', dtype=np.float64)

    # 1. A convenient way
    f0, sp, ap = pw.wav2world(x, fs)  # use default options
    y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

    # 2. Step by step
    # 2-1 Without F0 refinement
    _f0, t = pw.dio(x, fs, f0_floor=50.0, f0_ceil=600.0,
                    channels_in_octave=2,
                    frame_period=args.frame_period,
                    speed=args.speed)
    _sp = pw.cheaptrick(x, _f0, t, fs)
    _ap = pw.d4c(x, _f0, t, fs)
    _y = pw.synthesize(_f0, _sp, _ap, fs, args.frame_period)
    # librosa.output.write_wav('test/y_without_f0_refinement.wav', _y, fs)
    sf.write('test/y_without_f0_refinement.wav', _y, fs)

    # 2-2 DIO with F0 refinement (using Stonemask)
    f0 = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
    # librosa.output.write_wav('test/y_with_f0_refinement.wav', y, fs)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (using Stonemask)
    _f0_h, t_h = pw.harvest(x, fs)
    f0_h = pw.stonemask(x, _f0_h, t_h, fs)
    sp_h = pw.cheaptrick(x, f0_h, t_h, fs)
    ap_h = pw.d4c(x, f0_h, t_h, fs)
    y_h = pw.synthesize(f0_h, sp_h, ap_h, fs, pw.default_frame_period)
    # librosa.output.write_wav('test/y_harvest_with_f0_refinement.wav', y_h, fs)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs)

    # Comparison
    savefig('test/wavform.png', [x, _y, y])
    savefig('test/sp.png', [_sp, sp])
    savefig('test/ap.png', [_ap, ap], log=False)
    savefig('test/f0.png', [_f0, f0])

    print('Please check "test" directory for output files')
def test_diff_vibrato():
    sr, x = wavfile.read(__test_wav_file)
    frame_period = 5
    frame_shift = int(frame_period * 0.001 * sr)
    sr_f0 = int(sr / frame_shift)
    f0, timeaxis = pyworld.dio(x.astype(np.float64), sr, frame_period=frame_period)
    f0 = pyworld.stonemask(x.astype(np.float64), f0, timeaxis, sr)
    f0_smooth = extract_smoothed_f0(f0, sr_f0, cutoff=8)
    vib = f0 - f0_smooth
    assert vib.shape == (len(f0),)
def collect_features(self, wav_path, label_path):
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    if hp_acoustic.use_harvest:
        f0, timeaxis = pyworld.harvest(
            x, fs, frame_period=hp_acoustic.frame_period,
            f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
    else:
        f0, timeaxis = pyworld.dio(
            x, fs, frame_period=hp_acoustic.frame_period,
            f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    if self.alpha is None:
        self.alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=hp_acoustic.order, alpha=self.alpha)

    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if hp_acoustic.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)

    # Parameter trajectory smoothing
    if hp_acoustic.mod_spec_smoothing:
        hop_length = int(fs * (hp_acoustic.frame_period * 0.001))
        modfs = fs / hop_length
        mgc = P.modspec_smoothing(
            mgc, modfs, cutoff=hp_acoustic.mod_spec_smoothing_cutoff)

    mgc = P.delta_features(mgc, hp_acoustic.windows)
    lf0 = P.delta_features(lf0, hp_acoustic.windows)
    bap = P.delta_features(bap, hp_acoustic.windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)

    return features.astype(np.float32)
def extract_pitch(waveform: torch.Tensor, sample_rate: int,
                  output_path: Optional[Path] = None, hop_length: int = 256,
                  log_scale: bool = True,
                  phoneme_durations: Optional[List[int]] = None):
    if output_path is not None and output_path.is_file():
        return

    try:
        import pyworld
    except ImportError:
        raise ImportError("Please install PyWORLD: pip install pyworld")

    _waveform = waveform.squeeze(0).double().numpy()
    pitch, t = pyworld.dio(_waveform, sample_rate,
                           frame_period=hop_length / sample_rate * 1000)
    pitch = pyworld.stonemask(_waveform, pitch, t, sample_rate)

    if phoneme_durations is not None:
        pitch = trim_or_pad_to_target_length(pitch, sum(phoneme_durations))
        try:
            from scipy.interpolate import interp1d
        except ImportError:
            raise ImportError("Please install SciPy: pip install scipy")
        nonzero_ids = np.where(pitch != 0)[0]
        interp_fn = interp1d(
            nonzero_ids,
            pitch[nonzero_ids],
            fill_value=(pitch[nonzero_ids[0]], pitch[nonzero_ids[-1]]),
            bounds_error=False,
        )
        pitch = interp_fn(np.arange(0, len(pitch)))
        d_cumsum = np.cumsum(np.concatenate([np.array([0]), phoneme_durations]))
        pitch = np.array([
            np.mean(pitch[d_cumsum[i - 1]:d_cumsum[i]])
            for i in range(1, len(d_cumsum))
        ])
        assert len(pitch) == len(phoneme_durations)

    if log_scale:
        pitch = np.log(pitch + 1)

    if output_path is not None:
        np.save(output_path.as_posix(), pitch)
    else:
        return pitch
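# Usage sketch (an assumption, not part of the original): torchaudio is one way
# to obtain the (1, T) float tensor that extract_pitch expects; the path is a
# placeholder, and with output_path=None the pitch array is returned directly.
import torchaudio

waveform, sample_rate = torchaudio.load('sample.wav')
pitch = extract_pitch(waveform, sample_rate, hop_length=256)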
def pitch(wav, hparams, pitch_func="harvest"):
    frame_period = hparams.hop_size / (0.001 * hparams.sample_rate)
    if wav.dtype != np.float64:  # pyworld only accepts float64 input
        wav = wav.astype(np.float64)
    if pitch_func == "harvest":
        f0, timeaxis = pyworld.harvest(wav, hparams.sample_rate,
                                       frame_period=frame_period)
    elif pitch_func == "dio":
        f0, timeaxis = pyworld.dio(wav, hparams.sample_rate,
                                   frame_period=frame_period)
    else:
        raise ValueError("Invalid pitch function: {}".format(pitch_func))
    return np.nan_to_num(f0)
def make_mel_f0(self, wav):
    # make mel-spectrogram
    mel = librosa.feature.melspectrogram(wav, self.sampling_rate, **self.mel_config)
    mel = np.log(np.abs(mel).clip(1e-5, 10)).astype(np.float32)

    # make fundamental frequency
    wav = wav.astype(np.float64)  # np.float is deprecated; WORLD needs float64
    _f0, t = pw.dio(wav, self.sampling_rate, frame_period=self.f0_frame_period)
    f0 = pw.stonemask(wav, _f0, t, self.sampling_rate)

    wav = torch.from_numpy(wav.astype(np.float32))
    mel = torch.from_numpy(mel).T
    f0 = torch.from_numpy(f0.astype(np.float32)).unsqueeze(-1)
    return wav, mel, f0
def main(args):
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    x, fs = sf.read('utterance/vaiueo2d.wav')
    # x, fs = librosa.load('utterance/vaiueo2d.wav', dtype=np.float64)

    # 1. A convenient way
    f0, sp, ap = pw.wav2world(x, fs)  # use default options
    y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

    # 2. Step by step
    # 2-1 Without F0 refinement
    _f0, t = pw.dio(x, fs, f0_floor=50.0, f0_ceil=600.0,
                    channels_in_octave=2,
                    frame_period=args.frame_period,
                    speed=args.speed)
    _sp = pw.cheaptrick(x, _f0, t, fs)
    _ap = pw.d4c(x, _f0, t, fs)
    _y = pw.synthesize(_f0, _sp, _ap, fs, args.frame_period)
    # librosa.output.write_wav('test/y_without_f0_refinement.wav', _y, fs)
    sf.write('test/y_without_f0_refinement.wav', _y, fs)

    # 2-2 DIO with F0 refinement (using Stonemask)
    f0 = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
    # librosa.output.write_wav('test/y_with_f0_refinement.wav', y, fs)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (using Stonemask)
    _f0_h, t_h = pw.harvest(x, fs)
    f0_h = pw.stonemask(x, _f0_h, t_h, fs)
    sp_h = pw.cheaptrick(x, f0_h, t_h, fs)
    ap_h = pw.d4c(x, f0_h, t_h, fs)
    y_h = pw.synthesize(f0_h, sp_h, ap_h, fs, pw.default_frame_period)
    # librosa.output.write_wav('test/y_harvest_with_f0_refinement.wav', y_h, fs)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs)

    # Comparison
    savefig('test/wavform.png', [x, _y, y])
    savefig('test/sp.png', [_sp, sp])
    savefig('test/ap.png', [_ap, ap], log=False)
    savefig('test/f0.png', [_f0, f0])

    print('Please check "test" directory for output files')
def get_acoustic_feature(lab_path, wav_path, sampling_rate, hop_size_in_ms,
                         mcep_order, windows):
    fs, audio = wavfile.read(wav_path)
    audio = audio.astype(np.float64) / 2**15
    if fs != sampling_rate:
        audio = audio.astype(np.float32)
        audio = librosa.resample(audio, fs, sampling_rate)
    audio = (audio * 2**15).astype(np.float64)

    # extract f0
    f0, timeaxis = pyworld.dio(audio, sampling_rate, frame_period=hop_size_in_ms)
    # modify f0
    f0 = pyworld.stonemask(audio, f0, timeaxis, sampling_rate)
    # voiced/unvoiced flag
    vuv = (f0 > 0)[:, None].astype(np.float32)
    # calculate log f0
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    # interpolate f0 in log-domain
    lf0 = interp1d(lf0, kind='slinear')[:, None]
    # calculate mel-cepstrum
    spectrogram = pyworld.cheaptrick(audio, f0, timeaxis, sampling_rate)
    mgc = pysptk.sp2mc(spectrogram, order=mcep_order,
                       alpha=pysptk.util.mcepalpha(sampling_rate))
    # calculate aperiodicity parameter
    aperiodicity = pyworld.d4c(audio, f0, timeaxis, sampling_rate)
    bap = pyworld.code_aperiodicity(aperiodicity, sampling_rate)
    # calculate dynamic features
    mgc = apply_delta_windows(mgc, windows)
    lf0 = apply_delta_windows(lf0, windows)
    bap = apply_delta_windows(bap, windows)
    feature = np.hstack((mgc, lf0, vuv, bap))
    # cut silence frames by HTS alignment
    labels = hts.load(lab_path)
    feature = feature[:labels.num_frames()]
    if labels.num_frames() > len(feature):
        return
    indices = labels.silence_frame_indices()
    feature = np.delete(feature, indices, axis=0)
    return feature.astype(np.float32)
def _calculate_f0(self, input: torch.Tensor) -> torch.Tensor:
    x = input.cpu().numpy().astype(np.double)
    f0, timeaxis = pyworld.dio(
        x,
        self.fs,
        f0_floor=self.f0min,
        f0_ceil=self.f0max,
        frame_period=self.frame_period,
    )
    f0 = pyworld.stonemask(x, f0, timeaxis, self.fs)
    if self.use_continuous_f0:
        f0 = self._convert_to_continuous_f0(f0)
    if self.use_log_f0:
        nonzero_idxs = np.where(f0 != 0)[0]
        f0[nonzero_idxs] = np.log(f0[nonzero_idxs])
    return input.new_tensor(f0.reshape(-1), dtype=torch.float)
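# Hedged sketch (hypothetical helper, not the original implementation): a
# continuous-F0 conversion like _convert_to_continuous_f0 typically
# interpolates across unvoiced (zero) frames so that log-F0 is defined
# everywhere.
def _convert_to_continuous_f0_sketch(f0: np.ndarray) -> np.ndarray:
    if (f0 == 0).all():
        return f0  # nothing voiced to interpolate from
    nonzero = np.where(f0 != 0)[0]
    # np.interp holds the first/last voiced values at the edges and
    # interpolates linearly in between
    return np.interp(np.arange(len(f0)), nonzero, f0[nonzero])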
def test_trim_remove_zeros_frames():
    fs, x = wavfile.read(example_audio_file())
    frame_period = 5
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    for mat in [spectrogram, aperiodicity]:
        trimmed = trim_zeros_frames(mat)
        assert trimmed.shape[1] == mat.shape[1]

    for mat in [spectrogram, aperiodicity]:
        trimmed = remove_zeros_frames(mat)
        assert trimmed.shape[1] == mat.shape[1]
def collect_features(emotion):
    arr = []
    for count in range(0, num_files):
        count_n = count + 1
        # zero-pad single-digit file names, e.g. '01.wav'
        path = '_' + str(emotion) + '/' + \
            [str(count_n), ('0' + str(count_n))][count_n < 10] + '.wav'
        x, fs_ = sf.read(path)
        x = x.astype(np.float64)
        f0, time_axis = pyworld.dio(x, fs_, frame_period=frame_period)
        f0 = pyworld.stonemask(x, f0, time_axis, fs_)
        spectrogram = pyworld.cheaptrick(x, f0, time_axis, fs_)
        spectrogram = trim_zeros_frames(spectrogram)
        mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
        mc = mc.tolist()
        while len(mc) < 1000:
            mc.append(vuoto)  # pad with the empty frame up to 1000 frames
        arr.append(mc)
    return np.array(arr)
def main(args):
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    # Read speech sample
    x, fs = sf.read(args.input)

    # 1. A convenient way
    f0, sp, ap = pw.wav2world(x, fs)  # use default options
    y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

    # 2. Step by step
    # 2-1 Without F0 refinement
    _f0, t = pw.dio(x, fs, f0_floor=50.0, f0_ceil=600.0,
                    channels_in_octave=2,
                    frame_period=args.frame_period,
                    speed=args.speed)
    _sp = pw.cheaptrick(x, _f0, t, fs)
    _ap = pw.d4c(x, _f0, t, fs)
    _y = pw.synthesize(_f0, _sp, _ap, fs, args.frame_period)
    sf.write('test/y_without_f0_refinement.wav', _y, fs)

    # 2-2 DIO with F0 refinement (using Stonemask)
    f0 = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (using Stonemask)
    _f0_h, t_h = pw.harvest(x, fs)
    f0_h = pw.stonemask(x, _f0_h, t_h, fs)
    sp_h = pw.cheaptrick(x, f0_h, t_h, fs)
    ap_h = pw.d4c(x, f0_h, t_h, fs)
    y_h = pw.synthesize(f0_h, sp_h, ap_h, fs, pw.default_frame_period)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs)

    # Comparison
    save_image('test/wavform.png', [x, _y, y])
    save_image('test/sp.png', [_sp, sp])
    save_image('test/ap.png', [_ap, ap], log=False)
    save_image('test/f0.png', [_f0, f0])
def formant(self, val, f0_v):
    '''
    Change formant.
    val : formant rate
    f0_v: f0 rate
    '''
    f_rate = self.audio.frame_rate
    # convert pydub audio to a numpy array; WORLD only accepts float64
    np_arr = numpy.array(self.audio.get_array_of_samples()).astype(numpy.float64)
    _f0_val, _time = pyworld.dio(np_arr, f_rate)               # fundamental frequency
    spct = pyworld.cheaptrick(np_arr, _f0_val, _time, f_rate)  # spectral envelope
    aper = pyworld.d4c(np_arr, _f0_val, _time, f_rate)         # aperiodicity
    # warp the spectral envelope along the frequency axis
    spct_b = numpy.zeros_like(spct)
    for i in range(spct_b.shape[1]):
        spct_b[:, i] = spct[:, min(int(i / val), spct.shape[1] - 1)]  # clamp to a valid bin
    self.audio = pyworld.synthesize(_f0_val * f0_v, spct_b, aper, f_rate)
    return self
def _extract_static_feats(wav, sr):
    f0, timeaxis = pyworld.dio(wav, sr, frame_period=5)
    spectrogram = pyworld.cheaptrick(wav, f0, timeaxis, sr)
    aperiodicity = pyworld.d4c(wav, f0, timeaxis, sr)

    mgc = pysptk.sp2mc(spectrogram, order=59, alpha=pysptk.util.mcepalpha(sr))
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")
    bap = pyworld.code_aperiodicity(aperiodicity, sr)

    feats = np.hstack((mgc, lf0, vuv, bap)).astype(np.float32)
    stream_sizes = [mgc.shape[1], lf0.shape[1], vuv.shape[1], bap.shape[1]]
    return feats, stream_sizes
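# Usage sketch (an assumption): the stream_sizes returned by
# _extract_static_feats lets downstream code split the stacked feature matrix
# back into its mgc/lf0/vuv/bap streams.
def _split_streams_demo(feats, stream_sizes):
    boundaries = np.cumsum(stream_sizes)[:-1]  # split points, dropping the total
    mgc, lf0, vuv, bap = np.split(feats, boundaries, axis=1)
    return mgc, lf0, vuv, bap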
def shift_wav(in_file, shift=0):
    in_file = Path(in_file)
    x, fs = sf.read(str(in_file))
    _f0, t = pw.dio(x, fs)            # raw pitch extractor
    f0 = pw.stonemask(x, _f0, t, fs)  # pitch refinement
    sp = pw.cheaptrick(x, f0, t, fs)  # extract smoothed spectrogram
    ap = pw.d4c(x, f0, t, fs)         # extract aperiodicity
    shift_scale = 2**(shift / 12)
    y = pw.synthesize(f0 * shift_scale, sp, ap, fs)
    if shift >= 0:
        out_file = in_file.parent / f"{in_file.stem}+{shift}.wav"
    else:
        out_file = in_file.parent / f"{in_file.stem}-{-shift}.wav"
    sf.write(str(out_file), y, fs)
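# Usage sketch: shift_wav writes a transposed copy of the input file;
# 2 ** (shift / 12) converts semitones to an F0 ratio, so shift=12 doubles
# the pitch. 'voice.wav' is a placeholder path.
shift_wav('voice.wav', shift=3)    # writes voice+3.wav, a minor third up
shift_wav('voice.wav', shift=-12)  # writes voice-12.wav, one octave down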
def callback(in_data, frame_count, time_info, status):
    global f_scale
    global sp_scale

    # decode interleaved stereo int16 samples and downmix to mono float64
    np_data = np.frombuffer(in_data, dtype=np.int16)  # np.fromstring is deprecated
    np_stereo_data = np.reshape(np_data, (chunk, ch))
    np_l_data = np_stereo_data[:, 0]
    np_r_data = np_stereo_data[:, 1]
    np_lr_data = np_l_data / 2 + np_r_data / 2
    np_mono_data = np_lr_data.astype(np.float64)

    # WORLD analysis
    of0, t = pw.dio(np_mono_data, rate, frame_period=frame_period)
    f0 = pw.stonemask(np_mono_data, of0, t, rate)
    sp = pw.cheaptrick(np_mono_data, f0, t, rate)
    ap = pw.d4c(np_mono_data, f0, t, rate)

    # warp the spectral envelope along the frequency axis
    sp1 = np.zeros_like(sp)
    if sp_scale > 1.0:
        sp_rate = 1.0 / sp_scale
    else:
        sp_rate = sp_scale
    for f in range(sp.shape[1]):
        sp1[:, f] = sp[:, int(f * sp_rate)]

    # resynthesize with scaled f0; np_synthesized.shape != np_mono_data.shape
    np_synthesized = pw.synthesize(f0 * f_scale, sp1, ap, rate, frame_period)
    np_out_data = np.empty((chunk, ch), dtype=np.float64)
    np_out_data[:, 0] = np_synthesized[:chunk]
    np_out_data[:, 1] = np_synthesized[:chunk]
    out_data = np_out_data.flatten().astype(np.int16).tobytes()  # tostring is deprecated
    return (out_data, pyaudio.paContinue)
def _process_utterance(out_dir, index, speaker_id, wav_path, text):
    x, fs = librosa.load(wav_path, sr=config.fs)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=config.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    spectrogram = trim_zeros_frames(spectrogram)
    mc = pysptk.sp2mc(spectrogram, order=config.order, alpha=config.alpha)
    timesteps = mc.shape[0]

    wav_id = wav_path.split("/")[-1].split('.')[0]
    mc_name = '{}-mc.npy'.format(wav_id)
    np.save(os.path.join(out_dir, mc_name), mc, allow_pickle=False)

    # compute lf0
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])

    # Return a tuple describing this training example:
    return mc_name, timesteps, text, speaker_id, lf0.tolist()
def collect_features(self, wav_path):
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    spectrogram = P.trim_zeros_frames(spectrogram)
    if self.alpha is None:
        self.alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=self.alpha)
    # Drop 0-th coefficient
    mgc = mgc[:, 1:]
    # 50Hz cut-off MS smoothing
    hop_length = int(fs * (hp.frame_period * 0.001))
    modfs = fs / hop_length
    mgc = P.modspec_smoothing(mgc, modfs, cutoff=50)
    # Add delta
    mgc = P.delta_features(mgc, hp.windows)
    return mgc.astype(np.float32)
def __call__(self, data: Wave, test=None):
    x = data.wave.astype(numpy.float64)
    fs = data.sampling_rate

    if self._f0_estimating_method == 'dio':
        _f0, t = pyworld.dio(
            x,
            fs,
            frame_period=self._frame_period,
            f0_floor=self._f0_floor,
            f0_ceil=self._f0_ceil,
        )
    else:
        from world4py.np import apis
        _f0, t = apis.harvest(
            x,
            fs,
            frame_period=self._frame_period,
            f0_floor=self._f0_floor,
            f0_ceil=self._f0_ceil,
        )
    f0 = pyworld.stonemask(x, _f0, t, fs)

    spectrogram = pyworld.cheaptrick(x, f0, t, fs)
    aperiodicity = pyworld.d4c(x, f0, t, fs)
    mfcc = pysptk.sp2mc(spectrogram, order=self._order, alpha=self._alpha)
    voiced = ~(f0 == 0)  # type: numpy.ndarray

    feature = AcousticFeature(
        f0=f0[:, None].astype(self._dtype),
        spectrogram=spectrogram.astype(self._dtype),
        aperiodicity=aperiodicity.astype(self._dtype),
        mfcc=mfcc.astype(self._dtype),
        voiced=voiced[:, None],
    )
    feature.validate()
    return feature
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]
    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)
    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    lengths = [len(mc_scaled)]

    # Add batch axis
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # For MLPG
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)

    # Apply model
    if model.include_parameter_generation():
        # Case: the model includes parameter generation in itself
        # Multistream features cannot be used in this case
        y_hat, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence models)
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(
        mc_static_pred, data_mean[:static_dim], data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs