def test_world_array_order():
    wav = kwiiyatta.load_wav(dataset.CLB_WAV)
    f0, timeaxis = pyworld.dio(wav.data, wav.fs)
    f0 = pyworld.stonemask(wav.data, f0, timeaxis, wav.fs)
    spec = pyworld.cheaptrick(wav.data, f0, timeaxis, wav.fs)
    ape = pyworld.d4c(wav.data, f0, timeaxis, wav.fs)
    pyworld.synthesize(f0, spec, ape, wav.fs)

    data = wav.data[::2]
    expected_msg = 'ndarray is not C-contiguous'

    with pytest.raises(ValueError) as e:
        f0, timeaxis = pyworld.dio(data, wav.fs)
    assert expected_msg == str(e.value)

    with pytest.raises(ValueError) as e:
        f0 = pyworld.stonemask(data, f0, timeaxis, wav.fs)
    assert expected_msg == str(e.value)

    with pytest.raises(ValueError) as e:
        pyworld.cheaptrick(data, f0, timeaxis, wav.fs)
    assert expected_msg == str(e.value)

    with pytest.raises(ValueError) as e:
        pyworld.d4c(data, f0, timeaxis, wav.fs)
    assert expected_msg == str(e.value)

    with pytest.raises(ValueError) as e:
        pyworld.synthesize(f0[::2], spec[::2], ape[::2], wav.fs)
    assert expected_msg == str(e.value)
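# A minimal sketch (not part of the test above) of how to avoid the
# "ndarray is not C-contiguous" ValueError exercised by the test: slicing
# with a step produces a strided view, so copy it into a C-contiguous
# float64 array before handing it to pyworld. The file name is a placeholder.
import numpy as np
import pyworld
import soundfile as sf

x, fs = sf.read('some.wav')                              # hypothetical input file
data = np.ascontiguousarray(x[::2], dtype=np.float64)    # contiguous copy of the strided view
f0, timeaxis = pyworld.dio(data, fs)                     # now accepted by pyworld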
def main(args): if os.path.isdir('test'): rmtree('test') os.mkdir('test') x, fs = sf.read("./{}.wav".format(edited_files["Thinking_Out_Loud"])) # x, fs = librosa.load('utterance/vaiueo2d.wav', dtype=np.float64) # 1. A convient way f0, sp, ap = pw.wav2world(x, fs) # use default options y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period) # 2. Step by step # 2-1 Without F0 refinement _f0, t = pw.dio(x, fs, f0_floor=50.0, f0_ceil=600.0, channels_in_octave=2, frame_period=args.frame_period, speed=args.speed) _sp = pw.cheaptrick(x, _f0, t, fs) _ap = pw.d4c(x, _f0, t, fs) _y = pw.synthesize(_f0, _sp, _ap, fs, args.frame_period) # librosa.output.write_wav('test/y_without_f0_refinement.wav', _y, fs) sf.write('test/y_without_f0_refinement.wav', _y, fs) # 2-2 DIO with F0 refinement (using Stonemask) f0 = pw.stonemask(x, _f0, t, fs) sp = pw.cheaptrick(x, f0, t, fs) ap = pw.d4c(x, f0, t, fs) y = pw.synthesize(f0, sp, ap, fs, args.frame_period) # librosa.output.write_wav('test/y_with_f0_refinement.wav', y, fs) sf.write('test/y_with_f0_refinement.wav', y, fs) # 2-3 Harvest with F0 refinement (using Stonemask) _f0_h, t_h = pw.harvest(x, fs) f0_h = pw.stonemask(x, _f0_h, t_h, fs) sp_h = pw.cheaptrick(x, f0_h, t_h, fs) ap_h = pw.d4c(x, f0_h, t_h, fs) y_h = pw.synthesize(f0_h, sp_h, ap_h, fs, pw.default_frame_period) # librosa.output.write_wav('test/y_harvest_with_f0_refinement.wav', y_h, fs) sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs) # Comparison savefig('test/wavform.png', [x, _y, y]) savefig('test/sp.png', [_sp, sp]) savefig('test/ap.png', [_ap, ap], log=False) savefig('test/f0.png', [_f0, f0]) print('Please check "test" directory for output files')
def main(args):
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    x, fs = sf.read('utterance/vaiueo2d.wav')
    # x, fs = librosa.load('utterance/vaiueo2d.wav', dtype=np.float64)

    # 1. A convenient way
    f0, sp, ap = pw.wav2world(x, fs)    # use default options
    y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

    # 2. Step by step
    # 2-1 Without F0 refinement
    _f0, t = pw.dio(x, fs, f0_floor=50.0, f0_ceil=600.0,
                    channels_in_octave=2,
                    frame_period=args.frame_period,
                    speed=args.speed)
    _sp = pw.cheaptrick(x, _f0, t, fs)
    _ap = pw.d4c(x, _f0, t, fs)
    _y = pw.synthesize(_f0, _sp, _ap, fs, args.frame_period)
    # librosa.output.write_wav('test/y_without_f0_refinement.wav', _y, fs)
    sf.write('test/y_without_f0_refinement.wav', _y, fs)

    # 2-2 DIO with F0 refinement (using Stonemask)
    f0 = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
    # librosa.output.write_wav('test/y_with_f0_refinement.wav', y, fs)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (using Stonemask)
    _f0_h, t_h = pw.harvest(x, fs)
    f0_h = pw.stonemask(x, _f0_h, t_h, fs)
    sp_h = pw.cheaptrick(x, f0_h, t_h, fs)
    ap_h = pw.d4c(x, f0_h, t_h, fs)
    y_h = pw.synthesize(f0_h, sp_h, ap_h, fs, pw.default_frame_period)
    # librosa.output.write_wav('test/y_harvest_with_f0_refinement.wav', y_h, fs)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs)

    # Comparison
    savefig('test/wavform.png', [x, _y, y])
    savefig('test/sp.png', [_sp, sp])
    savefig('test/ap.png', [_ap, ap], log=False)
    savefig('test/f0.png', [_f0, f0])

    print('Please check "test" directory for output files')
def collect_features(x, fs):
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    return mc
def analysis_resynthesis(signal):
    # Extract acoustic features
    f0, t = pw.dio(signal, sample_rate)              # fundamental frequency (F0)
    f0 = pw.stonemask(signal, f0, t, sample_rate)    # F0 refinement
    sp = pw.cheaptrick(signal, f0, t, sample_rate)   # spectral envelope
    ap = pw.d4c(signal, f0, t, sample_rate)          # aperiodicity

    # Pitch shift
    modified_f0 = f0_rate * f0

    # Formant shift (uniform stretching of the frequency axis)
    modified_sp = np.zeros_like(sp)
    sp_range = int(modified_sp.shape[1] * sp_rate)
    for f in range(modified_sp.shape[1]):
        if f < sp_range:
            if sp_rate >= 1.0:
                modified_sp[:, f] = sp[:, int(f / sp_rate)]
            else:
                modified_sp[:, f] = sp[:, int(sp_rate * f)]
        else:
            modified_sp[:, f] = sp[:, f]

    # Resynthesis
    synth = pw.synthesize(modified_f0, modified_sp, ap, sample_rate)

    return synth
def analyze(wav, fs=FS, minf0=MINF0, maxf0=MAXF0, fperiod=SHIFTMS, fftl=FFTL, f0=None, time_axis=None):
    """
    f0 estimation w/o f0_floor & f0_ceil

    Args:
        minf0: Never used
        maxf0: Never used

    Returns:
        (time_axis, fundamental frequency, spectral envelope, aperiodicity)
    """
    if f0 is None or time_axis is None:
        _f0, time_axis = pw.harvest(wav, fs, f0_floor=60.0, frame_period=fperiod)
        f0 = pw.stonemask(wav, _f0, time_axis, fs)
    sp = pw.cheaptrick(wav, f0, time_axis, fs, fft_size=fftl)
    ap = pw.d4c(wav, f0, time_axis, fs, fft_size=fftl)
    return time_axis, f0, sp, ap
def analyze_range(wav, fs=FS, minf0=MINF0, maxf0=MAXF0, fperiod=SHIFTMS, fftl=FFTL, f0=None, time_axis=None):
    """
    f0 estimation w/ f0_floor & f0_ceil

    Args:
        f0: Given f0. If not provided, estimated by WORLD harvest/stonemask from waveform.

    Returns:
        (time_axis, fundamental frequency, spectral envelope, aperiodicity)
    """
    if f0 is None or time_axis is None:
        # pyworld.harvest: Estimate fo.
        _f0, time_axis = pw.harvest(wav, fs, f0_floor=minf0, f0_ceil=maxf0, frame_period=fperiod)
        # pyworld.stonemask: Refine fo.
        f0 = pw.stonemask(wav, _f0, time_axis, fs)
    # pyworld.cheaptrick: Spectral envelope estimation.
    sp = pw.cheaptrick(wav, f0, time_axis, fs, fft_size=fftl)
    # pyworld.d4c: Aperiodicity estimation.
    ap = pw.d4c(wav, f0, time_axis, fs, fft_size=fftl)
    return time_axis, f0, sp, ap
def anonymization(fs, waveNDArray, f0Value=0, sp_strechRatio=np.random.uniform(0.6, 2, size=1), gaussian_s=3):
    """
    Build a WAV signal with the speaker identity removed from the input WAV signal.
    Used to create the "input" audio from the "label" audio.
    :param fs:
    :param waveNDArray:
    :param f0Value:
    :param sp_strechRatio:
    :return:
    """
    waveNDArray = waveNDArray.astype(np.float64)    # np.float is removed in recent NumPy
    _f0, t = pw.dio(waveNDArray, fs)                # extract fundamental frequency (F0)
    f0 = pw.stonemask(waveNDArray, _f0, t, fs)      # refine F0
    sp = pw.cheaptrick(waveNDArray, f0, t, fs)      # extract spectral envelope
    ap = pw.d4c(waveNDArray, f0, t, fs)             # extract aperiodicity
    f0_fixed0 = np.ones(f0.shape) * f0Value
    f0_median = np.median(f0)
    sp_median = np.median(sp)
    ap_median = np.median(ap)
    # Stretch SP along the frequency axis
    sp2 = np.ones_like(sp) * np.min(sp)
    for f in range(sp2.shape[1]):
        if int(f / sp_strechRatio) >= sp.shape[1]:
            break
        sp2[:, f] = sp[:, int(f / sp_strechRatio)]
    # Add Gaussian noise to SP/AP
    sp_noised = sp2 + np.random.normal(sp_median, sp_median / 10, sp2.shape)
    ap_noised = ap + np.random.normal(ap_median, ap_median / 10, ap.shape)
    # Gaussian smoothing (scipy.ndimage.filters is deprecated; use scipy.ndimage)
    sp_gaussian = scipy.ndimage.gaussian_filter(sp_noised, gaussian_s)
    ap_gaussian = scipy.ndimage.gaussian_filter(ap_noised, gaussian_s)
    # Resynthesize from the processed features
    # (the original passed the unprocessed sp/ap here, which left the filtering unused)
    synthesized = pw.synthesize(f0_fixed0, sp_gaussian, ap_gaussian, fs)
    return synthesized
def extract_f0(wav_dir, speaker_id_pos=-4):
    wav_file_list = get_list_of_files(wav_dir)
    wav_file_list = [fname for fname in wav_file_list if file_filters(fname)]
    with open("jvs_speaker_info.json", "r") as f:
        speaker_info = json.load(f)
    for fname in progressbar(wav_file_list, redirect_stdout=True):
        print(fname)
        speaker_name = fname.split("/")[speaker_id_pos]
        x, fs = librosa.load(fname, sr=None)
        x = x.astype(np.float64)
        _f0, t = pyworld.dio(
            x, fs,
            # f0_floor=75, f0_ceil=400,
            f0_floor=speaker_info[speaker_name]["f0_min"],
            f0_ceil=speaker_info[speaker_name]["f0_max"],
            frame_period=12.5)
        f0 = pyworld.stonemask(x, _f0, t, fs)
        f0[f0 < 1.0] = 1.0
        f0 = np.log2(f0).astype(np.float32)
        fname = fname.replace("wav24", "f0_24k")
        fname = fname.replace(".wav", "")
        fname_tokens = fname.split('/')
        file_name = fname_tokens[-1]
        output_dir = "/".join(fname_tokens[:speaker_id_pos + 1])
        if not exists(output_dir):
            os.makedirs(output_dir)
        np.save(join(output_dir, file_name), f0)
    print("Finished!")
def data_extraction(np_data, rate):
    np_data = np_data.astype(np.float64)    # np.float is removed in recent NumPy
    _f0, t = pw.harvest(np_data, rate)
    f0 = pw.stonemask(np_data, _f0, t, rate)
    sp = pw.cheaptrick(np_data, f0, t, rate)
    ap = pw.d4c(np_data, f0, t, rate)
    return f0, sp, ap
def collect_features(self, wav_path):
    # x: Raw audio, (Sample_length, )
    x, fs = librosa.load(wav_path, sr=self.target_sr, mono=True, dtype=np.float64)

    # f0: F0, (Frame_length, )
    # lf0: log(f0) --> interp1d, (Frame_length, )
    # vuv: voiced/unvoiced flag, (Frame_length, )
    f0, timeaxis = pyworld.dio(x, self.target_sr, frame_period=self.hop_sz_in_ms)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    lf0 = f0.copy()
    lf0[np.nonzero(f0)] = np.log(f0[np.nonzero(f0)])
    lf0 = interp1d(lf0, kind="slinear")
    vuv = (lf0 != 0).astype(np.float32)

    # spec: Spectrogram, (Frame_length x Dim), Dim = 513
    # bap: coded aperiodicity, (Frame_length, )
    # mgc: mel-cepstrum, (Frame_length x Dim), Dim = 60
    spec = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spec, order=59, alpha=pysptk.util.mcepalpha(fs))

    # Stacking features: total dimension = 64
    features = np.hstack((f0[:, None], lf0[:, None], vuv[:, None], bap, mgc, spec))

    return features.astype(np.float32)
def wav2world(wavfile, frame_period):
    wav, fs = librosa.load(wavfile, sr=hp.sample_rate, dtype=np.float64)
    if hp.use_harvest:
        f0, timeaxis = pyworld.harvest(wav, fs, frame_period=frame_period)
    else:
        f0, timeaxis = pyworld.dio(wav, fs, frame_period=frame_period)
        f0 = pyworld.stonemask(wav, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(wav, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(wav, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    hp.num_bap = bap.shape[1]
    alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=hp.num_mgc - 1, alpha=alpha)
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if hp.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    # print(mgc.shape, lf0.shape, vuv.shape, bap.shape)
    features = np.hstack((mgc, lf0, vuv, bap))
    return features.astype(np.float32)
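# A hedged sketch of the inverse of wav2world() above: unpack the stacked
# (mgc, lf0, vuv, bap) feature matrix and resynthesize a waveform with WORLD.
# num_mgc, num_bap, fs and frame_period are assumed to match whatever
# hp.num_mgc, hp.num_bap, hp.sample_rate and frame_period were at extraction time.
import numpy as np
import pyworld
import pysptk

def world2wav(features, fs, num_mgc, num_bap, frame_period):
    features = features.astype(np.float64)
    mgc = np.ascontiguousarray(features[:, :num_mgc])
    lf0 = features[:, num_mgc]
    vuv = features[:, num_mgc + 1]
    bap = np.ascontiguousarray(features[:, num_mgc + 2:num_mgc + 2 + num_bap])

    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    alpha = pysptk.util.mcepalpha(fs)
    spectrogram = pysptk.mc2sp(mgc, alpha=alpha, fftlen=fftlen)
    aperiodicity = pyworld.decode_aperiodicity(bap, fs, fftlen)
    f0 = np.exp(lf0) * (vuv > 0.5)    # zero out unvoiced frames
    return pyworld.synthesize(f0, spectrogram, aperiodicity, fs, frame_period)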
def convert(signal):
    f0_rate = 2.4
    sp_rate = 0.78
    sample_rate = 16000

    f0, t = pyworld.dio(signal, sample_rate)
    f0 = pyworld.stonemask(signal, f0, t, sample_rate)
    sp = pyworld.cheaptrick(signal, f0, t, sample_rate)
    ap = pyworld.d4c(signal, f0, t, sample_rate)

    modified_f0 = f0_rate * f0

    # Formant shift (uniform stretching of the frequency axis)
    modified_sp = np.zeros_like(sp)
    sp_range = int(modified_sp.shape[1] * sp_rate)
    for f in range(modified_sp.shape[1]):
        if f < sp_range:
            if sp_rate >= 1.0:
                modified_sp[:, f] = sp[:, int(f / sp_rate)]
            else:
                modified_sp[:, f] = sp[:, int(sp_rate * f)]
        else:
            modified_sp[:, f] = sp[:, f]

    y = pyworld.synthesize(modified_f0, modified_sp, ap, sample_rate)

    return y
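# A minimal usage sketch for convert() above (file names are placeholders).
# It assumes a 16 kHz mono signal in float64, which soundfile returns by default;
# the hard-coded sample_rate inside convert() must match the file's rate.
import soundfile as sf

x, sr = sf.read('input_16k.wav')    # hypothetical 16 kHz mono file
assert sr == 16000
y = convert(x)
sf.write('converted.wav', y, sr)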
def extract_spectrum(self, x, sample_rate):
    x = np.asarray(x)
    _f0, t = pw.dio(x, sample_rate, frame_period=12.5)    # raw pitch extractor
    f0 = pw.stonemask(x, _f0, t, sample_rate)             # pitch refinement
    sp = pw.cheaptrick(x, f0, t, sample_rate)              # extract smoothed spectrogram
    ap = pw.d4c(x, f0, t, sample_rate)
    return sp, ap, f0
def extract_f0(self, **kwargs):
    if self._f0 is None:
        self._f0, self._timeaxis = pyworld.dio(
            self.data, self.fs, frame_period=self.frame_period, **kwargs)
        self._f0 = pyworld.stonemask(self.data, self._f0, self._timeaxis, self.fs)
    return self._f0
def get_conversion_data(audiodata, fs, refine_f0):
    """
    Get speaker A features for conversion (sp, ap, f0) -- the source dictionary, without warping.

    :param audiodata: iterable of waveforms (float64)
    :param fs: sampling rate
    :param refine_f0: whether to refine the raw DIO pitch with StoneMask
    :return: source dictionary (without warping)
    """
    features = []
    logging.info("Start building speaker A dictionary: Extracting feature for conversion (sp, ap, f0)")
    for audio in tqdm(audiodata):
        # Extract feature
        _f0, t = pw.dio(audio, fs)                  # raw pitch extractor
        if refine_f0:
            f0 = pw.stonemask(audio, _f0, t, fs)    # pitch refinement
        else:
            f0 = _f0
        sp = pw.cheaptrick(audio, f0, t, fs)        # extract smoothed spectrogram
        ap = pw.d4c(audio, f0, t, fs)               # extract aperiodicity
        # y = pw.synthesize(f0, sp, ap, fs)
        features.append({
            'sp': sp,
            'ap': ap,
            'f0': f0,
            'fs': fs,
            'sr': fs
        })
    return features
def wav2pw(wavfile, sr=SR, fft_size=FFT_SIZE, frame_period=FRAME_PERIOD):
    x, _ = librosa.load(wavfile, sr=sr, mono=True, dtype=np.float64)
    _f0, t = pw.harvest(x, sr, frame_period=frame_period)
    f0 = pw.stonemask(x, _f0, t, sr)
    sp = pw.cheaptrick(x, f0, t, sr, fft_size=fft_size)
    ap = pw.d4c(x, f0, t, sr, fft_size=fft_size)
    return f0, sp, ap
def get_features(filename, *, winlen, winstep, n_mcep, mcep_alpha, minf0, maxf0, type):
    wav, sr = load(filename, sr=None)
    # get f0
    x = wav.astype(float)
    _f0, t = world.harvest(x, sr, f0_floor=minf0, f0_ceil=maxf0, frame_period=winstep * 1000)
    f0 = world.stonemask(x, _f0, t, sr)

    window_size = int(sr * winlen)
    hop_size = int(sr * winstep)

    # get mel
    if type == 'mcc':
        spec = world.cheaptrick(x, f0, t, sr, f0_floor=minf0)
        h = sptk.sp2mc(spec, n_mcep - 1, mcep_alpha).T
    else:
        h = mfcc(x, sr, n_mfcc=n_mcep, n_fft=window_size, hop_length=hop_size)
    h = np.vstack((h, f0))

    maxlen = len(x) // hop_size + 2
    h = repeat_last_padding(h, maxlen)
    id = os.path.basename(filename).replace(".wav", "")
    return (id, x, h)
def compute_f0(self, x: np.ndarray) -> np.ndarray:
    """Compute pitch (f0) of a waveform using the same parameters used for computing melspectrogram.

    Args:
        x (np.ndarray): Waveform.

    Returns:
        np.ndarray: Pitch.

    Examples:
        >>> WAV_FILE = filename = librosa.util.example_audio_file()
        >>> from TTS.config import BaseAudioConfig
        >>> from TTS.utils.audio import AudioProcessor
        >>> conf = BaseAudioConfig(pitch_fmax=8000)
        >>> ap = AudioProcessor(**conf)
        >>> wav = ap.load_wav(WAV_FILE, sr=22050)[:5 * 22050]
        >>> pitch = ap.compute_f0(wav)
    """
    assert self.pitch_fmax is not None, " [!] Set `pitch_fmax` before calling `compute_f0`."
    # align F0 length to the spectrogram length
    if len(x) % self.hop_length == 0:
        x = np.pad(x, (0, self.hop_length // 2), mode="reflect")

    f0, t = pw.dio(
        x.astype(np.double),
        fs=self.sample_rate,
        f0_ceil=self.pitch_fmax,
        frame_period=1000 * self.hop_length / self.sample_rate,
    )
    f0 = pw.stonemask(x.astype(np.double), f0, t, self.sample_rate)
    return f0
def raw2WORLDfeatures(signal, fs=16000, fft_size=1024):
    _f0, t = pw.dio(signal, fs, f0_ceil=500)                        # raw pitch contour extractor
    f0 = pw.stonemask(signal, _f0, t, fs)                           # pitch refinement
    spectra = pw.cheaptrick(signal, f0, t, fs, fft_size=fft_size)
    aperiodicity = pw.d4c(signal, f0, t, fs, fft_size=fft_size)     # extract aperiodicity
    return f0, spectra, aperiodicity
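# A hedged round-trip check for raw2WORLDfeatures() above: the three returned
# arrays can be fed straight back to pw.synthesize. The file name is a
# placeholder and the input is assumed to be a 16 kHz mono file.
import pyworld as pw
import soundfile as sf

signal, fs = sf.read('speech_16k.wav')    # hypothetical 16 kHz mono file; float64 by default
f0, spectra, aperiodicity = raw2WORLDfeatures(signal, fs=fs)
reconstructed = pw.synthesize(f0, spectra, aperiodicity, fs)    # default 5 ms frame period
sf.write('resynthesized.wav', reconstructed, fs)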
def save_features_to_array(path=Data_Directory):
    labels, _, _ = get_labels(path)
    print(labels)
    for label in labels:
        fundfreq_vectors = []
        ap_vectors = []
        mfcc_vectors = []
        sp_vectors = []
        wavfiles = [path + label + '/' + wavfile
                    for wavfile in sorted(os.listdir(path + '/' + label))]
        for wavfile in wavfiles:
            print(wavfile)
            x, fs = sf.read(wavfile)
            _f0, t = pw.dio(x, fs)
            f0 = pw.stonemask(x, _f0, t, fs)
            fundfreq_vectors.append(f0)
            sp = pw.cheaptrick(x, f0, t, fs)
            sp_vectors.append(sp)
            ap = pw.d4c(x, f0, t, fs)
            ap_vectors.append(ap)
            mfcc = wav2mfcc(wavfile, max_pad_len=120)
            mfcc_vectors.append(mfcc)
        # print(mfcc_vectors.shape)
        np.save('features/mfcc_' + label + '.npy', mfcc_vectors)
        np.save('features/fundfreq_' + label + '.npy', fundfreq_vectors)
def get_para(data, fs):
    # This function is the same as pw.wav2world.
    _fo, _time = pw.dio(data, fs)               # extract fundamental frequency (fo)
    fo = pw.stonemask(data, _fo, _time, fs)     # refine fo
    sp = pw.cheaptrick(data, fo, _time, fs)     # extract spectral envelope
    ap = pw.d4c(data, fo, _time, fs)            # extract aperiodicity
    return fo, sp, ap
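# As the comment above notes, pyworld ships this pipeline as a one-liner:
# wav2world runs dio + stonemask + cheaptrick + d4c with default options,
# so the call below should match get_para(). The file name is a placeholder.
import pyworld as pw
import soundfile as sf

data, fs = sf.read('example.wav')     # hypothetical input; soundfile returns float64
fo, sp, ap = pw.wav2world(data, fs)   # equivalent to get_para(data, fs)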
def collect_features(self, wav_path, label_path):
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(spectrogram, order=order, alpha=pysptk.util.mcepalpha(fs))
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")

    mgc = apply_delta_windows(mgc, windows)
    lf0 = apply_delta_windows(lf0, windows)
    bap = apply_delta_windows(bap, windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)

    return features.astype(np.float32)
def generate_changed_voice(model, input_path):
    fs, x = wavfile.read(input_path)
    x = x.astype(np.float64)
    if len(x.shape) > 1:
        x = x.mean(axis=1)

    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]

    mc = P.modspec_smoothing(mc, FS / HOP_LENGHT, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    gen_data = model.predict(mc)
    gen_data = np.hstack([c0.reshape((-1, 1)), gen_data])

    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    spectrogram = pysptk.mc2sp(
        gen_data.astype(np.float64), alpha=alpha, fftlen=fftlen)
    waveform = pyworld.synthesize(
        f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform
def wave2world(data):
    """
    Parameters
    ----------
    data : float64
        SamplingRate: 44100
        ValueRange  : [-1.0, 1.0]
        Shape: (input_size)

    Returns
    -------
    _f0 : float64
        Shape: (N)
    _cepstrum : float64
        Shape: (N, 64)
    _aperiodicity : float64
        Shape: (N, 513)

    NOTE: input_size is defined in config file.
          N is determined by input_size.
    """
    sampling_rate = 44100
    _f0, _t = pw.dio(data, sampling_rate, frame_period=10)
    _f0 = pw.stonemask(data, _f0, _t, sampling_rate)
    _cepstrum = pw.cheaptrick(data, _f0, _t, sampling_rate)
    _cepstrum = (np.log(_cepstrum) + 7) / 9
    _cepstrum = np.clip(_cepstrum, -1.0, 1.0)
    _aperiodicity = pw.d4c(data, _f0, _t, sampling_rate)
    return _f0, _cepstrum.astype(np.float32), _aperiodicity
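# A hedged sketch of undoing the normalisation in wave2world() above before
# resynthesis: the envelope was mapped with (log(sp) + 7) / 9 and clipped, so an
# approximate inverse is exp(9 * x - 7). pyworld.synthesize expects C-contiguous
# float64 arrays and the same 10 ms frame period used at analysis time.
import numpy as np
import pyworld as pw

def world2wave(f0, cepstrum, aperiodicity, sampling_rate=44100):
    envelope = np.exp(cepstrum.astype(np.float64) * 9 - 7)    # invert (log(sp) + 7) / 9
    envelope = np.ascontiguousarray(envelope)
    aperiodicity = np.ascontiguousarray(aperiodicity, dtype=np.float64)
    return pw.synthesize(f0, envelope, aperiodicity, sampling_rate, frame_period=10)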
def extract(wave: Wave, frame_period, f0_floor, f0_ceil, fft_length, order, alpha, dtype):
    x = wave.wave.astype(numpy.float64)
    fs = wave.sampling_rate

    f0, t = pyworld.harvest(
        x, fs,
        frame_period=frame_period,
        f0_floor=f0_floor,
        f0_ceil=f0_ceil,
    )
    f0 = pyworld.stonemask(x, f0, t, fs)
    sp = pyworld.cheaptrick(x, f0, t, fs, fft_size=fft_length)
    ap = pyworld.d4c(x, f0, t, fs, fft_size=fft_length)

    mc = pysptk.sp2mc(sp, order=order, alpha=alpha)
    coded_ap = pyworld.code_aperiodicity(ap, fs)
    voiced = ~(f0 == 0)  # type: numpy.ndarray

    feature = AcousticFeature(
        f0=f0[:, None],
        sp=sp,
        ap=ap,
        coded_ap=coded_ap,
        mc=mc,
        voiced=voiced[:, None],
    )
    feature = feature.astype_only_float(dtype)
    feature.validate()
    return feature
def get_target(x, fs, n_ap_channels, n_sp_channels):
    _f0, t = pw.dio(x, fs, f0_floor=120.0, f0_ceil=750.0, frame_period=8.0)
    f0_herz = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0_herz, t, fs)
    ap = pw.d4c(x, f0_herz, t, fs)
    # print(sp.shape)
    # plt.matshow(ap)
    # plt.show()

    ap = ap * 20 - 18
    arr = []
    for i in range(sp.shape[0]):
        arr.append(np.interp(np.linspace(0, 1025, n_ap_channels),
                             np.arange(1025), ap[i])[np.newaxis, :])
    _ap = np.concatenate(arr, axis=0)

    sp = np.log(sp)
    # plt.matshow(sp)
    # plt.show()
    arr = []
    for i in range(sp.shape[0]):
        arr.append(np.interp(np.linspace(0, 1025, n_sp_channels),
                             np.arange(1025), sp[i])[np.newaxis, :])
    _sp = np.concatenate(arr, axis=0)

    # mel = mel + 20.0
    # mel = np.where(mel > 0, mel, 0)
    # mel = mel / mel.max()
    # plt.matshow(mel)
    # plt.show()

    return _ap, _sp
def pre_process(file_name, training_dir):
    audio_file_name = training_dir + file_name + '.wav'
    lyrics_file_name = training_dir + 'Transcripts/' + file_name + '.txt'

    audio_data, sample_rate = soundfile.read(audio_file_name)
    audio_data = librosa.resample(audio_data, sample_rate, params.sample_rate)
    sample_rate = params.sample_rate

    harvest_frequency, timing = pyworld.harvest(
        audio_data, sample_rate,
        f0_floor=params.min_freq,
        f0_ceil=params.max_freq,
        frame_period=params.frame_period)
    frequency = pyworld.stonemask(audio_data, harvest_frequency, timing, sample_rate)

    audio_length = len(frequency)
    phoneme_data = extract_phoneme_data(
        [audio_file_name, lyrics_file_name, audio_length])
    frequency_data = process_frequency(frequency)
    label_data = pd.concat([phoneme_data, frequency_data], axis=1)

    spectral_data, aperiodic_data = extract_timbre_data(
        [audio_data, frequency, timing, sample_rate])

    return [spectral_data, aperiodic_data, label_data, frequency]
def wavfile2pw(filename, f0_ceil=F0_CEIL, fs=FS, fft_size=FFT_SIZE):
    """Speech analysis given the file name.

    We use PyWorld to extract features, following the practice in:
    https://github.com/JeremyCCHsu/vae-npvc

    NOTE: The spectrum is normalized by energy and transformed to log scale.
    To be discussed: after transforming to the log scale, the spectrum will be
    further normalized to be in the range of [-1, 1].

    Args:
        filename: the wav file
        f0_ceil: maximum f0. Note that we set the default to 500, while Praat
            suggests 250; this results in many small values at high frequencies,
            which are probably not learnable for a network
        fs: sampling frequency; librosa will handle the conversion from the
            original wav file's rate
        fft_size: fft size

    Returns:
        f0: the pitch / fundamental frequency
        sp: spectrogram
        ap: aperiodicity
        en: energy
    """
    x, _ = librosa.load(filename, sr=fs, mono=True, dtype=np.float64)
    _f0, t = pw.dio(x, fs, f0_ceil=f0_ceil)
    f0 = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs, fft_size=fft_size)
    ap = pw.d4c(x, f0, t, fs, fft_size=fft_size)
    en = np.sum(sp + EPSILON, axis=1, keepdims=True)
    sp = np.log10(sp / en)
    return f0, sp, ap, en
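# A hedged sketch of inverting the normalisation done by wavfile2pw() above:
# the stored sp is log10(sp / en), so the linear envelope is 10**sp * en, which
# (as contiguous float64) can go straight back into pw.synthesize. The default
# fs=16000 here is an assumption; it must match the FS constant used at analysis time.
import numpy as np
import pyworld as pw

def pw2wav(f0, sp, ap, en, fs=16000):
    sp_linear = np.ascontiguousarray(np.power(10.0, sp) * en, dtype=np.float64)
    ap = np.ascontiguousarray(ap, dtype=np.float64)
    return pw.synthesize(f0.astype(np.float64), sp_linear, ap, fs)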
def process_wav(wav_path):
    y, osr = sf.read(wav_path, subtype='PCM_16', channels=1, samplerate=48000,
                     endian='LITTLE')  # , start=56640, stop=262560)
    sr = 32000
    if osr != sr:
        y = librosa.resample(y, osr, sr)

    # Estimate the fundamental frequency (F0) with the Harvest algorithm
    _f0, t = pw.harvest(y, sr, f0_floor=f0_min, f0_ceil=f0_max,
                        frame_period=pw.default_frame_period)
    _f0 = pw.stonemask(y, _f0, t, sr)
    print(_f0.shape)

    # Estimate the spectral envelope with the CheapTrick algorithm
    _sp = pw.cheaptrick(y, _f0, t, sr)
    code_sp = code_harmonic(_sp, 60)
    print(_sp.shape, code_sp.shape)

    # Estimate the aperiodicity
    _ap = pw.d4c(y, _f0, t, sr)
    code_ap = pw.code_aperiodicity(_ap, sr)
    print(_ap.shape, code_ap.shape)

    return _f0, _sp, code_sp, _ap, code_ap
def pyworld_featurize(audiofile):
    fs, x = wav.read(audiofile)
    print(x)
    print(fs)

    # corrects for 2-channel audio
    try:
        x = x[:, 0]
    except:
        pass

    x = np.array(np.ascontiguousarray(x), dtype=np.double)
    print(fs)
    print(x)

    _f0, t = pw.dio(x, fs)              # raw pitch extractor
    f0 = pw.stonemask(x, _f0, t, fs)    # pitch refinement
    sp = pw.cheaptrick(x, f0, t, fs)    # extract smoothed spectrogram
    ap = pw.d4c(x, f0, t, fs)           # extract aperiodicity

    features_0, labels_0 = stats(_f0, 'pitch')
    features_1, labels_1 = stats(f0, 'pitch_refinement')    # the original passed _f0 here
    features_2, labels_2 = stats(sp, 'smoothed_spectrogram')
    features_3, labels_3 = stats(ap, 'aperiodicity')

    features_0 = list(features_0)
    features_1 = list(features_1)
    features_2 = list(features_2)
    features_3 = list(features_3)

    features = features_0 + features_1 + features_2 + features_3
    labels = labels_0 + labels_1 + labels_2 + labels_3

    return features, labels
def wav2pw(x, fs=16000, fft_size=FFT_SIZE):
    ''' Extract WORLD feature from waveform '''
    _f0, t = pw.dio(x, fs, f0_ceil=args.f0_ceil)        # raw pitch extractor
    f0 = pw.stonemask(x, _f0, t, fs)                    # pitch refinement
    sp = pw.cheaptrick(x, f0, t, fs, fft_size=fft_size)
    ap = pw.d4c(x, f0, t, fs, fft_size=fft_size)        # extract aperiodicity
    return {
        'f0': f0,
        'sp': sp,
        'ap': ap,
    }
def collect_features(self, wav_path, label_path):
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    if hp_acoustic.use_harvest:
        f0, timeaxis = pyworld.harvest(
            x, fs, frame_period=hp_acoustic.frame_period,
            f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
    else:
        f0, timeaxis = pyworld.dio(
            x, fs, frame_period=hp_acoustic.frame_period,
            f0_floor=hp_acoustic.f0_floor, f0_ceil=hp_acoustic.f0_ceil)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    if self.alpha is None:
        self.alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=hp_acoustic.order, alpha=self.alpha)
    f0 = f0[:, None]
    lf0 = f0.copy()
    nonzero_indices = np.nonzero(f0)
    lf0[nonzero_indices] = np.log(f0[nonzero_indices])
    if hp_acoustic.use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)

    # Parameter trajectory smoothing
    if hp_acoustic.mod_spec_smoothing:
        hop_length = int(fs * (hp_acoustic.frame_period * 0.001))
        modfs = fs / hop_length
        mgc = P.modspec_smoothing(
            mgc, modfs, cutoff=hp_acoustic.mod_spec_smoothing_cutoff)

    mgc = P.delta_features(mgc, hp_acoustic.windows)
    lf0 = P.delta_features(lf0, hp_acoustic.windows)
    bap = P.delta_features(bap, hp_acoustic.windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    indices = labels.silence_frame_indices()
    features = np.delete(features, indices, axis=0)

    return features.astype(np.float32)
def collect_features(self, wav_path):
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    spectrogram = P.trim_zeros_frames(spectrogram)
    if self.alpha is None:
        self.alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=self.alpha)
    # Drop 0-th coefficient
    mgc = mgc[:, 1:]
    # 50Hz cut-off MS smoothing
    hop_length = int(fs * (hp.frame_period * 0.001))
    modfs = fs / hop_length
    mgc = P.modspec_smoothing(mgc, modfs, cutoff=50)
    # Add delta
    mgc = P.delta_features(mgc, hp.windows)
    return mgc.astype(np.float32)
def __call__(self, data: Wave, test=None):
    x = data.wave.astype(numpy.float64)
    fs = data.sampling_rate

    if self._f0_estimating_method == 'dio':
        _f0, t = pyworld.dio(
            x, fs,
            frame_period=self._frame_period,
            f0_floor=self._f0_floor,
            f0_ceil=self._f0_ceil,
        )
    else:
        from world4py.np import apis
        _f0, t = apis.harvest(
            x, fs,
            frame_period=self._frame_period,
            f0_floor=self._f0_floor,
            f0_ceil=self._f0_ceil,
        )
    f0 = pyworld.stonemask(x, _f0, t, fs)

    spectrogram = pyworld.cheaptrick(x, f0, t, fs)
    aperiodicity = pyworld.d4c(x, f0, t, fs)

    mfcc = pysptk.sp2mc(spectrogram, order=self._order, alpha=self._alpha)
    voiced = ~(f0 == 0)  # type: numpy.ndarray

    feature = AcousticFeature(
        f0=f0[:, None].astype(self._dtype),
        spectrogram=spectrogram.astype(self._dtype),
        aperiodicity=aperiodicity.astype(self._dtype),
        mfcc=mfcc.astype(self._dtype),
        voiced=voiced[:, None],
    )
    feature.validate()
    return feature
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]

    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    lengths = [len(mc_scaled)]

    # Add batch axis
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # For MLPG
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)

    # Apply model
    if model.include_parameter_generation():
        # Case: models include parameter generation in itself
        # Multistream features cannot be used in this case
        y_hat, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(
        mc_static_pred, data_mean[:static_dim], data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs