def test_world_array_order():
    """Check that pyworld rejects strided (every-other-sample) array views.

    The analysis/synthesis chain is first run once on ``wav.data`` as loaded.
    Every pyworld entry point is then called with ``[::2]`` slices and must
    raise ``ValueError`` with the message ``'ndarray is not C-contiguous'``.
    """
    wav = kwiiyatta.load_wav(dataset.CLB_WAV)

    # Reference pass on the data as loaded; this must not raise.
    f0, timeaxis = pyworld.dio(wav.data, wav.fs)
    f0 = pyworld.stonemask(wav.data, f0, timeaxis, wav.fs)
    spec = pyworld.cheaptrick(wav.data, f0, timeaxis, wav.fs)
    ape = pyworld.d4c(wav.data, f0, timeaxis, wav.fs)
    pyworld.synthesize(f0, spec, ape, wav.fs)

    data = wav.data[::2]
    expected_msg = 'ndarray is not C-contiguous'

    # One thunk per pyworld call that is expected to reject strided input.
    strided_calls = (
        lambda: pyworld.dio(data, wav.fs),
        lambda: pyworld.stonemask(data, f0, timeaxis, wav.fs),
        lambda: pyworld.cheaptrick(data, f0, timeaxis, wav.fs),
        lambda: pyworld.d4c(data, f0, timeaxis, wav.fs),
        lambda: pyworld.synthesize(f0[::2], spec[::2], ape[::2], wav.fs),
    )
    for call in strided_calls:
        with pytest.raises(ValueError) as e:
            call()
        assert expected_msg == str(e.value)
def main(args):
    """Analyse and resynthesise one recording with WORLD, writing into ``test/``.

    The ``test`` directory is deleted and recreated on every run. The input
    file name is taken from ``edited_files["Thinking_Out_Loud"]``.

    Args:
        args: object providing ``frame_period`` and ``speed``; both are
            forwarded to DIO, and ``frame_period`` also to the synthesiser.
    """
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    source_path = "./{}.wav".format(edited_files["Thinking_Out_Loud"])
    x, fs = sf.read(source_path)

    # 1. The convenient way: one-call analysis with default options.
    #    These results are replaced by step 2-2 below.
    f0, sp, ap = pw.wav2world(x, fs)
    y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

    # 2. Step by step
    # 2-1 DIO without F0 refinement
    raw_f0, times = pw.dio(
        x, fs,
        f0_floor=50.0, f0_ceil=600.0,
        channels_in_octave=2,
        frame_period=args.frame_period,
        speed=args.speed,
    )
    raw_sp = pw.cheaptrick(x, raw_f0, times, fs)
    raw_ap = pw.d4c(x, raw_f0, times, fs)
    raw_y = pw.synthesize(raw_f0, raw_sp, raw_ap, fs, args.frame_period)
    sf.write('test/y_without_f0_refinement.wav', raw_y, fs)

    # 2-2 DIO with F0 refinement (StoneMask)
    f0 = pw.stonemask(x, raw_f0, times, fs)
    sp = pw.cheaptrick(x, f0, times, fs)
    ap = pw.d4c(x, f0, times, fs)
    y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (StoneMask)
    harvest_raw_f0, harvest_times = pw.harvest(x, fs)
    harvest_f0 = pw.stonemask(x, harvest_raw_f0, harvest_times, fs)
    harvest_sp = pw.cheaptrick(x, harvest_f0, harvest_times, fs)
    harvest_ap = pw.d4c(x, harvest_f0, harvest_times, fs)
    harvest_y = pw.synthesize(
        harvest_f0, harvest_sp, harvest_ap, fs, pw.default_frame_period)
    sf.write('test/y_harvest_with_f0_refinement.wav', harvest_y, fs)

    # Comparison plots of the unrefined vs. refined DIO results
    savefig('test/wavform.png', [x, raw_y, y])
    savefig('test/sp.png', [raw_sp, sp])
    savefig('test/ap.png', [raw_ap, ap], log=False)
    savefig('test/f0.png', [raw_f0, f0])

    print('Please check "test" directory for output files')
def main(args):
    """Run the WORLD demo on ``utterance/vaiueo2d.wav`` and write into ``test/``.

    Any existing ``test`` directory is removed first. Three analysis variants
    are synthesised and saved as WAV files, followed by comparison plots of
    the DIO results with and without StoneMask refinement.

    Args:
        args: object providing ``frame_period`` and ``speed``.
    """
    out_dir = 'test'
    if os.path.isdir(out_dir):
        rmtree(out_dir)
    os.mkdir(out_dir)

    x, fs = sf.read('utterance/vaiueo2d.wav')

    # 1. A convenient way: default options in a single call.
    #    (f0, sp, ap, y are recomputed in 2-2 below.)
    f0, sp, ap = pw.wav2world(x, fs)
    y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

    # 2. Step by step
    period = args.frame_period

    # 2-1 Without F0 refinement
    f0_dio, t_dio = pw.dio(x, fs, f0_floor=50.0, f0_ceil=600.0,
                           channels_in_octave=2,
                           frame_period=period,
                           speed=args.speed)
    sp_dio = pw.cheaptrick(x, f0_dio, t_dio, fs)
    ap_dio = pw.d4c(x, f0_dio, t_dio, fs)
    y_dio = pw.synthesize(f0_dio, sp_dio, ap_dio, fs, period)
    sf.write('test/y_without_f0_refinement.wav', y_dio, fs)

    # 2-2 DIO with F0 refinement (using StoneMask)
    f0 = pw.stonemask(x, f0_dio, t_dio, fs)
    sp = pw.cheaptrick(x, f0, t_dio, fs)
    ap = pw.d4c(x, f0, t_dio, fs)
    y = pw.synthesize(f0, sp, ap, fs, period)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (using StoneMask)
    f0_hv_raw, t_hv = pw.harvest(x, fs)
    f0_hv = pw.stonemask(x, f0_hv_raw, t_hv, fs)
    sp_hv = pw.cheaptrick(x, f0_hv, t_hv, fs)
    ap_hv = pw.d4c(x, f0_hv, t_hv, fs)
    y_hv = pw.synthesize(f0_hv, sp_hv, ap_hv, fs, pw.default_frame_period)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_hv, fs)

    # Comparison
    savefig('test/wavform.png', [x, y_dio, y])
    savefig('test/sp.png', [sp_dio, sp])
    savefig('test/ap.png', [ap_dio, ap], log=False)
    savefig('test/f0.png', [f0_dio, f0])

    print('Please check "test" directory for output files')
def extractparam(self):
    """Load the current audio file and store its WORLD parameters in globals.

    Reads the module-level ``audio_file_name`` and overwrites the globals
    ``x``, ``fs``, ``_f0``, ``_sp`` and ``_ap`` with the samples, sample rate,
    DIO F0 contour, CheapTrick spectral envelope and D4C aperiodicity.
    Prints ``done`` when finished.
    """
    global audio_file_name, x, fs, _f0, _sp, _ap

    x, fs = sf.read(audio_file_name)

    # F0 search range is restricted to 50-600 Hz.
    _f0, frame_times = pw.dio(x, fs, f0_ceil=600.0, f0_floor=50.0)
    _sp = pw.cheaptrick(x, _f0, frame_times, fs)
    _ap = pw.d4c(x, _f0, frame_times, fs)

    print("done")
def process_wav(wav_path):
    """Extract WORLD features from an audio file at a 32 kHz working rate.

    The file is opened with explicit format arguments (``PCM_16``, one
    channel, 48000 Hz, little-endian) and resampled to 32 kHz when the rate
    reported by the reader differs. The shape of each feature is printed.

    Args:
        wav_path: path of the audio file to read.

    Returns:
        Tuple ``(f0, sp, code_sp, ap, code_ap)``: refined F0, CheapTrick
        spectral envelope, its ``code_harmonic(sp, 60)`` encoding, D4C
        aperiodicity and its ``pw.code_aperiodicity`` encoding.
    """
    sr = 32000
    y, osr = sf.read(wav_path, subtype='PCM_16', channels=1,
                     samplerate=48000, endian='LITTLE')
    if osr != sr:
        # Sample rates are passed by keyword: recent librosa releases made
        # them keyword-only, and older releases accept the same names.
        y = librosa.resample(y, orig_sr=osr, target_sr=sr)

    # Estimate F0 with Harvest, then refine it with StoneMask.
    _f0, t = pw.harvest(y, sr, f0_floor=f0_min, f0_ceil=f0_max,
                        frame_period=pw.default_frame_period)
    _f0 = pw.stonemask(y, _f0, t, sr)
    print(_f0.shape)

    # Spectral envelope via CheapTrick, plus its coded form.
    _sp = pw.cheaptrick(y, _f0, t, sr)
    code_sp = code_harmonic(_sp, 60)
    print(_sp.shape, code_sp.shape)

    # Aperiodicity via D4C, plus its coded form.
    _ap = pw.d4c(y, _f0, t, sr)
    code_ap = pw.code_aperiodicity(_ap, sr)
    print(_ap.shape, code_ap.shape)

    return _f0, _sp, code_sp, _ap, code_ap
def pyworld_featurize(audiofile):
    """Compute summary statistics of WORLD features for one audio file.

    Four streams are analysed: the raw DIO pitch, the StoneMask-refined
    pitch, the CheapTrick spectrogram and the D4C aperiodicity. Each stream
    is summarised by ``stats(values, name)`` and the results are concatenated
    in that order. The samples and sample rate are printed for debugging.

    Args:
        audiofile: path passed to ``wav.read``.

    Returns:
        ``(features, labels)``: the concatenated feature list and the
        concatenated labels returned by ``stats``.
    """
    fs, x = wav.read(audiofile)
    print(x)
    print(fs)

    # Multi-channel audio: keep only the first channel.
    if x.ndim > 1:
        x = x[:, 0]

    # pyworld needs a C-contiguous double array (the channel slice is not).
    x = np.ascontiguousarray(x, dtype=np.double)
    print(fs)
    print(x)

    _f0, t = pw.dio(x, fs)             # raw pitch extractor
    f0 = pw.stonemask(x, _f0, t, fs)   # pitch refinement
    sp = pw.cheaptrick(x, f0, t, fs)   # extract smoothed spectrogram
    ap = pw.d4c(x, f0, t, fs)          # extract aperiodicity

    features_0, labels_0 = stats(_f0, 'pitch')
    # Use the refined contour here; the raw one is already covered above.
    features_1, labels_1 = stats(f0, 'pitch_refinement')
    features_2, labels_2 = stats(sp, 'smoothed_spectrogram')
    features_3, labels_3 = stats(ap, 'aperiodicity')

    features = (list(features_0) + list(features_1)
                + list(features_2) + list(features_3))
    labels = labels_0 + labels_1 + labels_2 + labels_3

    return features, labels
def analyze(self, x):
    """Analyze acoustic features with WORLD.

    Estimates F0 with Harvest, then the spectral envelope (CheapTrick) and
    aperiodicity (D4C) on the same time axis, using the instance settings
    ``fs``, ``minf0``, ``maxf0``, ``shiftms`` and ``fftl``.

    Parameters
    ----------
    x : array
        Monaural speech signal in the time domain.

    Returns
    -------
    f0 : array
        F0 sequence.
    spc : array
        Spectral envelope sequence.
    ap : array
        Aperiodicity sequence; asserted to have the same shape as ``spc``.
    """
    fs = self.fs
    f0, time_axis = pyworld.harvest(
        x, fs,
        f0_floor=self.minf0,
        f0_ceil=self.maxf0,
        frame_period=self.shiftms,
    )

    # Both estimators share the same FFT length.
    shared = dict(fft_size=self.fftl)
    spc = pyworld.cheaptrick(x, f0, time_axis, fs, **shared)
    ap = pyworld.d4c(x, f0, time_axis, fs, **shared)

    assert spc.shape == ap.shape
    return f0, spc, ap
def anonymization(fs, waveNDArray, f0Value = 0, sp_strechRatio = np.random.uniform(0.6, 2, size=1), gaussian_s = 3):
    """Create a waveform with speaker information removed from WAV samples.

    Used to build the "input" audio from the "label" audio. The signal is
    analysed with WORLD (DIO + StoneMask, CheapTrick, D4C) and resynthesised
    with F0 replaced by the constant ``f0Value`` on every frame.

    NOTE(review): the default ``sp_strechRatio`` is drawn once when the
    function is defined, not once per call.

    :param fs: sample rate handed to pyworld
    :param waveNDArray: audio samples (converted to float64 here)
    :param f0Value: constant F0 value used for every frame on synthesis
    :param sp_strechRatio: frequency-axis stretch ratio for the spectral
        envelope (a scalar or a size-1 array)
    :param gaussian_s: sigma of the Gaussian filter
    :return: the synthesised waveform
    """
    # np.float was removed in NumPy 1.24; it was an alias of the builtin
    # float, i.e. float64.
    waveNDArray = waveNDArray.astype(np.float64)
    _f0, t = pw.dio(waveNDArray, fs)               # extract fundamental frequency
    f0 = pw.stonemask(waveNDArray, _f0, t, fs)     # refine fundamental frequency
    sp = pw.cheaptrick(waveNDArray, f0, t, fs)     # extract spectral envelope
    ap = pw.d4c(waveNDArray, f0, t, fs)            # extract aperiodicity
    f0_fixed0 = np.ones(f0.shape) * f0Value
    sp_median = np.median(sp)
    ap_median = np.median(ap)

    # Stretch SP along the frequency axis. The ratio is converted to a plain
    # float once: int() on a size-1 array is deprecated in recent NumPy.
    stretch = float(np.asarray(sp_strechRatio).item())
    n_bins = sp.shape[1]
    sp2 = np.ones_like(sp) * np.min(sp)
    for f in range(n_bins):
        source_bin = int(f / stretch)
        if source_bin >= n_bins:
            break
        sp2[:, f] = sp[:, source_bin]

    # Add normally distributed noise to SP / AP
    sp_noised = sp2 + np.random.normal(sp_median, sp_median / 10, sp2.shape)
    ap_noised = ap + np.random.normal(ap_median, ap_median / 10, ap.shape)

    # Gaussian filter (scipy.ndimage.filters is a deprecated namespace)
    sp_gaussian = scipy.ndimage.gaussian_filter(sp_noised, gaussian_s)
    ap_gaussian = scipy.ndimage.gaussian_filter(ap_noised, gaussian_s)

    # Resynthesise.
    # NOTE(review): synthesis uses the unmodified sp/ap, so sp_gaussian and
    # ap_gaussian (and therefore sp_strechRatio / gaussian_s) do not affect
    # the result. Kept as in the original - confirm whether this is intended.
    synthesized = pw.synthesize(f0_fixed0, sp, ap, fs)
    return synthesized
def get_conversion_data(audiodata, fs, refine_f0):
    """Extract WORLD conversion features (sp, ap, f0) for each utterance.

    Builds the speaker A source dictionary without warping.

    :param audiodata: iterable of audio signals, analysed one by one
    :param fs: sample rate passed to pyworld and stored with each entry
    :param refine_f0: when truthy, the DIO estimate is refined with StoneMask
    :return: list with one dict per utterance holding the keys
        ``'sp'``, ``'ap'``, ``'f0'``, ``'fs'`` and ``'sr'``
    """
    logging.info("Start building speaker A dictionary: Extracting feature for conversion (sp, ap, f0)")

    def _analyse(audio):
        # Raw pitch from DIO, optionally refined by StoneMask.
        raw_f0, times = pw.dio(audio, fs)
        pitch = pw.stonemask(audio, raw_f0, times, fs) if refine_f0 else raw_f0
        envelope = pw.cheaptrick(audio, pitch, times, fs)   # smoothed spectrogram
        aperiodicity = pw.d4c(audio, pitch, times, fs)      # aperiodicity
        return {
            'sp': envelope,
            'ap': aperiodicity,
            'f0': pitch,
            'fs': fs,
            'sr': fs
        }

    return [_analyse(audio) for audio in tqdm(audiodata)]
def extract_spectrum(self, x, sample_rate):
    """Return ``(sp, ap, f0)`` WORLD features for ``x``.

    F0 comes from DIO with a 12.5 ms frame period, refined by StoneMask.
    The spectrogram (CheapTrick) and aperiodicity (D4C) use the refined F0.
    """
    signal = np.asarray(x)

    raw_f0, times = pw.dio(signal, sample_rate, frame_period=12.5)  # raw pitch
    f0 = pw.stonemask(signal, raw_f0, times, sample_rate)           # refinement

    # CheapTrick and D4C take the same positional arguments.
    analysis_args = (signal, f0, times, sample_rate)
    sp = pw.cheaptrick(*analysis_args)
    ap = pw.d4c(*analysis_args)
    return sp, ap, f0
def worldDecompose(
        wave: np.ndarray, fs: int = SAMPLE_RATE,
        frame_period: float = 5.) -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    '''
    Decompose speech into f0, spectral envelope and aperiodicity using WORLD.

    Parameters
    ----------
    wave: np.ndarray
        waveform data of the speech
    fs: int, default SAMPLE_RATE
        sampling frequency
    frame_period: float, default 5.
        time interval between frames

    Returns
    -------
    f0: np.ndarray
        fundamental frequency of each frame [Hz]
    sp: np.ndarray
        spectral envelope
    ap: np.ndarray
        aperiodicity
    '''
    samples = wave.astype(np.float64)

    # Harvest searches for F0 between 71 Hz and 800 Hz.
    harvest_options = dict(frame_period=frame_period, f0_floor=71., f0_ceil=800.)
    f0, frame_times = pyworld.harvest(samples, fs, **harvest_options)

    sp = pyworld.cheaptrick(samples, f0, frame_times, fs)
    ap = pyworld.d4c(samples, f0, frame_times, fs)
    return f0, sp, ap
def collect_features(x, fs):
    """Return mel-cepstrum features of ``x``.

    Pipeline: DIO F0 -> StoneMask refinement -> CheapTrick spectrogram ->
    ``pysptk.sp2mc``. Uses the module-level ``frame_period``, ``order`` and
    ``alpha`` settings.
    """
    samples = x.astype(np.float64)

    raw_f0, frame_times = pyworld.dio(samples, fs, frame_period=frame_period)
    refined_f0 = pyworld.stonemask(samples, raw_f0, frame_times, fs)
    envelope = pyworld.cheaptrick(samples, refined_f0, frame_times, fs)

    return pysptk.sp2mc(envelope, order=order, alpha=alpha)
def world_decompose(wav, fs, frame_period = 5.0):
    """Decompose a speech signal into f0, spectral envelope and aperiodicity.

    Uses WORLD: Harvest (F0 searched in 71-800 Hz), CheapTrick and D4C.

    Returns:
        ``(f0, timeaxis, sp, ap)``.
    """
    samples = wav.astype(np.float64)
    f0, timeaxis = pyworld.harvest(
        samples, fs,
        f0_floor=71.0,
        f0_ceil=800.0,
        frame_period=frame_period,
    )
    sp = pyworld.cheaptrick(samples, f0, timeaxis, fs)
    ap = pyworld.d4c(samples, f0, timeaxis, fs)
    return f0, timeaxis, sp, ap
def extract_sp_world(x, f0, sr, hoptime, fft_size=None):
    """Compute the WORLD CheapTrick spectral envelope in decibels.

    Args:
        x: signal passed to ``pyworld.cheaptrick``.
        f0: externally supplied F0 contour; squeezed and converted to a
            C-contiguous float64 array.
        sr: sample rate.
        hoptime: spacing between consecutive frames; frame ``i`` is placed at
            ``i * hoptime`` (assumed to be in seconds, as pyworld time axes
            are - TODO confirm with callers).
        fft_size: FFT size forwarded to CheapTrick; ``None`` lets pyworld
            pick its default.

    Returns:
        ``10 * log10`` of the CheapTrick output.

    Raises:
        ValueError: if the CheapTrick output contains non-finite values.
    """
    # NOTE:
    # F0 values too low for the FFT size in use are handled as unvoiced in
    # CheapTrick. A warning for such frames (based on
    # pyworld.get_cheaptrick_f0_floor) existed here but is currently disabled.
    f0 = f0.squeeze()
    f0 = np.ascontiguousarray(
        f0, dtype=np.float64)  # pyworld requires C-contiguous float64 array

    n_frames = f0.shape[0]
    # Build the time axis as float64 so an integer hoptime is accepted too.
    t = np.arange(n_frames, dtype=np.float64) * hoptime

    # Forward fft_size; it was previously accepted but silently ignored.
    sp = pyworld.cheaptrick(x, f0, t, sr, fft_size=fft_size)
    if not np.all(np.isfinite(sp)):
        # NOTE: This seems to happen occasionally, e.g. taking 16kHz TIMIT
        # audio, upsampling it to 32kHz and performing WORLD analysis.
        raise ValueError(
            'Configuration or input signal caused NaNs in WORLD CheapTrick analysis'
        )

    sp = 10 * np.log10(sp)  # power to decibels
    return sp
def world_spectrogram(wav, sr=_sr, dim_num=32, **kwargs):
    """Convert speech into WORLD vocoder features.

    Args:
        wav: audio samples (converted to double precision here).
        sr: sample rate.
        dim_num: number of dimensions of the coded spectral envelope.
        **kwargs: optional settings -
            ``frame_period`` (default ``pw.default_frame_period``),
            ``f0_floor`` (default ``pw.default_f0_floor``),
            ``f0_ceil`` (default ``pw.default_f0_ceil``),
            ``fft_size`` (default ``pw.get_cheaptrick_fft_size(sr, f0_floor)``),
            ``ap_threshold`` (default 0.85, passed to D4C as ``threshold``),
            ``f0_extractor`` (``"dio"``, ``"harvest"``, or a callable taking
            ``(x, sr, f0_floor=, f0_ceil=, frame_period=)`` and returning
            ``(f0, t)``).

    Returns:
        ``(f0, sp_enc, ap_enc)``: F0, coded spectral envelope and coded
        aperiodicity.
    """
    # Collect analysis parameters.
    frame_period = kwargs.get("frame_period", pw.default_frame_period)
    f0_floor = kwargs.get("f0_floor", pw.default_f0_floor)
    f0_ceil = kwargs.get("f0_ceil", pw.default_f0_ceil)
    fft_size = kwargs.get("fft_size", pw.get_cheaptrick_fft_size(sr, f0_floor))
    ap_threshold = kwargs.get("ap_threshold", 0.85)
    f0_extractor = kwargs.get("f0_extractor", "dio")

    x = wav.astype(np.double)

    # All three extractor branches receive the same options. frame_period
    # used to be dropped in the "dio" branch, so a caller-supplied value was
    # ignored there.
    f0_options = dict(f0_floor=f0_floor, f0_ceil=f0_ceil,
                      frame_period=frame_period)
    if f0_extractor == "dio":
        # Estimate F0 with the DIO algorithm.
        f0, t = pw.dio(x, sr, **f0_options)
    elif f0_extractor == "harvest":
        f0, t = pw.harvest(x, sr, **f0_options)
    else:
        f0, t = f0_extractor(x, sr, **f0_options)

    # Spectral envelope via CheapTrick.
    sp = pw.cheaptrick(x, f0, t, sr, f0_floor=f0_floor, fft_size=fft_size)
    # Reduce SP dimensionality.
    sp_enc = pw.code_spectral_envelope(sp, sr, number_of_dimensions=dim_num)

    # Aperiodicity via D4C.
    ap = pw.d4c(x, f0, t, sr, threshold=ap_threshold, fft_size=fft_size)
    # Reduce AP dimensionality.
    ap_enc = pw.code_aperiodicity(ap, sr)
    return f0, sp_enc, ap_enc
def synthesis(ori_path, aim_sp, aim_spkid):
    """Resynthesise ``ori_path`` with a converted spectral envelope.

    F0 (Harvest) and aperiodicity (D4C) are taken from the original
    recording. The spectral envelope is decoded from ``aim_sp``. The result
    is written to ``./convert_to_{aim_spkid}_test1.wav``.

    Args:
        ori_path: path of the source recording, loaded mono at ``hp.SR``.
        aim_sp: coded spectral envelope, decoded with
            ``pw.decode_spectral_envelope`` at ``fft_size=hp.N_FFT``.
        aim_spkid: target speaker id, used only in the output file name.
    """
    # Imported locally so the block does not depend on a file-level import.
    from scipy.io import wavfile

    print('synthesizing ...')
    wav, _ = librosa.load(ori_path, sr=hp.SR, mono=True, dtype=np.float64)
    f0, timeaxis = pw.harvest(wav, hp.SR)
    ap = pw.d4c(wav, f0, timeaxis, hp.SR, fft_size=hp.N_FFT)

    # Decoded (converted) spectral envelope.
    aim_decoded_sp = pw.decode_spectral_envelope(
        aim_sp, hp.SR, fft_size=hp.N_FFT)
    print('解码后的513维度的aim_decoded_sp = ')
    print(aim_decoded_sp.shape)
    # Debug dump of frame 399; skipped when the utterance is shorter than
    # that, instead of raising IndexError.
    if aim_decoded_sp.shape[0] > 399:
        print(aim_decoded_sp[399][:])

    synwav = pw.synthesize(f0, aim_decoded_sp, ap, hp.SR)
    print(f'synthesize done. path : ./convert_to_{aim_spkid}_test1.wav')

    # librosa.output.write_wav was removed in librosa 0.8; write the samples
    # with SciPy instead.
    wavfile.write(f'./convert_to_{aim_spkid}_test1.wav', hp.SR, synwav)
def collect_features(self, path):
    """Read a WAV file and return its mel-cepstrum.

    The spectrogram is CheapTrick on the unrefined DIO F0. ``frame_period``,
    ``order`` and ``alpha`` are module-level settings.
    """
    fs, samples = wavfile.read(path)
    samples = samples.astype(np.float64)

    pitch, frame_times = pyworld.dio(samples, fs, frame_period=frame_period)
    envelope = pyworld.cheaptrick(samples, pitch, frame_times, fs)

    return pysptk.sp2mc(envelope, order=order, alpha=alpha)
def cal_mcep(wav_ori, fs=SAMPLE_RATE, ispad=False, frame_period=0.005, dim=FEATURE_DIM, fft_size=FFTSIZE):
    '''Compute WORLD features and the coded spectral envelope of a signal.

    ``frame_period`` is used only by ``pad_wav_to_get_fixed_frames`` (when
    ``ispad`` is true); the WORLD analysis itself runs with pyworld defaults.

    Returns a dict with keys ``'f0'``, ``'ap'``, ``'sp'`` and ``'coded_sp'``;
    ``'coded_sp'`` is transposed to (dim, n).
    '''
    wav = wav_ori
    if ispad:
        # The helper also returns the pad length, which is not needed here.
        wav, _ = pad_wav_to_get_fixed_frames(
            wav_ori, frames=FRAMES, frame_period=frame_period, sr=fs)

    # Harvest F0 extraction algorithm.
    f0, timeaxis = pyworld.harvest(wav, fs)

    # CheapTrick spectral envelope and D4C aperiodicity, same FFT size.
    sp = pyworld.cheaptrick(wav, f0, timeaxis, fs, fft_size=fft_size)
    ap = pyworld.d4c(wav, f0, timeaxis, fs, fft_size=fft_size)

    # Feature reduction (n x dim), then transpose to dim x n.
    coded_sp = pyworld.code_spectral_envelope(sp, fs, dim).T

    return {
        'f0': f0,
        'ap': ap,
        'sp': sp,
        'coded_sp': coded_sp,
    }
def get_para(data, fs):
    """Return ``(fo, sp, ap)`` WORLD parameters of ``data``.

    Step-by-step analysis; the original author notes this is the same as
    ``wav2world``.
    """
    raw_fo, frame_times = pw.dio(data, fs)              # extract fundamental frequency
    fo = pw.stonemask(data, raw_fo, frame_times, fs)    # refine fundamental frequency

    # Spectral envelope and aperiodicity, both on the refined contour.
    sp, ap = (
        estimator(data, fo, frame_times, fs)
        for estimator in (pw.cheaptrick, pw.d4c)
    )
    return fo, sp, ap
def formant(self, val, f0_v):
    '''
    Change formant.

    The audio is analysed with WORLD (DIO, CheapTrick, D4C). The spectral
    envelope is stretched along the frequency axis by ``val`` and the F0
    contour is multiplied by ``f0_v``. The result is resynthesised and stored
    back into ``self.audio``.

    val : formant rate (output bin ``i`` is read from input bin ``int(i / val)``)
    f0_v: f0 rate

    Returns ``self``.
    '''
    f_rate = self.audio.frame_rate
    # pydub --> np.array(float64) conversion
    np_arr = np.array(self.audio.get_array_of_samples(), dtype=np.float64)

    _f0_val, _time = pyworld.dio(np_arr, f_rate)                 # fundamental frequency
    spct = pyworld.cheaptrick(np_arr, _f0_val, _time, f_rate)    # spectral envelope
    aper = pyworld.d4c(np_arr, _f0_val, _time, f_rate)           # aperiodicity

    # Bins whose source index falls past the last input bin (val < 1) keep
    # the envelope's minimum value; previously this raised IndexError.
    n_bins = spct.shape[1]
    floor = spct.min() if spct.size else 0.0
    spct_b = np.full_like(spct, floor)
    for i in range(n_bins):
        source_bin = int(i / val)
        if source_bin >= n_bins:
            break
        spct_b[:, i] = spct[:, source_bin]

    ef_audio = pyworld.synthesize(_f0_val * f0_v, spct_b, aper, f_rate)

    # Clip to the int16 range before casting so loud samples saturate
    # instead of wrapping around.
    pcm = np.iinfo(np.int16)
    ef_audio = np.clip(ef_audio, pcm.min, pcm.max).astype(np.int16).tobytes()

    new_audio = AudioSegment(
        ef_audio,
        sample_width=self.audio.sample_width,
        frame_rate=f_rate,
        channels=self.audio.channels,
    )
    self.audio = new_audio
    return self
def extract(cls, wave: Wave, frame_period, f0_floor, f0_ceil, fft_length, order, alpha, dtype):
    """Build an ``AcousticFeature`` from a ``Wave`` using WORLD analysis.

    F0 comes from ``cls.extract_f0``; the spectral envelope (CheapTrick) and
    aperiodicity (D4C) use ``fft_length`` as FFT size. The mel-cepstrum and
    coded aperiodicity are derived from those. When the number of samples is
    not a multiple of ``fft_length``, the last frame of every feature is
    dropped. Float fields are finally cast via ``astype_only_float(dtype)``.
    """
    samples = wave.wave.astype(numpy.float64)
    rate = wave.sampling_rate

    f0, times = cls.extract_f0(
        x=samples, fs=rate,
        frame_period=frame_period, f0_floor=f0_floor, f0_ceil=f0_ceil)
    sp = pyworld.cheaptrick(samples, f0, times, rate, fft_size=fft_length)
    ap = pyworld.d4c(samples, f0, times, rate, fft_size=fft_length)
    mc = pysptk.sp2mc(sp, order=order, alpha=alpha)
    coded_ap = pyworld.code_aperiodicity(ap, rate)
    voiced = numpy.logical_not(f0 == 0)

    has_partial_tail = len(samples) % fft_length > 0
    if has_partial_tail:
        # Drop the final frame of every per-frame array.
        f0, sp, ap, mc, coded_ap, voiced = [
            frames[:-1] for frames in (f0, sp, ap, mc, coded_ap, voiced)
        ]

    feature = AcousticFeature(
        f0=f0[:, None],
        sp=sp,
        ap=ap,
        coded_ap=coded_ap,
        mc=mc,
        voiced=voiced[:, None],
    )
    return feature.astype_only_float(dtype)
def wave2world(data):
    """Analyse ``data`` with WORLD at a fixed 44100 Hz sample rate.

    Parameters
    ----------
    data : float64 samples (documented upstream as value range [-1.0, 1.0]
        with a length defined in the config file).

    Returns
    -------
    _f0 : StoneMask-refined DIO F0 (10 ms frame period).
    _cepstrum : CheapTrick output mapped through ``(log(v) + 7) / 9``,
        clipped to [-1.0, 1.0] and cast to float32.
    _aperiodicity : D4C aperiodicity.
    """
    sampling_rate = 44100

    raw_f0, frame_times = pw.dio(data, sampling_rate, frame_period=10)
    _f0 = pw.stonemask(data, raw_f0, frame_times, sampling_rate)

    envelope = pw.cheaptrick(data, _f0, frame_times, sampling_rate)
    # Log-compress and rescale, then clamp to the unit range.
    scaled = np.clip((np.log(envelope) + 7) / 9, -1.0, 1.0)

    _aperiodicity = pw.d4c(data, _f0, frame_times, sampling_rate)
    return _f0, scaled.astype(np.float32), _aperiodicity
def world_features(wav, sr, fft_size, dim):
    """Return ``(f0, timeaxis, sp, ap, coded_sp)`` for ``wav``.

    Harvest supplies F0 and the time axis. CheapTrick and D4C run with the
    given ``fft_size``. ``coded_sp`` is the spectral envelope coded to
    ``dim`` dimensions.
    """
    f0, timeaxis = pyworld.harvest(wav, sr)

    frame_args = (wav, f0, timeaxis, sr)
    sp = pyworld.cheaptrick(*frame_args, fft_size=fft_size)
    ap = pyworld.d4c(*frame_args, fft_size=fft_size)

    coded_sp = pyworld.code_spectral_envelope(sp, sr, dim)
    return f0, timeaxis, sp, ap, coded_sp
def world_decompose(wav, fs):
    """Split ``wav`` into ``(f0, timeaxis, sp, ap)`` with WORLD.

    Harvest searches F0 in 71-800 Hz with ``hp.duration`` as frame period.
    """
    signal = wav.astype(np.float64)

    harvest_kwargs = {
        'f0_floor': 71.0,
        'f0_ceil': 800.0,
        'frame_period': hp.duration,
    }
    f0, timeaxis = pyworld.harvest(signal, fs, **harvest_kwargs)

    sp = pyworld.cheaptrick(signal, f0, timeaxis, fs)
    ap = pyworld.d4c(signal, f0, timeaxis, fs)
    return f0, timeaxis, sp, ap
def convertWavIntoF0seqMCEPseq(wav, fs, frame_period=5.0, MCEPdim=24):
    """
    Extract an F0 sequence and an MCEP sequence from a single waveform.

    Args:
        wav (np.ndarray): waveform
        fs: sample rate
        frame_period (float): [ms]
        MCEPdim (int): dimension of the Mel CEPstral analysis
    Returns:
        tuple: the Harvest F0 sequence, and the coded spectral envelope
        transposed and cast to float32
    """
    wav = wav.astype(np.float64)  # pyworld analysis runs on float64 samples

    f0seq, timeaxis = pyworld.harvest(
        wav, fs,
        f0_floor=71.0, f0_ceil=800.0,
        frame_period=frame_period)
    envelope = pyworld.cheaptrick(wav, f0seq, timeaxis, fs)
    MCEPseq = pyworld.code_spectral_envelope(envelope, fs, MCEPdim)

    print(
        f"F0&MCEP-nized! {wav.shape[0] / fs} [sec] wav => {f0seq.shape}, {MCEPseq.shape}"
    )
    return f0seq, MCEPseq.T.astype(np.float32)
def wavfile2pw(filename, f0_ceil=F0_CEIL, fs=FS, fft_size=FFT_SIZE):
    """Analyse a wav file with PyWorld.

    Follows the practice in https://github.com/JeremyCCHsu/vae-npvc.
    The spectrum is normalised by the per-frame energy and moved to the
    log10 scale.

    Args:
        filename: the wav file; librosa resamples it to ``fs`` on load
        f0_ceil: maximum f0 passed to DIO
        fs: sampling frequency
        fft_size: FFT size for CheapTrick and D4C
    Returns:
        f0: the StoneMask-refined pitch
        sp: log10 of the energy-normalised spectrogram
        ap: aperiodicity
        en: per-frame energy (sum of ``sp + EPSILON`` over the last axis,
            kept as a column)
    """
    samples, _ = librosa.load(filename, sr=fs, mono=True, dtype=np.float64)

    raw_f0, frame_times = pw.dio(samples, fs, f0_ceil=f0_ceil)
    f0 = pw.stonemask(samples, raw_f0, frame_times, fs)

    envelope = pw.cheaptrick(samples, f0, frame_times, fs, fft_size=fft_size)
    ap = pw.d4c(samples, f0, frame_times, fs, fft_size=fft_size)

    en = np.sum(envelope + EPSILON, axis=1, keepdims=True)
    sp = np.log10(envelope / en)
    return f0, sp, ap, en
def generate_changed_voice(model, input_path):
    """Convert the voice in ``input_path`` with ``model`` and return a waveform.

    The file is analysed with WORLD (DIO + StoneMask, CheapTrick, D4C). The
    mel-cepstrum without its 0-th coefficient is smoothed, extended with
    delta features and fed to ``model.predict``. The prediction, with the
    original 0-th coefficient put back, is converted to a spectrogram and
    synthesised with the original F0 and aperiodicity.
    """
    fs, samples = wavfile.read(input_path)
    samples = samples.astype(np.float64)
    if samples.ndim > 1:
        # Multi-channel input: average the channels.
        samples = samples.mean(axis=1)

    raw_f0, frame_times = pyworld.dio(samples, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(samples, raw_f0, frame_times, fs)
    envelope = pyworld.cheaptrick(samples, f0, frame_times, fs)
    aperiodicity = pyworld.d4c(samples, f0, frame_times, fs)

    alpha = pysptk.util.mcepalpha(fs)
    cepstrum = pysptk.sp2mc(envelope, order=hp.order, alpha=alpha)
    c0 = cepstrum[:, 0]
    model_input = cepstrum[:, 1:]
    model_input = P.modspec_smoothing(model_input, FS / HOP_LENGHT, cutoff=50)
    model_input = P.delta_features(model_input, hp.windows).astype(np.float32)

    predicted = model.predict(model_input)
    # Put the original 0-th coefficient back in front of the prediction.
    predicted = np.hstack([c0.reshape((-1, 1)), predicted])

    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    converted_envelope = pysptk.mc2sp(
        predicted.astype(np.float64), alpha=alpha, fftlen=fftlen)
    return pyworld.synthesize(
        f0, converted_envelope, aperiodicity, fs, hp.frame_period)
def collect_features(self, wav_path, label_path):
    """Build the acoustic feature matrix for one utterance.

    Columns are, in order: mel-generalised cepstrum, interpolated log-F0,
    voiced/unvoiced flag and band aperiodicity. The delta windows from
    ``windows`` are applied to all streams except the flag. Frames beyond the
    HTS label length and silence frames are removed.

    Returns:
        float32 feature matrix.
    """
    fs, samples = wavfile.read(wav_path)
    samples = samples.astype(np.float64)

    raw_f0, frame_times = pyworld.dio(samples, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(samples, raw_f0, frame_times, fs)
    envelope = pyworld.cheaptrick(samples, f0, frame_times, fs)
    aperiodicity = pyworld.d4c(samples, f0, frame_times, fs)

    bap = pyworld.code_aperiodicity(aperiodicity, fs)
    mgc = pysptk.sp2mc(envelope, order=order,
                       alpha=pysptk.util.mcepalpha(fs))

    # Log-F0 as a column; entries where F0 is zero stay at zero.
    f0 = f0[:, None]
    lf0 = f0.copy()
    voiced_mask = f0 != 0
    lf0[voiced_mask] = np.log(f0[voiced_mask])
    vuv = (lf0 != 0).astype(np.float32)
    lf0 = interp1d(lf0, kind="slinear")

    mgc, lf0, bap = (
        apply_delta_windows(stream, windows) for stream in (mgc, lf0, bap)
    )
    features = np.hstack((mgc, lf0, vuv, bap))

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    silence = labels.silence_frame_indices()
    features = np.delete(features, silence, axis=0)

    return features.astype(np.float32)
def raw2WORLDfeatures(signal, fs=16000, fft_size=1024):
    """Return ``(f0, spectra, aperiodicity)`` WORLD features of ``signal``.

    DIO (F0 ceiling 500) gives the raw pitch contour and StoneMask refines
    it. CheapTrick and D4C use ``fft_size``.
    """
    raw_contour, frame_times = pw.dio(signal, fs, f0_ceil=500)   # raw pitch contour
    f0 = pw.stonemask(signal, raw_contour, frame_times, fs)      # pitch refinement

    fft_options = {'fft_size': fft_size}
    spectra = pw.cheaptrick(signal, f0, frame_times, fs, **fft_options)
    aperiodicity = pw.d4c(signal, f0, frame_times, fs, **fft_options)
    return f0, spectra, aperiodicity
def get_target(x, fs, n_ap_channels, n_sp_channels, f0):
    """Build aperiodicity / spectral-envelope targets from an external F0.

    DIO (75-1000 Hz, 8 ms frames) supplies only the time axis and the
    voicing decision: frames where DIO reports F0 below 1.0 are set to zero
    in the supplied contour. CheapTrick and D4C then run on that contour.
    Both feature maps are resampled along the frequency axis by linear
    interpolation.

    Args:
        x: audio signal passed to pyworld.
        fs: sample rate.
        n_ap_channels: number of output channels for the aperiodicity.
        n_sp_channels: number of output channels for the log spectral envelope.
        f0: external F0 contour; only its first ``len(dio_f0)`` frames are
            used. It is not modified.

    Returns:
        ``(_ap, _sp, f0_herz)``: resampled ``ap * 20 - 18``, resampled
        ``log(sp)``, and the masked F0 contour.
    """
    _f0, t = pw.dio(x, fs, f0_floor=75.0, f0_ceil=1000.0, frame_period=8.0)

    # Work on a copy: a plain slice is a view, so the masking below would
    # otherwise write zeros into the caller's array.
    f0_herz = f0[:_f0.shape[0]].copy()
    f0_herz[_f0 < 1.0] = 0.0

    sp = pw.cheaptrick(x, f0_herz, t, fs)
    ap = pw.d4c(x, f0_herz, t, fs)

    ap = ap * 20 - 18
    _ap = _resample_bins(ap, n_ap_channels)

    sp = np.log(sp)
    _sp = _resample_bins(sp, n_sp_channels)

    return _ap, _sp, f0_herz


def _resample_bins(frames, n_channels):
    """Linearly interpolate every row of ``frames`` to ``n_channels`` points."""
    # Take the bin count from the data instead of assuming 1025 bins.
    n_bins = frames.shape[1]
    positions = np.linspace(0, n_bins, n_channels)
    bins = np.arange(n_bins)
    return np.stack([np.interp(positions, bins, frame) for frame in frames])
def extract(wave: Wave, frame_period, f0_floor, f0_ceil, fft_length, order, alpha, dtype):
    """Extract a validated ``AcousticFeature`` from ``wave`` using WORLD.

    F0 is estimated with Harvest and refined with StoneMask. CheapTrick and
    D4C use ``fft_length`` as FFT size. Float fields are cast with
    ``astype_only_float(dtype)`` before ``validate()`` is called.
    """
    samples = wave.wave.astype(numpy.float64)
    rate = wave.sampling_rate

    raw_f0, times = pyworld.harvest(
        samples,
        rate,
        frame_period=frame_period,
        f0_floor=f0_floor,
        f0_ceil=f0_ceil,
    )
    f0 = pyworld.stonemask(samples, raw_f0, times, rate)

    sp = pyworld.cheaptrick(samples, f0, times, rate, fft_size=fft_length)
    ap = pyworld.d4c(samples, f0, times, rate, fft_size=fft_length)

    # A frame counts as voiced when its F0 is not zero.
    voiced = numpy.logical_not(f0 == 0)

    feature = AcousticFeature(
        f0=f0[:, None],
        sp=sp,
        ap=ap,
        coded_ap=pyworld.code_aperiodicity(ap, rate),
        mc=pysptk.sp2mc(sp, order=order, alpha=alpha),
        voiced=voiced[:, None],
    ).astype_only_float(dtype)
    feature.validate()
    return feature
def world_decompose(wav, fs, frame_period = 5.0):
    """Return ``(f0, timeaxis, sp, ap)`` for a speech signal using WORLD.

    F0 is estimated by Harvest within 71-800 Hz. The spectral envelope
    (CheapTrick) and aperiodicity (D4C) follow on the same time axis.
    """
    wav = wav.astype(np.float64)

    f0, timeaxis = pyworld.harvest(
        wav, fs, frame_period=frame_period, f0_floor=71.0, f0_ceil=800.0)

    shared_args = (wav, f0, timeaxis, fs)
    sp = pyworld.cheaptrick(*shared_args)
    ap = pyworld.d4c(*shared_args)

    return f0, timeaxis, sp, ap
def wav2pw(x, fs=16000, fft_size=FFT_SIZE):
    '''Extract WORLD features from a waveform.

    The F0 ceiling comes from the module-level ``args.f0_ceil``.
    Returns a dict with keys ``'f0'``, ``'sp'`` and ``'ap'``.
    '''
    raw_f0, frame_times = pw.dio(x, fs, f0_ceil=args.f0_ceil)   # raw pitch extractor
    refined_f0 = pw.stonemask(x, raw_f0, frame_times, fs)       # pitch refinement

    features = {'f0': refined_f0}
    features['sp'] = pw.cheaptrick(x, refined_f0, frame_times, fs, fft_size=fft_size)
    features['ap'] = pw.d4c(x, refined_f0, frame_times, fs, fft_size=fft_size)  # aperiodicity
    return features
def collect_features(self, wav_path, label_path):
    """Build the acoustic feature matrix for one utterance.

    Output columns are mgc, lf0, vuv and bap, in that order. Delta features
    are added to mgc, lf0 and bap. All analysis options come from
    ``hp_acoustic``. Frames beyond the HTS label length and silence frames
    are removed.

    Returns:
        float32 feature matrix.
    """
    fs, x = wavfile.read(wav_path)
    x = x.astype(np.float64)

    use_harvest = hp_acoustic.use_harvest
    f0_options = dict(
        frame_period=hp_acoustic.frame_period,
        f0_floor=hp_acoustic.f0_floor,
        f0_ceil=hp_acoustic.f0_ceil,
    )
    if use_harvest:
        f0, timeaxis = pyworld.harvest(x, fs, **f0_options)
    else:
        # StoneMask refinement is applied to the DIO estimate only.
        f0, timeaxis = pyworld.dio(x, fs, **f0_options)
        f0 = pyworld.stonemask(x, f0, timeaxis, fs)

    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    bap = pyworld.code_aperiodicity(aperiodicity, fs)

    # The all-pass constant is derived from the first file's sample rate and
    # then cached on the instance.
    if self.alpha is None:
        self.alpha = pysptk.util.mcepalpha(fs)
    mgc = pysptk.sp2mc(spectrogram, order=hp_acoustic.order, alpha=self.alpha)

    # Log-F0 column; zero F0 entries stay zero until interpolation.
    f0 = f0[:, None]
    lf0 = f0.copy()
    voiced_mask = f0 != 0
    lf0[voiced_mask] = np.log(f0[voiced_mask])

    if use_harvest:
        # https://github.com/mmorise/World/issues/35#issuecomment-306521887
        vuv = (aperiodicity[:, 0] < 0.5).astype(np.float32)[:, None]
    else:
        vuv = (lf0 != 0).astype(np.float32)
    lf0 = P.interp1d(lf0, kind=hp_acoustic.f0_interpolation_kind)

    # Parameter trajectory smoothing
    if hp_acoustic.mod_spec_smoothing:
        hop_length = int(fs * (hp_acoustic.frame_period * 0.001))
        mgc = P.modspec_smoothing(
            mgc, fs / hop_length,
            cutoff=hp_acoustic.mod_spec_smoothing_cutoff)

    mgc = P.delta_features(mgc, hp_acoustic.windows)
    lf0 = P.delta_features(lf0, hp_acoustic.windows)
    bap = P.delta_features(bap, hp_acoustic.windows)

    features = np.hstack((mgc, lf0, vuv, bap))

    # Cut silence frames by HTS alignment
    labels = hts.load(label_path)
    features = features[:labels.num_frames()]
    silence = labels.silence_frame_indices()
    features = np.delete(features, silence, axis=0)

    return features.astype(np.float32)
def collect_features(self, wav_path):
    """Return smoothed mel-cepstrum features with deltas for one WAV file.

    Steps: DIO + StoneMask F0, CheapTrick spectrogram, trimming via
    ``P.trim_zeros_frames``, mel-cepstrum without its 0-th coefficient,
    50 Hz modulation-spectrum smoothing, delta features. Result is float32.
    """
    fs, samples = wavfile.read(wav_path)
    samples = samples.astype(np.float64)

    raw_f0, frame_times = pyworld.dio(samples, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(samples, raw_f0, frame_times, fs)
    envelope = P.trim_zeros_frames(
        pyworld.cheaptrick(samples, f0, frame_times, fs))

    # The all-pass constant is computed once and cached on the instance.
    if self.alpha is None:
        self.alpha = pysptk.util.mcepalpha(fs)

    # Mel-cepstrum with the 0-th coefficient dropped.
    mgc = pysptk.sp2mc(envelope, order=hp.order, alpha=self.alpha)[:, 1:]

    # 50Hz cut-off MS smoothing
    hop_length = int(fs * (hp.frame_period * 0.001))
    mgc = P.modspec_smoothing(mgc, fs / hop_length, cutoff=50)

    # Add delta
    return P.delta_features(mgc, hp.windows).astype(np.float32)
def __call__(self, data: Wave, test=None):
    """Convert a ``Wave`` into a validated ``AcousticFeature``.

    F0 is estimated with pyworld DIO when ``self._f0_estimating_method`` is
    ``'dio'``, otherwise with ``world4py``'s Harvest, and is refined with
    StoneMask in both cases. ``test`` is accepted but not used.
    """
    samples = data.wave.astype(numpy.float64)
    rate = data.sampling_rate

    if self._f0_estimating_method == 'dio':
        estimate_f0 = pyworld.dio
    else:
        # Imported lazily: only needed for the non-DIO path.
        from world4py.np import apis
        estimate_f0 = apis.harvest

    raw_f0, times = estimate_f0(
        samples,
        rate,
        frame_period=self._frame_period,
        f0_floor=self._f0_floor,
        f0_ceil=self._f0_ceil,
    )
    f0 = pyworld.stonemask(samples, raw_f0, times, rate)

    spectrogram = pyworld.cheaptrick(samples, f0, times, rate)
    aperiodicity = pyworld.d4c(samples, f0, times, rate)
    mfcc = pysptk.sp2mc(spectrogram, order=self._order, alpha=self._alpha)
    voiced = numpy.logical_not(f0 == 0)

    out_dtype = self._dtype
    feature = AcousticFeature(
        f0=f0[:, None].astype(out_dtype),
        spectrogram=spectrogram.astype(out_dtype),
        aperiodicity=aperiodicity.astype(out_dtype),
        mfcc=mfcc.astype(out_dtype),
        voiced=voiced[:, None],
    )
    feature.validate()
    return feature
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Run voice conversion on one waveform and synthesise the result.

    Args:
        model: conversion model; switched to eval mode here.
        x: source waveform samples.
        fs: sample rate.
        data_mean, data_std: statistics used to scale the model input and to
            de-scale the static part of its output.
        diffvc: when true, the predicted difference drives an MLSA filter on
            the source waveform; otherwise the predicted mel-cepstrum is
            converted to a spectrogram and synthesised with WORLD.

    Returns:
        ``(waveform, inputs, outputs)``: the synthesised waveform, the static
        input mel-cepstrum and the de-normalised static prediction.
    """
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))

    # WORLD analysis of the source.
    x = x.astype(np.float64)
    raw_f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, raw_f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    alpha = pysptk.util.mcepalpha(fs)

    # Mel-cepstrum, split into the power coefficient and the rest.
    full_mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0 = full_mc[:, 0]
    mc = full_mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]
    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = Variable(torch.from_numpy(P.scale(mc, data_mean, data_std)))
    lengths = [len(mc_scaled)]

    # Add batch axis
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # For MLPG
    R = torch.from_numpy(unit_variance_mlpg_matrix(hp.windows, T))

    # Apply model
    if model.include_parameter_generation():
        # Case: models include parameter generation in itself
        # Multistream features cannot be used in this case
        y_hat, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(
        mc_static_pred, data_mean[:static_dim], data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        # Filter the source with the predicted difference.
        differential = mc_static_pred - mc[:, :static_dim]
        mc = np.hstack((c0[:, None], differential))
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        # Vocoder synthesis from the predicted mel-cepstrum.
        mc = np.hstack((c0[:, None], mc_static_pred))
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs