def convertWavIntoF0seqMCEPseq(wav, fs, frame_period=5.0, MCEPdim=24):
    """Extract an F0 contour and a coded spectral envelope from one waveform.

    Args:
        wav (np.ndarray): waveform samples; cast to float64 before analysis
        fs: sampling rate handed to every WORLD call
        frame_period (float): frame shift in [ms] used by Harvest
        MCEPdim (int): number of dimensions of the coded spectral envelope
    Returns:
        tuple: ``(f0seq, MCEPseq)``. ``f0seq`` is the Harvest output;
            ``MCEPseq`` is the coded envelope transposed and cast to float32
            (presumably (MCEPdim, n_frames) -- confirm against callers).
    """
    # WORLD works on double-precision samples.
    wav = wav.astype(np.float64)

    # F0 range is fixed to 71-800 Hz for Harvest.
    f0seq, frame_times = pyworld.harvest(
        wav,
        fs,
        frame_period=frame_period,
        f0_floor=71.0,
        f0_ceil=800.0,
    )
    envelope = pyworld.cheaptrick(wav, f0seq, frame_times, fs)
    MCEPseq = pyworld.code_spectral_envelope(envelope, fs, MCEPdim)

    # The shape printed here is the one *before* the transpose below.
    print(
        f"F0&MCEP-nized! {wav.shape[0] / fs} [sec] wav => {f0seq.shape}, {MCEPseq.shape}"
    )
    transposed = MCEPseq.T
    return f0seq, transposed.astype(np.float32)
def world_spectrogram(wav, sr=_sr, dim_num=32, **kwargs):
    """Analyse speech with the WORLD vocoder and return coded features.

    Args:
        wav: waveform array; converted to double precision before analysis.
        sr: sampling rate (defaults to the module-level ``_sr``).
        dim_num: number of dimensions of the coded spectral envelope.
        **kwargs: optional analysis settings:
            frame_period: frame shift, default ``pw.default_frame_period``.
            f0_floor / f0_ceil: F0 search range, default pyworld's defaults.
            fft_size: FFT size for CheapTrick and D4C, default
                ``pw.get_cheaptrick_fft_size(sr, f0_floor)``.
            ap_threshold: D4C threshold, default 0.85.
            f0_extractor: ``"dio"`` (default), ``"harvest"``, or a callable
                invoked as ``f(x, sr, f0_floor=..., f0_ceil=...,
                frame_period=...)`` that returns ``(f0, t)``.

    Returns:
        tuple: ``(f0, sp_enc, ap_enc)`` -- F0 contour, coded spectral
        envelope and coded aperiodicity.
    """
    # Collect the analysis parameters.
    frame_period = kwargs.get("frame_period", pw.default_frame_period)
    f0_floor = kwargs.get("f0_floor", pw.default_f0_floor)
    f0_ceil = kwargs.get("f0_ceil", pw.default_f0_ceil)
    fft_size = kwargs.get("fft_size", pw.get_cheaptrick_fft_size(sr, f0_floor))
    ap_threshold = kwargs.get("ap_threshold", 0.85)
    f0_extractor = kwargs.get("f0_extractor", "dio")

    x = wav.astype(np.double)

    if f0_extractor == "dio":
        # Estimate F0 with DIO. frame_period is forwarded so that a
        # caller-supplied value is honoured here exactly as it is in the
        # harvest / callable branches (it used to be silently ignored).
        f0, t = pw.dio(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil,
                       frame_period=frame_period)
    elif f0_extractor == "harvest":
        f0, t = pw.harvest(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil,
                           frame_period=frame_period)
    else:
        # Any other value is treated as a user-provided F0 estimator.
        f0, t = f0_extractor(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil,
                             frame_period=frame_period)

    # Spectral envelope via CheapTrick.
    sp = pw.cheaptrick(x, f0, t, sr, f0_floor=f0_floor, fft_size=fft_size)
    # Reduce the dimensionality of the spectral envelope.
    sp_enc = pw.code_spectral_envelope(sp, sr, number_of_dimensions=dim_num)

    # Aperiodicity via D4C.
    ap = pw.d4c(x, f0, t, sr, threshold=ap_threshold, fft_size=fft_size)
    # Reduce the dimensionality of the aperiodicity.
    ap_enc = pw.code_aperiodicity(ap, sr)
    return f0, sp_enc, ap_enc
def cal_mcep(wav_ori, fs=SAMPLE_RATE, ispad=False, frame_period=0.005, dim=FEATURE_DIM, fft_size=FFTSIZE):
    '''Compute WORLD features (f0, ap, sp, coded sp) for a wav signal.

    ``frame_period`` is only forwarded to ``pad_wav_to_get_fixed_frames``;
    the WORLD calls below do not receive it.

    Returns a dict with the keys 'f0', 'ap', 'sp' and 'coded_sp', where
    'coded_sp' is the coded spectral envelope transposed (dim x n).
    '''
    wav = wav_ori
    if ispad:
        # The pad length reported by the helper is not needed here.
        wav, _ = pad_wav_to_get_fixed_frames(
            wav_ori, frames=FRAMES, frame_period=frame_period, sr=fs)

    # Harvest: F0 extraction.
    f0, frame_times = pyworld.harvest(wav, fs)
    # CheapTrick: harmonic spectral envelope.
    envelope = pyworld.cheaptrick(wav, f0, frame_times, fs, fft_size=fft_size)
    # D4C: aperiodicity.
    aperiodicity = pyworld.d4c(wav, f0, frame_times, fs, fft_size=fft_size)
    # Dimensionality reduction (n x dim), then transpose to dim x n.
    coded = pyworld.code_spectral_envelope(envelope, fs, dim).T

    return {
        'f0': f0,
        'ap': aperiodicity,
        'sp': envelope,
        'coded_sp': coded,
    }
def world_features(wav, sr, fft_size, dim):
    """Run the WORLD analysis chain on ``wav``.

    Returns ``(f0, timeaxis, sp, ap, coded_sp)``: Harvest F0 and its time
    axis, the CheapTrick spectral envelope, the D4C aperiodicity and the
    envelope coded down to ``dim`` dimensions.
    """
    f0, frame_times = pyworld.harvest(wav, sr)
    analysis_args = (wav, f0, frame_times, sr)
    envelope = pyworld.cheaptrick(*analysis_args, fft_size=fft_size)
    aperiodicity = pyworld.d4c(*analysis_args, fft_size=fft_size)
    coded = pyworld.code_spectral_envelope(envelope, sr, dim)
    return f0, frame_times, envelope, aperiodicity, coded
def world_encode_spectral_envelop(sp, fs, dim=24):
    """Code the spectral envelope ``sp`` into ``dim`` coefficients (MCEPs).

    Thin wrapper around ``pyworld.code_spectral_envelope``; ``sp`` is passed
    through unchanged (no dtype conversion is applied here).
    """
    return pyworld.code_spectral_envelope(sp, fs, dim)
def wav2mcep(filepath):
    '''
    Load a wav file and extract WORLD features from it.

    The audio is loaded with the module-level ``sampling_rate``, trimmed with
    ``librosa.effects.trim`` and analysed with Harvest, CheapTrick and D4C
    using ``fft_size=n_fft``.

    return:
        f0: float64, presumably shape [ T, ]
        ap: float64, presumably shape [ T, n_fft/2 + 1 ]
        sp: float64, presumably shape [ T, n_fft/2 + 1 ]
        mcep: float64 coded envelope, transposed (presumably [n_mels, T])
    '''
    y, sr = librosa.load(filepath, sr=sampling_rate)
    y, _ = librosa.effects.trim(y)
    y = np.asarray(y, dtype=np.double)

    # Use the rate reported by the loader for *every* WORLD call. The
    # original mixed ``sr`` and ``sampling_rate``; they are equal whenever
    # ``sampling_rate`` is set, but only ``sr`` is valid when it is None.
    f0, timeaxis = pyworld.harvest(y, sr)
    sp = pyworld.cheaptrick(y, f0, timeaxis, sr, fft_size=n_fft)
    ap = pyworld.d4c(y, f0, timeaxis, sr, fft_size=n_fft)
    # Coded envelope comes back frame-major; transpose to dim x n.
    mcep = pyworld.code_spectral_envelope(sp, sr, n_mels).T

    return (
        f0.astype(np.float64),
        ap.astype(np.float64),
        sp.astype(np.float64),
        mcep.astype(np.float64),
    )
def cal_mcep(wav_ori, fs, dim, fft_size):
    """Return ``(f0, ap, coded_sp)`` for a waveform using WORLD.

    ``coded_sp`` is the coded spectral envelope transposed (dim x n).
    """
    f0, frame_times = pyworld.harvest(wav_ori, fs)
    envelope = pyworld.cheaptrick(wav_ori, f0, frame_times, fs, fft_size=fft_size)
    aperiodicity = pyworld.d4c(wav_ori, f0, frame_times, fs, fft_size=fft_size)
    coded = pyworld.code_spectral_envelope(envelope, fs, dim)
    return f0, aperiodicity, coded.T
def world_encode_spectral_envelop(sp, fs, dim=24):
    """Return the ``dim``-dimensional coding (MCEPs) of spectral envelope ``sp``.

    Delegates directly to ``pyworld.code_spectral_envelope``; the input is
    not cast to another dtype first.
    """
    return pyworld.code_spectral_envelope(sp, fs, dim)
def world_features(wav, sr, fft_size, dim):
    """Extract WORLD features from ``wav``.

    Returns ``(f0, timeaxis, sp, ap, coded_sp)``.
    """
    # Fundamental frequency: F0 = 1 / T0, where T0 is the time between two
    # successive laryngeal pulses of a voiced signal.
    f0, frame_times = pyworld.harvest(wav, sr)

    # Smoothed spectrogram (spectral envelope).
    smoothed = pyworld.cheaptrick(wav, f0, frame_times, sr, fft_size=fft_size)

    # Aperiodicity: power ratio between the speech signal and its
    # aperiodic component.
    aperiodic = pyworld.d4c(wav, f0, frame_times, sr, fft_size=fft_size)

    reduced = pyworld.code_spectral_envelope(smoothed, sr, dim)
    return f0, frame_times, smoothed, aperiodic, reduced
def _process_utterance(out_dir, index, wav_path, text, phone):
    '''Preprocesses a single utterance audio/text pair.

    Writes the linear spectrogram plus either WORLD features or a mel
    spectrogram (depending on ``hparams.vocoder``) to disk and returns a
    tuple to write to the train.txt file.

    Args:
      out_dir: The directory to write the feature files into
      index: The numeric index to use in the feature filenames.
      wav_path: Path to the audio file containing the speech input
      text: The text spoken in the input audio file
      phone: Passed through unchanged as the last element of the result

    Returns:
      A (spectrogram_filename, encoded_filename, n_frames, text, phone) tuple
      to write to train.txt. ``n_frames`` is the number of rows of the WORLD
      feature matrix in "world" mode, otherwise the number of columns of the
      linear spectrogram.
    '''
    # Load the audio to a numpy array:
    wav = audio.load_wav(wav_path)

    if hparams.rescaling:
        peak = np.abs(wav).max()
        # A silent clip has a zero peak; dividing by it would turn the whole
        # signal into NaNs, so such clips are left untouched.
        if peak > 0:
            wav = wav / peak * hparams.rescaling_max

    # The linear-scale spectrogram is written in both vocoder modes.
    spectrogram = audio.spectrogram(wav).astype(np.float32)
    spectrogram_filename = 'synpaflex-spec-%05d.npy' % index

    if hparams.vocoder == "world":
        f0, sp, ap = pw.wav2world(wav.astype(np.double), hparams.sample_rate)
        ap_coded = pw.code_aperiodicity(ap, hparams.sample_rate)
        sp_coded = pw.code_spectral_envelope(sp, hparams.sample_rate, hparams.coded_env_dim)
        # One row per frame: [f0 | coded envelope | coded aperiodicity].
        encoded = np.hstack([f0[:, np.newaxis], sp_coded, ap_coded])
        n_frames = encoded.shape[0]
        encoded_filename = 'synpaflex-world-%05d.npy' % index
    else:
        n_frames = spectrogram.shape[1]
        # Mel-scale spectrogram, stored transposed like the linear one.
        encoded = audio.melspectrogram(wav).astype(np.float32).T
        encoded_filename = 'synpaflex-mel-%05d.npy' % index

    # Write the features to disk:
    np.save(os.path.join(out_dir, spectrogram_filename), spectrogram.T, allow_pickle=False)
    np.save(os.path.join(out_dir, encoded_filename), encoded, allow_pickle=False)

    # Return a tuple describing this training example:
    return (spectrogram_filename, encoded_filename, n_frames, text, phone)
def get_sp(fpath: str):
    """Load ``fpath`` and return its coded spectral envelope, transposed.

    The audio is loaded as mono float64 at ``hp.SR``; the envelope is
    estimated with Harvest + CheapTrick (``fft_size=hp.N_FFT``) and coded to
    ``hp.CODED_DIM`` dimensions.
    """
    # librosa.load returns the samples and the sampling rate; mono=True
    # down-mixes to a single channel.
    samples, _ = librosa.load(fpath, sr=hp.SR, mono=True, dtype=np.float64)
    f0, frame_times = pw.harvest(samples, hp.SR)

    # CheapTrick: harmonic spectral envelope estimation.
    envelope = pw.cheaptrick(samples, f0, frame_times, hp.SR, fft_size=hp.N_FFT)

    # Reduce the dimensionality of the envelope; the result is an ndarray.
    coded = pw.code_spectral_envelope(envelope, hp.SR, hp.CODED_DIM)

    # Transpose, then materialise as a fresh array.
    return np.array(coded.T)
def compute_world_cmvn(self, enable_load_from_disk, entries_person_wavs, sp_dim, fft_size, fs, speakers):
    """Compute per-speaker CMVN statistics of F0 and coded sp using pyworld.

    For every speaker the mean and variance of the coded spectral envelope
    (per coefficient) and of log-F0 are stored in ``self.cmvn_dict[speaker]``
    as ``(coded_sps_mean, coded_sps_var, log_f0s_mean, log_f0s_var)``.

    Args:
        enable_load_from_disk: when true, each entry is opened with
            ``np.load`` and must provide "f0" and "coded_sp"; otherwise the
            entry is decoded as audio and analysed with WORLD.
        entries_person_wavs: mapping from speaker to an iterable of files.
        sp_dim: number of dimensions of the coded spectral envelope.
        fft_size: FFT size passed to CheapTrick.
        fs: sampling rate used for loading and analysis.
        speakers: speakers to process.

    World Vocoder parameterizes speech into three components:
        Pitch (fundamental frequency, F0) contour
        Harmonic spectral envelope (sp)
        Aperiodic spectral envelope (relative to the harmonic one, ap)
    Refer to: https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder
    """
    start = time.time()
    for speaker in speakers:
        coded_sps, f0s = [], []
        for audio_file in entries_person_wavs[speaker]:
            # The audio is decoded only in the branch that needs it; an
            # unconditional load here used to decode every file twice and to
            # run the audio decoder on precomputed feature archives.
            if enable_load_from_disk:
                samples = np.load(audio_file)
                # NOTE(review): coded_sp from disk is assumed to already be
                # (sp_dim, frames), matching the transposed array below.
                f0, coded_sp = samples["f0"], samples["coded_sp"]
            else:
                wav, _ = librosa.load(audio_file, sr=fs, mono=True, dtype=np.float64)
                f0, timeaxis = pyworld.harvest(wav, fs)
                # CheapTrick harmonic spectral envelope estimation algorithm.
                sp = pyworld.cheaptrick(wav, f0, timeaxis, fs, fft_size=fft_size)
                # Feature reduction, transposed so frames run along axis 1.
                coded_sp = pyworld.code_spectral_envelope(sp, fs, sp_dim).T
            coded_sps.append(coded_sp)
            f0s.append(np.reshape(f0, [-1, 1]))

        # Calculate the mean and variance of the World features
        coded_sps_concatenated = np.concatenate(coded_sps, axis=1)
        coded_sps_mean = list(np.mean(coded_sps_concatenated, axis=1, keepdims=False))
        coded_sps_var = list(np.var(coded_sps_concatenated, axis=1, keepdims=False))

        # Masked log: frames with F0 == 0 are masked out of the statistics.
        log_f0s_concatenated = np.ma.log(np.concatenate(f0s))
        log_f0s_mean = log_f0s_concatenated.mean()
        log_f0s_var = log_f0s_concatenated.var()

        self.cmvn_dict[speaker] = (coded_sps_mean, coded_sps_var,
                                   log_f0s_mean, log_f0s_var)

    logging.info("finished compute cmvn, which cost %.4f s", time.time() - start)
def world_features(wav, sr, fft_size, dim):
    """Extract WORLD features: ``(f0, timeaxis, sp, ap, coded_sp)``."""
    # Harvest extracts the fundamental frequency F0 from the samples and the
    # sampling rate; it returns F0 and the time position of every frame.
    f0, frame_times = pyworld.harvest(wav, sr)
    shared = dict(fft_size=fft_size)

    # CheapTrick estimates the harmonic spectral envelope from the samples,
    # F0, frame times, sampling rate and FFT size.
    envelope = pyworld.cheaptrick(wav, f0, frame_times, sr, **shared)

    # D4C estimates the aperiodicity from the same inputs.
    aperiodicity = pyworld.d4c(wav, f0, frame_times, sr, **shared)

    # Reduce the dimensionality of the spectral envelope to ``dim``.
    coded = pyworld.code_spectral_envelope(envelope, sr, dim)

    # F0, frame times, envelope, aperiodicity, coded envelope.
    return f0, frame_times, envelope, aperiodicity, coded
def wav2coded_sp(wav_file, dim=35):
    """Read a wav file and return its coded spectral envelope and friends.

    Args:
        wav_file: path (or file object) accepted by ``sf.read``.
        dim: number of dimensions of the coded spectral envelope. Defaults
            to 35, the value that used to be hard-coded.

    Returns:
        tuple: ``(coded_sp, f0_h, ap_h, x)`` -- coded envelope, StoneMask
        refined Harvest F0, D4C aperiodicity and the samples as read.
    """
    # Load wav.
    x, fs = sf.read(wav_file)

    # Harvest with F0 refinement (using StoneMask).
    _f0_h, t_h = pw.harvest(x, fs)
    f0_h = pw.stonemask(x, _f0_h, t_h, fs)

    # Harmonic spectral envelope and aperiodicity.
    sp_h = pw.cheaptrick(x, f0_h, t_h, fs)
    ap_h = pw.d4c(x, f0_h, t_h, fs)

    # Get Mel-cepstral coefficients (MCEPs).
    coded_sp = pw.code_spectral_envelope(sp_h, fs, dim)
    return coded_sp, f0_h, ap_h, x
def _encode_sp(sp, sr, mel_bins):
    """Convert a spectral envelope to a mel spectral envelope.

    Args:
        sp: Spectral envelope.
        sr: Sampling rate (required; this function defines no default).
        mel_bins: The number of mel bins (required; no default).

    Returns:
        The coded (mel) spectral envelope returned by
        ``pw.code_spectral_envelope``.
    """
    return pw.code_spectral_envelope(sp, sr, mel_bins)
def convertWavIntoFeatures(wav, fs, frame_period=5.0, MCEPdim=24):
    """Extract F0, coded spectral envelope and aperiodicity from ``wav``.

    Returns ``(f0seq, MCEPseq, APseq)`` where ``MCEPseq`` is the coded
    envelope transposed and cast to float32; ``APseq`` is the D4C output.
    """
    # WORLD expects float64 samples.
    samples = wav.astype(np.float64)

    f0seq, frame_times = pyworld.harvest(
        samples, fs, frame_period=frame_period, f0_floor=71.0, f0_ceil=800.0
    )
    envelope = pyworld.cheaptrick(samples, f0seq, frame_times, fs)
    coded = pyworld.code_spectral_envelope(envelope, fs, MCEPdim)
    aperiodicity = pyworld.d4c(samples, f0seq, frame_times, fs)

    return f0seq, coded.T.astype(np.float32), aperiodicity
def world_encode_spectral_envelop(sp, fs, dim=24):
    '''
    SP -> MCEPs: get Mel-cepstral coefficients (MCEPs).

    :param sp: spectral envelope, passed through without a dtype cast
    :param fs: sampling rate
    :param dim: number of coded dimensions
    :return: result of ``pyworld.code_spectral_envelope(sp, fs, dim)``
    '''
    return pyworld.code_spectral_envelope(sp, fs, dim)
def world_feature_extract(self, wav):
    """Extract WORLD features from ``wav`` and return them as float32 tensors.

    World Vocoder parameterizes speech into three components:
        Pitch (fundamental frequency, F0) contour
        Harmonic spectral envelope (sp)
        Aperiodic spectral envelope (relative to the harmonic one, ap)
    Refer to: https://github.com/JeremyCCHsu/Python-Wrapper-for-World-Vocoder

    Returns ``(f0, sp, ap, coded_sp)``, each converted with
    ``tf.convert_to_tensor(..., dtype=tf.float32)``.
    """
    f0, frame_times = pyworld.harvest(wav, self.fs)
    envelope = pyworld.cheaptrick(wav, f0, frame_times, self.fs, fft_size=self.fft_size)
    aperiodicity = pyworld.d4c(wav, f0, frame_times, self.fs, fft_size=self.fft_size)
    coded = pyworld.code_spectral_envelope(envelope, self.fs, self.sp_dim)

    features = (f0, envelope, aperiodicity, coded)
    return tuple(
        tf.convert_to_tensor(feature, dtype=tf.float32) for feature in features
    )
def worldEncodeSpectralEnvelop(sp: np.ndarray, fs: int = SAMPLE_RATE, dim: int = 36) -> np.ndarray:
    '''
    Build MCEPs from a spectral envelope.

    Parameters
    ----------
    sp: np.ndarray
        Spectral envelope data
    fs: int, default SAMPLE_RATE
        Sampling frequency
    dim: int, default 36
        Number of dimensions of the coded envelope

    Returns
    -------
    code_spectral_envelope: np.ndarray
        MCEPs of the spectral envelope
    '''
    return pyworld.code_spectral_envelope(sp, fs, dim)
def process_wav(wav_path):
    """Analyse a raw PCM file with WORLD, save features and a shifted resynthesis.

    The file is read as headerless 16-bit little-endian mono PCM at 48 kHz,
    resampled to 32 kHz and analysed with DIO, CheapTrick and D4C. F0 and the
    coded aperiodicity are saved under ``data/prepared_data``; a waveform
    synthesised with F0 lowered by 200 is written to
    ``./data/gen_wav/test-200.wav``.
    """
    y, osr = sf.read(wav_path, subtype='PCM_16', channels=1, samplerate=48000,
                     endian='LITTLE')  # , start=56640, stop=262560)
    sr = 32000
    # Keyword arguments: librosa >= 0.10 rejects positional sampling rates,
    # while older releases accept the same keyword names.
    y = librosa.resample(y, orig_sr=osr, target_sr=sr)

    # Estimate the fundamental frequency F0 with DIO.
    _f0, t = pw.dio(y, sr, f0_floor=50.0, f0_ceil=800.0, channels_in_octave=2,
                    frame_period=pw.default_frame_period)
    print(_f0.shape)

    # Estimate the spectral envelope with CheapTrick.
    _sp = pw.cheaptrick(y, _f0, t, sr)
    code_sp = pw.code_spectral_envelope(_sp, sr, 60)
    print(_sp.shape, code_sp.shape)

    # Estimate the aperiodicity.
    _ap = pw.d4c(y, _f0, t, sr)
    code_ap = pw.code_aperiodicity(_ap, sr)
    print(_ap.shape, code_ap.shape)

    np.save('data/prepared_data/f0', _f0)
    np.save('data/prepared_data/ap', code_ap)

    # Resynthesise with every F0 value lowered by 200.
    # NOTE(review): unvoiced frames (F0 == 0) become -200 here -- confirm
    # this is what the experiment intends.
    synthesized = pw.synthesize(_f0 - 200, _sp, _ap, sr, pw.default_frame_period)
    # Write the synthesised speech at the analysis rate.
    sf.write('./data/gen_wav/test-200.wav', synthesized, sr)
def main(args):
    """Run the WORLD analysis/synthesis demo and write results into ``test/``.

    Reads ``utterance/p226_002.wav``, resynthesises it with several
    analysis variants (DIO with and without StoneMask, Harvest with
    StoneMask, coded-and-restored features at two frame periods), writes one
    wav per variant and four comparison figures through ``savefig``.

    Args:
        args: object providing ``frame_period`` and ``speed``, both forwarded
            to ``pw.dio`` (``frame_period`` also to ``pw.synthesize``).
    """
    # Start from an empty output directory.
    if os.path.isdir('test'):
        rmtree('test')
    os.mkdir('test')

    # Alternative input: 'utterance/vaiueo2d.wav' (via sf.read or librosa.load).
    x, fs = sf.read('utterance/p226_002.wav')

    # 1. A convenient way
    f0, sp, ap = pw.wav2world(x, fs)    # use default options
    # NOTE: f0/sp/ap/y from this section are rebound in 2-2 before any use.
    y = pw.synthesize(f0, sp, ap, fs, pw.default_frame_period)

    # 2. Step by step
    # 2-1 Without F0 refinement
    _f0, t = pw.dio(x, fs, f0_floor=50.0, f0_ceil=600.0,
                    channels_in_octave=2,
                    frame_period=args.frame_period,
                    speed=args.speed)
    _sp = pw.cheaptrick(x, _f0, t, fs)
    _ap = pw.d4c(x, _f0, t, fs)
    _y = pw.synthesize(_f0, _sp, _ap, fs, args.frame_period)
    sf.write('test/y_without_f0_refinement.wav', _y, fs)

    # 2-2 DIO with F0 refinement (using Stonemask)
    f0 = pw.stonemask(x, _f0, t, fs)
    sp = pw.cheaptrick(x, f0, t, fs)
    ap = pw.d4c(x, f0, t, fs)
    y = pw.synthesize(f0, sp, ap, fs, args.frame_period)
    sf.write('test/y_with_f0_refinement.wav', y, fs)

    # 2-3 Harvest with F0 refinement (using Stonemask)
    _f0_h, t_h = pw.harvest(x, fs)
    f0_h = pw.stonemask(x, _f0_h, t_h, fs)
    sp_h = pw.cheaptrick(x, f0_h, t_h, fs)
    ap_h = pw.d4c(x, f0_h, t_h, fs)
    y_h = pw.synthesize(f0_h, sp_h, ap_h, fs, pw.default_frame_period)
    sf.write('test/y_harvest_with_f0_refinement.wav', y_h, fs)

    # 2-4 DIO with F0 refinement (using Stonemask). Code and restore sp, ap.
    # Uses f0/sp/ap from section 2-2.
    code_sp = pw.code_spectral_envelope(sp, fs, 80)
    code_ap = pw.code_aperiodicity(ap, fs)
    # Recover the FFT size from the number of envelope bins.
    fft_size = (sp.shape[1] - 1) * 2
    rest_sp = pw.decode_spectral_envelope(code_sp, fs, fft_size)
    rest_ap = pw.decode_aperiodicity(code_ap, fs, fft_size)
    y_r = pw.synthesize(f0, rest_sp, rest_ap, fs, args.frame_period)
    sf.write('test/y_with_f0_refinement_code_and_restore.wav', y_r, fs)
    print("fft size: {:d}".format(fft_size))
    print("coded sp shape: ({:d}, {:d})".format(code_sp.shape[0], code_sp.shape[1]))
    print("coded ap shape: ({:d}, {:d})".format(code_ap.shape[0], code_ap.shape[1]))

    # 2-5 DIO with F0 refinement (using Stonemask). Code and restore sp, ap.
    # frame_shift: 12.5 ms, frame_length: 50.0 ms
    f0_xx, t_xx = pw.dio(x, fs, f0_floor=50.0, f0_ceil=600.0,
                         channels_in_octave=2,
                         frame_period=12.5,
                         speed=args.speed)
    f0_xx = pw.stonemask(x, f0_xx, t_xx, fs)
    sp_xx = pw.cheaptrick(x, f0_xx, t_xx, fs)
    ap_xx = pw.d4c(x, f0_xx, t_xx, fs)
    code_sp_xx = pw.code_spectral_envelope(sp_xx, fs, 80)
    code_ap_xx = pw.code_aperiodicity(ap_xx, fs)
    fft_size = (sp_xx.shape[1] - 1) * 2
    rest_sp_xx = pw.decode_spectral_envelope(code_sp_xx, fs, fft_size)
    rest_ap_xx = pw.decode_aperiodicity(code_ap_xx, fs, fft_size)
    y_r_xx = pw.synthesize(f0_xx, rest_sp_xx, rest_ap_xx, fs, 12.5)
    sf.write(
        'test/y_with_f0_refinement_code_and_restore_frame_period_12.5.wav',
        y_r_xx, fs)
    print("coded sp_xx shape: ({:d}, {:d})".format(code_sp_xx.shape[0], code_sp_xx.shape[1]))
    print("coded ap_xx shape: ({:d}, {:d})".format(code_ap_xx.shape[0], code_ap_xx.shape[1]))

    # Comparison
    savefig('test/wavform.png', [x, _y, y, y_h, y_r, y_r_xx])
    savefig('test/sp.png', [_sp, sp, sp_h, rest_sp, rest_sp_xx])
    savefig('test/ap.png', [_ap, ap, ap_h, rest_ap, rest_ap_xx], log=False)
    savefig('test/f0.png', [_f0, f0, f0_h, f0_xx])

    print('Please check "test" directory for output files')
def encode_spectral_envelop(spect, sampling_rate, dim=24):
    """Code the spectral envelope ``spect`` down to ``dim`` dimensions.

    Thin wrapper around ``pyworld.code_spectral_envelope``.
    """
    return pyworld.code_spectral_envelope(spect, sampling_rate, dim)
#f0 = pw.stonemask(x, _f0, t, fs) # pitch refinement f0, t = pw.harvest(x, fs) sp = pw.cheaptrick(x, f0, t, fs) # extract smoothed spectrogram ap = pw.d4c(x, f0, t, fs) # extract aperiodicity end = timer() print('Feature Extraction:', end - start, 'seconds') # f0_new from copy import deepcopy # to avoid call by reference!! f0_new = deepcopy(f0) # 1-58 59-138 139-198 // 269-360 // 429-522 f0_new[1:198] = np.flip(f0_new[1:198], 0) # reverse pitch f0_new[269:360] = f0_new[269:360] + 62 #E(330hz) -> G (392hz) f0_new[429:522] = f0_new[429:522] + 193 #E(330hz) -> G(523hz) #%% reduce dimension of spectral envelope and aperiodicity. enc_sp = pw.code_spectral_envelope(sp, fs, number_of_dimensions=32) dec_sp = pw.decode_spectral_envelope(enc_sp, fs, fft_size=(sp.shape[1] - 1) * 2) enc_ap = pw.code_aperiodicity(ap, fs) dec_ap = pw.decode_aperiodicity(enc_ap, fs, fft_size=(ap.shape[1] - 1) * 2) #%% y = pw.synthesize(f0, sp, ap, fs) librosa.output.write_wav('y_EyesNose_short_resynthesis.wav', y, fs) #%% y = pw.synthesize(f0, dec_sp, ap, fs) librosa.output.write_wav('y_EyesNose_short_resynthesis_sp_decode_32.wav', y, fs)
def world_encode_spectral_envelop(sp, fs, dim=24):
    """Reduce the dimensionality of the spectral envelope ``sp``.

    The coded envelope has ``dim`` dimensions (24 by default).
    """
    return pyworld.code_spectral_envelope(sp, fs, dim)
# Extract WORLD features for every matching file under source_dir and save
# the requested feature types next to the source file as text.
for file in source_dir.glob(f"**/*.{extension}"):
    print(f"{file}")
    cnt += 1
    N = None

    fs, signal = load_function(str(file))
    # np.float was only an alias of the builtin float (i.e. float64) and was
    # removed in NumPy 1.24; spell the dtype out explicitly.
    signal = signal.astype(np.float64)
    # If a sample rate is specified use it, otherwise keep the file's own.
    samplerate = args.samplerate or fs
    signal = downsampling(signal, fs, samplerate)  # fs -> samplerate
    signal = convert2mono(signal)

    # Fundamental frequency extraction; args.frame_period is multiplied by
    # 1000 (presumably seconds -> the milliseconds pyworld expects).
    _f0, t = pw.dio(signal, samplerate, frame_period=args.frame_period*1000)
    # Alternative extractor:
    # _f0, t = pw.harvest(signal, samplerate, frame_period=args.frame_period*1000)
    f0 = pw.stonemask(signal, _f0, t, samplerate)  # F0 refinement
    sp = pw.cheaptrick(signal, f0, t, samplerate, fft_size=args.fftsize)  # spectral envelope
    ap = pw.d4c(signal, f0, t, samplerate, fft_size=args.fftsize)  # aperiodicity
    mcep = pw.code_spectral_envelope(sp, samplerate, args.nfilt)  # mel-cepstrum
    N = mcep.shape[0]

    if "spenv" in feature_type:
        np.savetxt(file.with_suffix(".spenv"), sp)
    if "mcep" in feature_type:
        np.savetxt(file.with_suffix(".mcep"), mcep)
    if "f0" in feature_type:
        np.savetxt(file.with_suffix(".f0"), f0)
    if "ap" in feature_type:
        np.savetxt(file.with_suffix(".ap"), ap)
def world_encode_spectral_envelop(sp, fs, dim=24):
    """Get Mel-Cepstral coefficients (MCEPs) from the spectral envelope ``sp``.

    Returns the result of ``pyworld.code_spectral_envelope(sp, fs, dim)``.
    """
    return pyworld.code_spectral_envelope(sp, fs, dim)
def get_sp(fpath: str):
    """Load ``fpath`` and return its coded spectral envelope (not transposed).

    Audio is loaded as mono float64 at ``hp.SR``; Harvest runs with a frame
    period of 10 and CheapTrick with ``fft_size=hp.N_FFT``. The envelope is
    coded to ``hp.CODED_DIM`` dimensions and returned as a new array.
    """
    samples, _ = librosa.load(fpath, sr=hp.SR, mono=True, dtype=np.float64)
    f0, frame_times = pw.harvest(samples, hp.SR, frame_period=10)
    envelope = pw.cheaptrick(samples, f0, frame_times, hp.SR, fft_size=hp.N_FFT)
    return np.array(pw.code_spectral_envelope(envelope, hp.SR, hp.CODED_DIM))
def world_encode_spectral_env(spectral_env, settings):
    """Code a spectral envelope and return its scaled ``idct``.

    ``settings`` must provide 'sample_rate' and 'coded_dim'. The coded
    envelope is passed through ``idct`` and divided by
    ``sqrt(2 * coded_dim)``.
    """
    coded = pyworld.code_spectral_envelope(
        spectral_env,
        settings['sample_rate'],
        settings['coded_dim'],
    )
    transformed = idct(coded)
    return transformed / np.sqrt(settings['coded_dim'] * 2)
import pyworld

# Analysis/resynthesis demo: read a wav, code and decode its WORLD features,
# then write the resynthesised speech.
IN_WAVE_FILE = "in.wav"    # input speech
OUT_WAVE_FILE = "out.wav"  # analysed-and-resynthesised speech
SP_DIM = 50                # dimension of the spectral envelope after coding

# Read the speech; WORLD needs float64 samples.
fs, x = wavfile.read(IN_WAVE_FILE)
x = x.astype(np.float64)

# Analysis (fundamental frequency, spectral envelope, aperiodicity).
f0, sp, ap = pyworld.wav2world(x, fs)
fft_size = pyworld.get_cheaptrick_fft_size(fs)

# Encode / decode the spectral envelope.
# https://www.isca-speech.org/archive/Interspeech_2017/abstracts/0067.html
code_sp = pyworld.code_spectral_envelope(sp, fs, SP_DIM)
decode_sp = pyworld.decode_spectral_envelope(code_sp, fs, fft_size)

# Encode / decode the aperiodicity.
code_ap = pyworld.code_aperiodicity(ap, fs)
decode_ap = pyworld.decode_aperiodicity(code_ap, fs, fft_size)

# Resynthesis.
y = pyworld.synthesize(f0, decode_sp, decode_ap, fs)
# Clip before the cast: samples outside the int16 range would otherwise wrap
# around and produce loud clicks instead of saturating.
y = np.clip(y, np.iinfo(np.int16).min, np.iinfo(np.int16).max)
y = y.astype(np.int16)

# Write the speech.
wavfile.write(OUT_WAVE_FILE, fs, y)
def world_encode_spectral_envelop(sp, fs, dim=36):
    """Code the spectral envelope ``sp`` into ``dim`` dimensions (default 36).

    Thin wrapper around ``pyworld.code_spectral_envelope``.
    """
    return pyworld.code_spectral_envelope(sp, fs, dim)
def gen_waveform(labels, acoustic_features, binary_dict, continuous_dict, stream_sizes, has_dynamic_features, subphone_features="coarse_coding", log_f0_conditioning=True, pitch_idx=None, num_windows=3, post_filter=True, sample_rate=48000, frame_period=5, relative_f0=True): windows = get_windows(num_windows) # Apply MLPG if necessary if np.any(has_dynamic_features): static_stream_sizes = get_static_stream_sizes(stream_sizes, has_dynamic_features, len(windows)) else: static_stream_sizes = stream_sizes # Split multi-stream features mgc, target_f0, vuv, bap = split_streams(acoustic_features, static_stream_sizes) # Gen waveform by the WORLD vocodoer fftlen = pyworld.get_cheaptrick_fft_size(sample_rate) alpha = pysptk.util.mcepalpha(sample_rate) if post_filter: mgc = merlin_post_filter(mgc, alpha) spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha) aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), sample_rate, fftlen) # fill aperiodicity with ones for unvoiced regions aperiodicity[vuv.reshape(-1) < 0.5, :] = 1.0 # WORLD fails catastrophically for out of range aperiodicity aperiodicity = np.clip(aperiodicity, 0.0, 1.0) ### F0 ### if relative_f0: diff_lf0 = target_f0 # need to extract pitch sequence from the musical score linguistic_features = fe.linguistic_features( labels, binary_dict, continuous_dict, add_frame_features=True, subphone_features=subphone_features) f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None] lf0_score = f0_score.copy() nonzero_indices = np.nonzero(lf0_score) lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices]) lf0_score = interp1d(lf0_score, kind="slinear") f0 = diff_lf0 + lf0_score f0[vuv < 0.5] = 0 f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)]) else: f0 = target_f0 f0[vuv < 0.5] = 0 f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)]) generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64), spectrogram.astype(np.float64), aperiodicity.astype(np.float64), sample_rate, frame_period) 
# 音量を小さくする(音割れ防止) # TODO: ここのかける定数をいい感じにする spectrogram *= 0.000000001 sp = pyworld.code_spectral_envelope(spectrogram, sample_rate, 60) return f0, sp, bap, generated_waveform