def __init__(
        self,
        acoustic_feature_param: AcousticFeatureParam,
        out_sampling_rate: int,
        buffer_size: int,
        number_of_pointers: int,
):
    """Initialise the parent class and a native WORLD synthesizer struct.

    Args:
        acoustic_feature_param: Forwarded to the parent initialiser.
        out_sampling_rate: Forwarded to the parent initialiser; also used to
            derive the CheapTrick FFT size.
        buffer_size: Stored on the instance and handed to the native
            initialiser as its buffer size.
        number_of_pointers: Handed to the native initialiser.
    """
    # Function-scope import kept from the original (presumably so world4py is
    # only required once an instance is created -- confirm).
    from world4py.native import apidefinitions, structures

    super().__init__(
        acoustic_feature_param=acoustic_feature_param,
        out_sampling_rate=out_sampling_rate,
    )
    self.buffer_size = buffer_size
    self._synthesizer = structures.WorldSynthesizer()

    fft_size = pyworld.get_cheaptrick_fft_size(out_sampling_rate)
    # Positional order: sampling rate, frame period, fft size, buffer size,
    # number of pointers, synthesizer struct.
    apidefinitions._InitializeSynthesizer(
        self.out_sampling_rate,
        self.acoustic_feature_param.frame_period,
        fft_size,
        buffer_size,
        number_of_pointers,
        self._synthesizer,
    )

    # "for holding memory" per the original note (presumably keeps buffers
    # referenced while the native side uses them -- confirm).
    self._before_buffer = []
def gen_waveform(y_predicted, Y_mean, Y_std, post_filter=False, coef=1.4,
                 fs=16000, mge_training=True):
    """Synthesise a waveform from predicted acoustic features with WORLD.

    Args:
        y_predicted: Predicted acoustic features, passed to ``gen_parameters``.
        Y_mean: Passed to ``gen_parameters``.
        Y_std: Passed to ``gen_parameters``.
        post_filter: When true, ``merlin_post_filter`` is applied to the
            mel-cepstrum before conversion to a spectrogram.
        coef: Coefficient forwarded to ``merlin_post_filter``.
        fs: Sampling rate used for alpha, FFT size and synthesis.
        mge_training: Forwarded to ``gen_parameters``.

    Returns:
        Tuple ``(generated_waveform, mgc, lf0, vuv, bap)``. The waveform is
        peak-normalised to +/-32767; the feature streams are returned as well
        so natural and generated features can be compared later.
    """
    alpha = pysptk.util.mcepalpha(fs)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    frame_period = hp_acoustic.frame_period

    # Generate parameters and split streams
    mgc, lf0, vuv, bap = gen_parameters(y_predicted, Y_mean, Y_std, mge_training)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha, coef=coef)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), fs, fftlen)

    # Frames flagged unvoiced get f0 == 0; the remaining log-f0 values are
    # exponentiated in place.
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    voiced = np.nonzero(f0)
    f0[voiced] = np.exp(f0[voiced])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            fs, frame_period)

    # Convert range to int16. A silent waveform has peak 0; skip the scaling
    # in that case instead of producing an all-NaN array via 0 / 0.
    peak = np.max(np.abs(generated_waveform))
    if peak > 0:
        generated_waveform = generated_waveform / peak * 32767

    return generated_waveform, mgc, lf0, vuv, bap
def gen_waveform(y_predicted, Y_mean, Y_std, post_filter=False, fs=16000):
    """Synthesise a waveform from predicted acoustic features with WORLD.

    Args:
        y_predicted: Predicted acoustic features, passed to ``gen_parameters``.
        Y_mean: Passed to ``gen_parameters``.
        Y_std: Passed to ``gen_parameters``.
        post_filter: When true, ``merlin_post_filter`` is applied to the
            mel-cepstrum before conversion to a spectrogram.
        fs: Sampling rate used for alpha, FFT size and synthesis.

    Returns:
        Tuple ``(generated_waveform, mgc, lf0, vuv, bap)``. The waveform is
        clipped to the signed 16-bit range; the feature streams are returned
        as well so natural and generated features can be compared later.
    """
    alpha = pysptk.util.mcepalpha(fs)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)
    frame_period = hp_acoustic.frame_period

    # Generate parameters and split streams
    mgc, lf0, vuv, bap = gen_parameters(y_predicted, Y_mean, Y_std)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), fs, fftlen)

    # Frames flagged unvoiced get f0 == 0; the remaining log-f0 values are
    # exponentiated in place.
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    voiced = np.nonzero(f0)
    f0[voiced] = np.exp(f0[voiced])

    generated_waveform = pyworld.synthesize(f0.flatten().astype(np.float64),
                                            spectrogram.astype(np.float64),
                                            aperiodicity.astype(np.float64),
                                            fs, frame_period)

    # int16 spans [-32768, 32767]; an upper bound of 32768 would let a value
    # through that overflows when the result is cast to int16.
    generated_waveform = np.clip(generated_waveform, -32768, 32767)

    return generated_waveform, mgc, lf0, vuv, bap
def world_spectrogram(wav, sr=_sr, dim_num=32, **kwargs):
    """Analyse a waveform with the WORLD vocoder and return coded features.

    Args:
        wav: Waveform array; cast to double precision before analysis.
        sr: Sampling rate.
        dim_num: Number of dimensions of the coded spectral envelope.
        **kwargs: Optional overrides:
            ``frame_period`` (default ``pw.default_frame_period``),
            ``f0_floor`` (default ``pw.default_f0_floor``),
            ``f0_ceil`` (default ``pw.default_f0_ceil``),
            ``fft_size`` (default derived from ``sr`` and ``f0_floor``),
            ``ap_threshold`` (default 0.85),
            ``f0_extractor`` (``"dio"``, ``"harvest"`` or a callable invoked
            with the same arguments as ``pw.harvest``).

    Returns:
        Tuple ``(f0, sp_enc, ap_enc)``: the F0 contour, the coded spectral
        envelope and the coded aperiodicity.
    """
    # Collect the analysis parameters.
    frame_period = kwargs.get("frame_period", pw.default_frame_period)
    f0_floor = kwargs.get("f0_floor", pw.default_f0_floor)
    f0_ceil = kwargs.get("f0_ceil", pw.default_f0_ceil)
    fft_size = kwargs.get("fft_size", pw.get_cheaptrick_fft_size(sr, f0_floor))
    ap_threshold = kwargs.get("ap_threshold", 0.85)
    f0_extractor = kwargs.get("f0_extractor", "dio")

    x = wav.astype(np.double)

    # Estimate F0. Every extractor now receives frame_period; previously the
    # DIO branch dropped it, so a caller-supplied frame period was ignored.
    if f0_extractor == "dio":
        f0, t = pw.dio(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil,
                       frame_period=frame_period)
    elif f0_extractor == "harvest":
        f0, t = pw.harvest(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil,
                           frame_period=frame_period)
    else:
        f0, t = f0_extractor(x, sr, f0_floor=f0_floor, f0_ceil=f0_ceil,
                             frame_period=frame_period)

    # Spectral envelope via CheapTrick.
    sp = pw.cheaptrick(x, f0, t, sr, f0_floor=f0_floor, fft_size=fft_size)
    # Reduce the dimensionality of the spectral envelope.
    sp_enc = pw.code_spectral_envelope(sp, sr, number_of_dimensions=dim_num)

    # Aperiodicity via D4C.
    ap = pw.d4c(x, f0, t, sr, threshold=ap_threshold, fft_size=fft_size)
    # Reduce the dimensionality of the aperiodicity.
    ap_enc = pw.code_aperiodicity(ap, sr)

    return f0, sp_enc, ap_enc
def convert(self, in_feature: AcousticFeature):
    """Run the model on one acoustic feature and rebuild its spectrogram.

    Args:
        in_feature: Source feature. Its ``ap`` and ``voiced`` fields are
            copied onto the converted feature unchanged.

    Returns:
        The converted feature with ``sp`` recomputed from ``mc`` and float
        fields cast to ``numpy.float64``.
    """
    encoded = self._encode_feature(in_feature)

    # pad is always in 1..128, so the trailing slice below is never ``[:-0]``.
    pad = 128 - encoded.shape[1] % 128
    encoded = numpy.pad(encoded, [(0, 0), (0, pad)], mode='minimum')

    batch = chainer.dataset.convert.concat_examples(
        [encoded],
        device=self.gpu,
        padding=0,
    )

    with chainer.using_config('train', False):
        out = self.model(batch).data[0]

    if self.gpu is not None:
        out = chainer.cuda.to_cpu(out)

    # Drop the columns that were added as padding before decoding.
    out = self._decode_feature(out[:, :-pad])
    out.ap = in_feature.ap
    out.voiced = in_feature.voiced
    out.f0[~out.voiced] = 0

    out.sp = pysptk.mc2sp(
        out.mc,
        alpha=self._param.alpha,
        fftlen=pyworld.get_cheaptrick_fft_size(self.out_sampling_rate),
    )

    return out.astype_only_float(numpy.float64)
def world_decode_spectral_envelop(coded_sp, fs):
    """Decode a coded spectral envelope with WORLD.

    Args:
        coded_sp: Coded spectral envelope, passed to
            ``pyworld.decode_spectral_envelope`` as-is.
        fs: Sampling rate; also determines the FFT size.

    Returns:
        The decoded spectral envelope.
    """
    fft_size = pyworld.get_cheaptrick_fft_size(fs)
    return pyworld.decode_spectral_envelope(coded_sp, fs, fft_size)
def gen_waveform(y_predicted, Y_mean, Y_std, post_filter=False, coef=1.4,
                 fs=16000, mge_training=False):
    """Synthesise a waveform from predicted acoustic features with WORLD.

    Args:
        y_predicted: Predicted acoustic features, passed to ``gen_parameters``.
        Y_mean: Passed to ``gen_parameters``.
        Y_std: Passed to ``gen_parameters``.
        post_filter: When true, ``merlin_post_filter`` is applied to the
            mel-cepstrum before conversion to a spectrogram.
        coef: Coefficient forwarded to ``merlin_post_filter``.
        fs: Sampling rate used for alpha, FFT size and synthesis.
        mge_training: Forwarded to ``gen_parameters``.

    Returns:
        The synthesised waveform exactly as returned by
        ``pyworld.synthesize``; no range conversion is applied and the
        feature streams are not returned.
    """
    warp = pysptk.util.mcepalpha(fs)
    fft_size = pyworld.get_cheaptrick_fft_size(fs)
    period = audio_world_config.frame_period

    # Generate parameters and split streams
    mgc, lf0, vuv, bap = gen_parameters(y_predicted, Y_mean, Y_std, mge_training)
    if post_filter:
        mgc = merlin_post_filter(mgc, warp, coef=coef)

    envelope = pysptk.mc2sp(mgc, fftlen=fft_size, alpha=warp)
    aperiodic = pyworld.decode_aperiodicity(bap.astype(np.float64), fs, fft_size)

    # Zero the frames flagged unvoiced, then exponentiate what is left.
    pitch = lf0.copy()
    pitch[vuv < 0.5] = 0
    pitch[np.nonzero(pitch)] = np.exp(pitch[np.nonzero(pitch)])

    return pyworld.synthesize(
        pitch.flatten().astype(np.float64),
        envelope.astype(np.float64),
        aperiodic.astype(np.float64),
        fs,
        period,
    )
def world2wav(feature, frame_period):
    """Synthesise a waveform from a stacked WORLD feature matrix.

    The columns of ``feature`` are read as consecutive streams
    ``[mgc | lf0 | vuv | bap]`` whose widths come from ``hp.num_mgc``,
    ``hp.num_lf0``, ``hp.num_vuv`` and ``hp.num_bap``.

    Args:
        feature: 2-D feature matrix (rows are indexed with ``[:, a:b]``).
            It is not modified by this function.
        frame_period: Frame period passed to ``pyworld.synthesize``.

    Returns:
        The waveform returned by ``pyworld.synthesize``.
    """
    hparams = hp

    mgc_idx = 0
    lf0_idx = mgc_idx + hparams.num_mgc
    vuv_idx = lf0_idx + hparams.num_lf0
    bap_idx = vuv_idx + hparams.num_vuv

    mgc = feature[:, mgc_idx:mgc_idx + hparams.num_mgc]
    lf0 = feature[:, lf0_idx:lf0_idx + hparams.num_lf0]
    vuv = feature[:, vuv_idx:vuv_idx + hparams.num_vuv]
    # Copy: bap is overwritten below, and a plain slice would be a view that
    # writes through to the caller's ``feature`` array.
    bap = feature[:, bap_idx:bap_idx + hparams.num_bap].copy()

    fs = hparams.sample_rate
    alpha = pysptk.util.mcepalpha(fs)
    fftlen = pyworld.get_cheaptrick_fft_size(fs)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)

    # Zero the band aperiodicity of frames flagged unvoiced.
    indexes = (vuv < 0.5).flatten()
    bap[indexes] = np.zeros(hparams.num_bap)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64), fs, fftlen)

    # Unvoiced frames get f0 == 0; remaining log-f0 values are exponentiated.
    f0 = lf0.copy()
    f0[vuv < 0.5] = 0
    voiced = np.nonzero(f0)
    f0[voiced] = np.exp(f0[voiced])

    return pyworld.synthesize(f0.flatten().astype(np.float64),
                              spectrogram.astype(np.float64),
                              aperiodicity.astype(np.float64),
                              fs, frame_period)
def mgc_to_sp(mgc, synth_fs):
    """Convert mel-generalised cepstra to a spectrum with ``pysptk.mgc2sp``.

    Args:
        mgc: Cepstral coefficients; made C-contiguous float64 before the call.
        synth_fs: Sampling rate used to pick the FFT size.

    Returns:
        The value returned by ``pysptk.mgc2sp`` (named ``ln_sp`` in the
        original, i.e. presumably a log spectrum -- confirm).
    """
    contiguous_mgc = np.ascontiguousarray(mgc, dtype=np.float64)
    return pysptk.mgc2sp(
        contiguous_mgc,
        alpha=WorldFeatLabelGen.mgc_alpha,
        gamma=0.0,
        fftlen=pyworld.get_cheaptrick_fft_size(synth_fs),
    )
def generate_file(path):
    """Extract low/high spectrograms from one wave file and save them as .npy.

    The output is written to ``arguments.output_directory`` under the input's
    stem. Nothing is done when that file already exists and
    ``arguments.enable_overwrite`` is false.

    Args:
        path: Path of the input wave file (needs a ``stem`` attribute).
    """
    target = Path(arguments.output_directory, path.stem + '.npy')
    if target.exists() and not arguments.enable_overwrite:
        return

    # Load the wave (with padding, as configured).
    loader = WaveFileLoadProcess(
        sample_rate=arguments.sample_rate,
        top_db=arguments.top_db,
        pad_second=arguments.pad_second,
    )
    wave = loader(path, test=True)

    # Compute the acoustic feature.
    extractor = AcousticFeatureProcess(
        frame_period=arguments.frame_period,
        order=arguments.order,
        alpha=arguments.alpha,
        f0_estimating_method=arguments.f0_estimating_method,
    )
    feature = extractor(wave, test=True).astype_only_float(numpy.float32)

    # "high" is the analysed spectrogram; "low" is rebuilt from the
    # mel-cepstrum.
    high_spectrogram = feature.spectrogram
    low_spectrogram = pysptk.mc2sp(
        feature.mfcc,
        alpha=arguments.alpha,
        fftlen=pyworld.get_cheaptrick_fft_size(arguments.sample_rate),
    )

    payload = {
        'low': low_spectrogram,
        'high': high_spectrogram,
    }
    numpy.save(target.absolute(), payload)
def world_decode_spectral_env(spectral_env_mel, settings):
    """Decode a mel spectral envelope back to a WORLD spectral envelope.

    Args:
        spectral_env_mel: Input passed through ``dct``.
        settings: Mapping providing ``'coded_dim'`` and ``'sample_rate'``.

    Returns:
        The output of ``pyworld.decode_spectral_envelope``.
    """
    sample_rate = settings['sample_rate']
    scale = np.sqrt(settings['coded_dim'] * 2)

    # DCT output divided by sqrt(2 * coded_dim), as in the original.
    coded = dct(spectral_env_mel) / scale
    fft_size = pyworld.get_cheaptrick_fft_size(sample_rate)
    return pyworld.decode_spectral_envelope(coded, sample_rate, fft_size)
def world_decode_spectral_envelop(coded_sp, fs):
    """Decode a coded spectral envelope with WORLD.

    Args:
        coded_sp: Coded spectral envelope, passed to
            ``pyworld.decode_spectral_envelope`` without conversion.
        fs: Sampling rate; also determines the FFT size.

    Returns:
        The decoded spectral envelope.
    """
    return pyworld.decode_spectral_envelope(
        coded_sp,
        fs,
        pyworld.get_cheaptrick_fft_size(fs),
    )
def decode_spectrogram(self, feature: AcousticFeature):
    """Fill ``feature.sp`` from ``feature.mc`` and return the same object.

    Args:
        feature: Feature whose ``mc`` is read (cast to float32) and whose
            ``sp`` attribute is overwritten in place.

    Returns:
        The ``feature`` object that was passed in.
    """
    rate = self.out_sampling_rate
    mel_cepstrum = feature.mc.astype(numpy.float32)
    feature.sp = pysptk.mc2sp(
        mel_cepstrum,
        alpha=pysptk.util.mcepalpha(rate),
        fftlen=pyworld.get_cheaptrick_fft_size(rate),
    )
    return feature
def fs_to_frame_length(fs):
    """Return the STFT frame length to use for sampling rate ``fs``.

    Based on Merlin's
    /misc/scripts/vocoder/world/extract_features_for_merlin.sh, but delegates
    to WORLD's CheapTrick FFT-size helper.
    """
    frame_length = pyworld.get_cheaptrick_fft_size(fs)
    return frame_length
def convertFeaturesIntoWav(f0seq, MCEPseq, APseq, fs, frame_period=5.0):
    """Synthesise a float32 waveform from F0, coded envelope and aperiodicity.

    Args:
        f0seq: F0 sequence passed to ``pyworld.synthesize``.
        MCEPseq: Coded spectral envelope; it is transposed and made
            C-contiguous float64 before decoding.
        APseq: Aperiodicity passed to ``pyworld.synthesize``.
        fs: Sampling rate.
        frame_period: Frame period passed to ``pyworld.synthesize``.

    Returns:
        The synthesised waveform cast to ``np.float32``.
    """
    coded = np.ascontiguousarray(MCEPseq.T, dtype=np.float64)
    envelope = pyworld.decode_spectral_envelope(
        coded, fs, pyworld.get_cheaptrick_fft_size(fs))
    waveform = pyworld.synthesize(f0seq, envelope, APseq, fs, frame_period)
    return waveform.astype(np.float32)
def gen_waveform(labels, acoustic_features, acoustic_out_scaler,
                 binary_dict, continuous_dict, stream_sizes,
                 has_dynamic_features, subphone_features="coarse_coding",
                 log_f0_conditioning=True, pitch_idx=None, num_windows=3,
                 post_filter=True, sample_rate=48000, frame_period=5,
                 relative_f0=True):
    """Generate a waveform from multi-stream acoustic features with WORLD.

    Args:
        labels: Labels used to derive the score pitch when ``relative_f0``.
        acoustic_features: Multi-stream features (mgc, f0, vuv, bap).
        acoustic_out_scaler: Scaler whose ``var_`` is used for MLPG.
        binary_dict: Passed to ``fe.linguistic_features``.
        continuous_dict: Passed to ``fe.linguistic_features``.
        stream_sizes: Sizes of the individual streams.
        has_dynamic_features: Per-stream flags; MLPG runs if any is truthy.
        subphone_features: Passed to ``fe.linguistic_features``.
        log_f0_conditioning: Accepted but not used in this function.
        pitch_idx: Passed to ``_midi_to_hz``.
        num_windows: Number of windows requested from ``get_windows``.
        post_filter: Apply ``merlin_post_filter`` to the mel-cepstrum.
        sample_rate: Sampling rate for alpha, FFT size and synthesis.
        frame_period: Frame period passed to ``pyworld.synthesize``.
        relative_f0: When true, the f0 stream is treated as a log-f0 offset
            that is added to the interpolated score log-f0.

    Returns:
        The waveform returned by ``pyworld.synthesize``.
    """
    windows = get_windows(num_windows)

    # Apply MLPG if necessary
    if np.any(has_dynamic_features):
        acoustic_features = multi_stream_mlpg(
            acoustic_features, acoustic_out_scaler.var_, windows,
            stream_sizes, has_dynamic_features)
        static_stream_sizes = get_static_stream_sizes(
            stream_sizes, has_dynamic_features, len(windows))
    else:
        static_stream_sizes = stream_sizes

    # Split multi-stream features
    mgc, target_f0, vuv, bap = split_streams(acoustic_features,
                                             static_stream_sizes)

    # Gen waveform by the WORLD vocoder
    fftlen = pyworld.get_cheaptrick_fft_size(sample_rate)
    alpha = pysptk.util.mcepalpha(sample_rate)

    if post_filter:
        mgc = merlin_post_filter(mgc, alpha)

    spectrogram = pysptk.mc2sp(mgc, fftlen=fftlen, alpha=alpha)
    aperiodicity = pyworld.decode_aperiodicity(bap.astype(np.float64),
                                               sample_rate, fftlen)

    # F0
    if not relative_f0:
        # NOTE(review): this branch uses target_f0 as-is, with no V/UV
        # masking and no exp(); confirm that is intended.
        f0 = target_f0
    else:
        diff_lf0 = target_f0
        # The pitch sequence has to be extracted from the musical score.
        linguistic_features = fe.linguistic_features(
            labels, binary_dict, continuous_dict,
            add_frame_features=True,
            subphone_features=subphone_features)
        f0_score = _midi_to_hz(linguistic_features, pitch_idx, False)[:, None]

        # Log of the non-zero score pitch, then interpolation over the rest.
        lf0_score = f0_score.copy()
        nonzero_indices = np.nonzero(lf0_score)
        lf0_score[nonzero_indices] = np.log(f0_score[nonzero_indices])
        lf0_score = interp1d(lf0_score, kind="slinear")

        f0 = diff_lf0 + lf0_score
        f0[vuv < 0.5] = 0
        f0[np.nonzero(f0)] = np.exp(f0[np.nonzero(f0)])

    generated_waveform = pyworld.synthesize(
        f0.flatten().astype(np.float64),
        spectrogram.astype(np.float64),
        aperiodicity.astype(np.float64),
        sample_rate,
        frame_period,
    )
    return generated_waveform
def get_sizes(sampling_rate: int, order: int):
    """Return the per-frame width of each acoustic feature stream.

    Args:
        sampling_rate: Sampling rate used to look up the CheapTrick FFT size.
        order: Mel-cepstrum order; ``mfcc`` has ``order + 1`` coefficients.

    Returns:
        Dict with keys ``f0``, ``spectrogram``, ``aperiodicity``, ``mfcc``
        and ``voiced``.
    """
    # Number of bins: fft_size // 2 + 1.
    num_bins = pyworld.get_cheaptrick_fft_size(fs=sampling_rate) // 2 + 1
    return {
        'f0': 1,
        'spectrogram': num_bins,
        'aperiodicity': num_bins,
        'mfcc': order + 1,
        'voiced': 1,
    }
def inv_world_spectrogram(f0, sp, ap, sr=_sr, **kwargs):
    """Synthesise speech from coded WORLD vocoder features.

    Args:
        f0: F0 contour.
        sp: Coded spectral envelope.
        ap: Coded aperiodicity.
        sr: Sampling rate.
        **kwargs: Optional overrides: ``frame_period`` (default
            ``pw.default_frame_period``), ``f0_floor`` (default
            ``pw.default_f0_floor``, only used to derive the default FFT
            size) and ``fft_size``.

    Returns:
        The waveform returned by ``pw.synthesize``.
    """
    frame_period = kwargs.get("frame_period", pw.default_frame_period)
    f0_floor = kwargs.get("f0_floor", pw.default_f0_floor)
    fft_size = kwargs.get("fft_size", pw.get_cheaptrick_fft_size(sr, f0_floor))

    envelope = pw.decode_spectral_envelope(sp, sr, fft_size=fft_size)
    aperiodicity = pw.decode_aperiodicity(ap, sr, fft_size=fft_size)
    return pw.synthesize(f0, envelope, aperiodicity, sr,
                         frame_period=frame_period)
def convert_to_feature(self, input: AcousticFeature, out_sampling_rate: Optional[int] = None):
    """Convert an acoustic feature with the model and rebuild its spectrogram.

    Args:
        input: Source feature. Its ``voiced`` and ``aperiodicity`` fields are
            carried over to the result.
        out_sampling_rate: Sampling rate used to pick the FFT size; defaults
            to ``self.config.dataset.param.voice_param.sample_rate``.

    Returns:
        A new ``AcousticFeature`` cast to ``numpy.float64``.
    """
    if out_sampling_rate is None:
        out_sampling_rate = self.config.dataset.param.voice_param.sample_rate

    source = input
    encoded = self._feature_normalize(source, test=True)
    encoded = self._encode_feature(encoded, test=True)

    # pad is always in 1..128, so the trailing slice below is never ``[:-0]``.
    pad = 128 - encoded.shape[1] % 128
    encoded = numpy.pad(encoded, [(0, 0), (0, pad)], mode='minimum')

    batch = chainer.dataset.convert.concat_examples(
        [encoded],
        device=self.gpu,
        padding=0,
    )

    with chainer.using_config('train', False):
        raw = self.model(batch).data[0]

    if self.gpu is not None:
        raw = chainer.cuda.to_cpu(raw)

    decoded = self._decode_feature(raw[:, :-pad], test=True)

    # The source's voiced flags are attached before denormalising.
    decoded = AcousticFeature(
        f0=decoded.f0,
        spectrogram=decoded.spectrogram,
        aperiodicity=decoded.aperiodicity,
        mfcc=decoded.mfcc,
        voiced=source.voiced,
    )
    denormalized = self._feature_denormalize(decoded, test=True)

    # The source's aperiodicity replaces the model's.
    denormalized = AcousticFeature(
        f0=denormalized.f0,
        spectrogram=denormalized.spectrogram,
        aperiodicity=source.aperiodicity,
        mfcc=denormalized.mfcc,
        voiced=denormalized.voiced,
    )

    rebuilt_spectrogram = pysptk.mc2sp(
        denormalized.mfcc,
        alpha=self._param.acoustic_feature_param.alpha,
        fftlen=pyworld.get_cheaptrick_fft_size(out_sampling_rate),
    )

    return AcousticFeature(
        f0=denormalized.f0,
        spectrogram=rebuilt_spectrogram,
        aperiodicity=denormalized.aperiodicity,
        mfcc=denormalized.mfcc,
        voiced=denormalized.voiced,
    ).astype(numpy.float64)
def get_sizes(sampling_rate: int, order: int):
    """Return the per-frame width of each acoustic feature stream.

    Args:
        sampling_rate: Sampling rate used to look up the CheapTrick FFT size.
        order: Mel-cepstrum order; ``mc`` has ``order + 1`` coefficients.

    Returns:
        Dict with keys ``f0``, ``sp``, ``ap``, ``mc`` and ``voiced``.
    """
    # Number of bins: fft_size // 2 + 1.
    num_bins = pyworld.get_cheaptrick_fft_size(fs=sampling_rate) // 2 + 1
    return {
        'f0': 1,
        'sp': num_bins,
        'ap': num_bins,
        'mc': order + 1,
        'voiced': 1,
    }
def world_decode_mc(mc, fs):
    """Convert mel-cepstrum ``mc`` to a spectral envelope via ``pysptk``.

    Args:
        mc: Mel-cepstral coefficients.
        fs: Sampling rate; determines both alpha and the FFT size.

    Returns:
        The result of ``pysptk.conversion.mc2sp``.
    """
    fft_size = pyworld.get_cheaptrick_fft_size(fs)
    warp = pysptk.util.mcepalpha(fs)
    return pysptk.conversion.mc2sp(mc, warp, fft_size)
def test_vc_from_path(model, path, data_mean, data_std, diffvc=True):
    """Run voice conversion on a wave file and synthesise the result.

    Args:
        model: Conversion model; called as ``model(features, R)`` and
            expected to return ``(y_hat, y_hat_static)``.
        path: Wave file read with ``wavfile.read``.
        data_mean: Mean used for scaling / inverse scaling.
        data_std: Standard deviation used for scaling / inverse scaling.
        diffvc: When true, synthesise with an MLSA filter on the
            differential mel-cepstrum; otherwise use WORLD synthesis.

    Returns:
        Tuple ``(waveform, inputs, outputs)`` where ``inputs`` are the static
        input mel-cepstra and ``outputs`` the denormalised static
        predictions.

    Note:
        ``hop_length`` is read from the enclosing scope, not computed here.
    """
    model.eval()

    fs, x = wavfile.read(path)
    x = x.astype(np.float64)

    # WORLD analysis.
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    alpha = pysptk.util.mcepalpha(fs)
    full_mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)

    # Column 0 (power) is set aside; the remaining columns go to the model.
    c0 = full_mc[:, 0]
    mc = full_mc[:, 1:]
    static_dim = mc.shape[-1]

    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)
    num_frames = mc.shape[0]

    inputs = mc[:, :static_dim].copy()

    # Normalization
    scaled = Variable(torch.from_numpy(P.scale(mc, data_mean, data_std)))

    # Apply model
    mlpg_matrix = torch.from_numpy(
        unit_variance_mlpg_matrix(hp.windows, num_frames))
    _, y_hat_static = model(scaled, mlpg_matrix)
    static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    static_pred = P.inv_scale(
        static_pred, data_mean[:static_dim], data_std[:static_dim])
    outputs = static_pred.copy()

    if diffvc:
        static_pred = static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], static_pred))

    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs
def convert_to_feature(self, input: AcousticFeature, out_sampling_rate: Optional[int] = None):
    """Convert an acoustic feature with the model and rebuild its spectrogram.

    Args:
        input: Source feature. Its ``voiced`` and ``aperiodicity`` fields are
            carried over to the result.
        out_sampling_rate: Sampling rate used to pick the FFT size; defaults
            to ``self.config.dataset.param.voice_param.sample_rate``.

    Returns:
        A new ``AcousticFeature`` cast to ``numpy.float64``.
    """
    if out_sampling_rate is None:
        out_sampling_rate = self.config.dataset.param.voice_param.sample_rate

    original = input
    model_input = self._encode_feature(
        self._feature_normalize(original, test=True),
        test=True,
    )

    # pad is always in 1..128, so the trailing slice below is never ``[:-0]``.
    pad = 128 - model_input.shape[1] % 128
    model_input = numpy.pad(model_input, [(0, 0), (0, pad)], mode='minimum')

    inputs = chainer.dataset.convert.concat_examples(
        [model_input],
        device=self.gpu,
        padding=0,
    )

    with chainer.using_config('train', False):
        predicted = self.model(inputs).data[0]

    if self.gpu is not None:
        predicted = chainer.cuda.to_cpu(predicted)

    # Strip the padding columns, then decode.
    predicted = self._decode_feature(predicted[:, :-pad], test=True)

    # Step 1: attach the source's voiced flags, then denormalise.
    with_voiced = AcousticFeature(
        f0=predicted.f0,
        spectrogram=predicted.spectrogram,
        aperiodicity=predicted.aperiodicity,
        mfcc=predicted.mfcc,
        voiced=original.voiced,
    )
    restored = self._feature_denormalize(with_voiced, test=True)

    # Step 2: swap in the source's aperiodicity.
    with_ap = AcousticFeature(
        f0=restored.f0,
        spectrogram=restored.spectrogram,
        aperiodicity=original.aperiodicity,
        mfcc=restored.mfcc,
        voiced=restored.voiced,
    )

    # Step 3: rebuild the spectrogram from the mel-cepstrum.
    fftlen = pyworld.get_cheaptrick_fft_size(out_sampling_rate)
    spectrogram = pysptk.mc2sp(
        with_ap.mfcc,
        alpha=self._param.acoustic_feature_param.alpha,
        fftlen=fftlen,
    )

    result = AcousticFeature(
        f0=with_ap.f0,
        spectrogram=spectrogram,
        aperiodicity=with_ap.aperiodicity,
        mfcc=with_ap.mfcc,
        voiced=with_ap.voiced,
    )
    return result.astype(numpy.float64)
def collect_features(x, fs, order=25, frame_period=5):
    """Extract mel-cepstral coefficients from a waveform using WORLD.

    Args:
        x: Waveform array; cast to float64 before analysis.
        fs: Sampling rate.
        order: Mel-cepstrum order passed to ``pysptk.sp2mc`` (default 25, the
            value that was previously hard-coded).
        frame_period: Frame period passed to ``pyworld.dio`` (default 5, the
            value that was previously hard-coded).

    Returns:
        The mel-cepstrum returned by ``pysptk.sp2mc``.
    """
    alpha = pysptk.util.mcepalpha(fs)

    x = x.astype(np.float64)
    # Coarse F0 with DIO, refined with StoneMask.
    _f0, _timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, _f0, _timeaxis, fs)

    spectrogram = pyworld.cheaptrick(x, f0, _timeaxis, fs)
    return pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
def __call__(self, data: Wave, test):
    """Build a validated low/high spectrogram pair from a wave.

    Args:
        data: Input wave; its ``sampling_rate`` selects the FFT size.
        test: Accepted but not forwarded; the inner acoustic feature process
            is always called with ``test=True``.

    Returns:
        A ``LowHighSpectrogramFeature`` on which ``validate()`` was called.
    """
    acoustic = self._acoustic_feature_process(data, test=True)
    acoustic = acoustic.astype_only_float(self._dtype)

    # "low" is rebuilt from the mel-cepstrum; "high" is the analysed
    # spectrogram.
    low = pysptk.mc2sp(
        acoustic.mfcc,
        alpha=self._alpha,
        fftlen=pyworld.get_cheaptrick_fft_size(data.sampling_rate),
    )
    feature = LowHighSpectrogramFeature(low=low, high=acoustic.spectrogram)
    feature.validate()
    return feature
def create_synthesizer(
        self,
        buffer_size: int,
        number_of_pointers: int,
):
    """Create and initialise the native WORLD synthesizer for this instance.

    Asserts that no synthesizer has been created yet.

    Args:
        buffer_size: Buffer size handed to the native initialiser.
        number_of_pointers: Number of pointers handed to the native
            initialiser.
    """
    assert self._synthesizer is None

    self._synthesizer = structures.WorldSynthesizer()
    fft_size = pyworld.get_cheaptrick_fft_size(self.out_sampling_rate)

    # Positional order: sampling rate, frame period, fft size, buffer size,
    # number of pointers, synthesizer struct.
    apidefinitions._InitializeSynthesizer(
        self.out_sampling_rate,
        self.acoustic_param.frame_period,
        fft_size,
        buffer_size,
        number_of_pointers,
        self._synthesizer,
    )
def __call__(self, data: Wave, test):
    """Build a validated low/high spectrogram pair from a wave.

    Args:
        data: Input wave; its ``sampling_rate`` selects the FFT size.
        test: Accepted but not forwarded; the inner acoustic feature process
            is always called with ``test=True``.

    Returns:
        A ``LowHighSpectrogramFeature`` on which ``validate()`` was called.
    """
    extracted = self._acoustic_feature_process(data, test=True)
    extracted = extracted.astype_only_float(self._dtype)

    high_spectrogram = extracted.spectrogram
    fft_size = pyworld.get_cheaptrick_fft_size(data.sampling_rate)
    # Low-resolution spectrogram rebuilt from the mel-cepstrum.
    low_spectrogram = pysptk.mc2sp(
        extracted.mfcc, alpha=self._alpha, fftlen=fft_size)

    pair = LowHighSpectrogramFeature(
        low=low_spectrogram,
        high=high_spectrogram,
    )
    pair.validate()
    return pair
def __init__(self, config_parser):
    """Read analysis settings from the config and derive vocoder constants.

    Args:
        config_parser: Forwarded to the parent initialiser; the values
            ``sampling_rate``, ``frame_period`` and ``has_delta`` are then
            read through ``self.get_value``.
    """
    super().__init__(config_parser)

    self.sampling_rate = self.to_int(self.get_value('sampling_rate'))
    self.frame_period = self.to_int(self.get_value('frame_period'))
    self.has_delta = self.to_bool(self.get_value('has_delta'))

    # The static window is always present; the two delta windows are added
    # only when has_delta is set.
    window = [(0, 0, np.array([1.0]))]
    if self.has_delta:
        window.append((1, 1, np.array([-0.5, 0.0, 0.5])))
        window.append((1, 1, np.array([1.0, -2.0, 1.0])))
    self.window = window

    self.fft_length = pyworld.get_cheaptrick_fft_size(self.sampling_rate)
    self.alpha = pysptk.util.mcepalpha(self.sampling_rate)
    # The 0.001 factor converts [Hz] -> [kHz] (per the original note), so
    # frame_period is presumably in milliseconds -- confirm.
    self.hop_length = int(self.sampling_rate * 0.001 * self.frame_period)
def worldDecodeSpectralEnvelop(coded_sp: np.ndarray, fs: int = SAMPLE_RATE) -> np.ndarray:
    '''
    Convert MCEPs back to a spectral envelope.

    Parameters
    ----------
    coded_sp: np.ndarray
        MCEP data
    fs: int, default SAMPLE_RATE
        Sampling frequency

    Returns
    -------
    decoded_sp: np.ndarray
        Spectral envelope
    '''
    return pyworld.decode_spectral_envelope(
        coded_sp,
        fs,
        pyworld.get_cheaptrick_fft_size(fs),
    )
def run_world_synth(self, synth_output, hparams):
    """Synthesise every entry of ``synth_output`` with WORLD and save it.

    Each waveform is first written as a wav file; if ``hparams.synth_ext`` is
    not ``wav`` it is then exported to that format with pydub and the
    intermediate wav file is removed.

    Args:
        synth_output: Mapping from id name to the model output for that id.
        hparams: Hyper-parameters. Read here: ``synth_fs``, ``synth_dir``,
            ``out_dir``, ``num_coded_sps``, ``model_name``,
            ``synth_file_suffix`` and ``synth_ext``.

    Files are written to ``hparams.synth_dir``, falling back to
    ``hparams.out_dir`` and then to the current directory. The fallback is
    used for directory creation and for the exported file as well; before,
    both used ``hparams.synth_dir`` directly and failed when it was ``None``.
    """
    fft_size = pyworld.get_cheaptrick_fft_size(hparams.synth_fs)

    if hparams.synth_dir is not None:
        save_dir = hparams.synth_dir
    elif hparams.out_dir is not None:
        save_dir = hparams.out_dir
    else:
        save_dir = os.path.curdir

    model_suffix = "_" + hparams.model_name if hparams.model_name is not None else ""

    for id_name, output in synth_output.items():
        logging.info("Synthesise {} with the WORLD vocoder.".format(id_name))

        coded_sp, lf0, vuv, bap = WorldFeatLabelGen.convert_to_world_features(
            output,
            contains_deltas=False,
            num_coded_sps=hparams.num_coded_sps)

        ln_sp = pysptk.mgc2sp(np.ascontiguousarray(coded_sp, dtype=np.float64),
                              alpha=WorldFeatLabelGen.mgc_alpha,
                              gamma=0.0,
                              fftlen=fft_size)
        # Back from the log domain, rescale by 32768 and square.
        sp = np.exp(ln_sp.real)
        sp = np.power(sp.real / 32768.0, 2)

        f0 = np.exp(lf0, dtype=np.float64)
        # WORLD throws an error for too small f0 values.
        vuv[f0 < WorldFeatLabelGen.f0_silence_threshold] = 0
        f0[vuv == 0] = 0.0

        ap = pyworld.decode_aperiodicity(
            np.ascontiguousarray(bap.reshape(-1, 1), np.float64),
            hparams.synth_fs,
            fft_size)

        # NOTE(review): no frame period is passed, so pyworld's default is
        # used; confirm it matches the analysis frame period.
        waveform = pyworld.synthesize(f0, sp, ap, hparams.synth_fs)
        # Does inplace conversion, if possible.
        waveform = waveform.astype(np.float32, copy=False)

        # Always save as wav file first and convert afterwards if necessary.
        # The original call also passed "_WORLD" and ".wav" as extra format
        # arguments; the template has three fields, so they were ignored and
        # are dropped here. The resulting file name is unchanged.
        wav_file_path = os.path.join(
            save_dir,
            "{}{}{}.wav".format(os.path.basename(id_name),
                                model_suffix,
                                hparams.synth_file_suffix))
        makedirs_safe(save_dir)
        soundfile.write(wav_file_path, waveform, hparams.synth_fs)

        # Use PyDub for special audio formats.
        if hparams.synth_ext.lower() != 'wav':
            as_wave = pydub.AudioSegment.from_wav(wav_file_path)
            # NOTE(review): the export uses the full id_name while the wav
            # file uses its basename; confirm this difference is intended.
            as_wave.export(
                os.path.join(save_dir, id_name + "." + hparams.synth_ext),
                format=hparams.synth_ext)
            os.remove(wav_file_path)
def generate_file(path):
    """Extract low/high spectrograms from one wave file and save them as .npy.

    The result goes to ``arguments.output_directory`` under the input's stem.
    An existing output is kept unless ``arguments.enable_overwrite`` is set.

    Args:
        path: Path of the input wave file (needs a ``stem`` attribute).
    """
    out = Path(arguments.output_directory, path.stem + '.npy')
    already_done = out.exists() and not arguments.enable_overwrite
    if already_done:
        return

    # Load wave and padding.
    load_wave = WaveFileLoadProcess(
        sample_rate=arguments.sample_rate,
        top_db=arguments.top_db,
        pad_second=arguments.pad_second,
    )
    wave = load_wave(path, test=True)

    # Make acoustic feature.
    make_feature = AcousticFeatureProcess(
        frame_period=arguments.frame_period,
        order=arguments.order,
        alpha=arguments.alpha,
        f0_estimating_method=arguments.f0_estimating_method,
    )
    feature = make_feature(wave, test=True)
    feature = feature.astype_only_float(numpy.float32)

    fft_size = pyworld.get_cheaptrick_fft_size(arguments.sample_rate)
    spectrograms = {
        # Rebuilt from the mel-cepstrum.
        'low': pysptk.mc2sp(feature.mfcc, alpha=arguments.alpha, fftlen=fft_size),
        # The analysed spectrogram.
        'high': feature.spectrogram,
    }

    # Save.
    numpy.save(out.absolute(), spectrograms)
def __init__(
        self,
        acoustic_feature_param: AcousticFeatureParam,
        out_sampling_rate: int,
        buffer_size: int,
        number_of_pointers: int,
):
    """Initialise the parent class and a native WORLD synthesizer struct.

    Args:
        acoustic_feature_param: Forwarded to the parent initialiser.
        out_sampling_rate: Forwarded to the parent initialiser; also used to
            derive the CheapTrick FFT size.
        buffer_size: Stored on the instance and handed to the native
            initialiser as its buffer size.
        number_of_pointers: Handed to the native initialiser.
    """
    super().__init__(
        acoustic_feature_param=acoustic_feature_param,
        out_sampling_rate=out_sampling_rate,
    )
    self.buffer_size = buffer_size
    self._synthesizer = structures.WorldSynthesizer()

    fft_size = pyworld.get_cheaptrick_fft_size(out_sampling_rate)
    # Positional order: sampling rate, frame period, fft size, buffer size,
    # number of pointers, synthesizer struct.
    apidefinitions._InitializeSynthesizer(
        self.out_sampling_rate,
        self.acoustic_feature_param.frame_period,
        fft_size,
        buffer_size,
        number_of_pointers,
        self._synthesizer,
    )

    # "for holding memory" per the original note (presumably keeps buffers
    # referenced while the native side uses them -- confirm).
    self._before_buffer = []
def world_decode_spectral_envelop(coded_sp, fs):
    """Decode a coded (mel-cepstral) spectral envelope with WORLD.

    Args:
        coded_sp: Coded spectral envelope.
        fs: Sampling rate; also determines the FFT size.

    Returns:
        The decoded spectral envelope.
    """
    fft_size = pyworld.get_cheaptrick_fft_size(fs)
    sp = pyworld.decode_spectral_envelope(coded_sp, fs, fft_size)
    return sp
from pysptk.synthesis import MLSADF, Synthesizer import librosa import librosa.display import IPython from IPython.display import Audio from os import listdir, path, makedirs, mkdir from os.path import join, expanduser DATA_ROOT = join(expanduser("~"), "Documents", "Hit", "GeneralResources", "VCProjectDate", "Training") print(DATA_ROOT) print(listdir(DATA_ROOT)) fs = 16000 fftlen = pyworld.get_cheaptrick_fft_size(fs) alpha = pysptk.util.mcepalpha(fs) order = 24 frame_period = 5 hop_length = int(fs * (frame_period * 0.001)) max_files = 530 # number of utterances to be used. test_size = 0.03 use_delta = True if use_delta: windows = [ (0, 0, np.array([1.0])), (1, 1, np.array([-0.5, 0.0, 0.5])), (1, 1, np.array([1.0, -2.0, 1.0])), ] else:
def decode_ap(ap: numpy.ndarray, sampling_rate: int):
    """Decode coded aperiodicity with WORLD.

    Args:
        ap: Coded aperiodicity; cast to float64 before decoding.
        sampling_rate: Sampling rate; also determines the FFT size.

    Returns:
        The output of ``pyworld.decode_aperiodicity``.
    """
    fft_size = pyworld.get_cheaptrick_fft_size(sampling_rate)
    coded = ap.astype(numpy.float64)
    return pyworld.decode_aperiodicity(coded, sampling_rate, fft_size)
def mc2sp(mc: numpy.ndarray, sampling_rate: int, alpha: float):
    """Convert mel-cepstrum to a spectral envelope with ``pysptk.mc2sp``.

    Args:
        mc: Mel-cepstral coefficients; cast to float64 before conversion.
        sampling_rate: Sampling rate used to pick the FFT size.
        alpha: All-pass constant passed to ``pysptk.mc2sp``.

    Returns:
        The output of ``pysptk.mc2sp``.
    """
    fft_size = pyworld.get_cheaptrick_fft_size(sampling_rate)
    mel_cepstrum = mc.astype(numpy.float64)
    return pysptk.mc2sp(mel_cepstrum, alpha=alpha, fftlen=fft_size)
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Run voice conversion on a waveform and synthesise the result.

    Args:
        model: Conversion model. If ``model.include_parameter_generation()``
            is true it is called as ``model(features, R, lengths=...)`` and
            must return ``(y_hat, y_hat_static)``; otherwise it is called as
            ``model(features, lengths=...)`` and MLPG is applied afterwards.
        x: Input waveform; cast to float64.
        fs: Sampling rate.
        data_mean: Mean used for scaling / inverse scaling.
        data_std: Standard deviation used for scaling / inverse scaling.
        diffvc: When true, synthesise with an MLSA filter on the
            differential mel-cepstrum; otherwise use WORLD synthesis.

    Returns:
        Tuple ``(waveform, inputs, outputs)`` where ``inputs`` are the static
        input mel-cepstra and ``outputs`` the denormalised static
        predictions.
    """
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)

    # WORLD analysis.
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    alpha = pysptk.util.mcepalpha(fs)
    full_mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)

    # Column 0 (power) is set aside; the remaining columns go to the model.
    c0 = full_mc[:, 0]
    mc = full_mc[:, 1:]
    static_dim = mc.shape[-1]

    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)
    num_frames = mc.shape[0]

    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = Variable(torch.from_numpy(P.scale(mc, data_mean, data_std)))
    lengths = [len(mc_scaled)]

    # Add batch axis
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # For MLPG
    R = torch.from_numpy(unit_variance_mlpg_matrix(hp.windows, num_frames))

    # Apply model
    if model.include_parameter_generation():
        # Case: models include parameter generation in itself
        # Multistream features cannot be used in this case
        _, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    static_pred = P.inv_scale(
        static_pred, data_mean[:static_dim], data_std[:static_dim])
    outputs = static_pred.copy()

    if diffvc:
        static_pred = static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], static_pred))

    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs