def merlin_post_filter(mgc, alpha, minimum_phase_order=511, fftlen=2048, coef=1.4, weight=None):
    """Apply a Merlin-style post-filter to a sequence of mel-cepstrum frames.

    Every frame is multiplied element-wise by ``weight``; the 0-th MLSA
    coefficient of the weighted frames is then corrected by half the log
    ratio of the 0-th autocorrelation coefficients computed from the
    original and from the weighted cepstra.

    Args:
        mgc (ndarray): 2-D array; the second axis (size ``D``) is the
            cepstral dimension.
        alpha (float): Frequency warping factor passed to ``pysptk.freqt``
            (negated), ``pysptk.mc2b`` and ``pysptk.b2mc``.
        minimum_phase_order (int): Output order requested from
            ``pysptk.freqt``.
        fftlen (int): FFT length passed to ``pysptk.c2acr``.
        coef (float): Weight used for every dimension except the first two
            when ``weight`` is not given.
        weight (ndarray, optional): Per-dimension weights of length ``D``.
            Defaults to ``coef`` everywhere except the first two entries,
            which are 1.

    Returns:
        ndarray: Result of ``pysptk.b2mc`` on the corrected coefficients.

    Raises:
        AssertionError: If ``len(weight) != D``.
    """
    _, D = mgc.shape
    if weight is None:
        weight = np.ones(D) * coef
        weight[:2] = 1
    assert len(weight) == D

    # The weighted cepstrum and its MLSA coefficients are needed several
    # times below; compute each exactly once.
    weighted_mgc = mgc * weight
    weighted_b = pysptk.mc2b(weighted_mgc, alpha)

    # 0-th autocorrelation coefficient before / after weighting.
    mgc_r0 = pysptk.c2acr(
        pysptk.freqt(mgc, minimum_phase_order, alpha=-alpha), 0, fftlen).flatten()
    mgc_p_r0 = pysptk.c2acr(
        pysptk.freqt(weighted_mgc, minimum_phase_order, -alpha), 0, fftlen).flatten()

    # Correct the 0-th coefficient by half the log ratio of the two r0 values.
    mgc_p_b0 = np.log(mgc_r0 / mgc_p_r0) / 2 + weighted_b[:, 0]

    return pysptk.b2mc(
        np.hstack((mgc_p_b0[:, None], weighted_b[:, 1:])), alpha)
def synthesize_from_MCEP(self, mcep, pitch):
    """Synthesize a signal from mel-cepstra and a pitch sequence.

    Args:
        mcep (ndarray): Mel-cepstrum array handed to ``pysptk.mc2b``.
        pitch (ndarray): Pitch sequence handed to ``pysptk.excite`` after
            conversion to float64.

    Returns:
        Output of ``self.synthesizer.synthesis`` for the generated
        excitation and the MLSA filter coefficients.
    """
    # A C-ordered copy works around the "ndarray not C-contiguous" error.
    contiguous_mcep = mcep.copy(order='C')
    filter_coefs = pysptk.mc2b(contiguous_mcep, self.alpha)
    source = pysptk.excite(pitch.astype(np.float64), self.hop_length)
    return self.synthesizer.synthesis(
        source.astype(np.float64), filter_coefs.astype(np.float64))
def test_merlin_post_filter():
    """Check every stage of the post-filter against stored reference files.

    The reference arrays are read as float32 from the
    ``merlin_post_filter`` directory under ``DATA_DIR``. Each intermediate
    quantity is recomputed with pysptk and compared with ``np.allclose``;
    finally ``merlin_post_filter`` itself is compared with the stored result.
    """
    root = join(DATA_DIR, "merlin_post_filter")

    def load(name):
        # All reference files are raw float32 dumps.
        return np.fromfile(join(root, name), dtype=np.float32)

    mgc = load("arctic_b0539.mgc").reshape(-1, 60)
    weight = load("weight")
    alpha = 0.58
    minimum_phase_order = 511
    fftlen = 1024
    coef = 1.4

    # Step 1: r0 of the unweighted cepstrum
    expected_r0 = load("arctic_b0539.mgc_r0")
    r0_hat = pysptk.c2acr(
        pysptk.freqt(mgc, minimum_phase_order, alpha=-alpha),
        0, fftlen).flatten()
    assert np.allclose(expected_r0, r0_hat)

    # Step 2: r0 of the weighted cepstrum
    expected_p_r0 = load("arctic_b0539.mgc_p_r0")
    p_r0_hat = pysptk.c2acr(
        pysptk.freqt(mgc * weight, minimum_phase_order, -alpha),
        0, fftlen).flatten()
    assert np.allclose(expected_p_r0, p_r0_hat)

    # Step 3: b0 of the weighted cepstrum
    expected_b0 = load("arctic_b0539.mgc_b0")
    b0_hat = pysptk.mc2b(weight * mgc, alpha)[:, 0]
    assert np.allclose(expected_b0, b0_hat)

    # Step 4: corrected b0
    expected_p_b0 = load("arctic_b0539.mgc_p_b0")
    p_b0_hat = np.log(r0_hat / p_r0_hat) / 2 + b0_hat
    assert np.allclose(expected_p_b0, p_b0_hat)

    # Final step: back to mel-cepstrum
    expected_p_mgc = load("arctic_b0539.mgc_p_mgc").reshape(-1, 60)
    remaining_b = pysptk.mc2b(mgc * weight, alpha)[:, 1:]
    p_mgc_hat = pysptk.b2mc(np.hstack((p_b0_hat[:, None], remaining_b)), alpha)
    assert np.allclose(expected_p_mgc, p_mgc_hat)

    # The packaged implementation must reproduce the stored result.
    filtered_mgc = merlin_post_filter(
        mgc, alpha, coef=coef, weight=weight,
        minimum_phase_order=minimum_phase_order, fftlen=fftlen)
    assert np.allclose(filtered_mgc, expected_p_mgc, atol=1e-6)
def spec_to_waveform(spectrogram, order, fs, frame_period):
    """Run an MLSA filter built from ``spectrogram`` and return its output.

    Args:
        spectrogram: Spectral envelope handed to ``pysptk.sp2mc``.
        order (int): Mel-cepstrum order, also used for the MLSA filter.
        fs (int): Sampling rate; determines ``alpha`` via
            ``pysptk.util.mcepalpha`` and, with ``frame_period``, the hop size.
        frame_period (float): Multiplied by ``fs * 0.001`` to get the hop
            size — presumably milliseconds; confirm against callers.

    Returns:
        Output of ``Synthesizer.synthesis``.
    """
    warp = pysptk.util.mcepalpha(fs)
    hop = int(fs * (frame_period * 0.001))
    mel_cepstrum = pysptk.sp2mc(spectrogram, order=order, alpha=warp)
    synth = Synthesizer(MLSADF(order=order, alpha=warp), hopsize=hop)
    filter_coefs = pysptk.mc2b(mel_cepstrum.astype(np.float64), alpha=warp)
    # NOTE(review): `x` is neither a parameter nor a local of this function.
    # It is presumably a module-level signal defined elsewhere; if not, this
    # line raises NameError — confirm and pass the signal in explicitly.
    return synth.synthesis(x, filter_coefs)
def gaussian_voice_conversion(model, audio_path, windows=default_windows, frame_period=default_frame_period, order=default_order, alpha=default_alpha, hop_length=default_hop_length):
    """Convert the utterance at ``audio_path`` with a differential MLSA filter.

    The mel-cepstrum of the input is transformed by an ``MLPG`` parameter
    generator built from ``model`` (``diff=True``). The result, with its
    0-th coefficient zeroed, drives an MLSA filter that is applied to the
    original audio samples.

    Args:
        model: Model handed to ``utilities.math.MLPG``.
        audio_path: Path readable by ``scipy.io.wavfile.read``.
        windows: Delta windows for ``apply_delta`` and ``MLPG``.
        frame_period: Frame period passed to ``pyworld.dio``.
        order: Mel-cepstrum / MLSA filter order.
        alpha: Frequency warping factor.
        hop_length: Hop size of the synthesizer.
            NOTE(review): presumably must equal
            ``sampling_rate * frame_period / 1000`` — confirm.

    Returns:
        numpy.ndarray: int16 samples, saturated to the int16 range.
    """
    paramgen = utilities.math.MLPG(model, windows=windows, diff=True)

    sampling_rate, audio_data = scipy.io.wavfile.read(audio_path)
    audio_data = audio_data.astype(numpy.float64)

    # WORLD analysis. The aperiodicity (d4c) used to be computed here as
    # well, but it was never consumed, so that call has been removed.
    fundamental_frequency, time_axis = pyworld.dio(
        audio_data, sampling_rate, frame_period=frame_period)
    fundamental_frequency = pyworld.stonemask(
        audio_data, fundamental_frequency, time_axis, sampling_rate)
    spectrogram = pyworld.cheaptrick(
        audio_data, fundamental_frequency, time_axis, sampling_rate)

    # Mel-cepstrum: keep c0 aside, convert the remaining coefficients.
    mel_coefficients = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    c0, mel_coefficients = mel_coefficients[:, 0], mel_coefficients[:, 1:]
    mel_coefficients = utilities.math.apply_delta(mel_coefficients, windows)
    mel_coefficients = paramgen.transform(mel_coefficients)
    mel_coefficients = numpy.hstack((c0[:, None], mel_coefficients))

    # Zero the 0-th coefficient before building the filter.
    mel_coefficients[:, 0] = 0
    engine = pysptk.synthesis.Synthesizer(
        pysptk.synthesis.MLSADF(order=order, alpha=alpha), hopsize=hop_length)
    mlsa_coefficients = pysptk.mc2b(
        mel_coefficients.astype(numpy.float64), alpha=alpha)
    waveform = engine.synthesis(audio_data, mlsa_coefficients)

    # The numpy.int16 is really important, otherwise it would produce
    # nonsensical wave files when saved with scipy. Saturate first: casting
    # an out-of-range float to int16 does not clip and corrupts the samples.
    int16_range = numpy.iinfo(numpy.int16)
    clipped = numpy.clip(waveform, int16_range.min, int16_range.max)
    return numpy.asarray(clipped, dtype=numpy.int16)
def test_vc_from_path(model, path, data_mean, data_std, diffvc=True):
    """Convert the utterance stored at ``path`` with ``model`` and synthesize it.

    Args:
        model: Network called as ``model(features, R)`` and returning
            ``(y_hat, y_hat_static)``.
        path: Wave file readable by ``wavfile.read``.
        data_mean: Mean used by ``P.scale`` / ``P.inv_scale``.
        data_std: Standard deviation used by ``P.scale`` / ``P.inv_scale``.
        diffvc (bool): If True, filter the input waveform with an MLSA
            filter built from the predicted-minus-source mel-cepstrum.
            Otherwise resynthesize with ``pyworld.synthesize``.

    Returns:
        tuple: ``(waveform, inputs, outputs)`` where ``inputs`` are the
        static source features and ``outputs`` the denormalized static
        predictions.
    """
    model.eval()

    fs, x = wavfile.read(path)
    x = x.astype(np.float64)
    # Derive the hop size from this file's sampling rate and the analysis
    # frame period instead of relying on a name defined outside the function;
    # the MLSA filter must advance exactly one analysis frame per hop.
    hop_length = int(fs * (hp.frame_period * 0.001))

    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)

    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]
    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    # Apply model
    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)
    y_hat, y_hat_static = model(mc_scaled, R)
    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(
        mc_static_pred, data_mean[:static_dim], data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        # Differential features: prediction minus source statics.
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        # Aperiodicity is only needed for WORLD resynthesis, so the d4c
        # analysis is done only on this branch.
        aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs
def pysptk_imfcc(self):
    """Resynthesize a signal from ``self.mc`` and plot its waveform.

    Reads ``self.mc``, ``self.alpha``, ``self.order``, ``self.hop_length``,
    ``self.source_excitation`` and ``self.sr``. Returns nothing; the only
    visible effect is the ``librosa.display.waveplot`` call.
    """
    from pysptk.synthesis import MLSADF, Synthesizer

    # Convert mel-cepstrum to MLSADF coefficients
    b = pysptk.mc2b(self.mc, self.alpha)

    synthesizer = Synthesizer(
        MLSADF(order=self.order, alpha=self.alpha), self.hop_length)
    x_synthesized = synthesizer.synthesis(self.source_excitation, b)

    librosa.display.waveplot(x_synthesized, sr=self.sr)
def convert_mcep_to_mlsa_coef(avg_mcep, mag, alpha):
    """Convert an averaged mel-cepstrum to MLSA filter coefficients.

    The cepstrum is scaled by ``mag`` and its 0-th element is zeroed before
    conversion. The caller's array is left untouched.

    Args:
        avg_mcep (ndarray): Averaged Mel-cepstrum (D,).
        mag (float): Magnification of noise shaping.
        alpha (float): All pass constant value.

    Return:
        ndarray: MLSA filter coefficient (D,).

    Raises:
        AssertionError: If any resulting coefficient is not finite.

    """
    # Scale into a fresh array: the previous in-place `*=` silently
    # overwrote the array owned by the caller.
    scaled_mcep = np.asarray(avg_mcep) * mag
    scaled_mcep[0] = 0.0
    coef = pysptk.mc2b(scaled_mcep.astype(np.float64), alpha)
    assert np.isfinite(coef).all()
    return coef
def test_one_utt(src_path, tgt_path, disable_mlpg=False, diffvc=True):
    """Convert one source utterance with the GMM and return the waveform.

    Relies on names defined outside this function (``gmm``, ``windows``,
    ``frame_period``, ``order``, ``alpha``, ``use_delta``, ``static_dim``,
    ``hop_length``, ``fftlen``).

    Args:
        src_path: Source wave file readable by ``wavfile.read``.
        tgt_path: Not used by this function.
        disable_mlpg (bool): If True, build the parameter generator with a
            single identity window instead of ``windows``.
        diffvc (bool): If True, filter the source waveform with an MLSA
            filter; otherwise resynthesize with ``pyworld.synthesize``.

    Returns:
        The synthesized waveform.
    """
    # GMM-based parameter generation is provided by the library in `baseline` module
    if disable_mlpg:
        # Force disable MLPG
        paramgen = MLPG(gmm, windows=[(0, 0, np.array([1.0]))], diff=diffvc)
    else:
        paramgen = MLPG(gmm, windows=windows, diff=diffvc)

    fs, x = wavfile.read(src_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    # A leftover pdb.set_trace() used to sit here and stopped every call in
    # the debugger; it has been removed.

    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    c0, mc = mc[:, 0], mc[:, 1:]
    if use_delta:
        mc = delta_features(mc, windows)
    mc = paramgen.transform(mc)
    if disable_mlpg and mc.shape[-1] != static_dim:
        # Keep only the static part of the generated features.
        mc = mc[:, :static_dim]
    assert mc.shape[-1] == static_dim
    mc = np.hstack((c0[:, None], mc))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=order, alpha=alpha), hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, frame_period)

    return waveform
def apply_mlsa_filter(wav, mcep):
    """Filter ``wav`` with an MLSA filter built from ``mcep``.

    If the sampling rates differ, ``mcep`` is first brought to ``wav.fs``:
    a higher-rate ``mcep`` is resampled directly; a lower-rate one has its
    spectrum envelope resampled, the bins from ``cutoff`` upwards filled
    with the value of bin ``cutoff - 1``, and a new mel-cepstrum extracted.

    Args:
        wav: Object exposing ``fs`` and ``data``.
        mcep: Mel-cepstrum object exposing ``fs``, ``frame_period``,
            ``order``, ``data``, ``alpha()`` and ``extract_spectrum()``.

    Returns:
        ``kwiiyatta.Wavdata`` holding the filtered signal at ``wav.fs``.
    """
    if mcep.fs > wav.fs:
        mcep = kwiiyatta.resample(mcep, wav.fs)
    elif mcep.fs < wav.fs:
        spec = kwiiyatta.Synthesizer.resample_spectrum_envelope(
            mcep.extract_spectrum(), mcep.fs, wav.fs)
        cutoff = mcep.fs * spec.shape[1] // wav.fs
        # Repeat the last bin below the cutoff across all bins above it.
        edge_column = np.atleast_2d(spec[:, cutoff - 1]).T
        fill_width = spec.shape[-1] - cutoff
        spec[:, cutoff:] = np.tile(edge_column, fill_width)
        mcep = kwiiyatta.MelCepstrum(wav.fs, mcep.frame_period)
        mcep.extract(spec)

    # remove power coefficients
    zero_power = np.zeros((mcep.data.shape[0], 1))
    mc = np.hstack((zero_power, mcep.data[:, 1:]))
    alpha = mcep.alpha()

    mlsa_filter = MLSADF(order=mcep.order, alpha=alpha)
    hop = int(mcep.fs * (mcep.frame_period * 0.001))
    engine = Synthesizer(mlsa_filter, hopsize=hop)
    filter_coefs = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
    return kwiiyatta.Wavdata(wav.fs, engine.synthesis(wav.data, filter_coefs))
def __test_synthesis(filt):
    """Check that synthesis with ``filt`` yields only finite samples.

    The check runs twice: with a default ``Synthesizer`` and with one
    constructed with ``transpose=True``.
    """
    # dummy source excitation
    source = __dummy_source()
    hopsize = 80

    # dummy filter coef.
    windowed = __dummy_windowed_frames(source, frame_len=512, hopsize=hopsize)
    mc = pysptk.mcep(windowed, filt.order, filt.alpha)
    b = pysptk.mc2b(mc, filt.alpha)

    # plain synthesis first, then the transposed variant
    for extra_kwargs in ({}, {"transpose": True}):
        synthesizer = Synthesizer(filt, hopsize, **extra_kwargs)
        y = synthesizer.synthesis(source, b)
        assert np.all(np.isfinite(y))
def test_one_utt(path_src, path_tgt, disable_mlpg=False, diffvc=True):
    """Convert one source utterance with the GMM and return the waveform.

    Relies on names defined outside this function (``gmm``, ``windows``,
    ``frame_period``, ``order``, ``alpha``, ``use_delta``, ``static_dim``,
    ``hop_length``, ``fftlen``).

    Args:
        path_src: Source audio file readable by ``sf.read``.
        path_tgt: Not used by this function.
        disable_mlpg (bool): If True, use a single identity window for the
            parameter generator instead of ``windows``.
        diffvc (bool): If True, filter the source waveform with an MLSA
            filter; otherwise resynthesize with ``pyworld.synthesize``.

    Returns:
        The synthesized waveform.
    """
    mlpg_windows = [(0, 0, np.array([1.0]))] if disable_mlpg else windows
    paramgen = MLPG(gmm, windows=mlpg_windows, diff=diffvc)

    x, fs_ = sf.read(path_src)
    x = x.astype(np.float64)

    # WORLD analysis
    f0, time_axis = pyworld.dio(x, fs_, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, time_axis, fs_)
    spectrogram = pyworld.cheaptrick(x, f0, time_axis, fs_)
    aperiodicity = pyworld.d4c(x, f0, time_axis, fs_)

    full_mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    c0 = full_mc[:, 0]
    mc = full_mc[:, 1:]
    if use_delta:
        mc = delta_features(mc, windows)
    mc = paramgen.transform(mc)
    if disable_mlpg and mc.shape[-1] != static_dim:
        # Keep only the static part of the generated features.
        mc = mc[:, :static_dim]
    assert mc.shape[-1] == static_dim
    mc = np.hstack((c0[:, None], mc))

    if not diffvc:
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        return pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs_, frame_period)

    # Differential filter: zero the 0-th coefficient, then filter the source.
    mc[:, 0] = 0
    engine = Synthesizer(MLSADF(order=order, alpha=alpha), hopsize=hop_length)
    b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
    return engine.synthesis(x, b)
# Pitch track from pysptk.swipe and the excitation signal generated from it.
pitch = pysptk.swipe(
    x.astype(np.float64), fs=sr, hopsize=hop_length,
    min=60, max=240, otype="pitch")
source_excitation = pysptk.excite(pitch, hop_length)

# Mel-cepstrum of the given order, plus the real part of its spectrum.
mc = pysptk.mcep(frames, order, alpha)
logH = pysptk.mgc2sp(mc, alpha, 0.0, frame_length).real
print(mc.shape)

# Convert mel-cepstrum to MLSADF coefficients and run the filter.
b = pysptk.mc2b(mc, alpha)
synthesizer = Synthesizer(MLSADF(order=order, alpha=alpha), hop_length)
x_synthesized = synthesizer.synthesis(source_excitation, b)

# The output file is named after the number of coefficients (order + 1).
filenam = "synthesized_sounds/x_syn{}.wav".format(order + 1)
wavfile.write(filenam, sr, x_synthesized)

# Log "<order>,<elapsed seconds>" for this run.
time_total = time.time() - start
writestring = "{},{}\n".format(order, time_total)
f.write(writestring)
# Parameter generation paramgen = MLPG(gmm, windows=windows, diff=True) # Waveform generation for test set for idx, path in enumerate(source.test_paths): fs, x = wavfile.read(path) x = x.astype(np.float64) f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period) spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs) # aperiodicity = pyworld.d4c(x, f0, timeaxis, fs) mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha) c0, mc = mc[:, 0], mc[:, 1:] mc = delta_features(mc, windows) since = time.time() mc = paramgen.transform(mc) print("{}, Elapsed time in conversion: {}s".format(idx, time.time() - since)) assert mc.shape[-1] == static_dim mc = np.hstack((c0[:, None], mc)) mc[:, 0] = 0 engine = Synthesizer(MLSADF(order=order, alpha=alpha), hopsize=80) b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha) waveform = engine.synthesis(x, b) if not exists('resultsVC'): os.makedirs('resultsVC') wavfile.write( "resultsVC/{}_{}.wav".format(splitext(basename(path))[0], 'mlpg'), fs, waveform.astype(np.int16))
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Convert waveform ``x`` with ``model`` and synthesize the result.

    Args:
        model: Network; either performs parameter generation itself
            (``include_parameter_generation()`` is true) or returns features
            that are passed through ``multi_stream_mlpg``.
        x (ndarray): Input samples.
        fs (int): Sampling rate of ``x``.
        data_mean: Mean used by ``P.scale`` / ``P.inv_scale``.
        data_std: Standard deviation used by ``P.scale`` / ``P.inv_scale``.
        diffvc (bool): If True, filter ``x`` with an MLSA filter built from
            the predicted-minus-source mel-cepstrum; otherwise resynthesize
            with ``pyworld.synthesize``.

    Returns:
        tuple: ``(waveform, inputs, outputs)``.
    """
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))
    x = x.astype(np.float64)

    # WORLD analysis
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    # Mel-cepstrum, split into c0 and the remaining coefficients
    alpha = pysptk.util.mcepalpha(fs)
    full_mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0 = full_mc[:, 0]
    mc = full_mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]
    inputs = mc[:, :static_dim].copy()

    # Normalize and wrap for the network
    mc_scaled = Variable(torch.from_numpy(P.scale(mc, data_mean, data_std)))
    lengths = [len(mc_scaled)]
    # Add batch axis
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # MLPG matrix
    R = torch.from_numpy(unit_variance_mlpg_matrix(hp.windows, T))

    # Apply model
    if model.include_parameter_generation():
        # The model generates static parameters itself; multistream
        # features cannot be used in this case.
        _, y_hat_static = model(mc_scaled, R, lengths=lengths)
    else:
        # Generic model (may be a sequence model): run MLPG afterwards.
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    # Back to numpy and denormalize
    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)
    mc_static_pred = P.inv_scale(
        mc_static_pred, data_mean[:static_dim], data_std[:static_dim])
    outputs = mc_static_pred.copy()

    if diffvc:
        # Differential features: prediction minus source statics.
        differential = mc_static_pred - mc[:, :static_dim]
        mc = np.hstack((c0[:, None], differential))
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        mc = np.hstack((c0[:, None], mc_static_pred))
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs
# 音声の分析 (基本周波数、スペクトル包絡、非周期性指標) _, sp, _ = pyworld.wav2world(x, fs) # メルケプストラム係数の抽出 from WORLDのスペクトル包絡 mcep = pysptk.sp2mc(sp, order=ORDER, alpha=ALPHA) # ピッチ抽出 pitch = pysptk.swipe(x, fs=fs, hopsize=HOP_LENGTH, min=MIN_F0, max=MAX_F0, otype="pitch") # 励振源信号(声帯音源)の生成 source_excitation = pysptk.excite(pitch, HOP_LENGTH) # メルケプストラム係数からMLSAディジタルフィルタ係数に変換 mlsa_coef = pysptk.mc2b(mcep, ALPHA) # MLSAフィルタの作成 synthesizer = Synthesizer(MLSADF(order=ORDER, alpha=ALPHA), HOP_LENGTH) # 励振源信号でMLSAフィルタを駆動して音声を合成 y = synthesizer.synthesis(source_excitation, mlsa_coef) # 音声の書き込み y = y.astype(np.int16) wavfile.write(OUT_WAVE_FILE, fs, y)
def synthesis_mel_cepstrum(mc, source_excitation):
    """Filter ``source_excitation`` with an MLSA filter derived from ``mc``.

    Uses the module-level ``ALPHA``, ``ORDER`` and ``HOP_LENGTH`` settings.

    Args:
        mc: Mel-cepstrum handed to ``ps.mc2b``.
        source_excitation: Signal handed to ``Synthesizer.synthesis``.

    Returns:
        The synthesized signal.
    """
    filter_coefs = ps.mc2b(mc, ALPHA)
    mlsa_filter = ps.synthesis.MLSADF(order=ORDER, alpha=ALPHA)
    engine = ps.synthesis.Synthesizer(mlsa_filter, HOP_LENGTH)
    return engine.synthesis(source_excitation, filter_coefs)
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Run voice conversion on waveform ``x`` and return the synthesized audio.

    Args:
        model: Network; if ``include_parameter_generation()`` is true it
            returns static features directly, otherwise its output is passed
            through ``multi_stream_mlpg``.
        x (ndarray): Input samples.
        fs (int): Sampling rate of ``x``.
        data_mean: Mean used for scaling and inverse scaling.
        data_std: Standard deviation used for scaling and inverse scaling.
        diffvc (bool): Selects MLSA filtering of ``x`` (True) or WORLD
            resynthesis (False).

    Returns:
        tuple: ``(waveform, inputs, outputs)`` — the synthesized signal, the
        static source features and the denormalized static predictions.
    """
    model.eval()

    frame_period = hp.frame_period
    hop_length = int(fs * (frame_period * 0.001))
    x = x.astype(np.float64)

    # Analysis with WORLD
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0 = mc[:, 0]
    mc = mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]
    source_static = mc[:, :static_dim]
    inputs = source_static.copy()

    # Normalization
    scaled = P.scale(mc, data_mean, data_std)
    mc_scaled = Variable(torch.from_numpy(scaled))
    lengths = [len(mc_scaled)]

    # Add batch axis
    mc_scaled = mc_scaled.view(1, -1, mc_scaled.size(-1))

    # For MLPG
    R = torch.from_numpy(unit_variance_mlpg_matrix(hp.windows, T))

    # Apply model
    if not model.include_parameter_generation():
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(mc_scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)
    else:
        # Case: models include parameter generation in itself.
        # Multistream features cannot be used in this case.
        y_hat, y_hat_static = model(mc_scaled, R, lengths=lengths)

    predicted = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    predicted = P.inv_scale(
        predicted, data_mean[:static_dim], data_std[:static_dim])
    outputs = predicted.copy()

    if not diffvc:
        mc = np.hstack((c0[:, None], predicted))
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)
        return waveform, inputs, outputs

    # Differential conversion: filter the input with the feature difference.
    mc = np.hstack((c0[:, None], predicted - source_static))
    mc[:, 0] = 0  # remove power coefficients
    engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                         hopsize=hop_length)
    b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
    waveform = engine.synthesis(x, b)
    return waveform, inputs, outputs