def spec_to_waveform(spectrogram, order, fs, frame_period):
    """Filter a signal with an MLSA filter derived from a spectral envelope.

    The spectrogram is converted to a mel-cepstrum of the requested order,
    the mel-cepstrum is converted to MLSA filter coefficients, and those
    coefficients drive the filter applied to ``x``.

    NOTE(review): ``x`` is not a parameter of this function; it is looked up
    in the enclosing/module scope at call time -- confirm this is intended.

    Parameters
    ----------
    spectrogram : array
        Spectral envelope handed to ``pysptk.sp2mc``.
    order : int
        Mel-cepstrum order (also the order of the MLSA filter).
    fs : int
        Sampling frequency; selects the warping coefficient and hop size.
    frame_period : float
        Frame shift, scaled by 0.001 before being multiplied by ``fs``
        (presumably milliseconds).

    Returns
    -------
    The output of ``Synthesizer.synthesis``.
    """
    warp = pysptk.util.mcepalpha(fs)
    samples_per_frame = int(fs * (frame_period * 0.001))

    mel_cepstrum = pysptk.sp2mc(spectrogram, order=order, alpha=warp)
    mlsa = Synthesizer(MLSADF(order=order, alpha=warp),
                       hopsize=samples_per_frame)
    filter_coefs = pysptk.mc2b(mel_cepstrum.astype(np.float64), alpha=warp)
    return mlsa.synthesis(x, filter_coefs)
def test_vc_from_path(model, path, data_mean, data_std, diffvc=True):
    """Convert the utterance stored at ``path`` with ``model`` and synthesize it.

    The wav file is analysed with WORLD (f0, spectral envelope, aperiodicity),
    the envelope is converted to mel-cepstrum, smoothed, extended with delta
    features, normalized, and fed to the model together with the MLPG matrix.

    Parameters
    ----------
    model : callable network
        Called as ``model(features, R)``; must return ``(y_hat, y_hat_static)``.
    path : str
        Path of the source wav file.
    data_mean, data_std : array
        Statistics used for scaling; their first ``static_dim`` entries are
        used to denormalize the static prediction.
    diffvc : bool
        If True, filter the input waveform with the differential mel-cepstrum
        (MLSA filter). If False, resynthesize with ``pyworld.synthesize``.

    Returns
    -------
    (waveform, inputs, outputs)
        The synthesized waveform, the static input mel-cepstrum (without the
        0th coefficient) and the denormalized static prediction.
    """
    model.eval()

    fs, x = wavfile.read(path)
    x = x.astype(np.float64)

    # The hop length was previously read from an undefined/outer name; derive
    # it from the sampling rate and the configured frame period instead.
    hop_length = int(fs * (hp.frame_period * 0.001))

    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    alpha = pysptk.util.mcepalpha(fs)
    mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    # Keep the 0th (power) coefficient aside; the model only sees the rest.
    c0, mc = mc[:, 0], mc[:, 1:]
    static_dim = mc.shape[-1]
    mc = P.modspec_smoothing(mc, fs / hop_length, cutoff=50)
    mc = P.delta_features(mc, hp.windows).astype(np.float32)

    T = mc.shape[0]

    inputs = mc[:, :static_dim].copy()

    # Normalization
    mc_scaled = P.scale(mc, data_mean, data_std)

    # Apply model
    mc_scaled = Variable(torch.from_numpy(mc_scaled))
    R = unit_variance_mlpg_matrix(hp.windows, T)
    R = torch.from_numpy(R)
    y_hat, y_hat_static = model(mc_scaled, R)
    mc_static_pred = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    mc_static_pred = P.inv_scale(
        mc_static_pred, data_mean[:static_dim], data_std[:static_dim])

    outputs = mc_static_pred.copy()

    if diffvc:
        # Differential features: prediction minus source
        mc_static_pred = mc_static_pred - mc[:, :static_dim]

    mc = np.hstack((c0[:, None], mc_static_pred))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs
def pysptk_imfcc(self):
    """Synthesize a waveform from ``self.mc`` with an MLSA filter and plot it.

    Reads ``self.mc``, ``self.alpha``, ``self.order``, ``self.hop_length``,
    ``self.source_excitation`` and ``self.sr``. Returns nothing; the only
    visible effect is the waveform plot.
    """
    from pysptk.synthesis import MLSADF, Synthesizer

    # Convert mel-cepstrum to MLSADF coefficients
    b = pysptk.mc2b(self.mc, self.alpha)

    synthesizer = Synthesizer(MLSADF(order=self.order, alpha=self.alpha),
                              self.hop_length)
    x_synthesized = synthesizer.synthesis(self.source_excitation, b)

    # `waveplot` was removed from newer librosa releases in favour of
    # `waveshow`; keep using `waveplot` where it still exists so older
    # environments behave exactly as before.
    plot_waveform = getattr(librosa.display, "waveplot", None)
    if plot_waveform is None:
        plot_waveform = librosa.display.waveshow
    plot_waveform(x_synthesized, sr=self.sr)
def synthesize(self, pitch, mc, unnormalize=False):
    """Synthesize a waveform from a pitch sequence and mel-cepstra.

    Parameters
    ----------
    pitch : sequence of float
        Pitch values passed to ``sptk.excite`` to build the excitation.
    mc : sequence of sequences / 2-D array
        Mel-cepstral coefficients, one row per frame.
    unnormalize : bool
        If True and ``self.mean`` is set, the first ``self.num_params + 1``
        coefficients of the first ``len(pitch)`` frames are mapped back with
        ``value * self.stdev + self.mean`` before synthesis.

    Returns
    -------
    The output of ``Synthesizer.synthesis``.

    Notes
    -----
    The caller's ``mc`` is no longer modified in place; the un-normalization
    is applied to a float64 copy.
    """
    # Work on a private float64 copy so the caller's data stays untouched.
    mc = np.array(mc, dtype=np.float64)
    pitch = np.asarray(pitch, dtype=np.float64)

    # `is not None` avoids the ambiguous element-wise comparison that
    # `!= None` produces when `self.mean` is a NumPy array.
    if unnormalize and self.mean is not None:
        n_frames = len(pitch)
        n_coefs = self.num_params + 1
        stdev = np.asarray(self.stdev, dtype=np.float64)[:n_coefs]
        mean = np.asarray(self.mean, dtype=np.float64)[:n_coefs]
        mc[:n_frames, :n_coefs] = mc[:n_frames, :n_coefs] * stdev + mean

    # The hop size must be an integer sample count; plain `/` yields a float
    # under Python 3.
    hop_size = int(self.frame_len * self.sample_rate / 1000)

    b = sptk.mc2b(mc, self.alpha)
    synthesizer = Synthesizer(
        MLSADF(order=self.num_params, alpha=self.alpha), hop_size)
    source_excitation = sptk.excite(pitch, hop_size)
    return synthesizer.synthesis(source_excitation, b)
def synthesis_diff(x, diffmcep, rmcep=None, alpha=MCEP_ALPHA, fs=FS, shiftms=SHIFTMS):
    """Filter a waveform with a differential mel-cepstrum.

    Parameters
    ----------
    x : array, shape (`samples`)
        Waveform to be filtered.
    diffmcep : array, shape (`T`, `dim`)
        Differential mel-cepstrum sequence.
    rmcep : array, shape (`T`, `dim`), optional
        Reference mel-cepstrum sequence. When given, the power of the
        differential mel-cepstrum is modified via ``mod_power``.
        Defaults to None.
    alpha : float, optional
        All-pass (frequency warping) parameter. Defaults to ``MCEP_ALPHA``.
    fs : int, optional
        Sampling frequency. Defaults to ``FS``.
    shiftms : float, optional
        Frame shift in milliseconds. Defaults to ``SHIFTMS``.

    Returns
    -------
    wav : array, shape (`samples`)
        Synthesized waveform.
    """
    signal = x.astype(np.float64)
    filter_order = diffmcep.shape[1] - 1
    hop = int(fs / 1000 * shiftms)

    if rmcep is not None:
        # power modification
        converted = rmcep + diffmcep
        diffmcep = mod_power(converted, rmcep, alpha=alpha) - rmcep

    # Row-wise conversion from mel-cepstrum to MLSA filter coefficients
    coefs = np.apply_along_axis(ps.mc2b, 1, diffmcep, alpha)
    assert np.isfinite(coefs).all()

    mlsa = MLSADF(filter_order, alpha=alpha)
    return ps.synthesis.Synthesizer(mlsa, hop).synthesis(signal, coefs)
def test_one_utt(src_path, tgt_path, disable_mlpg=False, diffvc=True):
    """Convert one source utterance with the GMM and synthesize the result.

    Parameters
    ----------
    src_path : str
        Path of the source wav file.
    tgt_path : str
        Not used by this function; kept for the call signature.
    disable_mlpg : bool
        If True, MLPG is built with a single static window, which effectively
        disables parameter generation.
    diffvc : bool
        If True, filter the input waveform with the differential
        mel-cepstrum (MLSA filter); otherwise resynthesize with WORLD.

    Returns
    -------
    The synthesized waveform.
    """
    # GMM-based parameter generation is provided by the library in `baseline` module
    if disable_mlpg:
        # Force disable MLPG
        paramgen = MLPG(gmm, windows=[(0, 0, np.array([1.0]))], diff=diffvc)
    else:
        paramgen = MLPG(gmm, windows=windows, diff=diffvc)

    fs, x = wavfile.read(src_path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)
    # (A leftover `pdb.set_trace()` breakpoint was removed here; it halted
    # every call and blocked non-interactive runs.)

    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    # Keep the 0th (power) coefficient aside during conversion
    c0, mc = mc[:, 0], mc[:, 1:]
    if use_delta:
        mc = delta_features(mc, windows)
    mc = paramgen.transform(mc)
    if disable_mlpg and mc.shape[-1] != static_dim:
        mc = mc[:, :static_dim]
    assert mc.shape[-1] == static_dim
    mc = np.hstack((c0[:, None], mc))
    if diffvc:
        mc[:, 0] = 0  # remove power coefficients
        engine = Synthesizer(MLSADF(order=order, alpha=alpha), hopsize=hop_length)
        b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, b)
    else:
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, frame_period)

    return waveform
def apply_mlsa_filter(wav, mcep):
    """Apply a mel-cepstrum as an MLSA filter to a waveform.

    If the sampling rates differ, the mel-cepstrum is first brought to the
    waveform's rate: it is resampled directly when its rate is higher;
    otherwise its spectral envelope is resampled, the bins from the cutoff
    upwards are filled with the last bin below the cutoff, and a new
    mel-cepstrum is extracted. The 0th coefficient is zeroed before filtering.

    Parameters
    ----------
    wav : object exposing ``fs`` and ``data``
    mcep : object exposing ``fs``, ``frame_period``, ``order``, ``data``,
        ``alpha()`` and ``extract_spectrum()``

    Returns
    -------
    kwiiyatta.Wavdata
        The filtered waveform at ``wav.fs``.
    """
    if mcep.fs > wav.fs:
        mcep = kwiiyatta.resample(mcep, wav.fs)
    elif mcep.fs < wav.fs:
        envelope = kwiiyatta.Synthesizer.resample_spectrum_envelope(
            mcep.extract_spectrum(), mcep.fs, wav.fs)
        first_pad_bin = mcep.fs * envelope.shape[1] // wav.fs
        pad_width = envelope.shape[-1] - first_pad_bin
        # Column vector holding the last bin below the cutoff, one per frame
        edge_column = np.atleast_2d(envelope[:, first_pad_bin - 1]).T
        envelope[:, first_pad_bin:] = np.tile(edge_column, pad_width)
        mcep = kwiiyatta.MelCepstrum(wav.fs, mcep.frame_period)
        mcep.extract(envelope)

    # remove power coefficients
    zero_power = np.zeros((mcep.data.shape[0], 1))
    mc = np.hstack((zero_power, mcep.data[:, 1:]))

    alpha = mcep.alpha()
    hop = int(mcep.fs * (mcep.frame_period * 0.001))
    engine = Synthesizer(MLSADF(order=mcep.order, alpha=alpha), hopsize=hop)
    coefs = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
    filtered = engine.synthesis(wav.data, coefs)
    return kwiiyatta.Wavdata(wav.fs, filtered)
def test_one_utt(path_src, path_tgt, disable_mlpg=False, diffvc=True):
    """Convert one source utterance with the GMM and return the waveform.

    ``path_src`` is read with ``sf.read``; ``path_tgt`` is not used here.
    With ``disable_mlpg`` the parameter generator gets a single static
    window. With ``diffvc`` the input waveform is filtered by the
    differential mel-cepstrum; otherwise WORLD resynthesis is used.
    """
    # A lone static window stands in for "no MLPG"
    mlpg_windows = [(0, 0, np.array([1.0]))] if disable_mlpg else windows
    paramgen = MLPG(gmm, windows=mlpg_windows, diff=diffvc)

    x, fs_ = sf.read(path_src)
    x = x.astype(np.float64)

    # WORLD analysis
    f0, time_axis = pyworld.dio(x, fs_, frame_period=frame_period)
    f0 = pyworld.stonemask(x, f0, time_axis, fs_)
    spectrogram = pyworld.cheaptrick(x, f0, time_axis, fs_)
    aperiodicity = pyworld.d4c(x, f0, time_axis, fs_)

    full_mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    power = full_mc[:, 0]
    feats = full_mc[:, 1:]
    if use_delta:
        feats = delta_features(feats, windows)
    feats = paramgen.transform(feats)
    if disable_mlpg and feats.shape[-1] != static_dim:
        feats = feats[:, :static_dim]
    assert feats.shape[-1] == static_dim

    mc = np.hstack((power[:, None], feats))

    if not diffvc:
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        return pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs_, frame_period)

    # Differential filtering: drop the power coefficient, then run MLSA
    mc[:, 0] = 0
    engine = Synthesizer(MLSADF(order=order, alpha=alpha), hopsize=hop_length)
    coefs = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
    return engine.synthesis(x, coefs)
def __test(order, alpha, pd):
    """Run the shared synthesis check on an MLSADF built with ``pd``."""
    mlsa_filter = MLSADF(order, alpha, pd=pd)
    __test_synthesis(mlsa_filter)
# Read the list of evaluation utterance names, one per line.
with open("conf/eval.list", "r") as f:
    datalist = [line.rstrip() for line in f]

for name in datalist:
    outfile = "result/wav/{}_diff.wav".format(name)

    # Converted and source mel-generalized cepstra, stored as raw
    # little-endian float64 with `dim` values per frame.
    with open("data/SF-TF/mgc/{}.mgc".format(name), "rb") as f:
        conv_mgc = np.fromfile(f, dtype="<f8", sep="")
        conv_mgc = conv_mgc.reshape(len(conv_mgc) // dim, dim)
    with open("data/SF/mgc/{}.mgc".format(name), "rb") as f:
        src_mgc = np.fromfile(f, dtype="<f8", sep="")
        src_mgc = src_mgc.reshape(len(src_mgc) // dim, dim)

    # Load the input speech itself
    fs, data = wavfile.read("data/SF/wav/{}.wav".format(name))
    # `np.float` was removed from NumPy; use the explicit 64-bit type.
    data = data.astype(np.float64)

    # Prepare the differential filter.
    # NOTE(review): the original overwrote this with an all-zero array right
    # after computing it, which turns the filter into an identity and writes
    # the unmodified input to the "_diff" file; that overwrite looked like a
    # debugging leftover and was dropped -- confirm.
    diff_mgc = conv_mgc - src_mgc

    # Apply the differential filter to the input waveform
    b = np.apply_along_axis(sptk.mc2b, 1, diff_mgc, alpha)
    # NOTE(review): hop size is hard-coded to 80 samples.
    synthesizer = Synthesizer(MLSADF(order=dim - 1, alpha=alpha), 80)
    owav = synthesizer.synthesis(data, b)

    # Clip to the int16 range before casting to avoid wrap-around
    owav = np.clip(owav, -32768, 32767)
    wavfile.write(outfile, fs, owav.astype(np.int16))
# Estimate a pitch contour with pysptk.swipe (min=60, max=240, otype="pitch")
# and build the excitation signal from it using the same hop size.
pitch = pysptk.swipe(x.astype(np.float64), fs=sr, hopsize=hop_length, min=60, max=240, otype="pitch")
source_excitation = pysptk.excite(pitch, hop_length)

# Order of mel-cepstrum
mc = pysptk.mcep(frames, order, alpha)
# Real part of the spectrum computed from the mel-cepstrum; not used further
# in the visible part of this script.
logH = pysptk.mgc2sp(mc, alpha, 0.0, frame_length).real
print(mc.shape)
#plt.plot(mc)
#plotname="x_syn_coefs_" + str(order) + ".png"
#plt.savefig(plotname)

# Convert mel-cepstrum to MLSADF coefficients
b = pysptk.mc2b(mc, alpha)

synthesizer = Synthesizer(MLSADF(order=order, alpha=alpha), hop_length)

x_synthesized = synthesizer.synthesis(source_excitation, b)

# Output file name carries order + 1
filenam = "synthesized_sounds/" + "x_syn" + str(order + 1) + ".wav"
#wavfile.write("x.wav", sr, x)
wavfile.write(filenam, sr, x_synthesized)

# Log "<order>,<elapsed seconds>" for this run; `start` and `f` are set
# outside the visible code.
time_total = time.time() - start
writestring = str(order) + "," + str(time_total) + "\n"
f.write(writestring)
def __test_invalid_pade(pd):
    """Construct an order-20 MLSADF with the given ``pd`` value.

    Nothing is returned; presumably the caller checks that construction
    raises for invalid values -- verify against the caller.
    """
    filter_order = 20
    MLSADF(filter_order, pd=pd)
def __test(order, alpha):
    """Run the shared synthesis check on an MLSADF built from the arguments."""
    mlsa_filter = MLSADF(order, alpha)
    __test_synthesis(mlsa_filter)
def test_vc_from_path(model, x, fs, data_mean, data_std, diffvc=True):
    """Convert the waveform ``x`` (sampled at ``fs``) with ``model``.

    The signal is analysed with WORLD, its spectral envelope is converted to
    mel-cepstrum, smoothed, extended with delta features and normalized. The
    model is run on a batch of one sequence; static features are obtained
    either from the model itself or through ``multi_stream_mlpg``.

    Returns
    -------
    (waveform, inputs, outputs)
        The synthesized waveform, the static input features and the
        denormalized static prediction. With ``diffvc`` the waveform is the
        input filtered by the differential mel-cepstrum; otherwise it comes
        from ``pyworld.synthesize``.
    """
    model.eval()

    hop_length = int(fs * (hp.frame_period * 0.001))

    # WORLD analysis
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=hp.frame_period)
    f0 = pyworld.stonemask(x, f0, timeaxis, fs)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    # Mel-cepstrum; the 0th coefficient is kept aside from the model input
    alpha = pysptk.util.mcepalpha(fs)
    full_mc = pysptk.sp2mc(spectrogram, order=hp.order, alpha=alpha)
    c0 = full_mc[:, 0]
    feats = full_mc[:, 1:]
    static_dim = feats.shape[-1]
    feats = P.modspec_smoothing(feats, fs / hop_length, cutoff=50)
    feats = P.delta_features(feats, hp.windows).astype(np.float32)

    n_frames = feats.shape[0]
    inputs = feats[:, :static_dim].copy()

    # Normalization
    scaled = Variable(torch.from_numpy(P.scale(feats, data_mean, data_std)))
    lengths = [len(scaled)]

    # Add batch axis
    scaled = scaled.view(1, -1, scaled.size(-1))

    # For MLPG
    R = torch.from_numpy(unit_variance_mlpg_matrix(hp.windows, n_frames))

    # Apply model
    if model.include_parameter_generation():
        # Case: models include parameter generation in itself
        # Multistream features cannot be used in this case
        _, y_hat_static = model(scaled, R, lengths=lengths)
    else:
        # Case: generic models (can be sequence model)
        assert hp.has_dynamic_features is not None
        y_hat = model(scaled, lengths=lengths)
        y_hat_static = multi_stream_mlpg(
            y_hat, R, hp.stream_sizes, hp.has_dynamic_features)

    predicted = y_hat_static.data.cpu().numpy().reshape(-1, static_dim)

    # Denormalize
    predicted = P.inv_scale(
        predicted, data_mean[:static_dim], data_std[:static_dim])
    outputs = predicted.copy()

    if diffvc:
        # Differential mel-cepstrum with the power coefficient removed
        difference = predicted - feats[:, :static_dim]
        mc = np.hstack((c0[:, None], difference))
        mc[:, 0] = 0
        engine = Synthesizer(MLSADF(order=hp.order, alpha=alpha),
                             hopsize=hop_length)
        coefs = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
        waveform = engine.synthesis(x, coefs)
    else:
        mc = np.hstack((c0[:, None], predicted))
        fftlen = pyworld.get_cheaptrick_fft_size(fs)
        spectrogram = pysptk.mc2sp(
            mc.astype(np.float64), alpha=alpha, fftlen=fftlen)
        waveform = pyworld.synthesize(
            f0, spectrogram, aperiodicity, fs, hp.frame_period)

    return waveform, inputs, outputs
# Parameter generation
paramgen = MLPG(gmm, windows=windows, diff=True)

# Make sure the output directory exists once, before the loop, instead of
# re-checking it for every utterance.
if not exists('resultsVC'):
    os.makedirs('resultsVC')

# Waveform generation for test set
for idx, path in enumerate(source.test_paths):
    fs, x = wavfile.read(path)
    x = x.astype(np.float64)
    f0, timeaxis = pyworld.dio(x, fs, frame_period=frame_period)
    spectrogram = pyworld.cheaptrick(x, f0, timeaxis, fs)
    # aperiodicity = pyworld.d4c(x, f0, timeaxis, fs)

    mc = pysptk.sp2mc(spectrogram, order=order, alpha=alpha)
    # Keep the 0th (power) coefficient aside during conversion
    c0, mc = mc[:, 0], mc[:, 1:]

    mc = delta_features(mc, windows)

    since = time.time()
    mc = paramgen.transform(mc)
    print("{}, Elapsed time in conversion: {}s".format(idx, time.time() - since))
    assert mc.shape[-1] == static_dim

    mc = np.hstack((c0[:, None], mc))
    mc[:, 0] = 0  # remove power coefficients

    # NOTE(review): hop size is hard-coded to 80 samples.
    engine = Synthesizer(MLSADF(order=order, alpha=alpha), hopsize=80)
    b = pysptk.mc2b(mc.astype(np.float64), alpha=alpha)
    waveform = engine.synthesis(x, b)

    # Clip to the int16 range before casting; a bare float -> int16 cast
    # wraps around for out-of-range samples.
    wavfile.write(
        "resultsVC/{}_{}.wav".format(splitext(basename(path))[0], 'mlpg'),
        fs, np.clip(waveform, -32768, 32767).astype(np.int16))
def build_synth(self):
    """Create the MLSA-based synthesizer and store it on ``self.synthesizer``.

    Uses ``self.order`` and ``self.alpha`` for the filter and
    ``self.hop_length`` as the synthesizer's hop size.
    """
    mlsa_filter = MLSADF(order=self.order, alpha=self.alpha)
    self.synthesizer = Synthesizer(mlsa_filter, self.hop_length)
# Speech analysis (fundamental frequency, spectral envelope, aperiodicity);
# only the spectral envelope is kept.
_, sp, _ = pyworld.wav2world(x, fs)

# Extract mel-cepstral coefficients from the WORLD spectral envelope
mcep = pysptk.sp2mc(sp, order=ORDER, alpha=ALPHA)

# Pitch extraction
pitch = pysptk.swipe(x, fs=fs, hopsize=HOP_LENGTH,
                     min=MIN_F0, max=MAX_F0, otype="pitch")

# Generate the excitation signal (glottal source)
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Convert mel-cepstral coefficients to MLSA digital filter coefficients
mlsa_coef = pysptk.mc2b(mcep, ALPHA)

# Create the MLSA filter
synthesizer = Synthesizer(MLSADF(order=ORDER, alpha=ALPHA), HOP_LENGTH)

# Drive the MLSA filter with the excitation signal to synthesize speech
y = synthesizer.synthesis(source_excitation, mlsa_coef)

# Write the audio. Clip to the int16 range first; a bare float -> int16
# cast wraps around for out-of-range samples.
y = np.clip(y, -32768, 32767).astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, y)
                     hopsize=HOP_LENGTH, min=MIN_F0, max=MAX_F0, otype="pitch")

# Generate the excitation signal (glottal source)
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Mel-cepstral analysis (= extraction of the spectral envelope)
mc = pysptk.mcep(frames, ORDER, ALPHA)

# Convert mel-cepstral coefficients to MLSA digital filter coefficients
mlsa_coef = pysptk.mc2b(mc, ALPHA)

# Create the MLSA filter
synthesizer = Synthesizer(MLSADF(order=ORDER, alpha=ALPHA), HOP_LENGTH)

# #### From here on, synthesize various voices by changing the synthesis
# #### filter parameters and the like

# ### Pitch shift (raise the pitch) ###
OUT_WAVE_FILE = "pitchshift_high.wav"
PITCH_SHIFT = 0.5  # use a factor smaller than 1 to raise the pitch
excitation_pitchhigh = pysptk.excite(pitch * PITCH_SHIFT, HOP_LENGTH)
y = synthesizer.synthesis(excitation_pitchhigh, mlsa_coef)  # synthesize speech
y = y.astype(np.int16)
wavfile.write(OUT_WAVE_FILE, fs, y)

# ### Pitch shift (lower the pitch) ###
OUT_WAVE_FILE = "pitchshift_low.wav"
PITCH_SHIFT = 1.5  # use a factor larger than 1 to lower the pitch
excitation_pitchlow = pysptk.excite(pitch * PITCH_SHIFT, HOP_LENGTH)