def synthesize_from_MCEP(self, mcep, pitch):
    """Synthesize a signal from mel-cepstral coefficients and a pitch track.

    Args:
        mcep: Array of mel-cepstral coefficients; it is copied into
            C-contiguous order before being handed to pysptk.
        pitch: Array passed to ``pysptk.excite`` after a cast to float64.

    Returns:
        The value returned by ``self.synthesizer.synthesis`` for the
        generated excitation and the filter coefficients.
    """
    # pysptk raises an "ndarray not C-contiguous" error otherwise.
    contiguous_mcep = mcep.copy(order='C')
    filter_coefs = pysptk.mc2b(contiguous_mcep, self.alpha)

    pitch_f64 = pitch.astype(np.float64)
    excitation = pysptk.excite(pitch_f64, self.hop_length)

    return self.synthesizer.synthesis(
        excitation.astype(np.float64),
        filter_coefs.astype(np.float64),
    )
def pysptk_mfcc(self):
    """Run pitch and mel-cepstrum analysis on ``self.audio`` and plot the envelope.

    Sets ``frame_length``, ``hop_length``, ``pitch``, ``source_excitation``,
    ``order``, ``alpha`` and ``mc`` on the instance, then draws the spectrum
    derived from the mel-cepstrum with ``librosa.display.specshow``.
    Reads ``self.audio`` and ``self.sr``. Returns nothing.
    """
    self.frame_length = 1024
    self.hop_length = 80

    audio_f64 = self.audio.astype(np.float64)
    self.pitch = pysptk.swipe(
        audio_f64,
        fs=self.sr,
        hopsize=self.hop_length,
        min=60,
        max=240,
        otype="pitch",
    )
    self.source_excitation = pysptk.excite(self.pitch, self.hop_length)

    # Note that almost all pysptk functions assume the input array is
    # C-contiguous with np.float64 elements.
    framed = librosa.util.frame(
        self.audio,
        frame_length=self.frame_length,
        hop_length=self.hop_length,
    )
    frames = framed.astype(np.float64).T

    # Windowing (one row per frame after the transpose)
    frames *= pysptk.blackman(self.frame_length)
    assert frames.shape[1] == self.frame_length

    # Order of mel-cepstrum
    self.order = 25
    self.alpha = 0.41
    self.mc = pysptk.mcep(frames, self.order, self.alpha)

    spectrum = pysptk.mgc2sp(self.mc, self.alpha, 0.0, self.frame_length)
    log_envelope = spectrum.real
    librosa.display.specshow(
        log_envelope.T,
        sr=self.sr,
        hop_length=self.hop_length,
        x_axis="time",
        y_axis="linear",
    )
def mixed_excitation(f0, voicing_str, hopsize):
    """Mix a pitch-driven excitation with uniform noise over five filter bands.

    For each of the first five entries of the module-level ``h_filters``,
    both the ``pysptk.excite`` output and a uniform noise signal in [-1, 1)
    are filtered with ``lfilter`` and weighted frame by frame.

    Args:
        f0: Per-frame pitch array; a frame counts as voiced when its value
            is greater than 0.
        voicing_str: 2-D array indexed as ``[frame, band]`` holding the
            weight of the pitch-driven component for voiced frames.
        hopsize: Number of samples covered by each frame's gain.

    Returns:
        Array with the same length as the ``pysptk.excite`` output holding
        the sum of the weighted band signals.
    """
    pulse_train = pysptk.excite(f0.astype(np.float64), hopsize=hopsize, noise=False)
    n_samples = len(pulse_train)
    uniform_noise = 2 * np.random.rand(n_samples) - 1
    mixed = np.zeros(n_samples)

    for band in range(5):
        band_filter = h_filters[band]
        pulse_band = lfilter(band_filter, 1, pulse_train)
        noise_band = lfilter(band_filter, 1, uniform_noise)

        pulse_gain = np.zeros(n_samples)
        noise_gain = np.zeros(n_samples)
        for frame, strength in enumerate(voicing_str[:, band]):
            segment = slice(frame * hopsize, (frame + 1) * hopsize)
            is_voiced = f0[frame] > 0
            # Unvoiced frames take the noise component only.
            pulse_gain[segment] = strength if is_voiced else 0.0
            noise_gain[segment] = (1.0 - strength) if is_voiced else 1.0

        mixed += (pulse_gain * pulse_band + noise_gain * noise_band)

    return mixed
def source_excitation_generation(np_data, rate):
    """Return the ``ps.excite`` excitation for the pitch track of ``np_data``.

    Args:
        np_data: Audio samples; cast to float64 before pitch extraction.
        rate: Sampling rate forwarded to ``ps.swipe`` as ``fs``.

    Returns:
        The output of ``ps.excite`` for the extracted pitch, using the
        module-level ``HOP_LENGTH`` as the hop size.
    """
    samples = np_data.astype(np.float64)
    f0_track = ps.swipe(
        samples,
        fs=rate,
        hopsize=HOP_LENGTH,
        min=60,
        max=240,
        otype="pitch",
    )
    return ps.excite(f0_track, HOP_LENGTH)
def mgc_decoder_pulsenoise(pitch, mvf, mgc_coeff, resid_codebook_pca, basefilename):
    """Synthesize speech from an SPTK excitation through the sox/SPTK pipeline.

    A per-frame period track is derived from ``pitch`` and ``mvf``: frames
    whose ``mvf`` is below 40% of the mean ``mvf`` (or whose pitch is not
    positive) get period 0, the rest get ``Fs / pitch``. The resulting
    excitation is peak-normalized, written as a float32 wav and filtered
    with ``sptk mglsadf`` using ``<gen_path><basefilename>.mgc``.

    Args:
        pitch: Per-frame pitch values.
        mvf: Per-frame values compared against 0.4 times their mean.
        mgc_coeff: Not used by this function.
        resid_codebook_pca: Not used by this function.
        basefilename: File name stem appended to the module-level ``gen_path``.

    Returns:
        The list ``[0]``.
    """
    frame_count = min(len(pitch), len(mvf))
    period = np.zeros(frame_count)
    mvf_mean = np.mean(mvf)

    # zip() stops at the shorter track, i.e. after frame_count frames
    for idx, (mvf_value, f0_value) in enumerate(zip(mvf, pitch)):
        if mvf_value < 0.4 * mvf_mean:
            period[idx] = 0
        elif f0_value > 0:
            period[idx] = Fs / f0_value

    # create source excitation using SPTK
    source = pysptk.excite(period, frshft)

    # scale for SPTK
    peak = np.max(np.abs(source))
    scaled_source = np.float32(source / peak)

    source_wav = gen_path + basefilename + '_source_pulsenoise_float32.wav'
    raw_synth_wav = gen_path + basefilename + '_synthesized_pulsenoise_0.wav'
    final_wav = gen_path + basefilename + '_synthesized_pulsenoise.wav'

    io_wav.write(source_wav, Fs, scaled_source)

    # excitation wav -> raw -> MGLSA filter -> 16-bit wav
    synth_command = 'sox ' + source_wav + ' -t raw -r ' + str(Fs) + ' - ' + ' | ' + \
        'sptk mglsadf -P 5 -m ' + str(order) + ' -p ' + str(frshft) + \
        ' -a ' + str(alpha) + ' -c ' + str(stage) + ' ' + gen_path + basefilename + '.mgc' + ' | ' + \
        'sptk x2x +fs -o | sox -c 1 -b 16 -e signed-integer -t raw -r ' + str(Fs) + ' - -t wav -r ' + str(Fs) + ' ' + raw_synth_wav
    run(synth_command, shell=True)

    # second sox pass with -G
    guard_command = "sox -G " + raw_synth_wav + ' ' + \
        final_wav
    run(guard_command, shell=True)

    return [0]
# Pitch track (SWIPE) and the excitation signal built from it
pitch = pysptk.swipe(
    x.astype(np.float64),
    fs=sr,
    hopsize=hop_length,
    min=60,
    max=240,
    otype="pitch",
)
source_excitation = pysptk.excite(pitch, hop_length)

# Framing: one row per frame after the transpose
frames = librosa.util.frame(x, frame_length=frame_length, hop_length=hop_length)
frames = frames.astype(np.float64).T

# Windowing
frames *= pysptk.blackman(frame_length)
assert frames.shape[1] == frame_length

# Mel-cepstrum analysis (order of mel-cepstrum is `order`)
mc = pysptk.mcep(frames, order, alpha)
logH = pysptk.mgc2sp(mc, alpha, 0.0, frame_length).real
print(mc.shape)

# Convert mel-cepstrum to MLSADF coefficients
b = pysptk.mc2b(mc, alpha)

synthesizer = Synthesizer(MLSADF(order=order, alpha=alpha), hop_length)
def mgc_decoder_residual_with_envelope(pitch, mvf, mgc_coeff, resid_codebook_pca, basefilename, envelope_type):
    """Build a pitch-synchronous excitation and synthesize speech via sox/SPTK.

    At every sample where the ``pysptk.excite`` output for ``Fs / pitch``
    exceeds 2, three components are overlap-added around that sample: a
    frame taken from ``resid_codebook_pca``, a high-pass filtered noise frame
    scaled by the module-level ``noise_scaling``, and a high-pass filtered
    noise frame shaped by ``apply_envelope`` and scaled by ``mvf``. The sum
    is peak-normalized, written as a float32 wav and filtered with
    ``sptk mglsadf`` using ``<gen_path><basefilename>.mgc``.

    Args:
        pitch: Per-frame pitch array; ``Fs / pitch`` is passed to
            ``pysptk.excite``.
        mvf: Per-frame values indexed by ``int(sample / frshft)``; values
            above 7500 are clipped to 7500.
        mgc_coeff: Not used by this function.
        resid_codebook_pca: Indexable collection of frames, selected by
            ``int((Fs / 2 - 0.95 * mvf) / 100)``; entry 0 is also passed to
            ``apply_envelope``.
        basefilename: File name stem appended to the module-level ``gen_path``.
        envelope_type: Forwarded to ``apply_envelope`` and embedded in the
            output file names.

    Returns:
        The list ``[0]``.
    """
    # create voiced source excitation using SPTK
    source_voiced = pysptk.excite(Fs / pitch, frshft)

    # create unvoiced source excitation using SPTK (all-zero pitch track)
    source_unvoiced = pysptk.excite(np.zeros(len(pitch)), frshft)

    source = np.zeros(source_voiced.shape)
    n_samples = len(source)

    # generate excitation frame by frame pitch synchronously;
    # samples above 2 are the impulse locations in the impulse excitation
    for i in map(int, np.flatnonzero(source_voiced > 2)):
        mvf_curr = mvf[int(i / frshft)]
        if mvf_curr > 7500:
            mvf_curr = 7500

        # voiced component from binary codebook
        voiced_frame_lpf = resid_codebook_pca[int((Fs / 2 - 0.95 * mvf_curr) / 100)]
        frame_len = len(voiced_frame_lpf)

        # unvoiced component by highpass filtering white noise; near the end
        # of the signal the frame is taken from before the impulse instead
        if i + frlen < len(source_unvoiced):
            unvoiced_frame = source_unvoiced[i:i + frame_len].copy()
        else:
            unvoiced_frame = source_unvoiced[i - frame_len:i].copy()
        unvoiced_frame_hpf = highpass_filter(unvoiced_frame, mvf_curr * 1.05, Fs, hpf_order)
        unvoiced_frame_hpf *= np.hanning(len(unvoiced_frame_hpf))

        # unvoiced component multiplied with time envelope
        unvoiced_frame_with_envelope = unvoiced_frame.copy() * apply_envelope(
            resid_codebook_pca[0], envelope_type)
        unvoiced_frame_with_envelope_hpf = highpass_filter(
            unvoiced_frame_with_envelope, mvf_curr * 1.05, Fs, hpf_order)
        unvoiced_frame_with_envelope_hpf *= np.hanning(
            len(unvoiced_frame_with_envelope_hpf))

        # Normalize to unit energy; an all-zero frame is left untouched so a
        # 0/0 division cannot fill the whole excitation with NaN.
        energy = np.linalg.norm(unvoiced_frame_with_envelope_hpf)
        if energy > 0:
            unvoiced_frame_with_envelope_hpf /= energy

        # scale time envelope modulated noise by mvf
        unvoiced_frame_with_envelope_hpf *= (mvf_curr / 8000 * 2)

        # put voiced and unvoiced component to pitch synchronous location,
        # clipping the frame at both ends of the output buffer
        half = round(frame_len / 2)
        j_start = max(half - i, 0)
        j_end = min(frame_len, n_samples - (i - half))
        if j_end > j_start:
            lo = i - half + j_start
            hi = i - half + j_end
            # Three separate additions keep the original accumulation order.
            source[lo:hi] += voiced_frame_lpf[j_start:j_end]
            source[lo:hi] += unvoiced_frame_hpf[j_start:j_end] * noise_scaling
            source[lo:hi] += unvoiced_frame_with_envelope_hpf[j_start:j_end]

    # scale for SPTK (skip the division when the excitation is all zeros)
    peak = np.max(np.abs(source))
    if peak > 0:
        scaled_source = np.float32(source / peak)
    else:
        scaled_source = np.float32(source)

    source_wav = gen_path + basefilename + '_source_' + envelope_type + '_float32.wav'
    raw_synth_wav = gen_path + basefilename + '_synthesized_with_' + envelope_type + '_0.wav'
    final_wav = gen_path + basefilename + '_synthesized_with_' + envelope_type + '.wav'

    io_wav.write(source_wav, Fs, scaled_source)

    # excitation wav -> raw -> MGLSA filter -> 16-bit wav
    command = 'sox ' + source_wav + ' -t raw -r ' + str(Fs) + ' - ' + ' | ' + \
        'sptk mglsadf -P 5 -m ' + str(order) + ' -p ' + str(frshft) + \
        ' -a ' + str(alpha) + ' -c ' + str(stage) + ' ' + gen_path + basefilename + '.mgc' + ' | ' + \
        'sptk x2x +fs -o | sox -c 1 -b 16 -e signed-integer -t raw -r ' + str(Fs) + ' - -t wav -r ' + str(Fs) + ' ' + raw_synth_wav
    run(command, shell=True)

    # second sox pass with -G and "gain -n 0"
    command = "sox -G " + raw_synth_wav + ' ' + \
        final_wav + ' ' + 'gain -n 0'
    run(command, shell=True)

    return [0]
def _synthesize_residual_band(source, basefilename, band_suffix):
    """Normalize ``source``, write it as float32 wav and run it through sox/SPTK.

    ``band_suffix`` (``'_upper'``, ``'_lower'`` or ``''``) selects the names of
    the excitation file and of the final synthesized file. The intermediate
    ``_synthesized_based_on_residual_0.wav`` is shared by all bands.
    """
    source_wav = gen_path + basefilename + '_residual' + band_suffix + '_float32.wav'
    raw_synth_wav = gen_path + basefilename + '_synthesized_based_on_residual_0.wav'
    final_wav = gen_path + basefilename + '_synthesized_based_on_residual' + band_suffix + '.wav'

    # Peak-normalize; skip the division for an all-zero band so that no
    # NaN samples (0/0) are written.
    peak = np.max(np.abs(source))
    if peak > 0:
        scaled_source = np.float32(source / peak)
    else:
        scaled_source = np.float32(source)
    io_wav.write(source_wav, Fs, scaled_source)

    # excitation wav -> raw -> MGLSA filter -> 16-bit wav
    command = 'sox ' + source_wav + ' -t raw -r ' + str(Fs) + ' - ' + ' | ' + \
        'sptk mglsadf -P 5 -m ' + str(order) + ' -p ' + str(frshft) + \
        ' -a ' + str(alpha) + ' -c ' + str(stage) + ' ' + gen_path + basefilename + '.mgc' + ' | ' + \
        'sptk x2x +fs -o | sox -c 1 -b 16 -e signed-integer -t raw -r ' + str(Fs) + ' - -t wav -r ' + str(Fs) + ' ' + raw_synth_wav
    run(command, shell=True)

    # second sox pass with -G
    command = "sox -G " + raw_synth_wav + ' ' + \
        final_wav
    run(command, shell=True)


def mgc_filter_residual(pitch, mvf, mgc_coeff, resid_codebook_pca, basefilename):
    """Extract the MGLSA residual of a wav file, split it into two bands and resynthesize.

    Steps, all on files named ``<gen_path><basefilename>...``:
      1. Convert ``.wav`` to ``.raw`` with sox.
      2. Inverse-filter the raw audio with ``sptk mglsadf -k -v`` and ``.mgc``
         to obtain ``_residual_original.wav``, then plot its first ``Fs``
         samples.
      3. Around every sample where the ``pysptk.excite`` output for
         ``Fs / pitch`` exceeds 2, cut a two-period residual frame, high-pass
         and low-pass filter it around ``mvf``, window it and overlap-add it
         into an upper-band and a lower-band excitation.
      4. Synthesize the upper band, the lower band and their sum.

    Args:
        pitch: Per-frame pitch array.
        mvf: Per-frame values indexed by ``int(sample / frshft)``.
        mgc_coeff: Not used by this function.
        resid_codebook_pca: Not used by this function.
        basefilename: File name stem appended to the module-level ``gen_path``.

    Returns:
        The list ``[0]``.
    """
    in_wav = gen_path + basefilename + '.wav'
    in_raw = gen_path + basefilename + '.raw'
    in_mgcep = gen_path + basefilename + '.mgc'
    in_resid = gen_path + basefilename + '_residual_original.wav'
    # NOTE(review): out_resid is assigned but never used in this function.
    out_resid = gen_path + basefilename + '_residual_filtered.wav'

    # wav -> raw
    command = 'sox -c 1 -e signed-integer -b 16 -t wav ' + in_wav + \
        ' -c 1 -e signed-integer -b 16 -t raw -r ' + str(Fs) + ' ' + in_raw
    print('wav -> raw, ' + in_wav)
    call(command, shell=True)

    # raw, mgcep -> residual
    command = 'sptk x2x +sf ' + in_raw + ' | ' + \
        'sptk mglsadf -k -v -a ' + str(alpha) + ' -c 3 -m ' + str(order) + ' -p ' + \
        str(frshft) + ' ' + in_mgcep + ' | ' + \
        'sptk x2x +fs | sox -c 1 -e signed-integer -b 16 -t raw -r ' + str(Fs) + ' - ' + \
        '-c 1 -e signed-integer -b 16 -t wav -r ' + str(Fs) + ' ' + in_resid
    print('raw, mgcep -> resid.wav, ' + in_wav)
    call(command, shell=True)

    _, x_residual = io_wav.read(in_resid)
    # Plot of the first Fs residual samples
    plt.plot(x_residual[0:Fs], 'r')
    plt.show()

    # create voiced source excitation using SPTK
    source_voiced = pysptk.excite(Fs / pitch, frshft)
    source_upper = np.zeros(source_voiced.shape)
    source_lower = np.zeros(source_voiced.shape)
    n_samples = len(source_upper)

    # generate excitation frame by frame pitch synchronously;
    # samples above 2 are the impulse locations in the impulse excitation
    for i in map(int, np.flatnonzero(source_voiced > 2)):
        mvf_index = int(i / frshft)
        mvf_curr = mvf[mvf_index]
        T0_curr = int(Fs / pitch[mvf_index])

        # Skip impulses too close to either end of the buffer
        if not (i > T0_curr and i + 2 * T0_curr < n_samples):
            continue

        window = slice(i - T0_curr, i + T0_curr)
        residual_frame = x_residual[window]

        residual_frame_upper = highpass_filter(residual_frame, mvf_curr * 1.05, Fs, hpf_order)
        residual_frame_upper *= np.hanning(len(residual_frame_upper))
        source_upper[window] += residual_frame_upper

        residual_frame_lower = lowpass_filter(residual_frame, mvf_curr * 0.95, Fs, lpf_order)
        residual_frame_lower *= np.hanning(len(residual_frame_lower))
        source_lower[window] += residual_frame_lower

    # upper frequency band
    _synthesize_residual_band(source_upper, basefilename, '_upper')

    # lower frequency band
    _synthesize_residual_band(source_lower, basefilename, '_lower')

    # upper and lower frequency band added together
    _synthesize_residual_band(source_lower + source_upper, basefilename, '')

    return [0]
# Pitch extraction
pitch = pysptk.swipe(
    x,
    fs=fs,
    hopsize=HOP_LENGTH,
    min=MIN_F0,
    max=MAX_F0,
    otype="pitch",
)

# Generate the excitation source signal (glottal source)
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Cut the audio into frames and apply a window
frames = librosa.util.frame(x, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)
frames = frames.astype(np.float64).T
frames *= pysptk.blackman(FRAME_LENGTH)  # windowing (Blackman window)

# Extract linear predictive coding (LPC) coefficients by linear prediction analysis
lpc = pysptk.lpc(frames, ORDER)
lpc[:, 0] = np.log(lpc[:, 0])

# Convert the LPC coefficients to PARCOR coefficients
parcor = pysptk.lpc2par(lpc)

# Create the all-pole filter
synthesizer = Synthesizer(AllPoleLatticeDF(order=ORDER), HOP_LENGTH)

# Drive the filter with the excitation signal to synthesize speech
y = synthesizer.synthesis(source_excitation, parcor)

# Write out the audio
# Pitch extraction
pitch = pysptk.swipe(
    x,
    fs=fs,
    hopsize=HOP_LENGTH,
    min=MIN_F0,
    max=MAX_F0,
    otype="pitch",
)

# Generate the excitation source signal (glottal source)
source_excitation = pysptk.excite(pitch, HOP_LENGTH)

# Cut the audio into frames and apply a window
frames = librosa.util.frame(x, frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)
frames = frames.astype(np.float64).T
frames *= pysptk.blackman(FRAME_LENGTH)  # windowing (Blackman window)

# Mel-cepstrum analysis (= extraction of the spectral envelope)
mc = pysptk.mcep(frames, ORDER, ALPHA)

# Convert the mel-cepstrum coefficients to MLSA digital filter coefficients
mlsa_coef = pysptk.mc2b(mc, ALPHA)

# Create the MLSA filter
synthesizer = Synthesizer(MLSADF(order=ORDER, alpha=ALPHA), HOP_LENGTH)

# #### From here on, synthesize various voices by changing the
# #### synthesis filter parameters and so on

# ### Pitch shift (raise the pitch) ###
OUT_WAVE_FILE = "pitchshift_high.wav"
PITCH_SHIFT = 0.5  # use a factor smaller than 1 to raise the pitch
def mgc_decoder_residual(mgc_lsp_coeff, log_f0cont, log_mvf, basefilename_out, resid_codebook_pca, Fs_codebook=16000, Fs=22050, frlen=512, frshft=200, order=24, alpha=0.42, stage=3, hpf_order=11, noise_scaling=0.04):
    """Synthesize a waveform from MGC-LSP, log-pitch and log-MVF tracks via sox/SPTK.

    A voiced excitation is built by overlap-adding codebook frames at every
    sample where the ``pysptk.excite`` output for ``Fs / pitch`` exceeds 2,
    and a high-pass filtered noise component is added frame by frame. The
    excitation is peak-normalized and written to disk, the MGC-LSPs are
    converted to MGC coefficients with SPTK command-line tools, the
    excitation is filtered with ``mglsadf``, and the result is normalized
    with ``sox --norm=-3``. Intermediate files are deleted afterwards.

    Args:
        mgc_lsp_coeff: Array written as float32 to ``<basefilename_out>.mgclsp``.
        log_f0cont: Log-pitch track; ``exp`` is applied before use.
        log_mvf: Log-MVF track; ``exp`` is applied before use.
        basefilename_out: Path stem for all files that are written.
        resid_codebook_pca: Indexable collection of frames, selected by
            ``int((Fs_codebook / 2 - mvf) / 100)``.
        Fs_codebook: MVF values above ``Fs_codebook / 2`` are clipped to it.
        Fs: Sampling rate used for the wav files and filter design.
        frlen: Not used by this function.
        frshft: Frame shift in samples.
        order: Passed to the SPTK tools as ``-m`` / ``-M``.
        alpha: Passed to the SPTK tools as ``-a`` / ``-A``.
        stage: Passed to the SPTK tools as ``-c`` / ``-C``.
        hpf_order: Order forwarded to ``highpass_filter``.
        noise_scaling: Gain applied to the filtered noise component.

    Returns:
        The sample data read back from ``<basefilename_out>.wav``.
    """
    pitch = np.float64(np.exp(log_f0cont))
    mvf = np.exp(log_mvf)
    mvf_limit = Fs_codebook / 2

    # create voiced source excitation using SPTK
    source_voiced = pysptk.excite(Fs / pitch, frshft)

    # create unvoiced source excitation using SPTK (all-zero pitch track)
    source_unvoiced = pysptk.excite(np.zeros(len(pitch)), frshft)

    source = np.zeros(source_voiced.shape)
    n_samples = len(source)

    # generate excitation frame by frame pitch synchronously
    # voiced component: samples above 2 are the impulse locations in the
    # impulse excitation
    for i in map(int, np.flatnonzero(source_voiced > 2)):
        mvf_curr = mvf[int(i / frshft)]
        if mvf_curr > mvf_limit:
            mvf_curr = mvf_limit

        # voiced component from residual codebook
        voiced_frame_lpf = resid_codebook_pca[int((Fs_codebook / 2 - mvf_curr) / 100)]
        frame_len = len(voiced_frame_lpf)

        # put the frame at the pitch synchronous location, clipping it at
        # both ends of the output buffer
        half = round(frame_len / 2)
        j_start = max(half - i, 0)
        j_end = min(frame_len, n_samples - (i - half))
        if j_end > j_start:
            source[i - half + j_start:i - half + j_end] += voiced_frame_lpf[j_start:j_end]

    # unvoiced component: two frame shifts of filtered, windowed noise per frame
    for i, mvf_curr in enumerate(mvf):
        segment = slice(i * frshft, (i + 2) * frshft)
        unvoiced_frame = source_unvoiced[segment].copy()
        unvoiced_frame_hpf = highpass_filter(unvoiced_frame, mvf_curr * 1.2, Fs, hpf_order)
        unvoiced_frame_hpf *= np.hanning(len(unvoiced_frame_hpf))
        source[segment] += unvoiced_frame_hpf * noise_scaling

    # scale for SPTK
    scaled_source = np.float32(source / np.max(np.abs(source)))

    source_wav = basefilename_out + '_source_float32.wav'
    mgclsp_file = basefilename_out + '.mgclsp'
    mgc_file = basefilename_out + '.mgc'
    raw_synth_wav = basefilename_out + '_0.wav'
    final_wav = basefilename_out + '.wav'

    io_wav.write(source_wav, Fs, scaled_source)

    # write files for SPTK
    mgc_lsp_coeff.astype('float32').tofile(mgclsp_file)

    # MGC-LSPs -> MGC coefficients
    command = 'lspcheck -m ' + str(order) + ' -s ' + str(Fs / 1000) + ' -c -r 0.1 -g -G 1.0E-10 ' + mgclsp_file + ' | ' + \
        'lsp2lpc -m ' + str(order) + ' -s ' + str(Fs / 1000) + ' | ' + \
        'mgc2mgc -m ' + str(order) + ' -a ' + str(alpha) + ' -c ' + str(stage) + ' -n -u ' + \
        '-M ' + str(order) + ' -A ' + str(alpha) + ' -C ' + str(stage) + ' > ' + mgc_file
    run(command, shell=True)

    # excitation wav -> raw -> MGLSA filter -> 16-bit wav
    command = 'sox ' + source_wav + ' -t raw -r ' + str(Fs) + ' - ' + ' | ' + \
        'mglsadf -P 5 -m ' + str(order) + ' -p ' + str(frshft) + \
        ' -a ' + str(alpha) + ' -c ' + str(stage) + ' ' + mgc_file + ' | ' + \
        'x2x +fs -o | sox -c 1 -b 16 -e signed-integer -t raw -r ' + str(Fs) + ' - -t wav -r ' + str(Fs) + ' ' + raw_synth_wav
    run(command, shell=True)

    # normalize gain
    command = "sox --norm=-3 " + raw_synth_wav + ' ' + \
        final_wav
    run(command, shell=True)

    # remove temp files
    for temp_file in (raw_synth_wav, mgc_file, mgclsp_file, source_wav):
        os.remove(temp_file)

    # read file for output
    _, x_synthesized = io_wav.read(final_wav)

    return x_synthesized