def SpectralSub(signal, wlen, inc, NIS, a, b):
    """
    Spectral-subtraction noise reduction
    :param signal: noisy input signal
    :param wlen: frame length
    :param inc: frame shift
    :param NIS: number of leading noise-only frames
    :param a: over-subtraction factor
    :param b: spectral-floor factor
    :return: enhanced signal
    """
    wnd = np.hamming(wlen)
    y = enframe(signal, wnd, inc)
    fn, flen = y.shape
    y_a = np.abs(np.fft.fft(y, axis=1))
    y_a2 = np.power(y_a, 2)
    y_angle = np.angle(np.fft.fft(y, axis=1))
    # estimate the noise power spectrum from the leading noise-only frames
    Nt = np.mean(y_a2[:NIS, :], axis=0)
    # over-subtract the noise estimate and clamp to the spectral floor b * Nt
    y_a2 = np.where(y_a2 >= a * Nt, y_a2 - a * Nt, b * Nt)
    # rebuild the complex spectrum from the subtracted power spectrum and the original phase
    X = y_a2 * np.cos(y_angle) + 1j * y_a2 * np.sin(y_angle)
    hatx = np.real(np.fft.ifft(X, axis=1))
    # overlap-add the filtered frames back into one signal
    sig = np.zeros(int((fn - 1) * inc + wlen))
    for i in range(fn):
        start = i * inc
        sig[start:start + flen] += hatx[i, :]
    return sig
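
# Usage sketch (added for illustration, not part of the original repo code):
# denoise a recording with SpectralSub. The import path follows the scripts in
# this chapter; the file name and the 0.25 s noise-only lead-in are assumptions.
if __name__ == '__main__':
    import numpy as np
    from chapter2_基础.soundBase import soundBase

    data, fs = soundBase('C7_noisy.wav').audioread()  # hypothetical noisy recording
    data = data - np.mean(data)
    data = data / np.max(np.abs(data))
    wlen, inc = 256, 128
    NIS = int((0.25 * fs - wlen) // inc + 1)  # frames inside the assumed noise-only lead-in
    # a: over-subtraction factor, b: spectral floor (assumed example values)
    enhanced = SpectralSub(data, wlen, inc, NIS, a=4, b=0.001)
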
def Nmfcc(x, fs, p, frameSize, inc):
    """
    Compute MFCC coefficients
    :param x: input signal
    :param fs: sampling rate
    :param p: number of Mel filters
    :param frameSize: frame length
    :param inc: frame shift
    :return: MFCC matrix (frames x 20)
    """
    # pre-emphasis
    xx = lfilter([1, -0.97], [1], x)
    # framing
    xx = enframe(xx, frameSize, inc)
    # windowing
    xx = np.multiply(xx, np.hanning(frameSize))
    # FFT
    xx = np.fft.fft(xx)
    # power spectrum
    xx = np.multiply(np.abs(xx), np.abs(xx))
    # energy through the Mel filter bank
    xx = xx[:, :frameSize // 2 + 1]
    bank = melbankm(p, frameSize, fs, 0, 0.5 * fs, 0)
    ss = np.matmul(xx, bank.T)
    # DCT cepstrum
    n_dct = 20
    M = bank.shape[0]
    m = np.array([i for i in range(M)])
    mfcc = np.zeros((ss.shape[0], n_dct))
    for n in range(n_dct):
        # DCT-II basis: m is 0-indexed, so the cosine argument uses (2 * m + 1);
        # a small epsilon keeps the log finite for empty filter-bank channels
        mfcc[:, n] = np.sqrt(2 / M) * np.sum(np.multiply(np.log(ss + 1e-10), np.cos((2 * m + 1) * n * np.pi / 2 / M)), axis=1)
    return mfcc
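
# Usage sketch (added for illustration, not part of the original repo code):
# compute MFCCs for one of the chapter's recordings and check the output shape.
# Nmfcc relies on this module's own enframe/melbankm/lfilter imports; the filter
# count p = 24 is an assumed example value.
if __name__ == '__main__':
    from chapter2_基础.soundBase import soundBase

    data, fs = soundBase('C7_2_y.wav').audioread()
    data = data / np.max(np.abs(data))
    mfcc = Nmfcc(data, fs, p=24, frameSize=256, inc=80)
    print(mfcc.shape)  # (number of frames, 20) - 20 cepstral coefficients per frame
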
def pitch_Corr(x, wnd, inc, T1, fs, miniL=10):
    """
    Autocorrelation-based pitch period detection
    :param x: input signal
    :param wnd: window (array) or frame length (int)
    :param inc: frame shift
    :param T1: energy-entropy-ratio threshold used by pitch_vad
    :param fs: sampling rate
    :param miniL: minimum voiced-segment length in frames
    :return: voiceseg, vsl, SF, Ef, period
    """
    y = enframe(x, wnd, inc)
    fn = y.shape[0]
    if isinstance(wnd, int):
        wlen = wnd
    else:
        wlen = len(wnd)
    voiceseg, vsl, SF, Ef = pitch_vad(x, wnd, inc, T1, miniL)
    lmin = fs // 500  # shortest pitch period (500 Hz upper bound)
    lmax = fs // 60   # longest pitch period (60 Hz lower bound)
    period = np.zeros(fn)
    for i in range(vsl):
        ixb = voiceseg[i]['start']
        ixd = voiceseg[i]['duration']
        for k in range(ixd):
            ru = np.correlate(y[k + ixb, :], y[k + ixb, :], 'full')
            # zero lag sits at index wlen - 1 of the full autocorrelation
            ru = ru[wlen - 1:]
            tloc = np.argmax(ru[lmin:lmax])
            period[k + ixb] = lmin + tloc
    return voiceseg, vsl, SF, Ef, period
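
# Usage sketch (added for illustration, not part of the original repo code):
# run the autocorrelation pitch tracker and convert the periods (in samples) to
# F0 in Hz. Frame parameters follow the C7_3 script below; unvoiced frames keep period = 0.
if __name__ == '__main__':
    from chapter2_基础.soundBase import soundBase

    data, fs = soundBase('C7_3_y.wav').audioread()
    data = data - np.mean(data)
    data = data / np.max(np.abs(data))
    wlen, inc, T1 = 240, 80, 0.1
    voiceseg, vsl, SF, Ef, period = pitch_Corr(data, np.hamming(wlen), inc, T1, fs)
    f0 = np.where(period > 0, fs / (period + 1e-10), 0)  # 0 Hz marks unvoiced frames
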
def vad_specEN(data, wnd, inc, NIS, thr1, thr2, fs):
    """
    Endpoint detection based on spectral entropy
    """
    from scipy.signal import medfilt
    x = enframe(data, wnd, inc)
    X = np.abs(np.fft.fft(x, axis=1))
    if isinstance(wnd, int):
        wlen = wnd
    else:
        wlen = len(wnd)
    df = fs / wlen
    fx1 = int(250 // df + 1)   # bin index of 250 Hz
    fx2 = int(3500 // df + 1)  # bin index of 3500 Hz
    km = wlen // 8
    K = 0.5
    # keep only the 250-3500 Hz band
    E = np.zeros((X.shape[0], wlen // 2))
    E[:, fx1 + 1:fx2 - 1] = X[:, fx1 + 1:fx2 - 1]
    E = np.multiply(E, E)
    Esum = np.sum(E, axis=1, keepdims=True)
    P1 = np.divide(E, Esum)
    # discard components that dominate a frame (probability >= 0.9)
    E = np.where(P1 >= 0.9, 0, E)
    # group the spectrum into sub-bands of 4 bins
    Eb0 = E[:, 0::4]
    Eb1 = E[:, 1::4]
    Eb2 = E[:, 2::4]
    Eb3 = E[:, 3::4]
    Eb = Eb0 + Eb1 + Eb2 + Eb3
    prob = np.divide(Eb + K, np.sum(Eb + K, axis=1, keepdims=True))
    # spectral entropy per frame, smoothed by repeated median filtering
    Hb = -np.sum(np.multiply(prob, np.log10(prob + 1e-10)), axis=1)
    for i in range(10):
        Hb = medfilt(Hb, 5)
    Me = np.mean(Hb)
    eth = np.mean(Hb[:NIS])
    Det = eth - Me
    T1 = thr1 * Det + Me
    T2 = thr2 * Det + Me
    voiceseg, vsl, SF, NF = vad_revr(Hb, T1, T2)
    return voiceseg, vsl, SF, NF, Hb
def pitch_vad(x, wnd, inc, T1, miniL=10):
    """
    Voiced-segment detection with the energy-entropy ratio (essentially speech segmentation)
    :param x: input signal
    :param wnd: window (array) or frame length (int)
    :param inc: frame shift
    :param T1: decision threshold on the normalized energy-entropy ratio
    :param miniL: minimum voiced-segment length in frames
    :return: voiced segments, number of segments, speech-frame flags SF, energy-entropy ratio Ef
    """
    y = enframe(x, wnd, inc)
    fn = y.shape[0]
    if isinstance(wnd, int):
        wlen = wnd
    else:
        wlen = len(wnd)
    Sp = np.abs(np.fft.fft(y, axis=1))
    Sp = Sp[:, :wlen // 2 + 1]
    Esum = np.sum(np.multiply(Sp, Sp), axis=1)
    prob = Sp / np.sum(Sp, axis=1, keepdims=True)
    # spectral entropy per frame
    H = -np.sum(np.multiply(prob, np.log10(prob + 1e-16)), axis=1)
    H = np.where(H < 0.1, np.max(H), H)
    # energy-entropy ratio, normalized to [0, 1]
    Ef = np.sqrt(1 + np.abs(Esum / H))
    Ef = Ef / np.max(Ef)
    zseg = findSegment(np.where(Ef > T1)[0])
    zsl = len(zseg.keys())
    SF = np.zeros(fn)
    for k in range(zsl):
        if zseg[k]['duration'] < miniL:
            # drop segments shorter than miniL frames
            zseg.pop(k)
        else:
            SF[zseg[k]['start']:zseg[k]['end']] = 1
    return zseg, len(zseg.keys()), SF, Ef
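
# Usage sketch (added for illustration, not part of the original repo code):
# segment a recording with pitch_vad and print the voiced segments as frame indices.
# The returned dict is keyed by segment index with 'start', 'end' and 'duration' fields.
if __name__ == '__main__':
    from chapter2_基础.soundBase import soundBase

    data, fs = soundBase('C7_3_y.wav').audioread()
    data = data / np.max(np.abs(data))
    wlen, inc = 240, 80
    voiceseg, vsl, SF, Ef = pitch_vad(data, np.hamming(wlen), inc, T1=0.1)
    for k in voiceseg.keys():
        print('segment %s: frames %d-%d' % (k, voiceseg[k]['start'], voiceseg[k]['end']))
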
def pitch_Ceps(x, wnd, inc, T1, fs, miniL=10):
    """
    Cepstrum-based pitch period detection
    :param x: input signal
    :param wnd: window (array) or frame length (int)
    :param inc: frame shift
    :param T1: energy-entropy-ratio threshold used by pitch_vad
    :param fs: sampling rate
    :param miniL: minimum voiced-segment length in frames
    :return: voiceseg, vsl, SF, Ef, period
    """
    y = enframe(x, wnd, inc)
    fn = y.shape[0]
    if isinstance(wnd, int):
        wlen = wnd
    else:
        wlen = len(wnd)
    voiceseg, vsl, SF, Ef = pitch_vad(x, wnd, inc, T1, miniL)
    lmin = fs // 500  # shortest pitch period (500 Hz upper bound)
    lmax = fs // 60   # longest pitch period (60 Hz lower bound)
    period = np.zeros(fn)
    # cepstrum of the voiced frames only
    y1 = y[np.where(SF == 1)[0], :]
    y1 = np.multiply(y1, np.hamming(wlen))
    xx = np.fft.fft(y1, axis=1)
    b = np.real(np.fft.ifft(2 * np.log(np.abs(xx) + 1e-10), axis=1))
    # pick the cepstral peak inside the admissible pitch range
    Lc = np.argmax(b[:, lmin:lmax], axis=1) + lmin
    period[np.where(SF == 1)[0]] = Lc
    return voiceseg, vsl, SF, Ef, period
def vad_corr(y, wnd, inc, NIS, th1, th2):
    x = enframe(y, wnd, inc)
    Ru = STAc(x.T)[0]
    Rum = Ru / np.max(Ru)
    thredth = np.max(Rum[:NIS])
    T1 = th1 * thredth
    T2 = th2 * thredth
    voiceseg, vsl, SF, NF = vad_forw(Rum, T1, T2)
    return voiceseg, vsl, SF, NF, Rum
def vad_pro(data, wnd, inc, NIS, thr1, thr2, mode):
    """
    Ratio-based endpoint detection
    :param data: input signal
    :param wnd: window (array) or frame length (int)
    :param inc: frame shift
    :param NIS: number of leading noise-only frames
    :param thr1: first threshold factor
    :param thr2: second threshold factor
    :param mode: 1 - energy/zero-crossing ratio, 2 - energy/entropy ratio
    :return: voiceseg, vsl, SF, NF, Epara
    """
    from scipy.signal import medfilt
    x = enframe(data, wnd, inc)
    if isinstance(wnd, int):
        wlen = wnd
    else:
        wlen = len(wnd)
    if mode == 1:  # energy/zero-crossing ratio
        a = 2
        b = 1
        LEn = np.log10(1 + np.sum(np.multiply(x, x) / a, axis=1))
        EZRn = LEn / (STZcr(data, wlen, inc) + b)
        for i in range(10):
            EZRn = medfilt(EZRn, 5)
        dth = np.mean(EZRn[:NIS])
        T1 = thr1 * dth
        T2 = thr2 * dth
        Epara = EZRn
    elif mode == 2:  # energy/entropy ratio
        a = 2
        X = np.abs(np.fft.fft(x, axis=1))
        X = X[:, :wlen // 2]
        Esum = np.log10(1 + np.sum(np.multiply(X, X) / a, axis=1))
        prob = X / np.sum(X, axis=1, keepdims=True)
        Hn = -np.sum(np.multiply(prob, np.log10(prob + 1e-10)), axis=1)
        Ef = np.sqrt(1 + np.abs(Esum / Hn))
        for i in range(10):
            Ef = medfilt(Ef, 5)
        Me = np.max(Ef)
        eth = np.mean(Ef[:NIS])
        Det = Me - eth
        T1 = thr1 * Det + eth
        T2 = thr2 * Det + eth
        Epara = Ef
    voiceseg, vsl, SF, NF = vad_forw(Epara, T1, T2)
    return voiceseg, vsl, SF, NF, Epara
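
# Usage sketch (added for illustration, not part of the original repo code):
# run the ratio-based VAD in both modes on the same recording. The threshold
# factors (3, 4) and the 0.25 s noise-only lead-in are assumed example values.
if __name__ == '__main__':
    from chapter2_基础.soundBase import soundBase

    data, fs = soundBase('C7_2_y.wav').audioread()
    data = data - np.mean(data)
    data = data / np.max(np.abs(data))
    wlen, inc = 200, 80
    NIS = int((0.25 * fs - wlen) // inc + 1)
    wnd = np.hamming(wlen)
    seg_ezr, _, SF_ezr, _, _ = vad_pro(data, wnd, inc, NIS, 3, 4, mode=1)  # energy/ZCR ratio
    seg_eer, _, SF_eer, _, _ = vad_pro(data, wnd, inc, NIS, 3, 4, mode=2)  # energy/entropy ratio
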
def pitch_Lpc(x, wnd, inc, T1, fs, p, miniL=10):
    """
    Linear-prediction (LPC) based pitch period detection
    :param x: input signal
    :param wnd: window (array) or frame length (int)
    :param inc: frame shift
    :param T1: energy-entropy-ratio threshold used by pitch_vad
    :param fs: sampling rate
    :param p: LPC order
    :param miniL: minimum voiced-segment length in frames
    :return: voiceseg, vsl, SF, Ef, period
    """
    from scipy.signal import lfilter
    from chapter3_分析实验.lpc import lpc_coeff
    y = enframe(x, wnd, inc)
    fn = y.shape[0]
    if isinstance(wnd, int):
        wlen = wnd
    else:
        wlen = len(wnd)
    voiceseg, vsl, SF, Ef = pitch_vad(x, wnd, inc, T1, miniL)
    lmin = fs // 500  # shortest pitch period (500 Hz upper bound)
    lmax = fs // 60   # longest pitch period (60 Hz lower bound)
    period = np.zeros(fn)
    for k in range(y.shape[0]):
        if SF[k] == 1:  # process voiced frames only
            u = np.multiply(y[k, :], np.hamming(wlen))
            ar, _ = lpc_coeff(u, p)
            ar[0] = 0
            z = lfilter(-ar, [1], u)
            # prediction error (LPC residual)
            E = u - z
            xx = np.fft.fft(E)
            # cepstrum of the residual; the pitch peak is searched in the admissible range
            b = np.real(np.fft.ifft(2 * np.log(np.abs(xx) + 1e-20)))
            lc = np.argmax(b[lmin:lmax])
            period[k] = lc + lmin
    return voiceseg, vsl, SF, Ef, period
plt.rcParams['axes.unicode_minus'] = False

data, fs = soundBase('C7_3_y.wav').audioread()
data -= np.mean(data)
data /= np.max(np.abs(data))
data = lfilter([1, -0.99], 1, data)  # pre-emphasis
N = len(data)
time = [i / fs for i in range(N)]  # time axis
wlen = 240
inc = 80
overlap = wlen - inc
n2 = [i for i in range(wlen // 2)]
w1 = [i / overlap for i in range(overlap)]
w2 = [i / overlap for i in range(overlap - 1, -1, -1)]
wnd = np.hamming(wlen)
X = enframe(data, wnd, inc)
fn = X.shape[0]
Etmp = np.sum(np.power(X, 2), axis=1)
Etmp /= np.max(Etmp)
T1, r2 = 0.1, 0.5
miniL = 10
mnlong = 5
ThrC = [10, 15]
p = 12
frameTime = FrameTimeC(fn, wlen, inc, fs)
Doption = 0

# pitch detection (cepstrum method) followed by pitch-contour smoothing
voiceseg, vosl, SF, Ef, period = pitch_Ceps(data, wlen, inc, T1, fs)
Dpitch = pitfilterm1(period, voiceseg, vosl)
## formant detection
from scipy.signal import lfilter

plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

data, fs = soundBase('C7_2_y.wav').audioread()
data -= np.mean(data)
data /= np.max(np.abs(data))
N = len(data)
time = [i / fs for i in range(N)]  # time axis
p = 12
wlen, inc = 200, 80
msoverlap = wlen - inc
y = enframe(data, wlen, inc)
fn = y.shape[0]
Acoef = np.zeros((y.shape[0], p + 1))
resid = np.zeros(y.shape)
synFrame = np.zeros(y.shape)

## 7.2.1
# LPC coefficients and prediction residual for every frame
for i in range(fn):
    a, _ = lpc_coeff(y[i, :], p)
    Acoef[i, :] = a
    resid[i, :] = lfilter(a, [1], y[i, :])

# speech synthesis: pass the residual through the all-pole synthesis filter
for i in range(fn):
    synFrame[i, :] = lfilter([1], Acoef[i, :], resid[i, :])
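
# Reconstruction sketch (added for illustration; not necessarily how the original
# script continues): stitch the synthesized frames back into one waveform with a
# plain overlap-add, the same scheme used at the end of SpectralSub above.
outspeech = np.zeros((fn - 1) * inc + wlen)
for i in range(fn):
    outspeech[i * inc:i * inc + wlen] += synFrame[i, :]
outspeech /= np.max(np.abs(outspeech))  # rescale: overlapping frames add up in amplitude
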
from chapter2_基础.soundBase import *
from chapter7_语音合成.flipframe import *
from chapter3_分析实验.enframe import enframe

plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

data, fs = soundBase('C7_1_y.wav').audioread()

wlen = 256
wnd = np.hamming(wlen)
overlap = 100
f = enframe(data, wnd, overlap)
plt.figure(figsize=(14, 12))

# 7.1.1 overlap-add reconstruction
fn_overlap = Filpframe_OverlapA(f, wnd, overlap)
plt.subplot(3, 2, 1)
plt.plot(data / np.max(np.abs(data)), 'k')
plt.title('Original signal')
plt.subplot(3, 2, 2)
plt.title('Reconstructed signal - overlap-add method')
plt.plot(fn_overlap / np.max(np.abs(fn_overlap)), 'c')

# 7.1.2 overlap-save reconstruction
fn_s = Filpframe_OverlapS(f, wnd, overlap)
plt.subplot(3, 2, 3)
plt.plot(data / np.max(np.abs(data)), 'k')
plt.title('Original signal')
plt.subplot(3, 2, 4)
plt.title('Reconstructed signal - overlap-save method')
plt.plot(fn_s / np.max(np.abs(fn_s)), 'c')
def vad_TwoThr(x, wlen, inc, NIS):
    """
    Endpoint detection with the dual-threshold method
    :param x: speech signal
    :param wlen: frame length
    :param inc: frame shift
    :param NIS: number of leading noise-only frames
    :return: voiceseg, vsl, SF, NF, amp, zcr
    """
    maxsilence = 15  # longest silence gap tolerated inside a segment (frames)
    minlen = 5       # minimum segment length (frames)
    status = 0
    y = enframe(x, wlen, inc)
    fn = y.shape[0]
    amp = STEn(x, wlen, inc)               # short-time energy
    zcr = STZcr(x, wlen, inc, delta=0.01)  # short-time zero-crossing rate
    ampth = np.mean(amp[:NIS])
    zcrth = np.mean(zcr[:NIS])
    amp2 = 2 * ampth  # lower energy threshold
    amp1 = 4 * ampth  # upper energy threshold
    zcr2 = 2 * zcrth  # zero-crossing threshold
    xn = 0
    count = np.zeros(fn)
    silence = np.zeros(fn)
    x1 = np.zeros(fn)
    x2 = np.zeros(fn)
    for n in range(fn):
        if status == 0 or status == 1:  # 0 = silence, 1 = possible speech onset
            if amp[n] > amp1:
                x1[xn] = max(1, n - count[xn] - 1)
                status = 2
                silence[xn] = 0
                count[xn] += 1
            elif amp[n] > amp2 or zcr[n] > zcr2:
                status = 1
                count[xn] += 1
            else:
                status = 0
                count[xn] = 0
                x1[xn] = 0
                x2[xn] = 0
        elif status == 2:  # inside a speech segment
            if amp[n] > amp2 and zcr[n] > zcr2:
                count[xn] += 1
            else:
                silence[xn] += 1
                if silence[xn] < maxsilence:
                    count[xn] += 1
                elif count[xn] < minlen:
                    # too short to be speech, go back to silence
                    status = 0
                    silence[xn] = 0
                    count[xn] = 0
                else:
                    # segment ends
                    status = 3
                    x2[xn] = x1[xn] + count[xn]
        elif status == 3:  # segment finished, start looking for the next one
            status = 0
            xn += 1
            count[xn] = 0
            silence[xn] = 0
            x1[xn] = 0
            x2[xn] = 0
    el = len(x1[:xn])
    if x1[el - 1] == 0:
        el -= 1
    if x2[el - 1] == 0:
        print('Error: could not find the ending point!\n')
        x2[el - 1] = fn
    SF = np.zeros(fn)
    NF = np.ones(fn)
    for i in range(el):
        SF[int(x1[i]):int(x2[i])] = 1
        NF[int(x1[i]):int(x2[i])] = 0
    voiceseg = findSegment(np.where(SF == 1)[0])
    vsl = len(voiceseg.keys())
    return voiceseg, vsl, SF, NF, amp, zcr
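
# Usage sketch (added for illustration, not part of the original repo code):
# dual-threshold endpoint detection on a recording, reporting segments in seconds.
# The file name, frame parameters and 0.25 s noise-only lead-in are assumptions;
# frame centres are converted to time directly instead of using a repo helper.
if __name__ == '__main__':
    from chapter2_基础.soundBase import soundBase

    data, fs = soundBase('C7_2_y.wav').audioread()
    data = data - np.mean(data)
    data = data / np.max(np.abs(data))
    wlen, inc = 200, 80
    NIS = int((0.25 * fs - wlen) // inc + 1)
    voiceseg, vsl, SF, NF, amp, zcr = vad_TwoThr(data, wlen, inc, NIS)
    frameTime = (np.arange(len(SF)) * inc + wlen / 2) / fs  # frame-centre times in seconds
    for k in voiceseg.keys():
        start = voiceseg[k]['start']
        end = min(voiceseg[k]['end'], len(SF) - 1)  # guard in case 'end' is one past the last frame
        print('speech from %.3f s to %.3f s' % (frameTime[start], frameTime[end]))
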