예제 #1
0
def SpectralSub(signal, wlen, inc, NIS, a, b):
    """
    谱减法滤波
    :param signal:
    :param wlen:
    :param inc:
    :param NIS:
    :param a:
    :param b:
    :return:
    """
    wnd = np.hamming(wlen)
    y = enframe(signal, wnd, inc)
    fn, flen = y.shape
    y_a = np.abs(np.fft.fft(y, axis=1))
    y_a2 = np.power(y_a, 2)
    y_angle = np.angle(np.fft.fft(y, axis=1))
    Nt = np.mean(y_a2[:NIS, ], axis=0)

    y_a2 = np.where(y_a2 >= a * Nt, y_a2 - a * Nt, b * Nt)

    X = y_a2 * np.cos(y_angle) + 1j * y_a2 * np.sin(y_angle)
    hatx = np.real(np.fft.ifft(X, axis=1))

    sig = np.zeros(int((fn - 1) * inc + wlen))

    for i in range(fn):
        start = i * inc
        sig[start:start + flen] += hatx[i, :]
    return sig
예제 #2
0
def Nmfcc(x, fs, p, frameSize, inc):
    """
    计算mfcc系数
    :param x: 输入信号
    :param fs: 采样率
    :param p: Mel滤波器组的个数
    :param frameSize: 分帧的每帧长度
    :param inc: 帧移
    :return:
    """
    # 预处理-预加重
    xx = lfilter([1, -0.97], [1], x)
    # 预处理-分幀
    xx = enframe(xx, frameSize, inc)
    # 预处理-加窗
    xx = np.multiply(xx, np.hanning(frameSize))
    # 计算FFT
    xx = np.fft.fft(xx)
    # 计算能量谱
    xx = np.multiply(np.abs(xx), np.abs(xx))
    # 计算通过Mel滤波器的能量
    xx = xx[:, :frameSize // 2 + 1]
    bank = melbankm(p, frameSize, fs, 0, 0.5 * fs, 0)
    ss = np.matmul(xx, bank.T)
    # 计算DCT倒谱
    n_dct = 20
    M = bank.shape[0]
    m = np.array([i for i in range(M)])
    mfcc = np.zeros((ss.shape[0], n_dct))
    for n in range(n_dct):
        mfcc[:, n] = np.sqrt(2 / M) * np.sum(np.multiply(
            np.log(ss), np.cos((2 * m - 1) * n * np.pi / 2 / M)),
                                             axis=1)
    return mfcc
예제 #3
0
def pitch_Corr(x, wnd, inc, T1, fs, miniL=10):
    """
    自相关法基音周期检测函数
    :param x: 
    :param wnd: 
    :param inc: 
    :param T1: 
    :param fs: 
    :param miniL: 
    :return: 
    """
    y = enframe(x, wnd, inc)
    fn = y.shape[0]
    if isinstance(wnd, int):
        wlen = wnd
    else:
        wlen = len(wnd)
    voiceseg, vsl, SF, Ef = pitch_vad(x, wnd, inc, T1, miniL)
    lmin = fs // 500  # 基音周期的最小值
    lmax = fs // 60  # 基音周期的最大值
    period = np.zeros(fn)
    for i in range(vsl):
        ixb = voiceseg[i]['start']
        ixd = voiceseg[i]['duration']
        for k in range(ixd):
            ru = np.correlate(y[k + ixb, :], y[k + ixb, :], 'full')
            ru = ru[wlen:]
            tloc = np.argmax(ru[lmin:lmax])
            period[k + ixb] = lmin + tloc

    return voiceseg, vsl, SF, Ef, period
예제 #4
0
def vad_specEN(data, wnd, inc, NIS, thr1, thr2, fs):
    from scipy.signal import medfilt
    x = enframe(data, wnd, inc)
    X = np.abs(np.fft.fft(x, axis=1))
    if len(wnd) == 1:
        wlen = wnd
    else:
        wlen = len(wnd)
    df = fs / wlen
    fx1 = int(250 // df + 1)  # 250Hz位置
    fx2 = int(3500 // df + 1)  # 500Hz位置
    km = wlen // 8
    K = 0.5
    E = np.zeros((X.shape[0], wlen // 2))
    E[:, fx1 + 1:fx2 - 1] = X[:, fx1 + 1:fx2 - 1]
    E = np.multiply(E, E)
    Esum = np.sum(E, axis=1, keepdims=True)
    P1 = np.divide(E, Esum)
    E = np.where(P1 >= 0.9, 0, E)
    Eb0 = E[:, 0::4]
    Eb1 = E[:, 1::4]
    Eb2 = E[:, 2::4]
    Eb3 = E[:, 3::4]
    Eb = Eb0 + Eb1 + Eb2 + Eb3
    prob = np.divide(Eb + K, np.sum(Eb + K, axis=1, keepdims=True))
    Hb = -np.sum(np.multiply(prob, np.log10(prob + 1e-10)), axis=1)
    for i in range(10):
        Hb = medfilt(Hb, 5)
    Me = np.mean(Hb)
    eth = np.mean(Hb[:NIS])
    Det = eth - Me
    T1 = thr1 * Det + Me
    T2 = thr2 * Det + Me
    voiceseg, vsl, SF, NF = vad_revr(Hb, T1, T2)
    return voiceseg, vsl, SF, NF, Hb
예제 #5
0
def pitch_vad(x, wnd, inc, T1, miniL=10):
    """
    使用能熵比检测基音,实际上就是语音分段
    :param x:
    :param wnd:
    :param inc:
    :param T1:
    :param miniL:
    :return:
    """
    y = enframe(x, wnd, inc)
    fn = y.shape[0]
    if isinstance(wnd, int):
        wlen = wnd
    else:
        wlen = len(wnd)

    Sp = np.abs(np.fft.fft(y, axis=1))
    Sp = Sp[:, :wlen // 2 + 1]
    Esum = np.sum(np.multiply(Sp, Sp), axis=1)
    prob = Sp / np.sum(Sp, axis=1, keepdims=True)
    H = -np.sum(np.multiply(prob, np.log10(prob + 1e-16)), axis=1)
    H = np.where(H < 0.1, np.max(H), H)
    Ef = np.sqrt(1 + np.abs(Esum / H))
    Ef = Ef / np.max(Ef)

    zseg = findSegment(np.where(Ef > T1)[0])
    zsl = len(zseg.keys())
    SF = np.zeros(fn)
    for k in range(zsl):
        if zseg[k]['duration'] < miniL:
            zseg.pop(k)
        else:
            SF[zseg[k]['start']:zseg[k]['end']] = 1
    return zseg, len(zseg.keys()), SF, Ef
예제 #6
0
def pitch_Ceps(x, wnd, inc, T1, fs, miniL=10):
    """
    倒谱法基音周期检测函数
    :param x:
    :param wnd:
    :param inc:
    :param T1:
    :param fs:
    :param miniL:
    :return:
    """
    y = enframe(x, wnd, inc)
    fn = y.shape[0]
    if isinstance(wnd, int):
        wlen = wnd
    else:
        wlen = len(wnd)
    voiceseg, vsl, SF, Ef = pitch_vad(x, wnd, inc, T1, miniL)
    lmin = fs // 500  # 基音周期的最小值
    lmax = fs // 60  # 基音周期的最大值
    period = np.zeros(fn)
    y1 = y[np.where(SF == 1)[0], :]
    y1 = np.multiply(y1, np.hamming(wlen))
    xx = np.fft.fft(y1, axis=1)
    b = np.fft.ifft(2 * np.log(np.abs(xx) + 1e-10))
    Lc = np.argmax(b[:, lmin:lmax], axis=1) + lmin - 1
    period[np.where(SF == 1)[0]] = Lc
    return voiceseg, vsl, SF, Ef, period
예제 #7
0
def vad_corr(y, wnd, inc, NIS, th1, th2):
    x = enframe(y, wnd, inc)
    Ru = STAc(x.T)[0]
    Rum = Ru / np.max(Ru)
    thredth = np.max(Rum[:NIS])
    T1 = th1 * thredth
    T2 = th2 * thredth
    voiceseg, vsl, SF, NF = vad_forw(Rum, T1, T2)
    return voiceseg, vsl, SF, NF, Rum
예제 #8
0
def vad_pro(data, wnd, inc, NIS, thr1, thr2, mode):
    """
    使用比例法检测端点
    :param data:
    :param wnd:
    :param inc:
    :param NIS:
    :param thr1:
    :param thr2:
    :param mode:
    :return:
    """
    from scipy.signal import medfilt
    x = enframe(data, wnd, inc)
    if len(wnd) == 1:
        wlen = wnd
    else:
        wlen = len(wnd)
    if mode == 1:  # 能零比
        a = 2
        b = 1
        LEn = np.log10(1 + np.sum(np.multiply(x, x) / a, axis=1))
        EZRn = LEn / (STZcr(data, wlen, inc) + b)
        for i in range(10):
            EZRn = medfilt(EZRn, 5)
        dth = np.mean(EZRn[:NIS])
        T1 = thr1 * dth
        T2 = thr2 * dth
        Epara = EZRn
    elif mode == 2:  # 能熵比
        a = 2
        X = np.abs(np.fft.fft(x, axis=1))
        X = X[:, :wlen // 2]
        Esum = np.log10(1 + np.sum(np.multiply(X, X) / a, axis=1))
        prob = X / np.sum(X, axis=1, keepdims=True)
        Hn = -np.sum(np.multiply(prob, np.log10(prob + 1e-10)), axis=1)
        Ef = np.sqrt(1 + np.abs(Esum / Hn))
        for i in range(10):
            Ef = medfilt(Ef, 5)
        Me = np.max(Ef)
        eth = np.mean(Ef[NIS])
        Det = Me - eth
        T1 = thr1 * Det + eth
        T2 = thr2 * Det + eth
        Epara = Ef
    voiceseg, vsl, SF, NF = vad_forw(Epara, T1, T2)
    return voiceseg, vsl, SF, NF, Epara
예제 #9
0
def pitch_Lpc(x, wnd, inc, T1, fs, p, miniL=10):
    """
    线性预测法基音周期检测函数
    :param x:
    :param wnd:
    :param inc:
    :param T1:
    :param fs:
    :param p:
    :param miniL:
    :return:
    """
    from scipy.signal import lfilter
    from chapter3_分析实验.lpc import lpc_coeff
    y = enframe(x, wnd, inc)
    fn = y.shape[0]
    if isinstance(wnd, int):
        wlen = wnd
    else:
        wlen = len(wnd)
    voiceseg, vsl, SF, Ef = pitch_vad(x, wnd, inc, T1, miniL)
    lmin = fs // 500  # 基音周期的最小值
    lmax = fs // 60  # 基音周期的最大值
    period = np.zeros(fn)
    for k in range(y.shape[0]):
        if SF[k] == 1:
            u = np.multiply(y[k, :], np.hamming(wlen))
            ar, _ = lpc_coeff(u, p)
            ar[0] = 0
            z = lfilter(-ar, [1], u)
            E = u - z
            xx = np.fft.fft(E)
            b = np.fft.ifft(2 * np.log(np.abs(xx) + 1e-20))
            lc = np.argmax(b[lmin:lmax])
            period[k] = lc + lmin
    return voiceseg, vsl, SF, Ef, period
예제 #10
0
plt.rcParams['axes.unicode_minus'] = False

data, fs = soundBase('C7_3_y.wav').audioread()
data -= np.mean(data)
data /= np.max(np.abs(data))
data = lfilter([1, -0.99], 1, data)
N = len(data)
time = [i / fs for i in range(N)]  # 设置时间
wlen = 240
inc = 80
overlap = wlen - inc
n2 = [i for i in range(wlen // 2)]
w1 = [i / overlap for i in range(overlap)]
w2 = [i / overlap for i in range(overlap - 1, -1, -1)]
wnd = np.hamming(wlen)
X = enframe(data, wnd, inc)
fn = X.shape[0]
Etmp = np.sum(np.power(X, 2), axis=1)
Etmp /= np.max(Etmp)
T1, r2 = 0.1, 0.5
miniL = 10
mnlong = 5
ThrC = [10, 15]
p = 12

frameTime = FrameTimeC(fn, wlen, inc, fs)
Doption = 0

voiceseg, vosl, SF, Ef, period = pitch_Ceps(data, wlen, inc, T1, fs)
Dpitch = pitfilterm1(period, voiceseg, vosl)
## 共振峰检测
예제 #11
0
from scipy.signal import lfilter

plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

data, fs = soundBase('C7_2_y.wav').audioread()

data -= np.mean(data)
data /= np.max(np.abs(data))
N = len(data)
time = [i / fs for i in range(N)]  # 设置时间
p = 12
wlen, inc = 200, 80
msoverlap = wlen - inc
y = enframe(data, wlen, inc)
fn = y.shape[0]
Acoef = np.zeros((y.shape[0], p + 1))
resid = np.zeros(y.shape)
synFrame = np.zeros(y.shape)
## 7.2.1

# 求每帧的LPC系数与预测误差
for i in range(fn):
    a, _ = lpc_coeff(y[i, :], p)
    Acoef[i, :] = a
    resid[i, :] = lfilter(a, [1], y[i, :])

# 语音合成
for i in range(fn):
    synFrame[i, :] = lfilter([1], Acoef[i, :], resid[i, :])
예제 #12
0
from chapter2_基础.soundBase import *
from chapter7_语音合成.flipframe import *
from chapter3_分析实验.enframe import enframe

plt.rcParams['font.sans-serif'] = ['SimHei']
plt.rcParams['axes.unicode_minus'] = False

data, fs = soundBase('C7_1_y.wav').audioread()

wlen = 256
wnd = np.hamming(wlen)
overlap = 100
f = enframe(data, wnd, overlap)
plt.figure(figsize=(14, 12))
# 7.1.1
fn_overlap = Filpframe_OverlapA(f, wnd, overlap)
plt.subplot(3, 2, 1)
plt.plot(data / np.max(np.abs(data)), 'k')
plt.title('原始信号')
plt.subplot(3, 2, 2)
plt.title('还原信号-重叠相加法')
plt.plot(fn_overlap / np.max(np.abs(fn_overlap)), 'c')

# 7.1.2
fn_s = Filpframe_OverlapS(f, wnd, overlap)
plt.subplot(3, 2, 3)
plt.plot(data / np.max(np.abs(data)), 'k')
plt.title('原始信号')
plt.subplot(3, 2, 4)
plt.title('还原信号-重叠存储法')
plt.plot(fn_s / np.max(np.abs(fn_s)), 'c')
예제 #13
0
def vad_TwoThr(x, wlen, inc, NIS):
    """
    使用门限法检测语音段
    :param x: 语音信号
    :param wlen: 分帧长度
    :param inc: 帧移
    :param NIS:
    :return:
    """
    maxsilence = 15
    minlen = 5
    status = 0
    y = enframe(x, wlen, inc)
    fn = y.shape[0]
    amp = STEn(x, wlen, inc)
    zcr = STZcr(x, wlen, inc, delta=0.01)
    ampth = np.mean(amp[:NIS])
    zcrth = np.mean(zcr[:NIS])
    amp2 = 2 * ampth
    amp1 = 4 * ampth
    zcr2 = 2 * zcrth
    xn = 0
    count = np.zeros(fn)
    silence = np.zeros(fn)
    x1 = np.zeros(fn)
    x2 = np.zeros(fn)
    for n in range(fn):
        if status == 0 or status == 1:
            if amp[n] > amp1:
                x1[xn] = max(1, n - count[xn] - 1)
                status = 2
                silence[xn] = 0
                count[xn] += 1
            elif amp[n] > amp2 or zcr[n] > zcr2:
                status = 1
                count[xn] += 1
            else:
                status = 0
                count[xn] = 0
                x1[xn] = 0
                x2[xn] = 0

        elif status == 2:
            if amp[n] > amp2 and zcr[n] > zcr2:
                count[xn] += 1
            else:
                silence[xn] += 1
                if silence[xn] < maxsilence:
                    count[xn] += 1
                elif count[xn] < minlen:
                    status = 0
                    silence[xn] = 0
                    count[xn] = 0
                else:
                    status = 3
                    x2[xn] = x1[xn] + count[xn]
        elif status == 3:
            status = 0
            xn += 1
            count[xn] = 0
            silence[xn] = 0
            x1[xn] = 0
            x2[xn] = 0
    el = len(x1[:xn])
    if x1[el - 1] == 0:
        el -= 1
    if x2[el - 1] == 0:
        print('Error: Not find endding point!\n')
        x2[el] = fn
    SF = np.zeros(fn)
    NF = np.ones(fn)
    for i in range(el):
        SF[int(x1[i]):int(x2[i])] = 1
        NF[int(x1[i]):int(x2[i])] = 0
    voiceseg = findSegment(np.where(SF == 1)[0])
    vsl = len(voiceseg.keys())
    return voiceseg, vsl, SF, NF, amp, zcr