def calc_ssnr(ref_sig, sig, frame_size, mid_only=False):
    """Compute the segmental signal-to-noise ratio (SSNR) in dB.

    Both signals are framed with ``sdcore.frame`` using ``frame_size`` for
    both the size and the following positional argument (presumably the hop,
    i.e. non-overlapping frames -- confirm against ``sdcore.frame``). For
    each frame the noise is the difference between the reference frame and
    the evaluated frame, and the SSNR is
    ``10 * log10(reference energy / noise energy)``. The module-level
    ``min_pf`` is added to both energies.

    :param ref_sig: clean signal used as the reference
    :param sig: speech signal to evaluate
    :param frame_size: frame size handed to ``sdcore.frame``
    :param mid_only: if True, only the frame at index ``len(frames) // 2``
        is evaluated and its SSNR is returned without clipping
    :return: the unclipped SSNR of the middle frame when ``mid_only`` is
        True; otherwise the mean of the per-frame SSNRs after limiting each
        of them to ``[ssnr_min, ssnr_max]``
    """
    clean = sdcore.frame(ref_sig, frame_size, frame_size)
    degraded = sdcore.frame(sig, frame_size, frame_size)

    if mid_only:
        # Keep just the single frame in the middle of the sequence.
        mid = len(clean) // 2
        clean, degraded = clean[mid, :], degraded[mid, :]

    residual = clean - degraded
    signal_energy = np.sum(clean ** 2, axis=-1) + min_pf
    residual_energy = np.sum(residual ** 2, axis=-1) + min_pf
    ssnr = 10 * np.log10(signal_energy / residual_energy)

    if mid_only:
        # The single-frame value is intentionally returned unclipped.
        return ssnr

    # Limit every frame to the module-level bounds before averaging.
    return np.mean(np.clip(ssnr, ssnr_min, ssnr_max))
def calc_ssnr(ref_sig, deg_sig, frame_size, mid_only=False):
    """Compute the segmental signal-to-noise ratio (SSNR) in dB.

    Both signals are framed with ``sdcore.frame(..., center=False)`` using
    ``frame_size`` for both the size and the following positional argument
    (presumably the hop -- confirm against ``sdcore.frame``). Per frame, the
    SSNR is ``10 * log10(reference energy / noise energy)`` where the noise
    is the reference frame minus the degraded frame and the module-level
    ``min_pf`` is added to each energy.

    :param ref_sig: clean reference signal
    :param deg_sig: degraded signal to evaluate
    :param frame_size: frame size handed to ``sdcore.frame``
    :param mid_only: if True, only the frame at index ``len(frames) // 2``
        is evaluated and its SSNR is returned without clipping
    :return: the unclipped SSNR of the middle frame when ``mid_only`` is
        True; otherwise the mean of the per-frame SSNRs after limiting each
        of them to ``[ssnr_min, ssnr_max]``
    """
    def _energy(frames):
        # Sum of squares over the last axis, offset by the floor ``min_pf``.
        return np.sum(frames ** 2, axis=-1) + min_pf

    clean = sdcore.frame(ref_sig, frame_size, frame_size, center=False)
    degraded = sdcore.frame(deg_sig, frame_size, frame_size, center=False)

    if mid_only:
        # Keep just the single frame in the middle of the sequence.
        mid = len(clean) // 2
        clean = clean[mid, :]
        degraded = degraded[mid, :]

    ssnr = 10 * np.log10(_energy(clean) / _energy(clean - degraded))

    if mid_only:
        # The single-frame value is intentionally returned unclipped.
        return ssnr

    # Apply the lower bound first, then the upper bound, then average.
    below = ssnr < ssnr_min
    ssnr[below] = ssnr_min
    above = ssnr > ssnr_max
    ssnr[above] = ssnr_max
    return np.mean(ssnr)
def compute_feat(wav, feat, samplerate, windowsize, hop_point, norm=False, center=False):
    """Compute one or several features from a time-domain signal.

    :param wav: time-domain signal
    :param feat: feature type (one of the module's ``FEAT_*`` constants), or
        a list/tuple of feature types; for a collection each feature is
        computed separately and the results are concatenated along axis 1
    :param samplerate: sampling rate, forwarded to the mel / PNCC / gammatone
        extractors
    :param windowsize: window (frame) size in samples
    :param hop_point: hop between frames in samples
    :param norm: if True, ``standardize`` is applied to every element along
        the first axis of the feature
    :param center: forwarded to ``core.frame``; only the features built on
        ``core.frame`` (magnitude, spectrum, phase, raw wav) use it
    :return: the feature as a numpy array
    :raises ValueError: if ``feat`` is an empty collection or an unsupported
        feature type
    """
    # isinstance (not an exact type comparison) so that list/tuple subclasses
    # such as namedtuples are also handled as collections of feature types.
    if isinstance(feat, (list, tuple)):
        if not feat:
            # np.concatenate would fail on an empty list with an unrelated
            # message; fail early with the same exception type instead.
            raise ValueError("feat must contain at least one feature type")
        feature = [compute_feat(wav, f, samplerate, windowsize, hop_point,
                                norm=norm, center=center) for f in feat]
        return np.concatenate(feature, axis=1)

    if feat == FEAT_MAGNITUDE:
        frames = core.frame(wav, windowsize, hop_point, center=center)
        feature = core.stft(frames)
        feature = np.absolute(feature)
    elif feat == FEAT_LOG_MEL:
        feature = mel_spec.melspectrogram(wav, samplerate, windowsize, hop_point, MEL_COEF_N)
        feature = mel_spec.power_to_db(feature)
    elif feat == FEAT_PNCC:
        feature = pncc.calc_pncc(wav, samplerate, windowsize, hop_point)
    elif feat == FEAT_GF:
        feature = gammatone.gammatonegram(wav, samplerate, windowsize, hop_point,
                                          GAMMATONE_FILTER_N)
    elif feat == FEAT_GFCC:
        feature = gammatone.gammatonegram(wav, samplerate, windowsize, hop_point,
                                          GAMMATONE_FILTER_N)
        feature = gammatone.gtm2gfcc(feature, dct_stop=31)
    elif feat == FEAT_SPECTRUM:
        # Complex STFT, kept as-is.
        frames = core.frame(wav, windowsize, hop_point, center=center)
        feature = core.stft(frames)
    elif feat == FEAT_PHASE:
        frames = core.frame(wav, windowsize, hop_point, center=center)
        feature = core.stft(frames)
        feature = np.angle(feature)
    elif feat == FEAT_WAV:
        # Raw framed waveform.
        feature = core.frame(wav, windowsize, hop_point, center=center)
    else:
        raise ValueError("feature %s not support" % feat)

    if norm:
        feature = [standardize(f) for f in feature]
    return np.array(feature)
def spectrogram(y=None, frames_size=410, frame_shift=160, power=2):
    """Return the transposed STFT magnitude of ``y`` raised to ``power``.

    :param y: input signal, forwarded to ``core.frame`` as ``wav``
    :param frames_size: frame size forwarded to ``core.frame``
    :param frame_shift: frame shift forwarded to ``core.frame``
    :param power: exponent applied element-wise to the magnitude
        (the default 2 yields a power spectrogram)
    :return: ``abs(core.stft(frames)).T ** power``
    """
    framed = core.frame(wav=y, frame_size=frames_size, frame_shift=frame_shift)
    # Magnitude of the short-time spectrum of the framed signal.
    magnitude = np.abs(core.stft(frames=framed))
    return magnitude.T ** power
def gammatonegram(X, samplerate=16000, frame_size=410, frame_shift=160, nfilter=64,
                  low_freq=100, high_freq=None, fft_proc=True, width=1):
    """Compute a gammatone-filterbank spectrogram of a signal.

    Two computation paths are available:

    * ``fft_proc=True``: the signal is framed and transformed with
      ``core.frame`` / ``core.stft`` and the magnitude spectrum is weighted
      by the matrix returned from ``fft2gammatonemx``.
    * ``fft_proc=False``: the signal is passed through the ERB filterbank
      (``make_erb_filters`` / ``pass_erb_filterbank``) and the RMS of every
      channel is taken over windows of ``frame_size`` samples spaced
      ``frame_shift`` samples apart.

    :param X: input time-domain signal
    :param samplerate: sampling rate
    :param frame_size: analysis window length in samples (also used as the
        FFT size in the FFT path)
    :param frame_shift: hop between consecutive windows in samples
    :param nfilter: number of gammatone channels
    :param low_freq: lowest filter frequency
    :param high_freq: highest filter frequency; defaults to
        ``samplerate // 2``. Only used by the FFT path.
    :param fft_proc: select the FFT-based approximation (True) or the
        time-domain filterbank (False)
    :param width: forwarded to ``fft2gammatonemx``; only used by the FFT path
    :return: the gammatonegram transposed so that the first axis is the
        frame index in the time-domain path (``(num_frame, nfilter)``);
        the FFT path returns ``(gtm @ |STFT|.T / nfft).T``
    """
    if high_freq is None:
        high_freq = samplerate // 2

    if not fft_proc:
        fcoefs = np.flipud(make_erb_filters(samplerate, nfilter, low_freq))
        # Instantaneous energy of every filterbank channel.
        XE = pass_erb_filterbank(X, fcoefs) ** 2
        num_frame = 1 + (XE.shape[1] - frame_size) // frame_shift
        Y = np.zeros((nfilter, num_frame))
        for i in range(num_frame):
            # Frame i starts i hops into the signal. Using i itself as the
            # start (as before) ignored frame_shift and analysed windows that
            # were only one sample apart.
            start = i * frame_shift
            Y[:, i] = np.sqrt(np.mean(XE[:, start:start + frame_size], axis=1))
    else:
        nfft = frame_size
        gtm = fft2gammatonemx(nfft, samplerate, nfilter, low_freq, width,
                              high_freq, nfft // 2 + 1)
        frame = core.frame(X, frame_size, frame_shift)
        FFTX = core.stft(frame).T
        Y = 1 / nfft * np.dot(gtm, np.abs(FFTX))
    return Y.T
def calc_pncc(sig, sr=_SAMPLE_RATE, frame_size=410, frame_shift=160):
    """Compute Power-Normalized Cepstral Coefficients (PNCC).

    Processing steps: optional pre-emphasis, STFT magnitude, gammatone
    filterbank power, peak-power normalization at the 95th percentile,
    optional medium-duration power-bias subtraction with weight smoothing
    across channels, power-law or log nonlinearity, DCT, and cepstral mean
    normalization. The optional steps are controlled by module-level flags.

    :param sig: original time-domain signal (numpy array); it is not
        modified
    :param sr: sampling rate, forwarded to the gammatone filter response
        computation. The signal is NOT resampled here.
    :param frame_size: samples per frame (the default 410 is about 25.6 ms
        at 16 kHz)
    :param frame_shift: hop between frames in samples (the default 160 is
        10 ms at 16 kHz)
    :return: PNCC features, one row per frame and ``DCT_NUM`` columns
    """
    if _PRE_EMPHSIS:
        # Pre-emphasis using H(z) = 1 - 0.97 z^-1.
        # This must happen before framing to have any effect on the features
        # (it used to run after the STFT, where it changed nothing but the
        # caller's array). A new array is built so the input is left intact.
        sig = np.asarray(sig)
        sig = np.concatenate((sig[:1], sig[1:] - 0.97 * sig[:-1]))

    # STFT magnitude, keeping the first _FFT_SIZE // 2 bins of each frame.
    frames = basic.frame(sig, frame_size, frame_shift)
    frames = basic.stft(frames, n_fft=_FFT_SIZE, window='hamming', half=False)
    frames = frames[:, :_FFT_SIZE // 2]
    frames = np.abs(frames)
    num_frame = len(frames)

    # Obtaining the gammatone coefficients.
    aad_H = _calc_gammatone_filter_response(_FILTER_NUM, _FFT_SIZE, sr)
    aad_H = np.abs(_normalize_gain(aad_H))

    # Obtaining the short-time power per channel, and the total per frame.
    aad_P, ad_sum_P = [], []
    aad_HT = aad_H.T
    for frame in frames:
        aad_P.append(np.sum((aad_HT * frame) ** 2, axis=1))
        ad_sum_P.append(np.sum(aad_P[-1]))
    aad_P = np.array(aad_P).T  # -> (channel, frame)

    # Peak power normalization using the 95 % percentile of the frame power.
    ad_sum_P.sort()
    max_p = ad_sum_P[np.round(0.95 * len(ad_sum_P)).astype(int) - 1]
    aad_P = aad_P / max_p * _NORM_POWER

    if _CALC_MEDIUM_DURATION:
        # Medium-duration power: mean over up to _SMTH_FRM frames on either
        # side of frame j, truncated at the signal boundaries.
        aad_Q = []
        for i in range(_FILTER_NUM):
            q = [
                np.mean(aad_P[i, max(0, j - _SMTH_FRM):min(num_frame, j + _SMTH_FRM + 1)])
                for j in range(num_frame)
            ]
            aad_Q.append(q)
        aad_Q = np.array(aad_Q)

        # Per-channel weight: bias-subtracted power over medium-duration
        # power, both floored via _max(..., _EPS).
        aad_w = []
        for i in range(_FILTER_NUM):
            aad_tildeQ = _power_bias_sub(aad_Q[i, :], _DELTA)
            aad_w.append(_max(aad_tildeQ, _EPS) / _max(aad_Q[i, :], _EPS))
        aad_w = np.array(aad_w)

        # Weight smoothing across channels (up to _SMTH_FLT neighbours on
        # either side, truncated at the filterbank edges).
        aad_w_Smooth = np.zeros(aad_Q.shape)
        for i in range(_FILTER_NUM):
            for j in range(num_frame):
                aad_w_Smooth[i, j] = np.mean(
                    aad_w[max(i - _SMTH_FLT, 0):min(i + _SMTH_FLT + 1, _FILTER_NUM), j])
        aad_P *= aad_w_Smooth

    # Apply the nonlinearity.
    if _POWER_NONLINEARITY:
        aadSpec = aad_P ** _POWER_COEFF
    else:
        aadSpec = np.log(aad_P + _EPS)

    # DCT along the channel axis, keeping the first DCT_NUM coefficients.
    aadDCT = dct(aadSpec, norm='ortho', axis=0)
    aadDCT = aadDCT[:DCT_NUM, :]

    # CMN: remove the mean over time from every coefficient.
    for i in range(DCT_NUM):
        aadDCT[i, :] -= np.mean(aadDCT[i, :])
    return aadDCT.T