def mel_dist(x1, x2, fs, num, wlen, inc):
    """
    Compute the MFCC parameters of two signals x1, x2 and their per-frame distance.

    :param x1: signal 1
    :param x2: signal 2
    :param fs: sample frequency
    :param num: the number of leading MFCC coefficients kept per frame
    :param wlen: frame length
    :param inc: frame shift
    :return Dcep: Euclidean distance between the two coefficient vectors of
        each frame; one entry per frame of ``x1``
    :return Ccep1, Ccep2: the first ``num`` MFCC columns of each signal
    :raises IndexError: if ``x2`` yields fewer MFCC frames than ``x1``
    """
    M = MFCC()
    # The sample rate is the ``fs`` parameter; the previous code referenced an
    # undefined name ``Fs`` here.
    ccc1 = M.mfcc(x1, fs, num, wlen, inc)  # MFCC
    ccc2 = M.mfcc(x2, fs, num, wlen, inc)
    fn1 = np.shape(ccc1)[0]  # frame number of the first signal
    Ccep1 = ccc1[:, 0:num]
    Ccep2 = ccc2[:, 0:num]
    # Frames are compared pairwise, so x2 must provide at least as many frames
    # as x1. Checking explicitly also prevents NumPy from silently
    # broadcasting a single-frame Ccep2 against every frame of Ccep1.
    if np.shape(Ccep2)[0] < fn1:
        raise IndexError(
            "x2 produced fewer MFCC frames than x1; cannot compare frame by frame"
        )
    # Row-wise Euclidean distance over the selected coefficients.
    Dcep = np.sqrt(np.sum((Ccep1 - Ccep2[:fn1]) ** 2, axis=1))
    return Dcep, Ccep1, Ccep2
def super_vector(test_file_name, ubm_file):
    """
    Adapt a stored GMM to the MFCC features of one wave file and return its means.

    :param test_file_name: path handed to ``mywave().WaveRead``
    :param ubm_file: path handed to ``GMM.read`` to load the model before adaptation
    :return: the ``means`` attribute of the GMM after ``adapt`` has been called
    """
    samples = mywave().WaveRead(test_file_name)
    # Keep only the samples selected by vad(); it receives the squared signal.
    # presumably vad is a voice-activity detector returning an index/mask -- TODO confirm
    kept_idx = vad(samples ** 2)
    kept_samples = samples[kept_idx]
    # NOTE(review): the positional MFCC settings are copied verbatim; their
    # meaning is defined by the MFCC class, which is not visible here.
    extractor = MFCC(40, 12, 300, 3400, 0.97, 16000, 50, 0.0256, 256)
    features = extractor.sig2s2mfc(kept_samples)
    model = GMM(n_mix=128, n_dim=12)
    model.read(ubm_file)
    model.adapt(features)
    return model.means
def test(signal, fs, feat_list):
    """
    Find the template in ``feat_list`` with the smallest DTW cost to ``signal``.

    The signal is scaled by 100 / (sum of absolute deviations from its mean)
    before feature extraction. Only the scale factor uses the mean; the mean
    itself is not subtracted from the signal.

    :param signal: input samples (array-like accepted by NumPy arithmetic)
    :param fs: sample frequency, forwarded to ``MFCC.MFCC``
    :param feat_list: indexable collection of template features; must hold at
        least one entry (``feat_list[0]`` is accessed unconditionally)
    :return: ``(digit, dis)`` -- index of the best-matching template and its
        DTW cost. On ties the lowest index wins.
    """
    mean = np.average(signal)
    energy = np.sum(np.abs(signal - mean))
    signal = signal / energy * 100
    mfcc = MFCC.MFCC(signal, fs, Frame_Len, Hop_Len)

    # Template 0 seeds the search, so the scan starts at index 1 instead of
    # recomputing the (potentially expensive) DTW cost for template 0.
    digit = 0
    dis = DTW.DTW(mfcc, feat_list[0])
    for i in range(1, len(feat_list)):
        c = DTW.DTW(mfcc, feat_list[i])
        if c < dis:  # strict '<' keeps the earliest template on ties
            digit = i
            dis = c
    return digit, dis
# Read the recording, remove its DC offset and scale it to unit peak amplitude.
speech = Speech()
xx, fs = speech.audioread(filename, 8000)
xx = xx - np.mean(xx)  # DC
# Normalise by the peak magnitude: dividing by max(xx) alone leaves values
# outside [-1, 1] whenever the largest excursion is negative.
x = xx / np.max(np.abs(xx))  # normalized
N = len(x)
time = np.arange(N) / fs

# Add noise at the requested SNR.
noisy = Noisy()
signal, _ = noisy.Gnoisegen(x, SNR)  # add noise

# Framing parameters.
wnd = np.hamming(wlen)  # window function
overlap = wlen - inc
NIS = int((IS * fs - wlen) / inc + 1)  # unvoice segment frame number
y = speech.enframe(signal, list(wnd), inc).T
fn = y.shape[1]  # frame number
frameTime = speech.FrameTime(fn, wlen, inc, fs)  # frame to time

# MFCC of the noisy signal; keep the first 16 coefficients of every frame.
Mfcc = MFCC()
ccc = Mfcc.mfcc(signal, fs, 16, wlen, inc)  # MFCC
fn1 = ccc.shape[0]  # frame number
# NOTE(review): two frames are trimmed at each end here, presumably to line up
# with the MFCC output -- confirm against Mfcc.mfcc.
frameTime1 = frameTime[2:fn - 2]
Ccep = ccc[:, 0:16]  # MFCC coefficient

# Approximate average noise MFCC vector, taken from the first five frames.
C0 = np.mean(Ccep[0:5, :], axis=0)

# Euclidean cepstral distance between every frame (from frame 5 on) and the
# noise estimate. Dcep is sized by fn, so entries from fn1 onward stay zero.
Dcep = np.zeros(fn)
Dcep[5:fn1] = np.sqrt(np.sum((Ccep[5:fn1, :] - C0) ** 2, axis=1))
# The frames used for the noise estimate inherit the distance of frame 5.
Dcep[0:5] = Dcep[5]
def template(signal, fs):
    """
    Scale a signal and return the features produced by ``MFCC.MFCC``.

    The scale factor is 100 divided by the sum of absolute deviations from the
    signal mean; the mean itself is not subtracted from the signal.

    :param signal: input samples (array-like accepted by NumPy arithmetic)
    :param fs: sample frequency, forwarded to ``MFCC.MFCC``
    :return: whatever ``MFCC.MFCC(scaled, fs, Frame_Len, Hop_Len)`` returns
    """
    centre = np.average(signal)
    abs_deviation_total = np.sum(np.abs(signal - centre))
    scaled = signal / abs_deviation_total * 100
    return MFCC.MFCC(scaled, fs, Frame_Len, Hop_Len)
ubm_dir = 'train_data_for_UBM' ubm_data_dirs = os.listdir(ubm_dir) dim = 12 sig = np.array([]) features_M = np.ndarray(shape=(0, dim), dtype='float64') features_F = np.ndarray(shape=(0, dim), dtype='float64') features = np.ndarray(shape=(0, dim), dtype='float64') wav = mywave() print 'hello' for ubm_data_dir in ubm_data_dirs: print 'hello' print ubm_data_dir if ubm_data_dir == '.DS_Store': continue sig = wav.WaveRead(ubm_dir + r'/' + ubm_data_dir) MFCC_obj = MFCC(40, 12, 300, 3400, 0.97, 16000, 50, 0.0256, 256) MFCC_coef = MFCC_obj.sig2s2mfc(sig) #energy = np.ndarray(shape = (MFCC_coef.shape[0],1),dtype = 'float64') #energy[:,0] = 10*numpy.log10((MFCC_coef**2).sum(axis=1)) #MFCC_coef = np.hstack((MFCC_coef,energy)) """ dtm1 = np.ndarray(shape = MFCC_coef.shape,dtype = 'float64' ) #初始化dtm1 dtm1[0:2,:] = 0 dtm1[MFCC_coef.shape[0]-2:MFCC_coef.shape[0],:] = 0; #计算dtm1 for loop2 in range(2,MFCC_coef.shape[0]-2): dtm1[loop2,:] = -2*MFCC_coef[loop2-2,:]-MFCC_coef[loop2-1,:]+MFCC_coef[loop2+1,:]+2*MFCC_coef[loop2+2,:] dtm1 = dtm1/3; dtm2 = np.ndarray(shape = MFCC_coef.shape,dtype = 'float64' ) #初始化dtm2
# Walk every file in 'train_data', make a per-file output directory under
# 'supervector', and split each recording into pieces.
# NOTE(review): this section uses Python 2 print statements.

# Piece length in samples: 10 * samprate (presumably 10 seconds -- confirm
# that samprate is in samples per second).
length = 10*samprate
ubms_dir = 'ubms'
supervector_dir = 'supervector'
# Create the output root on first run.
if not os.path.exists(supervector_dir):
    os.mkdir(supervector_dir)
train_data_dir = 'train_data'
train_data = os.listdir(train_data_dir)
wav = mywave.mywave()
# NOTE(review): 'special' is not referenced in the statements below -- confirm
# whether it is used further on.
special = ['train_2013011050_mlh_M.wav','train_2013011055_huangyiqing_M.wav', 'train_2013012061_huangxinyuan_M.wav']
MFCC_obj = MFCC(40,12,300,3400,0.97,samprate,50,0.0256,256)
dim = 12
for train_wav in train_data:
    # Skip the macOS Finder metadata entry.
    if train_wav == '.DS_Store':
        continue
    print train_wav
    # Characters 6..15 of the file name pick the output sub-directory; for
    # names shaped like those in 'special' that is the 10-digit ID following
    # the 'train_' prefix.
    if not os.path.exists(supervector_dir+r'/'+train_wav[6:16]):
        os.mkdir(supervector_dir+r'/'+train_wav[6:16])
    wave = wav.WaveRead(train_data_dir+r'/'+train_wav)
    # Number of whole pieces of 'length' samples in this recording.
    piece_num = int(wave.shape[0]/length)
    print 'sum of piece:', piece_num
    temp = range(piece_num)
    for loop in temp:
        print 'piece:',loop+1
        # NOTE(review): the start offset advances by samprate per piece while
        # the piece count is based on 'length' (10*samprate) -- confirm
        # whether loop*length was intended.
        beginp = loop*samprate
glob.glob('Train/*') ): # pick one speaker at a time in the folder Train name_speaker = speaker # print(name_speaker) speaker_list.update( {name_speaker: key}) # store the speaker along with a key in the list speaker_data = [ ] # to store the mel frequency cepstral coefficients for different audio signals of # same speaker for speaker_train in glob.glob( speaker + '/*.wav'): # pick all the audio clips of each speaker mfcc = feature.MFCC( speaker_train) # extract mfcc for each audio clip if len(speaker_data) == 0: speaker_data = mfcc else: speaker_data = numpy.concatenate( (speaker_data, mfcc) ) # store mfccs of all audio clips of single speaker # print(speaker_data.shape) gaussian_mixture_model = mixture.GaussianMixture( n_components=num, covariance_type=matrix, max_iter=100, init_params=initial) # create gmm using library in # sklearn package gaussian_mixture_model = gaussian_mixture_model.fit(
def MFCC(signal):
    """
    Look up the attribute ``MFCC`` on the global name ``MFCC`` and call it with no arguments.

    The result of the call is discarded, so this function returns ``None``.

    NOTE(review): ``signal`` is never used. Because this function is itself
    bound to the name ``MFCC``, the lookup ``MFCC.MFCC`` resolves against this
    function object (and raises AttributeError) unless the name is rebound
    elsewhere -- confirm the intended target.

    :param signal: currently ignored
    """
    target = MFCC.MFCC
    target()