def MFSC(signal, samplerate=16000, winlen=0.025, winstep=0.01, numcep=13,
         nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97,
         ceplifter=22, appendEnergy=True):
    """Compute log mel-filterbank features from an audio signal (MFCC without DCT).

    The arguments are forwarded positionally to ``fbank`` and the natural
    logarithm of the returned filterbank output is taken.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param numcep: accepted for signature compatibility; not used by this function.
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param ceplifter: accepted for signature compatibility; not used by this function.
    :param appendEnergy: accepted for signature compatibility; not used by this function.
    :returns: a ``(log_filterbank, energy)`` tuple: the element-wise natural log of the
        first value returned by ``fbank`` and the second value returned by ``fbank``, unchanged.
    """
    filterbank, frame_energy = fbank(signal, samplerate, winlen, winstep,
                                     nfilt, nfft, lowfreq, highfreq, preemph)
    return np.log(filterbank), frame_energy
def filter(samplerate, signal, winlen=0.02, winstep=0.01, nfilt=40, nfft=512,
           lowfreq=100, highfreq=5000, preemph=0.97):
    """Extract mel filterbank energies from a given signal.

    Args:
        samplerate (int): samples taken per second
        signal (1d numpy array): sample values
        winlen (float): sliding window size in seconds
        winstep (float): step between successive windows in seconds
        nfilt (int): number of mel filters to apply
        nfft (int): size of the discrete fourier transform to use
        lowfreq (int): lowest frequency to collect
        highfreq (int): highest frequency to collect
        preemph (float): preemphasis factor

    Returns:
        2d numpy array: the filterbank output of ``speechfeatures.fbank``
        with its first two axes swapped.
    """
    samples = np.array(signal)
    fbank_options = dict(
        winlen=winlen,
        winstep=winstep,
        nfilt=nfilt,
        nfft=nfft,
        lowfreq=lowfreq,
        highfreq=highfreq,
        preemph=preemph,
    )
    # The per-frame energy returned alongside the filterbank is not needed here.
    energies, _ = speechfeatures.fbank(samples, samplerate, **fbank_options)
    return np.swapaxes(energies, 0, 1)
def computeLogMelFilterBank(self, file_name):
    '''
    Compute the log-mel frequency filterbank feature vector with deltas and
    double deltas.

    The wav file is read, a 40-channel filterbank is computed with a 25 ms
    window and 10 ms step, its natural log is taken and the per-frame energy
    returned by ``fbank`` is appended as an extra column. Deltas and double
    deltas are obtained from ``self.computeDeltas`` and appended column-wise.

    :param file_name: path handed to ``wav.read``.
    :returns: 2-D array with one row per frame and 123 columns
        (41 static + 41 delta + 41 double-delta).
    :raises AssertionError: if the deltas do not match the static features in
        shape, or the final vector does not have 123 columns.
    '''
    (rate, sig) = wav.read(file_name)
    fbank_feat, energy = fbank(sig, rate, winlen=0.025, winstep=0.01, nfilt=40)

    # NOTE(review): the energy column is appended as returned by fbank, i.e.
    # not log-transformed like the filterbank channels -- confirm intent.
    fbank_feat = np.column_stack((np.log(fbank_feat), energy))

    deltas = self.computeDeltas(fbank_feat)
    # Message kept on one logical string: a backslash continuation inside the
    # literal would leak stray whitespace into the text.
    assert deltas.shape == fbank_feat.shape, (
        "Shapes not equal {0} and {1}".format(deltas.shape, fbank_feat.shape))
    double_deltas = self.computeDeltas(deltas)

    feat_vec = np.column_stack((fbank_feat, deltas, double_deltas))
    assert feat_vec.shape[1] == 123, \
        "Something wrong with feature vector dimensions..."
    return feat_vec
def get_features(sample):
    """Build per-frame features: MFCCs, their first and second differences, and log energy.

    :param sample: a ``(rate, sig)`` pair, unpacked and passed to ``mfcc`` and ``fbank``.
    :returns: 2-D array whose columns are the MFCCs, their differences, the
        differences of the differences, and one log-energy column. The first
        and last two frames are dropped because the difference is not
        computed there.
    """
    rate, sig = sample
    mfcc_feats = mfcc(sig, rate)

    def diff(feats):
        """Return the 5-point regression difference of ``feats`` along axis 0.

        Uses the weights (-2, -1, 0, +1, +2) without normalisation; rows
        within two frames of either edge are left at zero.
        """
        feats_diff = numpy.zeros(feats.shape)
        if feats.shape[0] > 4:
            # Vectorised form of, for each interior frame i:
            #   -2*f[i-2] - f[i-1] + f[i+1] + 2*f[i+2]
            feats_diff[2:-2] = (-2 * feats[:-4] - feats[1:-3]
                                + feats[3:-1] + 2 * feats[4:])
        return feats_diff

    mfcc_diff_feats = diff(mfcc_feats)
    mfcc_diff2_feats = diff(mfcc_diff_feats)

    _, energy_feat = fbank(sig, rate)
    log_energy_feat = numpy.log(energy_feat).reshape(energy_feat.shape[0], 1)

    stacked = numpy.concatenate(
        (mfcc_feats, mfcc_diff_feats, mfcc_diff2_feats, log_energy_feat),
        axis=1)
    return stacked[2:-2]
def wav2fbank(wavFile, fs=16000, maxLen_s=None):
    """Compute log mel-filterbank features for a wav file or an in-memory signal.

    :param wavFile: path to a wav file (read with ``wavfile.read``) or a numpy
        array of samples. A 2-D signal is reduced to its first column.
    :param fs: sample rate used when ``wavFile`` is an array; when a path is
        given the rate is taken from the file instead.
    :param maxLen_s: optional maximum duration in seconds; longer signals are
        truncated before feature extraction.
    :returns: natural log of the 40-channel filterbank output of ``fbank``
        with its first two axes swapped.
    :raises AssertionError: if a file is read whose sample rate is not 16000.
    :raises TypeError: if ``wavFile`` is neither a ``str`` nor a numpy array.
    """
    if isinstance(wavFile, str):
        (fs, wav) = wavfile.read(wavFile)
        assert fs == 16000  # requirement for now
    elif isinstance(wavFile, np.ndarray):
        wav = wavFile
    else:
        raise TypeError(
            "wavFile must be a file path or a numpy array, got %s"
            % type(wavFile).__name__)

    winlen = 0.025
    winstep = 0.015
    # Smallest power of two that covers one analysis window. The builtin int
    # is used because the np.int alias no longer exists in current NumPy.
    nfft = int(np.power(2, np.ceil(np.log2(winlen * fs))))
    nfilt = 40
    preemph = 0.97

    if np.ndim(wav) == 2:
        # Multiple channels; just take left one
        wav = wav[:, 0]

    if maxLen_s is not None:
        # Slice bounds must be integers even when maxLen_s is fractional.
        wav = wav[:int(maxLen_s * fs)]

    M, _ = fbank(wav, fs, winlen=winlen, winstep=winstep, nfilt=nfilt,
                 nfft=nfft, winfunc=np.hanning, preemph=preemph)
    return np.swapaxes(np.log(M), 0, 1)
def MFSC(signal, samplerate=16000, winlen=0.025, winstep=0.01, numcep=13,
         nfilt=26, nfft=512, lowfreq=0, highfreq=None, preemph=0.97,
         ceplifter=22, appendEnergy=True):
    """Compute log mel-filterbank features from an audio signal (MFCC without DCT).

    ``fbank`` is called with the first nine arguments, in order, and the
    natural logarithm of its filterbank output is returned together with the
    energy it reports.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param numcep: unused here; kept so the signature mirrors an MFCC call.
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :param ceplifter: unused here; kept so the signature mirrors an MFCC call.
    :param appendEnergy: unused here; kept so the signature mirrors an MFCC call.
    :returns: tuple ``(log_filterbank, energy)`` -- the element-wise natural log of
        the first ``fbank`` output, and the second ``fbank`` output as-is.
    """
    fbank_args = (signal, samplerate, winlen, winstep, nfilt, nfft,
                  lowfreq, highfreq, preemph)
    mel_energies, total_energy = fbank(*fbank_args)
    log_mel = np.log(mel_energies)
    return log_mel, total_energy
def get_features(sample):
    """Build per-frame features: MFCCs, their first and second differences, and log energy.

    :param sample: a ``(rate, sig)`` pair, unpacked and passed to ``mfcc`` and ``fbank``.
    :returns: 2-D array whose columns are the MFCCs, their differences, the
        differences of the differences, and one log-energy column. The first
        and last two frames are dropped because the difference is not
        computed there.
    """
    rate, sig = sample
    mfcc_feats = mfcc(sig, rate)

    def diff(feats):
        """Return the 5-point regression difference of ``feats`` along axis 0.

        Uses the weights (-2, -1, 0, +1, +2) without normalisation; rows
        within two frames of either edge are left at zero.
        """
        feats_diff = numpy.zeros(feats.shape)
        if feats.shape[0] > 4:
            # Vectorised form of, for each interior frame i:
            #   -2*f[i-2] - f[i-1] + f[i+1] + 2*f[i+2]
            feats_diff[2:-2] = (-2 * feats[:-4] - feats[1:-3]
                                + feats[3:-1] + 2 * feats[4:])
        return feats_diff

    mfcc_diff_feats = diff(mfcc_feats)
    mfcc_diff2_feats = diff(mfcc_diff_feats)

    _, energy_feat = fbank(sig, rate)
    log_energy_feat = numpy.log(energy_feat).reshape(energy_feat.shape[0], 1)

    stacked = numpy.concatenate(
        (mfcc_feats, mfcc_diff_feats, mfcc_diff2_feats, log_energy_feat),
        axis=1)
    return stacked[2:-2]
def computeFBANKDeltaDelta(sswi, NFilt=40, NDelta=2):
    """Compute log filterbank + energy features with deltas and double deltas.

    :param sswi: mapping speakerId -> {sentenceId -> {frameId -> entry}}. Each
        entry is used as the row index into the returned matrix, and frame ids
        are looked up starting at 1.
    :param NFilt: number of filterbank channels requested from ``fbank``; one
        extra column is added for the energy.
    :param NDelta: passed as ``N`` to ``computeDelta``.
    :returns: array of shape ``(nframes, 3 * (NFilt + 1))`` holding, per row,
        the static features, their deltas and their double deltas.
    """
    nframes = speakersent.countFrames(sswi)
    NFilt = NFilt + 1  # Energy values count as one extra
    features = np.zeros(shape=(nframes, NFilt * 3))

    util.startprogress("FBANK Features")
    frameCnt = 0
    # .items() rather than .iteritems(): the latter only exists on Python 2.
    for speaker_, sentences in sswi.items():
        for sent_, frame_rows in sentences.items():
            r, sig = getWavFile(speaker_, sent_)
            # winlen / winstep are not defined in this function; they are
            # presumably module-level settings -- verify against the file.
            fbank_frames, energy = fbank(sig, r, winlen=winlen,
                                         winstep=winstep, nfilt=NFilt - 1)
            # Energy goes in column 0, then log is taken over everything.
            energy_col = np.reshape(energy, (energy.shape[0], 1))
            fbank_frames = np.log(np.append(energy_col, fbank_frames, axis=1))
            delta_1 = computeDelta(fbank_frames, N=NDelta)
            delta_2 = computeDelta(delta_1, N=NDelta)

            util.progress(float(frameCnt) / nframes * 100.0)
            for i in range(len(frame_rows)):
                frameCnt += 1
                row = frame_rows[i + 1]
                features[row, :NFilt] = fbank_frames[i, :]
                features[row, NFilt:2 * NFilt] = delta_1[i, :]
                features[row, 2 * NFilt:] = delta_2[i, :]
    util.endprogress()
    return features
def energy_feature(sig, rate):
    '''
    Summarise the per-frame energy reported by ``fbank`` with four statistics.

    INPUT:
        sig, rate: passed straight to ``fbank``; only its second return value
        (the energy, described by the original author as shape (FRAMENUM, ))
        is used.
    OUTPUT:
        A list ``[ave_energy, max_energy, min_energy, var_energy]`` holding the
        natural log of the mean, maximum, minimum and variance of the energy.
    '''
    _, frame_energy = fbank(sig, rate)
    reducers = (np.mean, np.max, np.min, np.var)
    return [np.log(reduce_fn(frame_energy)) for reduce_fn in reducers]
def filter(samplerate, signal, winlen=0.02, winstep=0.01, nfilt=40, nfft=512,
           lowfreq=100, highfreq=5000, preemph=0.97):
    """Extract mel filterbank energies from a given signal.

    Args:
        samplerate (int): samples taken per second
        signal (1d numpy array): sample values
        winlen (float): sliding window size in seconds
        winstep (float): step between successive windows in seconds
        nfilt (int): number of mel filters to apply
        nfft (int): size of the discrete fourier transform to use
        lowfreq (int): lowest frequency to collect
        highfreq (int): highest frequency to collect
        preemph (float): preemphasis factor

    Returns:
        2d numpy array: filterbank energies from ``speechfeatures.fbank``,
        returned with axes 0 and 1 exchanged.
    """
    extracted = speechfeatures.fbank(
        np.array(signal),
        samplerate,
        winlen=winlen,
        winstep=winstep,
        nfilt=nfilt,
        nfft=nfft,
        lowfreq=lowfreq,
        highfreq=highfreq,
        preemph=preemph,
    )
    # fbank yields (filterbank, energy); only the filterbank is returned.
    bank, _unused_energy = extracted
    return np.swapaxes(bank, 0, 1)
def computeLogMelFilterBank(self, file_name):
    '''
    Compute the log-mel frequency filterbank feature vector with deltas and
    double deltas.

    The wav file is read, a 40-channel filterbank is computed with a 25 ms
    window and 10 ms step, its natural log is taken and the per-frame energy
    returned by ``fbank`` is appended as an extra column. Deltas and double
    deltas are obtained from ``self.computeDeltas`` and appended column-wise.

    :param file_name: path handed to ``wav.read``.
    :returns: 2-D array with one row per frame and 123 columns
        (41 static + 41 delta + 41 double-delta).
    :raises AssertionError: if the deltas do not match the static features in
        shape, or the final vector does not have 123 columns.
    '''
    (rate, sig) = wav.read(file_name)
    fbank_feat, energy = fbank(sig, rate, winlen=0.025, winstep=0.01, nfilt=40)

    # NOTE(review): the energy column is appended as returned by fbank, i.e.
    # not log-transformed like the filterbank channels -- confirm intent.
    fbank_feat = np.column_stack((np.log(fbank_feat), energy))

    deltas = self.computeDeltas(fbank_feat)
    # Message kept on one logical string: a backslash continuation inside the
    # literal would leak stray whitespace into the text.
    assert deltas.shape == fbank_feat.shape, (
        "Shapes not equal {0} and {1}".format(deltas.shape, fbank_feat.shape))
    double_deltas = self.computeDeltas(deltas)

    feat_vec = np.column_stack((fbank_feat, deltas, double_deltas))
    assert feat_vec.shape[1] == 123, \
        "Something wrong with feature vector dimensions..."
    return feat_vec
stop = int(cur_line[1]) # audio = audlab.wavread(wave_name,(freq*stop/10**7-freq*start/10**7)) # data = audio[0] label = cur_line[2] audio = f.read_frames(freq * stop / 10**7 - freq * start / 10**7) if label in label_dic: mono_signal = audio #audio[:,0] energy = np.sum(mono_signal**2, 0) / len(mono_signal) signal = mono_signal #mono_signal/math.sqrt(energy) samplerate = f.samplerate # mfcc = get_mfcc(signal,samplerate,winstep=window_step,nfft=2048,highfreq=8000,lowfreq=100) feat, energy = fbank(signal, samplerate, winstep=window_step, nfft=2048, lowfreq=100, highfreq=22050, nfilt=40) feat = np.log(feat) feat = np.concatenate((feat, energy[:, np.newaxis]), 1) # d1_mfcc = mfcc_der_1() L = (stop - start) / 10.0**7 N_iter = len(feat) / N #math.floor(L/window_step/N) # apply context window if (L / window_step) > N: mfcc_matrix = np.zeros((1, 41 * N)) for k in range(int(N_iter)): mfcc_vec = [] for kk in range(N): mfcc_vec = np.concatenate(
for j in xrange(len(lines)): try: cur_line = lines[j].split() start = int(cur_line[0]) stop = int(cur_line[1]) # audio = audlab.wavread(wave_name,(freq*stop/10**7-freq*start/10**7)) # data = audio[0] label = cur_line[2] audio = f.read_frames(freq*stop/10**7-freq*start/10**7) if label in label_dic: mono_signal = audio#audio[:,0] energy = np.sum(mono_signal**2,0)/len(mono_signal) signal = mono_signal#mono_signal/math.sqrt(energy) samplerate = f.samplerate # mfcc = get_mfcc(signal,samplerate,winstep=window_step,nfft=2048,highfreq=8000,lowfreq=100) feat,energy = fbank(signal,samplerate,winstep=window_step,nfft=2048,lowfreq=100,highfreq=22050,nfilt=40) feat = np.log(feat) feat = np.concatenate((feat,energy[:,np.newaxis]),1) # d1_mfcc = mfcc_der_1() L = (stop-start)/10.0**7 N_iter = len(feat)/N#math.floor(L/window_step/N) # apply context window if (L/window_step)>N: mfcc_matrix = np.zeros((1,41*N)) for k in range(int(N_iter)): mfcc_vec = [] for kk in range(N): mfcc_vec = np.concatenate((mfcc_vec,feat[k*N+kk,:])) mfcc_matrix = np.concatenate((mfcc_matrix,mfcc_vec[np.newaxis,:])) else: print('Input data sequence does not match minimal length requirement: ignoring')
def log_energy_lags(num, window=0.1):
    """Return lagged log-energy features for the audio identified by ``num``.

    The audio is loaded with ``load_audio``, framed by ``fbank`` using
    non-overlapping windows (window length equals window step), and the
    natural log of the per-frame energy is handed to ``make_lags``.

    :param num: identifier passed to ``load_audio`` and ``make_lags``.
    :param window: window length and step in seconds.
    :returns: whatever ``make_lags`` returns for the log energy, with
        ``framesTR`` set to ``int(2.0 / window)``.
    """
    rate, audio = load_audio(num)
    _, frame_energy = fbank(audio, rate, winlen=window, winstep=window)
    log_energy = np.log(frame_energy)
    frames_per_tr = int(2.0 / window)
    return make_lags(log_energy, num, framesTR=frames_per_tr)
audio = f.read_frames(freq*stop/10**7-freq*start/10**7) if label in label_dic: mono_signal = audio#audio[:,0] energy = np.sum(mono_signal**2,0)/len(mono_signal) signal = mono_signal#mono_signal/math.sqrt(energy) samplerate = f.samplerate L = (stop-start)/10.0**7 # N_iter = int(np.floor(L/(window_step*N))) # apply context window if L>(window_step*N): # mfcc_matrix = np.zeros((1,numcep*N)) data_conc = np.zeros((1,nfilt)) for k in range(int(np.floor((L-N*window_step)*samplerate/(window_step*samplerate)))): feat = np.zeros((N,26)) audio_data = audio[k*window_step*samplerate:k*window_step*samplerate+(N+1)*window_step*samplerate] features, energy = fbank(audio_data,samplerate=samplerate,winstep=window_step,nfilt=nfilt,lowfreq=100,highfreq=8000) interm = np.sum(features,1) index = np.argsort(interm) data = np.log(features[index[-1],:]) data = dct(data)[0:nfilt] data[0] = np.log(energy[index[-1]]) # replace first cepstral coefficient with log of frame energy data_conc = np.concatenate((data_conc,data[np.newaxis,:]),0) else: print('Input data sequence does not match minimal length requirement: ignoring') # get the numeric label corresponding to the literal label num_label = label_dic[label]*np.ones(len(data_conc)-1) label_vector = np.append(label_vector,num_label.astype(np.float32,copy=False)) data_vector = np.concatenate(((data_vector,data_conc[1:,:].astype(np.float32,copy=False))),0) for k in range(len(label_dic)): if label_dic[label]==k: time_per_occurrence_class[k].append((stop-start)/(10.0**7))
def dicGen(self, audioPath):
    """Build one summary feature vector per ``.wav`` clip in ``audioPath``.

    This is the sub routine in featureDicGen. For every clip the vector is
    the concatenation of:

    * the per-dimension mean over all windows of MFCC columns 1..12,
    * the same mean for their deltas (``readAudio.deltacal``),
    * the same mean for their double deltas (``readAudio.deltadelta``),
    * the mean over all windows of filterbank columns 1..12,
    * mean, median, standard deviation, max and min of the frame energy.

    :param audioPath: directory whose entries are scanned; entries not ending
        in ".wav" are skipped.
    :returns: dict mapping clip name (file name up to its first ".") to the
        feature vector, stored as a plain list.
    """
    featureDic = {}
    energy_stats = (np.mean, np.median, np.std, np.amax, np.amin)

    for item in os.listdir(audioPath):
        if not item.endswith(".wav"):
            continue

        rate, sig = wav.read(os.path.join(audioPath, item))

        # Cepstral coefficients 1..12 and their first/second order deltas.
        mfcc_feat = mfcc(sig, rate)[:, 1:13]
        delta_mfcc = readAudio.deltacal(self, mfcc_feat)
        deltadelta_mfcc = readAudio.deltadelta(self, delta_mfcc)

        fbank_feat, energy = fbank(sig, rate)

        # Mean, median, standard deviation, max and min of the energy:
        # 5 dimensions in all.
        energyarray = np.asarray(energy)
        energy_vec = np.asarray(
            [stat(energyarray, axis=0) for stat in energy_stats])

        # Take the mean over all windows in each dimension, then chain the
        # groups in a fixed order into the clip's feature vector.
        groups = (
            np.mean(mfcc_feat, axis=0),
            np.mean(delta_mfcc, axis=0),
            np.mean(deltadelta_mfcc, axis=0),
            np.mean(fbank_feat[:, 1:13], axis=0),
            energy_vec,
        )
        tempVec = []
        for group in groups:
            tempVec.extend(group)

        # Name of the clip, used as the key in the dictionary.
        featureDic[item.split('.')[0]] = tempVec
    return featureDic