def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
        second return value is the energy in each frame (total energy, unwindowed)
    """
    # A falsy highfreq (None or 0) falls back to half the sample rate.
    highfreq = highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = sigproc.powspec(frames,nfft)
    energy = numpy.sum(pspec,1) # this stores the total energy in each frame

    # Forward the band edges so a caller-supplied lowfreq/highfreq actually
    # shapes the filterbank instead of being silently ignored.
    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
    return feat,energy
def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
        nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
    """Compute Spectral Subband Centroid features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
    """
    # A falsy highfreq (None or 0) falls back to half the sample rate.
    highfreq = highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = sigproc.powspec(frames,nfft)
    # Exact zeros in the power spectrum would make the final division 0/0
    # (NaN) for silent frames, so replace them with machine epsilon first.
    pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec)

    # Forward the band edges so a caller-supplied lowfreq/highfreq actually
    # shapes the filterbank instead of being silently ignored.
    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
    # R carries, for every frame (row), the same ramp from 1 to samplerate/2
    # with one value per spectrum column; it weights each bin in the numerator.
    R = numpy.tile(numpy.linspace(1,samplerate/2,numpy.size(pspec,1)),(numpy.size(pspec,0),1))

    # Weighted filterbank sum divided by the plain filterbank energy.
    return numpy.dot(pspec*R,fb.T) / feat
def ssc(signal,samplerate=16000,winlen=0.025,winstep=0.01,
        nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
    """Compute Spectral Subband Centroid features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :returns: A numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector.
    """
    # A falsy highfreq (None or 0) falls back to half the sample rate.
    highfreq = highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = sigproc.powspec(frames,nfft)
    pspec = numpy.where(pspec == 0,numpy.finfo(float).eps,pspec) # if things are all zeros we get problems

    # Forward the band edges so a caller-supplied lowfreq/highfreq actually
    # shapes the filterbank instead of being silently ignored.
    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
    # R carries, for every frame (row), the same ramp from 1 to samplerate/2
    # with one value per spectrum column; it weights each bin in the numerator.
    R = numpy.tile(numpy.linspace(1,samplerate/2,numpy.size(pspec,1)),(numpy.size(pspec,0),1))

    # Weighted filterbank sum divided by the plain filterbank energy.
    return numpy.dot(pspec*R,fb.T) / feat
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features. Should be an N*1 array
    :param samplerate: the samplerate of the signal we are working with.
    :param winlen: the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)
    :param winstep: the step between successive windows in seconds. Default is 0.01s (10 milliseconds)
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size. Default is 512.
    :param lowfreq: lowest band edge of mel filters. In Hz, default is 0.
    :param highfreq: highest band edge of mel filters. In Hz, default is samplerate/2
    :param preemph: apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.
    :returns: 2 values. The first is a numpy array of size (NUMFRAMES by nfilt) containing features. Each row holds 1 feature vector. The
        second return value is the energy in each frame (total energy, unwindowed)
    """
    # A falsy highfreq (None or 0) falls back to half the sample rate.
    highfreq = highfreq or samplerate/2
    signal = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(signal, winlen*samplerate, winstep*samplerate)
    pspec = sigproc.powspec(frames,nfft)
    energy = numpy.sum(pspec,1) # this stores the total energy in each frame
    energy = numpy.where(energy == 0,numpy.finfo(float).eps,energy) # if energy is zero, we get problems with log

    # Forward the band edges so a caller-supplied lowfreq/highfreq actually
    # shapes the filterbank instead of being silently ignored.
    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    feat = numpy.dot(pspec,fb.T) # compute the filterbank energies
    feat = numpy.where(feat == 0,numpy.finfo(float).eps,feat) # if feat is zero, we get problems with log

    return feat,energy
def fbank(signal,samplerate=16000,winlen=0.025,winstep=0.01,
          nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,
          winfunc=lambda x:numpy.ones((x,))):
    """Compute Mel-filterbank energy features from an audio signal.

    :param signal: the audio signal from which to compute features.
    :param samplerate: the samplerate of the signal, default 16000.
    :param winlen: the length of the analysis window in seconds, default 0.025.
    :param winstep: the step between successive windows in seconds, default 0.01.
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size, default 512.
    :param lowfreq: lowest band edge of the filters, default 0.
    :param highfreq: highest band edge of the filters; a falsy value means samplerate/2.
    :param preemph: coefficient handed to sigproc.preemphasis, default 0.97.
    :param winfunc: callable forwarded to sigproc.framesig as its fourth argument.
        The default returns an all-ones array of the requested length.
    :returns: 2 values: the filterbank energies (power spectrum times the transposed
        filterbank) and the per-frame sum of the power spectrum. Exact zeros in
        either are replaced by machine epsilon so a later log stays finite.
    """
    if not highfreq:
        highfreq = samplerate/2

    emphasized = sigproc.preemphasis(signal,preemph)
    frames = sigproc.framesig(emphasized, winlen*samplerate, winstep*samplerate, winfunc)
    power = sigproc.powspec(frames,nfft)

    tiny = numpy.finfo(float).eps

    # Total energy per frame, floored so that log(energy) is defined.
    frame_energy = numpy.sum(power,1)
    frame_energy = numpy.where(frame_energy == 0,tiny,frame_energy)

    # Filterbank energies, floored for the same reason.
    bank = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    bank_energy = numpy.dot(power,bank.T)
    bank_energy = numpy.where(bank_energy == 0,tiny,bank_energy)

    return bank_energy,frame_energy
def mfcc(signal,samplerate=16000,winlen=0.025,winstep=0.01,numcep=13,
         nfilt=26,nfft=512,lowfreq=0,highfreq=None,preemph=0.97,ceplifter=22,appendEnergy=True):
    """Compute MFCC features, plus the magnitude and log-mel spectra, from an audio signal.

    :param signal: the audio signal from which to compute features.
    :param samplerate: the samplerate of the signal, default 16000.
    :param winlen: the length of the analysis window in seconds, default 0.025.
    :param winstep: the step between successive windows in seconds, default 0.01.
    :param numcep: the number of cepstral coefficients to keep, default 13.
    :param nfilt: the number of filters in the filterbank, default 26.
    :param nfft: the FFT size, default 512. The original author's note asks that
        nfft be even, or the next power of two at or above the window length in samples.
    :param lowfreq: lowest band edge of mel filters, default 0.
    :param highfreq: highest band edge of mel filters; a falsy value means samplerate/2.
    :param preemph: coefficient handed to sigproc.preemphasis, default 0.97.
    :param ceplifter: value handed to lifter() for the final cepstra, default 22.
    :param appendEnergy: if true, the zeroth cepstral coefficient is replaced with
        the log of the total frame energy.
    :returns: 3 values: the liftered cepstra truncated to the first numcep columns,
        the magnitude spectrum of the frames, and the log filterbank energies
        the cepstra were derived from.
    """
    if not highfreq:
        highfreq = samplerate/2

    emphasized = sigproc.preemphasis(signal,preemph)
    # 'hamm' is passed where framesig takes its window argument.
    # NOTE(review): presumably a locally modified sigproc that selects a
    # Hamming window from this string -- confirm against sigproc.framesig.
    frames = sigproc.framesig(emphasized, winlen*samplerate, winstep*samplerate,'hamm')
    pspec = sigproc.powspec(frames,nfft)
    mspec = sigproc.magspec(frames,nfft)

    tiny = numpy.finfo(float).eps

    # Total energy per frame, floored so that log(energy) is defined.
    energy = numpy.sum(pspec,1)
    energy = numpy.where(energy == 0,tiny,energy)

    # Original author's note: the filter bank is nfilt by nfft/2 + 1.
    fb = get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    melenergy = numpy.dot(pspec,fb.T)
    melenergy = numpy.where(melenergy == 0,tiny,melenergy)

    logmelspec = numpy.log(melenergy)

    # Orthonormal type-2 DCT along the filter axis, truncated to numcep columns.
    cepstra = dct(logmelspec, type=2, axis=1, norm='ortho')[:,:numcep]
    cepstra = lifter(cepstra,ceplifter)
    if appendEnergy:
        cepstra[:,0] = numpy.log(energy)

    return cepstra,mspec,logmelspec
def direccion_audio_clicked(self):
    """Load the WAV file named in the line edit and tally a per-frame emotion guess.

    Reads the path from ``self.lineEdit``, loads it with ``wav.read``, computes
    MFCC features and feeds them to ``powspec``. The first column of each
    ``powspec`` row is bucketed into one of four counters (normal, tristes,
    feliz, enojado), and ``self.ruta`` is set to the image path of the strictly
    largest counter. The counters are printed and then reset to 0.

    NOTE(review): the original indentation of this method was lost. The nesting
    below (everything after the path check inside ``else``, the winner
    selection after the loop) is the most plausible reading -- confirm against
    the original file.
    """
    self.inputDireccion = str(self.lineEdit.text())
    if not self.inputDireccion:
        print "No puso direccion del Audio"
    else:
        print "La direccion es ", self.inputDireccion
        direccion = str(self.inputDireccion)
        (self.rate , self.sig ) = wav.read(direccion)
        print 'Data cargada'
        mfcc_feat = mfcc(self.sig,self.rate)
        powspec_result = powspec(mfcc_feat, 1)
        # NOTE(review): .size is the total element count, not the row count;
        # the indexing below is only in range if powspec_result has a single
        # column -- confirm.
        for i in xrange(0, powspec_result.size):
            # All comparisons are strict, so values exactly equal to
            # 251, 371 or 482 (and anything <= 100 or >= 650) are counted nowhere.
            if powspec_result[i][0] < 251 and powspec_result[i][0]>100:
                self.normal = self.normal+1
            elif powspec_result[i][0] < 371 and powspec_result[i][0] > 251:
                self.tristes = self.tristes+1
            elif powspec_result[i][0] < 482 and powspec_result[i][0] > 371:
                self.feliz = self.feliz+1
            elif powspec_result[i][0] < 650 and powspec_result[i][0] > 482:
                self.enojado = self.enojado +1
        # Pick the image for the strictly largest counter; on a tie none of
        # these branches fires and self.ruta is left unchanged.
        if self.feliz > self.normal and self.feliz > self.enojado and self.feliz > self.tristes:
            self.ruta = '../Resultado/feliz1.jpg'
        if self.normal > self.feliz and self.normal > self.enojado and self.normal > self.tristes:
            self.ruta = '../Resultado/normal1.jpg'
        if self.enojado > self.feliz and self.enojado > self.normal and self.enojado > self.tristes:
            self.ruta = '../Resultado/enojado1.jpg'
        if self.tristes > self.feliz and self.tristes > self.enojado and self.tristes > self.normal:
            self.ruta = '../Resultado/triste.jpg'
        print 'enojado: ',self.enojado
        print 'triste: ',self.tristes
        print 'feliz: ',self.feliz
        print 'normal: ',self.normal
        # Reset the tallies so the next click starts from zero.
        self.enojado = 0
        self.feliz = 0
        self.tristes = 0
        self.normal = 0
# Exploratory script: load one WAV recording, compute MFCC and log-filterbank
# features, run the sigproc spectrum helpers over the MFCC matrix, and print
# and histogram the results.
from features import mfcc
from features import logfbank
import scipy.io.wavfile as wav
import matplotlib.pyplot as plt
from features.sigproc import preemphasis, framesig, magspec, powspec

# wav.read returns the sample rate and the sample data.
(rate,sig) = wav.read('../Data/roycer/roycer.wav')
mfcc_feat = mfcc(sig,rate)
fbank_feat = logfbank(sig, rate)
# NOTE(review): magspec/powspec are applied to the MFCC matrix, not to raw
# frames, with a second argument of 1 -- presumably intentional; confirm.
magspec_result = magspec(mfcc_feat,1)
powspec_result = powspec(mfcc_feat, 1)
# Not used by any later statement in the visible part of the script.
sig2 = preemphasis(sig,0.95)
print mfcc_feat
plt.hist(mfcc_feat)
#print(mfcc_feat[0:12,0:12])
#print(fbank_feat[0:12,0:12])
#print magspec_result
print powspec_result
# Per-emotion counters (angry / happy / sad), initialised to zero.
enojado = 0
feliz = 0
tristes = 0
def logFilterbankFeatures(signal, samplerate=16000, winlen=0.0255, winstep=0.01,
                          nfilt=40, nfft=512, lowfreq=133.3333, highfreq=6855.4976, preemph=0.97,
                          winSzForDelta=2):
    '''
    Compute log filterbank energies plus log total energy, with deltas and accelerations.

    The pipeline mirrors features.fbank from the 'python_speech_features'
    package (https://github.com/jameslyons/python_speech_features), except
    that `hamming` is passed to framesig as the window function.

    :parameters:
        - signal: input speech signal
        - samplerate: sample rate of the signal
        - winlen: length of the analysis window in seconds
        - winstep: step between successive windows in seconds
        - nfilt: number of filterbank energies (total energy not included)
        - nfft: FFT size
        - lowfreq: lower band edge handed to get_filterbanks
        - highfreq: upper band edge handed to get_filterbanks;
          a falsy value means samplerate / 2
        - preemph: pre-emphasis coefficient
        - winSzForDelta: window size handed to delta() for both the delta
          and the acceleration pass
    :returns:
        - features: the log features, their deltas and their accelerations
          concatenated along axis 1. The log-energy column comes first in
          each third, e.g. nfilt=40 --> (40 + 1) * 3 columns.
    '''
    if not highfreq:
        highfreq = samplerate / 2

    emphasized = sigproc.preemphasis(signal, preemph)
    frames = sigproc.framesig(emphasized, winlen * samplerate, winstep * samplerate, winfunc=hamming)
    power = sigproc.powspec(frames, nfft)

    tiny = np.finfo(float).eps

    # Total energy per frame, floored so that the log below is defined.
    frameEnergy = np.sum(power, 1)
    frameEnergy = np.where(frameEnergy == 0, tiny, frameEnergy)

    # Filterbank energies, floored for the same reason.
    bank = features.get_filterbanks(nfilt, nfft, samplerate, lowfreq, highfreq)
    bankEnergy = np.dot(power, bank.T)
    bankEnergy = np.where(bankEnergy == 0, tiny, bankEnergy)

    # Static features: log energy in column 0, log filterbank energies after it.
    staticFeat = np.column_stack((np.log(frameEnergy), np.log(bankEnergy)))

    # The acceleration is the delta of the delta.
    deltaFeat = delta(staticFeat, winSzForDelta)
    accFeat = delta(deltaFeat, winSzForDelta)

    return np.concatenate((staticFeat, deltaFeat, accFeat), axis=1)
def logFilterbankFeatures(signal,samplerate=16000,winlen=0.0255,winstep=0.01,
                          nfilt=40,nfft=512,lowfreq=133.3333,highfreq=6855.4976,preemph=0.97,
                          winSzForDelta=2):
    '''
    Compute log filterbank energies plus log total energy, with deltas and accelerations.

    The pipeline mirrors features.fbank from the 'python_speech_features'
    package (https://github.com/jameslyons/python_speech_features), except
    that `hamming` is passed to framesig as the window function.

    :parameters:
        - signal: input speech signal
        - samplerate: sample rate of the signal
        - winlen: length of the analysis window in seconds
        - winstep: step between successive windows in seconds
        - nfilt: number of filterbank energies (total energy not included)
        - nfft: FFT size
        - lowfreq: lower band edge handed to get_filterbanks
        - highfreq: upper band edge handed to get_filterbanks;
          a falsy value means samplerate/2
        - preemph: pre-emphasis coefficient
        - winSzForDelta: window size handed to delta() for both the delta
          and the acceleration pass
    :returns:
        - features: the log features, their deltas and their accelerations
          concatenated along axis 1. The log-energy column comes first in
          each third, e.g. nfilt=40 --> (40+1)*3 columns.
    '''
    if not highfreq:
        highfreq = samplerate/2

    preemphasized = sigproc.preemphasis(signal,preemph)
    windowed = sigproc.framesig(preemphasized, winlen*samplerate, winstep*samplerate,winfunc=hamming)
    powerSpec = sigproc.powspec(windowed,nfft)

    floor = np.finfo(float).eps

    # Per-frame total energy; exact zeros are floored before taking the log.
    totalEnergy = np.sum(powerSpec,1)
    totalEnergy = np.where(totalEnergy == 0,floor,totalEnergy)

    # Per-filter energies; exact zeros are floored before taking the log.
    filters = features.get_filterbanks(nfilt,nfft,samplerate,lowfreq,highfreq)
    filterEnergy = np.dot(powerSpec,filters.T)
    filterEnergy = np.where(filterEnergy == 0,floor,filterEnergy)

    # Log energy goes in the first column, log filterbank energies follow.
    logFeat = np.column_stack((np.log(totalEnergy),np.log(filterEnergy)))

    # Second-order features are obtained by applying delta() twice.
    deltaFeat = delta(logFeat, winSzForDelta)
    accFeat = delta(deltaFeat, winSzForDelta)

    return np.concatenate((logFeat,deltaFeat,accFeat),axis=1)