def _key_fnc(
    sample: NDArray[Float32],
    frequency_rate: int,
    windowfnc: Window,
    key_type: KeyFunction,
):
    """
    Compute the key of one sample with the selected key function.

    The result is used as a key for the [this.samples] map.

    To calculate the spectral centroid, frequency_rate should be equal to
    half of the samplerate.

    Args:
        sample: Audio sample handed unchanged to the selected helper.
        frequency_rate: Passed as ``range`` to ``estd.Centroid``; only read
            for ``KeyFunction.CENTROID``.
        windowfnc: Window enum member; its ``.value`` is the Essentia
            window type name.
        key_type: Which key function to evaluate.

    Returns:
        Whatever the selected ``_get_*`` helper returns; for
        ``MELBANDS_LOG`` the mel bands passed through a "log"
        ``estd.UnaryOperator``.

    Raises:
        ValueError: If ``key_type`` matches none of the known key functions.
    """

    def _spectrum_and_window():
        # Fresh algorithm instances, created only for the branch that runs.
        return estd.Spectrum(), estd.Windowing(type=windowfnc.value)

    if key_type == KeyFunction.CENTROID:
        centroid = estd.Centroid(range=frequency_rate)
        return _get_centroid(sample, centroid, *_spectrum_and_window())

    if key_type == KeyFunction.MAX:
        return _get_max(sample, *_spectrum_and_window())

    if key_type == KeyFunction.MFCC:
        mfcc_algo = estd.MFCC()
        return _get_mfcc(sample, mfcc_algo, *_spectrum_and_window())

    if key_type == KeyFunction.MELBANDS:
        mfcc_algo = estd.MFCC()
        return _get_melbands(sample, mfcc_algo, *_spectrum_and_window())

    if key_type == KeyFunction.MELBANDS_LOG:
        to_log = estd.UnaryOperator(type="log")
        mfcc_algo = estd.MFCC()
        bands = _get_melbands(sample, mfcc_algo, *_spectrum_and_window())
        return to_log(bands)

    raise ValueError("Keyfunction is not defined!")
def extract(fname, outpath, fs=22050, fsize=1024, hsize=512):
    """
    Compute the MFCCs of the audio file `fname`, frame by frame.

    Inputs:
        fname   -- name of the audio file.
        outpath -- output path of processed files. NOTE: currently unused;
                   nothing is written to disk, the coefficients are only
                   returned. Kept so existing callers keep working.
        fs      -- sampling frequency (Hz) the audio is loaded at.
        fsize   -- size of each frame in samples.
        hsize   -- hop size between frames in samples.

    Outputs:
        numpy array with one row per frame and 20 MFCC coefficients per row.
    """
    loader = es.MonoLoader(filename=fname, sampleRate=fs)
    w = es.Windowing(type='hann')
    spectrum = es.Spectrum()
    # The magnitude spectrum of an fsize-sample frame has fsize/2 + 1 bins
    # (513 for the default fsize of 1024).
    # NOTE(review): MFCC is left at its default sampleRate rather than `fs`,
    # as in the original code -- confirm whether that is intended.
    mfcc = es.MFCC(inputSize=fsize // 2 + 1, numberCoefficients=20)

    audio = loader()
    mfccs = []
    for frame in es.FrameGenerator(audio, frameSize=fsize, hopSize=hsize):
        _mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
        mfccs.append(mfcc_coeffs)

    # Return the coefficient matrix, not the MFCC algorithm object.
    return np.array(mfccs)
def getMBE(audio):
    '''
    Mel band energy feature.

    Frames `audio` with the module-level `framesize`/`hopsize`, applies a
    hann window zero-padded to twice the frame size, and collects the band
    output of ess.MFCC for every frame.

    :param audio: audio signal to analyse
    :return: numpy array built from the per-frame band vectors
    '''
    # Default number of bands is used here (pattern classification setup).
    mel = ess.MFCC(sampleRate=fs,
                   highFrequencyBound=highFrequencyBound,
                   inputSize=framesize + 1)
    fft_len = 2 * framesize  # zero-pad by one extra frame length
    to_spectrum = ess.Spectrum(size=fft_len)
    apply_window = ess.Windowing(type='hann',
                                 zeroPadding=fft_len - framesize)

    frames = ess.FrameGenerator(audio, frameSize=framesize, hopSize=hopsize)
    # Index 0 of the MFCC output is the band vector; index 1 (coefficients)
    # is not needed here.
    per_frame_bands = [mel(to_spectrum(apply_window(frame)))[0]
                       for frame in frames]
    return np.array(per_frame_bands)
def extractor(filename):
    """Compute HTK-style MFCCs for `filename` and show them as an image.

    The audio is loaded as mono at 44100 Hz, analysed frame by frame, and
    the coefficient matrix (without its first row) is displayed with
    matplotlib. Nothing is returned.
    """
    sample_rate = 44100
    signal = ess.MonoLoader(filename=filename, sampleRate=sample_rate)()
    # dynamic range expansion as done in HTK implementation
    signal = signal * 2**15

    frame_len = 1102  # corresponds to htk default WINDOWSIZE = 250000.0
    hop_len = 441     # corresponds to htk default TARGETRATE = 100000.0
    fft_len = 2048
    n_bins = fft_len // 2 + 1

    window = ess.Windowing(
        type='hamming',  # corresponds to htk default USEHAMMING = T
        size=frame_len,
        zeroPadding=fft_len - frame_len,
        normalized=False,
        zeroPhase=False)
    to_spectrum = ess.Spectrum(size=fft_len)
    htk_mfcc = ess.MFCC(
        inputSize=n_bins,
        type='magnitude',         # htk uses mel filterbank magnitude
        warpingFormula='htkMel',  # htk's mel warping formula
        weighting='linear',       # filter weights computed in Hz domain
        highFrequencyBound=8000,  # corresponds to htk default
        lowFrequencyBound=0,      # corresponds to htk default
        numberBands=26,           # corresponds to htk default NUMCHANS = 26
        numberCoefficients=13,
        normalize='unit_max',     # htk filters have constant height = 1
        dctType=3,                # htk uses DCT type III
        logType='log',
        liftering=22)             # corresponds to htk default CEPLIFTER = 22

    # startFromZero = True, validFrameThresholdRatio = 1 : the way htk
    # computes windows
    frames = ess.FrameGenerator(signal,
                                frameSize=frame_len,
                                hopSize=hop_len,
                                startFromZero=True,
                                validFrameThresholdRatio=1)
    coeff_rows = []
    for frame in frames:
        _mel_bands, coeffs = htk_mfcc(to_spectrum(window(frame)))
        coeff_rows.append(coeffs)

    # Convert to an essentia array and transpose so the coefficient index
    # runs along the first axis.
    mfccs = essentia.array(coeff_rows).T

    # Plot everything except the first coefficient row.
    plt.imshow(mfccs[1:, :], aspect='auto', interpolation='none')
    plt.show()  # unnecessary if you started "ipython --pylab"
def essentiaObjectInit(self):
    """Create the Essentia algorithm objects used by this instance.

    Reads self.fs, self.highFrequencyBound, self.frameSize and
    self.numberBands; sets self.MFCC80, self.SPECTRUM and self.WINDOW.
    """
    frame_len = self.frameSize
    self.MFCC80 = ess.MFCC(sampleRate=self.fs,
                           highFrequencyBound=self.highFrequencyBound,
                           inputSize=frame_len + 1,
                           numberBands=self.numberBands)
    fft_len = 2 * frame_len  # zero-pad by one extra frame length
    self.SPECTRUM = ess.Spectrum(size=fft_len)
    self.WINDOW = ess.Windowing(type='hann',
                                zeroPadding=fft_len - frame_len)
def mfcc_htk(self, window_length=22050, nmfcc=13, n_mels=26, fmax=8000, lifterexp=22):
    """
    Get MFCCs 'the HTK way' with the help of Essentia
    https://github.com/MTG/essentia/blob/master/src/examples/tutorial/example_mfcc_the_htk_way.py
    Using all of the default parameters from there except the hop length
    (which shouldn't matter), and a much longer window length (which has
    been found to work better for covers)

    The audio is read from self.audio_vector and the hop size from
    self.hop_length.

    Parameters
    ----------
    window_length: int
        Length of the window to use for the STFT
    nmfcc: int
        Number of MFCC coefficients to compute
    n_mels: int
        Number of frequency bands to use
    fmax: int
        Maximum frequency
    lifterexp: int
        Liftering value handed to the Essentia MFCC algorithm

    Returns
    -------
    ndarray(nmfcc, nframes)
        An array of all of the MFCC frames
    """
    # FFT length: window length rounded up to a power of two.
    fft_len = int(2**(np.ceil(np.log(window_length)/np.log(2))))
    n_bins = fft_len // 2 + 1

    window = estd.Windowing(
        type='hamming',  # corresponds to htk default USEHAMMING = T
        size=window_length,
        zeroPadding=fft_len - window_length,
        normalized=False,
        zeroPhase=False)
    to_spectrum = estd.Spectrum(size=fft_len)
    htk_mfcc = estd.MFCC(
        inputSize=n_bins,
        type='magnitude',         # htk uses mel filterbank magnitude
        warpingFormula='htkMel',  # htk's mel warping formula
        weighting='linear',       # filter weights computed in Hz domain
        highFrequencyBound=fmax,  # 8000 is htk default
        lowFrequencyBound=0,      # corresponds to htk default
        numberBands=n_mels,       # htk default NUMCHANS = 26
        numberCoefficients=nmfcc,
        normalize='unit_max',     # htk filters have constant height = 1
        dctType=3,                # htk uses DCT type III
        logType='log',
        liftering=lifterexp)      # htk default CEPLIFTER = 22

    # startFromZero = True, validFrameThresholdRatio = 1 : the way htk
    # computes windows
    frames = estd.FrameGenerator(self.audio_vector,
                                 frameSize=window_length,
                                 hopSize=self.hop_length,
                                 startFromZero=True,
                                 validFrameThresholdRatio=1)
    coeff_rows = []
    for frame in frames:
        _mel_bands, coeffs = htk_mfcc(to_spectrum(window(frame)))
        coeff_rows.append(coeffs)

    return np.array(coeff_rows, dtype=np.float32).T
def audio_features(audio_win):
    """
    Return the audio features of one window as a flat list.

    The list is the MFCC coefficients of the whole window, followed by
    element 2 and elements 5..10 of the RhythmDescriptors output.
    """
    # Drop the last sample of odd-length windows so the length is even
    # (presumably required by the spectrum computation -- TODO confirm).
    if audio_win.shape[0] % 2 == 1:
        audio_win = audio_win[:-1]

    to_spectrum = esst.Spectrum(size=audio_win.shape[0])
    spectrum = to_spectrum(audio_win)

    mfcc_algo = esst.MFCC(inputSize=spectrum.shape[0], sampleRate=SR)
    _bands, mfcc = mfcc_algo(spectrum)

    rhythm = esst.RhythmDescriptors()(audio_win)

    features = mfcc.tolist()
    features.append(rhythm[2])
    features.extend(rhythm[5:11])
    return features
def getFeature(audio, d=True, nbf=False):
    '''
    MFCC feature of the given audio.

    :param audio: audio signal to analyse
    :param d: if True, stack the MFCCs with their first and second
              Fdeltas (w=5); takes precedence over `nbf`
    :param nbf: if True (and `d` is False), stack the MFCCs with their
                Fprev_sub neighbours for shifts 1..5 and return float32
    :return: feature array with one row per frame
    '''
    # Default number of bands is used here (pattern classification setup).
    mfcc_algo = ess.MFCC(sampleRate=fs,
                         highFrequencyBound=highFrequencyBound,
                         inputSize=framesize + 1)
    fft_len = 2 * framesize  # zero-pad by one extra frame length
    to_spectrum = ess.Spectrum(size=fft_len)
    apply_window = ess.Windowing(type='hann',
                                 zeroPadding=fft_len - framesize)

    frames = ess.FrameGenerator(audio, frameSize=framesize, hopSize=hopsize)
    # Index 1 of the MFCC output is the coefficient vector.
    coeff_rows = [mfcc_algo(to_spectrum(apply_window(frame)))[1]
                  for frame in frames]

    if d:
        base = np.array(coeff_rows).transpose()
        delta = Fdeltas(base, w=5)
        delta_delta = Fdeltas(delta, w=5)
        return np.transpose(np.vstack((base, delta, delta_delta)))

    if not nbf:
        return np.array(coeff_rows)

    # Neighbouring-frame context: append the left/right shifted copies for
    # every shift from 1 to 5.
    base = np.array(coeff_rows).transpose()
    stacked = np.array(base, copy=True)
    for shift in range(1, 6):
        right_shifted = Fprev_sub(base, w=shift)
        left_shifted = Fprev_sub(base, w=-shift)
        stacked = np.vstack((stacked, left_shifted, right_shifted))
    return np.array(np.transpose(stacked), dtype='float32')
def extractor(filename):
    """Compute HTK-style MFCCs for `filename`.

    The audio is loaded as mono at 44100 Hz and analysed frame by frame.

    Returns:
        A list holding one MFCC coefficient vector per frame.
    """
    sample_rate = 44100
    signal = ess.MonoLoader(filename=filename, sampleRate=sample_rate)()
    # dynamic range expansion as done in HTK implementation
    signal = signal * 2**15

    frame_len = 1102  # corresponds to htk default WINDOWSIZE = 250000.0
    hop_len = 441     # corresponds to htk default TARGETRATE = 100000.0
    fft_len = 2048
    n_bins = fft_len // 2 + 1

    window = ess.Windowing(
        type='hamming',  # corresponds to htk default USEHAMMING = T
        size=frame_len,
        zeroPadding=fft_len - frame_len,
        normalized=False,
        zeroPhase=False)
    to_spectrum = ess.Spectrum(size=fft_len)
    htk_mfcc = ess.MFCC(
        inputSize=n_bins,
        type='magnitude',         # htk uses mel filterbank magnitude
        warpingFormula='htkMel',  # htk's mel warping formula
        weighting='linear',       # filter weights computed in Hz domain
        highFrequencyBound=8000,  # corresponds to htk default
        lowFrequencyBound=0,      # corresponds to htk default
        numberBands=26,           # corresponds to htk default NUMCHANS = 26
        numberCoefficients=13,
        normalize='unit_max',     # htk filters have constant height = 1
        dctType=3,                # htk uses DCT type III
        logType='log',
        liftering=22)             # corresponds to htk default CEPLIFTER = 22

    # startFromZero = True, validFrameThresholdRatio = 1 : the way htk
    # computes windows
    frames = ess.FrameGenerator(signal,
                                frameSize=frame_len,
                                hopSize=hop_len,
                                startFromZero=True,
                                validFrameThresholdRatio=1)
    # Index 1 of the MFCC output is the coefficient vector.
    return [htk_mfcc(to_spectrum(window(frame)))[1] for frame in frames]
def extractor(filename):
    """Plot the MFCC-smoothed mel-band spectrum of `filename`.

    For every frame the MFCCs are computed, mapped back to mel bands with
    an inverse DCT, and exponentiated to undo the log taken by the MFCC
    algorithm. The resulting matrix is shown with matplotlib. Nothing is
    returned.
    """
    frame_len = 1024
    hop_len = 512
    sample_rate = 44100

    signal = ess.MonoLoader(filename=filename, sampleRate=sample_rate)()
    window = ess.Windowing(type='hamming', normalized=False)

    # These must be identical for the MFCC and the IDCT computation.
    n_bands = 26
    dct_type = 2
    liftering = 0  # no liftering
    n_mfcc = 13

    to_spectrum = ess.Spectrum()
    mfcc_algo = ess.MFCC(
        numberBands=n_bands,
        # Keep only the first N coefficients: the fewer, the more lossy
        # (blurry) the smoothed mel spectrum will be.
        numberCoefficients=n_mfcc,
        weighting='linear',    # filter weights computed in Hz domain
        normalize='unit_max',  # filters have constant height = 1
        dctType=dct_type,
        logType='log',
        liftering=liftering)
    inverse_dct = ess.IDCT(inputSize=n_mfcc,
                           outputSize=n_bands,
                           dctType=dct_type,
                           liftering=liftering)

    smoothed_frames = []
    for frame in ess.FrameGenerator(signal,
                                    frameSize=frame_len,
                                    hopSize=hop_len):
        _melbands, coeffs = mfcc_algo(to_spectrum(window(frame)))
        # exp() inverts the log taken inside the MFCC computation.
        smoothed_frames.append(np.exp(inverse_dct(coeffs)))

    # Convert to an essentia array and transpose so the band index runs
    # along the first axis.
    all_melbands_smoothed = essentia.array(smoothed_frames).T

    plt.imshow(all_melbands_smoothed, aspect='auto', interpolation='none')
    plt.show()  # unnecessary if you started "ipython --pylab"
def feature_extractor_standard(audio_in, frameSize, hopSize, aggLen):
    """Extract frame-level features and aggregate them over blocks of frames.

    Per frame, the MFCC coefficients, spectral centroid, spectral flatness
    and PitchYinFFT confidence are computed. Consecutive blocks of `aggLen`
    frames are then summarised by their mean and variance.

    Args:
        audio_in: Audio signal to analyse.
        frameSize: Frame size in samples.
        hopSize: Hop size in samples.
        aggLen: Number of consecutive frames per aggregation block.

    Returns:
        2-D numpy array with one row per aggregation block and the columns
        [mean MFCC, var MFCC, mean centroid, var centroid, mean flatness,
        var flatness, mean pitch confidence, var pitch confidence].
    """
    # Algorithm objects.
    win = es.Windowing()
    spec = es.Spectrum()
    centroid = es.Centroid()
    flatness = es.Flatness()
    mfcc = es.MFCC(lowFrequencyBound=40)
    pitchYin = es.PitchYinFFT()

    # Frame-by-frame features.
    mfcc_ftrsArray = []
    sCentroidArray = []
    sFlatnessArray = []
    pConfArray = []
    for frame in es.FrameGenerator(audio_in, frameSize=frameSize,
                                   hopSize=hopSize):
        spectrum = spec(win(frame))
        _band_eneg, mfcc_ftrs = mfcc(spectrum)
        sCentroid = centroid(spectrum)
        sFlatness = flatness(spectrum)
        _pitch, pitchConf = pitchYin(spectrum)
        mfcc_ftrsArray.append(mfcc_ftrs)
        sCentroidArray.append(sCentroid)
        sFlatnessArray.append(sFlatness)
        pConfArray.append(pitchConf)

    meanMFCC, varMFCC = [], []
    meanCent, varCent = [], []
    meanFlat, varFlat = [], []
    meanPConf, varPConf = [], []
    # `range` instead of the Python-2-only `xrange`, so this runs on
    # Python 3 too. The stop value len - aggLen is kept from the original:
    # a block starting exactly at len - aggLen is not aggregated.
    for ii in range(0, len(mfcc_ftrsArray) - aggLen, aggLen):
        block = slice(ii, ii + aggLen)
        meanMFCC.append(np.mean(mfcc_ftrsArray[block], axis=0))
        varMFCC.append(np.var(mfcc_ftrsArray[block], axis=0))
        meanCent.append(np.mean(sCentroidArray[block]))
        varCent.append(np.var(sCentroidArray[block]))
        meanFlat.append(np.mean(sFlatnessArray[block]))
        varFlat.append(np.var(sFlatnessArray[block]))
        meanPConf.append(np.mean(pConfArray[block]))
        varPConf.append(np.var(pConfArray[block]))

    def _as_column(values):
        # Turn a list of scalars into an (n_blocks, 1) column.
        return np.transpose(np.array(values, ndmin=2))

    return np.concatenate((np.array(meanMFCC),
                           np.array(varMFCC),
                           _as_column(meanCent),
                           _as_column(varCent),
                           _as_column(meanFlat),
                           _as_column(varFlat),
                           _as_column(meanPConf),
                           _as_column(varPConf)), axis=1)
def compute_beatsync_features(ticks, audio):
    """Compute beat-synchronous MFCC, HPCP and Tonnetz features.

    Args:
        ticks: Beat positions handed to STFTFeature.
        audio: Audio signal to analyse.

    Returns:
        Tuple (mfcc, hpcp, tonnetz), each converted with ``.tolist()``.
    """
    mfcc_feature = STFTFeature(FRAME_SIZE, HOP_SIZE, WINDOW_TYPE,
                               ES.MFCC(numberCoefficients=14),
                               ticks, SAMPLE_RATE)
    hpcp_feature = STFTFeature(FRAME_SIZE, HOP_SIZE, WINDOW_TYPE,
                               ES.HPCP(),
                               ticks, SAMPLE_RATE)

    logging.info("Computing Beat-synchronous MFCCs...")
    mfcc = mfcc_feature.compute_features(audio)

    logging.info("Computing Beat-synchronous HPCPs...")
    hpcp = hpcp_feature.compute_features(audio)

    # Tonnetz is derived from the HPCP result.
    logging.info("Computing Beat-synchronous Tonnetz...")
    tonnetz = utils.chroma_to_tonnetz(hpcp)

    return mfcc.tolist(), hpcp.tolist(), tonnetz.tolist()
def getMFCCBands1D(audio, nbf=False):
    '''
    Mel bands feature; used for pdnn acoustic model training.

    The output feature of each frame is a 1d vector and the result is a
    float32 array.

    :param audio: audio signal to analyse
    :param nbf: bool, if True append the neighbouring frames (Fprev_sub
                shifts of +1, -1, +2, -2) to every frame
    :return: float32 feature array with one row per frame
    '''
    mel = ess.MFCC(sampleRate=fs,
                   highFrequencyBound=highFrequencyBound,
                   inputSize=framesize + 1,
                   numberBands=80)
    fft_len = 2 * framesize  # zero-pad by one extra frame length
    to_spectrum = ess.Spectrum(size=fft_len)
    apply_window = ess.Windowing(type='hann',
                                 zeroPadding=fft_len - framesize)

    frames = ess.FrameGenerator(audio, frameSize=framesize, hopSize=hopsize)
    # Index 0 of the MFCC output is the band vector.
    band_rows = [mel(to_spectrum(apply_window(frame)))[0] for frame in frames]

    if nbf:
        base = np.array(band_rows).transpose()
        # Stacking order: current, right 1, left 1, right 2, left 2.
        parts = [base]
        for shift in (1, 2):
            parts.append(Fprev_sub(base, w=shift))
            parts.append(Fprev_sub(base, w=-shift))
        feature = np.transpose(np.vstack(parts))
    else:
        feature = band_rows

    # the mel bands features
    return np.array(feature, dtype='float32')
def getMFCCBands2D(audio, framesize, nbf=False, nlen=10):
    '''
    Mel bands feature with optional neighbouring-frame context.

    The result is a float32 array. The hop size is taken from the
    module-level `hopsize`.

    :param audio: audio signal to analyse
    :param framesize: frame size in samples
    :param nbf: bool, if True surround every frame with its Fprev_sub
                neighbours for shifts 1..nlen
    :param nlen: number of neighbouring shifts on each side
    :return: float32 feature array with one row per frame
    '''
    mel = ess.MFCC(sampleRate=fs,
                   highFrequencyBound=highFrequencyBound,
                   inputSize=framesize + 1,
                   numberBands=80)
    fft_len = 2 * framesize  # zero-pad by one extra frame length
    to_spectrum = ess.Spectrum(size=fft_len)
    apply_window = ess.Windowing(type='hann',
                                 zeroPadding=fft_len - framesize)

    frames = ess.FrameGenerator(audio, frameSize=framesize, hopSize=hopsize)
    # Index 0 of the MFCC output is the band vector.
    band_rows = [mel(to_spectrum(apply_window(frame)))[0] for frame in frames]

    if nbf:
        base = np.array(band_rows).transpose()
        stacked = np.array(base, copy=True)
        # Each pass wraps the stack: right-shifted copy on top,
        # left-shifted copy at the bottom.
        for shift in range(1, nlen + 1):
            right_shifted = Fprev_sub(base, w=shift)
            left_shifted = Fprev_sub(base, w=-shift)
            stacked = np.vstack((right_shifted, stacked, left_shifted))
        feature = stacked.transpose()
    else:
        feature = band_rows

    # the mel bands features
    return np.array(feature, dtype='float32')
def _calculate_features_for_audio(self, audio):
    """Summarise `audio` as one flat feature vector.

    Spectral contrast, spectral valley and MFCC coefficients are computed
    per frame (frame size 2048, hop size 1024, 100-7000 Hz). For each of
    the three descriptors, in that order, the vector holds the mean,
    standard deviation and skewness over frames, followed by the mean and
    standard deviation of the frame-to-frame differences.

    Returns:
        1-D numpy array of dtype 'single'.
    """
    frame_len, hop_len = 2048, 1024
    low_f = 100
    high_f = 7000

    window = ess.Windowing(type='hann')
    to_spectrum = ess.Spectrum(size=frame_len)
    mfcc_algo = ess.MFCC(lowFrequencyBound=low_f, highFrequencyBound=high_f)
    contrast_algo = ess.SpectralContrast(lowFrequencyBound=low_f,
                                         highFrequencyBound=high_f)

    pool = essentia.Pool()
    for frame in ess.FrameGenerator(audio, frameSize=frame_len,
                                    hopSize=hop_len):
        frame_spectrum = to_spectrum(window(frame))
        contrast, valley = contrast_algo(frame_spectrum)
        _mfcc_bands, coeffs = mfcc_algo(frame_spectrum)
        pool.add('spec_contrast', contrast)
        pool.add('spec_valley', valley)
        pool.add('mfcc_coeff', coeffs)

    def moment_stats(frames):
        # Statistics over frames (axis 0) of the values and of their
        # first differences.
        mean = np.average(frames, axis=0)
        spread = np.std(frames, axis=0)
        skewness = scipy.stats.skew(frames, axis=0)
        diffs = frames[1:, :] - frames[:-1, :]
        diff_mean = np.average(diffs, axis=0)
        diff_spread = np.std(diffs, axis=0)
        return mean, spread, skewness, diff_mean, diff_spread

    features = []
    for descriptor in ('spec_contrast', 'spec_valley', 'mfcc_coeff'):
        for stat in moment_stats(pool[descriptor]):
            features.extend(stat)

    return np.array(features, dtype='single')
def mfcc(x, M=WINDOW_SIZE_MFCC, N=FFT_SIZE_MFCC, H=HOP_SIZE_MFCC, fs=SR, window_type=WINDOW_TYPE_MFCC, n_mfcc=N_MFCC):
    '''
    Extract MFCC features from an audio signal.

    :param x: audio samples (converted with essentia.array)
    :param M: window / frame size in samples
    :param N: FFT size
    :param H: hop size in samples
    :param fs: sample rate in Hz
    :param window_type: Essentia window type name
    :param n_mfcc: number of MFCC coefficients per frame
    :return: essentia array with one row of coefficients per frame
    '''
    signal = essentia.array(x)

    # Essentia algorithm instances.
    to_spectrum = ess.Spectrum(size=N)
    apply_window = ess.Windowing(size=M, type=window_type)
    mfcc_algo = ess.MFCC(numberCoefficients=n_mfcc,
                         inputSize=int(N / 2 + 1),
                         sampleRate=fs,
                         highFrequencyBound=int(fs / 2 - 1))

    frames = ess.FrameGenerator(signal, frameSize=M, hopSize=H,
                                startFromZero=True)
    # window -> magnitude spectrum -> MFCC; index 1 is the coefficients.
    coeff_rows = [mfcc_algo(to_spectrum(apply_window(frame)))[1]
                  for frame in frames]

    return essentia.array(coeff_rows)
def _get_features(audio_path):
    """Compute the per-coefficient mean of the MFCCs of an audio file.

    Uses the module-level N (FFT size), M (frame size), H (hop size) and
    fs (sample rate).

    Returns:
        Tuple (headers, features): twelve names 'mean_mfcc_<i>' and the
        matching mean of coefficient i over all frames.
    """
    to_spectrum = ess.Spectrum(size=N)
    apply_window = ess.Windowing(size=M, type='hann')
    mfcc_algo = ess.MFCC(numberCoefficients=12)
    signal = ess.MonoLoader(filename=audio_path, sampleRate=fs)()

    coeff_rows = []
    for frame in ess.FrameGenerator(signal, frameSize=M, hopSize=H,
                                    startFromZero=True):
        magnitude = to_spectrum(apply_window(frame))
        _mfcc_bands, coeffs = mfcc_algo(magnitude)
        coeff_rows.append(coeffs)
    mfccs = np.array(coeff_rows)

    headers = ['mean_mfcc_%d' % idx for idx in range(0, 12)]
    # One mean per coefficient column, taken over all frames.
    features = [np.mean(mfccs[:, idx]) for idx in range(0, 12)]
    return headers, features
def get_mfcc(frames, sample_rate=16000, num_bands=64, num_coeffs=32, window_type='hann'):
    '''
    Calculates amplitude spectrum, mel-frequency spectrum and
    mel-frequency cepstral coefficients.

    Parameters:
        frames : overlapping signal frames for short-time analysis;
                 must be non-empty (the frame size is taken from frames[0])
        sample_rate : audio sampling rate
        num_bands : number of mel-frequency bands
        num_coeffs : number of mel-freq cepstrum coefficients returned per
                     frame (one extra is computed and the first dropped)
        window_type : type of windowing function to apply

    Returns three 2D numpy arrays, each transposed so that frames run along
    the second axis: amplitude spectra, mel-freq spectra and MFCCs
    '''
    frame_size = len(frames[0])

    spectrum_estimator = es.Spectrum(size=frame_size)
    # Honour the caller's window choice; it used to be hard-coded to 'hann'.
    windowing = es.Windowing(type=window_type, size=frame_size)
    # NOTE(review): inputSize is the frame size, while a Spectrum of that
    # size presumably yields frame_size/2 + 1 bins -- confirm against the
    # Essentia MFCC reference. Left unchanged to keep existing behaviour.
    mfcc_estimator = es.MFCC(numberBands=num_bands,
                             numberCoefficients=num_coeffs + 1,
                             inputSize=frame_size,
                             sampleRate=sample_rate,
                             highFrequencyBound=8000)

    spectra = []
    melbands = []
    mfccs = []
    for frame in frames:
        spectrum = spectrum_estimator(windowing(frame))
        mfcc_bands, mfcc_coeffs = mfcc_estimator(spectrum)
        spectra.append(spectrum)
        melbands.append(mfcc_bands)
        # Drop the first coefficient, keeping num_coeffs values.
        mfccs.append(mfcc_coeffs[1:])

    return np.array(spectra).T, np.array(melbands).T, np.array(mfccs).T
def compute_features(audio, beats=None):
    """Compute MFCC, HPCP and Tonnetz features of `audio`.

    Analysis parameters come from ``msaf.Anal``. `beats` is handed to
    STFTFeature; when it is not None the log messages are prefixed with
    "Beat-synchronous ".

    Returns:
        Tuple (mfcc, hpcp, tonnetz).
    """
    label = "Beat-synchronous " if beats is not None else ""
    anal = msaf.Anal

    mfcc_feature = STFTFeature(anal.frame_size, anal.hop_size,
                               anal.window_type,
                               ES.MFCC(numberCoefficients=anal.mfcc_coeff),
                               anal.sample_rate, beats)
    hpcp_feature = STFTFeature(anal.frame_size, anal.hop_size,
                               anal.window_type,
                               ES.HPCP(),
                               anal.sample_rate, beats)

    logging.info("Computing %sMFCCs..." % label)
    mfcc = mfcc_feature.compute_features(audio)

    logging.info("Computing %sHPCPs..." % label)
    hpcp = hpcp_feature.compute_features(audio)

    # Tonnetz is derived from the HPCP result.
    logging.info("Computing %sTonnetz..." % label)
    tonnetz = utils.chroma_to_tonnetz(hpcp)

    return mfcc, hpcp, tonnetz
def __init__(self, input_filename, fft_size, numMelBands):
    """Set up the windowing, spectrum, MFCC and inverse-DCT algorithms.

    Args:
        input_filename: Forwarded to AudioProcessor.__init__.
        fft_size: FFT size used for the spectrum; frames of 2048 samples
            are zero-padded up to this size.
        numMelBands: Number of mel bands of the MFCC filterbank, and the
            output size of the inverse DCT.
    """
    # The base class receives placeholder FFT size / window function
    # values rather than this class's own settings.
    placeholder_fft_size = 1024
    placeholder_window = np.hanning
    AudioProcessor.__init__(self, input_filename,
                            placeholder_fft_size, placeholder_window)

    self.framesize = 2048

    self.w = ess.Windowing(type='hamming',
                           size=self.framesize,
                           zeroPadding=fft_size - self.framesize,
                           zeroPhase=False)
    self.spectrum = ess.Spectrum(size=fft_size)

    n_coeffs = InvMFCCAudioProcessor.NUM_MFCC_COEFFS
    # htk-like mfccs
    self.mfcc = ess.MFCC(inputSize=fft_size // 2 + 1,
                         type='magnitude',
                         warpingFormula='htkMel',
                         weighting='linear',
                         highFrequencyBound=8000,
                         lowFrequencyBound=0,
                         numberBands=numMelBands,
                         numberCoefficients=n_coeffs,
                         normalize='unit_max',
                         dctType=3,
                         logType='log',
                         liftering=22)
    # Inverse transform configured with the same DCT type and liftering.
    self.idct = ess.IDCT(inputSize=n_coeffs,
                         outputSize=numMelBands,
                         dctType=3,
                         liftering=22)
def load_audio_excerpts(path=AUDIO_PATH, num_features=9):
    """
    Extract `num_features+1` MFCC coefficients from each audio file in
    `path` and discard the first coefficient (tied to energy).

    Only files ending with `excerpt_search.FORMAT` are processed. The
    coefficients are stored through `_fill_out_targets`.

    Returns `out - targets`, with targets broadcast over the third axis
    of `out`.
    """
    targets = np.zeros((3, 5, num_features))
    out = np.zeros((3, 5, 4, num_features))

    for fname in tqdm(os.listdir(path)):
        if not fname.endswith(excerpt_search.FORMAT):
            continue

        audio = esst.EasyLoader(filename=os.path.join(path, fname),
                                sampleRate=SR)()
        # Drop the last sample of odd-length signals so the length is even
        # (presumably required by the spectrum computation -- TODO confirm).
        if audio.shape[0] % 2 == 1:
            audio = audio[:-1]

        spectrum = esst.Spectrum(size=audio.shape[0])(audio)
        mfcc_algo = esst.MFCC(inputSize=spectrum.shape[0],
                              sampleRate=SR,
                              numberCoefficients=num_features + 1)
        _bands, features = mfcc_algo(spectrum)

        # The second character of the first '_'-separated token of the
        # file name is the question index.
        splits = fname.replace('.flac', '').split('_')
        question = int(splits[0][1])
        _fill_out_targets(out[question], targets[question], features[1:],
                          splits, 'target')

    return out - targets[..., np.newaxis, :]
print 'Labels and label indices', all_labels # This processing (top freq peaks) only works for single speaker case... need better features for multispeaker! # MFCC (or deep NN/automatic feature extraction) could be interesting inputSize = (data.shape[1] - 1) * 2 M = 1024 N = 1024 H = 256 fs = 8000 spectrum = ess.Spectrum(size=N) window = ess.Windowing(size=M, type='hann') mfcc = ess.MFCC(numberCoefficients=7, inputSize=inputSize / 2 + 1) sc = ess.SpectralContrast(frameSize=inputSize) cent = ess.Centroid() """n_dim = 6 all_obs = np.zeros((data.shape[0], n_dim)) for r in range(data.shape[0]): #obs = np.zeros((n_dim, 1)) _, t = peakfind(data[r, :], n_peaks=n_dim) all_obs[r, :] = t.copy() #all_obs = np.atleast_3d(all_obs)""" n_dim = 13 all_obs = np.zeros((data.shape[0], n_dim)) for r in range(data.shape[0]): mX = essentia.array(data[r, :])
M = 1024 N = 1024 H = 512 fs = 44100 help(ess.MFCC) spectrum = ess.Spectrum(size=N) #printInfo(spectrum) window = ess.Windowing(size=M, type='hann') #printInfo(window) mfcc = ess.MFCC(numberCoefficients=12, inputSize=N / 2 + 1) #printInfo(mfcc) x = ess.MonoLoader(filename='../../sounds/speech-female.wav', sampleRate=fs)() frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True) print '-' * 70 mfccs = [] frameIndex = 0 for frame in frames: mX = spectrum(window(frame)) mfcc_bands, mfcc_coeffs = mfcc(mX) print mfcc_bands print '-' * 70
from general.parameters import * from general.filePathHsmm import kerasScaler_path from general.Fprev_sub import Fprev_sub from audio_preprocessing import feature_reshape import essentia.standard as ess import pickle import numpy as np winAnalysis = 'hann' N = 2 * framesize # padding 1 time framesize SPECTRUM = ess.Spectrum(size=N) MFCC = ess.MFCC(sampleRate=fs, highFrequencyBound=highFrequencyBound, inputSize=framesize + 1, numberBands=80) WINDOW = ess.Windowing(type=winAnalysis, zeroPadding=N - framesize) def getMFCCBands2D(audio, framesize, hopsize, nbf=False, nlen=10): """ mel bands feature [p[0],p[1]] output feature for each time stamp is a 2D matrix it needs the array format float32 :param audio: :param p: :param nbf: bool, if we need to neighbor frames :return: """ mfcc = [] # audio_p = audio[p[0]*fs:p[1]*fs]
def main_simple(args):
    """main_simple

    Compute short time spectral feature map

    Plots the spectrogram, MFCCs and mel bands of the loaded audio, then
    fits a KernelPCA on the scaled mel bands and plots its output.
    """
    plt.ion()

    audio = loadaudio(args)

    window = estd.Windowing(type='hamming')
    # Spectrum returns the magnitude spectrum (FFT() would return the
    # complex FFT).
    to_spectrum = estd.Spectrum()
    mfcc_algo = estd.MFCC()

    spec_rows = []
    mfcc_rows = []
    band_rows = []
    # Non-overlapping frames: the hop size equals the frame size.
    for frame in estd.FrameGenerator(audio,
                                     frameSize=args.frame_size_low_level,
                                     hopSize=args.frame_size_low_level,
                                     startFromZero=True):
        magnitude = to_spectrum(window(frame))
        bands, coeffs = mfcc_algo(magnitude)
        mfcc_rows.append(coeffs)
        band_rows.append(bands)
        spec_rows.append(magnitude)

    # Transpose so that frames run along the second axis.
    mfccs = np.array(mfcc_rows).T
    melbands = np.array(band_rows).T
    specgram = np.array(spec_rows).T

    image_opts = dict(aspect='auto', origin='lower', interpolation='none')

    fig, gs = makefig(rows=3, cols=1, add_subplots=False)
    fig.show()

    print(("specgram.shape", specgram.shape))
    print(("melbands.shape", melbands.shape))

    # The first row of each matrix is left out of the plots.
    ax1 = fig.add_subplot(gs[0, 0])
    ax1.imshow(np.log(specgram[1:, :]), **image_opts)

    ax2 = fig.add_subplot(gs[1, 0])
    ax2.imshow(mfccs[1:, :], **image_opts)

    ax3 = fig.add_subplot(gs[2, 0])
    ax3.imshow(np.log(melbands[1:, :]), **image_opts)

    plt.draw()
    plt.pause(1e-9)

    # process
    numcomps = 3
    melbands_ = scale(melbands.T).T

    print(("melbands", melbands.shape, "melbands_", melbands_.shape))
    print(("means", np.mean(melbands_, axis=1)))

    sfa_in = melbands_[1:, :]
    sfa_cov = np.cov(sfa_in)
    print(("sfa_cov", sfa_cov.shape))

    sfa = KernelPCA(kernel="rbf", degree=5, fit_inverse_transform=True,
                    gamma=10, n_components=numcomps)

    fig3, gs3 = makefig(rows=1, cols=2)
    fig3.axes[0].plot(sfa_in.T)
    fig3.axes[1].imshow(sfa_cov, aspect='auto', origin='upper',
                        interpolation='none')
    fig3.axes[1].set_aspect(1)
    plt.draw()
    plt.pause(1e-9)

    try:
        melbands_sfa = sfa.fit_transform(sfa_in.T)
        print(("melbands_sfa.shape", melbands_sfa.shape))

        fig2, gs2 = makefig(rows=1, cols=2, add_subplots=False)
        fig2.show()

        ax = fig2.add_subplot(gs2[0, 0])
        ax.imshow(np.abs(melbands_sfa.T), **image_opts)

        ax = fig2.add_subplot(gs2[0, 1])
        # Index of the component with the largest magnitude, per row.
        maxs = []
        for fr_ in melbands_sfa:
            print(("fr_", fr_.shape))
            maxs.append(np.argmax(np.abs(fr_)))
        ax.plot(np.array(maxs), "bo")

        plt.draw()
        plt.pause(1e-9)
    except Exception as e:
        # Best effort: a failure here must not prevent the earlier
        # figures from being shown.
        print(("SFA failed", e))

    plt.ioff()
    plt.show()
def main_mfcc(args):
    """main_mfcc

    Compute short time windowed MFCC features for input waveform and
    plot them over time (mfcc-spectrogram)

    The first figure shows a single frame (waveform excerpt, spectrum, mel
    bands, MFCCs). The second shows the frame-wise mel bands and MFCCs,
    once collected through an Essentia Pool and once through plain lists.
    """
    plt.ion()

    audio = loadaudio(args)
    print(("audio", type(audio), audio.shape))

    # set plot sizes to something larger than default
    plt.rcParams['figure.figsize'] = (15, 6)

    fig, gs = makefig(rows=2, cols=2)

    window = estd.Windowing(type='hann')
    # Spectrum returns the magnitude spectrum (FFT() would return the
    # complex FFT).
    to_spectrum = estd.Spectrum()
    mfcc_algo = estd.MFCC()

    # Single 1024-sample frame starting 0.2 s into the audio.
    start = int(0.2*args.samplerate)
    frame = audio[start:start + 1024]
    print(("frame.shape", frame.shape))

    spec = to_spectrum(window(frame))
    mfcc_bands, mfcc_coeffs = mfcc_algo(spec)

    print(("type(spec)", type(spec)))
    print(("spec.shape", spec.shape))

    fig.axes[0].plot(audio[int(0.2*args.samplerate):int(0.4*args.samplerate)])
    fig.axes[0].set_title("This is how the 2nd second of this audio looks like:")

    fig.axes[1].plot(spec)
    fig.axes[1].set_title("The spectrum of a frame:")

    fig.axes[2].plot(mfcc_bands)
    fig.axes[2].set_title("Mel band spectral energies of a frame:")

    fig.axes[3].plot(mfcc_coeffs)
    fig.axes[3].set_title("First 13 MFCCs of a frame:")

    fig.show()

    fig2, gs2 = makefig(rows=2, cols=2, add_subplots=False)

    frame_opts = dict(frameSize=1024, hopSize=512, startFromZero=True)
    image_opts = dict(aspect='auto', origin='lower', interpolation='none')

    # First pass: collect the per-frame results in plain lists.
    coeff_rows = []
    band_rows = []
    for frame in estd.FrameGenerator(audio, **frame_opts):
        bands, coeffs = mfcc_algo(to_spectrum(window(frame)))
        coeff_rows.append(coeffs)
        band_rows.append(bands)

    # Transpose so that frames run along the second axis.
    mfccs = np.array(coeff_rows).T
    melbands = np.array(band_rows).T

    # Second pass: collect the same descriptors in an Essentia Pool.
    pool = e.Pool()
    for frame in estd.FrameGenerator(audio, **frame_opts):
        bands, coeffs = mfcc_algo(to_spectrum(window(frame)))
        pool.add('lowlevel.mfcc', coeffs)
        pool.add('lowlevel.mfcc_bands', bands)

    # Top row: pool-based results.
    ax1 = fig2.add_subplot(gs2[0, 0])
    ax1.imshow(pool['lowlevel.mfcc_bands'].T, **image_opts)
    ax1.set_title("Mel band spectral energies in frames")

    ax2 = fig2.add_subplot(gs2[0, 1])
    ax2.imshow(pool['lowlevel.mfcc'].T[1:, :], **image_opts)
    ax2.set_title("MFCCs in frames")

    # Bottom row: list-based results.
    ax3 = fig2.add_subplot(gs2[1, 0])
    ax3.imshow(melbands[:, :], **image_opts)
    ax3.set_title("Mel band spectral energies in frames")

    ax4 = fig2.add_subplot(gs2[1, 1])
    ax4.imshow(mfccs[1:, :], **image_opts)
    ax4.set_title("MFCCs in frames")

    fig2.show()

    plt.ioff()
    plt.show()  # unnecessary if you started "ipython --pylab"
import essentia as es import essentia.standard as ess import numpy as np import pickle import glob import utilFunctions as UF import scipy.spatial.distance as DS import parameters as params import csv rms=ess.RMS() window = ess.Windowing(type = "hamming") spec = ess.Spectrum(size=params.Nfft) zz = np.zeros((params.zeropadLen,), dtype = 'float32') genmfcc = ess.MFCC(highFrequencyBound = 22000.0, inputSize = params.Nfft/2+1, sampleRate = params.Fs) hps = ess.HighPass(cutoffFrequency = 240.0) onsets = ess.Onsets() strokeLabels = ['dha', 'dhen', 'dhi', 'dun', 'ge', 'kat', 'ke', 'na', 'ne', 're', 'tak', 'te', 'tit', 'tun'] taals = {"teen": {"nmatra": 16, "accents": np.array([4, 1, 1, 1, 3, 1, 1, 1, 2, 1, 1, 1, 3, 1, 1, 1])}, "ek": {"nmatra": 12, "accents": np.array([4, 1, 1, 2, 1, 1, 3, 1, 1, 2, 1, 1])}, "jhap": {"nmatra": 10, "accents": np.array([4, 1, 2, 1, 1, 3, 1, 2, 1, 1])}, "rupak": {"nmatra": 7, "accents": np.array([2, 1, 1, 3, 1, 3, 1])} } rolls = [{"bol": ['dha/dha_02', 'te/te_05', 're/re_04', 'dha/dha_02'], "dur": np.array([1.0, 1.0, 1, 1]), "amp": np.array([1.0, 1.0, 1.0, 1.0])}, {"bol": ['te/te_02', 're/re_05', 'ke/ke_04', 'te/te_02'], "dur": np.array([1.0, 1.0, 1, 1]), "amp": np.array([1.0, 1.0, 1.0, 1.0])}, {"bol": ['ge/ge_02', 'ge/ge_05', 'te/te_04', 'te/te_02'], "dur": np.array([1.0, 1.0, 1, 1]), "amp": np.array([1.0, 1.0, 1.0, 1.0])}, {"bol": ['ge/ge_02', 'ge/ge_05', 'dhi/dhi_04', 'na/na_02'], "dur": np.array([1.0, 1.0, 1, 1]), "amp": np.array([1.0, 1.0, 1.0, 1.0])},
# Spectral descriptors peak_freq = es.MaxMagFreq() roll_off = es.RollOff() flux = es.Flux() flatness = es.Flatness() # Harmonic descriptors pitch = es.PitchYin(frameSize=1024) spectral_peaks = es.SpectralPeaks(minFrequency=1e-5) harmonic_peaks = es.HarmonicPeaks() inharmonicity = es.Inharmonicity() oer = es.OddToEvenHarmonicEnergyRatio() tristimulus = es.Tristimulus() # MFCC mfcc = es.MFCC(inputSize=513) class Audio: def __init__(self, path): self.audio = es.MonoLoader(filename=str(path))() self.name = path.name self.pool = essentia.Pool() self._build_temporal_features() self._build_spectral_features() self._build_harmonic_features() self._build_mfcc() self._features = { 'audio_correlation': 'AC',
def _midi_pitch_histogram(pitches):
    """Return a normalised 128-bin histogram of MIDI notes for *pitches*.

    Each frequency (Hz) in ``0 < freq <= 12600`` is converted to a MIDI note
    number and truncated to an integer.  Every other value, and any note that
    comes out negative, is counted as "unknown".  Bin 0 is overwritten with
    the number of unknown values before normalising, so the returned list
    sums to 1.  When there is nothing to count, all the weight is put on
    bin 0.
    """
    from math import log
    from numpy import bincount

    midi_notes = []
    unknown = 0
    for freq in pitches:
        note = -1
        if 0. < freq <= 12600:
            # convert from Hz to midi notes; int() truncates towards zero
            # because numpy.bincount only accepts non-negative integers
            note = int(12 * (log(freq / 6.875) / 0.69314718055995) - 3.)
        if note < 0:
            unknown += 1
        else:
            midi_notes.append(note)

    total = 0.
    if midi_notes:
        # 12600 Hz maps to note 127, so minlength=128 always gives 128 bins
        counts = bincount(midi_notes, minlength=128)
        # set 0 midi pitch to be the number of pruned values
        counts[0] = unknown
        total = float(counts.sum())

    if total > 0:
        return [val / total for val in counts]

    histogram = [0.] * 128
    histogram[0] = 1.
    return histogram


def compute(audio, pool, options):
    """Compute frame-wise low-level descriptors of *audio* and add them to *pool*.

    The signal is cut into frames; for every frame temporal, spectral,
    bark-band, pitch, dissonance, MFCC, spectral-contrast and
    spectral-complexity descriptors are added to ``pool`` under the
    module-level ``namespace`` prefix.  After the frame loop a 128-bin MIDI
    pitch histogram and its spread are added as well.

    Args:
        audio: signal handed to ``ess.FrameGenerator``.
        pool: object whose ``add(name, value)`` receives every descriptor.
        options: mapping read for the keys ``'sampleRate'``, ``'frameSize'``,
            ``'hopSize'``, ``'windowType'`` and ``'skipSilence'``.

    Raises:
        ess.EssentiaError: if no ``zerocrossingrate`` descriptor was added,
            which the code treats as a silent file.
    """
    # analysis parameters
    sampleRate = options['sampleRate']
    frameSize = options['frameSize']
    hopSize = options['hopSize']
    windowType = options['windowType']

    # temporal descriptors
    lpc = ess.LPC(order=10, type='warped', sampleRate=sampleRate)
    zerocrossingrate = ess.ZeroCrossingRate()

    # frame algorithms
    frames = ess.FrameGenerator(audio=audio, frameSize=frameSize, hopSize=hopSize)
    window = ess.Windowing(size=frameSize, zeroPadding=0, type=windowType)
    spectrum = ess.Spectrum(size=frameSize)

    # spectral algorithms
    barkbands = ess.BarkBands(sampleRate=sampleRate)
    centralmoments = ess.CentralMoments()
    crest = ess.Crest()
    centroid = ess.Centroid()
    decrease = ess.Decrease()
    spectral_contrast = ess.SpectralContrast(frameSize=frameSize,
                                             sampleRate=sampleRate,
                                             numberBands=6,
                                             lowFrequencyBound=20,
                                             highFrequencyBound=11000,
                                             neighbourRatio=0.4,
                                             staticDistribution=0.15)
    distributionshape = ess.DistributionShape()
    energy = ess.Energy()
    # energyband_bass, energyband_middle and energyband_high parameters come
    # from "standard" hi-fi equalizers
    energyband_bass = ess.EnergyBand(startCutoffFrequency=20.0,
                                     stopCutoffFrequency=150.0,
                                     sampleRate=sampleRate)
    energyband_middle_low = ess.EnergyBand(startCutoffFrequency=150.0,
                                           stopCutoffFrequency=800.0,
                                           sampleRate=sampleRate)
    energyband_middle_high = ess.EnergyBand(startCutoffFrequency=800.0,
                                            stopCutoffFrequency=4000.0,
                                            sampleRate=sampleRate)
    energyband_high = ess.EnergyBand(startCutoffFrequency=4000.0,
                                     stopCutoffFrequency=20000.0,
                                     sampleRate=sampleRate)
    flatnessdb = ess.FlatnessDB()
    flux = ess.Flux()
    harmonic_peaks = ess.HarmonicPeaks()  # instantiated but not used below
    hfc = ess.HFC()
    mfcc = ess.MFCC()
    rolloff = ess.RollOff()
    rms = ess.RMS()
    strongpeak = ess.StrongPeak()

    # pitch algorithms
    pitch_detection = ess.PitchYinFFT(frameSize=frameSize, sampleRate=sampleRate)
    pitch_salience = ess.PitchSalience()

    # dissonance
    spectral_peaks = ess.SpectralPeaks(sampleRate=sampleRate, orderBy='frequency')
    dissonance = ess.Dissonance()

    # spectral complexity
    # magnitudeThreshold = 0.005 is hardcoded for a "blackmanharris62" frame
    spectral_complexity = ess.SpectralComplexity(magnitudeThreshold=0.005)

    INFO('Computing Low-Level descriptors...')

    # used for a nice progress display
    total_frames = frames.num_frames()
    n_frames = 0
    start_of_frame = -frameSize * 0.5

    pitches, pitch_confidences = [], []

    progress = Progress(total=total_frames)

    # every descriptor name is '<namespace>.<descriptor>'
    prefix = namespace + '.'

    for frame in frames:
        # time span of the frame in seconds (currently not passed to the pool)
        frameScope = [start_of_frame / sampleRate,
                      (start_of_frame + frameSize) / sampleRate]

        # silence rate
        pool.add(prefix + 'silence_rate_60dB', is_silent_threshold(frame, -60))
        pool.add(prefix + 'silence_rate_30dB', is_silent_threshold(frame, -30))
        pool.add(prefix + 'silence_rate_20dB', is_silent_threshold(frame, -20))

        # silent frames only contribute to the silence rates above
        if options['skipSilence'] and es.isSilent(frame):
            total_frames -= 1
            start_of_frame += hopSize
            continue

        # temporal descriptors
        pool.add(prefix + 'zerocrossingrate', zerocrossingrate(frame))
        (frame_lpc, frame_lpc_reflection) = lpc(frame)
        pool.add(prefix + 'temporal_lpc', frame_lpc)

        frame_windowed = window(frame)
        frame_spectrum = spectrum(frame_windowed)

        # spectrum-based descriptors
        power_spectrum = frame_spectrum ** 2
        pool.add(prefix + 'spectral_centroid', centroid(power_spectrum))
        pool.add(prefix + 'spectral_decrease', decrease(power_spectrum))
        pool.add(prefix + 'spectral_energy', energy(frame_spectrum))
        pool.add(prefix + 'spectral_energyband_low', energyband_bass(frame_spectrum))
        pool.add(prefix + 'spectral_energyband_middle_low',
                 energyband_middle_low(frame_spectrum))
        pool.add(prefix + 'spectral_energyband_middle_high',
                 energyband_middle_high(frame_spectrum))
        pool.add(prefix + 'spectral_energyband_high', energyband_high(frame_spectrum))
        pool.add(prefix + 'hfc', hfc(frame_spectrum))
        pool.add(prefix + 'spectral_rms', rms(frame_spectrum))
        pool.add(prefix + 'spectral_flux', flux(frame_spectrum))
        pool.add(prefix + 'spectral_rolloff', rolloff(frame_spectrum))
        pool.add(prefix + 'spectral_strongpeak', strongpeak(frame_spectrum))

        # central moments descriptors
        frame_centralmoments = centralmoments(power_spectrum)
        (frame_spread, frame_skewness, frame_kurtosis) = distributionshape(
            frame_centralmoments)
        pool.add(prefix + 'spectral_kurtosis', frame_kurtosis)
        pool.add(prefix + 'spectral_spread', frame_spread)
        pool.add(prefix + 'spectral_skewness', frame_skewness)

        # dissonance
        (frame_frequencies, frame_magnitudes) = spectral_peaks(frame_spectrum)
        frame_dissonance = dissonance(frame_frequencies, frame_magnitudes)
        pool.add(prefix + 'dissonance', frame_dissonance)

        # mfcc (only the coefficients are stored, not the mel bands)
        (frame_melbands, frame_mfcc) = mfcc(frame_spectrum)
        pool.add(prefix + 'mfcc', frame_mfcc)

        # spectral contrast (only the coefficients are stored, not the valleys)
        (sc_coeffs, sc_valleys) = spectral_contrast(frame_spectrum)
        pool.add(prefix + 'spectral_contrast', sc_coeffs)

        # barkbands-based descriptors
        frame_barkbands = barkbands(frame_spectrum)
        pool.add(prefix + 'barkbands', frame_barkbands)
        pool.add(prefix + 'spectral_crest', crest(frame_barkbands))
        pool.add(prefix + 'spectral_flatness_db', flatnessdb(frame_barkbands))
        barkbands_centralmoments = ess.CentralMoments(range=len(frame_barkbands) - 1)
        (barkbands_spread, barkbands_skewness, barkbands_kurtosis) = distributionshape(
            barkbands_centralmoments(frame_barkbands))
        pool.add(prefix + 'barkbands_spread', barkbands_spread)
        pool.add(prefix + 'barkbands_skewness', barkbands_skewness)
        pool.add(prefix + 'barkbands_kurtosis', barkbands_kurtosis)

        # pitch descriptors
        frame_pitch, frame_pitch_confidence = pitch_detection(frame_spectrum)
        if frame_pitch > 0 and frame_pitch <= 20000.:
            pool.add(prefix + 'pitch', frame_pitch)
        # NOTE(review): the pitch of every non-skipped frame is recorded for
        # the histogram, including out-of-range values (counted as unknown
        # there) -- nesting reconstructed from collapsed source, please confirm
        pitches.append(frame_pitch)
        pitch_confidences.append(frame_pitch_confidence)
        pool.add(prefix + 'pitch_instantaneous_confidence', frame_pitch_confidence)

        # the last spectrum bin is dropped before computing the salience
        frame_pitch_salience = pitch_salience(frame_spectrum[:-1])
        pool.add(prefix + 'pitch_salience', frame_pitch_salience)

        # spectral complexity
        pool.add(prefix + 'spectral_complexity', spectral_complexity(frame_spectrum))

        # display of progress report
        progress.update(n_frames)

        n_frames += 1
        start_of_frame += hopSize

    # if no 'zerocrossingrate' was added, every frame was skipped as silent
    if 'zerocrossingrate' not in descriptorNames(pool.descriptorNames(), namespace):
        raise ess.EssentiaError('This is a silent file!')

    # build pitch value histogram (MIDI note bins, normalised to sum to 1)
    midipitchhist = _midi_pitch_histogram(pitches)
    pool.add(prefix + 'spectral_pitch_histogram', midipitchhist)

    # only the spread of the histogram is stored
    pitch_centralmoments = ess.CentralMoments(range=len(midipitchhist) - 1)
    (pitch_histogram_spread, pitch_histogram_skewness, pitch_histogram_kurtosis) = distributionshape(
        pitch_centralmoments(midipitchhist))
    pool.add(prefix + 'spectral_pitch_histogram_spread', pitch_histogram_spread)

    progress.finish()
def mainFunction(filename,fs,framesize,hopsize,h2,alpha,p_lambda): ''' main procedure of algorithm :param filename: :param fs: :param framesize: :param hopsize: :return: ''' # load audio audio = ess.MonoLoader(filename = filename, sampleRate = fs)() # spectrogram init winAnalysis = 'hann' N = 2 * framesize # padding 1 time framesize SPECTRUM = ess.Spectrum(size=N) WINDOW = ess.Windowing(type=winAnalysis, zeroPadding=N-framesize) highFrequencyBound = fs/2 if fs/2<11000 else 11000 MFCC = ess.MFCC(sampleRate=fs,highFrequencyBound=highFrequencyBound) PEAK = ess.PeakDetection(interpolate=False,maxPeaks=99999) mfcc = [] mX = [] print 'calculating MFCC ... ...' for frame in ess.FrameGenerator(audio, frameSize=framesize, hopSize=hopsize): frame = WINDOW(frame) mXFrame = SPECTRUM(frame) mX.append(mXFrame) bands,mfccFrame = MFCC(mXFrame) mfccFrame = mfccFrame[1:] mfcc.append(mfccFrame) mX = np.array(mX) mX = np.transpose(mX) mfcc = np.array(mfcc) T = mfcc.shape[0] # time D = mfcc.shape[1] # feature dimension print 'calculating delta mfcc ... ...' 
d_mfcc = Fdeltas(mfcc.transpose(), w=9) d_mfcc = np.transpose(d_mfcc) # Spectral variation function SVF = np.sqrt(np.sum(d_mfcc**2.0,axis=1)) SVF = (SVF - np.min(SVF))/(np.max(SVF)-np.min(SVF)) # peaks and valleys p_SVF,a_SVF = PEAK(np.array(SVF,dtype=np.float32)) p_SVF = np.array(np.round(p_SVF*(T-1)),dtype=np.int) p_v_SVF,a_v_SVF = PEAK(np.array(1-SVF,dtype=np.float32)) p_v_SVF = np.array(np.round(p_v_SVF*(T-1)),dtype=np.int) # heuristics p_SVF,a_SVF,p_v_SVF,a_v_SVF = heuristics(p_SVF,a_SVF,p_v_SVF,a_v_SVF,SVF,fs,hopsize,h2,alpha) index2Delete = [] if len(p_SVF) > 3: # BIC ii = 1 jj = 1 # dynamic windowing BIC while ii < len(p_SVF)-1: p_0 = p_SVF[ii-jj] p_1 = p_SVF[ii] p_2 = p_SVF[ii+1] delta_ABF2 = ABF2(d_mfcc[p_0:p_1,:],d_mfcc[p_1:p_2,:],d_mfcc[p_0:p_2,:],p_lambda) if delta_ABF2 > 0: jj = 1 else: jj += 1 index2Delete.append(ii) ii += 1 if ii >= len(p_SVF)-1: break # print delta_BIC, p_0, p_1, p_2, p_ABF2 = np.delete(p_SVF,index2Delete) a_ABF2 = np.delete(a_SVF,index2Delete)