def extractor(filename):
    """Plot the MFCC-smoothed mel-band spectrogram of an audio file.

    The file is loaded as mono audio at 44100 Hz and cut into frames of
    1024 samples with a hop of 512. For every frame the MFCCs are computed,
    mapped back to the mel-band domain with an inverse DCT and
    exponentiated to undo the log applied inside the MFCC computation.
    The resulting (bands x frames) matrix is displayed with matplotlib.

    Args:
        filename: Path of the audio file handed to ``ess.MonoLoader``.

    Returns:
        None. The function only displays a plot.
    """
    sample_rate = 44100
    frame_size = 1024
    hop_size = 512

    # These four values must be identical for the MFCC and the IDCT
    # computation, otherwise the inverse transform does not match.
    num_bands = 26
    num_mfccs = 13
    dct_type = 2
    liftering = 0

    samples = ess.MonoLoader(filename=filename, sampleRate=sample_rate)()

    window = ess.Windowing(type='hamming', normalized=False)
    spectrum = ess.Spectrum()
    mfcc = ess.MFCC(
        numberBands=num_bands,
        # Only the first N coefficients are kept: the fewer there are, the
        # more lossy (blurry) the smoothed mel spectrum becomes.
        numberCoefficients=num_mfccs,
        # Filter weights are computed in the Hz domain (optional).
        weighting='linear',
        # HTK-style filter normalisation: constant height of 1 (optional).
        normalize='unit_max',
        dctType=dct_type,
        logType='log',
        liftering=liftering,
    )
    idct = ess.IDCT(
        inputSize=num_mfccs,
        outputSize=num_bands,
        dctType=dct_type,
        liftering=liftering,
    )

    def smoothed_melbands(frame):
        # The raw mel bands returned by MFCC are not needed here; only the
        # coefficients are inverted. exp() undoes the log taken by MFCC.
        _, coeffs = mfcc(spectrum(window(frame)))
        return np.exp(idct(coeffs))

    frames = ess.FrameGenerator(samples, frameSize=frame_size,
                                hopSize=hop_size)

    # Convert the list of per-frame vectors to an essentia.array and
    # transpose it so that bands run along the vertical axis of the image.
    smoothed = essentia.array(
        [smoothed_melbands(frame) for frame in frames]).T

    plt.imshow(smoothed, aspect='auto', interpolation='none')
    plt.show()  # unnecessary if you started "ipython --pylab"
def __init__(self, input_filename, fft_size, numMelBands):
    """Set up the windowing, spectrum, MFCC and inverse-DCT algorithms.

    Args:
        input_filename: Forwarded unchanged to ``AudioProcessor.__init__``.
        fft_size: FFT size used for the spectrum; each 2048-sample frame
            is zero-padded by ``fft_size - 2048`` samples.
        numMelBands: Number of mel bands computed by the MFCC algorithm
            and produced by the inverse DCT.

    NOTE(review): an ``fft_size`` below 2048 gives a negative zero-padding
    value -- confirm callers never pass one.
    """
    # The parent class receives placeholder values for the FFT size and
    # the window function rather than the ones used by this processor.
    AudioProcessor.__init__(self, input_filename, 1024, np.hanning)

    # Analysis frame length in samples. (For reference: the default frame
    # size in HTK at a rate of 44100 would be 1102.)
    self.framesize = 2048

    pad_length = fft_size - self.framesize
    self.w = ess.Windowing(
        type='hamming',
        size=self.framesize,
        zeroPadding=pad_length,
        zeroPhase=False,
    )

    self.spectrum = ess.Spectrum(size=fft_size)

    # The DCT type, liftering and number of coefficients are shared by
    # the forward (MFCC) and inverse (IDCT) transforms so that they match.
    dct_type = 3
    liftering = 22
    num_coeffs = InvMFCCAudioProcessor.NUM_MFCC_COEFFS

    # HTK-like MFCC configuration.
    self.mfcc = ess.MFCC(
        inputSize=fft_size // 2 + 1,
        type='magnitude',
        warpingFormula='htkMel',
        weighting='linear',
        highFrequencyBound=8000,
        lowFrequencyBound=0,
        numberBands=numMelBands,
        numberCoefficients=num_coeffs,
        normalize='unit_max',
        dctType=dct_type,
        logType='log',
        liftering=liftering,
    )

    self.idct = ess.IDCT(
        inputSize=num_coeffs,
        outputSize=numMelBands,
        dctType=dct_type,
        liftering=liftering,
    )