Example #1
0
def extractor(filename):
    """Plot the MFCC-smoothed mel spectrum of an audio file.

    The file is loaded as mono audio and cut into overlapping frames. For
    each frame the MFCCs are computed, mapped back to the mel-band domain
    with an inverse DCT, and exponentiated to undo the log applied inside
    the MFCC computation. The per-frame results are stacked into a matrix
    and displayed with matplotlib.

    Args:
        filename: Path of the audio file to analyse.

    Returns:
        None. The result is shown with ``plt.show()``.
    """
    frameSize = 1024
    hopSize = 512
    fs = 44100
    # A frame of frameSize samples yields a spectrum with this many bins.
    spectrumSize = frameSize // 2 + 1

    # These must be identical for the MFCC and the IDCT, otherwise the
    # inverse transform does not match the forward one.
    NUM_BANDS = 26
    DCT_TYPE = 2
    LIFTERING = 0  # 0 = no liftering (HTK's default CEPLIFTER would be 22)
    NUM_MFCCs = 13

    audio = ess.MonoLoader(filename=filename, sampleRate=fs)()
    w = ess.Windowing(type='hamming', size=frameSize, normalized=False)
    spectrum = ess.Spectrum(size=frameSize)
    mfcc = ess.MFCC(
        # State the real spectrum size and sample rate explicitly instead of
        # relying on the algorithm defaults, so the mel filter bank is built
        # once for the input it will actually receive.
        inputSize=spectrumSize,
        sampleRate=fs,
        numberBands=NUM_BANDS,
        # Only the first N coefficients are kept: the fewer, the more lossy
        # (blurry) the smoothed mel spectrum will be.
        numberCoefficients=NUM_MFCCs,
        weighting='linear',  # filter weights computed in the Hz domain (optional)
        normalize='unit_max',  # HTK-style filters with constant height 1 (optional)
        dctType=DCT_TYPE,
        logType='log',
        liftering=LIFTERING)

    idct = ess.IDCT(inputSize=NUM_MFCCs,
                    outputSize=NUM_BANDS,
                    dctType=DCT_TYPE,
                    liftering=LIFTERING)

    all_melbands_smoothed = []
    for frame in ess.FrameGenerator(audio,
                                    frameSize=frameSize,
                                    hopSize=hopSize):
        # The mel bands returned alongside the coefficients are not needed.
        _, mfcc_coeffs = mfcc(spectrum(w(frame)))
        # exp() inverts the log taken in the MFCC computation.
        all_melbands_smoothed.append(np.exp(idct(mfcc_coeffs)))

    # Convert the list of per-frame vectors to an essentia.array and
    # transpose it so that rows are mel bands and columns are frames.
    all_melbands_smoothed = essentia.array(all_melbands_smoothed).T

    plt.imshow(all_melbands_smoothed, aspect='auto', interpolation='none')
    plt.show()  # unnecessary if you started "ipython --pylab"
Example #2
0
    def __init__(self, input_filename, fft_size, numMelBands):
        """Set up the windowing, spectrum, MFCC and inverse-DCT stages.

        Args:
            input_filename: Forwarded unchanged to ``AudioProcessor.__init__``.
            fft_size: FFT length. Frames of ``self.framesize`` samples are
                zero-padded up to this size before the spectrum is taken.
            numMelBands: Number of mel bands used by the MFCC stage and
                produced again by the inverse DCT.
        """
        # The base class is initialised with placeholder FFT-size and window
        # values, not with the ones passed to this constructor.
        fft_size_dummy = 1024
        window_function_dummy = np.hanning
        AudioProcessor.__init__(self, input_filename, fft_size_dummy,
                                window_function_dummy)

        # Analysis frame length in samples. (Original note: the default frame
        # size in HTK at a rate of 44100 would be 1102.)
        self.framesize = 2048
        zeroPadding = fft_size - self.framesize
        spectrumSize = fft_size // 2 + 1

        # The MFCC and the IDCT must share the same DCT configuration and
        # coefficient count, so both are defined once and reused below.
        num_coeffs = InvMFCCAudioProcessor.NUM_MFCC_COEFFS
        dct_settings = {'dctType': 3, 'liftering': 22}

        self.w = ess.Windowing(type='hamming',
                               size=self.framesize,
                               zeroPadding=zeroPadding,
                               zeroPhase=False)
        self.spectrum = ess.Spectrum(size=fft_size)

        # HTK-like MFCC configuration.
        self.mfcc = ess.MFCC(inputSize=spectrumSize,
                             type='magnitude',
                             warpingFormula='htkMel',
                             weighting='linear',
                             lowFrequencyBound=0,
                             highFrequencyBound=8000,
                             numberBands=numMelBands,
                             numberCoefficients=num_coeffs,
                             normalize='unit_max',
                             logType='log',
                             **dct_settings)

        self.idct = ess.IDCT(inputSize=num_coeffs,
                             outputSize=numMelBands,
                             **dct_settings)