def _erb_cochleagram(x, fs, framesize, hopsize, window_type='hann'):
    """Return the log10 ERB-band energies of ``x``, one row per frame.

    Parameters:
    x           : audio samples accepted by ess.FrameGenerator
    fs          : sampling rate in Hz
    framesize   : frame length in samples
    hopsize     : hop between frames in samples
    window_type : Essentia window type name

    Each frame is windowed, zero-padded to twice its length, turned into a
    magnitude spectrum and passed through ess.ERBBands.
    """
    fft_size = 2 * framesize  # padding 1 time framesize
    spectrum = ess.Spectrum(size=fft_size)
    window = ess.Windowing(type=window_type,
                           zeroPadding=fft_size - framesize)
    # Upper band edge: Nyquist, capped at 11 kHz.
    erb_bands = ess.ERBBands(sampleRate=fs,
                             highFrequencyBound=min(fs / 2, 11000),
                             inputSize=framesize + 1)

    # np.float was removed in NumPy 1.24; the builtin float is the same type.
    eps = np.finfo(float).eps
    return np.array([
        np.log10(erb_bands(spectrum(window(frame))) + eps)
        for frame in ess.FrameGenerator(x, frameSize=framesize,
                                        hopSize=hopsize)
    ])


def MRCG(x, fs=44100, framesize1=0.02, framesize2=0.2, hopsize=0.01):
    """Stack four cochleagram views of ``x`` and compute their deltas.

    Parameters:
    x          : audio samples
    fs         : sampling rate in Hz
    framesize1 : frame length of the first cochleagram, in seconds
    framesize2 : frame length of the second cochleagram, in seconds
    hopsize    : hop between frames, in seconds (shared by both)

    Returns a tuple (all_cochleas, d_all_cochleas, dd_all_cochleas):
    the horizontally stacked features, the output of Fdeltas on them, and
    the output of Fdeltas applied twice with a second argument of 5.
    """
    hop = int(hopsize * fs)

    ####---- cochleagrams at the two frame sizes
    cochlea1 = _erb_cochleagram(x, fs, int(framesize1 * fs), hop)
    cochlea2 = _erb_cochleagram(x, fs, int(framesize2 * fs), hop)

    ####---- smoothed versions of the first cochleagram
    cochlea3 = get_avg(cochlea1, 5, 5)
    cochlea4 = get_avg(cochlea1, 11, 11)

    all_cochleas = np.hstack((cochlea1, cochlea2, cochlea3, cochlea4))

    ####---- delta (Fdeltas is fed the transpose; results are transposed back)
    d_all_cochleas = Fdeltas(all_cochleas.T).T
    dd_all_cochleas = Fdeltas(Fdeltas(all_cochleas.T, 5), 5).T

    return all_cochleas, d_all_cochleas, dd_all_cochleas
Exemple #2
0
def main_danceability(args):
    """main_danceability

    Compute the danceability feature over input waveform and plot it.

    ``args`` is handed to ``loadaudio`` to obtain the signal and must expose
    ``samplerate``, which sets the Danceability sample rate and the frame
    and hop lengths (10 and 5 times ``samplerate`` samples).

    Prints every per-frame value and the type of the pooled result, then
    shows a plot of the pooled values. Returns nothing.
    """
    audio = loadaudio(args)

    # create the pool and the necessary algorithms
    pool = e.Pool()
    w = estd.Windowing()
    danceability = estd.Danceability(maxTau=10000, minTau=300,
                                     sampleRate=args.samplerate)

    # compute the danceability for all frames in our audio and add it to
    # the pool
    for frame in estd.FrameGenerator(audio,
                                     frameSize=10 * args.samplerate,
                                     hopSize=5 * args.samplerate):
        # second output of Danceability is not used here
        dreal, _ = danceability(w(frame))
        print(("d", dreal))
        pool.add('rhythm.danceability', dreal)

    print((type(pool['rhythm.danceability'])))

    # makefig returns two values; only the figure is used
    fig, _ = makefig(rows=2, cols=2)
    ax = fig.axes

    ax[0].plot(pool['rhythm.danceability'])

    plt.show()
Exemple #3
0
def _key_fnc(
    sample: NDArray[Float32],
    frequency_rate: int,
    windowfnc: Window,
    key_type: KeyFunction,
):
    """
    Compute the key of ``sample`` with the extractor chosen by ``key_type``.

    Each handled ``KeyFunction`` member maps to one ``_get_*`` helper, which
    receives the sample together with freshly built Essentia algorithms:
    a spectrum, a window of type ``windowfnc.value`` and, where needed,
    the feature-specific algorithm.

    For ``KeyFunction.CENTROID``, ``frequency_rate`` is used as the centroid
    range and should be equal to half of the sample rate.

    Raises ``ValueError`` when ``key_type`` is not one of the handled members.
    """

    def _spectral_front_end():
        # Built on demand so nothing is constructed for an unknown key_type.
        return estd.Spectrum(), estd.Windowing(type=windowfnc.value)

    if key_type == KeyFunction.CENTROID:
        centroid = estd.Centroid(range=frequency_rate)
        spectrum, window = _spectral_front_end()
        return _get_centroid(sample, centroid, spectrum, window)

    if key_type == KeyFunction.MAX:
        spectrum, window = _spectral_front_end()
        return _get_max(sample, spectrum, window)

    if key_type == KeyFunction.MFCC:
        mfcc = estd.MFCC()
        spectrum, window = _spectral_front_end()
        return _get_mfcc(sample, mfcc, spectrum, window)

    if key_type == KeyFunction.MELBANDS:
        mfcc = estd.MFCC()
        spectrum, window = _spectral_front_end()
        return _get_melbands(sample, mfcc, spectrum, window)

    if key_type == KeyFunction.MELBANDS_LOG:
        # Same as MELBANDS, followed by Essentia's "log" unary operator.
        log_op = estd.UnaryOperator(type="log")
        mfcc = estd.MFCC()
        spectrum, window = _spectral_front_end()
        return log_op(_get_melbands(sample, mfcc, spectrum, window))

    raise ValueError("Keyfunction is not defined!")
Exemple #4
0
    def get_onsets(self, _audio=None):
        """Detect onsets in an audio signal.

        Parameters:
            _audio : optional audio samples; when omitted, ``None`` or
                     empty, ``self.audio`` is analysed instead.

        Returns the output of ``es.Onsets`` for the single onset detection
        function computed here (weight 1).
        """
        # Explicit None / length checks: comparing a NumPy array against []
        # is ambiguous (and an error on recent NumPy), and a list default
        # would be shared between calls.
        if _audio is not None and len(_audio) > 0:
            audio = _audio
        else:
            audio = self.audio

        W = es.Windowing(type=self.winType)
        c2p = es.CartesianToPolar()
        fft = es.FFT()
        onsetDetection = es.OnsetDetection(method=self.onsetMethod,
                                           sampleRate=44100)
        onsets = es.Onsets(alpha=.2)
        pool = Pool()

        for frame in es.FrameGenerator(audio, frameSize=1024, hopSize=512):
            mag, phase = c2p(fft(W(frame)))
            # NOTE(review): re-configuring on every frame is kept from the
            # original; it may reset the detector's inter-frame state —
            # confirm whether that is intended before removing it.
            onsetDetection.configure(method=self.onsetMethod)
            pool.add("onsetFunction", onsetDetection(mag, phase))

        # One detection function, weighted 1.
        return onsets([pool["onsetFunction"]], [1])
Exemple #5
0
def extract(fname, outpath, fs=22050, fsize=1024, hsize=512):
    """
    extract(fname, outpath, fs, fsize, hsize) computes the MFCCs of the
    audio file fname.

    Inputs:
        fname   -- is the name of audio file.
        outpath -- is the output path of processed files (currently unused:
                   nothing is written to disk).
        fs      -- is the sampling frequency (Hz) the file is loaded at.
        fsize   -- is the size of each frame in samples.
        hsize   -- is the hop size between frames in samples.
    Outputs:
        a numpy array with one row of 20 MFCC coefficients per frame.
    """
    audio = es.MonoLoader(filename=fname, sampleRate=fs)()

    w = es.Windowing(type='hann')
    spectrum = es.Spectrum()
    # The spectrum of an fsize-sample frame has fsize // 2 + 1 bins
    # (513 for the default 1024).
    # NOTE(review): MFCC is left at its default sampleRate, as in the
    # original; confirm whether it should receive fs.
    mfcc = es.MFCC(inputSize=fsize // 2 + 1, numberCoefficients=20)

    mfccs = []
    for frame in es.FrameGenerator(audio, frameSize=fsize, hopSize=hsize):
        _, mfcc_coeffs = mfcc(spectrum(w(frame)))
        mfccs.append(mfcc_coeffs)

    # Return the coefficients, not the MFCC algorithm object.
    return np.array(mfccs)
Exemple #6
0
def segment(audio, hopSize, frameSize, rms_onset_threshold,
            mel_onset_threshold, flux_onset_threshold, onset_threshold):
    """Find the frame indices of the first onset and the following sustain.

    The signal is analysed frame by frame with four onset detection
    functions (melflux, rms, hfc, flux), a YinFFT pitch confidence and a
    loudness measure.

    An onset is the first frame where the flux or the melflux function
    exceeds its threshold while the rms function exceeds
    ``rms_onset_threshold`` and the loudness exceeds ``onset_threshold``.
    The sustain is the last frame, from the onset frame on and before the
    next qualifying frame, whose pitch confidence is above 0.5 while the
    rms and melflux functions are below 5 % and 30 % of their onset values.

    Returns ``(onset, sustain)``; either is ``None`` when never detected.
    """
    # onset detection functions
    onset_mel = estd.OnsetDetection(method='melflux')
    onset_rms = estd.OnsetDetection(method='rms')
    onset_hfc = estd.OnsetDetection(method='hfc')
    onset_flux = estd.OnsetDetection(method='flux')

    # spectral front end and per-frame descriptors
    fft = estd.FFT()
    to_polar = estd.CartesianToPolar()
    pool = essentia.Pool()
    frames = estd.FrameGenerator(audio,
                                 frameSize=frameSize,
                                 hopSize=hopSize)
    window = estd.Windowing(type='hann')
    yin = estd.PitchYinFFT(frameSize=frameSize,
                           minFrequency=40,
                           maxFrequency=2500,
                           interpolate=True)
    spectrum = estd.Spectrum()
    loudness = estd.Loudness()

    # state machine: armed -> in_attack -> done
    armed = True
    in_attack = False
    mel_at_onset = 0
    rms_at_onset = 0

    onset = None
    sustain = None

    for frame_idx, frame in enumerate(frames):
        magnitude, phase = to_polar(fft(window(frame)))
        _, pitch_confidence = yin(spectrum(window(frame)))
        frame_loudness = loudness(frame)

        mel_value = onset_mel(magnitude, phase)
        rms_value = onset_rms(magnitude, phase)
        hfc_value = onset_hfc(magnitude, phase)
        flux_value = onset_flux(magnitude, phase)

        for key, value in (('onsets_mel', mel_value),
                           ('onsets_rms', rms_value),
                           ('onsets_hfc', hfc_value),
                           ('onsets_flux', flux_value),
                           ('conf', pitch_confidence),
                           ('loudness', frame_loudness)):
            pool.add(key, value)

        # condition for onset
        if armed:
            spectral_hit = (flux_value > flux_onset_threshold
                            or mel_value > mel_onset_threshold)
            if (spectral_hit and rms_value > rms_onset_threshold
                    and frame_loudness > onset_threshold):
                onset = frame_idx
                in_attack = True
                armed = False
                mel_at_onset = mel_value
                rms_at_onset = rms_value

        # condition for beginning of sustain (also checked on the onset frame)
        if in_attack:
            decayed = (rms_value < rms_at_onset * .05
                       and mel_value < mel_at_onset * .3)
            if pitch_confidence > 0.5 and decayed:
                in_attack = False
                sustain = frame_idx

    return onset, sustain
Exemple #7
0
def compute_description(x,
                        M=WINDOW_SIZE,
                        N=FFT_SIZE,
                        H=HOP_SIZE,
                        fs=SR,
                        window_type=WINDOW_TYPE):
    '''
    Extract frame-wise descriptors and a pitch track from an audio signal.

    Parameters:
        x           : audio samples (converted with essentia.array)
        M           : frame / window size in samples
        N           : size passed to ess.Spectrum
        H           : hop size in samples
        fs          : sampling rate in Hz
        window_type : Essentia window type name

    Returns (HFC, CENTROID, ENERGY, F0, SALIENCE):
        HFC      : ess.HFC of each frame's magnitude spectrum
        CENTROID : ess.SpectralCentroidTime of each windowed frame
        ENERGY   : ess.Energy of each frame's magnitude spectrum
        F0       : first output of ess.PredominantPitchMelodia, cut to the
                   number of analysed frames
        SALIENCE : second output of ess.PredominantPitchMelodia, cut the
                   same way
    '''
    # create essentia instances
    x = essentia.array(x)
    spectrum = ess.Spectrum(size=N)
    window = ess.Windowing(size=M, type=window_type)
    hfc = ess.HFC(sampleRate=fs)
    spectralCentroid = ess.SpectralCentroidTime(sampleRate=fs)
    energy = ess.Energy()
    # The pitch extractor gets the same sample rate as the other
    # algorithms instead of silently using its default.
    pitch_extractor = ess.PredominantPitchMelodia(frameSize=M,
                                                  hopSize=H,
                                                  maxFrequency=1200,
                                                  sampleRate=fs)
    # init vectors
    HFC = []
    CENTROID = []
    ENERGY = []

    # compute features for every stft frame
    for frame in ess.FrameGenerator(x,
                                    frameSize=M,
                                    hopSize=H,
                                    startFromZero=True):
        wX = window(frame)  # window frame
        mX = spectrum(wX)  # magnitude spectrum
        HFC.append(hfc(mX))
        # spectral centroid computed from the time-domain frame
        CENTROID.append(spectralCentroid(wX))
        # energy of the magnitude spectrum
        ENERGY.append(energy(mX))

    # pitch is estimated on the whole signal, not frame by frame
    F0, SALIENCE = pitch_extractor(x)

    # convert into numpy matrices
    HFC = essentia.array(HFC)
    CENTROID = essentia.array(CENTROID)
    ENERGY = essentia.array(ENERGY)
    # cut the pitch track to the number of analysed frames
    F0 = essentia.array(F0)[:len(CENTROID)]
    SALIENCE = essentia.array(SALIENCE)[:len(CENTROID)]

    return HFC, CENTROID, ENERGY, F0, SALIENCE
Exemple #8
0
def extract_features(x,
                     M=WINDOW_SIZE,
                     N=FFT_SIZE,
                     H=HOP_SIZE,
                     fs=SR,
                     window_type=WINDOW_TYPE):
    '''
    Compute the power-law compressed magnitude spectrogram of x.

    x is cut into frames of M samples every H samples (starting at sample
    zero), each frame is windowed with window_type and turned into a
    magnitude spectrum; the magnitudes are raised to the power 2/3.
    N is the size passed to ess.Spectrum. fs is accepted but not used.

    Returns one row per frame.
    '''
    signal = essentia.array(x)
    to_spectrum = ess.Spectrum(size=N)
    apply_window = ess.Windowing(size=M, type=window_type)

    # STFT magnitudes, one entry per frame
    frames = ess.FrameGenerator(signal,
                                frameSize=M,
                                hopSize=H,
                                startFromZero=True)
    magnitudes = essentia.array(
        [to_spectrum(apply_window(frame)) for frame in frames])

    # power law compression
    return np.power(magnitudes, 2. / 3.)
def get_constantq(frames, sample_rate=16000, num_bands=64):
    '''
    Calculates constant-Q magnitude spectra of pre-cut signal frames

    Parameters:
    frames          : signal frames of equal length (the length of the
                      first frame is used for all of them)
    sample_rate     : audio sampling rate
    num_bands       : number of constant-Q bins

    Returns a numpy 2D array of absolute constant-Q values with one
    column per frame
    '''
    max_freq = 8000
    min_freq = 125
    num_octaves = np.log2(max_freq / min_freq)
    bins_per_octave = int(np.ceil(num_bands / num_octaves))

    frame_size = len(frames[0])

    # Frames are zero-padded up to a length that depends on the band
    # count; frames already that long are not padded.
    padded_sizes = {16: 512, 32: 2048}
    padding_size = max(0, padded_sizes.get(num_bands, 1024) - frame_size)

    windowing = es.Windowing(type='hann',
                             size=frame_size,
                             zeroPadding=padding_size)

    constantq_estimator = es.ConstantQ(binsPerOctave=bins_per_octave,
                                       minFrequency=min_freq,
                                       numberBins=num_bands,
                                       sampleRate=sample_rate)

    const_q_spectra = [
        np.abs(constantq_estimator(windowing(frame))) for frame in frames
    ]
    return np.array(const_q_spectra).T
Exemple #10
0
def getHPCPEssentia(XAudio, Fs, winSize, hopSize, squareRoot=False, NChromaBins=36, NHarmonics = 0):
    """
    Wrap around the essentia library to compute HPCP features
    :param XAudio: A flat array of raw audio samples
    :param Fs: Sample rate
    :param winSize: Window size of each STFT window
    :param hopSize: Hop size between STFT windows
    :param squareRoot: Do square root compression?
    :param NChromaBins: How many chroma bins (default 36)
    :param NHarmonics: Passed to HPCP as its ``harmonics`` parameter
    :returns H: An (NChromaBins x NWindows) matrix of all \
        chroma windows
    """
    from essentia import array
    import essentia.standard as ess
    spectrum = ess.Spectrum()
    window = ess.Windowing(size=winSize, type='hann')
    # Both algorithms map spectral bins to Hz, so they need the real
    # sample rate rather than their default.
    spectralPeaks = ess.SpectralPeaks(sampleRate=Fs)
    hpcp = ess.HPCP(size=NChromaBins, harmonics=NHarmonics, sampleRate=Fs)
    H = []
    for frame in ess.FrameGenerator(array(XAudio), frameSize=winSize, hopSize=hopSize, startFromZero=True):
        freqs, mags = spectralPeaks(spectrum(window(frame)))
        H.append(hpcp(freqs, mags))
    # One column per window
    H = np.array(H).T
    if squareRoot:
        H = sqrtCompress(H)
    return H
def extract_features(x,
                     M=Config.WINDOW_SIZE,
                     N=Config.FFT_SIZE,
                     H=Config.HOP_SIZE,
                     fs=Config.FS,
                     window_type=Config.WINDOW_TYPE):
    '''
    Compute a compressed, band-limited magnitude spectrogram of a signal
    -----------------------
    Input: samples, window size M (int), size N passed to ess.Spectrum
    (int), hop size H (int), sampling rate fs (accepted but not used),
    window type (Essentia window name)

    Output: one row per frame holding the magnitudes raised to the power
    2/3, limited to the first Config.FFT_SIZE / 4 + 1 bins
    -----------------------
    '''
    signal = essentia.array(x)
    to_spectrum = ess.Spectrum(size=N)
    apply_window = ess.Windowing(size=M, type=window_type)

    # STFT magnitudes, frames starting at sample zero
    frames = ess.FrameGenerator(signal,
                                frameSize=M,
                                hopSize=H,
                                startFromZero=True)
    magnitudes = essentia.array(
        [to_spectrum(apply_window(frame)) for frame in frames])

    # power law compression
    compressed = np.power(magnitudes, 2. / 3.)

    # NOTE(review): the cut uses Config.FFT_SIZE, not the N argument —
    # confirm this is intended when N is overridden.
    n_bins = int(Config.FFT_SIZE / 4 + 1)
    return compressed[:, :n_bins]
def analysisSynthesis(params, signal):
    """Run the signal through an STFT analysis / synthesis chain.

    Chain: frame cutter > windowing > FFT > IFFT > overlap-add, with no
    spectral modification in between. ``params`` must provide 'frameSize'
    and 'hopSize' (and whatever ``cutFrames`` reads).

    Returns the concatenated overlap-add output, starting from the frame
    whose index reaches frameSize / (2 * hopSize); the result begins with
    the initial ``array(0)`` value.
    """
    outsignal = array(0)
    frames = cutFrames(params, signal)

    window = std.Windowing(type="hann")
    frame_size = params['frameSize']
    hop_size = params['hopSize']
    fft = std.FFT(size=frame_size)
    ifft = std.IFFT(size=frame_size)
    overlap_add = std.OverlapAdd(frameSize=frame_size, hopSize=hop_size)

    # Frames before this index are processed but their output is dropped.
    first_kept = frame_size / (2 * hop_size)

    for index, frame in enumerate(frames):
        # STFT analysis
        spectrum = fft(window(frame))
        # (spectral transformations could be applied to `spectrum` here)

        # STFT synthesis; overlap-add runs on every frame, kept or not
        resynthesized = overlap_add(ifft(spectrum))

        if index >= first_kept:
            outsignal = numpy.append(outsignal, resynthesized)

    return outsignal
Exemple #13
0
def file_to_hpcp(loop):
    """Return the mean HPCP vector of an audio loop, scaled to a peak of 1.

    The samples are analysed in 1024-sample frames with a hop of 512:
    Blackman-Harris window, magnitude spectrum, up to 20 spectral peaks
    between 20 Hz and 8 kHz, then HPCP. The per-frame HPCP vectors are
    averaged and divided by their largest value. If that largest value is
    not positive (e.g. silent input) the mean is returned unscaled instead
    of dividing by zero.
    """
    loop = e.array(loop)

    windowing = es.Windowing(type='blackmanharris62')
    spectrum = es.Spectrum()
    spectral_peaks = es.SpectralPeaks(orderBy='magnitude',
                                      magnitudeThreshold=0.001,
                                      maxPeaks=20,
                                      minFrequency=20,
                                      maxFrequency=8000)
    hpcp = es.HPCP(maxFrequency=8000)

    hpcp_group = []
    for frame in es.FrameGenerator(loop, frameSize=1024, hopSize=512):
        frequencies, magnitudes = spectral_peaks(spectrum(windowing(frame)))
        hpcp_group.append(hpcp(frequencies, magnitudes))

    mean_hpcp = np.mean(np.array(hpcp_group).T, axis=1)

    # normalize to 1, unless there is nothing to scale
    peak = mean_hpcp.max()
    if peak > 0:
        mean_hpcp = mean_hpcp / peak

    return mean_hpcp
Exemple #14
0
def analysisSynthesis(params, signal):
    """Run the signal through an STFT analysis / synthesis chain.

    The signal is extended with frameSize // 2 zeros, cut into frames and
    passed through windowing > FFT > IFFT > overlap-add (gain
    1 / frameSize), with no spectral modification in between. ``params``
    must provide 'frameSize' and 'hopSize' (and whatever ``cutFrames``
    reads).

    Returns the concatenated overlap-add output, starting from the frame
    whose index reaches frameSize / (2 * hopSize); the result begins with
    the initial ``array(0)`` value.
    """
    outsignal = array(0)
    # Integer division: an array length must be an int, and "/" yields a
    # float on Python 3.
    signal = numpy.append(signal, zeros(params['frameSize'] // 2))

    frames = cutFrames(params, signal)

    w = std.Windowing(type="hann")
    fft = std.FFT(size=params['frameSize'])
    ifft = std.IFFT(size=params['frameSize'])
    overl = std.OverlapAdd(frameSize=params['frameSize'],
                           hopSize=params['hopSize'],
                           gain=1. / params['frameSize'])

    for counter, f in enumerate(frames):
        # STFT analysis
        infft = fft(w(f))
        # here we could apply spectral transformations
        outfft = infft

        # STFT synthesis; overlap-add runs on every frame, kept or not
        outframe = overl(ifft(outfft))

        if counter >= (params['frameSize'] / (2 * params['hopSize'])):
            outsignal = numpy.append(outsignal, outframe)

    return outsignal
def getMBE(audio):
    '''
    Mel band energy feature.

    Relies on the module-level names fs, highFrequencyBound, framesize and
    hopsize for the analysis settings.

    :param audio: audio samples accepted by ess.FrameGenerator
    :return: numpy array with one row of band values (first output of
             ess.MFCC) per frame
    '''
    fft_size = 2 * framesize  # each frame is zero-padded to twice its length

    # MFCC is used for its band output only; numberBands stays at the
    # Essentia default.
    mel_extractor = ess.MFCC(sampleRate=fs,
                             highFrequencyBound=highFrequencyBound,
                             inputSize=framesize + 1)
    to_spectrum = ess.Spectrum(size=fft_size)
    apply_window = ess.Windowing(type='hann',
                                 zeroPadding=fft_size - framesize)

    frames = ess.FrameGenerator(audio,
                                frameSize=framesize,
                                hopSize=hopsize)
    band_energies = [
        mel_extractor(to_spectrum(apply_window(frame)))[0]
        for frame in frames
    ]
    return np.array(band_energies)
def get_lpc(frames, sample_rate=16000, num_coeffs=32, window_type='hann'):
    '''
    Calculates linear prediction coefficients

    Parameters:
    frames          : overlapping signal frames for short-time analysis
                      (the length of the first frame is used for all)
    sample_rate     : audio sampling rate
    num_coeffs      : number of linear prediction coefficients
                      (the LPC order is num_coeffs - 1)
    window_type     : type of windowing function to apply

    Returns two numpy 2D arrays: LPCs and reflection coefficients,
    each with one column per frame
    '''
    frame_size = len(frames[0])
    lpc_coeffs = []
    reflection_coeffs = []

    lpc_estimator = es.LPC(sampleRate=sample_rate, order=num_coeffs - 1)
    # Honour the requested window instead of always using 'hann'.
    windowing = es.Windowing(type=window_type, size=frame_size)

    for frame in frames:
        # The windowed frame is scaled by 1000 before the LPC analysis.
        lpc, reflection = lpc_estimator(windowing(frame) * 1000)
        lpc_coeffs.append(lpc)
        reflection_coeffs.append(reflection)

    return np.array(lpc_coeffs).T, np.array(reflection_coeffs).T
def get_wavelet_envelopes(frames, level=5, window_type='hann'):
    '''
    Decomposes input audio with wavelet packets and calculates energy envelopes of their components

    Parameters:
    frames          : overlapping signal frames for short-time analysis
                      (the length of the first frame is used for all)
    level           : number of levels of wavelet decomposition, 2**level gives the final number of wavelet components
    window_type     : type of windowing function to apply

    Returns numpy 2D array with one row per wavelet component and one
    column per frame, holding the standard deviation of that component's
    data in that frame
    '''

    frame_size = len(frames[0])
    num_bands = 2**level
    # Honour the requested window instead of always using 'hann'.
    windowing = es.Windowing(type=window_type, size=frame_size)

    # Wavelet-packet node path of every component: the band index written
    # in binary over `level` digits, with 0 -> 'a' and 1 -> 'd'.
    # Computed once, not once per frame.
    band_keys = [
        bin(i).replace('0b', '').zfill(level).replace('0', 'a').replace(
            '1', 'd') for i in range(num_bands)
    ]
    envelopes = [[] for _ in band_keys]

    for frame in frames:
        wp = pywt.WaveletPacket(data=windowing(frame),
                                wavelet='db1',
                                mode='zero',
                                maxlevel=level)
        for envelope, band_key in zip(envelopes, band_keys):
            envelope.append(np.std(wp[band_key].data))

    return np.array(envelopes)
Exemple #18
0
def get_f0(audio, minf0=20, maxf0=22050, cf=0.9, ws=2048, hs=256):
    '''
    Estimate a frame-wise pitch track with PitchYinFFT.

    Args:
        audio (array): audio signal (output from MonoLoader)
        minf0 (int): minimum allowed frequency
        maxf0 (int): maximum allowed frequency
        cf (float): confidence threshold (0 - 1)
        ws (int): window size
        hs (int): hop size

    Returns:
        f0 (array): one pitch value per frame; frames whose confidence
            is below cf are set to 0
    '''
    # Essentia algorithms: Hann window zero-padded by ws samples,
    # magnitude spectrum, YinFFT pitch estimator
    window = es.Windowing(type='hann', zeroPadding=ws)
    spectrum = es.Spectrum()
    pitch_yin = es.PitchYinFFT(minFrequency=minf0,
                               maxFrequency=maxf0,
                               frameSize=ws)

    # (pitch, confidence) pair for every frame
    estimates = [
        pitch_yin(spectrum(window(frame)))
        for frame in es.FrameGenerator(audio, frameSize=ws, hopSize=hs)
    ]
    f0 = np.array([pitch for pitch, _ in estimates])
    conf = np.array([confidence for _, confidence in estimates])

    # zero out the estimates below the confidence threshold
    f0[conf < cf] = 0
    return f0
Exemple #19
0
def extract_features(x,
                     M=WINDOW_SIZE,
                     N=FFT_SIZE,
                     H=HOP_SIZE,
                     fs=SR,
                     window_type=WINDOW_TYPE):
    '''
    Compute the power-law compressed magnitude spectrogram of x.

    x is cut into frames of M samples every H samples (starting at sample
    zero), each frame is windowed with window_type and turned into a
    magnitude spectrum; the magnitudes are raised to the power 2/3.
    N is the size passed to ess.Spectrum. fs is accepted but not used.
    The upper spectrum is not cut.

    Returns one row per frame.
    '''
    # init functions and vectors
    x = essentia.array(x)
    spectrum = ess.Spectrum(size=N)
    # Use the window_type argument, not the module-level default, so that
    # callers can actually override the window.
    window = ess.Windowing(size=M, type=window_type)

    # compute STFT
    SP = [
        spectrum(window(frame))
        for frame in ess.FrameGenerator(x,
                                        frameSize=M,
                                        hopSize=H,
                                        startFromZero=True)
    ]

    SP = essentia.array(SP)
    SP = np.power(SP, 2. / 3.)  # power law compression

    return SP
Exemple #20
0
    def _extract_pitch_contours(self, audio):
        """Compute pitch contours of ``audio`` from its salience peaks.

        Per frame: windowing with zero padding, magnitude spectrum,
        spectral peaks, pitch salience function and its peaks. The
        per-frame salience peaks are then handed to PitchContours.

        Returns ``(contours_bins, contours_start_times, contour_saliences,
        duration)`` as produced by PitchContours (note the order: start
        times come before saliences).
        """
        # Hann window with x4 zero padding: 3 frame sizes of zeros are
        # appended, matching the spectrum size below.
        windowing = estd.Windowing(  # pylint: disable-msg=E1101
            zeroPadding=3 * self.frame_size)
        to_spectrum = estd.Spectrum(  # pylint: disable-msg=E1101
            size=self.frame_size * 4)
        find_peaks = estd.SpectralPeaks(  # pylint: disable-msg=E1101
            minFrequency=self.min_frequency,
            maxFrequency=self.max_frequency,
            magnitudeThreshold=self.magnitude_threshold,
            sampleRate=self.sample_rate,
            orderBy='magnitude')

        # convert unit to cents, PitchSalienceFunction takes 55 Hz as the
        # default reference
        salience_function = \
            estd.PitchSalienceFunction(  # pylint: disable-msg=E1101
                binResolution=self.bin_resolution)
        salience_peaks = \
            estd.PitchSalienceFunctionPeaks(  # pylint: disable-msg=E1101
                binResolution=self.bin_resolution,
                minFrequency=self.min_frequency,
                maxFrequency=self.max_frequency)
        track_contours = estd.PitchContours(  # pylint: disable-msg=E1101
            hopSize=self.hop_size,
            binResolution=self.bin_resolution,
            peakDistributionThreshold=self.peak_distribution_threshold)

        bins_key = 'allframes_salience_peaks_bins'
        saliences_key = 'allframes_salience_peaks_contourSaliences'

        # frame-wise salience peaks
        pool = Pool()
        frames = estd.FrameGenerator(
            audio,  # pylint: disable-msg=E1101
            frameSize=self.frame_size,
            hopSize=self.hop_size)
        for frame in frames:
            magnitudes = to_spectrum(windowing(frame))
            peak_frequencies, peak_magnitudes = find_peaks(magnitudes)
            salience = salience_function(peak_frequencies, peak_magnitudes)
            peak_bins, peak_saliences = salience_peaks(salience)

            # frames without peaks are stored as a single zero entry
            if np.size(peak_bins) == 0:
                peak_bins = np.array([0])
            if np.size(peak_saliences) == 0:
                peak_saliences = np.array([0])

            pool.add(bins_key, peak_bins)
            pool.add(saliences_key, peak_saliences)

        # post-processing: contour tracking on plain lists
        all_bins = [f.tolist() for f in pool[bins_key]]
        all_saliences = [f.tolist() for f in pool[saliences_key]]
        contours_bins, contour_saliences, contours_start_times, duration = \
            track_contours(all_bins, all_saliences)
        return contours_bins, contours_start_times, contour_saliences, duration
Exemple #21
0
def extractor(filename):
    """Compute HTK-style MFCCs of an audio file and display them.

    The file is loaded as mono at 44100 Hz, scaled by 2**15, analysed in
    1102-sample Hamming-windowed frames every 441 samples with a
    2048-point spectrum, and the MFCCs (without the first coefficient)
    are shown as an image. Returns nothing.
    """
    sample_rate = 44100
    # dynamic range expansion as done in HTK implementation
    audio = ess.MonoLoader(filename=filename, sampleRate=sample_rate)() * 2**15

    frame_size = 1102  # corresponds to htk default WINDOWSIZE = 250000.0
    hop_size = 441  # corresponds to htk default TARGETRATE = 100000.0
    fft_size = 2048

    windowing = ess.Windowing(
        type='hamming',  # corresponds to htk default USEHAMMING = T
        size=frame_size,
        zeroPadding=fft_size - frame_size,
        normalized=False,
        zeroPhase=False)

    to_spectrum = ess.Spectrum(size=fft_size)

    htk_mfcc = ess.MFCC(
        inputSize=fft_size // 2 + 1,
        type='magnitude',  # htk uses mel filterbank magnitude
        warpingFormula='htkMel',  # htk's mel warping formula
        weighting='linear',  # computation of filter weights done in Hz domain
        highFrequencyBound=8000,  # corresponds to htk default
        lowFrequencyBound=0,  # corresponds to htk default
        numberBands=26,  # corresponds to htk default NUMCHANS = 26
        numberCoefficients=13,
        normalize='unit_max',  # htk filter normalization, constant height = 1
        dctType=3,  # htk uses DCT type III
        logType='log',
        liftering=22)  # corresponds to htk default CEPLIFTER = 22

    # startFromZero = True, validFrameThresholdRatio = 1 : the way htk
    # computes windows
    frames = ess.FrameGenerator(audio,
                                frameSize=frame_size,
                                hopSize=hop_size,
                                startFromZero=True,
                                validFrameThresholdRatio=1)
    # keep the coefficients (second output) of every frame
    coefficients = [htk_mfcc(to_spectrum(windowing(frame)))[1]
                    for frame in frames]

    # the list is converted to an essentia.array first, then transposed
    # so that each column is one frame
    mfccs = essentia.array(coefficients).T

    # plot, ignoring the first coefficient (energy)
    plt.imshow(mfccs[1:, :], aspect='auto', interpolation='none')
    plt.show()  # unnecessary if you started "ipython --pylab"
Exemple #22
0
    def essentiaObjectInit(self):
        """Create the Essentia algorithms stored on this instance.

        Sets ``self.MFCC80``, ``self.SPECTRUM`` and ``self.WINDOW`` from
        ``self.fs``, ``self.highFrequencyBound``, ``self.frameSize`` and
        ``self.numberBands``.
        """
        # Each frame is zero-padded by one frame size, so the FFT is twice
        # the frame length and the spectrum has frameSize + 1 bins.
        fft_size = 2 * self.frameSize
        padding = fft_size - self.frameSize

        self.MFCC80 = ess.MFCC(
            sampleRate=self.fs,
            highFrequencyBound=self.highFrequencyBound,
            inputSize=self.frameSize + 1,
            numberBands=self.numberBands)
        self.SPECTRUM = ess.Spectrum(size=fft_size)
        self.WINDOW = ess.Windowing(type='hann', zeroPadding=padding)
Exemple #23
0
def gen_frames(filepath):
    """Cuts audio into many windowed frames.

    Loads ``filepath`` as mono audio and returns a numpy array with one
    windowed frame per row. The sample rate, frame length, hop length and
    window type come from the module-level names ``sample_rate``,
    ``samples_per_frame``, ``hop_length`` and ``window_type``.
    """
    # Convert file to mono raw audio
    audio = es.MonoLoader(filename=filepath, sampleRate=sample_rate)()

    # One windowing algorithm is reused for all frames instead of building
    # a new one per frame.
    windowing = es.Windowing(size=samples_per_frame, type=window_type)

    # Cut audio into frames and window each of them
    frame_gen = es.FrameGenerator(audio, frameSize=samples_per_frame, hopSize=hop_length)
    return np.array([windowing(frame) for frame in frame_gen])
Exemple #24
0
 def __init__(self, frame_size, hop_size, window_type, feature,
         beats, sample_rate):
     """STFTFeature constructor.

     Builds the Essentia windowing (of type ``window_type``) and spectrum
     algorithms and stores the given settings on the instance unchanged.
     """
     # Essentia algorithms
     self.w = ES.Windowing(type=window_type)
     self.spectrum = ES.Spectrum()

     # analysis settings
     self.frame_size = frame_size
     self.hop_size = hop_size
     self.window_type = window_type
     self.sample_rate = sample_rate

     # feature description
     self.feature = feature  # Essentia feature object
     self.beats = beats
Exemple #25
0
    def mfcc_htk(self, window_length=22050, nmfcc=13, n_mels=26, fmax=8000, lifterexp=22):
        """
        Get MFCCs 'the HTK way' with the help of Essentia
        https://github.com/MTG/essentia/blob/master/src/examples/tutorial/example_mfcc_the_htk_way.py
        Uses the parameters from that example except for the hop length
        (taken from self.hop_length) and a much longer default window.
        The audio is read from self.audio_vector.

        Parameters
        ----------
        window_length: int
            Length of the window to use for the STFT
        nmfcc: int
            Number of MFCC coefficients to compute
        n_mels: int
            Number of frequency bands to use
        fmax: int
            Maximum frequency
        lifterexp: int
            Liftering parameter passed to the MFCC algorithm

        Returns
        -------
        ndarray(nmfcc, nframes)
            An array of all of the MFCC frames (float32)
        """
        # FFT length: the window length rounded up to a power of two
        fft_length = int(2**(np.ceil(np.log(window_length)/np.log(2))))

        windowing = estd.Windowing(
            type='hamming',  # corresponds to htk default USEHAMMING = T
            size=window_length,
            zeroPadding=fft_length - window_length,
            normalized=False,
            zeroPhase=False)

        to_spectrum = estd.Spectrum(size=fft_length)

        htk_mfcc = estd.MFCC(
            inputSize=fft_length//2+1,
            type='magnitude',  # htk uses mel filterbank magnitude
            warpingFormula='htkMel',  # htk's mel warping formula
            weighting='linear',  # filter weights computed in the Hz domain
            highFrequencyBound=fmax,  # 8000 is htk default
            lowFrequencyBound=0,  # corresponds to htk default
            numberBands=n_mels,  # htk default NUMCHANS = 26
            numberCoefficients=nmfcc,
            normalize='unit_max',  # htk filters have constant height = 1
            dctType=3,  # htk uses DCT type III
            logType='log',
            liftering=lifterexp)  # htk default CEPLIFTER = 22

        # startFromZero = True, validFrameThresholdRatio = 1 : the way htk
        # computes windows
        frames = estd.FrameGenerator(self.audio_vector,
                                     frameSize=window_length,
                                     hopSize=self.hop_length,
                                     startFromZero=True,
                                     validFrameThresholdRatio=1)
        # keep the coefficients (second output) of every frame
        coefficients = [htk_mfcc(to_spectrum(windowing(frame)))[1]
                        for frame in frames]

        return np.array(coefficients, dtype=np.float32).T
def melspectrogram(audio,
                   sampleRate=44100,
                   frameSize=2048,
                   hopSize=1024,
                   window='blackmanharris62',
                   zeroPadding=0,
                   center=True,
                   numberBands=[128, 96, 48, 32, 24, 16, 8],
                   lowFrequencyBound=0,
                   highFrequencyBound=None,
                   weighting='linear',
                   warpingFormula='slaneyMel',
                   normalize='unit_tri'):
    """Compute mel-band frames of ``audio`` at several band resolutions.

    Each frame is windowed, converted to a spectrum and fed to one
    ``es.MelBands`` filterbank (``type='power'``) per entry of
    ``numberBands``. For every resolution ``n`` two compressed versions of
    the mel frame are appended to the returned pool:

    * ``mel_<n>_db``        -- output of the ``lin2db`` operator (scale=2)
    * ``mel_<n>_log1+10kx`` -- ``log10`` applied after the ``identity``
      operator configured with ``shift=1, scale=10000`` (presumably
      ``log10(1 + 10000 * x)``, as the key name suggests)

    Args:
        audio: signal handed to ``es.FrameGenerator``.
        sampleRate: sample rate given to the mel filterbanks; also used to
            derive the default ``highFrequencyBound``.
        frameSize: frame length for the frame generator.
        hopSize: hop length for the frame generator.
        window: window type passed to ``es.Windowing``.
        zeroPadding: zero padding applied by the windowing step; the
            filterbank input size is ``(frameSize + zeroPadding) // 2 + 1``.
        center: frames are generated with ``startFromZero=not center``.
        numberBands: iterable of mel band counts (only read, never mutated).
        lowFrequencyBound: lower edge of the mel filterbanks.
        highFrequencyBound: upper edge of the mel filterbanks; ``None``
            means ``sampleRate / 2``.
        weighting, warpingFormula, normalize: forwarded to ``es.MelBands``.

    Returns:
        ``essentia.Pool`` holding the keys described above.
    """
    upper_bound = (sampleRate / 2
                   if highFrequencyBound is None else highFrequencyBound)
    spectrum_bins = (frameSize + zeroPadding) // 2 + 1

    apply_window = es.Windowing(type=window,
                                normalized=False,
                                zeroPadding=zeroPadding)
    to_spectrum = es.Spectrum()

    # One filterbank per requested resolution, keyed by its band count.
    filterbanks = {
        n_bands: es.MelBands(numberBands=n_bands,
                             sampleRate=sampleRate,
                             lowFrequencyBound=lowFrequencyBound,
                             highFrequencyBound=upper_bound,
                             inputSize=spectrum_bins,
                             weighting=weighting,
                             normalize=normalize,
                             warpingFormula=warpingFormula,
                             type='power')
        for n_bands in numberBands
    }

    # Compression operators shared by all resolutions.
    shift_and_scale = es.UnaryOperator(type='identity', shift=1, scale=10000)
    take_log10 = es.UnaryOperator(type='log10')
    to_db = es.UnaryOperator(type='lin2db', scale=2)

    pool = essentia.Pool()
    frames = es.FrameGenerator(audio,
                               frameSize=frameSize,
                               hopSize=hopSize,
                               startFromZero=not center)
    for frame in frames:
        magnitudes = to_spectrum(apply_window(frame))
        for n_bands in numberBands:
            key_prefix = 'mel_' + str(n_bands)
            mel_frame = filterbanks[n_bands](magnitudes)
            pool.add(key_prefix + '_db', to_db(mel_frame))
            pool.add(key_prefix + '_log1+10kx',
                     take_log10(shift_and_scale(mel_frame)))

    return pool
Example #27
0
def extractor(filename):
    """Extract MFCCs from an audio file using HTK-like settings.

    The file is loaded as mono at 44100 Hz, scaled by ``2**15`` and cut into
    frames; each frame is Hamming-windowed, zero-padded to the FFT size and
    passed through ``ess.Spectrum`` and ``ess.MFCC``.

    :param filename: path of the audio file given to ``ess.MonoLoader``
    :return: list with one MFCC coefficient vector per frame
    """
    sample_rate = 44100
    # dynamic range expansion as done in HTK implementation
    signal = ess.MonoLoader(filename=filename,
                            sampleRate=sample_rate)() * 2**15

    frame_len = 1102  # corresponds to htk default WINDOWSIZE = 250000.0
    hop_len = 441  # corresponds to htk default TARGETRATE = 100000.0
    fft_len = 2048

    hamming = ess.Windowing(
        type='hamming',  # corresponds to htk default USEHAMMING = T
        size=frame_len,
        zeroPadding=fft_len - frame_len,  # pad each frame up to the FFT size
        normalized=False,
        zeroPhase=False)

    to_spectrum = ess.Spectrum(size=fft_len)

    htk_mfcc = ess.MFCC(
        inputSize=fft_len // 2 + 1,  # number of bins in the spectrum
        type='magnitude',  # htk uses mel filterbank magnitude
        warpingFormula='htkMel',  # htk's mel warping formula
        weighting='linear',  # computation of filter weights done in Hz domain
        highFrequencyBound=8000,  # corresponds to htk default
        lowFrequencyBound=0,  # corresponds to htk default
        numberBands=26,  # corresponds to htk default NUMCHANS = 26
        numberCoefficients=13,
        normalize='unit_max',  # htk filter normalization: constant height = 1
        dctType=3,  # htk uses DCT type III
        logType='log',
        liftering=22)  # corresponds to htk default CEPLIFTER = 22

    # startFromZero = True, validFrameThresholdRatio = 1 : the way htk computes windows
    frames = ess.FrameGenerator(signal,
                                frameSize=frame_len,
                                hopSize=hop_len,
                                startFromZero=True,
                                validFrameThresholdRatio=1)

    # The MFCC algorithm yields (mel bands, coefficients); keep the latter.
    return [htk_mfcc(to_spectrum(hamming(frame)))[1] for frame in frames]
def getFeature(audio, d=True, nbf=False):
    '''
    Frame-wise MFCC features of the given audio.

    Uses ``fs``, ``framesize``, ``hopsize`` and ``highFrequencyBound``, which
    are not defined in this function and must exist in the enclosing scope.

    :param audio: signal handed to ess.FrameGenerator
    :param d: if true, stack the MFCCs with ``Fdeltas`` of the MFCCs and
              ``Fdeltas`` of those deltas (both with w=5)
    :param nbf: only used when ``d`` is false; if true, stack the MFCCs with
                ``Fprev_sub`` outputs for w = -5..-1 and 1..5 and cast the
                result to float32
    :return: 2-D array with one row per frame
    '''
    fft_size = 2 * framesize  # padding 1 time framesize
    apply_window = ess.Windowing(type='hann',
                                 zeroPadding=fft_size - framesize)
    to_spectrum = ess.Spectrum(size=fft_size)

    # this MFCC is for pattern classification, numberBands is left at its default
    mfcc_algo = ess.MFCC(sampleRate=fs,
                         highFrequencyBound=highFrequencyBound,
                         inputSize=framesize + 1)

    frames = ess.FrameGenerator(audio, frameSize=framesize, hopSize=hopsize)
    # The MFCC algorithm yields (bands, coefficients); keep the coefficients.
    coeffs = [mfcc_algo(to_spectrum(apply_window(frame)))[1]
              for frame in frames]

    if d:
        # Work on a (coefficients, frames) layout, return (frames, features).
        static = np.array(coeffs).transpose()
        delta = Fdeltas(static, w=5)
        delta_delta = Fdeltas(delta, w=5)
        return np.transpose(np.vstack((static, delta, delta_delta)))

    if not nbf:
        return np.array(coeffs)

    static = np.array(coeffs).transpose()
    stacked = np.array(static, copy=True)
    for shift in range(1, 6):
        shifted_right = Fprev_sub(static, w=shift)
        shifted_left = Fprev_sub(static, w=-shift)
        stacked = np.vstack((stacked, shifted_left, shifted_right))
    return np.array(np.transpose(stacked), dtype='float32')
def FeatureExtraction_Recording(recording, params):
    """Compute frame-wise HPCP vectors of a recording and their statistics.

    The audio at ``recording.path`` is loaded as mono at ``params.fs``, passed
    through DC removal and an equal-loudness filter, and analysed frame by
    frame (window -> spectrum -> spectral peaks -> HPCP). Nothing is
    returned; the results are stored on ``recording``:

    * ``chroma_framebased`` -- array with one HPCP vector per frame
    * ``chroma_mean``       -- list with the mean of each HPCP bin
    * ``chroma_std``        -- list with the standard deviation of each bin

    :param recording: object providing ``path`` and ``tonic``; receives the
                      three attributes listed above
    :param params: object providing ``numbins``, ``fs``, ``windowSize`` and
                   ``hopSize`` (both in milliseconds) and ``windowFunction``
    """
    numBins = params.numbins
    fs = params.fs

    # LOAD audio file and pre-process it
    # NOTE(review): DCRemoval, EqualLoudness, SpectralPeaks and HPCP are used
    # with their default sample rate -- confirm this is intended when
    # params.fs differs from that default.
    Audio = ess.MonoLoader(filename=recording.path, sampleRate=fs)()
    Audio = ess.DCRemoval()(Audio)  # PREPROCESSING / DC removal
    Audio = ess.EqualLoudness()(Audio)  # PREPROCESSING - Equal Loudness Filter

    # Windowing parameters (first converting from msec to number of samples)
    # assuring windowSize and hopSize are even
    windowSize = round(fs * params.windowSize / 1000)
    windowSize = int(windowSize / 2) * 2
    hopSize = round(fs * params.hopSize / 1000)
    hopSize = int(hopSize / 2) * 2

    tonic = float(recording.tonic)

    # Configure every algorithm once instead of rebuilding it for each frame;
    # none of their parameters changes inside the loop.
    window = ess.Windowing(size=windowSize, type=params.windowFunction)
    spectrum = ess.Spectrum(size=windowSize)
    spectral_peaks = ess.SpectralPeaks()
    hpcp_algo = ess.HPCP(normalized='unitSum',
                         referenceFrequency=tonic,
                         size=numBins,
                         # float literal: must not truncate to 0 under
                         # integer division when numBins > 12
                         windowSize=12.0 / numBins)
    eps = np.finfo(float).eps

    # FRAME-BASED spectral analysis
    hpcp = []
    for frame in ess.FrameGenerator(Audio,
                                    frameSize=windowSize,
                                    hopSize=hopSize,
                                    startFromZero=True):
        mX = spectrum(window(frame))
        mX[mX < eps] = eps  # floor the magnitudes at machine epsilon
        # EXTRACT frequency and magnitude information of the spectral peaks
        freq, mag = spectral_peaks(mX)
        # harmonic pitch-class profile of this frame
        hpcp.append(hpcp_algo(freq, mag))
    recording.chroma_framebased = np.array(hpcp)

    # FEATURE SUMMARIZATION: global mean / standard deviation of every bin
    chroma = recording.chroma_framebased
    mean_chroma = []
    std_chroma = []
    for j in range(numBins):
        bin_values = [frame_hpcp[j] for frame_hpcp in chroma]
        mean_chroma.append(np.mean(bin_values))
        std_chroma.append(np.std(bin_values))
    recording.chroma_mean = mean_chroma
    recording.chroma_std = std_chroma
def analyze_misc(filename, segment_duration=20):
    """Compute miscellaneous descriptors on the central segment of a file.

    Replay gain and duration are computed on the entire file; the segment of
    ``segment_duration`` seconds centered in time is then reloaded with that
    replay gain applied and analysed frame by frame. Momentary EBU R128
    loudness is computed on the same segment of the stereo signal.

    :param filename: path of the audio file
    :param segment_duration: length in seconds of the analysed segment
    :return: ``essentia.Pool`` with the per-frame keys ``rms``,
             ``rms_spectrum``, ``hfc``, ``spectral_centroid`` and ``zcr``,
             plus ``ebu_momentary``
    :raises ValueError: if ``segment_duration`` exceeds the audio duration
    """
    # Sample rate assumed by every duration / hop computation below.
    # NOTE(review): MonoLoader and EasyLoader are used with their default
    # sample rate, presumably this same value -- confirm.
    sample_rate = 44100
    frame_size = 2048
    hop_size = 1024

    # Compute replay gain and duration on the entire file, then load the
    # segment that is centered in time with replaygain applied
    audio = es.MonoLoader(filename=filename)()
    replaygain = es.ReplayGain()(audio)

    duration = len(audio) / sample_rate
    segment_start = (duration - segment_duration) / 2
    segment_end = segment_start + segment_duration

    if segment_start < 0 or segment_end > duration:
        raise ValueError(
            'Segment duration is larger than the input audio duration')

    loader = es.EasyLoader(filename=filename,
                           replayGain=replaygain,
                           startTime=segment_start,
                           endTime=segment_end)

    windowing = es.Windowing(type='blackmanharris62')
    spectrum = es.Spectrum()
    centroid = es.Centroid()
    zcr = es.ZeroCrossingRate()
    rms = es.RMS()
    hfc = es.HFC()
    pool = essentia.Pool()

    audio = loader()
    for frame in es.FrameGenerator(audio,
                                   frameSize=frame_size,
                                   hopSize=hop_size):
        frame_spectrum = spectrum(windowing(frame))
        pool.add('rms', rms(frame))
        pool.add('rms_spectrum', rms(frame_spectrum))
        pool.add('hfc', hfc(frame_spectrum))
        pool.add('spectral_centroid', centroid(frame_spectrum))
        pool.add('zcr', zcr(frame))

    audio_st, sr, _, _, _, _ = es.AudioLoader(filename=filename)()
    # Ugly hack because we don't have a StereoResample: split the channels,
    # resample each one, then interleave them again.
    left, right = es.StereoDemuxer()(audio_st)
    resampler = es.Resample(inputSampleRate=sr, outputSampleRate=sample_rate)
    left = resampler(left)
    right = resampler(right)
    audio_st = es.StereoMuxer()(left, right)
    audio_st = es.StereoTrimmer(startTime=segment_start,
                                endTime=segment_end)(audio_st)
    # LoudnessEBUR128 takes its hop size in seconds.
    ebu_momentary, _, _, _ = es.LoudnessEBUR128(
        hopSize=hop_size / sample_rate, startAtZero=True)(audio_st)
    pool.set('ebu_momentary', ebu_momentary)

    return pool