Ejemplo n.º 1
0
def get_f0(audio, minf0=20, maxf0=22050, cf=0.9, ws=2048, hs=256):
    '''
    Estimate a frame-wise pitch track with Essentia's PitchYinFFT.

    Each frame is Hann-windowed (zero-padded by `ws` samples), turned into
    a spectrum and passed to PitchYinFFT.

    Args:
        audio (array): audio signal (output from MonoLoader)
        minf0 (int): minimum allowed frequency
        maxf0 (int): maximum allowed frequency
        cf (float): confidence threshold (0 - 1)
        ws (int): window size
        hs (int): hop size

    Returns:
        f0 (array): one pitch estimate per frame; estimates whose
            confidence is below `cf` are replaced by 0
    '''
    # instantiate Essentia functions
    windowing = es.Windowing(type='hann', zeroPadding=ws)
    spectrum = es.Spectrum()
    pitch_yin = es.PitchYinFFT(minFrequency=minf0, maxFrequency=maxf0, frameSize=ws)

    # one (pitch, confidence) pair per analysis frame
    estimates = [
        pitch_yin(spectrum(windowing(frame)))
        for frame in es.FrameGenerator(audio, frameSize=ws, hopSize=hs)
    ]

    # split the pairs into two parallel arrays
    pitches = np.array([pitch for pitch, _ in estimates])
    confidences = np.array([confidence for _, confidence in estimates])

    # zero out every estimate that is not confident enough
    pitches[confidences < cf] = 0
    return pitches
Ejemplo n.º 2
0
def segment(audio, hopSize, frameSize, rms_onset_threshold,
            mel_onset_threshold, flux_onset_threshold, onset_threshold):
    """Find the first onset frame and the frame where its sustain starts.

    The signal is analysed frame by frame. The first frame satisfying the
    onset condition is reported as the onset; the first frame at or after
    it satisfying the sustain condition is reported as the sustain start.

    :param audio: audio signal handed to ``FrameGenerator``
    :param hopSize: hop size passed to ``FrameGenerator``
    :param frameSize: frame size passed to ``FrameGenerator`` and ``PitchYinFFT``
    :param rms_onset_threshold: value the 'rms' onset function must exceed
    :param mel_onset_threshold: value the 'melflux' onset function may exceed
    :param flux_onset_threshold: value the 'flux' onset function may exceed
    :param onset_threshold: value the frame loudness must exceed
    :return: tuple ``(onset, sustain)`` of frame indices; each is ``None``
        when the corresponding condition was never met
    """
    # init algorithms
    o_mel = estd.OnsetDetection(method='melflux')
    o_rms = estd.OnsetDetection(method='rms')
    o_flux = estd.OnsetDetection(method='flux')
    fft = estd.FFT()
    c2p = estd.CartesianToPolar()
    frame_generator = estd.FrameGenerator(audio,
                                          frameSize=frameSize,
                                          hopSize=hopSize)
    w = estd.Windowing(type='hann')
    yin = estd.PitchYinFFT(frameSize=frameSize,
                           minFrequency=40,
                           maxFrequency=2500,
                           interpolate=True)
    spectrum = estd.Spectrum()
    loudness = estd.Loudness()

    # control parameters
    attack = False
    detection = True
    mel_onset_value = 0
    rms_onset_value = 0

    # output variables
    onset = None
    sustain = None

    for index, frame in enumerate(frame_generator):
        # window once and reuse it for both the complex FFT and the
        # magnitude spectrum
        windowed = w(frame)
        mag, phase = c2p(fft(windowed))
        _, conf = yin(spectrum(windowed))
        loud = loudness(frame)
        mel_onset = o_mel(mag, phase)
        rms_onset = o_rms(mag, phase)
        flux_onset = o_flux(mag, phase)

        # condition for onset: spectral change (flux OR mel flux) together
        # with enough RMS change and enough loudness
        if detection and (flux_onset > flux_onset_threshold or mel_onset > mel_onset_threshold) \
                and rms_onset > rms_onset_threshold and loud > onset_threshold:
            onset = index
            attack = True
            detection = False
            # remember the detection-function levels at the onset; the
            # sustain test below is relative to them
            mel_onset_value = mel_onset
            rms_onset_value = rms_onset
        # condition for beginning of sustain: confident pitch and both
        # detection functions well below their value at the onset
        if attack and conf > 0.5 and rms_onset < rms_onset_value * .05 and mel_onset < mel_onset_value * .3:
            sustain = index
            # both outputs are final from here on (detection never
            # re-arms), so the remaining frames need not be analysed
            break
    return onset, sustain
def feature_extractor_standard(audio_in, frameSize, hopSize, aggLen):
    """Extract frame features and aggregate them over blocks of `aggLen` frames.

    Per frame the MFCC coefficients, spectral centroid, spectral flatness
    and PitchYinFFT confidence are computed. Consecutive, non-overlapping
    blocks of `aggLen` frames are then summarised by mean and variance.

    :param audio_in: audio signal handed to ``FrameGenerator``
    :param frameSize: frame size passed to ``FrameGenerator``
    :param hopSize: hop size passed to ``FrameGenerator``
    :param aggLen: number of frames per aggregation block
    :return: 2-D array with one row per block; columns are, in order:
        MFCC means, MFCC variances, centroid mean, centroid variance,
        flatness mean, flatness variance, pitch-confidence mean,
        pitch-confidence variance
    """
    # creating algorithm objects
    win = es.Windowing()
    spec = es.Spectrum()
    centroid = es.Centroid()
    flatness = es.Flatness()
    mfcc = es.MFCC(lowFrequencyBound=40)
    pitchYin = es.PitchYinFFT()

    # Compute features frame by frame
    mfcc_ftrsArray = []
    sCentroidArray = []
    sFlatnessArray = []
    pConfArray = []

    for frame in es.FrameGenerator(audio_in, frameSize=frameSize, hopSize=hopSize):
        spectrum = spec(win(frame))
        _, mfcc_ftrs = mfcc(spectrum)  # mel band energies are not used
        _, pitchConf = pitchYin(spectrum)  # only the confidence is kept

        mfcc_ftrsArray.append(mfcc_ftrs)
        sCentroidArray.append(centroid(spectrum))
        sFlatnessArray.append(flatness(spectrum))
        pConfArray.append(pitchConf)

    # `range` instead of the Python-2-only `xrange`, so this runs on both
    # Python 2 and Python 3.
    # NOTE(review): the stop value len - aggLen drops the last block even
    # when the frame count is an exact multiple of aggLen; kept as-is so
    # the output shape does not change -- confirm whether it is intended.
    block_starts = range(0, len(mfcc_ftrsArray) - aggLen, aggLen)

    def _per_block(values, stat, **kwargs):
        # Apply `stat` to every block of `aggLen` consecutive frame values.
        return [stat(values[start:start + aggLen], **kwargs) for start in block_starts]

    def _as_column(values):
        # Turn a list of per-block scalars into an (n_blocks, 1) column.
        return np.transpose(np.array(values, ndmin=2))

    meanMFCC = _per_block(mfcc_ftrsArray, np.mean, axis=0)
    varMFCC = _per_block(mfcc_ftrsArray, np.var, axis=0)

    columns = [np.array(meanMFCC), np.array(varMFCC)]
    for scalar_feature in (sCentroidArray, sFlatnessArray, pConfArray):
        columns.append(_as_column(_per_block(scalar_feature, np.mean)))
        columns.append(_as_column(_per_block(scalar_feature, np.var)))

    return np.concatenate(columns, axis=1)
Ejemplo n.º 4
0
def f0Yin(x, N, H, minf0, maxf0):
    """Fundamental frequency detection using the Yin (FFT) algorithm.

    x: input sound, N: window size, H: hop size,
    minf0: minimum f0 frequency in Hz, maxf0: maximum f0 frequency in Hz.

    Frames start at 0, H, 2H, ... for as long as the start index is
    strictly below ``x.size - N``.

    returns f0: float array with one pitch estimate per analysed frame
    (an empty array when no frame is analysed)
    """
    spectrum = ess.Spectrum(size=N)
    window = ess.Windowing(size=N, type='hann')
    pitchYin = ess.PitchYinFFT(minFrequency=minf0, maxFrequency=maxf0)
    pin = 0
    pend = x.size - N
    f0 = []

    while pin < pend:
        mX = spectrum(window(x[pin:pin + N]))
        # pitchYin returns (pitch, confidence); only the pitch is kept.
        # Appending to a list avoids re-allocating the whole array on
        # every frame, which np.append does.
        f0.append(pitchYin(mX)[0])
        pin += H
    # float64, the dtype np.append([], value) used to produce; also makes
    # the "no frames" case return an array instead of a plain list
    return np.array(f0, dtype=float)
def pitchProcessing_audio(audio):
    """Track pitch over `audio` and return the frames kept by the filter.

    Frames of ``framesize`` samples (hop ``hopsize``) are windowed with
    'blackmanharris62', zero-padded to twice their length and analysed
    with PitchYinFFT. The pitch and confidence tracks are then handed to
    ``discardFrameByConfidence`` together with the limits 65, 1000 and
    0.85, and its result is returned unchanged.

    ``framesize``, ``hopsize`` and ``fs`` are read from module scope.
    """
    padded_size = 2 * framesize  # padding 1 time framesize
    to_spectrum = ess.Spectrum(size=padded_size)
    apply_window = ess.Windowing(type='blackmanharris62',
                                 zeroPadding=padded_size - framesize)
    detect_pitch = ess.PitchYinFFT(frameSize=padded_size, sampleRate=fs)

    pitch_track = []
    confidence_track = []
    frame_iter = ess.FrameGenerator(audio, frameSize=framesize, hopSize=hopsize)
    for raw_frame in frame_iter:
        magnitude_spectrum = to_spectrum(apply_window(raw_frame))
        frame_pitch, frame_confidence = detect_pitch(magnitude_spectrum)
        pitch_track.append(frame_pitch)
        confidence_track.append(frame_confidence)

    # discard pitch below 65, higher than 1000 Hz, confidence below 0.85
    return discardFrameByConfidence(pitch_track, confidence_track, 65, 1000, 0.85)
Ejemplo n.º 6
0
def _midi_pitch_histogram(pitches):
    """Return a normalised 128-bin MIDI-note histogram of `pitches` (in Hz).

    Frequencies outside (0, 12600] Hz are counted as "unknown"; that count
    is stored in bin 0, replacing whatever was counted there.
    """
    from math import log
    from numpy import bincount

    # convert from Hz to midi notes
    midipitches = []
    unknown = 0
    for freq in pitches:
        midi = -1
        if freq > 0. and freq <= 12600:
            # bincount only accepts integers; truncate explicitly (this is
            # what the implicit float->int cast of old NumPy versions did)
            midi = int(12 * (log(freq / 6.875) / 0.69314718055995) - 3.)
        if midi >= 0:
            midipitches.append(midi)
        else:
            # out of range, or below MIDI note 0 (bincount rejects negatives)
            unknown += 1

    if not midipitches:
        histogram = [0.] * 128
        histogram[0] = 1.
        return histogram

    # compute histogram
    counts = bincount(midipitches)
    # set 0 midi pitch to be the number of pruned value
    counts[0] = unknown
    # normalise (the total is computed once, not once per bin)
    total = float(sum(counts))
    histogram = [val / total for val in counts]
    # zero pad
    histogram.extend([0.0] * (128 - len(histogram)))
    return histogram


def compute(audio, pool, options):
    """Compute frame-wise low-level descriptors of `audio` and add them to `pool`.

    Descriptors are stored under ``namespace + '.' + <name>`` (``namespace``
    is read from module scope). After the frame loop a MIDI pitch histogram
    and its spread are added as well.

    :param audio: audio signal handed to ``FrameGenerator``
    :param pool: pool object receiving the descriptors through ``pool.add``
    :param options: mapping providing 'sampleRate', 'frameSize', 'hopSize',
        'windowType' and 'skipSilence'
    :raises ess.EssentiaError: when no frame produced a 'zerocrossingrate'
        value, i.e. every frame was skipped as silent
    """
    # analysis parameters
    sampleRate = options['sampleRate']
    frameSize = options['frameSize']
    hopSize = options['hopSize']
    windowType = options['windowType']

    # temporal descriptors
    lpc = ess.LPC(order=10, type='warped', sampleRate=sampleRate)
    zerocrossingrate = ess.ZeroCrossingRate()

    # frame algorithms
    frames = ess.FrameGenerator(audio=audio, frameSize=frameSize, hopSize=hopSize)
    window = ess.Windowing(size=frameSize, zeroPadding=0, type=windowType)
    spectrum = ess.Spectrum(size=frameSize)

    # spectral algorithms
    barkbands = ess.BarkBands(sampleRate=sampleRate)
    centralmoments = ess.CentralMoments()
    crest = ess.Crest()
    centroid = ess.Centroid()
    decrease = ess.Decrease()
    spectral_contrast = ess.SpectralContrast(frameSize=frameSize,
                                             sampleRate=sampleRate,
                                             numberBands=6,
                                             lowFrequencyBound=20,
                                             highFrequencyBound=11000,
                                             neighbourRatio=0.4,
                                             staticDistribution=0.15)
    distributionshape = ess.DistributionShape()
    energy = ess.Energy()
    # energyband_bass, energyband_middle and energyband_high parameters come from "standard" hi-fi equalizers
    energyband_bass = ess.EnergyBand(startCutoffFrequency=20.0, stopCutoffFrequency=150.0, sampleRate=sampleRate)
    energyband_middle_low = ess.EnergyBand(startCutoffFrequency=150.0, stopCutoffFrequency=800.0, sampleRate=sampleRate)
    energyband_middle_high = ess.EnergyBand(startCutoffFrequency=800.0, stopCutoffFrequency=4000.0,
                                            sampleRate=sampleRate)
    energyband_high = ess.EnergyBand(startCutoffFrequency=4000.0, stopCutoffFrequency=20000.0, sampleRate=sampleRate)
    flatnessdb = ess.FlatnessDB()
    flux = ess.Flux()
    hfc = ess.HFC()
    mfcc = ess.MFCC()
    rolloff = ess.RollOff()
    rms = ess.RMS()
    strongpeak = ess.StrongPeak()

    # pitch algorithms
    pitch_detection = ess.PitchYinFFT(frameSize=frameSize, sampleRate=sampleRate)
    pitch_salience = ess.PitchSalience()

    # dissonance
    spectral_peaks = ess.SpectralPeaks(sampleRate=sampleRate, orderBy='frequency')
    dissonance = ess.Dissonance()

    # spectral complexity
    # magnitudeThreshold = 0.005 is hardcoded for a "blackmanharris62" frame
    spectral_complexity = ess.SpectralComplexity(magnitudeThreshold=0.005)

    INFO('Computing Low-Level descriptors...')

    # used for a nice progress display
    n_frames = 0
    progress = Progress(total=frames.num_frames())

    # every non-skipped frame's pitch, kept for the histogram built below
    pitches = []

    for frame in frames:

        # silence rate (recorded for every frame, including skipped ones)
        pool.add(namespace + '.' + 'silence_rate_60dB', is_silent_threshold(frame, -60))
        pool.add(namespace + '.' + 'silence_rate_30dB', is_silent_threshold(frame, -30))
        pool.add(namespace + '.' + 'silence_rate_20dB', is_silent_threshold(frame, -20))

        if options['skipSilence'] and es.isSilent(frame):
            continue

        # temporal descriptors
        pool.add(namespace + '.' + 'zerocrossingrate', zerocrossingrate(frame))
        (frame_lpc, frame_lpc_reflection) = lpc(frame)
        pool.add(namespace + '.' + 'temporal_lpc', frame_lpc)

        frame_windowed = window(frame)
        frame_spectrum = spectrum(frame_windowed)

        # spectrum-based descriptors
        power_spectrum = frame_spectrum ** 2
        pool.add(namespace + '.' + 'spectral_centroid', centroid(power_spectrum))
        pool.add(namespace + '.' + 'spectral_decrease', decrease(power_spectrum))
        pool.add(namespace + '.' + 'spectral_energy', energy(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_low', energyband_bass(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_middle_low', energyband_middle_low(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_middle_high', energyband_middle_high(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_high', energyband_high(frame_spectrum))
        pool.add(namespace + '.' + 'hfc', hfc(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_rms', rms(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_flux', flux(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_rolloff', rolloff(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_strongpeak', strongpeak(frame_spectrum))

        # central moments descriptors
        frame_centralmoments = centralmoments(power_spectrum)
        (frame_spread, frame_skewness, frame_kurtosis) = distributionshape(frame_centralmoments)
        pool.add(namespace + '.' + 'spectral_kurtosis', frame_kurtosis)
        pool.add(namespace + '.' + 'spectral_spread', frame_spread)
        pool.add(namespace + '.' + 'spectral_skewness', frame_skewness)

        # dissonance
        (frame_frequencies, frame_magnitudes) = spectral_peaks(frame_spectrum)
        frame_dissonance = dissonance(frame_frequencies, frame_magnitudes)
        pool.add(namespace + '.' + 'dissonance', frame_dissonance)

        # mfcc
        (frame_melbands, frame_mfcc) = mfcc(frame_spectrum)
        pool.add(namespace + '.' + 'mfcc', frame_mfcc)

        # spectral contrast (only the coefficients are stored)
        (sc_coeffs, sc_valleys) = spectral_contrast(frame_spectrum)
        pool.add(namespace + '.' + 'spectral_contrast', sc_coeffs)

        # barkbands-based descriptors
        frame_barkbands = barkbands(frame_spectrum)
        pool.add(namespace + '.' + 'barkbands', frame_barkbands)
        pool.add(namespace + '.' + 'spectral_crest', crest(frame_barkbands))
        pool.add(namespace + '.' + 'spectral_flatness_db', flatnessdb(frame_barkbands))
        # the moments' range depends on the number of bark bands, which is
        # only known once a frame has been processed
        barkbands_centralmoments = ess.CentralMoments(range=len(frame_barkbands) - 1)
        (barkbands_spread, barkbands_skewness, barkbands_kurtosis) = distributionshape(
            barkbands_centralmoments(frame_barkbands))
        pool.add(namespace + '.' + 'barkbands_spread', barkbands_spread)
        pool.add(namespace + '.' + 'barkbands_skewness', barkbands_skewness)
        pool.add(namespace + '.' + 'barkbands_kurtosis', barkbands_kurtosis)

        # pitch descriptors
        frame_pitch, frame_pitch_confidence = pitch_detection(frame_spectrum)
        if 0 < frame_pitch <= 20000.:
            pool.add(namespace + '.' + 'pitch', frame_pitch)
        pitches.append(frame_pitch)
        pool.add(namespace + '.' + 'pitch_instantaneous_confidence', frame_pitch_confidence)

        frame_pitch_salience = pitch_salience(frame_spectrum[:-1])
        pool.add(namespace + '.' + 'pitch_salience', frame_pitch_salience)

        # spectral complexity
        pool.add(namespace + '.' + 'spectral_complexity', spectral_complexity(frame_spectrum))

        # display of progress report
        progress.update(n_frames)

        n_frames += 1

    # if no 'zerocrossingrate' it means that this is a silent file
    if 'zerocrossingrate' not in descriptorNames(pool.descriptorNames(), namespace):
        raise ess.EssentiaError('This is a silent file!')

    # build pitch value histogram
    midipitchhist = _midi_pitch_histogram(pitches)
    pool.add(namespace + '.' + 'spectral_pitch_histogram', midipitchhist)  # , pool.GlobalScope)

    # only the spread of the histogram is stored
    pitch_centralmoments = ess.CentralMoments(range=len(midipitchhist) - 1)
    (pitch_histogram_spread, pitch_histogram_skewness, pitch_histogram_kurtosis) = distributionshape(
        pitch_centralmoments(midipitchhist))
    pool.add(namespace + '.' + 'spectral_pitch_histogram_spread', pitch_histogram_spread)  # , pool.GlobalScope)

    progress.finish()
def algorithm_pitch_note_essentia(sound):
    """
    Estimates the note of a given audio file.

    The file is analysed twice, once with Essentia's PitchYin on the raw
    frames and once with PitchYinFFT on Hann-windowed spectra. For each
    method the median of the positive frame pitches is converted to a
    MIDI note and a note name.

    :param sound: sound dictionary from dataset
    :return: dictionary with results per different methods; each entry
        holds 'note', 'midi_note' and 'pitch'
    """
    sample_rate = 44100
    frameSize = 1024
    hopsize = frameSize  # non-overlapping frames

    def _summarize(frame_pitches):
        # Reduce per-frame pitches to a single note estimate.
        voiced = [pitch for pitch in frame_pitches if pitch > 0]
        # 0.1 is the fallback used when no frame yielded a positive pitch
        pitch_median = median(voiced) if voiced else 0.1
        midi_note = frequency_to_midi_note(pitch_median)
        note = midi_note_to_note(midi_note)
        return {
            'note': note,
            'midi_note': midi_note,
            'pitch': pitch_median
        }

    results = dict()
    audio = load_audio_file(file_path=sound[SOUND_FILE_KEY], sample_rate=sample_rate)

    # Estimate pitch using PitchYin
    frames = estd.FrameGenerator(audio, frameSize=frameSize, hopSize=hopsize)
    pitchDetect = estd.PitchYin(frameSize=frameSize, sampleRate=sample_rate)
    results['EssentiaPitchYin'] = _summarize(
        [pitchDetect(frame)[0] for frame in frames])

    # Estimate pitch using PitchYinFFT
    frames = estd.FrameGenerator(audio, frameSize=frameSize, hopSize=hopsize)
    pitchDetect = estd.PitchYinFFT(frameSize=frameSize, sampleRate=sample_rate)
    win = estd.Windowing(type='hann')
    # built once; the original re-created the Spectrum algorithm per frame
    spectrum = estd.Spectrum()
    results['EssentiaPitchYinFFT'] = _summarize(
        [pitchDetect(spectrum(win(frame)))[0] for frame in frames])

    return results
Ejemplo n.º 8
0
def reComputeDescriptors(inputAudioFile, outputJsonFile):
    """
    Compute energy-gated descriptor means for an audio file and write them as JSON.

    The file is framed twice. The first pass finds the maximum frame
    energy. In the second pass each frame's descriptors are added to the
    pool of every threshold in ``dscr.threshold`` that the frame's
    normalised energy reaches. Every pool is aggregated to its mean and
    merged into one feature dict through ``appendFeatures``.

    :param inputAudioFile: path of the audio file, loaded with MonoLoader at 44100 Hz
    :param outputJsonFile: path of the JSON file that receives the features
    :return: None
    """
    M = 2048  # frame size
    H = 1024  # hop size
    fs = 44100

    W = 'blackmanharris62'

    spectrum = ess.Spectrum()
    window = ess.Windowing(type=W)
    mfcc = ess.MFCC()

    spectral_peaks = ess.SpectralPeaks(minFrequency=1,
                                       maxFrequency=20000,
                                       maxPeaks=100,
                                       sampleRate=fs,
                                       magnitudeThreshold=0,
                                       orderBy="magnitude")

    dissonance = ess.Dissonance()
    pitch_detection = ess.PitchYinFFT()
    harmonic_peaks = ess.HarmonicPeaks()
    inharmonicity = ess.Inharmonicity()
    spectral_contrast = ess.SpectralContrast()
    centroid = ess.Centroid()
    log_attack_time = ess.LogAttackTime()
    hfc = ess.HFC()

    # magnitudeThreshold = 0.005 is hardcoded for a "blackmanharris62" frame, see lowlevel.py
    spectral_complexity = ess.SpectralComplexity(magnitudeThreshold=0.005)

    energy = ess.Energy()

    x = ess.MonoLoader(filename=inputAudioFile, sampleRate=fs)()

    # first pass: maximum frame energy, used to normalise below
    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True)
    E_max = np.max([energy(frame) for frame in frames])

    def _add_to_all(target_pools, key, value):
        # Add one descriptor value to each of the selected pools.
        for target in target_pools:
            target.add(key, value)

    # second pass: one pool per energy threshold
    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True)
    pools = [(t, es.Pool()) for t in dscr.threshold]
    for frame in frames:

        # NOTE(review): E_max is 0 for an all-silent file, which makes
        # this division fail -- confirm how such files should be handled
        eNorm = energy(frame) / E_max

        # pools whose threshold this frame's normalised energy reaches
        threshPools = [pool for t, pool in pools if eNorm >= t]

        mX = spectrum(window(frame))
        mfcc_bands, mfcc_coeffs = mfcc(mX)
        _add_to_all(threshPools, 'lowlevel.mfcc', mfcc_coeffs)

        pfreq, pmag = spectral_peaks(mX)

        # peaks come back ordered by magnitude; re-order them by frequency
        # before passing them to Dissonance and HarmonicPeaks
        inds = pfreq.argsort()
        pfreq_sorted = pfreq[inds]
        pmag_sorted = pmag[inds]

        diss = dissonance(pfreq_sorted, pmag_sorted)
        _add_to_all(threshPools, 'lowlevel.dissonance', diss)

        pitch, pitch_confidence = pitch_detection(mX)

        phfreq, phmag = harmonic_peaks(pfreq_sorted, pmag_sorted, pitch)
        if len(phfreq) > 1:
            inharm = inharmonicity(phfreq, phmag)
            _add_to_all(threshPools, 'sfx.inharmonicity', inharm)

        sc_coeffs, sc_valleys = spectral_contrast(mX)
        _add_to_all(threshPools, 'lowlevel.spectral_contrast', sc_coeffs)

        _add_to_all(threshPools, 'lowlevel.spectral_centroid', centroid(mX))
        _add_to_all(threshPools, 'sfx.logattacktime', log_attack_time(frame))
        _add_to_all(threshPools, 'lowlevel.hfc', hfc(mX))
        _add_to_all(threshPools, 'lowlevel.spectral_complexity', spectral_complexity(mX))

    calc_Mean_Var = ess.PoolAggregator(defaultStats=['mean'])
    aggrPools = [calc_Mean_Var(pool) for t, pool in pools]

    features = {}
    for i, aggrPool in enumerate(aggrPools):
        appendFeatures(features, aggrPool, ("ethc" + str(dscr.thresholdSelect[i])))

    # context manager so the output file is flushed and closed
    with open(outputJsonFile, 'w') as json_file:
        json.dump(features, json_file)
Ejemplo n.º 9
0
def reComputeDescriptors(inputAudioFile, outputJsonFile):
    """
    Compute descriptor statistics for an audio file and write them as JSON.

    Every frame's descriptors are added to one pool, the pool is
    aggregated to mean and variance, converted with ``makeFeatures`` and
    dumped to `outputJsonFile`.

    :param inputAudioFile: path of the audio file, loaded with MonoLoader at 44100 Hz
    :param outputJsonFile: path of the JSON file that receives the features
    :return: None
    """
    # Original settings:
    #     M = 1024, N = 1024, H = 512, fs = 44100, W = 'hann'
    #
    # Freesound extractor settings (from its C++ source):
    #     Real sampleRate = 44100;
    #     int frameSize =   2048;
    #     int hopSize =     1024;
    #     int zeroPadding = 0;
    #     string silentFrames = "noise";
    #     string windowType = "blackmanharris62";
    #     // Silence Rate
    #     Real thresholds_dB[] = { -20, -30, -60 };
    #     thresholds[i] = db2lin(thresholds_dB[i]/2.0);

    M = 2048  # frame size
    H = 1024  # hop size
    fs = 44100

    W = 'blackmanharris62'

    spectrum = ess.Spectrum()
    window = ess.Windowing(type=W)
    mfcc = ess.MFCC()

    spectral_peaks = ess.SpectralPeaks(minFrequency=1,
                                       maxFrequency=20000,
                                       maxPeaks=100,
                                       sampleRate=fs,
                                       magnitudeThreshold=0,
                                       orderBy="magnitude")

    dissonance = ess.Dissonance()
    pitch_detection = ess.PitchYinFFT()
    harmonic_peaks = ess.HarmonicPeaks()
    inharmonicity = ess.Inharmonicity()
    spectral_contrast = ess.SpectralContrast()
    centroid = ess.Centroid()
    log_attack_time = ess.LogAttackTime()
    hfc = ess.HFC()

    x = ess.MonoLoader(filename=inputAudioFile, sampleRate=fs)()
    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True)
    pool = es.Pool()
    for frame in frames:
        mX = spectrum(window(frame))
        mfcc_bands, mfcc_coeffs = mfcc(mX)

        pool.add('lowlevel.mfcc', mfcc_coeffs)
        pool.add('lowlevel.mfcc_bands', mfcc_bands)

        pfreq, pmag = spectral_peaks(mX)

        # peaks come back ordered by magnitude; re-order them by frequency
        # before passing them to Dissonance and HarmonicPeaks
        inds = pfreq.argsort()
        pfreq_sorted = pfreq[inds]
        pmag_sorted = pmag[inds]

        pool.add('lowlevel.dissonance', dissonance(pfreq_sorted, pmag_sorted))

        pitch, pitch_confidence = pitch_detection(mX)

        phfreq, phmag = harmonic_peaks(pfreq_sorted, pmag_sorted, pitch)
        # inharmonicity is only stored when more than one harmonic peak was found
        if len(phfreq) > 1:
            pool.add('sfx.inharmonicity', inharmonicity(phfreq, phmag))

        sc_coeffs, sc_valleys = spectral_contrast(mX)
        pool.add('lowlevel.spectral_contrast', sc_coeffs)

        pool.add('lowlevel.spectral_centroid', centroid(mX))
        pool.add('sfx.logattacktime', log_attack_time(frame))
        pool.add('lowlevel.hfc', hfc(mX))

    calc_Mean_Var = ess.PoolAggregator(defaultStats=['mean', 'var'])
    aggrPool = calc_Mean_Var(pool)

    features = makeFeatures(aggrPool)
    # context manager so the output file is flushed and closed
    with open(outputJsonFile, 'w') as json_file:
        json.dump(features, json_file)
Ejemplo n.º 10
0
def compute(audio, pool, options):
    """Compute SFX descriptors of `audio` and add them to `pool`.

    Frame-wise harmonic descriptors (inharmonicity, tristimulus,
    odd-to-even harmonic energy ratio) are added for every frame with more
    than one harmonic peak. Descriptors derived from the envelope of the
    whole signal are added once afterwards. All keys are prefixed with
    ``namespace + '.'`` (``namespace`` is read from module scope).

    :param audio: audio signal handed to ``FrameGenerator``
    :param pool: pool object receiving the descriptors through ``pool.add``
    :param options: mapping providing 'sampleRate', 'frameSize', 'hopSize',
        'windowType' and 'skipSilence'
    """
    INFO('Computing SFX descriptors...')

    # analysis parameters
    sampleRate = options['sampleRate']
    frameSize = options['frameSize']
    hopSize = options['hopSize']
    windowType = options['windowType']

    # frame algorithms
    frames = ess.FrameGenerator(audio=audio,
                                frameSize=frameSize,
                                hopSize=hopSize)
    window = ess.Windowing(size=frameSize, zeroPadding=0, type=windowType)
    spectrum = ess.Spectrum(size=frameSize)

    # pitch algorithm
    # configured with the analysis frame size (was hard-coded to 2048), so
    # it matches the spectrum computed above for any 'frameSize' option
    pitch_detection = ess.PitchYinFFT(frameSize=frameSize, sampleRate=sampleRate)

    # sfx descriptors
    spectral_peaks = ess.SpectralPeaks(sampleRate=sampleRate,
                                       orderBy='frequency')
    harmonic_peaks = ess.HarmonicPeaks()
    inharmonicity = ess.Inharmonicity()
    odd2evenharmonicenergyratio = ess.OddToEvenHarmonicEnergyRatio()
    tristimulus = ess.Tristimulus()

    # used for a nice progress display
    n_frames = 0
    progress = Progress(total=frames.num_frames())

    for frame in frames:

        if options['skipSilence'] and es.isSilent(frame):
            continue

        frame_windowed = window(frame)
        frame_spectrum = spectrum(frame_windowed)

        # pitch descriptors
        frame_pitch, frame_pitch_confidence = pitch_detection(frame_spectrum)

        # spectral peaks based descriptors
        frame_frequencies, frame_magnitudes = spectral_peaks(frame_spectrum)

        # ERROR CORRECTION - hoinx 2015-12
        # drop peaks below 1 Hz before they reach HarmonicPeaks
        errIdx = np.where(frame_frequencies < 1)
        frame_frequencies = np.delete(frame_frequencies, errIdx)
        frame_magnitudes = np.delete(frame_magnitudes, errIdx)

        (frame_harmonic_frequencies,
         frame_harmonic_magnitudes) = harmonic_peaks(frame_frequencies,
                                                     frame_magnitudes,
                                                     frame_pitch)
        # harmonic descriptors are only stored when more than one harmonic
        # peak was found
        if len(frame_harmonic_frequencies) > 1:
            frame_inharmonicity = inharmonicity(frame_harmonic_frequencies,
                                                frame_harmonic_magnitudes)
            pool.add(namespace + '.' + 'inharmonicity', frame_inharmonicity)
            frame_tristimulus = tristimulus(frame_harmonic_frequencies,
                                            frame_harmonic_magnitudes)
            pool.add(namespace + '.' + 'tristimulus', frame_tristimulus)
            frame_odd2evenharmonicenergyratio = odd2evenharmonicenergyratio(
                frame_harmonic_frequencies, frame_harmonic_magnitudes)
            pool.add(namespace + '.' + 'odd2evenharmonicenergyratio',
                     frame_odd2evenharmonicenergyratio)

        # display of progress report
        progress.update(n_frames)

        n_frames += 1

    # everything below is computed once, from the envelope of the whole signal
    envelope = ess.Envelope()
    file_envelope = envelope(audio)

    # temporal statistics
    decrease = ess.Decrease()
    pool.add(namespace + '.' + 'temporal_decrease',
             decrease(file_envelope))  # , pool.GlobalScope)

    centralmoments = ess.CentralMoments()
    file_centralmoments = centralmoments(file_envelope)

    distributionshape = ess.DistributionShape()
    (file_spread, file_skewness,
     file_kurtosis) = distributionshape(file_centralmoments)
    pool.add(namespace + '.' + 'temporal_spread',
             file_spread)  # , pool.GlobalScope)
    pool.add(namespace + '.' + 'temporal_skewness',
             file_skewness)  # , pool.GlobalScope)
    pool.add(namespace + '.' + 'temporal_kurtosis',
             file_kurtosis)  # , pool.GlobalScope)

    centroid = ess.Centroid()
    pool.add(namespace + '.' + 'temporal_centroid',
             centroid(file_envelope))  # , pool.GlobalScope)

    # effective duration
    effectiveduration = ess.EffectiveDuration()
    pool.add(namespace + '.' + 'effective_duration',
             effectiveduration(file_envelope))  # , pool.GlobalScope)

    # log attack time (computed on the raw audio, not on the envelope)
    logattacktime = ess.LogAttackTime()
    pool.add(namespace + '.' + 'logattacktime',
             logattacktime(audio))  # , pool.GlobalScope)

    # strong decay
    strongdecay = ess.StrongDecay()
    pool.add(namespace + '.' + 'strongdecay',
             strongdecay(file_envelope))  # , pool.GlobalScope)

    # dynamic profile
    flatness = ess.FlatnessSFX()
    pool.add(namespace + '.' + 'flatness',
             flatness(file_envelope))  # , pool.GlobalScope)

    # Disabled: onsets number
    #     onsets_number = len(pool['rhythm.onset_times'][0])
    #     pool.add(namespace + '.' + 'onsets_number', onsets_number)  # , pool.GlobalScope)

    # morphological descriptors
    max_to_total = ess.MaxToTotal()
    pool.add(namespace + '.' + 'max_to_total',
             max_to_total(file_envelope))  # , pool.GlobalScope)

    tc_to_total = ess.TCToTotal()
    pool.add(namespace + '.' + 'tc_to_total',
             tc_to_total(file_envelope))  # , pool.GlobalScope)

    derivativeSFX = ess.DerivativeSFX()
    (der_av_after_max, max_der_before_max) = derivativeSFX(file_envelope)
    pool.add(namespace + '.' + 'der_av_after_max',
             der_av_after_max)  # , pool.GlobalScope)
    pool.add(namespace + '.' + 'max_der_before_max',
             max_der_before_max)  # , pool.GlobalScope)

    # Disabled: pitch profile
    #     pitch = pool['lowlevel.pitch']
    #
    #     if len(pitch) > 1:
    #         pool.add(namespace + '.' + 'pitch_max_to_total', max_to_total(pitch))  # , pool.GlobalScope)
    #
    #         min_to_total = ess.MinToTotal()
    #         pool.add(namespace + '.' + 'pitch_min_to_total', min_to_total(pitch))  # , pool.GlobalScope)
    #
    #         pitch_centroid = ess.Centroid(range=len(pitch) - 1)
    #         pool.add(namespace + '.' + 'pitch_centroid', pitch_centroid(pitch))  # , pool.GlobalScope)
    #
    #         pitch_after_max_to_before_max_energy_ratio = ess.AfterMaxToBeforeMaxEnergyRatio()
    #         pool.add(namespace + '.' + 'pitch_after_max_to_before_max_energy_ratio',
    #                  pitch_after_max_to_before_max_energy_ratio(pitch))  # , pool.GlobalScope)
    #
    #     else:
    #         pool.add(namespace + '.' + 'pitch_max_to_total', 0.0)  # , pool.GlobalScope)
    #         pool.add(namespace + '.' + 'pitch_min_to_total', 0.0)  # , pool.GlobalScope)
    #         pool.add(namespace + '.' + 'pitch_centroid', 0.0)  # , pool.GlobalScope)
    #         pool.add(namespace + '.' + 'pitch_after_max_to_before_max_energy_ratio', 0.0)  # , pool.GlobalScope)

    progress.finish()
Ejemplo n.º 11
0
def extractFeatures(audio_data):
    """
    Compute spectral features of an audio signal and return them in a Pool.

    Takes a float array representing an audio signal, computes its
    features, stores each of them in an essentia Pool() under the
    module-level ``pk_*`` keys and returns that Pool.

    Raises AssertionError when `audio_data` is not a numpy array with a
    float dtype.
    """
    from numpy import ndarray
    assert isinstance(audio_data, ndarray)
    assert ("float" in str(audio_data.dtype))

    # Create the Pool()
    output_pool = es.Pool()

    # Spectrum of the signal
    output_pool.set(pk_spectrum, es_mode.Spectrum()(audio_data))

    # EnergyBandRatio
    energy_band_ratio = es_mode.EnergyBandRatio()(output_pool[pk_spectrum])
    output_pool.set(pk_energy_band_ratio, energy_band_ratio)

    # MaxMagFreq
    max_mag_freq = es_mode.MaxMagFreq()(output_pool[pk_spectrum])
    output_pool.set(pk_max_mag_freq, max_mag_freq)

    # SpectralCentroidTime (computed on the time-domain signal)
    spectral_centroid_time = es_mode.SpectralCentroidTime()(audio_data)
    output_pool.set(pk_spectral_centroid_time, spectral_centroid_time)

    # SpectralComplexity
    spectral_complexity = es_mode.SpectralComplexity()(
        output_pool[pk_spectrum])
    output_pool.set(pk_spectral_complexity, spectral_complexity)

    # StrongPeak
    strong_peak = es_mode.StrongPeak()(output_pool[pk_spectrum])
    output_pool.set(pk_strong_peak, strong_peak)

    # SpectralPeaks
    sp_freq, sp_mag = es_mode.SpectralPeaks()(output_pool[pk_spectrum])
    # Drop the DC peak, if present, as required by HarmonicPeaks. The
    # length check avoids an IndexError when no peak was found at all.
    if len(sp_freq) > 0 and sp_freq[0] == 0:
        sp_freq = sp_freq[1:]
        sp_mag = sp_mag[1:]
    output_pool.set(pk_spectral_peaks_freq, sp_freq)
    output_pool.set(pk_spectral_peaks_mag, sp_mag)

    ######################################
    #       For Inharmonicity            #
    ######################################
    # PitchYinFFT
    pitch_yin_fft, pitch_prob_yin_fft = es_mode.PitchYinFFT()(
        output_pool[pk_spectrum])
    output_pool.set(pk_pitch, pitch_yin_fft)
    output_pool.set(pk_pitch_prob, pitch_prob_yin_fft)

    # HarmonicPeaks
    hp_freq, hp_mag = es_mode.HarmonicPeaks()(output_pool[pk_spectral_peaks_freq],
                                              output_pool[pk_spectral_peaks_mag],
                                              output_pool[pk_pitch])
    output_pool.set(pk_harmonic_peaks_freq, hp_freq)
    output_pool.set(pk_harmonic_peaks_mag, hp_mag)

    # Inharmonicity
    inharmonicity = es_mode.Inharmonicity()(output_pool[pk_harmonic_peaks_freq],
                                            output_pool[pk_harmonic_peaks_mag])
    output_pool.set(pk_inharmonicity, inharmonicity)

    # End of Inharmonicity ###################################

    # SpectralContrast
    # frame size recovered from the spectrum length: 2 * (bins - 1)
    frame_size = 2 * (output_pool[pk_spectrum].size - 1)
    spectral_contrast, spectral_valley = \
        es_mode.SpectralContrast(frameSize=frame_size)(output_pool[pk_spectrum])
    output_pool.set(pk_spectral_contrast, spectral_contrast)
    output_pool.set(pk_spectral_valley, spectral_valley)

    # SpectralWhitening
    spectral_whitening = \
        es_mode.SpectralWhitening()(output_pool[pk_spectrum],
                                    output_pool[pk_spectral_peaks_freq],
                                    output_pool[pk_spectral_peaks_mag])
    output_pool.set(pk_spectral_whitening, spectral_whitening)

    return output_pool