Esempio n. 1
0
def spectralFlux(audio,params):
    """ hop size, frame size, window type """
    hopSize, frameSize, wtype = params
    w = Windowing(type=wtype)
    spec = Spectrum()
    result = []
    Flux = ess.Flux()
    for frame in ess.FrameGenerator(audio, frameSize = frameSize, hopSize = hopSize):
        sf = spec(w(frame))
        result.append(Flux(sf))
    return np.asarray(result),hopSize
Esempio n. 2
0
w = es.Windowing(type='hann')
spectrum = es.Spectrum()
centroid = es.Centroid()
moments = es.CentralMoments()

# Temporal descriptors
power = es.InstantPower()
log_attack_time = es.LogAttackTime()
effective_duration = es.EffectiveDuration()
auto_correlation = es.AutoCorrelation()
zero_crossing_rate = es.ZeroCrossingRate()

# Spectral descriptors
peak_freq = es.MaxMagFreq()
roll_off = es.RollOff()
flux = es.Flux()
flatness = es.Flatness()

# Harmonic descriptors
pitch = es.PitchYin(frameSize=1024)
spectral_peaks = es.SpectralPeaks(minFrequency=1e-5)
harmonic_peaks = es.HarmonicPeaks()
inharmonicity = es.Inharmonicity()
oer = es.OddToEvenHarmonicEnergyRatio()
tristimulus = es.Tristimulus()

# MFCC
mfcc = es.MFCC(inputSize=513)


class Audio:
Esempio n. 3
0
def compute(audio, pool, options):
    # analysis parameters
    sampleRate = options['sampleRate']
    frameSize = options['frameSize']
    hopSize = options['hopSize']
    windowType = options['windowType']

    # temporal descriptors
    lpc = ess.LPC(order=10, type='warped', sampleRate=sampleRate)
    zerocrossingrate = ess.ZeroCrossingRate()

    # frame algorithms
    frames = ess.FrameGenerator(audio=audio, frameSize=frameSize, hopSize=hopSize)
    window = ess.Windowing(size=frameSize, zeroPadding=0, type=windowType)
    spectrum = ess.Spectrum(size=frameSize)

    # spectral algorithms
    barkbands = ess.BarkBands(sampleRate=sampleRate)
    centralmoments = ess.CentralMoments()
    crest = ess.Crest()
    centroid = ess.Centroid()
    decrease = ess.Decrease()
    spectral_contrast = ess.SpectralContrast(frameSize=frameSize,
                                             sampleRate=sampleRate,
                                             numberBands=6,
                                             lowFrequencyBound=20,
                                             highFrequencyBound=11000,
                                             neighbourRatio=0.4,
                                             staticDistribution=0.15)
    distributionshape = ess.DistributionShape()
    energy = ess.Energy()
    # energyband_bass, energyband_middle and energyband_high parameters come from "standard" hi-fi equalizers
    energyband_bass = ess.EnergyBand(startCutoffFrequency=20.0, stopCutoffFrequency=150.0, sampleRate=sampleRate)
    energyband_middle_low = ess.EnergyBand(startCutoffFrequency=150.0, stopCutoffFrequency=800.0, sampleRate=sampleRate)
    energyband_middle_high = ess.EnergyBand(startCutoffFrequency=800.0, stopCutoffFrequency=4000.0,
                                            sampleRate=sampleRate)
    energyband_high = ess.EnergyBand(startCutoffFrequency=4000.0, stopCutoffFrequency=20000.0, sampleRate=sampleRate)
    flatnessdb = ess.FlatnessDB()
    flux = ess.Flux()
    harmonic_peaks = ess.HarmonicPeaks()
    hfc = ess.HFC()
    mfcc = ess.MFCC()
    rolloff = ess.RollOff()
    rms = ess.RMS()
    strongpeak = ess.StrongPeak()

    # pitch algorithms
    pitch_detection = ess.PitchYinFFT(frameSize=frameSize, sampleRate=sampleRate)
    pitch_salience = ess.PitchSalience()

    # dissonance
    spectral_peaks = ess.SpectralPeaks(sampleRate=sampleRate, orderBy='frequency')
    dissonance = ess.Dissonance()

    # spectral complexity
    # magnitudeThreshold = 0.005 is hardcoded for a "blackmanharris62" frame
    spectral_complexity = ess.SpectralComplexity(magnitudeThreshold=0.005)

    INFO('Computing Low-Level descriptors...')

    # used for a nice progress display
    total_frames = frames.num_frames()
    n_frames = 0
    start_of_frame = -frameSize * 0.5

    pitches, pitch_confidences = [], []

    progress = Progress(total=total_frames)

    #scPool = es.Pool()  # pool for spectral contrast

    for frame in frames:

        frameScope = [start_of_frame / sampleRate, (start_of_frame + frameSize) / sampleRate]
        # pool.setCurrentScope(frameScope)

        # silence rate
        # pool.add(namespace + '.' + 'silence_rate_60dB', es.isSilent(frame))
        pool.add(namespace + '.' + 'silence_rate_60dB', is_silent_threshold(frame, -60))
        pool.add(namespace + '.' + 'silence_rate_30dB', is_silent_threshold(frame, -30))
        pool.add(namespace + '.' + 'silence_rate_20dB', is_silent_threshold(frame, -20))

        if options['skipSilence'] and es.isSilent(frame):
            total_frames -= 1
            start_of_frame += hopSize
            continue

        # temporal descriptors
        pool.add(namespace + '.' + 'zerocrossingrate', zerocrossingrate(frame))
        (frame_lpc, frame_lpc_reflection) = lpc(frame)
        pool.add(namespace + '.' + 'temporal_lpc', frame_lpc)

        frame_windowed = window(frame)
        frame_spectrum = spectrum(frame_windowed)

        # spectrum-based descriptors
        power_spectrum = frame_spectrum ** 2
        pool.add(namespace + '.' + 'spectral_centroid', centroid(power_spectrum))
        pool.add(namespace + '.' + 'spectral_decrease', decrease(power_spectrum))
        pool.add(namespace + '.' + 'spectral_energy', energy(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_low', energyband_bass(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_middle_low', energyband_middle_low(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_middle_high', energyband_middle_high(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_high', energyband_high(frame_spectrum))
        pool.add(namespace + '.' + 'hfc', hfc(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_rms', rms(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_flux', flux(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_rolloff', rolloff(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_strongpeak', strongpeak(frame_spectrum))

        # central moments descriptors
        frame_centralmoments = centralmoments(power_spectrum)
        (frame_spread, frame_skewness, frame_kurtosis) = distributionshape(frame_centralmoments)
        pool.add(namespace + '.' + 'spectral_kurtosis', frame_kurtosis)
        pool.add(namespace + '.' + 'spectral_spread', frame_spread)
        pool.add(namespace + '.' + 'spectral_skewness', frame_skewness)

        # dissonance
        (frame_frequencies, frame_magnitudes) = spectral_peaks(frame_spectrum)
        frame_dissonance = dissonance(frame_frequencies, frame_magnitudes)
        pool.add(namespace + '.' + 'dissonance', frame_dissonance)

        # mfcc
        (frame_melbands, frame_mfcc) = mfcc(frame_spectrum)
        pool.add(namespace + '.' + 'mfcc', frame_mfcc)

        # spectral contrast
        (sc_coeffs, sc_valleys) = spectral_contrast(frame_spectrum)
        #scPool.add(namespace + '.' + 'sccoeffs', sc_coeffs)
        #scPool.add(namespace + '.' + 'scvalleys', sc_valleys)
        pool.add(namespace + '.' + 'spectral_contrast', sc_coeffs)


        # barkbands-based descriptors
        frame_barkbands = barkbands(frame_spectrum)
        pool.add(namespace + '.' + 'barkbands', frame_barkbands)
        pool.add(namespace + '.' + 'spectral_crest', crest(frame_barkbands))
        pool.add(namespace + '.' + 'spectral_flatness_db', flatnessdb(frame_barkbands))
        barkbands_centralmoments = ess.CentralMoments(range=len(frame_barkbands) - 1)
        (barkbands_spread, barkbands_skewness, barkbands_kurtosis) = distributionshape(
            barkbands_centralmoments(frame_barkbands))
        pool.add(namespace + '.' + 'barkbands_spread', barkbands_spread)
        pool.add(namespace + '.' + 'barkbands_skewness', barkbands_skewness)
        pool.add(namespace + '.' + 'barkbands_kurtosis', barkbands_kurtosis)

        # pitch descriptors
        frame_pitch, frame_pitch_confidence = pitch_detection(frame_spectrum)
        if frame_pitch > 0 and frame_pitch <= 20000.:
            pool.add(namespace + '.' + 'pitch', frame_pitch)
        pitches.append(frame_pitch)
        pitch_confidences.append(frame_pitch_confidence)
        pool.add(namespace + '.' + 'pitch_instantaneous_confidence', frame_pitch_confidence)

        frame_pitch_salience = pitch_salience(frame_spectrum[:-1])
        pool.add(namespace + '.' + 'pitch_salience', frame_pitch_salience)

        # spectral complexity
        pool.add(namespace + '.' + 'spectral_complexity', spectral_complexity(frame_spectrum))

        # display of progress report
        progress.update(n_frames)

        n_frames += 1
        start_of_frame += hopSize

    # if no 'temporal_zerocrossingrate' it means that this is a silent file
    if 'zerocrossingrate' not in descriptorNames(pool.descriptorNames(), namespace):
        raise ess.EssentiaError('This is a silent file!')

    #spectralContrastPCA(scPool, pool)

    # build pitch value histogram
    from math import log
    from numpy import bincount
    # convert from Hz to midi notes
    midipitches = []
    unknown = 0
    for freq in pitches:
        if freq > 0. and freq <= 12600:
            midipitches.append(12 * (log(freq / 6.875) / 0.69314718055995) - 3.)
        else:
            unknown += 1

    if len(midipitches) > 0:
        # compute histogram
        midipitchhist = bincount(midipitches)
        # set 0 midi pitch to be the number of pruned value
        midipitchhist[0] = unknown
        # normalise
        midipitchhist = [val / float(sum(midipitchhist)) for val in midipitchhist]
        # zero pad
        for i in range(128 - len(midipitchhist)): midipitchhist.append(0.0)
    else:
        midipitchhist = [0.] * 128
        midipitchhist[0] = 1.

    # pitchhist = ess.array(zip(range(len(midipitchhist)), midipitchhist))
    pool.add(namespace + '.' + 'spectral_pitch_histogram', midipitchhist)  # , pool.GlobalScope)

    # the code below is the same as the one above:
    # for note in midipitchhist:
    #    pool.add(namespace + '.' + 'spectral_pitch_histogram_values', note)
    #    print "midi note:", note

    pitch_centralmoments = ess.CentralMoments(range=len(midipitchhist) - 1)
    (pitch_histogram_spread, pitch_histogram_skewness, pitch_histogram_kurtosis) = distributionshape(
        pitch_centralmoments(midipitchhist))
    pool.add(namespace + '.' + 'spectral_pitch_histogram_spread', pitch_histogram_spread)  # , pool.GlobalScope)

    progress.finish()
Esempio n. 4
0
import numpy as np
import matplotlib.pyplot as plt
import essentia.standard as ess

M = 1024
N = 1024
H = 512
fs = 44100
spectrum = ess.Spectrum(size=N)
window = ess.Windowing(size=M, type='hann')
flux = ess.Flux()
onsetDetection = ess.OnsetDetection(method='hfc')
x = ess.MonoLoader(filename='../../../sounds/speech-male.wav', sampleRate=fs)()
fluxes = []
onsetDetections = []

for frame in ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True):
    mX = spectrum(window(frame))
    flux_val = flux(mX)
    fluxes.append(flux_val)
    onsetDetection_val = onsetDetection(mX, mX)
    onsetDetections.append(onsetDetection_val)
onsetDetections = np.array(onsetDetections)
fluxes = np.array(fluxes)

plt.figure(1, figsize=(9.5, 7))
plt.subplot(2, 1, 1)

plt.plot(np.arange(x.size) / float(fs), x)
plt.axis([0, x.size / float(fs), min(x), max(x)])
plt.ylabel('amplitude')
def compute_features(complete_path):
    result = []
    meta_result = []
    file_count = 0
    # for loop over files
    for file in os.listdir(complete_path):
        if file.endswith(".wav"):
            file_count+=1
            # print(file +' : ' + str(file_count))

            # load our audio into an array
            audio = es.MonoLoader(filename=complete_path + file, sampleRate=44100)()

            # create the pool and the necessary algorithms
            pool = essentia.Pool()
            window = es.Windowing()
            energy = es.Energy()
            spectrum = es.Spectrum()
            centroid = es.Centroid(range=22050)
            rolloff = es.RollOff()
            crest = es.Crest()
            speak = es.StrongPeak()
            rmse = es.RMS()
            mfcc = es.MFCC()
            flux = es.Flux()
            barkbands = es.BarkBands( sampleRate = 44100)
            zerocrossingrate = es.ZeroCrossingRate()

            meta = es.MetadataReader(filename=complete_path + file, failOnError=True)()
            pool_meta, duration, bitrate, samplerate, channels = meta[7:]
            
            # centralmoments = es.SpectralCentralMoments()
            # distributionshape = es.DistributionShape()

            # compute the centroid for all frames in our audio and add it to the pool
            for frame in es.FrameGenerator(audio, frameSize = 1024, hopSize = 512):
                frame_windowed = window(frame)
                frame_spectrum = spectrum(frame_windowed)
                
                c = centroid(frame_spectrum)
                pool.add('spectral.centroid', c)

                cr = crest(frame_spectrum)
                pool.add('spectral crest', cr)

                r = rolloff(frame_spectrum)
                pool.add('spectral rolloff', r)

                sp = speak(frame_spectrum)
                pool.add('strong peak', sp)

                rms = rmse(frame_spectrum)
                pool.add('RMS', rms)

                pool.add('spectral_energy', energy(frame_spectrum))
                # (frame_melbands, frame_mfcc) = mfcc(frame_spectrum)
                # pool.add('frame_MFCC', frame_mfcc)

                fl = flux(frame_spectrum)
                pool.add('spectral flux', fl)

                # bbands = barkbands(frame_spectrum)
                # pool.add('bark bands', bbands)

                zcr = zerocrossingrate(frame_spectrum)
                pool.add('zero crossing rate', zcr)

                # frame_centralmoments = centralmoments(power_spectrum)
                # (frame_spread, frame_skewness, frame_kurtosis) = distributionshape(frame_centralmoments)
                # pool.add('spectral_kurtosis', frame_kurtosis)
                # pool.add('spectral_spread', frame_spread)
                # pool.add('spectral_skewness', frame_skewness)

            # aggregate the results (find mean if needed)
            aggrpool = es.PoolAggregator(defaultStats = ['mean'])(pool) #,'stdev' ])(pool)
            
            pool_meta.set("duration", duration)
            pool_meta.set("filename", os.path.relpath(file))

            # write pools to lists
            pool_arr = pool_to_array(aggrpool)
            result.append(pool_arr)

            meta_arr = pool_to_array(pool_meta)
            meta_result.append(meta_arr)
         
    features_df = pd.DataFrame.from_records(result)
    features_df.columns = ['centroid', 'crest','roll off','strong peak','rms','energy','flux','zcr']
    
    meta_df = pd.DataFrame.from_records(meta_result)
    meta_df.columns = ['duration','filename','metadata.tags.comment']
    del meta_df['metadata.tags.comment']

    return features_df,meta_df