def spectralFlux(audio, params):
    """Compute the spectral flux of every analysis frame of ``audio``.

    Args:
        audio: signal passed to ``ess.FrameGenerator``.
        params: three-item sequence unpacked as
            ``(hop size, frame size, window type)``.

    Returns:
        A tuple ``(flux, hopSize)``: a NumPy array with one flux value per
        generated frame, and the hop size taken from ``params``.
    """
    hopSize, frameSize, wtype = params

    windowing = Windowing(type=wtype)
    to_spectrum = Spectrum()
    flux_of = ess.Flux()

    frames = ess.FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize)
    # Each frame is windowed, turned into a spectrum, then fed to Flux, in order.
    flux_values = [flux_of(to_spectrum(windowing(frame))) for frame in frames]
    return np.asarray(flux_values), hopSize
w = es.Windowing(type='hann') spectrum = es.Spectrum() centroid = es.Centroid() moments = es.CentralMoments() # Temporal descriptors power = es.InstantPower() log_attack_time = es.LogAttackTime() effective_duration = es.EffectiveDuration() auto_correlation = es.AutoCorrelation() zero_crossing_rate = es.ZeroCrossingRate() # Spectral descriptors peak_freq = es.MaxMagFreq() roll_off = es.RollOff() flux = es.Flux() flatness = es.Flatness() # Harmonic descriptors pitch = es.PitchYin(frameSize=1024) spectral_peaks = es.SpectralPeaks(minFrequency=1e-5) harmonic_peaks = es.HarmonicPeaks() inharmonicity = es.Inharmonicity() oer = es.OddToEvenHarmonicEnergyRatio() tristimulus = es.Tristimulus() # MFCC mfcc = es.MFCC(inputSize=513) class Audio:
def _midi_pitch_histogram(pitches):
    """Build a normalised 128-bin MIDI-note histogram from pitches in Hz.

    Pitches outside ``(0, 12600]`` Hz, or mapping to a negative MIDI note, are
    counted as "unknown"; that count replaces bin 0 before normalisation.
    When no pitch is usable the histogram is all zeros except bin 0 == 1.
    """
    from math import log
    from numpy import bincount

    midi_notes = []
    unknown = 0
    for freq in pitches:
        if 0. < freq <= 12600:
            # convert from Hz to midi notes
            midi = 12 * (log(freq / 6.875) / 0.69314718055995) - 3.
            if midi >= 0:
                # bincount() only accepts non-negative integers; passing the
                # raw floats raises TypeError on current NumPy, so truncate.
                midi_notes.append(int(midi))
                continue
        unknown += 1

    if midi_notes:
        counts = bincount(midi_notes)
        # set 0 midi pitch to be the number of pruned values
        counts[0] = unknown
        total = float(counts.sum())
        # total can only be 0 when every note fell in bin 0 and nothing was
        # pruned; fall through to the default instead of dividing by zero.
        if total > 0:
            histogram = [count / total for count in counts]
            # zero pad up to 128 bins
            histogram.extend([0.0] * (128 - len(histogram)))
            return histogram

    histogram = [0.] * 128
    histogram[0] = 1.
    return histogram


def compute(audio, pool, options):
    """Compute frame-wise low-level descriptors of ``audio`` into ``pool``.

    Every descriptor is added under ``<namespace>.<name>``, where ``namespace``
    is a module-level name defined outside this function.

    Args:
        audio: signal passed to ``ess.FrameGenerator``.
        pool: object whose ``add(name, value)`` receives every descriptor.
        options: mapping providing ``'sampleRate'``, ``'frameSize'``,
            ``'hopSize'``, ``'windowType'`` and ``'skipSilence'``.

    Raises:
        ess.EssentiaError: if no ``zerocrossingrate`` descriptor was added,
            i.e. no frame got past the silence check.
    """
    # analysis parameters
    sampleRate = options['sampleRate']
    frameSize = options['frameSize']
    hopSize = options['hopSize']
    windowType = options['windowType']

    prefix = namespace + '.'

    # temporal descriptors
    lpc = ess.LPC(order=10, type='warped', sampleRate=sampleRate)
    zerocrossingrate = ess.ZeroCrossingRate()

    # frame algorithms
    frames = ess.FrameGenerator(audio=audio, frameSize=frameSize, hopSize=hopSize)
    window = ess.Windowing(size=frameSize, zeroPadding=0, type=windowType)
    spectrum = ess.Spectrum(size=frameSize)

    # spectral algorithms
    barkbands = ess.BarkBands(sampleRate=sampleRate)
    centralmoments = ess.CentralMoments()
    crest = ess.Crest()
    centroid = ess.Centroid()
    decrease = ess.Decrease()
    spectral_contrast = ess.SpectralContrast(frameSize=frameSize,
                                             sampleRate=sampleRate,
                                             numberBands=6,
                                             lowFrequencyBound=20,
                                             highFrequencyBound=11000,
                                             neighbourRatio=0.4,
                                             staticDistribution=0.15)
    distributionshape = ess.DistributionShape()
    energy = ess.Energy()
    # energyband_bass, energyband_middle and energyband_high parameters come
    # from "standard" hi-fi equalizers
    energyband_bass = ess.EnergyBand(startCutoffFrequency=20.0,
                                     stopCutoffFrequency=150.0,
                                     sampleRate=sampleRate)
    energyband_middle_low = ess.EnergyBand(startCutoffFrequency=150.0,
                                           stopCutoffFrequency=800.0,
                                           sampleRate=sampleRate)
    energyband_middle_high = ess.EnergyBand(startCutoffFrequency=800.0,
                                            stopCutoffFrequency=4000.0,
                                            sampleRate=sampleRate)
    energyband_high = ess.EnergyBand(startCutoffFrequency=4000.0,
                                     stopCutoffFrequency=20000.0,
                                     sampleRate=sampleRate)
    flatnessdb = ess.FlatnessDB()
    flux = ess.Flux()
    hfc = ess.HFC()
    mfcc = ess.MFCC()
    rolloff = ess.RollOff()
    rms = ess.RMS()
    strongpeak = ess.StrongPeak()

    # pitch algorithms
    pitch_detection = ess.PitchYinFFT(frameSize=frameSize, sampleRate=sampleRate)
    pitch_salience = ess.PitchSalience()

    # dissonance
    spectral_peaks = ess.SpectralPeaks(sampleRate=sampleRate, orderBy='frequency')
    dissonance = ess.Dissonance()

    # spectral complexity
    # magnitudeThreshold = 0.005 is hardcoded for a "blackmanharris62" frame
    spectral_complexity = ess.SpectralComplexity(magnitudeThreshold=0.005)

    INFO('Computing Low-Level descriptors...')

    # used for a nice progress display
    total_frames = frames.num_frames()
    n_frames = 0
    progress = Progress(total=total_frames)

    pitches = []

    for frame in frames:
        # silence rates are recorded for every frame, including skipped ones
        pool.add(prefix + 'silence_rate_60dB', is_silent_threshold(frame, -60))
        pool.add(prefix + 'silence_rate_30dB', is_silent_threshold(frame, -30))
        pool.add(prefix + 'silence_rate_20dB', is_silent_threshold(frame, -20))

        # NOTE(review): this is the only use of `es` in a function that
        # otherwise uses `ess` - confirm both names are bound in this module.
        if options['skipSilence'] and es.isSilent(frame):
            continue

        # temporal descriptors
        pool.add(prefix + 'zerocrossingrate', zerocrossingrate(frame))
        (frame_lpc, _frame_lpc_reflection) = lpc(frame)
        pool.add(prefix + 'temporal_lpc', frame_lpc)

        frame_windowed = window(frame)
        frame_spectrum = spectrum(frame_windowed)

        # spectrum-based descriptors
        power_spectrum = frame_spectrum ** 2
        pool.add(prefix + 'spectral_centroid', centroid(power_spectrum))
        pool.add(prefix + 'spectral_decrease', decrease(power_spectrum))
        pool.add(prefix + 'spectral_energy', energy(frame_spectrum))
        pool.add(prefix + 'spectral_energyband_low', energyband_bass(frame_spectrum))
        pool.add(prefix + 'spectral_energyband_middle_low', energyband_middle_low(frame_spectrum))
        pool.add(prefix + 'spectral_energyband_middle_high', energyband_middle_high(frame_spectrum))
        pool.add(prefix + 'spectral_energyband_high', energyband_high(frame_spectrum))
        pool.add(prefix + 'hfc', hfc(frame_spectrum))
        pool.add(prefix + 'spectral_rms', rms(frame_spectrum))
        pool.add(prefix + 'spectral_flux', flux(frame_spectrum))
        pool.add(prefix + 'spectral_rolloff', rolloff(frame_spectrum))
        pool.add(prefix + 'spectral_strongpeak', strongpeak(frame_spectrum))

        # central moments descriptors
        frame_centralmoments = centralmoments(power_spectrum)
        (frame_spread, frame_skewness, frame_kurtosis) = distributionshape(frame_centralmoments)
        pool.add(prefix + 'spectral_kurtosis', frame_kurtosis)
        pool.add(prefix + 'spectral_spread', frame_spread)
        pool.add(prefix + 'spectral_skewness', frame_skewness)

        # dissonance
        (frame_frequencies, frame_magnitudes) = spectral_peaks(frame_spectrum)
        frame_dissonance = dissonance(frame_frequencies, frame_magnitudes)
        pool.add(prefix + 'dissonance', frame_dissonance)

        # mfcc (the mel-band output is not stored)
        (_frame_melbands, frame_mfcc) = mfcc(frame_spectrum)
        pool.add(prefix + 'mfcc', frame_mfcc)

        # spectral contrast (only the coefficients are stored)
        (sc_coeffs, _sc_valleys) = spectral_contrast(frame_spectrum)
        pool.add(prefix + 'spectral_contrast', sc_coeffs)

        # barkbands-based descriptors
        frame_barkbands = barkbands(frame_spectrum)
        pool.add(prefix + 'barkbands', frame_barkbands)
        pool.add(prefix + 'spectral_crest', crest(frame_barkbands))
        pool.add(prefix + 'spectral_flatness_db', flatnessdb(frame_barkbands))
        barkbands_centralmoments = ess.CentralMoments(range=len(frame_barkbands) - 1)
        (barkbands_spread, barkbands_skewness, barkbands_kurtosis) = distributionshape(
            barkbands_centralmoments(frame_barkbands))
        pool.add(prefix + 'barkbands_spread', barkbands_spread)
        pool.add(prefix + 'barkbands_skewness', barkbands_skewness)
        pool.add(prefix + 'barkbands_kurtosis', barkbands_kurtosis)

        # pitch descriptors
        frame_pitch, frame_pitch_confidence = pitch_detection(frame_spectrum)
        if frame_pitch > 0 and frame_pitch <= 20000.:
            pool.add(prefix + 'pitch', frame_pitch)
        # NOTE(review): every frame's pitch is recorded, not only in-range
        # ones, so out-of-range values feed the histogram's "unknown" count.
        # The source's indentation was lost; confirm against the original file.
        pitches.append(frame_pitch)
        pool.add(prefix + 'pitch_instantaneous_confidence', frame_pitch_confidence)

        # the last spectrum bin is dropped before computing the salience
        frame_pitch_salience = pitch_salience(frame_spectrum[:-1])
        pool.add(prefix + 'pitch_salience', frame_pitch_salience)

        # spectral complexity
        pool.add(prefix + 'spectral_complexity', spectral_complexity(frame_spectrum))

        # display of progress report
        progress.update(n_frames)
        n_frames += 1

    # if there is no 'zerocrossingrate' it means that this is a silent file
    if 'zerocrossingrate' not in descriptorNames(pool.descriptorNames(), namespace):
        raise ess.EssentiaError('This is a silent file!')

    # build pitch value histogram
    midipitchhist = _midi_pitch_histogram(pitches)
    pool.add(prefix + 'spectral_pitch_histogram', midipitchhist)

    pitch_centralmoments = ess.CentralMoments(range=len(midipitchhist) - 1)
    # only the spread of the pitch histogram is stored
    (pitch_histogram_spread, _pitch_histogram_skewness, _pitch_histogram_kurtosis) = distributionshape(
        pitch_centralmoments(midipitchhist))
    pool.add(prefix + 'spectral_pitch_histogram_spread', pitch_histogram_spread)

    progress.finish()
import numpy as np
import matplotlib.pyplot as plt
import essentia.standard as ess

# Analysis settings: M is used as window/frame size, N as spectrum size,
# H as hop size and fs as the loader's sample rate.
M = 1024
N = 1024
H = 512
fs = 44100

# Algorithm instances
spectrum = ess.Spectrum(size=N)
window = ess.Windowing(size=M, type='hann')
flux = ess.Flux()
onsetDetection = ess.OnsetDetection(method='hfc')

# Input signal
x = ess.MonoLoader(filename='../../../sounds/speech-male.wav', sampleRate=fs)()

# Collect one flux value and one onset-detection value per frame.
fluxes = []
onsetDetections = []
for frame in ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True):
    mX = spectrum(window(frame))

    flux_val = flux(mX)
    fluxes.append(flux_val)

    # the same spectrum is passed for both arguments of OnsetDetection
    onsetDetection_val = onsetDetection(mX, mX)
    onsetDetections.append(onsetDetection_val)

onsetDetections = np.array(onsetDetections)
fluxes = np.array(fluxes)

# First of two stacked subplots: the waveform against time in seconds.
time_axis = np.arange(x.size) / float(fs)
duration = x.size / float(fs)

plt.figure(1, figsize=(9.5, 7))
plt.subplot(2, 1, 1)
plt.plot(time_axis, x)
plt.axis([0, duration, min(x), max(x)])
plt.ylabel('amplitude')
def compute_features(complete_path):
    """Extract mean frame-level descriptors for every ``.wav`` file in a folder.

    Each file is loaded as mono at 44100 Hz and cut into frames of 1024
    samples with a hop of 512. The per-frame descriptors are aggregated with
    their mean, giving one row per file.

    Args:
        complete_path: directory to scan; only names ending in ``".wav"`` are
            processed. A trailing path separator is optional.

    Returns:
        A tuple ``(features_df, meta_df)`` of pandas DataFrames with one row
        per processed file. ``features_df`` has the columns ``centroid``,
        ``crest``, ``roll off``, ``strong peak``, ``rms``, ``energy``,
        ``flux`` and ``zcr``; ``meta_df`` has ``duration`` and ``filename``.
        Both are empty, but carry these columns, when no ``.wav`` file is
        found.
    """
    feature_columns = ['centroid', 'crest', 'roll off', 'strong peak',
                       'rms', 'energy', 'flux', 'zcr']
    result = []
    meta_result = []

    for file_name in os.listdir(complete_path):
        if not file_name.endswith(".wav"):
            continue

        # os.path.join also works when complete_path has no trailing separator
        file_path = os.path.join(complete_path, file_name)

        # load our audio into an array
        audio = es.MonoLoader(filename=file_path, sampleRate=44100)()

        # Create the pool and the necessary algorithms. They are built per
        # file so that no algorithm instance is shared between files.
        pool = essentia.Pool()
        window = es.Windowing()
        energy = es.Energy()
        spectrum = es.Spectrum()
        centroid = es.Centroid(range=22050)
        rolloff = es.RollOff()
        crest = es.Crest()
        speak = es.StrongPeak()
        rmse = es.RMS()
        flux = es.Flux()
        zerocrossingrate = es.ZeroCrossingRate()

        # only the tag pool and the duration are used from the reader output
        meta = es.MetadataReader(filename=file_path, failOnError=True)()
        pool_meta, duration, _bitrate, _samplerate, _channels = meta[7:]

        # compute the descriptors for all frames in our audio and add them to the pool
        for frame in es.FrameGenerator(audio, frameSize=1024, hopSize=512):
            frame_spectrum = spectrum(window(frame))

            pool.add('spectral.centroid', centroid(frame_spectrum))
            pool.add('spectral crest', crest(frame_spectrum))
            pool.add('spectral rolloff', rolloff(frame_spectrum))
            pool.add('strong peak', speak(frame_spectrum))
            pool.add('RMS', rmse(frame_spectrum))
            pool.add('spectral_energy', energy(frame_spectrum))
            pool.add('spectral flux', flux(frame_spectrum))
            # Zero-crossing rate is a time-domain measure, so it is computed
            # on the raw frame rather than on its magnitude spectrum.
            pool.add('zero crossing rate', zerocrossingrate(frame))

        # aggregate the results (mean over frames)
        aggrpool = es.PoolAggregator(defaultStats=['mean'])(pool)

        pool_meta.set("duration", duration)
        pool_meta.set("filename", os.path.relpath(file_name))

        # write pools to lists
        result.append(pool_to_array(aggrpool))
        meta_result.append(pool_to_array(pool_meta))

    if not result:
        # Assigning column labels to a DataFrame built from zero records
        # raises ValueError, so build the empty frames with their labels.
        return (pd.DataFrame(columns=feature_columns),
                pd.DataFrame(columns=['duration', 'filename']))

    # NOTE(review): labels are assigned by position, so they are only correct
    # if pool_to_array() returns values in this order - confirm.
    features_df = pd.DataFrame.from_records(result)
    features_df.columns = feature_columns

    meta_df = pd.DataFrame.from_records(meta_result)
    meta_df.columns = ['duration', 'filename', 'metadata.tags.comment']
    del meta_df['metadata.tags.comment']

    return features_df, meta_df