def func_centr(audio, rate):
    """Return the centroid of the time-averaged magnitude spectrum of *audio*.

    The signal is cut into 1024-sample frames (hop 450, starting at sample
    zero), each frame is Hamming-windowed and transformed to a spectrum,
    and the spectra are averaged over time.  The centroid of that mean
    spectrum is then scaled by ``rate / 1024``.

    NOTE(review): the centroid is normalised with ``range=len(spec)``
    before the ``rate / 1024`` scaling; if the intent is a frequency in Hz,
    ``len(spec) - 1`` may have been meant -- confirm against the Essentia
    Centroid documentation.
    """
    frames = FrameGenerator(audio, frameSize=1024, hopSize=450,
                            startFromZero=True)
    spectra = [Spectrum()(Windowing(type='hamming')(frame))
               for frame in frames]
    mean_spectrum = np.array(spectra).mean(axis=0)
    centroid_value = estd.Centroid(range=len(mean_spectrum))(mean_spectrum)
    return centroid_value * rate / 1024.
def spectralCentroid(audio, params):
    """Compute the spectral centroid of every frame of *audio*.

    ``params`` is unpacked as ``(hopSize, frameSize, windowType)``.
    The centroid is normalised to the fixed range ``int(44100/2)``.

    Returns a tuple ``(centroids, hopSize)`` where ``centroids`` is a numpy
    array with one value per frame.
    """
    hopSize, frameSize, wtype = params
    apply_window = Windowing(type=wtype)
    to_spectrum = Spectrum()
    to_centroid = ess.Centroid(range=int(44100/2))
    frames = ess.FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize)
    centroids = [to_centroid(to_spectrum(apply_window(frame)))
                 for frame in frames]
    return np.asarray(centroids), hopSize
def analyze_misc(filename, segment_duration=20):
    """Analyse the time-centred segment of an audio file.

    Replay gain and total duration (``len(audio) / 44100``) are measured on
    the whole file.  The segment of ``segment_duration`` seconds centred in
    time is then reloaded with that replay gain applied and analysed with
    2048-sample frames and a hop of 1024.

    Returns an ``essentia.Pool`` with the per-frame descriptors ``rms``,
    ``rms_spectrum``, ``hfc``, ``spectral_centroid`` and ``zcr``, plus
    ``ebu_momentary`` computed on the resampled stereo segment.

    Raises ``ValueError`` when the requested segment does not fit inside
    the audio.
    """
    # Whole-file pass: replay gain and duration.
    full_audio = es.MonoLoader(filename=filename)()
    replaygain = es.ReplayGain()(full_audio)

    total_seconds = len(full_audio) / 44100
    segment_start = (total_seconds - segment_duration) / 2
    segment_end = segment_start + segment_duration
    if segment_start < 0 or segment_end > total_seconds:
        raise ValueError(
            'Segment duration is larger than the input audio duration')

    loader = es.EasyLoader(filename=filename, replayGain=replaygain,
                           startTime=segment_start, endTime=segment_end)

    windowing = es.Windowing(type='blackmanharris62')
    spectrum = es.Spectrum()
    powerspectrum = es.PowerSpectrum()  # instantiated but not used below
    centroid = es.Centroid()
    zcr = es.ZeroCrossingRate()
    rms = es.RMS()
    hfc = es.HFC()

    pool = essentia.Pool()

    segment = loader()
    for frame in es.FrameGenerator(segment, frameSize=2048, hopSize=1024):
        magnitudes = spectrum(windowing(frame))
        pool.add('rms', rms(frame))
        pool.add('rms_spectrum', rms(magnitudes))
        pool.add('hfc', hfc(magnitudes))
        pool.add('spectral_centroid', centroid(magnitudes))
        pool.add('zcr', zcr(frame))

    # Stereo pass for loudness.  There is no stereo resampler, so the
    # channels are split, resampled one by one and merged again.
    stereo, sr, _, _, _, _ = es.AudioLoader(filename=filename)()
    left, right = es.StereoDemuxer()(stereo)
    resampler = es.Resample(inputSampleRate=sr, outputSampleRate=44100)
    left = resampler(left)
    right = resampler(right)
    stereo = es.StereoMuxer()(left, right)
    stereo = es.StereoTrimmer(startTime=segment_start,
                              endTime=segment_end)(stereo)

    ebu_momentary, _, _, _ = es.LoudnessEBUR128(hopSize=1024 / 44100,
                                                startAtZero=True)(stereo)
    pool.set('ebu_momentary', ebu_momentary)

    return pool
def feature_extractor_standard(audio_in, frameSize, hopSize, aggLen):
    """Extract frame-wise features and aggregate them over blocks of frames.

    Per frame the function computes MFCCs, spectral centroid, spectral
    flatness and the pitch confidence reported by PitchYinFFT.  The
    per-frame values are then summarised with mean and variance over
    consecutive, non-overlapping blocks of ``aggLen`` frames.

    Parameters:
        audio_in: audio signal handed to ``es.FrameGenerator``.
        frameSize: analysis frame size.
        hopSize: hop between consecutive frames.
        aggLen: number of frames per aggregation block.

    Returns:
        A 2-D numpy array with one row per block and the columns
        ``[mean MFCC, var MFCC, mean centroid, var centroid,
        mean flatness, var flatness, mean pitch conf, var pitch conf]``.
    """
    # Algorithm objects.
    win = es.Windowing()
    spec = es.Spectrum()
    centroid = es.Centroid()
    flatness = es.Flatness()
    mfcc = es.MFCC(lowFrequencyBound=40)
    pitchYin = es.PitchYinFFT()

    # Frame-by-frame features.
    mfcc_ftrsArray = []
    sCentroidArray = []
    sFlatnessArray = []
    pConfArray = []
    for frame in es.FrameGenerator(audio_in, frameSize=frameSize,
                                   hopSize=hopSize):
        spectrum = spec(win(frame))
        _, mfcc_ftrs = mfcc(spectrum)
        sCentroid = centroid(spectrum)
        sFlatness = flatness(spectrum)
        _, pitchConf = pitchYin(spectrum)
        mfcc_ftrsArray.append(mfcc_ftrs)
        sCentroidArray.append(sCentroid)
        sFlatnessArray.append(sFlatness)
        pConfArray.append(pitchConf)

    # ``range`` instead of the Python-2-only ``xrange``: identical block
    # starts on Python 2, and no NameError on Python 3.
    # NOTE(review): the stop value ``len - aggLen`` skips the final block
    # when the frame count is an exact multiple of aggLen; kept as-is so
    # the number of output rows does not change -- confirm this is intended.
    starts = range(0, len(mfcc_ftrsArray) - aggLen, aggLen)

    def column(values):
        """Turn a flat list of scalars into an (n, 1) column."""
        return np.transpose(np.array(values, ndmin=2))

    meanMFCC = [np.mean(mfcc_ftrsArray[ii:ii + aggLen], axis=0)
                for ii in starts]
    varMFCC = [np.var(mfcc_ftrsArray[ii:ii + aggLen], axis=0)
               for ii in starts]

    scalar_columns = []
    for series in (sCentroidArray, sFlatnessArray, pConfArray):
        scalar_columns.append(
            column([np.mean(series[ii:ii + aggLen]) for ii in starts]))
        scalar_columns.append(
            column([np.var(series[ii:ii + aggLen]) for ii in starts]))

    return np.concatenate(
        [np.array(meanMFCC), np.array(varMFCC)] + scalar_columns, axis=1)
def _key_fnc(
    sample: NDArray[Float32],
    frequency_rate: int,
    windowfnc: Window,
    key_type: KeyFunction,
):
    """
    Dispatch to the key function selected by ``key_type`` and return its
    result for ``sample``.

    Every branch builds fresh Essentia algorithm objects, using
    ``windowfnc.value`` as the window type.  For the spectral centroid,
    ``frequency_rate`` is passed as the centroid range and should equal
    half of the sample rate.

    Raises ``ValueError`` when ``key_type`` matches none of the supported
    key functions.
    """

    def melbands():
        # Shared by MELBANDS and MELBANDS_LOG.
        return _get_melbands(
            sample,
            estd.MFCC(),
            estd.Spectrum(),
            estd.Windowing(type=windowfnc.value),
        )

    if key_type == KeyFunction.CENTROID:
        return _get_centroid(
            sample,
            estd.Centroid(range=frequency_rate),
            estd.Spectrum(),
            estd.Windowing(type=windowfnc.value),
        )
    elif key_type == KeyFunction.MAX:
        return _get_max(
            sample,
            estd.Spectrum(),
            estd.Windowing(type=windowfnc.value),
        )
    elif key_type == KeyFunction.MFCC:
        return _get_mfcc(
            sample,
            estd.MFCC(),
            estd.Spectrum(),
            estd.Windowing(type=windowfnc.value),
        )
    elif key_type == KeyFunction.MELBANDS:
        return melbands()
    elif key_type == KeyFunction.MELBANDS_LOG:
        return estd.UnaryOperator(type="log")(melbands())

    raise ValueError("Keyfunction is not defined!")
def _get_features(audio_path):
    """Return the mean normalised spectral centroid of an audio file.

    The file is loaded in mono at the module-level sample rate ``fs`` and
    analysed with Hann-windowed frames of ``M`` samples, hop ``H`` and FFT
    size ``N``.  The centroid uses ``range=1``.

    Returns ``(headers, features)``: ``['mean_centroid']`` and a
    one-element list holding the mean centroid over all frames.
    """
    to_spectrum = ess.Spectrum(size=N)
    apply_window = ess.Windowing(size=M, type='hann')
    to_centroid = ess.Centroid(range=1)

    x = ess.MonoLoader(filename=audio_path, sampleRate=fs)()

    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H,
                                startFromZero=True)
    spectrumcentroid = np.array(
        [to_centroid(to_spectrum(apply_window(frame))) for frame in frames])

    headers = ['mean_centroid']
    features = [np.mean(spectrumcentroid)]  # [np.mean(centroid)]

    # plt.figure(1, figsize=(9.5, 7))
    # plt.subplot(2,1,1)
    # plt.plot(np.arange(x.size)/float(fs), x, 'b')
    # plt.axis([0, x.size/float(fs), min(x), max(x)])
    # plt.ylabel('amplitude')
    # plt.title('x')
    # plt.subplot(2,1,2)
    # plt.plot(spectrumcentroid)
    # plt.ylabel('frequency (Hz)')
    # plt.title('time (sec)')
    # plt.autoscale(tight=True)
    # plt.tight_layout()
    # plt.savefig('centroid.png')
    # plt.show()

    return headers, features

# print get_features('../processed_data/ed/ED003/PS/PS_LLL_1.wav')
# print get_features('../../sms-tools/workspace/Tabla_test/ED003_PS_LLL_1.wav')
# print get_features('../../sms-tools/workspace/Tabla_test/sine500hz.wav')
def sfxPitch(pool, namespace=''):
    """Derive pitch-contour descriptors and store them in *pool*.

    Reads ``<namespace>.lowlevel.pitch`` (or ``lowlevel.pitch`` when no
    namespace is given), runs it through a streaming network computing
    MaxToTotal, MinToTotal and AfterMaxToBeforeMaxEnergyRatio, and writes
    the results under the matching ``sfx.`` prefix.  The centroid of the
    pitch sequence, normalised to ``len(pitch) - 1``, is stored as
    ``pitch_centroid``.
    """
    prefix = namespace + '.' if namespace else ''
    sfxspace = prefix + 'sfx.'
    llspace = prefix + 'lowlevel.'

    pitch = pool[llspace + 'pitch']

    # Streaming network: a single source feeding three descriptors.
    source = streaming.VectorInput(pitch)
    max_to_total = streaming.MaxToTotal()
    min_to_total = streaming.MinToTotal()
    after_max_ratio = streaming.AfterMaxToBeforeMaxEnergyRatio()

    source.data >> max_to_total.envelope
    source.data >> min_to_total.envelope
    source.data >> after_max_ratio.pitch

    max_to_total.maxToTotal >> (pool, sfxspace + 'pitch_max_to_total')
    min_to_total.minToTotal >> (pool, sfxspace + 'pitch_min_to_total')
    after_max_ratio.afterMaxToBeforeMaxEnergyRatio >> (
        pool, sfxspace + 'pitch_after_max_to_before_max_energy_ratio')

    essentia.run(source)

    pitch_centroid = standard.Centroid(range=len(pitch) - 1)(pitch)
    pool.set(sfxspace + 'pitch_centroid', pitch_centroid)
import essentia
import essentia.standard as es
import numpy as np
import scipy.signal

# Sample rate constant.
FS = 44100

# Shared analysis front end: window, spectrum and generic statistics.
w = es.Windowing(type='hann')
spectrum = es.Spectrum()
centroid = es.Centroid()
moments = es.CentralMoments()

# Temporal descriptors
power = es.InstantPower()
log_attack_time = es.LogAttackTime()
effective_duration = es.EffectiveDuration()
auto_correlation = es.AutoCorrelation()
zero_crossing_rate = es.ZeroCrossingRate()

# Spectral descriptors
peak_freq = es.MaxMagFreq()
roll_off = es.RollOff()
flux = es.Flux()
flatness = es.Flatness()

# Harmonic descriptors
pitch = es.PitchYin(frameSize=1024)
spectral_peaks = es.SpectralPeaks(minFrequency=1e-5)
harmonic_peaks = es.HarmonicPeaks()
inharmonicity = es.Inharmonicity()
oer = es.OddToEvenHarmonicEnergyRatio()
def _midi_pitch_histogram(pitches):
    """Return a 128-bin, normalised MIDI-note histogram of *pitches* (Hz).

    Bin 0 holds the share of values that could not be mapped to a note
    (non-positive, above 12600 Hz, or below MIDI note 0).  When no pitch
    maps to a note the histogram is all zeros except ``hist[0] == 1``.
    """
    from math import log

    from numpy import bincount

    midi_notes = []
    unknown = 0
    for freq in pitches:
        note = -1
        if 0. < freq <= 12600:
            # Hz -> MIDI note number.  numpy.bincount only accepts
            # non-negative integers, so the value is truncated to an int
            # (the cast older NumPy versions applied implicitly).
            note = int(12 * (log(freq / 6.875) / 0.69314718055995) - 3.)
        if note < 0:
            unknown += 1
        else:
            midi_notes.append(note)

    if not midi_notes:
        histogram = [0.] * 128
        histogram[0] = 1.
        return histogram

    counts = bincount(midi_notes)
    # Bin 0 is repurposed to carry the number of pruned values.
    counts[0] = unknown
    # Normalise; the total is computed once instead of once per bin.
    total = float(counts.sum())
    histogram = [val / total for val in counts]
    # Zero-pad up to the full MIDI range.
    histogram.extend([0.0] * (128 - len(histogram)))
    return histogram


def compute(audio, pool, options):
    """Compute frame-wise low-level descriptors of *audio* into *pool*.

    ``options`` must provide ``sampleRate``, ``frameSize``, ``hopSize``,
    ``windowType`` and ``skipSilence``.  Every descriptor is added to
    ``pool`` under the module-level ``namespace`` prefix.  After the frame
    loop a MIDI pitch histogram and its spread are added as well.

    Raises ``ess.EssentiaError`` when no ``zerocrossingrate`` value was
    stored, i.e. every frame was skipped as silent.
    """

    def key(name):
        # Resolved lazily so ``namespace`` is looked up at each use.
        return namespace + '.' + name

    # analysis parameters
    sampleRate = options['sampleRate']
    frameSize = options['frameSize']
    hopSize = options['hopSize']
    windowType = options['windowType']

    # temporal descriptors
    lpc = ess.LPC(order=10, type='warped', sampleRate=sampleRate)
    zerocrossingrate = ess.ZeroCrossingRate()

    # frame algorithms
    frames = ess.FrameGenerator(audio=audio, frameSize=frameSize,
                                hopSize=hopSize)
    window = ess.Windowing(size=frameSize, zeroPadding=0, type=windowType)
    spectrum = ess.Spectrum(size=frameSize)

    # spectral algorithms
    barkbands = ess.BarkBands(sampleRate=sampleRate)
    centralmoments = ess.CentralMoments()
    crest = ess.Crest()
    centroid = ess.Centroid()
    decrease = ess.Decrease()
    spectral_contrast = ess.SpectralContrast(frameSize=frameSize,
                                             sampleRate=sampleRate,
                                             numberBands=6,
                                             lowFrequencyBound=20,
                                             highFrequencyBound=11000,
                                             neighbourRatio=0.4,
                                             staticDistribution=0.15)
    distributionshape = ess.DistributionShape()
    energy = ess.Energy()
    # energyband_bass, energyband_middle and energyband_high parameters
    # come from "standard" hi-fi equalizers
    energyband_bass = ess.EnergyBand(startCutoffFrequency=20.0,
                                     stopCutoffFrequency=150.0,
                                     sampleRate=sampleRate)
    energyband_middle_low = ess.EnergyBand(startCutoffFrequency=150.0,
                                           stopCutoffFrequency=800.0,
                                           sampleRate=sampleRate)
    energyband_middle_high = ess.EnergyBand(startCutoffFrequency=800.0,
                                            stopCutoffFrequency=4000.0,
                                            sampleRate=sampleRate)
    energyband_high = ess.EnergyBand(startCutoffFrequency=4000.0,
                                     stopCutoffFrequency=20000.0,
                                     sampleRate=sampleRate)
    flatnessdb = ess.FlatnessDB()
    flux = ess.Flux()
    hfc = ess.HFC()
    mfcc = ess.MFCC()
    rolloff = ess.RollOff()
    rms = ess.RMS()
    strongpeak = ess.StrongPeak()

    # pitch algorithms
    pitch_detection = ess.PitchYinFFT(frameSize=frameSize,
                                      sampleRate=sampleRate)
    pitch_salience = ess.PitchSalience()

    # dissonance
    spectral_peaks = ess.SpectralPeaks(sampleRate=sampleRate,
                                       orderBy='frequency')
    dissonance = ess.Dissonance()

    # spectral complexity
    # magnitudeThreshold = 0.005 is hardcoded for a "blackmanharris62" frame
    spectral_complexity = ess.SpectralComplexity(magnitudeThreshold=0.005)

    INFO('Computing Low-Level descriptors...')

    # used for a nice progress display
    total_frames = frames.num_frames()
    n_frames = 0
    progress = Progress(total=total_frames)

    pitches = []

    for frame in frames:
        # silence rate
        pool.add(key('silence_rate_60dB'), is_silent_threshold(frame, -60))
        pool.add(key('silence_rate_30dB'), is_silent_threshold(frame, -30))
        pool.add(key('silence_rate_20dB'), is_silent_threshold(frame, -20))

        if options['skipSilence'] and es.isSilent(frame):
            continue

        # temporal descriptors
        pool.add(key('zerocrossingrate'), zerocrossingrate(frame))
        (frame_lpc, frame_lpc_reflection) = lpc(frame)
        pool.add(key('temporal_lpc'), frame_lpc)

        frame_windowed = window(frame)
        frame_spectrum = spectrum(frame_windowed)

        # spectrum-based descriptors
        power_spectrum = frame_spectrum ** 2
        pool.add(key('spectral_centroid'), centroid(power_spectrum))
        pool.add(key('spectral_decrease'), decrease(power_spectrum))
        pool.add(key('spectral_energy'), energy(frame_spectrum))
        pool.add(key('spectral_energyband_low'),
                 energyband_bass(frame_spectrum))
        pool.add(key('spectral_energyband_middle_low'),
                 energyband_middle_low(frame_spectrum))
        pool.add(key('spectral_energyband_middle_high'),
                 energyband_middle_high(frame_spectrum))
        pool.add(key('spectral_energyband_high'),
                 energyband_high(frame_spectrum))
        pool.add(key('hfc'), hfc(frame_spectrum))
        pool.add(key('spectral_rms'), rms(frame_spectrum))
        pool.add(key('spectral_flux'), flux(frame_spectrum))
        pool.add(key('spectral_rolloff'), rolloff(frame_spectrum))
        pool.add(key('spectral_strongpeak'), strongpeak(frame_spectrum))

        # central moments descriptors
        frame_centralmoments = centralmoments(power_spectrum)
        (frame_spread, frame_skewness,
         frame_kurtosis) = distributionshape(frame_centralmoments)
        pool.add(key('spectral_kurtosis'), frame_kurtosis)
        pool.add(key('spectral_spread'), frame_spread)
        pool.add(key('spectral_skewness'), frame_skewness)

        # dissonance
        (frame_frequencies,
         frame_magnitudes) = spectral_peaks(frame_spectrum)
        frame_dissonance = dissonance(frame_frequencies, frame_magnitudes)
        pool.add(key('dissonance'), frame_dissonance)

        # mfcc
        (frame_melbands, frame_mfcc) = mfcc(frame_spectrum)
        pool.add(key('mfcc'), frame_mfcc)

        # spectral contrast
        (sc_coeffs, sc_valleys) = spectral_contrast(frame_spectrum)
        pool.add(key('spectral_contrast'), sc_coeffs)

        # barkbands-based descriptors
        frame_barkbands = barkbands(frame_spectrum)
        pool.add(key('barkbands'), frame_barkbands)
        pool.add(key('spectral_crest'), crest(frame_barkbands))
        pool.add(key('spectral_flatness_db'), flatnessdb(frame_barkbands))
        barkbands_centralmoments = ess.CentralMoments(
            range=len(frame_barkbands) - 1)
        (barkbands_spread, barkbands_skewness,
         barkbands_kurtosis) = distributionshape(
            barkbands_centralmoments(frame_barkbands))
        pool.add(key('barkbands_spread'), barkbands_spread)
        pool.add(key('barkbands_skewness'), barkbands_skewness)
        pool.add(key('barkbands_kurtosis'), barkbands_kurtosis)

        # pitch descriptors
        frame_pitch, frame_pitch_confidence = pitch_detection(frame_spectrum)
        if frame_pitch > 0 and frame_pitch <= 20000.:
            pool.add(key('pitch'), frame_pitch)
        # NOTE(review): the source's indentation was lost; the pitch is
        # assumed to be collected for every analysed frame because the
        # histogram below counts non-positive values separately -- confirm.
        pitches.append(frame_pitch)
        pool.add(key('pitch_instantaneous_confidence'),
                 frame_pitch_confidence)

        frame_pitch_salience = pitch_salience(frame_spectrum[:-1])
        pool.add(key('pitch_salience'), frame_pitch_salience)

        # spectral complexity
        pool.add(key('spectral_complexity'),
                 spectral_complexity(frame_spectrum))

        # display of progress report
        progress.update(n_frames)
        n_frames += 1

    # if no 'zerocrossingrate' it means that this is a silent file
    if 'zerocrossingrate' not in descriptorNames(pool.descriptorNames(),
                                                 namespace):
        raise ess.EssentiaError('This is a silent file!')

    # build pitch value histogram (Hz -> MIDI notes, 128 bins)
    midipitchhist = _midi_pitch_histogram(pitches)
    pool.add(key('spectral_pitch_histogram'), midipitchhist)

    pitch_centralmoments = ess.CentralMoments(range=len(midipitchhist) - 1)
    (pitch_histogram_spread, pitch_histogram_skewness,
     pitch_histogram_kurtosis) = distributionshape(
        pitch_centralmoments(midipitchhist))
    pool.add(key('spectral_pitch_histogram_spread'), pitch_histogram_spread)

    progress.finish()
def temporal_centroid(self, audio):
    """Return the centroid of the envelope of *audio*.

    The centroid is normalised to the range ``(len(audio) - 1) / 44100``,
    presumably the signal duration in seconds at 44.1 kHz -- confirm the
    sample rate with the caller.
    """
    span = float(len(audio) - 1) / 44100
    envelope_algo = es.Envelope()
    centroid_algo = es.Centroid(range=span)
    return centroid_algo(envelope_algo(audio))
import numpy as np
import matplotlib.pyplot as plt
import essentia.standard as ess

# Analysis settings: window size, FFT size, hop size and sample rate.
M = 1024
N = 1024
H = 512
fs = 44100

spectrum = ess.Spectrum(size=N)
window = ess.Windowing(size=M, type='hann')
# The centroid is normalised to fs / 2.
centroid = ess.Centroid(range=fs / 2.0)

x = ess.MonoLoader(filename='../../../sounds/speech-male.wav',
                   sampleRate=fs)()

# One centroid value per analysis frame.
centroids = []
for frame in ess.FrameGenerator(x, frameSize=M, hopSize=H,
                                startFromZero=True):
    mX = spectrum(window(frame))
    centroid_val = centroid(mX)
    centroids.append(centroid_val)
centroids = np.array(centroids)

plt.figure(1, figsize=(9.5, 5))

# Top panel: the waveform over time.
plt.subplot(2, 1, 1)
plt.plot(np.arange(x.size) / float(fs), x)
plt.axis([0, x.size / float(fs), min(x), max(x)])
plt.ylabel('amplitude')
plt.title('x (speech-male.wav)')

# Bottom panel: the centroid track, one point per hop.
plt.subplot(2, 1, 2)
frmTime = H * np.arange(centroids.size) / float(fs)
plt.plot(frmTime, centroids, 'g', lw=1.5)
# This processing (top freq peaks) only works for the single-speaker case...
# better features are needed for multiple speakers!
# MFCC (or deep NN / automatic feature extraction) could be interesting.

# Each row of ``data`` is treated as a spectrum; this is the frame size a
# spectrum of that length corresponds to.
inputSize = (data.shape[1] - 1) * 2
M = 1024
N = 1024
H = 256
fs = 8000

spectrum = ess.Spectrum(size=N)
window = ess.Windowing(size=M, type='hann')
# Floor division keeps ``inputSize`` an integer on Python 3 as well; true
# division would hand a float to an integer parameter.
mfcc = ess.MFCC(numberCoefficients=7, inputSize=inputSize // 2 + 1)
sc = ess.SpectralContrast(frameSize=inputSize)
cent = ess.Centroid()

# Disabled peak-based variant, kept for reference:
# n_dim = 6
# all_obs = np.zeros((data.shape[0], n_dim))
# for r in range(data.shape[0]):
#     #obs = np.zeros((n_dim, 1))
#     _, t = peakfind(data[r, :], n_peaks=n_dim)
#     all_obs[r, :] = t.copy()
# #all_obs = np.atleast_3d(all_obs)

n_dim = 13
all_obs = np.zeros((data.shape[0], n_dim))
for r in range(data.shape[0]):
    mX = essentia.array(data[r, :])
    mfcc_bands, mfcc_coeffs = mfcc(mX)
def reComputeDescriptors(inputAudioFile, outputJsonFile):
    """Recompute descriptors per energy threshold and dump them as JSON.

    The audio is analysed twice.  The first pass measures the energy of
    every frame to find the maximum.  The second pass computes the
    descriptors of each frame and adds them to one pool per entry of
    ``dscr.threshold``, but only to the pools whose threshold does not
    exceed the frame's energy relative to that maximum.  Each pool is then
    aggregated (mean) and merged into a single feature dictionary via
    ``appendFeatures`` under the key ``"ethc" + str(dscr.thresholdSelect[i])``.

    :param inputAudioFile: path of the audio file to analyse
    :param outputJsonFile: path of the JSON file to write
    :return: None
    """
    M = 2048   # frame size
    N = 2048   # FFT size; only referenced by the disabled explicit configs
    H = 1024   # hop size
    fs = 44100
    W = 'blackmanharris62'

    # spectrum = ess.Spectrum(size=N)
    spectrum = ess.Spectrum()
    # window = ess.Windowing(size=M, type=W)
    window = ess.Windowing(type=W)
    # mfcc = ess.MFCC(numberCoefficients=12, inputSize=N/2+1)
    mfcc = ess.MFCC()

    spectral_peaks = ess.SpectralPeaks(minFrequency=1,
                                       maxFrequency=20000,
                                       maxPeaks=100,
                                       sampleRate=fs,
                                       magnitudeThreshold=0,
                                       orderBy="magnitude")
    dissonance = ess.Dissonance()

    # pitch_detection = ess.PitchYinFFT(frameSize=M, sampleRate=fs)
    pitch_detection = ess.PitchYinFFT()
    harmonic_peaks = ess.HarmonicPeaks()
    inharmonicity = ess.Inharmonicity()

    # spectral_contrast = ess.SpectralContrast(sampleRate=fs)
    spectral_contrast = ess.SpectralContrast()
    centroid = ess.Centroid()
    log_attack_time = ess.LogAttackTime()
    hfc = ess.HFC()

    # magnitudeThreshold = 0.005 is hardcoded for a "blackmanharris62"
    # frame, see lowlevel.py
    spectral_complexity = ess.SpectralComplexity(magnitudeThreshold=0.005)

    energy = ess.Energy()

    x = ess.MonoLoader(filename=inputAudioFile, sampleRate=fs)()

    # Pass 1: per-frame energy, needed to normalise against the maximum.
    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H,
                                startFromZero=True)
    E = [energy(frame) for frame in frames]
    E_max = np.max(E)

    # Pass 2: descriptors, routed to the pools of all satisfied thresholds.
    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H,
                                startFromZero=True)
    pools = [(t, es.Pool()) for t in dscr.threshold]

    def add_to(selected, name, value):
        """Add one descriptor value to every selected pool."""
        for target in selected:
            target.add(name, value)

    for index, frame in enumerate(frames):
        # Same frames as in pass 1, so the stored energy is reused.
        eNorm = E[index] / E_max
        threshPools = [pool for t, pool in pools if eNorm >= t]

        mX = spectrum(window(frame))

        mfcc_bands, mfcc_coeffs = mfcc(mX)
        add_to(threshPools, 'lowlevel.mfcc', mfcc_coeffs)
        # add_to(threshPools, 'lowlevel.mfcc_bands', mfcc_bands)

        # Peaks are requested by magnitude and then re-sorted by frequency
        # before being passed to the dissonance / harmonic algorithms.
        pfreq, pmag = spectral_peaks(mX)
        inds = pfreq.argsort()
        pfreq_sorted = pfreq[inds]
        pmag_sorted = pmag[inds]

        diss = dissonance(pfreq_sorted, pmag_sorted)
        add_to(threshPools, 'lowlevel.dissonance', diss)

        pitch, pitch_confidence = pitch_detection(mX)

        phfreq, phmag = harmonic_peaks(pfreq_sorted, pmag_sorted, pitch)
        if len(phfreq) > 1:
            inharm = inharmonicity(phfreq, phmag)
            add_to(threshPools, 'sfx.inharmonicity', inharm)

        sc_coeffs, sc_valleys = spectral_contrast(mX)
        add_to(threshPools, 'lowlevel.spectral_contrast', sc_coeffs)

        c = centroid(mX)
        add_to(threshPools, 'lowlevel.spectral_centroid', c)

        lat = log_attack_time(frame)
        add_to(threshPools, 'sfx.logattacktime', lat)

        h = hfc(mX)
        add_to(threshPools, 'lowlevel.hfc', h)

        spec_complx = spectral_complexity(mX)
        add_to(threshPools, 'lowlevel.spectral_complexity', spec_complx)

    # calc_Mean_Var = ess.PoolAggregator(defaultStats=['mean', 'var'])
    calc_Mean_Var = ess.PoolAggregator(defaultStats=['mean'])
    aggrPools = [calc_Mean_Var(pool) for t, pool in pools]

    features = {}
    for i, aggrPool in enumerate(aggrPools):
        appendFeatures(features, aggrPool,
                       "ethc" + str(dscr.thresholdSelect[i]))

    # Context manager so the output file is flushed and closed.
    with open(outputJsonFile, 'w') as out_file:
        json.dump(features, out_file)
def reComputeDescriptors(inputAudioFile, outputJsonFile):
    """Recompute a set of descriptors for one audio file and dump them as JSON.

    The file is loaded in mono at 44100 Hz and analysed with 2048-sample
    frames (hop 1024, 'blackmanharris62' window).  Per-frame descriptors
    are collected in a pool, aggregated with mean and variance, converted
    with ``makeFeatures`` and written to ``outputJsonFile``.

    :param inputAudioFile: path of the audio file to analyse
    :param outputJsonFile: path of the JSON file to write
    :return: None
    """
    # help(ess.SpectralContrast)

    # Original settings:
    #   M = 1024
    #   N = 1024
    #   H = 512
    #   fs = 44100
    #   W = 'hann'

    # Freesound extractor settings, for reference:
    #   Real sampleRate = 44100;
    #   int frameSize = 2048;
    #   int hopSize = 1024;
    #   int zeroPadding = 0;
    #   string silentFrames ="noise";
    #   string windowType = "blackmanharris62";
    #   // Silence Rate
    #   Real thresholds_dB[] = { -20, -30, -60 };
    #   vector<Real> thresholds(ARRAY_SIZE(thresholds_dB));
    #   for (uint i=0; i<thresholds.size(); i++) {
    #     thresholds[i] = db2lin(thresholds_dB[i]/2.0);
    #   }

    M = 2048   # frame size
    N = 2048   # FFT size; only referenced by the disabled explicit configs
    H = 1024   # hop size
    fs = 44100
    W = 'blackmanharris62'
    # silentFrames = "noise"
    # thresholds_dB = np.array([ -20, -30, -60 ])
    # thresholds = np.power (10.0, thresholds_dB / 20)

    # spectrum = ess.Spectrum(size=N)
    spectrum = ess.Spectrum()
    # window = ess.Windowing(size=M, type=W)
    window = ess.Windowing(type=W)
    # mfcc = ess.MFCC(numberCoefficients=12, inputSize=N/2+1)
    mfcc = ess.MFCC()

    spectral_peaks = ess.SpectralPeaks(minFrequency=1,
                                       maxFrequency=20000,
                                       maxPeaks=100,
                                       sampleRate=fs,
                                       magnitudeThreshold=0,
                                       orderBy="magnitude")
    dissonance = ess.Dissonance()

    # pitch_detection = ess.PitchYinFFT(frameSize=M, sampleRate=fs)
    pitch_detection = ess.PitchYinFFT()
    harmonic_peaks = ess.HarmonicPeaks()
    inharmonicity = ess.Inharmonicity()

    # spectral_contrast = ess.SpectralContrast(sampleRate=fs)
    spectral_contrast = ess.SpectralContrast()
    centroid = ess.Centroid()
    log_attack_time = ess.LogAttackTime()
    hfc = ess.HFC()

    x = ess.MonoLoader(filename=inputAudioFile, sampleRate=fs)()
    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H,
                                startFromZero=True)
    pool = es.Pool()

    for frame in frames:
        mX = spectrum(window(frame))

        mfcc_bands, mfcc_coeffs = mfcc(mX)
        pool.add('lowlevel.mfcc', mfcc_coeffs)
        pool.add('lowlevel.mfcc_bands', mfcc_bands)

        # Peaks are requested by magnitude and then re-sorted by frequency
        # before being passed to the dissonance / harmonic algorithms.
        pfreq, pmag = spectral_peaks(mX)
        inds = pfreq.argsort()
        pfreq_sorted = pfreq[inds]
        pmag_sorted = pmag[inds]

        diss = dissonance(pfreq_sorted, pmag_sorted)
        pool.add('lowlevel.dissonance', diss)

        pitch, pitch_confidence = pitch_detection(mX)

        phfreq, phmag = harmonic_peaks(pfreq_sorted, pmag_sorted, pitch)
        if len(phfreq) > 1:
            inharm = inharmonicity(phfreq, phmag)
            pool.add('sfx.inharmonicity', inharm)

        sc_coeffs, sc_valleys = spectral_contrast(mX)
        pool.add('lowlevel.spectral_contrast', sc_coeffs)

        c = centroid(mX)
        pool.add('lowlevel.spectral_centroid', c)

        lat = log_attack_time(frame)
        pool.add('sfx.logattacktime', lat)

        h = hfc(mX)
        pool.add('lowlevel.hfc', h)

    calc_Mean_Var = ess.PoolAggregator(defaultStats=['mean', 'var'])
    aggrPool = calc_Mean_Var(pool)

    features = makeFeatures(aggrPool)
    # Context manager so the output file is flushed and closed.
    with open(outputJsonFile, 'w') as out_file:
        json.dump(features, out_file)
def compute_features(complete_path):
    """Compute mean spectral/temporal features for every .wav file in a folder.

    Each ``.wav`` file in ``complete_path`` is loaded in mono at 44100 Hz
    and analysed with 1024-sample frames (hop 512).  The per-frame
    descriptors are averaged per file; duration and file name are stored
    alongside the metadata read from the file.

    Returns:
        ``(features_df, meta_df)``: a DataFrame with one row of mean
        features per file, and a DataFrame with ``duration`` and
        ``filename`` per file.
    """
    result = []
    meta_result = []

    for file_name in os.listdir(complete_path):
        if not file_name.endswith(".wav"):
            continue

        # os.path.join also works when complete_path has no trailing
        # separator.
        wav_path = os.path.join(complete_path, file_name)

        # load our audio into an array
        audio = es.MonoLoader(filename=wav_path, sampleRate=44100)()

        # Create the pool and the algorithms.  They are built per file on
        # purpose: fresh instances cannot carry any state over from the
        # previous file.
        pool = essentia.Pool()
        window = es.Windowing()
        energy = es.Energy()
        spectrum = es.Spectrum()
        centroid = es.Centroid(range=22050)
        rolloff = es.RollOff()
        crest = es.Crest()
        speak = es.StrongPeak()
        rmse = es.RMS()
        flux = es.Flux()
        zerocrossingrate = es.ZeroCrossingRate()

        meta = es.MetadataReader(filename=wav_path, failOnError=True)()
        pool_meta, duration = meta[7:9]

        # compute the descriptors for all frames and add them to the pool
        for frame in es.FrameGenerator(audio, frameSize=1024, hopSize=512):
            frame_spectrum = spectrum(window(frame))

            pool.add('spectral.centroid', centroid(frame_spectrum))
            pool.add('spectral crest', crest(frame_spectrum))
            pool.add('spectral rolloff', rolloff(frame_spectrum))
            pool.add('strong peak', speak(frame_spectrum))
            pool.add('RMS', rmse(frame_spectrum))
            pool.add('spectral_energy', energy(frame_spectrum))
            pool.add('spectral flux', flux(frame_spectrum))
            # Zero crossings are counted on the time-domain frame: a
            # magnitude spectrum is non-negative, so it has no sign changes
            # to count.
            pool.add('zero crossing rate', zerocrossingrate(frame))

        # aggregate the results (mean per descriptor)
        aggrpool = es.PoolAggregator(defaultStats=['mean'])(pool)

        pool_meta.set("duration", duration)
        pool_meta.set("filename", os.path.relpath(file_name))

        # write pools to lists
        result.append(pool_to_array(aggrpool))
        meta_result.append(pool_to_array(pool_meta))

    # NOTE(review): the column labels below assume pool_to_array returns
    # the values in exactly this order -- confirm against pool_to_array,
    # which is not visible here.
    features_df = pd.DataFrame.from_records(result)
    features_df.columns = ['centroid', 'crest', 'roll off', 'strong peak',
                           'rms', 'energy', 'flux', 'zcr']

    meta_df = pd.DataFrame.from_records(meta_result)
    meta_df.columns = ['duration', 'filename', 'metadata.tags.comment']
    del meta_df['metadata.tags.comment']

    return features_df, meta_df
def compute(audio, pool, options):
    """Compute SFX descriptors of *audio* and add them to *pool*.

    ``options`` must provide ``sampleRate``, ``frameSize``, ``hopSize``,
    ``windowType`` and ``skipSilence``.  The frame loop adds harmonic
    descriptors (inharmonicity, tristimulus, odd-to-even harmonic energy
    ratio); afterwards descriptors of the whole-file envelope are added.
    All keys are prefixed with the module-level ``namespace``.
    """
    INFO('Computing SFX descriptors...')

    def store(name, value):
        # ``namespace`` is looked up at call time, as in each pool.add.
        pool.add(namespace + '.' + name, value)

    # analysis parameters
    sampleRate = options['sampleRate']
    frameSize = options['frameSize']
    hopSize = options['hopSize']
    windowType = options['windowType']

    # frame algorithms
    frames = ess.FrameGenerator(audio=audio, frameSize=frameSize,
                                hopSize=hopSize)
    window = ess.Windowing(size=frameSize, zeroPadding=0, type=windowType)
    spectrum = ess.Spectrum(size=frameSize)

    # pitch algorithm
    pitch_detection = ess.PitchYinFFT(frameSize=2048, sampleRate=sampleRate)

    # sfx descriptors
    spectral_peaks = ess.SpectralPeaks(sampleRate=sampleRate,
                                       orderBy='frequency')
    harmonic_peaks = ess.HarmonicPeaks()
    inharmonicity = ess.Inharmonicity()
    odd2evenharmonicenergyratio = ess.OddToEvenHarmonicEnergyRatio()
    tristimulus = ess.Tristimulus()

    # used for a nice progress display
    total_frames = frames.num_frames()
    n_frames = 0
    start_of_frame = -frameSize * 0.5
    progress = Progress(total=total_frames)

    for frame in frames:
        # Only consumed by the disabled pool.setCurrentScope call.
        frameScope = [start_of_frame / sampleRate,
                      (start_of_frame + frameSize) / sampleRate]
        # pool.setCurrentScope(frameScope)

        if options['skipSilence'] and es.isSilent(frame):
            total_frames -= 1
            start_of_frame += hopSize
            continue

        frame_spectrum = spectrum(window(frame))

        # pitch descriptors
        frame_pitch, frame_pitch_confidence = pitch_detection(frame_spectrum)

        # spectral peaks based descriptors
        frame_frequencies, frame_magnitudes = spectral_peaks(frame_spectrum)

        # ERROR CORRECTION - hoinx 2015-12
        # Peaks below 1 Hz are dropped before the harmonic analysis.
        errIdx = np.where(frame_frequencies < 1)
        frame_frequencies = np.delete(frame_frequencies, errIdx)
        frame_magnitudes = np.delete(frame_magnitudes, errIdx)

        (frame_harmonic_frequencies,
         frame_harmonic_magnitudes) = harmonic_peaks(frame_frequencies,
                                                     frame_magnitudes,
                                                     frame_pitch)
        # NOTE(review): the source's indentation was lost; all three
        # harmonic descriptors are assumed to sit under this guard --
        # confirm.
        if len(frame_harmonic_frequencies) > 1:
            store('inharmonicity',
                  inharmonicity(frame_harmonic_frequencies,
                                frame_harmonic_magnitudes))
            store('tristimulus',
                  tristimulus(frame_harmonic_frequencies,
                              frame_harmonic_magnitudes))
            store('odd2evenharmonicenergyratio',
                  odd2evenharmonicenergyratio(frame_harmonic_frequencies,
                                              frame_harmonic_magnitudes))

        # display of progress report
        progress.update(n_frames)
        n_frames += 1
        start_of_frame += hopSize

    envelope = ess.Envelope()
    file_envelope = envelope(audio)

    # temporal statistics
    decrease = ess.Decrease()
    store('temporal_decrease', decrease(file_envelope))  # , pool.GlobalScope)

    centralmoments = ess.CentralMoments()
    file_centralmoments = centralmoments(file_envelope)

    distributionshape = ess.DistributionShape()
    (file_spread, file_skewness,
     file_kurtosis) = distributionshape(file_centralmoments)
    store('temporal_spread', file_spread)  # , pool.GlobalScope)
    store('temporal_skewness', file_skewness)  # , pool.GlobalScope)
    store('temporal_kurtosis', file_kurtosis)  # , pool.GlobalScope)

    centroid = ess.Centroid()
    store('temporal_centroid', centroid(file_envelope))  # , pool.GlobalScope)

    # effective duration
    effectiveduration = ess.EffectiveDuration()
    store('effective_duration',
          effectiveduration(file_envelope))  # , pool.GlobalScope)

    # log attack time
    logattacktime = ess.LogAttackTime()
    store('logattacktime', logattacktime(audio))  # , pool.GlobalScope)

    # strong decay
    strongdecay = ess.StrongDecay()
    store('strongdecay', strongdecay(file_envelope))  # , pool.GlobalScope)

    # dynamic profile
    flatness = ess.FlatnessSFX()
    store('flatness', flatness(file_envelope))  # , pool.GlobalScope)

    """
    # onsets number
    onsets_number = len(pool['rhythm.onset_times'][0])
    pool.add(namespace + '.' + 'onsets_number', onsets_number)  # , pool.GlobalScope)
    """

    # morphological descriptors
    max_to_total = ess.MaxToTotal()
    store('max_to_total', max_to_total(file_envelope))  # , pool.GlobalScope)

    tc_to_total = ess.TCToTotal()
    store('tc_to_total', tc_to_total(file_envelope))  # , pool.GlobalScope)

    derivativeSFX = ess.DerivativeSFX()
    (der_av_after_max, max_der_before_max) = derivativeSFX(file_envelope)
    store('der_av_after_max', der_av_after_max)  # , pool.GlobalScope)
    store('max_der_before_max', max_der_before_max)  # , pool.GlobalScope)

    # pitch profile
    """
    pitch = pool['lowlevel.pitch']

    if len(pitch) > 1:
        pool.add(namespace + '.' + 'pitch_max_to_total', max_to_total(pitch))  # , pool.GlobalScope)

        min_to_total = ess.MinToTotal()
        pool.add(namespace + '.' + 'pitch_min_to_total', min_to_total(pitch))  # , pool.GlobalScope)

        pitch_centroid = ess.Centroid(range=len(pitch) - 1)
        pool.add(namespace + '.' + 'pitch_centroid', pitch_centroid(pitch))  # , pool.GlobalScope)

        pitch_after_max_to_before_max_energy_ratio = ess.AfterMaxToBeforeMaxEnergyRatio()
        pool.add(namespace + '.' + 'pitch_after_max_to_before_max_energy_ratio',
                 pitch_after_max_to_before_max_energy_ratio(pitch))  # , pool.GlobalScope)
    else:
        pool.add(namespace + '.' + 'pitch_max_to_total', 0.0)  # , pool.GlobalScope)
        pool.add(namespace + '.' + 'pitch_min_to_total', 0.0)  # , pool.GlobalScope)
        pool.add(namespace + '.' + 'pitch_centroid', 0.0)  # , pool.GlobalScope)
        pool.add(namespace + '.' + 'pitch_after_max_to_before_max_energy_ratio', 0.0)  # , pool.GlobalScope)
    """

    progress.finish()