def energyThresholdAudio(soundfilesList):
    """Trim low-energy audio from both ends of each file, in place.

    Every file is decoded to mono and cut into 2048-sample frames with a
    1024-sample hop. The RMS of each frame is compared with a fixed
    threshold of 0.05, and the audio between the first and the last frame
    at or above that threshold is written back to the same path with the
    "mp3" format. Files with fewer than two such frames are left untouched.

    Args:
        soundfilesList: iterable of audio file paths. Each processed file
            is overwritten with its trimmed version.
    """
    frame_size = 2048
    hop_size = 1024
    rms_threshold = 0.05

    for sound in soundfilesList:
        frame_rms = esst.RMS()
        audio = esst.MonoLoader(filename=sound)()

        frames = esst.FrameGenerator(audio, frameSize=frame_size,
                                     hopSize=hop_size, startFromZero=True)
        levels = np.array([float(frame_rms(frame)) for frame in frames])

        loud_frames = np.where(levels >= rms_threshold)[0]
        if len(loud_frames) <= 1:
            # Not enough energetic frames to delimit a region: skip the file.
            continue

        first_frame, last_frame = loud_frames[0], loud_frames[-1]
        # Frame indices become sample offsets through the hop size; the
        # slice stops at the start of the last energetic frame.
        trimmed = audio[first_frame * hop_size:last_frame * hop_size]

        esst.MonoWriter(filename=sound, format="mp3")(trimmed)
        print(sound)
def rms(audio, params):
    """Compute the frame-wise RMS of the windowed magnitude spectrum.

    Args:
        audio: input signal, passed to ``ess.FrameGenerator``.
        params: sequence ``(hopSize, frameSize, windowType)``.

    Returns:
        Tuple ``(values, hopSize)``: ``values`` is a NumPy array with one
        RMS value per frame, ``hopSize`` is the hop size taken from
        ``params``.
    """
    hop, frame_len, window_type = params

    windowing = Windowing(type=window_type)
    spectrum = Spectrum()
    spectral_rms = ess.RMS()

    frames = ess.FrameGenerator(audio, frameSize=frame_len, hopSize=hop)
    values = [spectral_rms(spectrum(windowing(frame))) for frame in frames]
    return np.asarray(values), hop
def analyze_misc(filename, segment_duration=20):
    """Compute frame-wise descriptors on the time-centred segment of a file.

    Replay gain is estimated on the whole file. A segment of
    ``segment_duration`` seconds, centred in time, is then loaded with that
    gain applied and analysed in 2048-sample frames with a 1024-sample hop.
    Momentary EBU R128 loudness is computed separately on the same segment
    of the stereo signal.

    Args:
        filename: path of the audio file to analyse.
        segment_duration: length, in seconds, of the analysed segment.

    Returns:
        An ``essentia.Pool`` holding one value per frame under ``'rms'``,
        ``'rms_spectrum'``, ``'hfc'``, ``'spectral_centroid'`` and
        ``'zcr'``, plus the momentary loudness under ``'ebu_momentary'``.

    Raises:
        ValueError: if the audio is shorter than ``segment_duration``.
    """
    # All duration arithmetic below assumes the mono loader delivers audio
    # at this rate (its sampleRate parameter is left at the default) --
    # TODO confirm against the Essentia version in use.
    sample_rate = 44100
    frame_size = 2048
    hop_size = 1024

    # Compute replay gain and duration on the entire file, then load the
    # segment that is centered in time with replaygain applied
    audio = es.MonoLoader(filename=filename)()
    replaygain = es.ReplayGain()(audio)

    # Explicit float division keeps the result correct under Python 2 too.
    duration = len(audio) / float(sample_rate)
    segment_start = (duration - segment_duration) / 2
    segment_end = segment_start + segment_duration

    if segment_start < 0 or segment_end > duration:
        raise ValueError(
            'Segment duration is larger than the input audio duration')

    loader = es.EasyLoader(filename=filename, replayGain=replaygain,
                           startTime=segment_start, endTime=segment_end)

    windowing = es.Windowing(type='blackmanharris62')
    spectrum = es.Spectrum()
    centroid = es.Centroid()
    zcr = es.ZeroCrossingRate()
    rms = es.RMS()
    hfc = es.HFC()
    pool = essentia.Pool()

    audio = loader()
    for frame in es.FrameGenerator(audio, frameSize=frame_size,
                                   hopSize=hop_size):
        frame_spectrum = spectrum(windowing(frame))

        # Time-domain descriptors use the raw frame, spectral descriptors
        # use the windowed magnitude spectrum.
        pool.add('rms', rms(frame))
        pool.add('rms_spectrum', rms(frame_spectrum))
        pool.add('hfc', hfc(frame_spectrum))
        pool.add('spectral_centroid', centroid(frame_spectrum))
        pool.add('zcr', zcr(frame))

    audio_st, sr, _, _, _, _ = es.AudioLoader(filename=filename)()

    # Ugly hack because we don't have a StereoResample: split the channels,
    # resample each one, then put them back together.
    left, right = es.StereoDemuxer()(audio_st)
    resampler = es.Resample(inputSampleRate=sr, outputSampleRate=sample_rate)
    left = resampler(left)
    right = resampler(right)
    audio_st = es.StereoMuxer()(left, right)
    audio_st = es.StereoTrimmer(startTime=segment_start,
                                endTime=segment_end)(audio_st)

    # The loudness hop is given in seconds and matches the 1024-sample hop
    # used for the frame-wise descriptors above.
    ebu_momentary, _, _, _ = es.LoudnessEBUR128(
        hopSize=hop_size / float(sample_rate), startAtZero=True)(audio_st)
    pool.set('ebu_momentary', ebu_momentary)

    return pool
def postProcessTicks(audioFilename, ticks, ticksAmp, pool):
    '''Refine tick positions using the largest frame-to-frame rise in RMS.

    For every tick a short excerpt of the audio is cut out, starting half a
    beat window before the tick (clamped at 0). The excerpt is split into
    64-sample frames with a 32-sample hop, and the frame showing the
    largest increase in RMS over its predecessor gives the offset added to
    the tick.

    Args:
        audioFilename: path of the audio file the ticks belong to.
        ticks: mutable sequence of tick times in seconds; updated in place.
        ticksAmp: unused by this function; kept for interface compatibility.
        pool: mapping providing 'samplerate' and 'downmix', which are
            forwarded to the audio loader.

    Returns:
        The same ``ticks`` object, holding the refined positions.
    '''
    beatWindowDuration = 0.01  # seconds
    beatDuration = 0.005  # seconds
    rmsFrameSize = 64
    # Floor division keeps the hop size an integer under Python 3 as well;
    # "/" would produce 32.0 there.
    rmsHopSize = rmsFrameSize // 2

    sampleRate = pool['samplerate']
    audio = std.MonoLoader(filename=audioFilename,
                           sampleRate=sampleRate,
                           downmix=pool['downmix'])()
    # One RMS instance is enough for all ticks.
    RMS = std.RMS()

    for i, tick in enumerate(ticks):
        startTime = tick - beatWindowDuration / 2.0
        if startTime < 0:
            startTime = 0
        endTime = startTime + beatWindowDuration + beatDuration + 0.0001

        excerpt = std.Trimmer(sampleRate=sampleRate,
                              startTime=startTime,
                              endTime=endTime)(audio)
        frames = std.FrameGenerator(excerpt,
                                    frameSize=rmsFrameSize,
                                    hopSize=rmsHopSize)

        # Track the frame index with the steepest positive RMS change.
        maxDeltaRms = 0
        prevRms = 0
        tickPos = 0
        for pos, frame in enumerate(frames):
            rms = RMS(frame)
            diff = rms - prevRms
            if diff > maxDeltaRms:
                tickPos = pos
                maxDeltaRms = diff
            prevRms = rms

        # NOTE(review): the offset is added to the original tick although
        # the excerpt starts at startTime (up to half a beat window
        # earlier) -- confirm this is intended.
        ticks[i] = tick + tickPos * float(rmsHopSize) / sampleRate

    return ticks
def highEnergyFrames(audioIn, threshold=0.05, strip=False,
                     frameSize=2048, hopSize=1024):
    """Return the parts of a signal whose frame RMS exceeds a threshold.

    The signal is cut into frames of ``frameSize`` samples every
    ``hopSize`` samples, starting at sample 0. Frame ``i`` is mapped to the
    sample span ``[i * hopSize, (i + 1) * hopSize)``.

    Args:
        audioIn: 1-D NumPy array of audio samples.
        threshold: RMS value a frame must exceed to count as high-energy.
        strip: if True, only low-energy audio at the beginning and at the
            end is discarded and everything in between is kept. If False,
            only the samples of high-energy frames are returned.
        frameSize: analysis frame length in samples.
        hopSize: hop between consecutive frames in samples.

    Returns:
        NumPy array with the selected samples. With ``strip=True`` and
        fewer than two high-energy frames the input is returned unchanged.
        With ``strip=False`` and no high-energy frame the result is empty.
    """
    RMS = ess.RMS()
    frames = ess.FrameGenerator(audioIn, frameSize=frameSize,
                                hopSize=hopSize, startFromZero=True)
    rmsValues = np.array([float(RMS(frame)) for frame in frames])
    highRMSFrames = np.where(rmsValues > threshold)[0]
    frameStarts = highRMSFrames * hopSize

    if strip:
        # Two distinct frames are needed to delimit a region. The previous
        # test, len(highRMSFrames > 1), measured a boolean array instead,
        # so a single loud frame produced an empty slice.
        if len(highRMSFrames) > 1:
            return audioIn[frameStarts[0]:frameStarts[-1]]
        return audioIn

    # Keep every sample of each high-energy hop span. Indexing with the
    # frame start offsets alone would pick a single sample per frame.
    keep = np.zeros(len(audioIn), dtype=bool)
    for start in frameStarts:
        keep[start:start + hopSize] = True
    return audioIn[keep]
## Tabla analysis and synthesis module for the HAMR 2015 ISMIR hack import os import essentia as es import essentia.standard as ess import numpy as np import pickle import glob import utilFunctions as UF import scipy.spatial.distance as DS import parameters as params import csv rms=ess.RMS() window = ess.Windowing(type = "hamming") spec = ess.Spectrum(size=params.Nfft) zz = np.zeros((params.zeropadLen,), dtype = 'float32') genmfcc = ess.MFCC(highFrequencyBound = 22000.0, inputSize = params.Nfft/2+1, sampleRate = params.Fs) hps = ess.HighPass(cutoffFrequency = 240.0) onsets = ess.Onsets() strokeLabels = ['dha', 'dhen', 'dhi', 'dun', 'ge', 'kat', 'ke', 'na', 'ne', 're', 'tak', 'te', 'tit', 'tun'] taals = {"teen": {"nmatra": 16, "accents": np.array([4, 1, 1, 1, 3, 1, 1, 1, 2, 1, 1, 1, 3, 1, 1, 1])}, "ek": {"nmatra": 12, "accents": np.array([4, 1, 1, 2, 1, 1, 3, 1, 1, 2, 1, 1])}, "jhap": {"nmatra": 10, "accents": np.array([4, 1, 2, 1, 1, 3, 1, 2, 1, 1])}, "rupak": {"nmatra": 7, "accents": np.array([2, 1, 1, 3, 1, 3, 1])} } rolls = [{"bol": ['dha/dha_02', 'te/te_05', 're/re_04', 'dha/dha_02'], "dur": np.array([1.0, 1.0, 1, 1]), "amp": np.array([1.0, 1.0, 1.0, 1.0])},
def _midi_pitch_histogram(pitches):
    """Return a normalised 128-bin histogram of MIDI notes for *pitches* (Hz).

    Bin 0 holds the share of pitches that could not be mapped to a note.
    If no pitch is usable the histogram is all zeros except bin 0, which
    is 1.
    """
    from math import log
    from numpy import bincount

    ln2 = 0.69314718055995
    midinotes = []
    unknown = 0
    for freq in pitches:
        if freq > 0. and freq <= 12600:
            # convert from Hz to a MIDI note; bincount only accepts
            # non-negative integers, so truncate and reject negatives
            note = int(12 * (log(freq / 6.875) / ln2) - 3.)
            if note < 0:
                unknown += 1
            else:
                midinotes.append(note)
        else:
            unknown += 1

    if not midinotes:
        histogram = [0.] * 128
        histogram[0] = 1.
        return histogram

    # 12600 Hz maps to note 127, so minlength=128 yields exactly 128 bins
    counts = bincount(midinotes, minlength=128)
    # set 0 midi pitch to be the number of pruned values
    counts[0] = unknown
    # normalise (the total is computed once, not once per bin)
    total = float(counts.sum())
    return (counts / total).tolist()


def compute(audio, pool, options):
    """Compute frame-wise low-level descriptors and add them to *pool*.

    The audio is cut into frames; for each frame temporal, spectral,
    Bark-band, MFCC, dissonance, spectral-contrast and pitch descriptors
    are added to ``pool`` under ``namespace + '.' + <descriptor>``. After
    the frame loop a 128-bin MIDI pitch histogram and its spread are added.

    The names ``namespace``, ``INFO``, ``Progress``, ``is_silent_threshold``,
    ``descriptorNames``, ``es`` and ``ess`` are not defined in this function
    and are expected to exist at module level.

    Args:
        audio: mono signal passed to ``ess.FrameGenerator``.
        pool: pool object exposing ``add`` and ``descriptorNames``;
            it is filled in place.
        options: mapping with the keys 'sampleRate', 'frameSize', 'hopSize',
            'windowType' and 'skipSilence'.

    Raises:
        ess.EssentiaError: if no 'zerocrossingrate' descriptor was added,
            i.e. every frame was skipped as silent.
    """
    # analysis parameters
    sampleRate = options['sampleRate']
    frameSize = options['frameSize']
    hopSize = options['hopSize']
    windowType = options['windowType']

    # temporal descriptors
    lpc = ess.LPC(order=10, type='warped', sampleRate=sampleRate)
    zerocrossingrate = ess.ZeroCrossingRate()

    # frame algorithms
    frames = ess.FrameGenerator(audio=audio, frameSize=frameSize, hopSize=hopSize)
    window = ess.Windowing(size=frameSize, zeroPadding=0, type=windowType)
    spectrum = ess.Spectrum(size=frameSize)

    # spectral algorithms
    barkbands = ess.BarkBands(sampleRate=sampleRate)
    centralmoments = ess.CentralMoments()
    crest = ess.Crest()
    centroid = ess.Centroid()
    decrease = ess.Decrease()
    spectral_contrast = ess.SpectralContrast(frameSize=frameSize,
                                             sampleRate=sampleRate,
                                             numberBands=6,
                                             lowFrequencyBound=20,
                                             highFrequencyBound=11000,
                                             neighbourRatio=0.4,
                                             staticDistribution=0.15)
    distributionshape = ess.DistributionShape()
    energy = ess.Energy()
    # energyband_bass, energyband_middle and energyband_high parameters come from "standard" hi-fi equalizers
    energyband_bass = ess.EnergyBand(startCutoffFrequency=20.0, stopCutoffFrequency=150.0, sampleRate=sampleRate)
    energyband_middle_low = ess.EnergyBand(startCutoffFrequency=150.0, stopCutoffFrequency=800.0, sampleRate=sampleRate)
    energyband_middle_high = ess.EnergyBand(startCutoffFrequency=800.0, stopCutoffFrequency=4000.0, sampleRate=sampleRate)
    energyband_high = ess.EnergyBand(startCutoffFrequency=4000.0, stopCutoffFrequency=20000.0, sampleRate=sampleRate)
    flatnessdb = ess.FlatnessDB()
    flux = ess.Flux()
    harmonic_peaks = ess.HarmonicPeaks()
    hfc = ess.HFC()
    mfcc = ess.MFCC()
    rolloff = ess.RollOff()
    rms = ess.RMS()
    strongpeak = ess.StrongPeak()

    # pitch algorithms
    pitch_detection = ess.PitchYinFFT(frameSize=frameSize, sampleRate=sampleRate)
    pitch_salience = ess.PitchSalience()

    # dissonance
    spectral_peaks = ess.SpectralPeaks(sampleRate=sampleRate, orderBy='frequency')
    dissonance = ess.Dissonance()

    # spectral complexity
    # magnitudeThreshold = 0.005 is hardcoded for a "blackmanharris62" frame
    spectral_complexity = ess.SpectralComplexity(magnitudeThreshold=0.005)

    INFO('Computing Low-Level descriptors...')

    # used for a nice progress display
    total_frames = frames.num_frames()
    n_frames = 0
    start_of_frame = -frameSize * 0.5

    pitches, pitch_confidences = [], []

    progress = Progress(total=total_frames)

    #scPool = es.Pool() # pool for spectral contrast

    for frame in frames:

        frameScope = [start_of_frame / sampleRate, (start_of_frame + frameSize) / sampleRate]
        # pool.setCurrentScope(frameScope)

        # silence rate
        # pool.add(namespace + '.' + 'silence_rate_60dB', es.isSilent(frame))
        pool.add(namespace + '.' + 'silence_rate_60dB', is_silent_threshold(frame, -60))
        pool.add(namespace + '.' + 'silence_rate_30dB', is_silent_threshold(frame, -30))
        pool.add(namespace + '.' + 'silence_rate_20dB', is_silent_threshold(frame, -20))

        # silent frames only contribute to the silence rates above
        if options['skipSilence'] and es.isSilent(frame):
            total_frames -= 1
            start_of_frame += hopSize
            continue

        # temporal descriptors
        pool.add(namespace + '.' + 'zerocrossingrate', zerocrossingrate(frame))
        (frame_lpc, frame_lpc_reflection) = lpc(frame)
        pool.add(namespace + '.' + 'temporal_lpc', frame_lpc)

        frame_windowed = window(frame)
        frame_spectrum = spectrum(frame_windowed)

        # spectrum-based descriptors
        power_spectrum = frame_spectrum ** 2
        pool.add(namespace + '.' + 'spectral_centroid', centroid(power_spectrum))
        pool.add(namespace + '.' + 'spectral_decrease', decrease(power_spectrum))
        pool.add(namespace + '.' + 'spectral_energy', energy(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_low', energyband_bass(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_middle_low', energyband_middle_low(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_middle_high', energyband_middle_high(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_high', energyband_high(frame_spectrum))
        pool.add(namespace + '.' + 'hfc', hfc(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_rms', rms(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_flux', flux(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_rolloff', rolloff(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_strongpeak', strongpeak(frame_spectrum))

        # central moments descriptors
        frame_centralmoments = centralmoments(power_spectrum)
        (frame_spread, frame_skewness, frame_kurtosis) = distributionshape(frame_centralmoments)
        pool.add(namespace + '.' + 'spectral_kurtosis', frame_kurtosis)
        pool.add(namespace + '.' + 'spectral_spread', frame_spread)
        pool.add(namespace + '.' + 'spectral_skewness', frame_skewness)

        # dissonance
        (frame_frequencies, frame_magnitudes) = spectral_peaks(frame_spectrum)
        frame_dissonance = dissonance(frame_frequencies, frame_magnitudes)
        pool.add(namespace + '.' + 'dissonance', frame_dissonance)

        # mfcc
        (frame_melbands, frame_mfcc) = mfcc(frame_spectrum)
        pool.add(namespace + '.' + 'mfcc', frame_mfcc)

        # spectral contrast
        (sc_coeffs, sc_valleys) = spectral_contrast(frame_spectrum)
        #scPool.add(namespace + '.' + 'sccoeffs', sc_coeffs)
        #scPool.add(namespace + '.' + 'scvalleys', sc_valleys)
        pool.add(namespace + '.' + 'spectral_contrast', sc_coeffs)

        # barkbands-based descriptors
        frame_barkbands = barkbands(frame_spectrum)
        pool.add(namespace + '.' + 'barkbands', frame_barkbands)
        pool.add(namespace + '.' + 'spectral_crest', crest(frame_barkbands))
        pool.add(namespace + '.' + 'spectral_flatness_db', flatnessdb(frame_barkbands))
        barkbands_centralmoments = ess.CentralMoments(range=len(frame_barkbands) - 1)
        (barkbands_spread, barkbands_skewness, barkbands_kurtosis) = distributionshape(
            barkbands_centralmoments(frame_barkbands))
        pool.add(namespace + '.' + 'barkbands_spread', barkbands_spread)
        pool.add(namespace + '.' + 'barkbands_skewness', barkbands_skewness)
        pool.add(namespace + '.' + 'barkbands_kurtosis', barkbands_kurtosis)

        # pitch descriptors
        frame_pitch, frame_pitch_confidence = pitch_detection(frame_spectrum)
        if frame_pitch > 0 and frame_pitch <= 20000.:
            pool.add(namespace + '.' + 'pitch', frame_pitch)
        # every frame's pitch is kept; the histogram counts unusable values
        # (non-positive or above 12600 Hz) as "unknown"
        pitches.append(frame_pitch)
        pitch_confidences.append(frame_pitch_confidence)
        pool.add(namespace + '.' + 'pitch_instantaneous_confidence', frame_pitch_confidence)

        frame_pitch_salience = pitch_salience(frame_spectrum[:-1])
        pool.add(namespace + '.' + 'pitch_salience', frame_pitch_salience)

        # spectral complexity
        pool.add(namespace + '.' + 'spectral_complexity', spectral_complexity(frame_spectrum))

        # display of progress report
        progress.update(n_frames)

        n_frames += 1
        start_of_frame += hopSize

    # if no 'temporal_zerocrossingrate' it means that this is a silent file
    if 'zerocrossingrate' not in descriptorNames(pool.descriptorNames(), namespace):
        raise ess.EssentiaError('This is a silent file!')

    #spectralContrastPCA(scPool, pool)

    # build pitch value histogram
    midipitchhist = _midi_pitch_histogram(pitches)

    # pitchhist = ess.array(zip(range(len(midipitchhist)), midipitchhist))
    pool.add(namespace + '.' + 'spectral_pitch_histogram', midipitchhist)  # , pool.GlobalScope)

    # only the spread of the histogram is stored; skewness and kurtosis are
    # computed by DistributionShape but not added to the pool
    pitch_centralmoments = ess.CentralMoments(range=len(midipitchhist) - 1)
    (pitch_histogram_spread, pitch_histogram_skewness, pitch_histogram_kurtosis) = distributionshape(
        pitch_centralmoments(midipitchhist))
    pool.add(namespace + '.' + 'spectral_pitch_histogram_spread', pitch_histogram_spread)  # , pool.GlobalScope)

    progress.finish()
def rms(self, audio):
    """Return the RMS level of *audio* in decibels (20 * log10 of the RMS).

    Args:
        audio: signal passed to ``es.RMS``.

    Returns:
        The level in dB as a float. An RMS of zero (digital silence) gives
        ``-inf`` instead of the ``ValueError`` that ``math.log10`` raises
        for a non-positive argument.
    """
    level = es.RMS()(audio)
    if level <= 0:
        return float('-inf')
    return 20 * math.log10(level)
import matplotlib.pyplot as plt

# Script fragment: frame-wise pitch and loudness extraction for one recording.
# The recording is identified by `mbid`; its voice track is read from
# "<mbid>/<mbid>-voice.mp3".
mbid = 'ead85d20-ce7d-4ed0-a00d-0ae199b94d12'
hopSize = 128
loader = es.MonoLoader(filename=os.path.join(mbid, mbid + '-voice.mp3'))
track = loader()
# track[track<0.000001] = 0
# Track length divided by 44100, i.e. the duration in seconds if the track
# holds 44100 samples per second.
print(len(track) / 44100.)
# pY = es.PitchYin(minFrequency=55, maxFrequency=900, tolerance=0.06)
pY = es.PitchYin(minFrequency=55, maxFrequency=600, tolerance=0.03)
rms = es.RMS()
pitch = []
loudness = []
print('Computing pitch and loudness')
for frame in es.FrameGenerator(track, frameSize=2048, hopSize=hopSize, startFromZero=True):
    f = pY(frame)
    # f[0] is stored as the pitch; f[1] is compared with 0.8 (presumably the
    # pitch confidence -- confirm against the PitchYin documentation).
    if f[1] >= 0.8:
        pitch.append(f[0])
        loudness.append(rms(frame))
    else:
        # Frames below the 0.8 limit get a pitch of 0.
        # NOTE(review): in the visible code nothing is appended to `loudness`
        # in this branch, so `pitch` and `loudness` can end up with different
        # lengths -- confirm whether the fragment continues past this point.
        pitch.append(0)
def compute_features(complete_path):
    """Extract mean frame-wise features and metadata for every .wav in a folder.

    Each ``.wav`` file directly inside ``complete_path`` is loaded as mono
    at 44100 Hz and analysed in 1024-sample frames with a 512-sample hop.
    The frame values are averaged per file with ``PoolAggregator`` and
    converted to a row with ``pool_to_array`` (defined elsewhere).

    Args:
        complete_path: directory containing the audio files.

    Returns:
        Tuple ``(features_df, meta_df)`` of pandas DataFrames with one row
        per file. ``features_df`` has the columns 'centroid', 'crest',
        'roll off', 'strong peak', 'rms', 'energy', 'flux' and 'zcr';
        ``meta_df`` has 'duration' and 'filename'. Both are empty (but keep
        their columns) when the directory holds no ``.wav`` file.
    """
    feature_columns = ['centroid', 'crest', 'roll off', 'strong peak',
                       'rms', 'energy', 'flux', 'zcr']
    meta_columns = ['duration', 'filename', 'metadata.tags.comment']

    result = []
    meta_result = []

    # for loop over files
    for name in os.listdir(complete_path):
        if not name.endswith(".wav"):
            continue

        # os.path.join works whether or not complete_path ends with a
        # path separator
        path = os.path.join(complete_path, name)

        # load our audio into an array
        audio = es.MonoLoader(filename=path, sampleRate=44100)()

        # create the pool and the necessary algorithms; they are rebuilt
        # for every file so that no algorithm state (e.g. the previous
        # spectrum that Flux compares against) leaks from one file to the
        # next
        pool = essentia.Pool()
        window = es.Windowing()
        energy = es.Energy()
        spectrum = es.Spectrum()
        centroid = es.Centroid(range=22050)
        rolloff = es.RollOff()
        crest = es.Crest()
        speak = es.StrongPeak()
        rmse = es.RMS()
        flux = es.Flux()
        zerocrossingrate = es.ZeroCrossingRate()

        # the last five reader outputs are the tag pool followed by the
        # audio properties; only the pool and the duration are used
        meta = es.MetadataReader(filename=path, failOnError=True)()
        pool_meta, duration, _bitrate, _samplerate, _channels = meta[7:]

        # compute the descriptors for all frames in our audio and add them
        # to the pool
        for frame in es.FrameGenerator(audio, frameSize=1024, hopSize=512):
            frame_spectrum = spectrum(window(frame))

            pool.add('spectral.centroid', centroid(frame_spectrum))
            pool.add('spectral crest', crest(frame_spectrum))
            pool.add('spectral rolloff', rolloff(frame_spectrum))
            pool.add('strong peak', speak(frame_spectrum))
            pool.add('RMS', rmse(frame_spectrum))
            pool.add('spectral_energy', energy(frame_spectrum))
            pool.add('spectral flux', flux(frame_spectrum))
            # zero crossings are a time-domain measure: a magnitude
            # spectrum is non-negative and never crosses zero
            pool.add('zero crossing rate', zerocrossingrate(frame))

        # aggregate the results (find mean if needed)
        aggrpool = es.PoolAggregator(defaultStats=['mean'])(pool)

        pool_meta.set("duration", duration)
        pool_meta.set("filename", os.path.relpath(name))

        # write pools to lists
        result.append(pool_to_array(aggrpool))
        meta_result.append(pool_to_array(pool_meta))

    if not result:
        # nothing to analyse: assigning the column names to an empty frame
        # would raise a length-mismatch error
        return (pd.DataFrame(columns=feature_columns),
                pd.DataFrame(columns=meta_columns[:2]))

    # NOTE(review): the column labels rely on the order in which
    # pool_to_array emits the descriptors, and on the tag pool holding
    # exactly one tag ('metadata.tags.comment') -- confirm against
    # pool_to_array and the input files.
    features_df = pd.DataFrame.from_records(result)
    features_df.columns = feature_columns

    meta_df = pd.DataFrame.from_records(meta_result)
    meta_df.columns = meta_columns
    del meta_df['metadata.tags.comment']

    return features_df, meta_df