def loadAudioFile(inputFilename, pool, options):
    """Load an audio file as a mono signal, trim it to the configured
    [startTime, endTime] range and store basic metadata into the pool.

    Reads 'sampleRate', 'startTime' and 'endTime' from ``options``.
    Adds 'metadata.duration', 'metadata.duration_processed',
    'metadata.filename' and 'metadata.sample_rate' to ``pool``.

    Returns the (possibly trimmed) audio samples.

    Raises essentia.EssentiaError if startTime >= endTime or if startTime
    lies beyond the end of the file.
    """
    sampleRate = options['sampleRate']
    audio = essentia.MonoLoader(filename=inputFilename,
                                sampleRate=sampleRate,
                                downmix='mix')()

    # compute the temporal duration
    duration = essentia.Duration(sampleRate=sampleRate)(audio)

    # trim audio if asked
    startTime = options['startTime']
    endTime = options['endTime']

    # endTime may be unset (None), meaning "until the end of the file".
    # BUGFIX: the original compared startTime >= endTime *before* handling
    # None, which is always true in Python 2 (numbers sort after None) and a
    # TypeError in Python 3 — so the None path could never be reached.
    if endTime is None:
        endTime = duration

    if startTime >= endTime:
        raise essentia.EssentiaError(
            'In the configuration file, startTime should be lower or equal than endTime'
        )

    startSample = int(sampleRate * startTime)
    endSample = int(sampleRate * endTime)

    if startTime > duration:
        raise essentia.EssentiaError(
            'The file is too short to be trimmed from second %d to second %d'
            % (startTime, endTime))

    if endTime > duration:
        # requested end lies past the file: only trim the beginning
        if startTime != 0.0:
            INFO('The file is being trimmed from second %d to second %d'
                 % (startTime, duration))
            audio = audio[startSample:]
    else:
        if startTime != 0.0 or endTime != duration:
            INFO('The file is being trimmed from second %d to second %d'
                 % (startTime, endTime))
            audio = audio[startSample:endSample]

    pool.add('metadata.duration', duration)
    pool.add('metadata.duration_processed', len(audio) / sampleRate)

    # add sample rate and filename to pool
    pool.add('metadata.filename', inputFilename)
    pool.add('metadata.sample_rate', sampleRate)

    return audio
def preProcess(audio, pool, options, namespace=''):
    """Run the preprocessing chain configured in options['preprocessing']
    (DC removal, replay-gain normalization, equal-loudness filtering) over
    the audio signal and return the processed samples.

    When normalizing, the measured replay gain is stored in the pool under
    '<namespace>.replay_gain'.

    Raises essentia.EssentiaError on an unrecognized preprocessing step.
    """
    requested = toList(options['preprocessing'])

    for name in requested:
        if name == 'dckiller':
            # strip the DC component from the signal
            audio = essentia.DCRemoval()(audio)
        elif name == 'normalize':
            # measure replay gain first, and keep it as a descriptor
            gain = essentia.ReplayGain(sampleRate=options['sampleRate'])(audio)
            pool.add(namespace + '.' + 'replay_gain', gain)
            # rescale audio if not silent (also apply a 6dB pre-amplification)
            if gain < 68.0:
                audio = essentia.Scale(factor=10 ** (gain / 20))(audio)
        elif name == 'eqloud':
            # equal-loudness filter applied over the whole signal
            audio = essentia.EqualLoudness(sampleRate=options['sampleRate'])(audio)
        else:
            raise essentia.EssentiaError('Unknown preprocessing step: \'%s\'' % name)

    return audio
def cleanStats(pool, options):
    """Build the map of statistics wanted per descriptor and drop from the
    pool every descriptor the profile did not ask for.

    For each extractor listed under options['specific'] that has an 'output'
    section and actually ran (is in options['generatedBy']), keeps only the
    requested descriptors and records which statistics to aggregate for each.
    Metadata descriptors and the spectral-contrast matrices are always kept
    verbatim ('copy').

    Returns the wantedStats dict, keyed by '<namespace>.<descriptor>'.

    Raises essentia.EssentiaError when the profile requests a descriptor the
    extractor never generated.
    """
    import importlib

    wantedStats = {}
    supportedStats = ['mean', 'min', 'max', 'var', 'dmean', 'dvar', 'dmean2',
                      'dvar2', 'value', 'copy', 'single_gaussian', 'cov', 'icov']

    for extractor in options['specific']:
        if 'output' in options['specific'][extractor] \
                and extractor in options['generatedBy']:
            outputList = options['specific'][extractor]['output']
            # BUGFIX: was exec('import extractor.' + extractor + ' as ...');
            # importlib performs the same dynamic import without building and
            # executing a code string
            extractor_module = importlib.import_module('extractor.' + extractor)
            namespace = extractor_module.namespace
            generated = options['generatedBy'][extractor]

            # check if we're not asking for some inexistent descriptor
            for descriptor in outputList:
                if descriptor not in generated:
                    raise essentia.EssentiaError(
                        'Could not find descriptor \'' + descriptor +
                        '\'. Available are: \'' + '\', \''.join(generated) + '\'')

            for descriptor in generated:
                if descriptor not in outputList:
                    # not requested by the profile: drop it from the pool
                    pool.remove(namespace + '.' + descriptor)
                else:
                    # NOTE: the original wrapped this in try/except KeyError
                    # whose handler re-evaluated the exact same expression, so
                    # it could never recover; a plain assignment is equivalent
                    wantedStats[namespace + '.' + descriptor] = \
                        options['specific'][extractor]['output'][descriptor]

    for (k, v) in wantedStats.items():
        # normalize a single stat name into a one-element list
        if not isinstance(v, list):
            wantedStats[k] = [v]
        stats = wantedStats[k]
        unwantedStats = []
        # BUGFIX: iterate over a snapshot — the 'single_gaussian' branch
        # appends to wantedStats[k] (the very list being iterated)
        for stat in list(stats):
            if stat not in supportedStats:
                unwantedStats.append(stat)
                # was a Python-2-only print statement; use INFO like the
                # rest of the module
                INFO('Ignoring %s for %s. It is not supported.' % (stat, k))
            if stat == 'single_gaussian':
                # expand the shortcut into the statistics it stands for
                unwantedStats.append(stat)
                wantedStats[k] += ['mean', 'cov', 'icov']
        for stat in unwantedStats:
            wantedStats[k].remove(stat)

    # metadata and spectral contrast matrices are always copied as-is
    metaDescs = descriptorNames(pool.descriptorNames(), 'metadata')
    wantedStats['lowlevel.spectral_contrast.mean'] = ['copy']
    wantedStats['lowlevel.spectral_contrast.var'] = ['copy']
    for desc in metaDescs:
        wantedStats['metadata' + '.' + desc] = ['copy']

    return wantedStats
def compute(profile, inputFilename, outputFilename, userOptions=None):
    """Run the whole extraction pipeline on one audio file.

    Loads the YAML profile (either a predefined '<profile>_config.yaml'
    shipped in the 'profiles' directory next to this module, or a literal
    path), merges ``userOptions`` over it, loads and preprocesses the audio,
    runs all configured extractors (plus optional segmentation and plotting),
    aggregates the statistics and writes the result to ``outputFilename``.

    Raises essentia.EssentiaError if options['outputFormat'] is neither
    'xml' nor 'yaml'.
    """
    # BUGFIX: the default used to be a mutable dict ({}), shared across calls
    if userOptions is None:
        userOptions = {}

    # load profile
    profileDirectory = __file__.split(os.path.sep)[:-1]
    profileDirectory.append('profiles')
    profileDirectory.append('%s_config.yaml' % profile)
    try:
        # try to load the predefined profile, if it exists
        # (with-statement closes the file; the bare except is narrowed to
        # the file-access errors the fallback is meant to handle)
        with open(os.path.sep.join(profileDirectory), 'r') as profileFile:
            config = profileFile.read()
    except (IOError, OSError):
        # otherwise, just load the file that was specified
        with open(profile, 'r') as profileFile:
            config = profileFile.read()

    # SECURITY NOTE(review): yaml.load without a Loader can construct
    # arbitrary Python objects; profiles are assumed trusted — consider
    # yaml.safe_load otherwise.
    options = yaml.load(config)
    mergeRecursiveDict(options, userOptions)

    # which format for the output? (renamed from 'format', which shadowed
    # the builtin; the unused xmlOutput flag was dropped)
    outputFormat = options['outputFormat']
    if outputFormat not in ['xml', 'yaml']:
        raise essentia.EssentiaError('output format should be either \'xml\' or \'yaml\'')

    # we need this for dependencies checking
    options['computed'] = []
    options['generatedBy'] = {}

    # get list of extractors to compute
    extractors = options['extractors']

    # pool that accumulates every frame-level descriptor
    pool = essentia.Pool()

    # load audio file into memory
    audio = loadAudioFile(inputFilename, pool, options)

    # preprocessing is a special step because it modifies the audio, hence it
    # must be executed before all the other extractors
    audio = preProcess(audio, pool, options, 'metadata')
    options['globalPreprocessing'] = options['preprocessing']
    del options['preprocessing']

    # process all extractors
    computeAllExtractors(extractors, audio, pool, options)

    # process segmentation if asked
    segments = None
    if options['segmentation']['doSegmentation']:
        segments = segmentation.compute(inputFilename, audio, pool, options)

    # remove unwanted descriptors
    wantedStats = cleanStats(pool, options)

    # aggregate statistics over the frame-level descriptors
    megalopool = essentia.PoolAggregator(exceptions=wantedStats)(pool)

    # special case for spectral contrast, which is only 1 matrix, therefore no
    # stats are computed:
    spectral_contrast_stats(megalopool, 'lowlevel.spectral_contrast', wantedStats)

    # plotting descriptors evolution ('plots' is an optional option)
    try:
        if options['plots']:
            import plotting
            plotting.compute(inputFilename, audio, pool, options)
    except KeyError:
        pass

    # compute extractors on segments
    if options['segmentation']['doSegmentation']:
        if options['segmentation']['computeSegments']:
            if len(segments) == 0:
                megalopool.add('void', [0])
            else:
                computeSegments(audio, segments, extractors, megalopool, options)

    # save to output file
    essentia.YamlOutput(filename=outputFilename)(megalopool)
def compute(audio, pool, options):
    """Compute the low-level frame descriptors of ``audio`` into ``pool``.

    Reads 'sampleRate', 'frameSize', 'hopSize', 'windowType' and
    'skipSilence' from ``options``.  Descriptors are added under the
    module-level ``namespace`` prefix (defined elsewhere in this module —
    presumably 'lowlevel'; TODO confirm).

    Raises essentia.EssentiaError when every frame was silent (no
    zerocrossingrate descriptor was ever produced).
    """
    # analysis parameters
    sampleRate = options['sampleRate']
    frameSize = options['frameSize']
    hopSize = options['hopSize']
    windowType = options['windowType']

    # temporal descriptors
    lpc = essentia.LPC(order=10, type='warped', sampleRate=sampleRate)
    zerocrossingrate = essentia.ZeroCrossingRate()

    # frame algorithms
    frames = essentia.FrameGenerator(audio=audio, frameSize=frameSize, hopSize=hopSize)
    window = essentia.Windowing(size=frameSize, zeroPadding=0, type=windowType)
    spectrum = essentia.Spectrum(size=frameSize)

    # spectral algorithms
    barkbands = essentia.BarkBands(sampleRate=sampleRate)
    centralmoments = essentia.SpectralCentralMoments()
    crest = essentia.Crest()
    centroid = essentia.SpectralCentroid()
    decrease = essentia.SpectralDecrease()
    spectral_contrast = essentia.SpectralContrast(frameSize=frameSize,
                                                  sampleRate=sampleRate,
                                                  numberBands=6,
                                                  lowFrequencyBound=20,
                                                  highFrequencyBound=11000,
                                                  neighbourRatio=0.4,
                                                  staticDistribution=0.15)
    distributionshape = essentia.DistributionShape()
    energy = essentia.Energy()
    # energyband_bass, energyband_middle and energyband_high parameters come
    # from "standard" hi-fi equalizers
    energyband_bass = essentia.EnergyBand(startCutoffFrequency=20.0,
                                          stopCutoffFrequency=150.0,
                                          sampleRate=sampleRate)
    energyband_middle_low = essentia.EnergyBand(startCutoffFrequency=150.0,
                                                stopCutoffFrequency=800.0,
                                                sampleRate=sampleRate)
    energyband_middle_high = essentia.EnergyBand(startCutoffFrequency=800.0,
                                                 stopCutoffFrequency=4000.0,
                                                 sampleRate=sampleRate)
    energyband_high = essentia.EnergyBand(startCutoffFrequency=4000.0,
                                          stopCutoffFrequency=20000.0,
                                          sampleRate=sampleRate)
    flatnessdb = essentia.FlatnessDB()
    flux = essentia.Flux()
    # NOTE(review): harmonic_peaks is instantiated but never used below
    harmonic_peaks = essentia.HarmonicPeaks()
    hfc = essentia.HFC()
    mfcc = essentia.MFCC()
    rolloff = essentia.RollOff()
    rms = essentia.RMS()
    strongpeak = essentia.StrongPeak()

    # pitch algorithms
    pitch_detection = essentia.PitchDetection(frameSize=frameSize, sampleRate=sampleRate)
    pitch_salience = essentia.PitchSalience()

    # dissonance
    spectral_peaks = essentia.SpectralPeaks(sampleRate=sampleRate, orderBy='frequency')
    dissonance = essentia.Dissonance()

    # spectral complexity
    # magnitudeThreshold = 0.005 is hardcoded for a "blackmanharris62" frame
    spectral_complexity = essentia.SpectralComplexity(magnitudeThreshold=0.005)

    INFO('Computing Low-Level descriptors...')

    # used for a nice progress display
    total_frames = frames.num_frames()
    n_frames = 0
    # first frame is centered on t=0, so it starts half a frame earlier
    start_of_frame = -frameSize * 0.5

    pitches, pitch_confidences = [], []

    progress = Progress(total=total_frames)

    # separate pool for spectral contrast (PCA'd into the main pool at the end)
    scPool = essentia.Pool()

    for frame in frames:
        # NOTE(review): frameScope is only consumed by the commented-out
        # setCurrentScope call below and is otherwise unused
        frameScope = [start_of_frame / sampleRate,
                      (start_of_frame + frameSize) / sampleRate]
        #pool.setCurrentScope(frameScope)

        # silence rate at three thresholds (default 60dB + explicit 30/20dB)
        pool.add(namespace + '.' + 'silence_rate_60dB', essentia.isSilent(frame))
        pool.add(namespace + '.' + 'silence_rate_30dB', is_silent_threshold(frame, -30))
        pool.add(namespace + '.' + 'silence_rate_20dB', is_silent_threshold(frame, -20))

        # optionally skip silent frames entirely (they still counted above)
        if options['skipSilence'] and essentia.isSilent(frame):
            total_frames -= 1
            start_of_frame += hopSize
            continue

        # temporal descriptors
        pool.add(namespace + '.' + 'zerocrossingrate', zerocrossingrate(frame))
        (frame_lpc, frame_lpc_reflection) = lpc(frame)
        pool.add(namespace + '.' + 'temporal_lpc', frame_lpc)

        frame_windowed = window(frame)
        frame_spectrum = spectrum(frame_windowed)

        # spectrum-based descriptors (centroid/decrease use the power spectrum)
        power_spectrum = frame_spectrum**2
        pool.add(namespace + '.' + 'spectral_centroid', centroid(power_spectrum))
        pool.add(namespace + '.' + 'spectral_decrease', decrease(power_spectrum))
        pool.add(namespace + '.' + 'spectral_energy', energy(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_low', energyband_bass(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_middle_low', energyband_middle_low(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_middle_high', energyband_middle_high(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_high', energyband_high(frame_spectrum))
        pool.add(namespace + '.' + 'hfc', hfc(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_rms', rms(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_flux', flux(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_rolloff', rolloff(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_strongpeak', strongpeak(frame_spectrum))

        # central moments descriptors
        frame_centralmoments = centralmoments(power_spectrum)
        (frame_spread, frame_skewness, frame_kurtosis) = distributionshape(frame_centralmoments)
        pool.add(namespace + '.' + 'spectral_kurtosis', frame_kurtosis)
        pool.add(namespace + '.' + 'spectral_spread', frame_spread)
        pool.add(namespace + '.' + 'spectral_skewness', frame_skewness)

        # dissonance, computed from the frequency-ordered spectral peaks
        (frame_frequencies, frame_magnitudes) = spectral_peaks(frame_spectrum)
        frame_dissonance = dissonance(frame_frequencies, frame_magnitudes)
        pool.add(namespace + '.' + 'dissonance', frame_dissonance)

        # mfcc (the mel-band energies are discarded)
        (frame_melbands, frame_mfcc) = mfcc(frame_spectrum)
        pool.add(namespace + '.' + 'mfcc', frame_mfcc)

        # spectral contrast goes to its own pool for later PCA
        (sc_coeffs, sc_valleys) = spectral_contrast(frame_spectrum)
        scPool.add(namespace + '.' + 'sccoeffs', sc_coeffs)
        scPool.add(namespace + '.' + 'scvalleys', sc_valleys)

        # barkbands-based descriptors
        frame_barkbands = barkbands(frame_spectrum)
        pool.add(namespace + '.' + 'barkbands', frame_barkbands)
        pool.add(namespace + '.' + 'spectral_crest', crest(frame_barkbands))
        pool.add(namespace + '.' + 'spectral_flatness_db', flatnessdb(frame_barkbands))
        barkbands_centralmoments = essentia.CentralMoments(range=len(frame_barkbands) - 1)
        (barkbands_spread, barkbands_skewness, barkbands_kurtosis) = distributionshape(
            barkbands_centralmoments(frame_barkbands))
        pool.add(namespace + '.' + 'barkbands_spread', barkbands_spread)
        pool.add(namespace + '.' + 'barkbands_skewness', barkbands_skewness)
        pool.add(namespace + '.' + 'barkbands_kurtosis', barkbands_kurtosis)

        # pitch descriptors; only plausible pitches (0 < f <= 20kHz) are
        # stored as descriptors, but every estimate feeds the histogram below
        frame_pitch, frame_pitch_confidence = pitch_detection(frame_spectrum)
        if frame_pitch > 0 and frame_pitch <= 20000.:
            pool.add(namespace + '.' + 'pitch', frame_pitch)
        pitches.append(frame_pitch)
        pitch_confidences.append(frame_pitch_confidence)
        pool.add(namespace + '.' + 'pitch_instantaneous_confidence', frame_pitch_confidence)

        # pitch salience (spectrum trimmed by one bin)
        frame_pitch_salience = pitch_salience(frame_spectrum[:-1])
        pool.add(namespace + '.' + 'pitch_salience', frame_pitch_salience)

        # spectral complexity
        pool.add(namespace + '.' + 'spectral_complexity', spectral_complexity(frame_spectrum))

        # display of progress report
        progress.update(n_frames)

        n_frames += 1
        start_of_frame += hopSize

    # if no 'temporal_zerocrossingrate' it means that this is a silent file
    if 'zerocrossingrate' not in descriptorNames(pool.descriptorNames(), namespace):
        raise essentia.EssentiaError('This is a silent file!')

    # fold the spectral contrast matrices into the main pool via PCA
    spectralContrastPCA(scPool, pool)

    # build pitch value histogram
    from math import log
    from numpy import bincount
    # convert from Hz to midi notes (0.69314718055995 is ln 2)
    midipitches = []
    unknown = 0
    for freq in pitches:
        if freq > 0. and freq <= 12600:
            midipitches.append(12 * (log(freq / 6.875) / 0.69314718055995) - 3.)
        else:
            unknown += 1

    if len(midipitches) > 0:
        # compute histogram
        # NOTE(review): midipitches holds floats, and numpy.bincount requires
        # integer input — this likely raises TypeError on modern numpy; verify
        midipitchhist = bincount(midipitches)
        # set 0 midi pitch to be the number of pruned value
        midipitchhist[0] = unknown
        # normalise
        midipitchhist = [val / float(sum(midipitchhist)) for val in midipitchhist]
        # zero pad up to the 128 midi notes
        for i in range(128 - len(midipitchhist)):
            midipitchhist.append(0.0)
    else:
        # no usable pitch at all: put all the mass in bin 0
        midipitchhist = [0.] * 128
        midipitchhist[0] = 1.

    # pitchhist = essentia.array(zip(range(len(midipitchhist)), midipitchhist))
    pool.add(namespace + '.' + 'spectral_pitch_histogram', midipitchhist)  #, pool.GlobalScope)
    # the code below is the same as the one above:
    #for note in midipitchhist:
    #    pool.add(namespace + '.' + 'spectral_pitch_histogram_values', note)
    #    print "midi note:", note

    # distribution shape of the pitch histogram (only the spread is stored)
    pitch_centralmoments = essentia.CentralMoments(range=len(midipitchhist) - 1)
    (pitch_histogram_spread, pitch_histogram_skewness,
     pitch_histogram_kurtosis) = distributionshape(pitch_centralmoments(midipitchhist))
    pool.add(namespace + '.' + 'spectral_pitch_histogram_spread', pitch_histogram_spread)  #, pool.GlobalScope)

    progress.finish()