        self.outputs['mfcc'] = mfcc.mfcc


if __name__ == '__main__':
    # Make sure the command was well-formed.
    if len(sys.argv) < 3:
        print('Usage: extractor_mfcc.py <input audio filename> <output yaml filename>')
        sys.exit(1)

    # Loaders must be specified outside your composite algorithm.
    loader = essentia.streaming.MonoLoader(filename=sys.argv[1])

    # We are using the default values of our parameters, so we don't specify
    # any keyword arguments.
    mfccex = ExtractorMfcc()
    p = essentia.Pool()

    # When connecting to/from your composite algorithm, use the names you declared in the
    # self.inputs and self.outputs dictionaries, respectively.
    loader.audio >> mfccex.audio
    mfccex.mfcc >> (p, 'mfcc')

    essentia.run(loader)

    # CompositeBase algorithms can be translated into C++ code, and dot graphs
    # can also be generated:
    essentia.translate(ExtractorMfcc,      # algorithm to be translated
                       'myExtractorMfcc',  # output name for the generated C++ and dot files
                       dot_graph=True)     # whether a dot file should be generated

    essentia.standard.YamlOutput(filename=sys.argv[2])(p)
def featureExtractFile(curFile):
    import sys
    import numpy
    import essentia
    from essentia.streaming import MonoLoader
    from essentia.streaming import LowLevelSpectralExtractor
    from essentia.standard import YamlOutput
    from essentia.standard import YamlInput
    from essentia.standard import PoolAggregator
    from essentia.streaming import FrameCutter
    from essentia.streaming import AutoCorrelation
    import pickle

    filename = '/home/user/Desktop/soundsDB2/classifier/featureExtractionEssentia/frameSize.npz'
    npz = numpy.load(filename)
    frameSize = int(npz['frameSize'])

    # ... and instantiate our algorithms
    loader = MonoLoader(filename=curFile, sampleRate=8000)
    framecutter = FrameCutter(frameSize=frameSize, hopSize=frameSize // 4)
    autoCorrelator = AutoCorrelation()
    lowLevelExtractor = LowLevelSpectralExtractor(frameSize=frameSize,
                                                  hopSize=frameSize // 4,
                                                  sampleRate=8000)
    pool = essentia.Pool()

    loader.audio >> lowLevelExtractor.signal
    lowLevelExtractor.barkbands >> (pool, curFile[:-4] + '.barkbands')
    lowLevelExtractor.barkbands_kurtosis >> (pool, curFile[:-4] + '.barkbands_kurtosis')
    lowLevelExtractor.barkbands_skewness >> (pool, curFile[:-4] + '.barkbands_skewness')
    lowLevelExtractor.barkbands_spread >> (pool, curFile[:-4] + '.barkbands_spread')
    lowLevelExtractor.hfc >> (pool, curFile[:-4] + '.hfc')
    lowLevelExtractor.mfcc >> (pool, curFile[:-4] + '.mfcc')
    lowLevelExtractor.pitch >> (pool, curFile[:-4] + '.pitch')
    lowLevelExtractor.pitch_instantaneous_confidence >> (pool, curFile[:-4] + '.pitch_instantaneous_confidence')
    lowLevelExtractor.pitch_salience >> (pool, curFile[:-4] + '.pitch_salience')
    lowLevelExtractor.silence_rate_20dB >> (pool, curFile[:-4] + '.silence_rate_20dB')
    lowLevelExtractor.silence_rate_30dB >> (pool, curFile[:-4] + '.silence_rate_30dB')
    lowLevelExtractor.silence_rate_60dB >> (pool, curFile[:-4] + '.silence_rate_60dB')
    lowLevelExtractor.spectral_complexity >> (pool, curFile[:-4] + '.spectral_complexity')
    lowLevelExtractor.spectral_crest >> (pool, curFile[:-4] + '.spectral_crest')
    lowLevelExtractor.spectral_decrease >> (pool, curFile[:-4] + '.spectral_decrease')
    lowLevelExtractor.spectral_energy >> (pool, curFile[:-4] + '.spectral_energy')
    lowLevelExtractor.spectral_energyband_low >> (pool, curFile[:-4] + '.spectral_energyband_low')
    lowLevelExtractor.spectral_energyband_middle_low >> (pool, curFile[:-4] + '.spectral_energyband_middle_low')
    lowLevelExtractor.spectral_energyband_middle_high >> (pool, curFile[:-4] + '.spectral_energyband_middle_high')
    lowLevelExtractor.spectral_energyband_high >> None
    lowLevelExtractor.spectral_flatness_db >> (pool, curFile[:-4] + '.spectral_flatness_db')
    lowLevelExtractor.spectral_flux >> (pool, curFile[:-4] + '.spectral_flux')
    lowLevelExtractor.spectral_rms >> (pool, curFile[:-4] + '.spectral_rms')
    lowLevelExtractor.spectral_rolloff >> (pool, curFile[:-4] + '.spectral_rolloff')
    lowLevelExtractor.spectral_strongpeak >> (pool, curFile[:-4] + '.spectral_strongpeak')
    lowLevelExtractor.zerocrossingrate >> (pool, curFile[:-4] + '.zerocrossingrate')
    lowLevelExtractor.inharmonicity >> (pool, curFile[:-4] + '.inharmonicity')
    lowLevelExtractor.tristimulus >> (pool, curFile[:-4] + '.tristimulus')
    lowLevelExtractor.oddtoevenharmonicenergyratio >> (pool, curFile[:-4] + '.oddtoevenharmonicenergyratio')
    lowLevelExtractor.inharmonicity >> None
    lowLevelExtractor.tristimulus >> None
    lowLevelExtractor.oddtoevenharmonicenergyratio >> None
    #mfcc.bands >> (pool, curFile[:-4]+'.mfccBands')
    #mfcc.mfcc >> (pool, curFile[:-4]+'.mfcc')

    essentia.run(loader)

    aggrPool = PoolAggregator(defaultStats=['min', 'max', 'median', 'mean', 'var',
                                            'skew', 'kurt', 'dmean', 'dvar'])(pool)
    #aggrPool = PoolAggregator(defaultStats = ['min', 'max', 'mean', 'var'])(pool)

    YamlOutput(filename=curFile[:-4] + 'trainingFeatures.yaml', format="yaml")(aggrPool)
    essentia.reset(loader)
    return
    if options.input_file is None:
        print(usage)
        sys.exit(1)

    return options, args


if __name__ == '__main__':
    import sys, os.path, essentia

    options, args = parse_args()
    input_file = options.input_file

    # load audio file
    audio_file = essentia.AudioFileInput(filename=input_file)
    audio = audio_file()
    sampleRate = 44100.
    pool = essentia.Pool(input_file)

    if options.ground_truth_file is not None:
        import yaml

        # use the faster C-based loader/dumper when libyaml is available
        if 'CLoader' in dir(yaml):
            load = lambda x: yaml.load(x, yaml.CLoader)
            load_all = lambda x: yaml.load_all(x, yaml.CLoader)
        else:
            load = yaml.load
            load_all = yaml.load_all

        if 'CDumper' in dir(yaml):
            dump = lambda x: yaml.dump(x, Dumper=yaml.CDumper)
        else:
            dump = yaml.dump

        metadata = load(open(options.ground_truth_file))

        # add ground truth to pool
def compute(audio, pool, options):
    # analysis parameters
    sampleRate = options['sampleRate']
    frameSize = options['frameSize']
    hopSize = options['hopSize']
    windowType = options['windowType']

    # temporal descriptors
    lpc = essentia.LPC(order=10, type='warped', sampleRate=sampleRate)
    zerocrossingrate = essentia.ZeroCrossingRate()

    # frame algorithms
    frames = essentia.FrameGenerator(audio=audio, frameSize=frameSize, hopSize=hopSize)
    window = essentia.Windowing(size=frameSize, zeroPadding=0, type=windowType)
    spectrum = essentia.Spectrum(size=frameSize)

    # spectral algorithms
    barkbands = essentia.BarkBands(sampleRate=sampleRate)
    centralmoments = essentia.SpectralCentralMoments()
    crest = essentia.Crest()
    centroid = essentia.SpectralCentroid()
    decrease = essentia.SpectralDecrease()
    spectral_contrast = essentia.SpectralContrast(frameSize=frameSize,
                                                  sampleRate=sampleRate,
                                                  numberBands=6,
                                                  lowFrequencyBound=20,
                                                  highFrequencyBound=11000,
                                                  neighbourRatio=0.4,
                                                  staticDistribution=0.15)
    distributionshape = essentia.DistributionShape()
    energy = essentia.Energy()
    # energyband_bass, energyband_middle and energyband_high parameters come
    # from "standard" hi-fi equalizers
    energyband_bass = essentia.EnergyBand(startCutoffFrequency=20.0, stopCutoffFrequency=150.0, sampleRate=sampleRate)
    energyband_middle_low = essentia.EnergyBand(startCutoffFrequency=150.0, stopCutoffFrequency=800.0, sampleRate=sampleRate)
    energyband_middle_high = essentia.EnergyBand(startCutoffFrequency=800.0, stopCutoffFrequency=4000.0, sampleRate=sampleRate)
    energyband_high = essentia.EnergyBand(startCutoffFrequency=4000.0, stopCutoffFrequency=20000.0, sampleRate=sampleRate)
    flatnessdb = essentia.FlatnessDB()
    flux = essentia.Flux()
    harmonic_peaks = essentia.HarmonicPeaks()
    hfc = essentia.HFC()
    mfcc = essentia.MFCC()
    rolloff = essentia.RollOff()
    rms = essentia.RMS()
    strongpeak = essentia.StrongPeak()

    # pitch algorithms
    pitch_detection = essentia.PitchDetection(frameSize=frameSize, sampleRate=sampleRate)
    pitch_salience = essentia.PitchSalience()

    # dissonance
    spectral_peaks = essentia.SpectralPeaks(sampleRate=sampleRate, orderBy='frequency')
    dissonance = essentia.Dissonance()

    # spectral complexity
    # magnitudeThreshold = 0.005 is hardcoded for a "blackmanharris62" frame
    spectral_complexity = essentia.SpectralComplexity(magnitudeThreshold=0.005)

    INFO('Computing Low-Level descriptors...')

    # used for a nice progress display
    total_frames = frames.num_frames()
    n_frames = 0
    start_of_frame = -frameSize * 0.5

    pitches, pitch_confidences = [], []

    progress = Progress(total=total_frames)

    scPool = essentia.Pool()  # pool for spectral contrast

    for frame in frames:
        frameScope = [start_of_frame / sampleRate, (start_of_frame + frameSize) / sampleRate]
        #pool.setCurrentScope(frameScope)

        # silence rate
        pool.add(namespace + '.' + 'silence_rate_60dB', essentia.isSilent(frame))
        pool.add(namespace + '.' + 'silence_rate_30dB', is_silent_threshold(frame, -30))
        pool.add(namespace + '.' + 'silence_rate_20dB', is_silent_threshold(frame, -20))

        if options['skipSilence'] and essentia.isSilent(frame):
            total_frames -= 1
            start_of_frame += hopSize
            continue

        # temporal descriptors
        pool.add(namespace + '.' + 'zerocrossingrate', zerocrossingrate(frame))
        (frame_lpc, frame_lpc_reflection) = lpc(frame)
        pool.add(namespace + '.' + 'temporal_lpc', frame_lpc)

        frame_windowed = window(frame)
        frame_spectrum = spectrum(frame_windowed)

        # spectrum-based descriptors
        power_spectrum = frame_spectrum ** 2
        pool.add(namespace + '.' + 'spectral_centroid', centroid(power_spectrum))
        pool.add(namespace + '.' + 'spectral_decrease', decrease(power_spectrum))
        pool.add(namespace + '.' + 'spectral_energy', energy(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_low', energyband_bass(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_middle_low', energyband_middle_low(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_middle_high', energyband_middle_high(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_high', energyband_high(frame_spectrum))
        pool.add(namespace + '.' + 'hfc', hfc(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_rms', rms(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_flux', flux(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_rolloff', rolloff(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_strongpeak', strongpeak(frame_spectrum))

        # central moments descriptors
        frame_centralmoments = centralmoments(power_spectrum)
        (frame_spread, frame_skewness, frame_kurtosis) = distributionshape(frame_centralmoments)
        pool.add(namespace + '.' + 'spectral_kurtosis', frame_kurtosis)
        pool.add(namespace + '.' + 'spectral_spread', frame_spread)
        pool.add(namespace + '.' + 'spectral_skewness', frame_skewness)

        # dissonance
        (frame_frequencies, frame_magnitudes) = spectral_peaks(frame_spectrum)
        frame_dissonance = dissonance(frame_frequencies, frame_magnitudes)
        pool.add(namespace + '.' + 'dissonance', frame_dissonance)

        # mfcc
        (frame_melbands, frame_mfcc) = mfcc(frame_spectrum)
        pool.add(namespace + '.' + 'mfcc', frame_mfcc)

        # spectral contrast
        (sc_coeffs, sc_valleys) = spectral_contrast(frame_spectrum)
        scPool.add(namespace + '.' + 'sccoeffs', sc_coeffs)
        scPool.add(namespace + '.' + 'scvalleys', sc_valleys)

        # barkbands-based descriptors
        frame_barkbands = barkbands(frame_spectrum)
        pool.add(namespace + '.' + 'barkbands', frame_barkbands)
        pool.add(namespace + '.' + 'spectral_crest', crest(frame_barkbands))
        pool.add(namespace + '.' + 'spectral_flatness_db', flatnessdb(frame_barkbands))
        barkbands_centralmoments = essentia.CentralMoments(range=len(frame_barkbands) - 1)
        (barkbands_spread, barkbands_skewness, barkbands_kurtosis) = distributionshape(barkbands_centralmoments(frame_barkbands))
        pool.add(namespace + '.' + 'barkbands_spread', barkbands_spread)
        pool.add(namespace + '.' + 'barkbands_skewness', barkbands_skewness)
        pool.add(namespace + '.' + 'barkbands_kurtosis', barkbands_kurtosis)

        # pitch descriptors
        frame_pitch, frame_pitch_confidence = pitch_detection(frame_spectrum)
        if frame_pitch > 0 and frame_pitch <= 20000.:
            pool.add(namespace + '.' + 'pitch', frame_pitch)
        pitches.append(frame_pitch)
        pitch_confidences.append(frame_pitch_confidence)
        pool.add(namespace + '.' + 'pitch_instantaneous_confidence', frame_pitch_confidence)

        frame_pitch_salience = pitch_salience(frame_spectrum[:-1])
        pool.add(namespace + '.' + 'pitch_salience', frame_pitch_salience)

        # spectral complexity
        pool.add(namespace + '.' + 'spectral_complexity', spectral_complexity(frame_spectrum))

        # display of progress report
        progress.update(n_frames)

        n_frames += 1
        start_of_frame += hopSize

    # if there is no 'zerocrossingrate' descriptor, the file was entirely silent
    if 'zerocrossingrate' not in descriptorNames(pool.descriptorNames(), namespace):
        raise essentia.EssentiaError('This is a silent file!')

    spectralContrastPCA(scPool, pool)

    # build pitch value histogram
    from math import log
    from numpy import bincount

    # convert from Hz to midi notes
    midipitches = []
    unknown = 0
    for freq in pitches:
        if freq > 0. and freq <= 12600:
            # round to integer note numbers, as bincount requires integer bins
            midipitches.append(int(round(12 * (log(freq / 6.875) / 0.69314718055995) - 3.)))
        else:
            unknown += 1

    if len(midipitches) > 0:
        # compute histogram
        midipitchhist = bincount(midipitches)
        # set 0 midi pitch to be the number of pruned values
        midipitchhist[0] = unknown
        # normalise
        midipitchhist = [val / float(sum(midipitchhist)) for val in midipitchhist]
        # zero pad
        for i in range(128 - len(midipitchhist)):
            midipitchhist.append(0.0)
    else:
        midipitchhist = [0.] * 128
        midipitchhist[0] = 1.

    # pitchhist = essentia.array(zip(range(len(midipitchhist)), midipitchhist))
    pool.add(namespace + '.' + 'spectral_pitch_histogram', midipitchhist)  #, pool.GlobalScope)

    # the code below is the same as the one above:
    #for note in midipitchhist:
    #    pool.add(namespace + '.' + 'spectral_pitch_histogram_values', note)
    #    print("midi note:", note)

    pitch_centralmoments = essentia.CentralMoments(range=len(midipitchhist) - 1)
    (pitch_histogram_spread, pitch_histogram_skewness,
     pitch_histogram_kurtosis) = distributionshape(pitch_centralmoments(midipitchhist))
    pool.add(namespace + '.' + 'spectral_pitch_histogram_spread', pitch_histogram_spread)  #, pool.GlobalScope)

    progress.finish()
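# A quick sanity check of the Hz-to-MIDI conversion used in compute() above
# (0.69314718055995 is ln 2, so the expression is 12*log2(freq/6.875) - 3).
# 440 Hz should map to MIDI note 69 (A4), since 440/6.875 = 64 = 2**6:
from math import log2
assert round(12 * log2(440.0 / 6.875) - 3.) == 69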
def getStereoPanningSpectrum(_audio):
    w_l = Windowing(type='hann')
    w_r = Windowing(type='hann')
    stereoDemuxer = StereoDemuxer()
    spectrum_l = FFT(size=kN)
    spectrum_r = FFT(size=kN)
    pool = essentia.Pool()
    rms = RMS()

    freq_1 = int(np.round((250 * kN + 2) / kSampleRate))
    freq_2 = int(np.round((2500 * kN + 2) / kSampleRate))

    left, right = stereoDemuxer(_audio)
    if not np.any(right):
        right = left

    frame_l = FrameGenerator(left, frameSize=kN, hopSize=kN // 2)
    frame_r = FrameGenerator(right, frameSize=kN, hopSize=kN // 2)

    for _frame_l, _frame_r in zip(frame_l, frame_r):
        # Calculates Stereo Panning Spectrum
        l = spectrum_l(w_l(_frame_l))
        r = spectrum_r(w_r(_frame_r))
        phi_l = np.abs(l * np.conj(r)) / (np.abs(l) ** 2)
        phi_r = np.abs(r * np.conj(l)) / (np.abs(r) ** 2)
        phi = 2 * np.abs(l * np.conj(r)) / (np.abs(l) ** 2 + np.abs(r) ** 2)
        delta = phi_l - phi_r
        delta_ = []
        for bin in delta:
            if bin > 0:
                delta_.append(1)
            elif bin < 0:
                delta_.append(-1)
            else:
                delta_.append(0)
        SPS = (1 - phi) * delta_
        SPS = essentia.array(SPS)
        pool.add('panning.SPS', SPS)

        P_total = rms(SPS)
        P_low = rms(SPS[0:freq_1])
        P_medium = rms(SPS[freq_1:freq_2])
        P_high = rms(SPS[freq_2::])
        pool.add('panning.P_total', P_total)
        pool.add('panning.P_low', P_low)
        pool.add('panning.P_medium', P_medium)
        pool.add('panning.P_high', P_high)

        # Calculates Stereo Phase Spread:
        frequencies = np.linspace(1, (kN // 2) + 1, (kN // 2) + 1) * kSampleRate / (kN + 2)
        erb = erbScale(30, 11025, 40)
        phase_l = np.angle(l)
        phase_r = np.angle(r)
        mag_l = np.abs(l)
        mag_r = np.abs(r)
        pool2 = essentia.Pool()
        for erb_f0 in erb:
            freqs = np.asarray([])
            for f in frequencies:
                if find_nearest(erb, f) == erb_f0:
                    freqs = np.append(freqs, f)
                elif freqs.size != 0:
                    break
            freq1 = int(np.round((freqs[0] * kN + 2) / kSampleRate))
            freq2 = int(np.round((freqs[-1] * kN + 2) / kSampleRate))
            if freq2 == kN / 2:
                freq2 = freq2 + 1
            S_l = np.cos(2 * np.pi * (freqs / kSampleRate) + phase_l[freq1 - 1:freq2])
            S_r = np.cos(2 * np.pi * (freqs / kSampleRate) + phase_r[freq1 - 1:freq2])
            a_weight = np.mean(mag_l[freq1 - 1:freq2] + mag_r[freq1 - 1:freq2])
            delta_lr = a_weight * np.std(S_l - S_r) / np.std(S_l + S_r)
            if freq2 - freq1 == 0:
                #delta_lr = a_weight * np.mean(S_l - S_r) / np.mean(S_l + S_r)
                delta_lr = 0
            pool2.add('a', delta_lr)
        pool.add('panning.SSPS', pool2['a'])

    return pool
def analyze_hp(filename, segment_duration=20):
    lowlevelFrameSize = 2048
    lowlevelHopSize = 1024
    tonalFrameSize = 4096
    tonalHopSize = 1024

    # Compute replay gain and duration on the entire file, then load the
    # segment that is centered in time with replaygain applied
    audio = es.MonoLoader(filename=filename)()
    replaygain = es.ReplayGain()(audio)

    segment_start = (len(audio) / 44100 - segment_duration) / 2
    segment_end = segment_start + segment_duration

    if segment_start < 0 or segment_end > len(audio) / 44100:
        raise ValueError('Segment duration is larger than the input audio duration')

    loader = es.EasyLoader(filename=filename,
                           replayGain=replaygain,
                           startTime=segment_start,
                           endTime=segment_end)
    window = es.Windowing(type='blackmanharris62')
    fft = es.FFT()

    stft = []
    audio = loader()
    for frame in es.FrameGenerator(audio,
                                   frameSize=lowlevelFrameSize,
                                   hopSize=lowlevelHopSize):
        stft.append(fft(window(frame)))

    # Librosa requires bins x frames format
    stft = np.array(stft).T

    D_harmonic, D_percussive = librosa.decompose.hpss(stft, margin=8)
    D_percussive_magnitude, _ = librosa.magphase(D_percussive)
    D_harmonic_magnitude, _ = librosa.magphase(D_harmonic)

    # Convert back to Essentia format (frames x bins)
    spectrum_harmonic = D_harmonic_magnitude.T
    spectrum_percussive = D_percussive_magnitude.T

    # Processing for Mel bands
    melbands = es.MelBands(numberBands=96,
                           lowFrequencyBound=0,
                           highFrequencyBound=11025)
    # Normalize Mel bands: log10(1+x*10000)
    norm = es.UnaryOperator(type='identity', shift=1, scale=10000)
    log10 = es.UnaryOperator(type='log10')

    p = essentia.Pool()
    for spectrum_frame in spectrum_harmonic:
        p.add('melbands_harmonic', log10(norm(melbands(spectrum_frame))))
    for spectrum_frame in spectrum_percussive:
        p.add('melbands_percussive', log10(norm(melbands(spectrum_frame))))

    return p
def essentia_midi(file):
    pool = essentia.Pool()

    # Compute all features, aggregate only 'mean' and 'stdev' statistics for
    # all low-level, rhythm and tonal frame features
    features, features_frames = es.MusicExtractor(
        lowlevelStats=['mean', 'stdev'],
        rhythmStats=['mean', 'stdev'],
        tonalStats=['mean', 'stdev'])(file)

    # You can then access particular values in the pools:
    print("Filename:", features['metadata.tags.file_name'])
    print("-" * 80)
    print("Replay gain:", features['metadata.audio_properties.replay_gain'])
    print("EBU128 integrated loudness:", features['lowlevel.loudness_ebu128.integrated'])
    print("EBU128 loudness range:", features['lowlevel.loudness_ebu128.loudness_range'])
    print("-" * 80)
    print("MFCC mean:", features['lowlevel.mfcc.mean'])
    print("-" * 80)
    print("BPM:", features['rhythm.bpm'])
    print("Beat positions (sec.)", features['rhythm.beats_position'])
    print("-" * 80)
    print("Key/scale estimation (using a profile specifically suited for electronic music):",
          features['tonal.key_edma.key'], features['tonal.key_edma.scale'])

    # BPM detection: load the audio file...
    audio = MonoLoader(filename=file)()
    # ...and compute beat positions and BPM
    rhythm_extractor = RhythmExtractor2013(method="multifeature")
    bpm, beats, beats_confidence, _, beats_intervals = rhythm_extractor(audio)

    beat_volume_extractor = BeatsLoudness(beats=beats)
    beats_loudness, beats_loudness_band_ratio = beat_volume_extractor(audio)

    # Danceability detection
    danceability_extractor = Danceability()
    danceability, dfa = danceability_extractor(audio)

    # Melody detection
    # Load the audio file; it is recommended to apply an equal-loudness filter
    # for PredominantPitchMelodia
    loader = EqloudLoader(filename=file, sampleRate=44100)
    audio = loader()
    print("Duration of the audio sample [sec]:")
    print(len(audio) / 44100.0)

    pitch_extractor = PredominantPitchMelodia(frameSize=2048, hopSize=1024)
    pitch_values, pitch_confidence = pitch_extractor(audio)

    midi_extractor = PitchContourSegmentation(hopSize=1024)
    onset, duration, midi_pitch = midi_extractor(pitch_values, audio)

    # Pitch is estimated on frames. Compute frame time positions.
    pitch_times = numpy.linspace(0.0, len(audio) / 44100.0, len(pitch_values))

    # Storing in Pool
    pool.add('MIDIonset', onset)
    pool.add('MIDIduration', duration)
    pool.add('MIDIpitch', midi_pitch)
    pool.add('pitch', pitch_values)
    pool.add('danceability', danceability)
    pool.add('beat-loudness', beats_loudness)
    pool.add('beats', beats)
    pool.add('bpm', bpm)

    output = YamlOutput(filename='./analyzer/output.json',
                        format='json',  # use format='json' for JSON output
                        indent=4,
                        writeVersion=False)
    output(pool)
def pickleToPool(nparr):
    pool = essentia.Pool()
    for tup in nparr:
        pool.add(tup[0], tup[1])
    return pool
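# Hypothetical round-trip for pickleToPool(): 'features.pkl' is an assumed
# filename holding an array of (descriptor name, value) pairs saved earlier
# with pickle; each pair becomes an entry in a fresh Pool.
import pickle

with open('features.pkl', 'rb') as f:
    nparr = pickle.load(f)

pool = pickleToPool(nparr)
print(pool.descriptorNames())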
def cleaningSineTracks(self, pool, minFrames):
    """
    Cleans the identified sine tracks based on a minimum number of frames.

    reference: https://github.com/MTG/essentia/blob/b5b46f80d80058603a525af36cbf7069c17c3df9/test/src/unittests/synthesis/test_sinemodel_streaming.py

    :param pool: must contain pool["magnitudes"], pool["frequencies"] and pool["phases"]
    :param minFrames: minimum number of frames required for a sine track to be valid
    :return: cleaned up pool
    """
    freqsTotal = pool["frequencies"]
    nFrames = freqsTotal.shape[0]
    begTrack = 0
    freqsClean = freqsTotal.copy()

    if nFrames > 0:
        f = 0
        # we assume all frames have a fixed number of tracks
        nTracks = freqsTotal.shape[1]

        for t in range(nTracks):
            f = 0
            begTrack = f

            while f < nFrames - 1:
                # check if f is the beginning of a track
                if freqsClean[f][t] <= 0 and freqsClean[f + 1][t] > 0:
                    begTrack = f + 1

                # clean the track if it is shorter than the minimum duration
                if (freqsClean[f][t] > 0 and freqsClean[f + 1][t] <= 0) and ((f - begTrack) < minFrames):
                    for i in range(begTrack, f + 1):
                        freqsClean[i][t] = 0

                f += 1

    cleaned_pool = essentia.Pool()
    for frame_ix, originalTracks in enumerate(freqsTotal):
        freqs = []
        mags = []
        phases = []
        for track_ix, freqTrack in enumerate(originalTracks):
            if freqTrack in freqsClean[frame_ix]:
                freqs.append(pool["frequencies"][frame_ix][track_ix])
                mags.append(pool["magnitudes"][frame_ix][track_ix])
                phases.append(pool["phases"][frame_ix][track_ix])
            else:
                freqs.append(0)
                mags.append(0)
                phases.append(0)
        cleaned_pool.add("frequencies", essentia.array(freqs))
        cleaned_pool.add("magnitudes", essentia.array(mags))
        cleaned_pool.add("phases", essentia.array(phases))

    return cleaned_pool
def compute_features(complete_path):
    result = []
    meta_result = []
    file_count = 0

    # loop over all wav files in the directory
    for file in os.listdir(complete_path):
        if file.endswith(".wav"):
            file_count += 1
            # print(file + ' : ' + str(file_count))

            # load our audio into an array
            audio = es.MonoLoader(filename=complete_path + file, sampleRate=44100)()

            # create the pool and the necessary algorithms
            pool = essentia.Pool()
            window = es.Windowing()
            energy = es.Energy()
            spectrum = es.Spectrum()
            centroid = es.Centroid(range=22050)
            rolloff = es.RollOff()
            crest = es.Crest()
            speak = es.StrongPeak()
            rmse = es.RMS()
            mfcc = es.MFCC()
            flux = es.Flux()
            barkbands = es.BarkBands(sampleRate=44100)
            zerocrossingrate = es.ZeroCrossingRate()

            meta = es.MetadataReader(filename=complete_path + file, failOnError=True)()
            pool_meta, duration, bitrate, samplerate, channels = meta[7:]
            # centralmoments = es.SpectralCentralMoments()
            # distributionshape = es.DistributionShape()

            # compute the descriptors for all frames in our audio and add them to the pool
            for frame in es.FrameGenerator(audio, frameSize=1024, hopSize=512):
                frame_windowed = window(frame)
                frame_spectrum = spectrum(frame_windowed)

                c = centroid(frame_spectrum)
                pool.add('spectral.centroid', c)

                cr = crest(frame_spectrum)
                pool.add('spectral crest', cr)

                r = rolloff(frame_spectrum)
                pool.add('spectral rolloff', r)

                sp = speak(frame_spectrum)
                pool.add('strong peak', sp)

                rms = rmse(frame_spectrum)
                pool.add('RMS', rms)

                pool.add('spectral_energy', energy(frame_spectrum))

                # (frame_melbands, frame_mfcc) = mfcc(frame_spectrum)
                # pool.add('frame_MFCC', frame_mfcc)

                fl = flux(frame_spectrum)
                pool.add('spectral flux', fl)

                # bbands = barkbands(frame_spectrum)
                # pool.add('bark bands', bbands)

                zcr = zerocrossingrate(frame_spectrum)
                pool.add('zero crossing rate', zcr)

                # frame_centralmoments = centralmoments(power_spectrum)
                # (frame_spread, frame_skewness, frame_kurtosis) = distributionshape(frame_centralmoments)
                # pool.add('spectral_kurtosis', frame_kurtosis)
                # pool.add('spectral_spread', frame_spread)
                # pool.add('spectral_skewness', frame_skewness)

            # aggregate the results (find mean if needed)
            aggrpool = es.PoolAggregator(defaultStats=['mean'])(pool)  # ,'stdev'])(pool)

            pool_meta.set("duration", duration)
            pool_meta.set("filename", os.path.relpath(file))

            # write pools to lists
            pool_arr = pool_to_array(aggrpool)
            result.append(pool_arr)
            meta_arr = pool_to_array(pool_meta)
            meta_result.append(meta_arr)

    features_df = pd.DataFrame.from_records(result)
    features_df.columns = ['centroid', 'crest', 'roll off', 'strong peak', 'rms', 'energy', 'flux', 'zcr']
    meta_df = pd.DataFrame.from_records(meta_result)
    meta_df.columns = ['duration', 'filename', 'metadata.tags.comment']
    del meta_df['metadata.tags.comment']

    return features_df, meta_df
def analysis_synthesis_spr_model_standard(self, params, signal):
    pool = essentia.Pool()

    # Standard algos for sine model analysis
    w = es.Windowing(type="hann")
    fft = es.FFT(size=params['fftSize'])
    smanal = es.SineModelAnal(
        sampleRate=params['sampleRate'],
        maxnSines=params['maxnSines'],
        magnitudeThreshold=params['magnitudeThreshold'],
        freqDevOffset=params['freqDevOffset'],
        freqDevSlope=params['freqDevSlope'])

    # Standard algos for sine model synthesis
    smsyn = es.SineModelSynth(sampleRate=params['sampleRate'],
                              fftSize=params['frameSize'],
                              hopSize=params['hopSize'])
    ifft = es.IFFT(size=params['frameSize'])
    overlSine = es.OverlapAdd(frameSize=params['frameSize'],
                              hopSize=params['hopSize'],
                              gain=1. / params['frameSize'])
    overlres = es.OverlapAdd(frameSize=params['frameSize'],
                             hopSize=params['hopSize'],
                             gain=1. / params['frameSize'])

    fft_original = []

    # analysis
    for frame in es.FrameGenerator(signal,
                                   frameSize=params["frameSize"],
                                   hopSize=params["hopSize"]):
        frame_fft = fft(w(frame))
        fft_original.append(frame_fft)
        freqs, mags, phases = smanal(frame_fft)
        pool.add("frequencies", freqs)
        pool.add("magnitudes", mags)
        pool.add("phases", phases)

    # remove short tracks
    minFrames = int(params['minSineDur'] * params['sampleRate'] / params['hopSize'])
    pool = self.cleaningSineTracks(pool, minFrames)

    # synthesis
    sineTracksAudio = np.array([])
    resTracksAudio = np.array([])
    for frame_ix, _ in enumerate(pool["frequencies"]):
        sine_frame_fft = smsyn(pool["magnitudes"][frame_ix],
                               pool["frequencies"][frame_ix],
                               pool["phases"][frame_ix])
        res_frame_fft = fft_original[frame_ix] - sine_frame_fft
        sine_outframe = overlSine(ifft(sine_frame_fft))
        sineTracksAudio = np.append(sineTracksAudio, sine_outframe)
        res_outframe = overlres(ifft(res_frame_fft))
        resTracksAudio = np.append(resTracksAudio, res_outframe)

    sineTracksAudio = sineTracksAudio.flatten()[-len(signal):]
    resTracksAudio = resTracksAudio.flatten()[-len(signal):]
    #print("len signal", len(signal), "len res", len(resTracksAudio))

    return essentia.array(signal), essentia.array(sineTracksAudio), essentia.array(resTracksAudio)
def extractFeatures(audio_data):
    """
    Receives a vector of reals representing an audio signal, computes its
    features, aggregates them into an essentia Pool() and returns that Pool.
    """
    from numpy import ndarray
    assert (type(audio_data) is ndarray)
    assert ("float" in str(audio_data.dtype))

    # Initialize the Pool()
    output_pool = es.Pool()

    # Compute the signal spectrum
    output_pool.set(pk_spectrum, es_mode.Spectrum()(audio_data))

    # Compute EnergyBandRatio
    energy_band_ratio = es_mode.EnergyBandRatio()(output_pool[pk_spectrum])
    output_pool.set(pk_energy_band_ratio, energy_band_ratio)

    # Compute MaxMagFreq
    max_mag_freq = es_mode.MaxMagFreq()(output_pool[pk_spectrum])
    output_pool.set(pk_max_mag_freq, max_mag_freq)

    # Compute SpectralCentroidTime
    spectral_centroid_time = es_mode.SpectralCentroidTime()(audio_data)
    output_pool.set(pk_spectral_centroid_time, spectral_centroid_time)

    # Compute SpectralComplexity
    spectral_complexity = es_mode.SpectralComplexity()(output_pool[pk_spectrum])
    output_pool.set(pk_spectral_complexity, spectral_complexity)

    # Compute StrongPeak
    strong_peak = es_mode.StrongPeak()(output_pool[pk_spectrum])
    output_pool.set(pk_strong_peak, strong_peak)

    # Compute SpectralPeaks
    sp_freq, sp_mag = es_mode.SpectralPeaks()(output_pool[pk_spectrum])
    # drop the DC component, if present, as required by HarmonicPeaks
    if sp_freq[0] == 0:
        sp_freq = sp_freq[1:]
        sp_mag = sp_mag[1:]
    output_pool.set(pk_spectral_peaks_freq, sp_freq)
    output_pool.set(pk_spectral_peaks_mag, sp_mag)

    ######################################
    #         For Inharmonicity          #
    ######################################
    # Compute PitchYinFFT
    pitch_yin_fft, pitch_prob_yin_fft = es_mode.PitchYinFFT()(output_pool[pk_spectrum])
    output_pool.set(pk_pitch, pitch_yin_fft)
    output_pool.set(pk_pitch_prob, pitch_prob_yin_fft)

    # Compute HarmonicPeaks
    hp_freq, hp_mag = es_mode.HarmonicPeaks()(output_pool[pk_spectral_peaks_freq],
                                              output_pool[pk_spectral_peaks_mag],
                                              output_pool[pk_pitch])
    output_pool.set(pk_harmonic_peaks_freq, hp_freq)
    output_pool.set(pk_harmonic_peaks_mag, hp_mag)

    # Compute Inharmonicity
    inharmonicity = es_mode.Inharmonicity()(output_pool[pk_harmonic_peaks_freq],
                                            output_pool[pk_harmonic_peaks_mag])
    output_pool.set(pk_inharmonicity, inharmonicity)
    # End of Inharmonicity ###################################

    # Compute SpectralContrast
    frame_size = 2 * (output_pool[pk_spectrum].size - 1)
    spectral_contrast, spectral_valley = \
        es_mode.SpectralContrast(frameSize=frame_size)(output_pool[pk_spectrum])
    output_pool.set(pk_spectral_contrast, spectral_contrast)
    output_pool.set(pk_spectral_valley, spectral_valley)

    # Compute SpectralWhitening
    spectral_whitening = \
        es_mode.SpectralWhitening()(output_pool[pk_spectrum],
                                    output_pool[pk_spectral_peaks_freq],
                                    output_pool[pk_spectral_peaks_mag])
    output_pool.set(pk_spectral_whitening, spectral_whitening)

    return output_pool
def main():
    aparser = argparse.ArgumentParser()
    aparser.add_argument('-c',
                         action='store',
                         dest='config',
                         help='-c type of the dataset. For ex: _1s_h100 for 1s with full length hop')
    aparser.add_argument('-t',
                         action='store',
                         dest='data_type',
                         help='-t type of data original/harmonic/residual')
    args = aparser.parse_args()

    if not args.config:
        aparser.error('Please specify the data config!')

    conf = args.config
    if args.data_type == 'original':
        path_to_dataset = PATH_TO_ORIGINAL_WAV_FILES + conf
        path_to_features = PATH_TO_ORIGINAL_FEATURES + conf
    if args.data_type == 'residual':
        path_to_dataset = PATH_TO_RESIDUAL_WAV_FILES + conf
        path_to_features = PATH_TO_RESIDUAL_FEATURES + conf
    if args.data_type == 'harmonic':
        path_to_dataset = PATH_TO_HARMONIC_WAV_FILES + conf
        path_to_features = PATH_TO_HARMONIC_FEATURES + conf

    datasets = sorted(os.listdir(path_to_dataset))
    for dataset in datasets:
        empty_files = 0
        print("[Dataset] : " + dataset)
        folder_path = os.path.join(path_to_dataset, dataset)
        lrms = sorted(os.listdir(folder_path))
        for channel in lrms:
            channel_path = os.path.join(folder_path, channel)
            sub_folders = sorted(os.listdir(channel_path))
            for sub_folder in sub_folders:
                sub_folder_path = os.path.join(channel_path, sub_folder)
                files = sorted(os.listdir(sub_folder_path))
                for filename in files:
                    filepath = os.path.join(sub_folder_path, filename)
                    features = essentia.Pool()
                    try:
                        # Compute all features, aggregate only 'mean' and 'stdev'
                        # statistics for all low-level, rhythm and tonal frame features
                        features, features_frames = es.MusicExtractor(
                            lowlevelSilentFrames='drop',
                            lowlevelFrameSize=2048,
                            lowlevelHopSize=1024,
                            lowlevelStats=['mean', 'stdev'])(filepath)
                        features_frames = []
                    except RuntimeError as e:
                        print(filepath + " is almost silent")
                        empty_files += 1
                    dump_path = os.path.join(path_to_features, dataset, channel, sub_folder)
                    create_folder(dump_path)
                    es.YamlOutput(filename=os.path.join(dump_path, filename.replace('.wav', '.json')),
                                  format='json')(features)
                    features = []
                    filename = []
        print("Feature Extraction Completed Successfully for " + dataset)
        print("Total number of empty files in " + dataset + " is " + str(empty_files))
def compute(profile, inputFilename, outputFilename, userOptions={}):
    # load profile
    profileDirectory = __file__.split(os.path.sep)[:-1]
    profileDirectory.append('profiles')
    profileDirectory.append('%s_config.yaml' % profile)

    try:
        # try to load the predefined profile, if it exists
        config = open(os.path.sep.join(profileDirectory), 'r').read()
    except:
        # otherwise, just load the file that was specified
        config = open(profile, 'r').read()

    options = yaml.load(config)
    mergeRecursiveDict(options, userOptions)

    # which format for the output?
    format = options['outputFormat']
    if format not in ['xml', 'yaml']:
        raise essentia.EssentiaError('output format should be either \'xml\' or \'yaml\'')
    if format == 'xml':
        xmlOutput = True
    else:
        xmlOutput = False

    # we need this for dependencies checking
    options['computed'] = []
    options['generatedBy'] = {}

    # get the list of extractors to compute
    extractors = options['extractors']

    # create pool & megalopool
    pool = essentia.Pool()

    # load audio file into memory
    audio = loadAudioFile(inputFilename, pool, options)

    # preprocess audio by applying a DC filter, normalization, etc...
    # preprocessing is a special step because it modifies the audio, hence it
    # must be executed before all the other extractors
    audio = preProcess(audio, pool, options, 'metadata')
    options['globalPreprocessing'] = options['preprocessing']
    del options['preprocessing']

    # process all extractors
    computeAllExtractors(extractors, audio, pool, options)

    # process segmentation if asked
    if options['segmentation']['doSegmentation']:
        segments = segmentation.compute(inputFilename, audio, pool, options)

    # remove unwanted descriptors
    wantedStats = cleanStats(pool, options)

    # add to megalopool
    #megalopool = essentia.Pool()
    scope = [0.0, len(audio) / options['sampleRate']]
    #megalopool.add('global', pool.aggregate_descriptors(wantedStats))  #, scope)
    megalopool = essentia.PoolAggregator(exceptions=wantedStats)(pool)

    # special case for spectral contrast, which is only 1 matrix, therefore no
    # stats are computed:
    spectral_contrast_stats(megalopool, 'lowlevel.spectral_contrast', wantedStats)

    # plotting descriptors evolution
    try:
        if options['plots']:
            import plotting
            plotting.compute(inputFilename, audio, pool, options)
    except KeyError:
        pass

    # compute extractors on segments
    if options['segmentation']['doSegmentation']:
        if options['segmentation']['computeSegments']:
            if len(segments) == 0:
                megalopool.add('void', [0])
            else:
                computeSegments(audio, segments, extractors, megalopool, options)

    # save to output file
    essentia.YamlOutput(filename=outputFilename)(megalopool)
plt.pcolormesh(np.array(mfccs))
plt.show()
"""

"""
# and let's do it in a more essentia-like way:
mfccs = []
for frame in ess.FrameGenerator(audio, frameSize=1024, hopSize=512):
    mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
    mfccs.append(mfcc_coeffs)

# transpose to have it in a better shape
mfccs = ess.array(mfccs).T
"""

# So let's redo the previous using a Pool
pool = es.Pool()

for frame in ess.FrameGenerator(audio, frameSize=1024, hopSize=512):
    mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
    pool.add('lowlevel.mfcc', mfcc_coeffs)
    pool.add('lowlevel.mfcc_bands', mfcc_bands)

"""
plotMfcc = pool['lowlevel.mfcc'].T[1:, :]
plt.pcolormesh(plotMfcc)
plt.show()
"""

#output = es.YamlOutput(filename='mfcc.sig')
output = ess.YamlOutput(filename='joeTestOut/mfcc.json', format='json')
output(pool)

# Say we're not interested in all the MFCC frames, but just their mean & variance;
# see the aggregation sketch below.
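# A minimal sketch of that aggregation step (using the same 'ess' alias as
# above): PoolAggregator computes the requested statistics over each
# descriptor in the pool and produces keys like 'lowlevel.mfcc.mean'.
aggrPool = ess.PoolAggregator(defaultStats=['mean', 'var'])(pool)
print(aggrPool['lowlevel.mfcc.mean'])
print(aggrPool['lowlevel.mfcc.var'])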
import os
import essentia as e
import essentia.streaming as estr

# CONFIGURATION
# ================================================================================

# Default parameters
sample_rate = 44100
window_size = 16384
hop_size = 8192
tuning_frequency = 440

# retrieve filenames from folder:
soundfiles = os.listdir(audio_folder)
if '.DS_Store' in soundfiles:
    soundfiles.remove('.DS_Store')

# ANALYSIS
# ================================================================================
print("\nANALYSIS...")
for item in soundfiles:
    loader = estr.MonoLoader(filename=audio_folder + '/' + item, sampleRate=sample_rate)
    tuningExtractor = estr.TuningFrequencyExtractor(frameSize=window_size, hopSize=hop_size)
    pool = e.Pool()

    loader.audio >> tuningExtractor.signal
    tuningExtractor.tuningFrequency >> (pool, 'tuning_reference')

    # run and print the results.
    e.run(loader)
    result = pool['tuning_reference']
    print(item[:20] + '... ', result)
def extractFeatures(self,
                    audio,
                    scale="onsets",
                    listOfFeatures=['Loudness', 'Centroid', 'Flatness', 'BFCC']):
    """Extract features from an audio vector.

    This tends to be pretty slow for onset based segmentation and retrieval.

    :param audio: the audio to extract features from
    :param scale: the temporal scale we wish to use

    :return:
        features: the list of audio features
        units: if FFT scale, then the fft frames also
    """
    pool = essentia.Pool()
    medianPool = essentia.Pool()

    centroid = flatness = loudness = pitchYinFFT = None
    mfcc = bfcc = gfcc = spectralPeaks = hpcp = None

    if 'Centroid' in listOfFeatures:
        centroid = essentia.standard.Centroid(range=self.sampleRate / 2)
    if 'Flatness' in listOfFeatures:
        flatness = essentia.standard.Flatness()
    if 'Loudness' in listOfFeatures:
        loudness = essentia.standard.Loudness()
    if 'Pitch' in listOfFeatures:
        pitchYinFFT = essentia.standard.PitchYinFFT()
    if 'MFCC' in listOfFeatures:
        mfcc = essentia.standard.MFCC(inputSize=int(self.frameSize / 2 + 1))
    if 'BFCC' in listOfFeatures:
        bfcc = essentia.standard.BFCC(inputSize=int(self.frameSize / 2 + 1))
    if 'GFCC' in listOfFeatures:
        gfcc = essentia.standard.GFCC(inputSize=int(self.frameSize / 2 + 1))
    if 'HPCP' in listOfFeatures:
        spectralPeaks = essentia.standard.SpectralPeaks(orderBy="magnitude",
                                                        magnitudeThreshold=1e-05,
                                                        minFrequency=40,
                                                        maxFrequency=5000,
                                                        maxPeaks=10000)
        hpcp = essentia.standard.HPCP()

    fft = essentia.standard.FFT()
    magnitude = essentia.standard.Magnitude()
    w = essentia.standard.Windowing(type='blackmanharris62')

    features = []
    units = []
    f = []

    # #Manual framecutting is faster than Essentia in Python
    # for fstart in range(0, len(audio) - self.frameSize, self.hopSize):
    #     #Get the frame
    #     frame = audio[fstart:fstart + self.frameSize]
    for frame in essentia.standard.FrameGenerator(audio,
                                                  frameSize=self.frameSize,
                                                  hopSize=self.hopSize):
        # FFT and magnitude spectrum
        fft_frame = fft(w(frame))
        mag = magnitude(fft_frame)

        if centroid is not None:
            centroidScalar = centroid(mag)
            pool.add("Centroid", centroidScalar)
        if flatness is not None:
            flatnessScalar = flatness(mag)
            pool.add("Flatness", flatnessScalar)
        if loudness is not None:
            loudnessScalar = loudness(frame)
            pool.add("Loudness", loudnessScalar)
        if pitchYinFFT is not None:
            pitchScalar, pitchConfidenceScalar = pitchYinFFT(mag)
            # pool.add("pitch", pitchScalar)
            medianPool.add("Pitch", pitchScalar)

        import time
        startTime = time.time()

        if mfcc is not None:
            mfcc_bands, mfccVector = mfcc(mag)
            pool.add("MFCC", mfccVector[1:])
        if bfcc is not None:
            bfcc_bands, bfccVector = bfcc(mag)
            pool.add("BFCC", bfccVector[1:])
        if gfcc is not None:
            gfcc_bands, gfccVector = gfcc(mag)
            pool.add("GFCC", gfccVector[1:])
        if hpcp is not None:
            frequencies, magnitudes = spectralPeaks(mag)
            hpcpVector = hpcp(frequencies, magnitudes)
            pool.add("HPCP", hpcpVector)
            f.append(hpcpVector)

        elapsedTime = time.time() - startTime
        x = pool.descriptorNames()

        # If we are spectral based we need to return the fft frames as units
        # and the framewise features
        if scale == "spectral":
            units.append(fft_frame)
            frameFeatures = []
            # We do it this roundabout way to retain the order that the user
            # wants in listOfFeatures
            for feature in listOfFeatures:
                for descriptor in pool.descriptorNames():
                    if feature in descriptor:
                        frameFeatures = np.append(frameFeatures, (pool[descriptor]))
                for descriptor in medianPool.descriptorNames():
                    if feature in descriptor:
                        frameFeatures = np.append(frameFeatures, (medianPool[descriptor]))
            features.append(frameFeatures)
            pool.clear()
            medianPool.clear()

    # Now we get all the stuff out of the pool
    if scale != "spectral":
        # aggrPool = essentia.standard.PoolAggregator(defaultStats=['mean', 'var'])(pool)
        aggrPool = essentia.standard.PoolAggregator(defaultStats=['mean'])(pool)
        medianAggrPool = essentia.standard.PoolAggregator(defaultStats=['median'])(medianPool)

        # We do it this roundabout way to retain the order that the user
        # wants in listOfFeatures
        for feature in listOfFeatures:
            for aggrFeature in aggrPool.descriptorNames():
                if feature in aggrFeature:
                    if "mean" in aggrFeature or "variance" in aggrFeature:
                        features = np.append(features, aggrPool[aggrFeature])
                    else:
                        features += aggrPool[aggrFeature][0]
            # Median based features (i.e. pitch)
            for medianFeature in medianAggrPool.descriptorNames():
                if feature in medianFeature:
                    if "median" in medianFeature:
                        features = np.append(features, medianAggrPool[medianFeature])
                    else:
                        features += medianAggrPool[medianFeature][0]

        aggrPool.merge(medianAggrPool)

    # Return features, and if it's spectral return the frames as units
    return features, units, pool
def load_audio(type='mono'):
    raw_audio = OrderedDict()
    stem_audio = OrderedDict()

    if 'mono' in type:
        # loads raw audio
        loader = MonoLoader()
        for name in gNameTracks:
            path = gRawPath[name]
            loader.configure(filename=path)
            pool = essentia.Pool()
            loader.audio >> (pool, 'loader.audio')
            essentia.run(loader)
            print('Raw track contains %d samples of Audio' % len(pool['loader.audio']))
            raw_audio[name] = pool['loader.audio']
            essentia.reset(loader)

        # loads stem audio
        for name in gNameTracks:
            path = gStemPath[name]
            loader.configure(filename=path)
            pool = essentia.Pool()
            loader.audio >> (pool, 'loader.audio')
            essentia.run(loader)
            print('Stem track contains %d samples of Audio' % len(pool['loader.audio']))
            stem_audio[name] = pool['loader.audio']
            essentia.reset(loader)

    elif 'stereo' in type:
        # loads raw stereo audio
        for name in gNameTracks:
            path = gRawPath[name]
            loader = AudioLoader(filename=path)
            pool = essentia.Pool()
            loader.audio >> (pool, 'loader.audio')
            loader.sampleRate >> None
            loader.numberChannels >> None
            loader.md5 >> None
            loader.bit_rate >> None
            loader.codec >> None
            essentia.run(loader)
            print('Raw Stereo track contains %d samples of Audio' % len(pool['loader.audio']))
            raw_audio[name] = pool['loader.audio']
            essentia.reset(loader)

        # loads stem stereo audio
        for name in gNameTracks:
            path = gStemStereoPath[name]
            loader = AudioLoader(filename=path)
            pool = essentia.Pool()
            loader.audio >> (pool, 'loader.audio')
            loader.sampleRate >> None
            loader.numberChannels >> None
            loader.md5 >> None
            loader.bit_rate >> None
            loader.codec >> None
            essentia.run(loader)
            print('Stem Stereo track contains %d samples of Audio' % len(pool['loader.audio']))
            stem_audio[name] = pool['loader.audio']
            essentia.reset(loader)

    return raw_audio, stem_audio
def analyseFile(self,
                file,
                writeOnsets,
                scale="onsets",
                yamlOutputFile="",
                onsetDetection="",
                listOfFeatures=['Loudness', 'Centroid', 'Flatness', 'MFCC']):
    """Extract onsets from a single file then extract features from all those onsets

    :param file: the file to analyse
    :param writeOnsets: whether you want to write the audio onsets to the filesystem
    :param scale: the temporal scale: None, spectral, onsets, beats

    :return:
        features: lists of lists of features
        units: list of audio signals corresponding to units
        unitTimes: the list of transient times from the audio signals
    """
    onsetTimes = []
    onsets = []
    fileName = file
    filePool = essentia.Pool()

    print("Processing file: " + file)
    if enableDebug:
        self.debugFile.write(file + "\n")

    # Extract onsets, or add the audio as a single onset
    print("    Onset Detection and Segmentation...")
    if scale == "beats":
        onsetTimes, onsets, fileName = self.extractBeats(file)
    elif scale == "onsets":
        onsetTimes, onsets, fileName = self.extractAndSliceOnsets(file, method=onsetDetection)
    else:
        onsetTimes.append(0.0)
        audio = self.loadAudio(file)
        onsets.append(audio)

    # Optionally write these onsets out
    if writeOnsets:
        fileNames = self.writeOnsets(onsets, file)

    features = []
    units = []

    print("    Feature Extraction...")
    for onsetTime, onset in zip(onsetTimes, onsets):
        onsetFeatures, onsetFFTs, onsetPool = self.extractFeatures(onset, scale, listOfFeatures=listOfFeatures)

        # If it's not onset based then spectra are the units, append
        if scale == "spectral":
            units += onsetFFTs
            features += onsetFeatures
        else:
            features.append(onsetFeatures)

        onsetPool.add("onsetTimes", onsetTime)
        filePool.merge(onsetPool, "append")

    if scale != "spectral":
        units = onsets

    if yamlOutputFile != "":
        essentia.standard.YamlOutput(filename=yamlOutputFile)(filePool)

    return features, units, onsetTimes
def analyze(filename, segment_duration=20):
    lowlevelFrameSize = 2048
    lowlevelHopSize = 1024
    tonalFrameSize = 4096
    tonalHopSize = 1024

    # Compute replay gain and duration on the entire file, then load the
    # segment that is centered in time with replaygain applied
    audio = es.MonoLoader(filename=filename)()
    replaygain = es.ReplayGain()(audio)

    segment_start = (len(audio) / 44100 - segment_duration) / 2
    segment_end = segment_start + segment_duration

    if segment_start < 0 or segment_end > len(audio) / 44100:
        raise ValueError('Segment duration is larger than the input audio duration')

    # TODO
    # There's a bug in streaming mode Python wrapper: running both Mel and HPCP
    # in the same network with the same loader will result in a memory error.
    # This does not happen in C++. As a workaround, compute Mel and HPCP in
    # two separate networks with two separate loaders.
    loader_mel = EasyLoader(filename=filename,
                            replayGain=replaygain,
                            startTime=segment_start,
                            endTime=segment_end)
    loader_hpcp = EasyLoader(filename=filename,
                             replayGain=replaygain,
                             startTime=segment_start,
                             endTime=segment_end)

    # Processing for Mel bands
    framecutter_mel = FrameCutter(frameSize=lowlevelFrameSize, hopSize=lowlevelHopSize)
    window_mel = Windowing(type='blackmanharris62')
    spectrum_mel = Spectrum()
    melbands = MelBands(numberBands=96,
                        lowFrequencyBound=0,
                        highFrequencyBound=11025)

    # Processing for HPCPs
    framecutter_hpcp = FrameCutter(frameSize=tonalFrameSize, hopSize=tonalHopSize)
    window_hpcp = Windowing(type='blackmanharris62')
    spectrum_hpcp = Spectrum()
    speaks = SpectralPeaks(maxPeaks=60,
                           magnitudeThreshold=0.00001,
                           minFrequency=20.0,
                           maxFrequency=3500.0,
                           orderBy='magnitude')

    # Normalize Mel bands: log10(1+x*10000)
    norm = UnaryOperator(type='identity', shift=1, scale=10000)
    log10 = UnaryOperator(type='log10')

    hpcp = HPCP(size=12,
                bandPreset=False,
                minFrequency=20.0,
                maxFrequency=3500.0,
                weightType='cosine',
                windowSize=1.)

    p = essentia.Pool()

    loader_mel.audio >> framecutter_mel.signal
    framecutter_mel.frame >> window_mel.frame >> spectrum_mel.frame
    spectrum_mel.spectrum >> melbands.spectrum
    melbands.bands >> norm.array >> log10.array >> (p, 'melbands')
    essentia.run(loader_mel)

    loader_hpcp.audio >> framecutter_hpcp.signal
    framecutter_hpcp.frame >> window_hpcp.frame >> spectrum_hpcp.frame
    spectrum_hpcp.spectrum >> speaks.spectrum
    speaks.frequencies >> hpcp.frequencies
    speaks.magnitudes >> hpcp.magnitudes
    hpcp.hpcp >> (p, 'hpcp')
    essentia.run(loader_hpcp)

    return p
def analsynthHarmonicModelStreaming(params, signal):
    out = array([0.])
    pool = essentia.Pool()

    # windowing and FFT
    fcut = es.FrameCutter(frameSize=params['frameSize'],
                          hopSize=params['hopSize'],
                          startFromZero=False)
    w = es.Windowing(type="blackmanharris92")
    fft = es.FFT(size=params['frameSize'])
    spec = es.Spectrum(size=params['frameSize'])

    # pitch detection
    pitchDetect = es.PitchYinFFT(frameSize=params['frameSize'],
                                 sampleRate=params['sampleRate'])

    smanal = es.HarmonicModelAnal(
        sampleRate=params['sampleRate'],
        maxnSines=params['maxnSines'],
        magnitudeThreshold=params['magnitudeThreshold'],
        freqDevOffset=params['freqDevOffset'],
        freqDevSlope=params['freqDevSlope'],
        minFrequency=params['minFrequency'],
        maxFrequency=params['maxFrequency'])
    smsyn = es.SineModelSynth(sampleRate=params['sampleRate'],
                              fftSize=params['frameSize'],
                              hopSize=params['hopSize'])
    ifft = es.IFFT(size=params['frameSize'])
    overl = es.OverlapAdd(frameSize=params['frameSize'], hopSize=params['hopSize'])

    # add half a window of zeros to the input signal to reach the same output length
    signal = numpy.append(signal, zeros(params['frameSize'] // 2))
    insignal = VectorInput(signal)

    # analysis
    insignal.data >> fcut.signal
    fcut.frame >> w.frame
    w.frame >> spec.frame
    w.frame >> fft.frame
    spec.spectrum >> pitchDetect.spectrum
    fft.fft >> smanal.fft
    pitchDetect.pitch >> smanal.pitch
    pitchDetect.pitchConfidence >> (pool, 'pitchConfidence')
    smanal.magnitudes >> (pool, 'magnitudes')
    smanal.frequencies >> (pool, 'frequencies')
    smanal.phases >> (pool, 'phases')

    # synthesis
    smanal.magnitudes >> smsyn.magnitudes
    smanal.frequencies >> smsyn.frequencies
    smanal.phases >> smsyn.phases
    smsyn.fft >> ifft.fft
    ifft.frame >> overl.frame
    overl.signal >> (pool, 'audio')

    essentia.run(insignal)

    # remove short tracks
    freqs = pool['frequencies']
    minFrames = int(params['minSineDur'] * params['sampleRate'] / params['hopSize'])
    freqsClean = cleaningSineTracks(freqs, minFrames)
    pool['frequencies'].data = freqsClean

    # remove the first half-window of frames
    outaudio = pool['audio']
    outaudio = outaudio[2 * params['hopSize']:]

    return outaudio, pool
def analyze_mel(filename, segment_duration=None, maxFrequency=11025, replaygain=True):
    lowlevelFrameSize = 2048
    lowlevelHopSize = 1024

    # Compute replay gain and duration on the entire file, then load the
    # segment that is centered in time with replaygain applied
    audio = es.MonoLoader(filename=filename)()

    if replaygain:
        replaygain = es.ReplayGain()(audio)
    else:
        replaygain = -6  # Default replaygain value in EasyLoader

    if segment_duration:
        segment_start = (len(audio) / 44100 - segment_duration) / 2
        segment_end = segment_start + segment_duration
    else:
        segment_start = 0
        segment_end = len(audio) / 44100

    if segment_start < 0 or segment_end > len(audio) / 44100:
        raise ValueError('Segment duration is larger than the input audio duration')

    loader_mel = EasyLoader(filename=filename,
                            replayGain=replaygain,
                            startTime=segment_start,
                            endTime=segment_end)

    # Processing for Mel bands
    framecutter_mel = FrameCutter(frameSize=lowlevelFrameSize, hopSize=lowlevelHopSize)
    window_mel = Windowing(type='blackmanharris62', zeroPadding=lowlevelFrameSize)
    spectrum_mel = Spectrum()

    melbands128 = MelBands(numberBands=128, lowFrequencyBound=0,
                           highFrequencyBound=maxFrequency,
                           inputSize=lowlevelFrameSize + 1)
    melbands96 = MelBands(numberBands=96, lowFrequencyBound=0,
                          highFrequencyBound=maxFrequency,
                          inputSize=lowlevelFrameSize + 1)
    melbands48 = MelBands(numberBands=48, lowFrequencyBound=0,
                          highFrequencyBound=maxFrequency,
                          inputSize=lowlevelFrameSize + 1)
    melbands32 = MelBands(numberBands=32, lowFrequencyBound=0,
                          highFrequencyBound=maxFrequency,
                          inputSize=lowlevelFrameSize + 1)
    melbands24 = MelBands(numberBands=24, lowFrequencyBound=0,
                          highFrequencyBound=maxFrequency,
                          inputSize=lowlevelFrameSize + 1)
    melbands16 = MelBands(numberBands=16, lowFrequencyBound=0,
                          highFrequencyBound=maxFrequency,
                          inputSize=lowlevelFrameSize + 1)
    melbands8 = MelBands(numberBands=8, lowFrequencyBound=0,
                         highFrequencyBound=maxFrequency,
                         inputSize=lowlevelFrameSize + 1)

    # Normalize Mel bands: log10(1+x*10000)
    norm128 = UnaryOperator(type='identity', shift=1, scale=10000)
    log10128 = UnaryOperator(type='log10')
    norm96 = UnaryOperator(type='identity', shift=1, scale=10000)
    log1096 = UnaryOperator(type='log10')
    norm48 = UnaryOperator(type='identity', shift=1, scale=10000)
    log1048 = UnaryOperator(type='log10')
    norm32 = UnaryOperator(type='identity', shift=1, scale=10000)
    log1032 = UnaryOperator(type='log10')
    norm24 = UnaryOperator(type='identity', shift=1, scale=10000)
    log1024 = UnaryOperator(type='log10')
    norm16 = UnaryOperator(type='identity', shift=1, scale=10000)
    log1016 = UnaryOperator(type='log10')
    norm8 = UnaryOperator(type='identity', shift=1, scale=10000)
    log108 = UnaryOperator(type='log10')

    p = essentia.Pool()

    loader_mel.audio >> framecutter_mel.signal
    framecutter_mel.frame >> window_mel.frame >> spectrum_mel.frame
    spectrum_mel.spectrum >> melbands128.spectrum
    spectrum_mel.spectrum >> melbands96.spectrum
    spectrum_mel.spectrum >> melbands48.spectrum
    spectrum_mel.spectrum >> melbands32.spectrum
    spectrum_mel.spectrum >> melbands24.spectrum
    spectrum_mel.spectrum >> melbands16.spectrum
    spectrum_mel.spectrum >> melbands8.spectrum
    melbands128.bands >> norm128.array >> log10128.array >> (p, 'mel128')
    melbands96.bands >> norm96.array >> log1096.array >> (p, 'mel96')
    melbands48.bands >> norm48.array >> log1048.array >> (p, 'mel48')
    melbands32.bands >> norm32.array >> log1032.array >> (p, 'mel32')
    melbands24.bands >> norm24.array >> log1024.array >> (p, 'mel24')
    melbands16.bands >> norm16.array >> log1016.array >> (p, 'mel16')
    melbands8.bands >> norm8.array >> log108.array >> (p, 'mel8')

    essentia.run(loader_mel)

    return p
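# Example call of analyze_mel() above; the filename is an assumption for
# illustration. Each pool key holds a (frames x bands) array of
# log-compressed Mel energies.
mel_pool = analyze_mel('track.wav', segment_duration=20)
mel96 = mel_pool['mel96']  # shape: (n_frames, 96)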
def reComputeDescriptors(inputAudioFile, outputJsonFile):
    """
    :param inputAudioFile:
    :param outputJsonFile:
    :return:
    """
    #help(ess.SpectralContrast)

    """ orig
    M = 1024
    N = 1024
    H = 512
    fs = 44100
    W = 'hann'
    """

    """ freesound
    Real sampleRate = 44100;
    int frameSize =   2048;
    int hopSize =     1024;
    int zeroPadding = 0;

    string silentFrames = "noise";
    string windowType = "blackmanharris62";

    // Silence Rate
    Real thresholds_dB[] = { -20, -30, -60 };

    vector<Real> thresholds(ARRAY_SIZE(thresholds_dB));
    for (uint i=0; i<thresholds.size(); i++) {
        thresholds[i] = db2lin(thresholds_dB[i]/2.0);
    }
    """

    M = 2048
    N = 2048
    H = 1024
    fs = 44100
    W = 'blackmanharris62'
    #silentFrames = "noise"
    #thresholds_dB = np.array([ -20, -30, -60 ])
    #thresholds = np.power(10.0, thresholds_dB / 20)

    #spectrum = ess.Spectrum(size=N)
    spectrum = ess.Spectrum()
    #window = ess.Windowing(size=M, type=W)
    window = ess.Windowing(type=W)
    #mfcc = ess.MFCC(numberCoefficients=12, inputSize=N/2+1)
    mfcc = ess.MFCC()
    spectral_peaks = ess.SpectralPeaks(minFrequency=1,
                                       maxFrequency=20000,
                                       maxPeaks=100,
                                       sampleRate=fs,
                                       magnitudeThreshold=0,
                                       orderBy="magnitude")
    dissonance = ess.Dissonance()
    #pitch_detection = ess.PitchYinFFT(frameSize=M, sampleRate=fs)
    pitch_detection = ess.PitchYinFFT()
    harmonic_peaks = ess.HarmonicPeaks()
    inharmonicity = ess.Inharmonicity()
    #spectral_contrast = ess.SpectralContrast(sampleRate=fs)
    spectral_contrast = ess.SpectralContrast()
    centroid = ess.Centroid()
    log_attack_time = ess.LogAttackTime()
    hfc = ess.HFC()
    energy = ess.Energy()

    x = ess.MonoLoader(filename=inputAudioFile, sampleRate=fs)()
    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True)

    pool = es.Pool()

    for frame in frames:
        mX = spectrum(window(frame))

        mfcc_bands, mfcc_coeffs = mfcc(mX)
        pool.add('lowlevel.mfcc', mfcc_coeffs)
        pool.add('lowlevel.mfcc_bands', mfcc_bands)

        pfreq, pmag = spectral_peaks(mX)
        inds = pfreq.argsort()
        pfreq_sorted = pfreq[inds]
        pmag_sorted = pmag[inds]

        diss = dissonance(pfreq_sorted, pmag_sorted)
        pool.add('lowlevel.dissonance', diss)

        pitch, pitch_confidence = pitch_detection(mX)

        phfreq, phmag = harmonic_peaks(pfreq_sorted, pmag_sorted, pitch)
        if len(phfreq) > 1:
            inharm = inharmonicity(phfreq, phmag)
            pool.add('sfx.inharmonicity', inharm)

        sc_coeffs, sc_valleys = spectral_contrast(mX)
        pool.add('lowlevel.spectral_contrast', sc_coeffs)

        c = centroid(mX)
        pool.add('lowlevel.spectral_centroid', c)

        lat = log_attack_time(frame)
        pool.add('sfx.logattacktime', lat)

        h = hfc(mX)
        pool.add('lowlevel.hfc', h)

    calc_Mean_Var = ess.PoolAggregator(defaultStats=['mean', 'var'])
    aggrPool = calc_Mean_Var(pool)

    features = makeFeatures(aggrPool)
    json.dump(features, open(outputJsonFile, 'w'))
def analsynthHpsModelStreaming(params, signal):
    out = array([0.])
    pool = essentia.Pool()

    # windowing and FFT
    fcut = es.FrameCutter(frameSize=params['frameSize'],
                          hopSize=params['hopSize'],
                          startFromZero=False)
    w = es.Windowing(type="blackmanharris92")
    spec = es.Spectrum(size=params['frameSize'])

    # pitch detection
    pitchDetect = es.PitchYinFFT(frameSize=params['frameSize'],
                                 sampleRate=params['sampleRate'])

    smanal = es.HpsModelAnal(sampleRate=params['sampleRate'],
                             hopSize=params['hopSize'],
                             maxnSines=params['maxnSines'],
                             magnitudeThreshold=params['magnitudeThreshold'],
                             freqDevOffset=params['freqDevOffset'],
                             freqDevSlope=params['freqDevSlope'],
                             minFrequency=params['minFrequency'],
                             maxFrequency=params['maxFrequency'],
                             stocf=params['stocf'])

    # make sure the FFT size is appropriate
    synFFTSize = min(params['frameSize'] // 4, 4 * params['hopSize'])
    smsyn = es.SpsModelSynth(sampleRate=params['sampleRate'],
                             fftSize=synFFTSize,
                             hopSize=params['hopSize'],
                             stocf=params['stocf'])

    # add half a window of zeros to the input signal to reach the same output length
    signal = numpy.append(signal, zeros(params['frameSize'] // 2))
    insignal = VectorInput(signal)

    # analysis
    insignal.data >> fcut.signal
    fcut.frame >> w.frame
    w.frame >> spec.frame
    spec.spectrum >> pitchDetect.spectrum
    fcut.frame >> smanal.frame
    pitchDetect.pitch >> smanal.pitch
    pitchDetect.pitchConfidence >> (pool, 'pitchConfidence')
    pitchDetect.pitch >> (pool, 'pitch')

    # synthesis
    smanal.magnitudes >> smsyn.magnitudes
    smanal.frequencies >> smsyn.frequencies
    smanal.phases >> smsyn.phases
    smanal.stocenv >> smsyn.stocenv
    smsyn.frame >> (pool, 'frames')
    smsyn.sineframe >> (pool, 'sineframes')
    smsyn.stocframe >> (pool, 'stocframes')

    essentia.run(insignal)

    outaudio = framesToAudio(pool['frames'])
    # remove the first half-window of frames
    outaudio = outaudio[2 * params['hopSize']:]

    return outaudio, pool
import json

import numpy as np

import essentia as es
import essentia.standard as ess


def reComputeDescriptors(inputAudioFile, outputJsonFile):
    """
    :param inputAudioFile: path of the audio file to analyse
    :param outputJsonFile: path where the extracted features are written as JSON
    :return: None (features are written to outputJsonFile)
    """
    M = 2048
    N = 2048
    H = 1024
    fs = 44100
    W = 'blackmanharris62'

    #spectrum = ess.Spectrum(size=N)
    spectrum = ess.Spectrum()
    #window = ess.Windowing(size=M, type=W)
    window = ess.Windowing(type=W)
    #mfcc = ess.MFCC(numberCoefficients=12, inputSize=N/2+1)
    mfcc = ess.MFCC()
    spectral_peaks = ess.SpectralPeaks(minFrequency=1,
                                       maxFrequency=20000,
                                       maxPeaks=100,
                                       sampleRate=fs,
                                       magnitudeThreshold=0,
                                       orderBy="magnitude")
    dissonance = ess.Dissonance()
    #pitch_detection = ess.PitchYinFFT(frameSize=M, sampleRate=fs)
    pitch_detection = ess.PitchYinFFT()
    harmonic_peaks = ess.HarmonicPeaks()
    inharmonicity = ess.Inharmonicity()
    #spectral_contrast = ess.SpectralContrast(sampleRate=fs)
    spectral_contrast = ess.SpectralContrast()
    centroid = ess.Centroid()
    log_attack_time = ess.LogAttackTime()
    hfc = ess.HFC()
    # magnitudeThreshold = 0.005 is hardcoded for a "blackmanharris62" frame, see lowlevel.py
    spectral_complexity = ess.SpectralComplexity(magnitudeThreshold=0.005)
    energy = ess.Energy()

    x = ess.MonoLoader(filename=inputAudioFile, sampleRate=fs)()

    # first pass: collect frame energies so we can normalize by the loudest frame
    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True)
    E = []
    numFrames = 0
    for frame in frames:
        numFrames += 1
        E_frame = energy(frame)
        E.append(E_frame)
    E_max = np.max(E)

    # second pass: add descriptors only to the pools whose normalized-energy
    # threshold the current frame exceeds; dscr is an external module (defined
    # elsewhere) providing the thresholds and their labels
    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True)
    pools = [(t, es.Pool()) for t in dscr.threshold]

    for frame in frames:
        eNorm = energy(frame) / E_max
        threshPools = []
        for t, pool in pools:
            if eNorm >= t:
                threshPools.append(pool)

        mX = spectrum(window(frame))
        mfcc_bands, mfcc_coeffs = mfcc(mX)
        [pool.add('lowlevel.mfcc', mfcc_coeffs) for pool in threshPools]
        #[pool.add('lowlevel.mfcc_bands', mfcc_bands) for pool in threshPools]

        pfreq, pmag = spectral_peaks(mX)
        # HarmonicPeaks expects the peaks ordered by frequency
        inds = pfreq.argsort()
        pfreq_sorted = pfreq[inds]
        pmag_sorted = pmag[inds]

        diss = dissonance(pfreq_sorted, pmag_sorted)
        [pool.add('lowlevel.dissonance', diss) for pool in threshPools]

        pitch, pitch_confidence = pitch_detection(mX)
        phfreq, phmag = harmonic_peaks(pfreq_sorted, pmag_sorted, pitch)
        if len(phfreq) > 1:
            inharm = inharmonicity(phfreq, phmag)
            [pool.add('sfx.inharmonicity', inharm) for pool in threshPools]

        sc_coeffs, sc_valleys = spectral_contrast(mX)
        [pool.add('lowlevel.spectral_contrast', sc_coeffs) for pool in threshPools]

        c = centroid(mX)
        [pool.add('lowlevel.spectral_centroid', c) for pool in threshPools]

        lat = log_attack_time(frame)
        [pool.add('sfx.logattacktime', lat) for pool in threshPools]

        h = hfc(mX)
        [pool.add('lowlevel.hfc', h) for pool in threshPools]

        spec_complx = spectral_complexity(mX)
        [pool.add('lowlevel.spectral_complexity', spec_complx) for pool in threshPools]

    #calc_Mean_Var = ess.PoolAggregator(defaultStats=['mean', 'var'])
    calc_Mean_Var = ess.PoolAggregator(defaultStats=['mean'])
    aggrPools = [calc_Mean_Var(pool) for t, pool in pools]

    # appendFeatures is an external helper (defined elsewhere) that merges an
    # aggregated pool into `features` under the given name prefix
    features = {}
    [appendFeatures(features, aggrPools[i], ("ethc" + str(dscr.thresholdSelect[i])))
     for i in range(len(aggrPools))]
    with open(outputJsonFile, 'w') as f:
        json.dump(features, f)
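# Neither `dscr` nor `appendFeatures` is defined in the snippet above. A
# minimal sketch of plausible stand-ins, purely as an assumption to make the
# data flow readable: `dscr` carries the normalized-energy thresholds, and
# `appendFeatures` copies an aggregated pool into the feature dict under a
# prefix. The threshold values below are invented placeholders.

class dscr:  # hypothetical stand-in for the external config module
    threshold = [0.0, 0.01, 0.1]   # normalized-energy thresholds, one pool each
    thresholdSelect = [0, 1, 2]    # labels used to prefix the feature names


def appendFeatures(features, aggrPool, prefix):
    # hypothetical helper: copy every descriptor, namespaced by `prefix`
    for name in aggrPool.descriptorNames():
        value = aggrPool[name]
        if hasattr(value, 'tolist'):
            value = value.tolist()
        features[prefix + '.' + name] = value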
show()

# <demo> --- stop ---

# Introducing the Pool: a good-for-all container
#
# A Pool can contain any type of values (easy in Python, not as much in C++ :-) )
# They need to be given a name, which represents the full path to these values;
# dot '.' characters are used as separators. You can think of it as a directory
# tree, or as namespace(s) + local name.
#
# Examples of valid names are: bpm, lowlevel.mfcc, highlevel.genre.rock.probability, etc...

# So let's redo the previous using a Pool
pool = essentia.Pool()

for frame in FrameGenerator(audio, frameSize=1024, hopSize=512):
    mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
    pool.add('lowlevel.mfcc', mfcc_coeffs)
    pool.add('lowlevel.mfcc_bands', mfcc_bands)

imshow(pool['lowlevel.mfcc'].T[1:, :], aspect='auto')
figure()

# Let's plot the mfcc bands on a log scale so that the energy values will be
# better differentiated by color
from matplotlib.colors import LogNorm
imshow(pool['lowlevel.mfcc_bands'].T, aspect='auto', interpolation='nearest', norm=LogNorm())
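# A small follow-up sketch (not part of the original demo): once descriptors
# are in a Pool you can list them by name, and essentia.standard.PoolAggregator
# reduces each descriptor to summary statistics such as mean and variance.

print(pool.descriptorNames())          # e.g. ['lowlevel.mfcc', 'lowlevel.mfcc_bands']

from essentia.standard import PoolAggregator
aggrPool = PoolAggregator(defaultStats=['mean', 'var'])(pool)
print(aggrPool['lowlevel.mfcc.mean'])  # one averaged MFCC vector for the whole file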
import gc

import numpy as np

import essentia
import essentia.standard as es
import essentia.streaming as ess

fs = 44100  # module-level sample rate; matches the 44100 used by the loaders below


def compute_features(path, f_mfcc_kl, f_mfcc_euclid, f_notes, f_chroma, f_bh):
    gc.enable()

    # Loading audio file
    # will resample if sampleRate is different!
    try:
        audio = es.MonoLoader(filename=path, sampleRate=fs)()
    except Exception:
        print("Erroneous file detected by essentia standard: skipping!")
        #return bpm, histogram, key, scale, notes, chroma_matrix, mean, cov, var, cov_kl
        return 0, [], 0, 0, [], [], [], [], [], []

    # will resample if sampleRate is different!
    try:
        loader = ess.MonoLoader(filename=path, sampleRate=44100)
    except Exception:
        print("Erroneous file detected by essentia streaming: skipping!")
        #return bpm, histogram, key, scale, notes, chroma_matrix, mean, cov, var, cov_kl
        return 0, [], 0, 0, [], [], [], [], [], []

    # Initialize algorithms we will use
    frameSize = 4096  #512
    hopSize = 2048  #256

    #######################################
    # DO FILTERING ONLY FOR MFCC - not with essentia standard
    # below is just an example
    #HP = es.HighPass(cutoffFrequency=128)
    #LP = es.LowPass(cutoffFrequency=4096)
    #lp_f = LP(audio)
    #hp_f = HP(lp_f)
    #audio = hp_f
    #MonoWriter(filename='music/filtered.wav')(filtered_audio)

    HP = ess.HighPass(cutoffFrequency=128)
    LP = ess.LowPass(cutoffFrequency=4096)
    #loader = ess.MonoLoader(filename=path, sampleRate=44100)
    #writer = ess.MonoWriter(filename='music/filtered.wav')
    #frameCutter = FrameCutter(frameSize=1024, hopSize=512)
    #pool = essentia.Pool()

    # Connect streaming algorithms
    #loader.audio >> HP.signal
    #HP.signal >> LP.signal
    #LP.signal >> writer.audio

    # Run streaming network
    #essentia.run(loader)

    bpm = 0
    histogram = 0
    key = 0
    scale = 0
    notes = 0
    chroma_matrix = 0
    mean = 0
    cov = 0
    var = 0
    cov_kl = 0

    #####################################
    # extract mfcc
    #####################################
    if f_mfcc_kl == 1 or f_mfcc_euclid == 1:
        #features, features_frames = es.MusicExtractor(analysisSampleRate=44100, mfccStats=['mean', 'cov'])(path)
        #m, n = features['lowlevel.mfcc.cov'].shape
        #iu1 = np.triu_indices(m)
        #cov = features['lowlevel.mfcc.cov'][iu1]
        #mean = features['lowlevel.mfcc.mean']
        #print(features['lowlevel.mfcc.cov'])
        hamming_window = es.Windowing(type='hamming')
        spectrum = es.Spectrum()  # we just want the magnitude spectrum
        mfcc = es.MFCC(numberCoefficients=13)
        frame_sz = 2048  #512
        hop_sz = 1024  #256
        mfccs = np.array([
            mfcc(spectrum(hamming_window(frame)))[1]
            for frame in es.FrameGenerator(audio, frameSize=frame_sz, hopSize=hop_sz)
        ])
        # Optionally scale the MFCCs so each coefficient dimension has zero
        # mean and unit variance:
        #mfccs = sklearn.preprocessing.scale(mfccs)
        mean = np.mean(mfccs.T, axis=1)
        var = np.var(mfccs.T, axis=1)
        cov = np.cov(mfccs.T)
        cov_kl = cov  #.flatten()
        # keep only the upper-triangular values to shorten the vector
        iu1 = np.triu_indices(13)
        cov = cov[iu1]
        #plt.imshow(mfccs.T, origin='lower', aspect='auto', interpolation='nearest')
        #plt.ylabel('MFCC Coefficient Index')
        #plt.xlabel('Frame Index')
        #plt.colorbar()

    #####################################
    # extract beat features and histogram
    #####################################
    if f_bh == 1 or f_chroma == 1 or f_notes == 1:
        # Compute beat positions and BPM
        rhythm_extractor = es.RhythmExtractor2013(method="multifeature")
        bpm, beats, beats_confidence, _, beats_intervals = rhythm_extractor(audio)
        if f_bh == 1:
            peak1_bpm, peak1_weight, peak1_spread, peak2_bpm, peak2_weight, peak2_spread, histogram = \
                es.BpmHistogramDescriptors()(beats_intervals)
        tempo = bpm
        times = beats
        beats_frames = (beats * fs) / hopSize
        beats_frames = beats_frames.astype(int)

        #fig, ax = plt.subplots()
        #ax.bar(range(len(histogram)), histogram, width=1)
        #ax.set_xlabel('BPM')
        #ax.set_ylabel('Frequency')
        #plt.title("BPM histogram")
        #ax.set_xticks([20 * x + 0.5 for x in range(int(len(histogram) / 20))])
        #ax.set_xticklabels([str(20 * x) for x in range(int(len(histogram) / 20))])
        #plt.show()

        #####################################
        # extract full beat-aligned chroma
        #####################################
        framecutter = ess.FrameCutter(frameSize=frameSize, hopSize=hopSize,
                                      silentFrames='noise')
        windowing = ess.Windowing(type='blackmanharris62')
        spectrum = ess.Spectrum()
        spectralpeaks = ess.SpectralPeaks(orderBy='magnitude',
                                          magnitudeThreshold=0.00001,
                                          minFrequency=20,
                                          maxFrequency=3500,
                                          maxPeaks=60)
        # Use default HPCP parameters for plots; we need higher resolution
        # and custom parameters for better key estimation
        hpcp = ess.HPCP()
        hpcp_key = ess.HPCP(
            size=36,  # we need higher resolution for key estimation
            referenceFrequency=440,  # assume a tuning frequency of 440 Hz
            bandPreset=False,
            minFrequency=20,
            maxFrequency=3500,
            weightType='cosine',
            nonLinear=False,
            windowSize=1.)
        key = ess.Key(
            profileType='edma',  # use the profile for electronic music
            numHarmonics=4,
            pcpSize=36,
            slope=0.6,
            usePolyphony=True,
            useThreeChords=True)

        # Use a pool to store the data
        pool = essentia.Pool()

        # Connect streaming algorithms
        ###################################
        # USE FILTER - comment the next lines in
        loader.audio >> HP.signal
        HP.signal >> LP.signal
        LP.signal >> framecutter.signal
        ###################################
        ###################################
        # NO FILTER - comment the next line in
        #loader.audio >> framecutter.signal
        ###################################
        framecutter.frame >> windowing.frame >> spectrum.frame
        spectrum.spectrum >> spectralpeaks.spectrum
        spectralpeaks.magnitudes >> hpcp.magnitudes
        spectralpeaks.frequencies >> hpcp.frequencies
        spectralpeaks.magnitudes >> hpcp_key.magnitudes
        spectralpeaks.frequencies >> hpcp_key.frequencies
        hpcp_key.hpcp >> key.pcp
        hpcp.hpcp >> (pool, 'tonal.hpcp')
        key.key >> (pool, 'tonal.key_key')
        key.scale >> (pool, 'tonal.key_scale')
        key.strength >> (pool, 'tonal.key_strength')

        # Run streaming network
        essentia.run(loader)

        #print("Estimated key and scale:", pool['tonal.key_key'] + " " + pool['tonal.key_scale'])
        #print(pool['tonal.hpcp'].T)
        chroma = pool['tonal.hpcp'].T
        key = pool['tonal.key_key']
        scale = pool['tonal.key_scale']

        if f_chroma == 1:
            # Plot HPCP
            #imshow(pool['tonal.hpcp'].T, aspect='auto', origin='lower', interpolation='none')
            #plt.title("HPCPs in frames (the 0-th HPCP coefficient corresponds to A)")
            #show()
            #print beats_frames.shape[0]
            chroma_matrix = np.zeros((beats_frames.shape[0], 12))
            prev_beat = 0
            act_beat = 0
            sum_key = np.zeros(12)
            chroma_align = chroma
            # note: transpose() returns a view, so the in-place edits below
            # also modify `chroma` itself
            chroma_align = chroma_align.transpose()
            mat_index = 0
            for i in beats_frames:
                act_beat = i
                # average the chroma over the frames between consecutive beats
                value = sum(chroma_align[prev_beat:act_beat]) / (act_beat - prev_beat)
                chroma_align[prev_beat:act_beat] = value
                prev_beat = i
                if np.linalg.norm(value, ord=1) != 0:
                    value = value / np.linalg.norm(value, ord=1)
                chroma_matrix[mat_index] = value
                mat_index = mat_index + 1
            #chroma_align = chroma_align.transpose()
            #plt.figure(figsize=(10, 4))
            #librosa.display.specshow(chroma_align, y_axis='chroma', x_axis='time')
            #plt.vlines(times, 0, 12, alpha=0.5, color='r', linestyle='--', label='Beats')
            #plt.colorbar()
            #plt.title('Chromagram')
            #plt.tight_layout()
            #chroma_align = chroma_align.transpose()
            #print(chroma_align[24:28])

        #####################################
        # extract full chroma text
        #####################################
        if f_notes == 1:
            #print(chroma.shape)
            avg = 0
            chroma = chroma.transpose()
            m, n = chroma.shape
            for j in chroma:
                avg = avg + np.sum(j)
            avg = avg / m
            threshold = avg / 2
            # keep only the strongest pitch classes in each frame
            for i in chroma:
                if np.sum(i) > threshold:
                    ind = np.where(i == np.max(i))
                    max_val = i[ind]  # is always 1!
                    i[ind] = 0
                    ind2 = np.where(i == np.max(i))
                    i[ind] = 1
                    #if np.any(i[ind2][0] >= 0.8 * max_val):
                    #    i[ind2] = i[ind2]
                    #low_values_flags = i < 1
                    low_values_flags = i < 0.8
                    i[low_values_flags] = 0
                else:
                    i.fill(0)
            chroma = chroma.transpose()

            # quantize each beat segment to its strongest pitch class
            prev_beat = 0
            act_beat = 0
            sum_key = np.zeros(12)
            chroma = chroma.transpose()
            for i in beats_frames:
                act_beat = i
                sum_key = sum(chroma[prev_beat:act_beat])
                ind = np.where(sum_key == np.max(sum_key))
                ind = ind[0]
                fill = np.zeros(12)  # one slot per pitch class
                if np.all(chroma[prev_beat:act_beat] == 0):
                    fill[ind] = 0
                else:
                    fill[ind] = 1
                chroma[prev_beat:act_beat] = fill
                prev_beat = i

            # read off one note index per beat
            notes = []
            prev_beat = 0
            act_beat = 0
            for i in beats_frames:
                act_beat = i
                sum_key = sum(chroma[prev_beat:act_beat])
                ind = np.where(sum_key == np.max(sum_key))
                notes.append(ind[0][0])
                prev_beat = i
            #chroma = chroma.transpose()
            #plt.figure(figsize=(10, 4))
            #librosa.display.specshow(chroma, y_axis='chroma', x_axis='time')
            #plt.vlines(times, 0, 12, alpha=0.5, color='r', linestyle='--', label='Beats')
            #plt.colorbar()
            #plt.title('Chromagram')
            #plt.tight_layout()

    gc.collect()
    return bpm, histogram, key, scale, notes, chroma_matrix, mean, cov, var, cov_kl
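# A usage sketch for compute_features; the path is a placeholder and the flag
# values are illustrative (1 enables a feature group, 0 disables it):

bpm, histogram, key, scale, notes, chroma_matrix, mean, cov, var, cov_kl = \
    compute_features('/path/to/track.mp3',
                     f_mfcc_kl=1,      # MFCC mean/covariance for a KL-based distance
                     f_mfcc_euclid=0,  # MFCC stats for a Euclidean distance
                     f_notes=1,        # beat-quantized note sequence
                     f_chroma=1,       # beat-aligned chroma matrix
                     f_bh=1)           # BPM histogram descriptors

print("BPM:", bpm, "- estimated key:", key, scale)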