def get_onsets(self, _audio=None):
    """Detect onsets in an audio signal.

    Args:
        _audio: Samples to analyse. When omitted (``None``) or empty,
            ``self.audio`` is analysed instead.

    Returns:
        The output of ``es.Onsets`` for the single detection function
        computed with ``self.onsetMethod``.
    """
    # Comparing a numpy array with ``[]`` is element-wise (and may raise), so
    # "nothing supplied" is tested explicitly instead of with ``_audio != []``.
    if _audio is None or len(_audio) == 0:
        audio = self.audio
    else:
        audio = _audio

    window = es.Windowing(type=self.winType)
    to_polar = es.CartesianToPolar()
    fft = es.FFT()
    # Configured once here; the per-frame ``configure(method=...)`` call with
    # the very same method was loop-invariant and has been dropped.
    onset_detection = es.OnsetDetection(method=self.onsetMethod,
                                        sampleRate=44100)
    onsets = es.Onsets(alpha=.2)

    pool = Pool()
    for frame in es.FrameGenerator(audio, frameSize=1024, hopSize=512):
        mag, phase = to_polar(fft(window(frame)))
        pool.add("onsetFunction", onset_detection(mag, phase))

    # A single detection function with weight 1.
    return onsets([pool["onsetFunction"]], [1])
def get_key(file_in):
    """Estimate the key and scale of an audio file.

    A streaming network (loader -> frame cutter -> windowing -> spectrum ->
    spectral peaks -> HPCP -> key) is built and run on ``file_in``.  The
    estimated key, scale and strength are pooled; key and scale are returned
    wrapped in a ``Key``.
    """
    source = streaming.MonoLoader(filename=file_in)
    cutter = streaming.FrameCutter()
    window = streaming.Windowing(type="blackmanharris62")
    spec = streaming.Spectrum()
    peaks = streaming.SpectralPeaks(
        orderBy="magnitude",
        magnitudeThreshold=1e-05,
        minFrequency=40,
        maxFrequency=5000,
        maxPeaks=10000,
    )
    results = Pool()
    chroma = streaming.HPCP()
    key_estimator = streaming.Key()

    # Wire the network.
    source.audio >> cutter.signal
    cutter.frame >> window.frame >> spec.frame
    spec.spectrum >> peaks.spectrum
    peaks.magnitudes >> chroma.magnitudes
    peaks.frequencies >> chroma.frequencies
    chroma.hpcp >> key_estimator.pcp
    key_estimator.key >> (results, 'tonal.key_key')
    key_estimator.scale >> (results, 'tonal.key_scale')
    key_estimator.strength >> (results, 'tonal.key_strength')

    run(source)

    estimated_key = results['tonal.key_key']
    estimated_scale = results['tonal.key_scale']
    return Key(estimated_key, estimated_scale)
def compute_harmonic_magnitudes(contour_f0s, fftgram, idx_start, options):
    '''
    Run harmonic-model analysis (30 harmonics) on every frame of a contour.

    Params:
    --------------------
    contour_f0s - one f0 value per contour frame
    fftgram - indexable sequence of FFT frames; frame ``idx_start + i`` is
              analysed together with ``contour_f0s[i]``
    idx_start - index in ``fftgram`` of the first contour frame
    options - not used by this function

    Returns:
    --------------------
    The pooled harmonic frequencies, magnitudes and phases, in that order.

    The process is terminated through ``sys.exit`` when the contour runs past
    the end of ``fftgram``.
    '''
    harmonic_analysis = HarmonicModelAnal(nHarmonics=30)

    # TODO: sanity check: times == len(fftgram) and contour_start_time_SAL in times
    results = Pool()
    for offset, f0 in enumerate(contour_f0s):
        frame_index = idx_start + offset
        if frame_index > len(fftgram) - 1:
            sys.exit('idx start is {} while len ffmtgram is {}'.format(
                idx_start, len(fftgram)))

        hfreq, magn, phase = harmonic_analysis(fftgram[frame_index], f0)
        results.add('phases', phase)
        results.add('hfreqs', hfreq)
        results.add('magns', magn)

    return results['hfreqs'], results['magns'], results['phases']
def estimate_main_band(infile):
    """Label an audio file by its strongest frequency band.

    The spectrum of every frame is split into three bands with edges at
    0, 250, 750 and 4000; the band energies are summed over all frames and
    the band with the largest total decides the label.

    Returns:
        'low', 'mid' or 'high' for band index 0, 1 or 2 (``None`` for any
        other index).
    """
    source = streaming.MonoLoader(filename=infile)
    cutter = streaming.FrameCutter()
    window = streaming.Windowing(type="blackmanharris62")
    spec = streaming.Spectrum()
    bands = streaming.FrequencyBands(frequencyBands=[0, 250, 750, 4000])
    results = Pool()

    source.audio >> cutter.signal
    cutter.frame >> window.frame >> spec.frame
    spec.spectrum >> bands.spectrum
    bands.bands >> (results, 'bands')

    run(source)

    totals = np.sum(results['bands'], axis=0)
    strongest = int(np.argmax(totals))
    labels = {0: 'low', 1: 'mid', 2: 'high'}
    return labels.get(strongest)
def get_embeddings(melspecs: dict[str, np.ndarray], architectures: dict, predictors: dict) -> Optional[dict]:
    """Run every dataset/architecture predictor and collect layer outputs.

    Args:
        melspecs: Input features keyed by the value of each architecture's
            ``'essentia-algorithm'`` entry.
        architectures: Per-architecture metadata with the keys
            ``'essentia-algorithm'``, ``'datasets'`` and ``'layers'``.
        predictors: Callables keyed by ``'<dataset>-<architecture>'`` that
            take an input pool and return an output pool.

    Returns:
        A dict keyed by ``'<dataset>-<architecture>-<layer>'`` whose values
        are arrays with at least two dimensions, or ``None`` as soon as a
        predictor raises ``RuntimeError`` or a layer output is empty.
    """
    data = {}
    for architecture, metadata in architectures.items():
        features = melspecs[metadata['essentia-algorithm']]
        input_pool = Pool()
        input_pool.set('model/Placeholder', features)

        for dataset in metadata['datasets']:
            predictor = predictors[f'{dataset}-{architecture}']
            # TODO: chunk the input melspecs to avoid OOM error
            try:
                output_pool = predictor(input_pool)
            except RuntimeError:
                return None

            for layer, layer_data in metadata['layers'].items():
                embeddings = output_pool[layer_data['name']].squeeze()
                if len(embeddings) == 0:
                    return None
                if embeddings.ndim == 1:
                    # A single embedding vector: restore the leading axis.
                    embeddings = np.expand_dims(embeddings, axis=0)
                data[f'{dataset}-{architecture}-{layer}'] = embeddings

    return data
def pca(pool, namespace=''):
    """Replace raw spectral-contrast frames in ``pool`` with PCA statistics.

    Reads ``lowlevel.sccoeffs`` and ``lowlevel.scvalleys`` (optionally
    prefixed by ``namespace``), interleaves them frame by frame, runs
    ``standard.PCA`` on the interleaved frames and stores the mean and
    variance (over frames) of the transformed data under
    ``lowlevel.spectral_contrast.mean`` / ``.var``.  The two raw descriptors
    are then removed from ``pool``.

    Args:
        pool: Pool holding the descriptors; it is modified in place.
        namespace: Optional prefix placed in front of ``lowlevel.``.
    """
    llspace = namespace + '.lowlevel.' if namespace else 'lowlevel.'

    sccoeffs = pool[llspace + 'sccoeffs']
    scvalleys = pool[llspace + 'scvalleys']

    poolSc = Pool()
    merged = essentia.zeros(2 * len(sccoeffs[0]))

    # ``range`` (not the Python-2-only ``xrange``) keeps this runnable on
    # Python 3 as well.
    for frame in range(len(sccoeffs)):
        coeffs = sccoeffs[frame]
        count = len(coeffs)
        # Interleave as coeff0, valley0, coeff1, valley1, ...
        merged[0:2 * count:2] = coeffs
        merged[1:2 * count:2] = scvalleys[frame][:count]
        poolSc.add('contrast', merged)

    poolTransformed = standard.PCA(namespaceIn='contrast',
                                   namespaceOut='contrast')(poolSc)
    contrast = poolTransformed['contrast']

    pool.set(llspace + 'spectral_contrast.mean', mean(contrast, axis=0))
    pool.set(llspace + 'spectral_contrast.var', var(contrast, axis=0))

    pool.remove(llspace + 'sccoeffs')
    pool.remove(llspace + 'scvalleys')
def computeBpmHistogram(self, noveltyCurve, frameSize=4, overlap=2, frameRate=44100. / 128., window='hann', zeroPadding=0, constantTempo=False, minBpm=30):
    """Run the streaming ``BpmHistogram`` algorithm on a novelty curve.

    Args:
        noveltyCurve: Novelty curve fed to the algorithm's ``novelty`` input.
        frameSize, overlap, frameRate, zeroPadding, constantTempo, minBpm:
            Forwarded unchanged to ``BpmHistogram``.
        window: Forwarded as ``windowType``.  It used to be ignored in favour
            of a hard-coded ``'hann'``; the default keeps that behaviour.

    Returns:
        A pool with the keys ``bpm``, ``bpmCandidates``, ``bpmMagnitudes``,
        ``tempogram``, ``ticks``, ``ticksMagnitude`` and ``sinusoid``.
        ``frameBpms`` is discarded.
    """
    pool = Pool()
    bpmHist = ess.BpmHistogram(frameRate=frameRate,
                               frameSize=frameSize,
                               overlap=overlap,
                               zeroPadding=zeroPadding,
                               constantTempo=constantTempo,
                               windowType=window,
                               minBpm=minBpm)

    gen = ess.VectorInput(noveltyCurve)
    gen.data >> bpmHist.novelty

    bpmHist.bpm >> (pool, 'bpm')
    bpmHist.bpmCandidates >> (pool, 'bpmCandidates')
    bpmHist.bpmMagnitudes >> (pool, 'bpmMagnitudes')
    bpmHist.frameBpms >> None  # (pool, 'frameBpms')
    bpmHist.tempogram >> (pool, 'tempogram')
    bpmHist.ticks >> (pool, 'ticks')
    bpmHist.ticksMagnitude >> (pool, 'ticksMagnitude')
    bpmHist.sinusoid >> (pool, 'sinusoid')

    essentia.run(gen)
    return pool
def get_bpm(file_in):
    """Estimate the tempo of an audio file.

    The file is streamed through ``RhythmExtractor2013``; its BPM intervals
    additionally feed ``BpmHistogramDescriptors`` and a centroid of the
    resulting histogram.  Several descriptors are pooled, but only the
    pooled ``'bpm'`` value is returned.
    """
    results = Pool()

    source = streaming.MonoLoader(filename=file_in)
    rhythm = streaming.RhythmExtractor2013()
    histogram = streaming.BpmHistogramDescriptors()
    # BPM histogram output size is 250
    histogram_centroid = streaming.Centroid(range=250)

    source.audio >> rhythm.signal

    # Rhythm extractor outputs.
    rhythm.bpm >> (results, 'bpm')
    rhythm.ticks >> None
    rhythm.confidence >> (results, 'confidence')
    rhythm.estimates >> None
    rhythm.bpmIntervals >> histogram.bpmIntervals

    # Histogram descriptor outputs.
    histogram.firstPeakBPM >> (results, 'bpm_first_peak')
    histogram.firstPeakWeight >> None
    histogram.firstPeakSpread >> None
    histogram.secondPeakBPM >> (results, 'bpm_second_peak')
    histogram.secondPeakWeight >> None
    histogram.secondPeakSpread >> None
    histogram.histogram >> (results, 'bpm_histogram')
    histogram.histogram >> histogram_centroid.array
    histogram_centroid.centroid >> (results, 'bpm_centroid')

    run(source)

    return results['bpm']
def _extract_pitch_contours(self, audio):
    """Track pitch contours in ``audio``.

    Each frame goes through windowing, spectrum, spectral peaks, the pitch
    salience function and its peak picker; the per-frame salience peaks are
    then handed to ``PitchContours``.

    Returns:
        ``(contours_bins, contours_start_times, contour_saliences, duration)``
        as produced by ``PitchContours`` (start times and saliences swapped
        relative to the algorithm's output order).
    """
    bins_key = 'allframes_salience_peaks_bins'
    saliences_key = 'allframes_salience_peaks_contourSaliences'

    # Each frame is zero-padded with 3 * frame_size samples, so the spectrum
    # is taken over frame_size * 4 samples.
    windowing = estd.Windowing(  # pylint: disable-msg=E1101
        zeroPadding=3 * self.frame_size)
    spectrum_algo = estd.Spectrum(  # pylint: disable-msg=E1101
        size=self.frame_size * 4)
    spectral_peaks = estd.SpectralPeaks(  # pylint: disable-msg=E1101
        minFrequency=self.min_frequency,
        maxFrequency=self.max_frequency,
        magnitudeThreshold=self.magnitude_threshold,
        sampleRate=self.sample_rate,
        orderBy='magnitude')

    # convert unit to cents, PitchSalienceFunction takes 55 Hz as the
    # default reference
    salience_function = estd.PitchSalienceFunction(  # pylint: disable-msg=E1101
        binResolution=self.bin_resolution)
    salience_function_peaks = estd.PitchSalienceFunctionPeaks(  # pylint: disable-msg=E1101
        binResolution=self.bin_resolution,
        minFrequency=self.min_frequency,
        maxFrequency=self.max_frequency)
    pitch_contours = estd.PitchContours(  # pylint: disable-msg=E1101
        hopSize=self.hop_size,
        binResolution=self.bin_resolution,
        peakDistributionThreshold=self.peak_distribution_threshold)

    # compute frame by frame
    frame_pool = Pool()
    frames = estd.FrameGenerator(  # pylint: disable-msg=E1101
        audio, frameSize=self.frame_size, hopSize=self.hop_size)
    for frame in frames:
        spectrum = spectrum_algo(windowing(frame))
        peak_frequencies, peak_magnitudes = spectral_peaks(spectrum)
        salience = salience_function(peak_frequencies, peak_magnitudes)
        peak_bins, peak_saliences = salience_function_peaks(salience)

        # A frame without peaks still contributes one entry: a single zero.
        if np.size(peak_bins) == 0:
            peak_bins = np.array([0])
        if np.size(peak_saliences) == 0:
            peak_saliences = np.array([0])

        frame_pool.add(bins_key, peak_bins)
        frame_pool.add(saliences_key, peak_saliences)

    # post-processing: contour tracking
    all_bins = [f.tolist() for f in frame_pool[bins_key]]
    all_saliences = [f.tolist() for f in frame_pool[saliences_key]]
    contours_bins, contour_saliences, contours_start_times, duration = \
        pitch_contours(all_bins, all_saliences)

    return contours_bins, contours_start_times, contour_saliences, duration
def compute_pitch_yin(audio):
    """Return the per-frame first output of ``PitchYin`` for ``audio``.

    Frames are cut using the ``frame_size`` and ``hop_size`` names defined
    outside this function, starting from sample zero.
    """
    estimator = PitchYin()
    results = Pool()
    frames = FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                            startFromZero=True)
    for frame in frames:
        estimate = estimator(frame)
        results.add('pitch_yin', estimate[0])
    return results['pitch_yin']
def compute_rms(audio):
    """Return the per-frame ``RMS`` of ``audio``.

    Frames are cut using the ``frame_size`` and ``hop_size`` names defined
    outside this function, starting from sample zero.
    """
    rms_algo = RMS()
    results = Pool()
    frames = FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                            startFromZero=True)
    for frame in frames:
        value = rms_algo(frame)
        results.add('rms', value)
    return results['rms']
def compute_energy(audio):
    """Return the per-frame ``Energy`` of ``audio``.

    Frames are cut using the ``frame_size`` and ``hop_size`` names defined
    outside this function, starting from sample zero.

    Returns:
        The pooled ``'energy'`` values, one per frame.  (The function used to
        return the literal string ``'True'`` and throw the computed values
        away, unlike the sibling ``compute_*`` helpers.)
    """
    energy = Energy()
    p = Pool()
    for frame in FrameGenerator(audio, frameSize=frame_size,
                                hopSize=hop_size, startFromZero=True):
        p.add('energy', energy(frame))
    return p['energy']
def compute_zcr(audio):
    """Return the per-frame ``ZeroCrossingRate`` of ``audio``.

    Frames are cut using the ``frame_size`` and ``hop_size`` names defined
    outside this function, starting from sample zero.
    """
    zcr_algo = ZeroCrossingRate()
    results = Pool()
    frames = FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                            startFromZero=True)
    for frame in frames:
        value = zcr_algo(frame)
        results.add('zcr', value)
    return results['zcr']
def compute_power_spectrum(audio):
    """Return the ``PowerSpectrum`` of every Hann-windowed frame of ``audio``.

    Frames are cut using the ``frame_size`` and ``hop_size`` names defined
    outside this function, starting from sample zero.
    """
    window = Windowing(type='hann')
    power_spectrum_algo = PowerSpectrum()
    results = Pool()
    frames = FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                            startFromZero=True)
    for frame in frames:
        windowed = window(frame)
        results.add('power_spectrum', power_spectrum_algo(windowed))
    return results['power_spectrum']
def compute_spectral_centroid(audio):
    """Return the per-frame ``Centroid`` of the spectrum of ``audio``.

    Each frame is Hann-windowed before the spectrum is taken.  Frames are cut
    using the ``frame_size`` and ``hop_size`` names defined outside this
    function, starting from sample zero.
    """
    window = Windowing(type='hann')
    spectrum_algo = Spectrum()
    centroid_algo = Centroid()
    results = Pool()
    frames = FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                            startFromZero=True)
    for frame in frames:
        magnitudes = spectrum_algo(window(frame))
        results.add('spectral_centroid', centroid_algo(magnitudes))
    return results['spectral_centroid']
def compute_bark(audio):
    """Return the per-frame ``BarkBands`` output for ``audio``.

    Each frame is Hann-windowed before the spectrum is taken.  Frames are cut
    using the ``frame_size`` and ``hop_size`` names defined outside this
    function, starting from sample zero.
    """
    window = Windowing(type='hann')
    spectrum_algo = Spectrum()
    bark_algo = BarkBands()
    results = Pool()
    frames = FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                            startFromZero=True)
    for frame in frames:
        magnitudes = spectrum_algo(window(frame))
        results.add('bark', bark_algo(magnitudes))
    return results['bark']
def compute_mel(audio):
    """Return the per-frame 96-band ``MelBands`` output for ``audio``.

    Each frame is Hann-windowed before the spectrum is taken.  Frames are cut
    using the ``frame_size`` and ``hop_size`` names defined outside this
    function, starting from sample zero.
    """
    window = Windowing(type='hann')
    spectrum_algo = Spectrum()
    mel_algo = MelBands(numberBands=96)
    results = Pool()
    frames = FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                            startFromZero=True)
    for frame in frames:
        magnitudes = spectrum_algo(window(frame))
        results.add('mel', mel_algo(magnitudes))
    return results['mel']
def compute_spectral_rolloff(audio):
    """Return the per-frame ``RollOff`` of the spectrum of ``audio``.

    Each frame is Hann-windowed before the spectrum is taken.  Frames are cut
    using the ``frame_size`` and ``hop_size`` names defined outside this
    function, starting from sample zero.
    """
    window = Windowing(type='hann')
    spectrum_algo = Spectrum()
    rolloff_algo = RollOff()
    results = Pool()
    frames = FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                            startFromZero=True)
    for frame in frames:
        magnitudes = spectrum_algo(window(frame))
        results.add('spectral_rolloff', rolloff_algo(magnitudes))
    return results['spectral_rolloff']
def compute_spectral_flatness(audio):
    """Return the per-frame ``Flatness`` of the spectrum of ``audio``.

    Each frame is Hann-windowed before the spectrum is taken.  Frames are cut
    using the ``frame_size`` and ``hop_size`` names defined outside this
    function, starting from sample zero.
    """
    window = Windowing(type='hann')
    spectrum_algo = Spectrum()
    flatness_algo = Flatness()
    results = Pool()
    frames = FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                            startFromZero=True)
    for frame in frames:
        magnitudes = spectrum_algo(window(frame))
        results.add('spectral_flatness', flatness_algo(magnitudes))
    return results['spectral_flatness']
def compute_pitch_yinfft(audio):
    """Return the per-frame first output of ``PitchYinFFT`` for ``audio``.

    Each frame is Hann-windowed and converted to a spectrum before the pitch
    estimator runs.  Frames are cut using the ``frame_size`` and ``hop_size``
    names defined outside this function, starting from sample zero.
    """
    window = Windowing(type='hann')
    spectrum_algo = Spectrum()
    estimator = PitchYinFFT()
    results = Pool()
    frames = FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                            startFromZero=True)
    for frame in frames:
        estimate = estimator(spectrum_algo(window(frame)))
        results.add('pitch_yinfft', estimate[0])
    return results['pitch_yinfft']
def estimate_danceability(infile):
    """Return the pooled ``Danceability`` output for an audio file.

    The file is loaded with a streaming ``MonoLoader`` whose audio is fed
    straight into the streaming ``Danceability`` algorithm.
    """
    source = streaming.MonoLoader(filename=infile)
    estimator = streaming.Danceability()
    results = Pool()

    source.audio >> estimator.signal
    estimator.danceability >> (results, 'danceability')

    run(source)

    return results['danceability']
def __init__(self, arch):
    """Set up feature-extraction settings for the given architecture.

    Args:
        arch: ``'musicnn'`` or ``'vggish'``.  For any other value no feature
            extractor or frame/hop/patch/band sizes are set.
    """
    self.architechture = arch
    self.in_layer = None
    self.out_layer = None

    # Sizes are listed as (frame_size, hop_size, patch_size, num_bands).
    if arch == 'musicnn':
        self.feature_extractor = es.TensorflowInputMusiCNN()
        (self.frame_size, self.hop_size,
         self.patch_size, self.num_bands) = (512, 256, 187, 96)
    elif arch == 'vggish':
        self.feature_extractor = es.TensorflowInputVGGish()
        (self.frame_size, self.hop_size,
         self.patch_size, self.num_bands) = (400, 200, 96, 64)

    self.feature_frames = []
    self.in_pool = Pool()
    self.out_pool = Pool()

    # setup model
    self.predict = None
def compute_mfcc(audio):
    """Return the per-frame MFCC coefficients of ``audio``.

    Each frame is Hann-windowed and converted to a spectrum; only the second
    output of ``MFCC`` (the coefficients) is kept.  Frames are cut using the
    ``frame_size`` and ``hop_size`` names defined outside this function,
    starting from sample zero.
    """
    window = Windowing(type='hann')
    spectrum_algo = Spectrum()
    mfcc_algo = MFCC()
    results = Pool()
    frames = FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                            startFromZero=True)
    for frame in frames:
        magnitudes = spectrum_algo(window(frame))
        _bands, coefficients = mfcc_algo(magnitudes)
        results.add('mfcc', coefficients)
    return results['mfcc']
def compute_spectral_shape(audio):
    """Return per-frame spread, skewness and kurtosis of the spectrum.

    Each frame is Hann-windowed and converted to a spectrum, whose
    ``CentralMoments`` are passed to ``DistributionShape``.  Frames are cut
    using the ``frame_size`` and ``hop_size`` names defined outside this
    function, starting from sample zero.

    Returns:
        ``(spectral_spread, spectral_skewness, spectral_kurtosis)``, each
        pooled over all frames.
    """
    window = Windowing(type='hann')
    spectrum_algo = Spectrum()
    central_moments = CentralMoments()
    distribution_shape = DistributionShape()
    results = Pool()
    frames = FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                            startFromZero=True)
    for frame in frames:
        magnitudes = spectrum_algo(window(frame))
        moments = central_moments(magnitudes)
        spread, skewness, kurtosis = distribution_shape(moments)
        results.add('spectral_spread', spread)
        results.add('spectral_skewness', skewness)
        results.add('spectral_kurtosis', kurtosis)
    return (results['spectral_spread'],
            results['spectral_skewness'],
            results['spectral_kurtosis'])
def chromaprint(self, analysisTime=30):
    """
    Compute the fingerprint of ``self.audio_vector`` with the Chromaprint
    algorithm (a wrapper of the Chromaprint library).

    Args:
        analysisTime: Forwarded to ``Chromaprinter`` as ``analysisTime``.

    Returns:
        The chromaprints are returned as base64-encoded strings.
    """
    results = Pool()
    source = ess.VectorInput(self.audio_vector)
    fingerprinter = ess.Chromaprinter(analysisTime=analysisTime,
                                      sampleRate=self.fs)

    source.data >> fingerprinter.signal
    fingerprinter.fingerprint >> (results, 'chromaprint')

    run(source)

    return results['chromaprint']
def compute_hpcp(audio):
    """Return the per-frame ``HPCP`` of ``audio``.

    Each frame is Hann-windowed and converted to a spectrum; the outputs of
    ``SpectralPeaks`` are passed, in order, to ``HPCP``.  Frames are cut using
    the ``tonal_frame_size`` and ``tonal_hop_size`` names defined outside
    this function, starting from sample zero.
    """
    window = Windowing(type='hann')
    spectrum_algo = Spectrum()
    peak_picker = SpectralPeaks(
        orderBy='magnitude',
        magnitudeThreshold=0.00001,
        minFrequency=20,
        maxFrequency=3500,
        maxPeaks=60,
    )
    hpcp_algo = HPCP()
    results = Pool()
    frames = FrameGenerator(audio, frameSize=tonal_frame_size,
                            hopSize=tonal_hop_size, startFromZero=True)
    for frame in frames:
        magnitudes = spectrum_algo(window(frame))
        peak_outputs = peak_picker(magnitudes)
        results.add('hpcp', hpcp_algo(*peak_outputs))
    return results['hpcp']
def __mfccs__(audio):
    """Pool MFCC coefficients and bands for every frame of ``audio``.

    Frames of 1024 samples with a hop of 512 are Hann-windowed and converted
    to a spectrum before ``MFCC`` runs.

    Returns:
        A pool with ``'mfcc'`` (the coefficients without their first element)
        and ``'mfcc_bands'``.
    """
    window = Windowing(type='hann')
    # FFT() would return the complex FFT, here we just want the magnitude spectrum
    spectrum_algo = Spectrum()
    mfcc_algo = MFCC()
    results = Pool()

    # NOTES -> one second of a wav file is roughly 90 frames, and the
    # intensity is given in Hz
    frames = FrameGenerator(audio, frameSize=1024, hopSize=512,
                            startFromZero=True)
    for frame in frames:
        magnitudes = spectrum_algo(window(frame))
        bands, coefficients = mfcc_algo(magnitudes)
        results.add('mfcc', coefficients[1:])
        results.add('mfcc_bands', bands)

    return results
def computeOnsets(inFile, outFile):
    """Detect onsets in an audio file and save their times as text.

    Args:
        inFile: Path of the audio file, loaded with ``MonoLoader``.
        outFile: Destination passed to ``np.savetxt``; one onset time per
            line, formatted with ``'%f'``.
    """
    # don't forget, we can actually instantiate and call an algorithm on the same line!
    # print() with a single argument prints the same text on Python 2 and 3;
    # the former print statement was a syntax error on Python 3.
    print('Loading audio file...')
    audio = MonoLoader(filename=inFile)()

    pool = Pool()

    onsetDetectionGlobal = OnsetDetectionGlobal()
    onsetDetections = onsetDetectionGlobal(audio)
    pool.add('features.onsetDetections', onsetDetections)

    # A single detection function with weight 1.
    onsets = Onsets()
    onsetTimes = onsets(array([onsetDetections]), [1])
    pool.add('features.onsets', onsetTimes)

    np.savetxt(outFile, pool['features.onsets'][0], fmt='%f')
def extractMFCCs(audio):
    '''
    Extract per-frame MFCC coefficients from an audio signal.

    Frame and hop sizes are derived from the ``frameSize_block`` and
    ``hopSize_block`` names defined outside this function, multiplied by
    44100; ``num_mfccs``, ``numberBands`` and ``highFrequencyBound``
    (also defined outside) configure ``MFCC``.

    Args:
        audio: Samples; converted with ``essentia.array`` before framing.

    Returns:
        The pooled ``'mfcc'`` coefficients, one entry per frame.
    '''
    # maybe set highFrequencyBound=22100
    frameSizeInSamples = int(round(44100 * frameSize_block))
    hopSizeInSamples = int(round(44100 * hopSize_block))

    # Floor division keeps this an int on Python 3 as well (``/`` would
    # produce a float there); the result is unchanged on Python 2.
    inputSpectrumSize = frameSizeInSamples // 2 + 1

    mfcc = MFCC(numberCoefficients=num_mfccs,
                numberBands=numberBands,
                highFrequencyBound=highFrequencyBound,
                inputSize=inputSpectrumSize)
    w = Windowing(type='hann')
    spectrum = Spectrum()

    pool = Pool()
    audio = essentia.array(audio)
    for frame in FrameGenerator(audio, frameSize=frameSizeInSamples,
                                hopSize=hopSizeInSamples):
        _mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
        pool.add('mfcc', mfcc_coeffs)

    return pool['mfcc']
def SliceDrums_BeatDetection(folder, audio_filename, fs):
    """Slice an audio file at its HFC onsets and write each slice to disk.

    The file ``folder + audio_filename`` is loaded at ``fs`` Hz and peak
    normalised.  Onsets are detected with the 'hfc' detection function, the
    waveform and onsets are drawn on a new matplotlib figure, and every
    segment between consecutive onsets (starting 1000 samples before each
    onset) is written to ``<folder>slices/<digits><index>__blind.wav``.

    Args:
        folder: Directory prefix, concatenated directly with the file names.
        audio_filename: Name of the audio file inside ``folder``.
        fs: Sample rate used for loading, onset timing and writing.

    Returns:
        ``(onsets_list, duration)``: the onset times returned by ``Onsets``
        and the length of the audio in seconds.
    """
    frame_size = 1024
    hop_size = 512
    pre_roll = 1000  # samples kept in front of every onset

    # Pass the real sample rate / frame rate instead of relying on the
    # 44100 Hz defaults, so onset times stay correct for other ``fs`` values.
    od_hfc = OnsetDetection(method='hfc', sampleRate=fs)
    w = Windowing(type='hann')
    fft = FFT()  # this gives us a complex FFT
    c2p = CartesianToPolar()  # and this turns it into a pair (magnitude, phase)
    onsets = Onsets(frameRate=float(fs) / hop_size)

    x = MonoLoader(filename=folder + audio_filename, sampleRate=fs)()
    duration = float(len(x)) / fs

    # Peak-normalise; silent input is left untouched to avoid dividing by 0.
    peak = np.max(np.abs(x))
    if peak > 0:
        x = x / peak
    t = np.arange(len(x)) / float(fs)

    # Plotting
    _fig, axarr = plt.subplots(1, 1, figsize=(80, 20))

    # Essentia beat tracking
    pool = Pool()
    for frame in FrameGenerator(x, frameSize=frame_size, hopSize=hop_size):
        mag, phase = c2p(fft(w(frame)))
        pool.add('features.hfc', od_hfc(mag, phase))
    onsets_list = onsets(array([pool['features.hfc']]), [1])

    axarr.vlines(onsets_list, -1, 1, color='k', zorder=2, linewidth=5.0)
    axarr.plot(t, x, zorder=1)
    axarr.axis('off')

    last_index = len(onsets_list) - 1
    for i, onset in enumerate(onsets_list):
        # Clamp at 0: a negative start would silently index from the end of
        # the signal and produce a wrong (usually empty) slice.
        sample = max(0, int(onset * fs) - pre_roll)
        if i >= last_index:
            next_sample = len(x)
        else:
            next_sample = max(0, int(onsets_list[i + 1] * fs) - pre_roll)

        # The digit-count prefix keeps the file names sortable as text.
        samplename = "{}slices/{}{}__blind.wav".format(
            folder, str(len(str(i))), str(i))
        x_seg = x[sample:next_sample]
        # Write at the rate the audio was loaded with.
        MonoWriter(filename=samplename, sampleRate=fs)(x_seg)

    return onsets_list, duration