def pca(pool, namespace=''):
    llspace = 'lowlevel.'
    if namespace:
        llspace = namespace + '.lowlevel.'
    sccoeffs = pool[llspace + 'sccoeffs']
    scvalleys = pool[llspace + 'scvalleys']
    numFrames = len(sccoeffs)
    poolSc = Pool()
    merged = essentia.zeros(2 * len(sccoeffs[0]))
    # interleave the contrast coefficients and valleys of each frame into a
    # single vector before running the PCA
    for frame in range(numFrames):
        j = 0
        for i in range(len(sccoeffs[frame])):
            merged[j] = sccoeffs[frame][i]
            merged[j + 1] = scvalleys[frame][i]
            j += 2
        poolSc.add('contrast', merged)
    poolTransformed = standard.PCA(namespaceIn='contrast',
                                   namespaceOut='contrast')(poolSc)
    contrast = poolTransformed['contrast']
    pool.set(llspace + 'spectral_contrast.mean', mean(contrast, axis=0))
    pool.set(llspace + 'spectral_contrast.var', var(contrast, axis=0))
    pool.remove(llspace + 'sccoeffs')
    pool.remove(llspace + 'scvalleys')

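A minimal driver for pca() above, a sketch assuming the frame-wise descriptors come from essentia.standard.SpectralContrast and that mean/var are numpy's; the filename and frame sizes are placeholders:

import essentia
import essentia.standard as standard
from essentia import Pool
from numpy import mean, var

audio = standard.MonoLoader(filename='track.wav')()  # placeholder path
w = standard.Windowing(type='hann')
spectrum = standard.Spectrum()
sc = standard.SpectralContrast(frameSize=2048)

pool = Pool()
for frame in standard.FrameGenerator(audio, frameSize=2048, hopSize=1024):
    coeffs, valleys = sc(spectrum(w(frame)))
    pool.add('lowlevel.sccoeffs', coeffs)
    pool.add('lowlevel.scvalleys', valleys)

pca(pool)  # replaces the raw frames with PCA-reduced mean/var summaries
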
def get_onsets(self, _audio=None):
    # Use the provided audio if given, otherwise fall back to the instance's
    # audio. (Comparing a numpy array against [] raises, hence the None default.)
    audio = _audio if _audio is not None else self.audio
    W = es.Windowing(type=self.winType)
    c2p = es.CartesianToPolar()
    fft = es.FFT()
    onsetDetection = es.OnsetDetection(method=self.onsetMethod, sampleRate=44100)
    onsets = es.Onsets(alpha=.2)
    pool = Pool()
    for frame in es.FrameGenerator(audio, frameSize=1024, hopSize=512):
        mag, phase = c2p(fft(W(frame)))
        pool.add("onsetFunction", onsetDetection(mag, phase))
    DetectedOnsetsArray = onsets([pool["onsetFunction"]], [1])
    return DetectedOnsetsArray

def get_embeddings(melspecs: dict[str, np.ndarray], architectures: dict,
                   predictors: dict) -> Optional[dict]:
    data = {}
    for architecture, metadata in architectures.items():
        input_pool = Pool()
        input_pool.set('model/Placeholder',
                       melspecs[metadata['essentia-algorithm']])
        for dataset in metadata['datasets']:
            # TODO: chunk the input melspecs to avoid OOM error
            try:
                output_pool = predictors[f'{dataset}-{architecture}'](input_pool)
            except RuntimeError:
                return None
            for layer, layer_data in metadata['layers'].items():
                embeddings = output_pool[layer_data['name']].squeeze()
                if len(embeddings) == 0:
                    return None
                if len(embeddings.shape) == 1:
                    embeddings = np.expand_dims(embeddings, axis=0)
                data[f'{dataset}-{architecture}-{layer}'] = embeddings
    return data

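For context, the predictors mapping consumed above might be built from frozen TensorFlow graphs with essentia's TensorflowPredict; this is only a sketch, and the model filename and output layer name are placeholders, not taken from the original:

import essentia.standard as es

# Hypothetical construction of `predictors`: each entry reads the mel
# spectrogram from the pool key 'model/Placeholder' and returns a pool of
# layer activations.
predictors = {
    'msd-musicnn': es.TensorflowPredict(
        graphFilename='msd-musicnn-1.pb',   # placeholder model file
        inputs=['model/Placeholder'],
        outputs=['model/dense/BiasAdd']),   # placeholder layer name
}
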
def get_onsets(self, in_filename):
    # Load the audio: AudioLoader gives us the sample rate, MonoLoader the
    # downmixed signal. Recent Essentia versions also return md5/bitrate/codec
    # from AudioLoader, hence the [:3] slice.
    audio, sampleRate, numChan = AudioLoader(filename=in_filename)()[:3]
    audio = MonoLoader(filename=in_filename)()
    self.sampleRate = sampleRate

    # 1) Compute the onset detection function
    od = OnsetDetection(method='rms')
    w = Windowing(type='hann')
    fft = FFT()
    c2p = CartesianToPolar()
    pool_features = Pool()
    for frame in FrameGenerator(audio, frameSize=self.frame_size,
                                hopSize=self.hop_size):
        mag, phase = c2p(fft(w(frame)))
        pool_features.add('features.rms', od(mag, phase))

    # 2) Compute the onset locations
    onsets = Onsets(silenceThreshold=0.14, delay=10)
    onsets_rms = onsets(array([pool_features['features.rms']]), [1])
    print("Num onsets: " + str(len(onsets_rms)))
    return onsets_rms

def compute_pitch_yin(audio):
    yin = PitchYin()
    p = Pool()
    for frame in FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                                startFromZero=True):
        p.add('pitch_yin', yin(frame)[0])  # keep the pitch, drop the confidence
    return p['pitch_yin']

def compute_zcr(audio):
    zcr = ZeroCrossingRate()
    p = Pool()
    for frame in FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                                startFromZero=True):
        p.add('zcr', zcr(frame))
    return p['zcr']

def compute_energy(audio):
    energy = Energy()
    p = Pool()
    for frame in FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                                startFromZero=True):
        p.add('energy', energy(frame))
    return p['energy']  # was `return 'True'`, which discarded the result

def compute_rms(audio):
    rms = RMS()
    p = Pool()
    for frame in FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                                startFromZero=True):
        p.add('rms', rms(frame))
    return p['rms']

def compute_power_spectrum(audio):
    w = Windowing(type='hann')
    power_spectrum = PowerSpectrum()
    p = Pool()
    for frame in FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                                startFromZero=True):
        p.add('power_spectrum', power_spectrum(w(frame)))
    return p['power_spectrum']

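The compute_* helpers above and below rely on module-level frame_size and hop_size globals and on the essentia.standard names being imported; a minimal setup under those assumptions (values and path are placeholders):

from essentia.standard import *  # MonoLoader, FrameGenerator, RMS, ...
from essentia import Pool

frame_size = 2048  # assumed global used by the helpers
hop_size = 1024    # assumed global used by the helpers

audio = MonoLoader(filename='track.wav')()  # placeholder path
print(compute_rms(audio).shape)             # one RMS value per frame
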
def _analyse(self, filepath):
    audio = to_mono(wavread(filepath)[0])
    audio = audio.astype('float32')
    w = Windowing(type='hann')
    fft = FFT()  # this gives us a complex FFT
    c2p = CartesianToPolar()  # and this turns it into a pair (magnitude, phase)
    hfc_detect = OnsetDetection(method='hfc')
    complex_detect = OnsetDetection(method='complex')
    rms_detect = RMS()
    spec = Spectrum()
    # pd = PitchDetection()
    flux = Flux()
    pool = Pool()
    # wap = WarpedAutoCorrelation()

    # let's get down to business
    print('Computing onset detection functions...')
    for frame in FrameGenerator(audio, frameSize=self.frame_size,
                                hopSize=self.hop_size):
        mag, phase = c2p(fft(w(frame)))
        spectrum = spec(w(frame))
        f = flux(spectrum)
        # pitch = pd(spectrum)
        pool.add('hfc', hfc_detect(mag, phase))
        pool.add('complex', complex_detect(mag, phase))
        pool.add('rms', rms_detect(frame))
        pool.add('flux', f)
        # pool.add('pitch', pitch[0])
    # pool.add('autoc', wap(pool['pitch']))
    return pool, audio

def compute_spectral_flatness(audio):
    w = Windowing(type='hann')
    spectrum = Spectrum()
    flatness = Flatness()
    p = Pool()
    for frame in FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                                startFromZero=True):
        p.add('spectral_flatness', flatness(spectrum(w(frame))))
    return p['spectral_flatness']

def compute_bark(audio):
    w = Windowing(type='hann')
    spectrum = Spectrum()
    bark = BarkBands()
    p = Pool()
    for frame in FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                                startFromZero=True):
        p.add('bark', bark(spectrum(w(frame))))
    return p['bark']

def compute_mel(audio):
    w = Windowing(type='hann')
    spectrum = Spectrum()
    mel = MelBands(numberBands=96)
    p = Pool()
    for frame in FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                                startFromZero=True):
        p.add('mel', mel(spectrum(w(frame))))
    return p['mel']

def compute_pitch_yinfft(audio):
    w = Windowing(type='hann')
    spectrum = Spectrum()
    yinfft = PitchYinFFT()
    p = Pool()
    for frame in FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                                startFromZero=True):
        p.add('pitch_yinfft', yinfft(spectrum(w(frame)))[0])
    return p['pitch_yinfft']

def compute_spectral_rolloff(audio):
    w = Windowing(type='hann')
    spectrum = Spectrum()
    rolloff = RollOff()
    p = Pool()
    for frame in FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                                startFromZero=True):
        p.add('spectral_rolloff', rolloff(spectrum(w(frame))))
    return p['spectral_rolloff']

def detect_essentia(arquivo_audio, selected):
    # ODF using the Essentia library
    try:
        filename = arquivo_audio
    except:
        print("usage:", sys.argv[0], "<audiofile>")
        sys.exit()

    # the audio signal itself is shared through a module-level global
    global audio

    # Phase 1: compute the onset detection function.
    # OnsetDetection offers several methods; pick one according to `selected`.
    if selected == 3:
        od = OnsetDetection(method='hfc')
    elif selected == 4:
        od = OnsetDetection(method='complex')
    elif selected == 5:
        od = OnsetDetection(method='melflux')
    elif selected == 6:
        od = OnsetDetection(method='complex_phase')
    elif selected == 7:
        od = OnsetDetection(method='rms')

    # the other algorithms we will need, and a pool to store the results
    w = Windowing(type='hann')
    fft = FFT()  # this gives us a complex FFT
    c2p = CartesianToPolar()  # and this turns it into a pair (magnitude, phase)
    pool = Pool()

    print('Computing onset detection functions...')
    for frame in FrameGenerator(audio, frameSize=1024, hopSize=512):
        mag, phase = c2p(fft(w(frame)))
        pool.add('features.method', od(mag, phase))

    # Phase 2: compute the actual onset locations
    onsets = Onsets()
    print('Computing onset times...')
    onsets_method = onsets(array([pool['features.method']]), [1])

    # convert to a list and from seconds to frame indices
    listadet = onsets_method.tolist()
    listadet = [int(SecToFrames(x)) for x in listadet if x >= 0]
    return listadet

def compute_spectral_centroid(audio):
    w = Windowing(type='hann')
    spectrum = Spectrum()
    centroid = Centroid()
    p = Pool()
    for frame in FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                                startFromZero=True):
        p.add('spectral_centroid', centroid(spectrum(w(frame))))
    return p['spectral_centroid']

def compute_mfcc(audio):
    w = Windowing(type='hann')
    spectrum = Spectrum()
    mfcc = MFCC()
    p = Pool()
    for frame in FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                                startFromZero=True):
        _, coeffs = mfcc(spectrum(w(frame)))
        p.add('mfcc', coeffs)
    return p['mfcc']

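Frame-wise descriptors like these MFCCs are often reduced to track-level statistics; a sketch using essentia's PoolAggregator:

from essentia import Pool
from essentia.standard import PoolAggregator

pool = Pool()
for coeffs in compute_mfcc(audio):       # reuse the helper above
    pool.add('lowlevel.mfcc', coeffs)
# mean and variance over all frames, per coefficient
aggregated = PoolAggregator(defaultStats=['mean', 'var'])(pool)
print(aggregated['lowlevel.mfcc.mean'])
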
def multipool():
    from multiprocessing import Pool  # NB: shadows essentia's Pool in this scope

    print('Loading audio file...')
    audio = MonoLoader(filename=sys.argv[1])()

    a = AudioInfo(sys.argv[1], 3, 10)
    b = AudioInfo(sys.argv[1], 4, 10)
    c = AudioInfo(sys.argv[1], 6, 10)
    todo = [a, b, c]

    pool = Pool(3)
    pool.map(detecta, todo)

def compute_hpcp(audio):
    w = Windowing(type='hann')
    spectrum = Spectrum()
    peaks = SpectralPeaks(orderBy='magnitude', magnitudeThreshold=0.00001,
                          minFrequency=20, maxFrequency=3500, maxPeaks=60)
    hpcp = HPCP()
    p = Pool()
    for frame in FrameGenerator(audio, frameSize=tonal_frame_size,
                                hopSize=tonal_hop_size, startFromZero=True):
        p.add('hpcp', hpcp(*peaks(spectrum(w(frame)))))
    return p['hpcp']

def get_key(file_in):
    """Estimates the key and scale for an audio file."""
    loader = streaming.MonoLoader(filename=file_in)
    framecutter = streaming.FrameCutter()
    windowing = streaming.Windowing(type="blackmanharris62")
    spectrum = streaming.Spectrum()
    spectralpeaks = streaming.SpectralPeaks(orderBy="magnitude",
                                            magnitudeThreshold=1e-05,
                                            minFrequency=40,
                                            maxFrequency=5000,
                                            maxPeaks=10000)
    pool = Pool()
    hpcp = streaming.HPCP()
    key = streaming.Key()

    # wire up the streaming network
    loader.audio >> framecutter.signal
    framecutter.frame >> windowing.frame >> spectrum.frame
    spectrum.spectrum >> spectralpeaks.spectrum
    spectralpeaks.magnitudes >> hpcp.magnitudes
    spectralpeaks.frequencies >> hpcp.frequencies
    hpcp.hpcp >> key.pcp
    key.key >> (pool, 'tonal.key_key')
    key.scale >> (pool, 'tonal.key_scale')
    key.strength >> (pool, 'tonal.key_strength')

    run(loader)
    return Key(pool['tonal.key_key'], pool['tonal.key_scale'])

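Key here is presumably a small result container defined elsewhere in the module; a usage sketch under that assumption (path is a placeholder):

from collections import namedtuple

# Assumed container for get_key's result; the original module defines its own.
Key = namedtuple('Key', ['key', 'scale'])

k = get_key('track.wav')  # placeholder path
print(k.key, k.scale)     # e.g. "A minor"
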
def computeBpmHistogram(self, noveltyCurve, frameSize=4, overlap=2,
                        frameRate=44100. / 128., window='hann', zeroPadding=0,
                        constantTempo=False, minBpm=30):
    pool = Pool()
    bpmHist = ess.BpmHistogram(frameRate=frameRate, frameSize=frameSize,
                               overlap=overlap, zeroPadding=zeroPadding,
                               constantTempo=constantTempo,
                               windowType=window, minBpm=minBpm)
    gen = ess.VectorInput(noveltyCurve)
    gen.data >> bpmHist.novelty
    bpmHist.bpm >> (pool, 'bpm')
    bpmHist.bpmCandidates >> (pool, 'bpmCandidates')
    bpmHist.bpmMagnitudes >> (pool, 'bpmMagnitudes')
    bpmHist.frameBpms >> None  # (pool, 'frameBpms')
    bpmHist.tempogram >> (pool, 'tempogram')
    bpmHist.ticks >> (pool, 'ticks')
    bpmHist.ticksMagnitude >> (pool, 'ticksMagnitude')
    bpmHist.sinusoid >> (pool, 'sinusoid')
    essentia.run(gen)
    return pool

def get_bpm(file_in):
    pool = Pool()
    loader = streaming.MonoLoader(filename=file_in)
    bt = streaming.RhythmExtractor2013()
    bpm_histogram = streaming.BpmHistogramDescriptors()
    centroid = streaming.Centroid(range=250)  # BPM histogram output size is 250

    loader.audio >> bt.signal
    bt.bpm >> (pool, 'bpm')
    bt.ticks >> None
    bt.confidence >> (pool, 'confidence')
    bt.estimates >> None
    bt.bpmIntervals >> bpm_histogram.bpmIntervals
    bpm_histogram.firstPeakBPM >> (pool, 'bpm_first_peak')
    bpm_histogram.firstPeakWeight >> None
    bpm_histogram.firstPeakSpread >> None
    bpm_histogram.secondPeakBPM >> (pool, 'bpm_second_peak')
    bpm_histogram.secondPeakWeight >> None
    bpm_histogram.secondPeakSpread >> None
    bpm_histogram.histogram >> (pool, 'bpm_histogram')
    bpm_histogram.histogram >> centroid.array
    centroid.centroid >> (pool, 'bpm_centroid')

    run(loader)
    return pool['bpm']

def estimate_main_band(infile):
    """
    Estimate if this is a low, mid, or high track.

    Not _really_ sure if this does what I need it to, but some quick tests
    looked right.
    """
    loader = streaming.MonoLoader(filename=infile)
    framecutter = streaming.FrameCutter()
    windowing = streaming.Windowing(type="blackmanharris62")
    spectrum = streaming.Spectrum()
    freqbands = streaming.FrequencyBands(frequencyBands=[0, 250, 750, 4000])
    pool = Pool()

    loader.audio >> framecutter.signal
    framecutter.frame >> windowing.frame >> spectrum.frame
    spectrum.spectrum >> freqbands.spectrum
    freqbands.bands >> (pool, 'bands')
    run(loader)

    # pick the band with the most total energy across frames
    sums = np.sum(pool['bands'], axis=0)
    band = np.argmax(sums)
    if band == 0:
        return 'low'
    elif band == 1:
        return 'mid'
    elif band == 2:
        return 'high'

def _extract_pitch_contours(self, audio):
    # Hann window with x4 zero padding
    run_windowing = estd.Windowing(zeroPadding=3 * self.frame_size)
    run_spectrum = estd.Spectrum(size=self.frame_size * 4)
    run_spectral_peaks = estd.SpectralPeaks(
        minFrequency=self.min_frequency,
        maxFrequency=self.max_frequency,
        magnitudeThreshold=self.magnitude_threshold,
        sampleRate=self.sample_rate,
        orderBy='magnitude')

    # convert unit to cents, PitchSalienceFunction takes 55 Hz as the
    # default reference
    run_pitch_salience_function = estd.PitchSalienceFunction(
        binResolution=self.bin_resolution)
    run_pitch_salience_function_peaks = estd.PitchSalienceFunctionPeaks(
        binResolution=self.bin_resolution,
        minFrequency=self.min_frequency,
        maxFrequency=self.max_frequency)
    run_pitch_contours = estd.PitchContours(
        hopSize=self.hop_size,
        binResolution=self.bin_resolution,
        peakDistributionThreshold=self.peak_distribution_threshold)

    # compute frame by frame
    pool = Pool()
    for frame in estd.FrameGenerator(audio, frameSize=self.frame_size,
                                     hopSize=self.hop_size):
        frame = run_windowing(frame)
        spectrum = run_spectrum(frame)
        peak_frequencies, peak_magnitudes = run_spectral_peaks(spectrum)
        salience = run_pitch_salience_function(peak_frequencies, peak_magnitudes)
        salience_peaks_bins, salience_peaks_contour_saliences = \
            run_pitch_salience_function_peaks(salience)
        if not np.size(salience_peaks_bins):
            salience_peaks_bins = np.array([0])
        if not np.size(salience_peaks_contour_saliences):
            salience_peaks_contour_saliences = np.array([0])
        pool.add('allframes_salience_peaks_bins', salience_peaks_bins)
        pool.add('allframes_salience_peaks_contourSaliences',
                 salience_peaks_contour_saliences)

    # post-processing: contour tracking
    contours_bins, contour_saliences, contours_start_times, duration = \
        run_pitch_contours(
            pool['allframes_salience_peaks_bins'],
            pool['allframes_salience_peaks_contourSaliences'])
    return contours_bins, contours_start_times, contour_saliences, duration

def extractMFCCs(audio):
    '''Extract MFCCs from the spectrogram, frame by frame.'''
    # maybe set highFrequencyBound=22100
    frameSizeInSamples = int(round(44100 * frameSize_block))
    hopSizeInSamples = int(round(44100 * hopSize_block))
    inputSpectrumSize = frameSizeInSamples // 2 + 1  # e.g. 1025 for a 2048 frame

    mfcc = MFCC(numberCoefficients=num_mfccs, numberBands=numberBands,
                highFrequencyBound=highFrequencyBound,
                inputSize=inputSpectrumSize)
    w = Windowing(type='hann')
    spectrum = Spectrum()
    pool = Pool()

    audio = essentia.array(audio)
    for frame in FrameGenerator(audio, frameSize=frameSizeInSamples,
                                hopSize=hopSizeInSamples):
        mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
        pool.add('mfcc', mfcc_coeffs)

    # To plot, convert to an essentia.array (a numpy array of floats),
    # transpose so coefficients run along the y axis, and imshow:
    #     mfccs_T = essentia.array(pool['mfcc']).T
    #     imshow(mfccs_T, aspect='auto', interpolation='none')
    #     show()
    return pool['mfcc']

def SliceDrums_BeatDetection(folder, audio_filename, fs):
    od_hfc = OnsetDetection(method='hfc')
    w = Windowing(type='hann')
    fft = FFT()  # this gives us a complex FFT
    c2p = CartesianToPolar()  # and this turns it into a pair (magnitude, phase)
    onsets = Onsets()

    x = MonoLoader(filename=folder + audio_filename, sampleRate=fs)()
    duration = float(len(x)) / fs
    x = x / np.max(np.abs(x))
    t = np.arange(len(x)) / float(fs)
    zero_array = t * 0  # used only for plotting purposes

    # Plotting
    f, axarr = plt.subplots(1, 1, figsize=(80, 20))

    # Essentia beat tracking
    pool = Pool()
    for frame in FrameGenerator(x, frameSize=1024, hopSize=512):
        mag, phase = c2p(fft(w(frame)))
        pool.add('features.hfc', od_hfc(mag, phase))
    onsets_list = onsets(array([pool['features.hfc']]), [1])

    axarr.vlines(onsets_list, -1, 1, color='k', zorder=2, linewidth=5.0)
    axarr.plot(t, x, zorder=1)
    axarr.axis('off')

    for i, onset in enumerate(onsets_list):
        # back off ~1000 samples before the onset, clamped to the file start
        sample = max(int(onset * fs) - 1000, 0)
        # prefix the digit count so lexicographic order matches numeric order
        samplename = "{}slices/{}{}__blind.wav".format(folder, len(str(i)), i)
        if i >= len(onsets_list) - 1:
            next_sample = len(x)
        else:
            next_sample = int(onsets_list[i + 1] * fs) - 1000
        x_seg = x[sample:next_sample]
        MonoWriter(filename=samplename)(x_seg)
    return onsets_list, duration

def estimate_danceability(infile):
    loader = streaming.MonoLoader(filename=infile)
    dance = streaming.Danceability()
    pool = Pool()
    loader.audio >> dance.signal
    dance.danceability >> (pool, 'danceability')
    run(loader)
    return pool['danceability']

def get_cat_audio_pitch():
    spectrum = Spectrum()
    pitch = PitchYinFFT(frameSize=1024)
    pool = Pool()
    windowing = Windowing(type='hann')

    cat_audio = MonoLoader(filename='cat-01.wav', sampleRate=44100)()
    cat_audio_loudness = Loudness()(cat_audio)
    for frame in FrameGenerator(cat_audio, frameSize=1024, hopSize=512):
        spec = spectrum(windowing(frame))
        p, conf = pitch(spec)
        pool.add('cat_pitch', p)
    cat_pitch = numpy.mean(pool['cat_pitch'])
    cat_MIDI = mir_eval.multipitch.frequencies_to_midi([cat_pitch])
    return cat_audio, cat_MIDI[0]

def __init__(self, arch):
    self.architecture = arch
    self.in_layer = None
    self.out_layer = None
    if arch == 'musicnn':
        self.feature_extractor = es.TensorflowInputMusiCNN()
        self.frame_size = 512
        self.hop_size = 256
        self.patch_size = 187
        self.num_bands = 96
    elif arch == 'vggish':
        self.feature_extractor = es.TensorflowInputVGGish()
        self.frame_size = 400
        self.hop_size = 200
        self.patch_size = 96
        self.num_bands = 64
    self.feature_frames = []
    self.in_pool = Pool()
    self.out_pool = Pool()

    # setup model
    self.predict = None

def compute_harmonic_magnitudes(contour_f0s, fftgram, idx_start, options):
    '''
    Compute the harmonic amplitudes for each frame: get the harmonic
    partials from the original spectrum.

    Params:
    --------------------
    contour_f0s - per-frame f0 values of the contour
    fftgram - fftgram of the whole audio file
    idx_start - frame index at which the contour starts

    Returns: harmonic frequencies, magnitudes and phases of the contour
    '''
    run_harm_model_anal = HarmonicModelAnal(nHarmonics=30)

    # TODO: sanity check: times == len(fftgram) and contour_start_time_SAL in times
    pool = Pool()
    for i, contour_f0 in enumerate(contour_f0s):
        if idx_start + i > len(fftgram) - 1:
            sys.exit('index {} is out of range: len(fftgram) is {}'.format(
                idx_start + i, len(fftgram)))
        fft = fftgram[idx_start + i]
        hfreq, magn, phase = run_harm_model_anal(fft, contour_f0)
        pool.add('phases', phase)
        pool.add('hfreqs', hfreq)
        pool.add('magns', magn)
    return pool['hfreqs'], pool['magns'], pool['phases']

def harmonic_magnitudes_to_audio(hfreqs, magns, phases, options):
    '''
    Resynthesize the audio of a contour from its per-frame harmonic partials.

    Params:
        hfreqs - harmonic frequencies of the contour
        magns - magnitudes of the contour
        phases - phases of the contour

    Returns:
        out_audio_contour, the audio of the harmonics of the contour, and the
        spectrogram of the contour
    '''
    pool = Pool()
    run_sine_model_synth = SineModelSynth(hopSize=512, sampleRate=options.Fs)
    run_ifft = IFFT(size=options.windowsizeInSamples)
    run_overl = OverlapAdd(frameSize=options.windowsizeInSamples, hopSize=512,
                           gain=1. / options.windowsizeInSamples)

    out_audio_contour = np.array(0)
    for hfreq, hmag, hphase in zip(hfreqs, magns, phases):
        spectrum, audio_frame = harmonics_to_audio(
            hfreq, hmag, hphase, run_sine_model_synth, run_ifft, run_overl)
        out_audio_contour = np.append(out_audio_contour, audio_frame)
        pool.add('spectrum', spectrum)

    # NOTE: this discards the frame-by-frame synthesis above and resynthesizes
    # with sms-tools' sineModelSynth instead
    out_audio_contour = SM.sineModelSynth(hfreqs, magns, phases, 512, 128, 44100)
    return out_audio_contour, pool['spectrum']

def _extract_pitch_contours(self, audio):
    # Hann window with x4 zero padding
    run_windowing = estd.Windowing(  # pylint: disable-msg=E1101
        zeroPadding=3 * self.frame_size)
    run_spectrum = estd.Spectrum(  # pylint: disable-msg=E1101
        size=self.frame_size * 4)
    run_spectral_peaks = estd.SpectralPeaks(  # pylint: disable-msg=E1101
        minFrequency=self.min_frequency,
        maxFrequency=self.max_frequency,
        magnitudeThreshold=self.magnitude_threshold,
        sampleRate=self.sample_rate,
        orderBy='magnitude')

    # convert unit to cents, PitchSalienceFunction takes 55 Hz as the
    # default reference
    run_pitch_salience_function = \
        estd.PitchSalienceFunction(  # pylint: disable-msg=E1101
            binResolution=self.bin_resolution)
    run_pitch_salience_function_peaks = \
        estd.PitchSalienceFunctionPeaks(  # pylint: disable-msg=E1101
            binResolution=self.bin_resolution,
            minFrequency=self.min_frequency,
            maxFrequency=self.max_frequency)
    run_pitch_contours = estd.PitchContours(  # pylint: disable-msg=E1101
        hopSize=self.hop_size,
        binResolution=self.bin_resolution,
        peakDistributionThreshold=self.peak_distribution_threshold)

    # compute frame by frame
    pool = Pool()
    for frame in estd.FrameGenerator(audio,  # pylint: disable-msg=E1101
                                     frameSize=self.frame_size,
                                     hopSize=self.hop_size):
        frame = run_windowing(frame)
        spectrum = run_spectrum(frame)
        peak_frequencies, peak_magnitudes = run_spectral_peaks(spectrum)
        salience = run_pitch_salience_function(peak_frequencies, peak_magnitudes)
        salience_peaks_bins, salience_peaks_contour_saliences = \
            run_pitch_salience_function_peaks(salience)
        if not np.size(salience_peaks_bins):
            salience_peaks_bins = np.array([0])
        if not np.size(salience_peaks_contour_saliences):
            salience_peaks_contour_saliences = np.array([0])
        pool.add('allframes_salience_peaks_bins', salience_peaks_bins)
        pool.add('allframes_salience_peaks_contourSaliences',
                 salience_peaks_contour_saliences)

    # post-processing: contour tracking
    contours_bins, contour_saliences, contours_start_times, duration = \
        run_pitch_contours(
            [f.tolist() for f in pool['allframes_salience_peaks_bins']],
            [f.tolist() for f in pool['allframes_salience_peaks_contourSaliences']])
    return contours_bins, contours_start_times, contour_saliences, duration

def spectrogram(audio, audio_file, save_fig=True, save_fig_path=None):
    if audio_file.endswith('.wav'):
        w = Windowing(type='hann')
        # FFT() would return the complex FFT; here we just want the magnitude spectrum
        spectrum = Spectrum()
        pool = Pool()
        # NOTE: one second of a wav file is roughly 90 frames, and the
        # intensity is given in Hz
        for frame in FrameGenerator(audio, frameSize=1024, hopSize=512,
                                    startFromZero=True):
            win = w(frame)
            spec = spectrum(win)
            pool.add('spec', spec)
        aggrPool = PoolAggregator(defaultStats=['mean'])(pool)
        # average spectral level, used by the disabled thresholding below
        a = sum(aggrPool['spec.mean'].T) / aggrPool['spec.mean'].T.shape[0]
        b = np.array(pool['spec'].T)
        # for iterator1, i in enumerate(pool['spec'].T):
        #     for iterator2, j in enumerate(i):
        #         if j > a / 2 and j > 0.015:
        #             b[iterator1][iterator2] = j
        # b = np.array([i for i in b if i.max() > 0.01])  # not for the new bats dataset

        # trim all-zero rows at both ends
        b = remove_initial_zeros(b)
        b = b.tolist()
        b.reverse()
        b = remove_initial_zeros(b)
        b.reverse()
        b = np.array(b)

        if save_fig:
            if not save_fig_path:
                save_fig_path = audio_file.replace('.wav', '_spec.jpg')
            save_plots(b, save_fig_path)
        return b[:200, :200].tolist()

def chromaprint(self, analysisTime=30):
    """
    Computes the fingerprint of the input signal using the Chromaprint
    algorithm. It is a wrapper of the Chromaprint library.

    Returns:
        The chromaprints, as base64-encoded strings.
    """
    vec_input = ess.VectorInput(self.audio_vector)
    chromaprinter = ess.Chromaprinter(analysisTime=analysisTime,
                                      sampleRate=self.fs)
    pool = Pool()

    vec_input.data >> chromaprinter.signal
    chromaprinter.fingerprint >> (pool, 'chromaprint')
    run(vec_input)
    return pool['chromaprint']

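The method assumes a host object carrying audio_vector and fs; a hypothetical wrapper showing how it might be called (the class name and path are illustrative only, not from the original):

from essentia.standard import MonoLoader

class Fingerprinter:
    # minimal host for the chromaprint() method above
    def __init__(self, filename, fs=44100):
        self.audio_vector = MonoLoader(filename=filename, sampleRate=fs)()
        self.fs = fs

    chromaprint = chromaprint  # bind the function above as a method

fp = Fingerprinter('track.wav')  # placeholder path
print(fp.chromaprint())          # base64-encoded fingerprint(s)
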
def compute_spectral_shape(audio):
    w = Windowing(type='hann')
    spectrum = Spectrum()
    cm = CentralMoments()
    ds = DistributionShape()
    p = Pool()
    for frame in FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size,
                                startFromZero=True):
        spread, skewness, kurtosis = ds(cm(spectrum(w(frame))))
        p.add('spectral_spread', spread)
        p.add('spectral_skewness', skewness)
        p.add('spectral_kurtosis', kurtosis)
    return p['spectral_spread'], p['spectral_skewness'], p['spectral_kurtosis']

- bin 0 = 0 BPM
- bin 128 = 645.99609375 BPM
"""
sampleRate = 22050
frameSize = 8192
hopSize = 1024
rmsFrameSize = 256
rmsHopSize = 32

loader = MonoLoader(filename=input_file, sampleRate=sampleRate)
w = Windowing(type='blackmanharris62')
spectrum = Spectrum()
melbands = MelBands(sampleRate=sampleRate, numberBands=40,
                    lowFrequencyBound=0, highFrequencyBound=sampleRate / 2)

pool = Pool()
for frame in FrameGenerator(audio=loader(), frameSize=frameSize,
                            hopSize=hopSize, startFromZero=True):
    bands = melbands(spectrum(w(frame)))
    pool.add('melbands', bands)

rhythmtransform = RhythmTransform(frameSize=rmsFrameSize, hopSize=rmsHopSize)
rt = rhythmtransform(pool['melbands'])
rt_mean = numpy.mean(rt, axis=0)

bin_resolution = 5.007721656976744  # BPM per rhythm-transform bin
print(numpy.argmax(rt_mean) * bin_resolution)

if __name__ == '__main__':
    opt, args = parse_args()

    if len(args) != 2:  # 3:
        print("Incorrect number of arguments\n", essentia_usage)
        sys.exit(1)

    # profile = args[0]
    input_file = args[0]
    output_file = args[1]
    pool = Pool()

    startTime = float(opt.startTime)
    endTime = float(opt.endTime)

    # compute descriptors
    readMetadata(input_file, pool)

    INFO('Process step 1: Replay Gain')
    replaygain.compute(input_file, pool, startTime, endTime)

    segments_namespace = []
    if opt.segmentation:
        INFO('Process step 2: Low Level')
        computeLowLevel(input_file, pool, startTime, endTime)
        segmentation.compute(input_file, pool, startTime, endTime)
        segments = pool['segmentation.timestamps']

if __name__ == '__main__':
    opt, args = parse_args()

    if len(args) != 2:  # 3:
        print("Incorrect number of arguments\n", essentia_usage)
        sys.exit(1)

    # profile = args[0]
    input_file = args[0]
    output_file = args[1]
    neqPool = Pool()
    eqPool = Pool()

    startTime = float(opt.startTime)
    endTime = float(opt.endTime)

    # compute descriptors
    readMetadata(input_file, eqPool)

    INFO('Process step 1: Replay Gain')
    replaygain.compute(input_file, eqPool, startTime, endTime)

    segments_namespace = []
    neqPool.merge(eqPool, 'replace')
    if opt.segmentation:
        INFO('Process step 2: Low Level')
        computeLowLevel(input_file, neqPool, eqPool, startTime, endTime)

def run(self, fname):
    citation = u"""
        Atlı, H. S., Uyar, B., Şentürk, S., Bozkurt, B., and Serra, X. (2014).
        Audio feature extraction for exploring Turkish makam music. In
        Proceedings of 3rd International Conference on Audio Technologies for
        Music and Media, Ankara, Turkey."""

    # Hann window with x4 zero padding
    run_windowing = Windowing(zeroPadding=3 * self.settings.frameSize)
    run_spectrum = Spectrum(size=self.settings.frameSize * 4)
    run_spectral_peaks = SpectralPeaks(
        minFrequency=self.settings.minFrequency,
        maxFrequency=self.settings.maxFrequency,
        maxPeaks=self.settings.maxPeaks,
        sampleRate=self.settings.sampleRate,
        magnitudeThreshold=self.settings.magnitudeThreshold,
        orderBy='magnitude')
    # converts unit to cents, 55 Hz is taken as the default reference
    run_pitch_salience_function = PitchSalienceFunction(
        binResolution=self.settings.binResolution)
    run_pitch_salience_function_peaks = PitchSalienceFunctionPeaks(
        binResolution=self.settings.binResolution)
    run_pitch_contours = PitchContours(
        hopSize=self.settings.hopSize,
        binResolution=self.settings.binResolution,
        peakDistributionThreshold=self.settings.peakDistributionThreshold)
    pool = Pool()

    # load audio and apply equal-loudness filtering;
    # MonoLoader resamples the audio signal to 44100 Hz by default
    audio = MonoLoader(filename=fname)()
    audio = EqualLoudness()(audio)

    for frame in FrameGenerator(audio, frameSize=self.settings.frameSize,
                                hopSize=self.settings.hopSize):
        frame = run_windowing(frame)
        spectrum = run_spectrum(frame)
        peak_frequencies, peak_magnitudes = run_spectral_peaks(spectrum)
        salience = run_pitch_salience_function(peak_frequencies, peak_magnitudes)
        salience_peaks_bins, salience_peaks_contourSaliences = \
            run_pitch_salience_function_peaks(salience)
        if not size(salience_peaks_bins):
            salience_peaks_bins = array([0])
        if not size(salience_peaks_contourSaliences):
            salience_peaks_contourSaliences = array([0])
        pool.add('allframes_salience_peaks_bins', salience_peaks_bins)
        pool.add('allframes_salience_peaks_contourSaliences',
                 salience_peaks_contourSaliences)

    # post-processing: contour tracking
    contours_bins, contours_contourSaliences, contours_start_times, duration = \
        run_pitch_contours(pool['allframes_salience_peaks_bins'],
                           pool['allframes_salience_peaks_contourSaliences'])

    # run the simplified contour selection
    # [pitch, pitch_salience] = self.ContourSelection(
    #     contours_bins, contours_contourSaliences, contours_start_times, duration)

    # cent to Hz conversion
    # pitch = [0. if p == 0 else 55. * (2. ** ((self.settings.binResolution * p) / 1200))
    #          for p in pitch]

    # generate time stamps
    # time_stamps = [s * self.settings.hopSize / float(self.settings.sampleRate)
    #                for s in range(0, len(pitch))]

    # [time pitch salience] matrix
    # out = transpose(vstack((time_stamps, pitch, pitch_salience)))
    # out = out.tolist()

    # settings
    settings = self.settings
    settings.update({'version': self.__version__,
                     'slug': self.__slug__,
                     'source': fname,
                     'essentiaVersion': essentia.__version__,
                     'pitchUnit': 'Hz',
                     'citation': citation})

    # matlab
    # matout = cStringIO.StringIO()
    # matob = {'pitch': out}
    # matob.update(settings)
    # scipy.io.savemat(matout, matob)

    # unused
    # return {'pitch': json.dumps(out),
    #         'matlab': matout.getvalue(),
    #         'settings': json.dumps(settings)}

    return contours_bins, contours_contourSaliences, contours_start_times, duration

def computeOnsets(inFile, outFile):
    print(outFile)

    # In this example we are going to look at how to perform onset detection
    # and mark the onsets on the audio using the AudioOnsetsMarker algorithm.
    #
    # Onset detection consists of two main phases:
    #  1- compute an onset detection function, which describes the evolution
    #     of some parameters that might be representative of whether we find
    #     an onset or not
    #  2- given a number of these detection functions, decide where in the
    #     sound there actually are onsets

    print("Loading audio file...")
    audio = MonoLoader(filename=inFile)()

    # Phase 1: compute the onset detection function.
    # The OnsetDetection algorithm provides several methods; let's do two of them.
    od1 = OnsetDetection(method="hfc")
    od2 = OnsetDetection(method="complex")

    # the other algorithms we will need, and a pool to store the results
    w = Windowing(type="hann")
    fft = FFT()  # this gives us a complex FFT
    c2p = CartesianToPolar()  # and this turns it into a pair (magnitude, phase)
    pool = Pool()

    print("Computing onset detection functions...")
    for frame in FrameGenerator(audio, frameSize=1024, hopSize=512):
        mag, phase = c2p(fft(w(frame)))
        pool.add("features.hfc", od1(mag, phase))
        pool.add("features.complex", od2(mag, phase))

    # Phase 2: compute the actual onset locations
    onsets = Onsets()
    print("Computing onset times...")
    # this algo expects a matrix, not a vector; with only a single detection
    # function the weight doesn't actually matter
    onsets_hfc = onsets(array([pool["features.hfc"]]), [1])
    np.savetxt(outFile, onsets_hfc, fmt="%f")

    onsets_complex = onsets(array([pool["features.complex"]]), [1])
    # NOTE: this overwrites the HFC onsets written to the same file above
    np.savetxt(outFile, onsets_complex, fmt="%f")

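Despite the comment, computeOnsets() never actually marks the onsets on the audio; a sketch of that missing step with essentia's AudioOnsetsMarker (the output path default is a placeholder):

from essentia.standard import AudioOnsetsMarker, MonoWriter

def mark_onsets(audio, onsets, out_path='onsets_marked.wav'):
    # beep at each detected onset and write the marked audio to disk
    marker = AudioOnsetsMarker(onsets=onsets, type='beep')
    MonoWriter(filename=out_path)(marker(audio))
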
def find_files(directory, pattern):
    for root, dirs, files in os.walk(directory):
        for basename in files:
            if basename.lower().endswith(pattern):
                filename = os.path.join(root, basename)
                yield filename


try:
    indir = sys.argv[1]
    result_file = sys.argv[2]
except:
    print("usage:", sys.argv[0], "<input-directory> <result.json>")
    sys.exit()

result = Pool()

files = [f for f in find_files(indir, FILE_EXT)]
print('Found', len(files), 'audio files (' + '/'.join(FILE_EXT) + ')')

i = 0
for filename in files:
    i += 1
    print('Extracting metadata:', filename)
    namespace = 'track_' + str(i)
    try:
        meta = MetadataReader(filename=filename, failOnError=True,
                              tagPoolName=namespace + '.metadata')()
        pool_meta, duration, bitrate, samplerate, channels = meta[7:]
        pool_meta.set(namespace + ".file_path", os.path.relpath(filename))
        pool_meta.set(namespace + ".duration", duration)
        pool_meta.set(namespace + ".bit_rate", bitrate)

def extractFeatures(arffDir='.', dirname='.', fnames='', segment_length='WHOLE',
                    hopsize=0):
    # Process the input file by file
    for fname in fnames:
        # Only process wav or mp3 files
        if ".wav" not in fname.lower() and ".mp3" not in fname.lower():
            continue

        # Generate the output dir
        trackName = fname.split('/')[-1]
        segmentArffDir = arffDir + "/" + trackName[:-4] + "/"
        if not exists(segmentArffDir):
            mkdir(segmentArffDir)
        else:
            print(fname + ' exists, skipping...')
            continue

        # Read the audio and some more info
        loader = es.EasyLoader(filename=dirname + "/" + fname)
        audio = loader.compute()
        sampleRate = loader.paramValue('sampleRate')
        length = int(len(audio) / sampleRate)
        if length == 0:
            length = 1
        print(fname + ' length: ' + str(length))
        if hopsize == 0:
            hopsize = segment_length

        # Specify the length of the segment
        if segment_length == 'WHOLE':
            step = length
            end_time = length
            segment_length = length
            print('The whole audio is being processed...')
        else:
            step = hopsize
            segment_length = float(segment_length)
        if step > length:
            continue

        # Compute segment by segment
        for start_time in arange(0, length, step):
            end_time = start_time + segment_length
            if step != length:
                print('the time from second ' + str(start_time) +
                      ' is being processed...')
            if end_time > length:
                break
            segAudio = audio[int(start_time * sampleRate):int(end_time * sampleRate)]

            pool = Pool()

            # Set up the analysis algorithms
            specContrast = es.SpectralContrast(frameSize=2048,
                                               lowFrequencyBound=40,
                                               sampleRate=sampleRate)
            spectrum = es.Spectrum(size=2048)  # size is frameSize
            mfcc = es.MFCC(lowFrequencyBound=40, sampleRate=sampleRate)  # MFCC
            if step > 20:
                hpcp = es.HPCP(size=12, referenceFrequency=440, harmonics=8,
                               bandPreset=True, minFrequency=40.0,
                               maxFrequency=5000.0, splitFrequency=500.0,
                               weightType='cosine', nonLinear=False,
                               windowSize=1)  # HPCP
            lowLevelSpectralExtractor = es.LowLevelSpectralExtractor(
                frameSize=2048, hopSize=1024, sampleRate=sampleRate)
            spectralPeaks = es.SpectralPeaks(sampleRate=sampleRate,
                                             minFrequency=40,
                                             maxFrequency=11000, maxPeaks=50,
                                             magnitudeThreshold=0.2)

            # Low-level spectral feature analysis
            try:
                features = lowLevelSpectralExtractor(segAudio)
            except:
                print(start_time, "has failed!")
                continue

            # Harmonic spectral features (TODO: is the magnitude threshold ok?)
            harmonicPeaks = es.HarmonicPeaks()
            # Using YIN instead of predominant pitch analysis, as this is a
            # frame-based analysis
            pitch = es.PitchDetection()
            # Windowing
            window = es.Windowing(size=2048)

            for frame in es.FrameGenerator(segAudio, frameSize=2048, hopSize=1024):
                # spectral contrast
                s = spectrum(window(frame))
                contrast, valley = specContrast(s)
                pool.add('spectral_contrast', contrast)
                pool.add('spectral_valley', valley)

                # MFCC
                bands, mfccs = mfcc(s)
                pool.add('mfcc', mfccs[1:])

                freqs, mags = spectralPeaks(s)

                # HPCP
                if step > 20:
                    hpcps = hpcp(freqs, mags)
                    pool.add('HPCP', hpcps)

                # Self-computed spectral features
                if len(freqs) > 0:
                    p, conf = pitch(s)
                    if freqs[0] == 0:
                        freqs = freqs[1:]
                        mags = mags[1:]
                    freqs, mags = harmonicPeaks(freqs, mags, p)
                    _sum = 0
                    if len(freqs) == 1:
                        specEnvelope_i = [freqs[0]]  # for hsd
                        _sum = freqs[0] * mags[0]
                    elif len(freqs) == 2:
                        specEnvelope_i = [(freqs[0] + freqs[1]) / 2.0]  # for hsd
                        _sum = freqs[0] * mags[0] + freqs[1] * mags[1]
                    elif len(freqs) > 2:
                        specEnvelope_i = [(freqs[0] + freqs[1]) / 2.0]  # for hsd
                        _sum = freqs[0] * mags[0]
                        for i in range(1, len(freqs) - 1):
                            _sum += freqs[i] * mags[i]  # for hsc_i
                            specEnvelope_i.append(
                                (freqs[i - 1] + freqs[i] + freqs[i + 1]) / 3.0)
                        specEnvelope_i.append((freqs[i] + freqs[i + 1]) / 2.0)
                        _sum += freqs[i + 1] * mags[i + 1]
                    hsc_i = _sum / sum(mags)
                    pool.add('harmonic_spectral_centroid', hsc_i)
                    hsd_i = sum(abs(log10(mags) - log10(specEnvelope_i))) / sum(log10(mags))
                    pool.add('harmonic_spectral_deviation', hsd_i)
                    hss_i = sqrt(sum(square(freqs - hsc_i) * square(mags)) /
                                 sum(square(mags))) / hsc_i
                    pool.add('harmonic_spectral_spread', hss_i)
                else:
                    pool.add('harmonic_spectral_centroid', 0)
                    pool.add('harmonic_spectral_deviation', 0)
                    pool.add('harmonic_spectral_spread', 0)

            for i in range(0, len(features[0])):
                # pool.add('barkbands', features[0][i])
                pool.add('hfc', features[4][i])
                pool.add('pitch', features[6][i])
                pool.add('pitch_instantaneous_confidence', features[7][i])
                pool.add('pitch_salience', features[8][i])
                pool.add('silence_rate_20dB', features[9][i])
                # pool.add('silence_rate_30dB', features[10][i])
                # pool.add('silence_rate_60dB', features[11][i])
                pool.add('spectral_complexity', features[12][i])
                pool.add('spectral_crest', features[13][i])
                pool.add('spectral_decrease', features[14][i])
                pool.add('spectral_energy', features[15][i])
                # pool.add('spectral_energyband_low', features[16][i])
                # pool.add('spectral_energyband_middle_low', features[17][i])
                # pool.add('spectral_energyband_middle_high', features[18][i])
                # pool.add('spectral_energy_high', features[19][i])
                pool.add('spectral_flatness_db', features[20][i])
                pool.add('spectral_flux', features[21][i])
                pool.add('spectral_rms', features[22][i])
                pool.add('spectral_rolloff', features[23][i])
                pool.add('spectral_strongpeak', features[24][i])
                pool.add('zero_crossing_rate', features[25][i])
                pool.add('inharmonicity', features[26][i])
                pool.add('tristimulus', features[27][i])

            onsetRate = es.OnsetRate()
            onsets, rate = onsetRate(segAudio)

            try:
                aggrPool = es.PoolAggregator(
                    defaultStats=['mean', 'var', 'skew', 'kurt'])(pool)
            except:
                print(start_time / step, "failed")
                continue
            aggrPool.add('onset_rate', rate)

            # print(start_time, segment_length, start_time / segment_length)
            fileout = segmentArffDir + trackName[:-4] + \
                "_%003d%s" % (int(start_time / step), ".sig")
            output = es.YamlOutput(filename=fileout)
            output(aggrPool)

def run(self, fname):
    citation = u'Atlı, H. S., Uyar, B., Şentürk, S., Bozkurt, B., and Serra, X. ' \
               '(2014). Audio feature extraction for exploring Turkish makam music. ' \
               'In Proceedings of 3rd International Conference on Audio Technologies ' \
               'for Music and Media, Ankara, Turkey.'

    # Hann window with x4 zero padding
    run_windowing = Windowing(zeroPadding=3 * self.settings.frameSize)
    run_spectrum = Spectrum(size=self.settings.frameSize * 4)
    run_spectral_peaks = SpectralPeaks(
        minFrequency=self.settings.minFrequency,
        maxFrequency=self.settings.maxFrequency,
        maxPeaks=self.settings.maxPeaks,
        sampleRate=self.settings.sampleRate,
        magnitudeThreshold=self.settings.magnitudeThreshold,
        orderBy='magnitude')
    # converts unit to cents, 55 Hz is taken as the default reference
    run_pitch_salience_function = PitchSalienceFunction(
        binResolution=self.settings.binResolution)
    run_pitch_salience_function_peaks = PitchSalienceFunctionPeaks(
        binResolution=self.settings.binResolution)
    run_pitch_contours = PitchContours(
        hopSize=self.settings.hopSize,
        binResolution=self.settings.binResolution,
        peakDistributionThreshold=self.settings.peakDistributionThreshold)
    pool = Pool()

    # load audio and apply equal-loudness filtering;
    # MonoLoader resamples the audio signal to 44100 Hz by default
    audio = MonoLoader(filename=fname)()
    audio = EqualLoudness()(audio)

    for frame in FrameGenerator(audio, frameSize=self.settings.frameSize,
                                hopSize=self.settings.hopSize):
        frame = run_windowing(frame)
        spectrum = run_spectrum(frame)
        peak_frequencies, peak_magnitudes = run_spectral_peaks(spectrum)
        salience = run_pitch_salience_function(peak_frequencies, peak_magnitudes)
        salience_peaks_bins, salience_peaks_contourSaliences = \
            run_pitch_salience_function_peaks(salience)
        if not size(salience_peaks_bins):
            salience_peaks_bins = array([0])
        if not size(salience_peaks_contourSaliences):
            salience_peaks_contourSaliences = array([0])
        pool.add('allframes_salience_peaks_bins', salience_peaks_bins)
        pool.add('allframes_salience_peaks_contourSaliences',
                 salience_peaks_contourSaliences)

    # post-processing: contour tracking
    contours_bins, contours_contourSaliences, contours_start_times, duration = \
        run_pitch_contours(pool['allframes_salience_peaks_bins'],
                           pool['allframes_salience_peaks_contourSaliences'])

    # WARNING: As of 3 April 2015, the values in "contours_start_times" lead
    # the audio by 1024 + 128 samples if the read audio is in mp3 format, as
    # explained in https://github.com/MTG/essentia/issues/246. This is rooted
    # in the typical encoder/decoder problems. For now we advance the values
    # in "contours_start_times" by 1152 samples; remove the next line if this
    # problem gets fixed.
    contours_start_times = [c + (1024 + 128) / float(self.settings.sampleRate)
                            for c in contours_start_times]

    # run the simplified contour selection
    [pitch, pitch_salience] = self.ContourSelection(
        contours_bins, contours_contourSaliences, contours_start_times, duration)

    # cent to Hz conversion
    pitch = [0. if p == 0
             else 55. * (2. ** ((self.settings.binResolution * p) / 1200))
             for p in pitch]

    # generate time stamps
    time_stamps = [s * self.settings.hopSize / float(self.settings.sampleRate)
                   for s in range(0, len(pitch))]

    # [time pitch salience] matrix
    out = transpose(vstack((time_stamps, pitch, pitch_salience)))
    out = out.tolist()

    # settings
    settings = self.settings
    settings.update({'version': self.__version__,
                     'slug': self.__slug__,
                     'source': fname,
                     'essentiaVersion': essentia.__version__,
                     'pitchUnit': 'Hz',
                     'citation': citation})

    # matlab (on Python 3, use io.BytesIO instead of cStringIO)
    matout = cStringIO.StringIO()
    matob = {'pitch': out}
    matob.update(settings)
    scipy.io.savemat(matout, matob)

    return {'pitch': json.dumps(out),
            'matlab': matout.getvalue(),
            'settings': json.dumps(settings)}

def deteccoes(arquivo_audio):
    # Return a list with all detections
    try:
        filename = arquivo_audio
    except:
        print("usage:", sys.argv[0], "<audiofile>")
        sys.exit()

    audio = MonoLoader(filename=filename)()

    # Phase 1: compute the onset detection function.
    # The OnsetDetection algorithm provides several methods; let's do two of them.
    od1 = OnsetDetection(method='hfc')
    od2 = OnsetDetection(method='complex')

    # the other algorithms we will need, and a pool to store the results
    w = Windowing(type='hann')
    fft = FFT()  # this gives us a complex FFT
    c2p = CartesianToPolar()  # and this turns it into a pair (magnitude, phase)
    pool = Pool()

    print('Computing onset detection functions...')
    for frame in FrameGenerator(audio, frameSize=1024, hopSize=512):
        mag, phase = c2p(fft(w(frame)))
        pool.add('features.hfc', od1(mag, phase))
        pool.add('features.complex', od2(mag, phase))

    # Phase 2: compute the actual onset locations
    onsets = Onsets()
    print('Computing onset times...')
    # this algo expects a matrix, not a vector; with only a single detection
    # function the weight doesn't actually matter
    onsets_hfc = onsets(array([pool['features.hfc']]), [1])
    onsets_complex = onsets(array([pool['features.complex']]), [1])

    # convert to lists and from seconds to frame indices
    listadethfc = [int(SecToFrames(x)) for x in onsets_hfc.tolist() if x >= 0]
    listadetcomplex = [int(SecToFrames(x)) for x in onsets_complex.tolist()
                       if x >= 0]
    return listadetcomplex
