def handle(self, audio: np.array) -> np.array:
    """Delay both stereo channels of ``audio`` and mix the result to mono.

    The input is split with ``self.to_stereo``; each channel is passed
    through ``self.apply_delay`` and wrapped with ``array``; the two
    processed channels are merged again with ``self.to_mono``.
    """
    left, right = self.to_stereo(audio)
    # Left is processed before right, one channel at a time.
    delayed = [array(self.apply_delay(channel)) for channel in (left, right)]
    return self.to_mono(*delayed)
def extractHPCP(audiosignal, frameSize, hopSize, w, speaks, hpcp, signalname):
    """Compute a chromagram and a spectrogram of a signal, frame by frame.

    Args:
        audiosignal: samples to analyse; converted with ``essentia.array``.
        frameSize: frame length handed to ``FrameGenerator``.
        hopSize: hop length handed to ``FrameGenerator``.
        w: the preconfigured windowing algorithm.
        speaks: algorithm returning peak frequencies and magnitudes for a
            frame spectrum.
        hpcp: the preconfigured HPCP algorithm.
        signalname: not used by this function.

    Returns:
        ``(chromagram, spectrogram, signal_spectrum, hpcp_mean, hpcp_median)``.
        The chromagram and spectrogram are transposed after stacking the
        per-frame results; mean and median are taken along axis 1 of the
        transposed chromagram.
    """
    samples = essentia.array(audiosignal)
    # TODO: not sure if this is necessary:
    if len(samples) % 2:
        # Drop the last sample so the whole-signal spectrum sees an even length.
        samples = samples[:-1]

    spectrum = Spectrum()
    # NOTE(review): this assigns a plain attribute on the algorithm object;
    # confirm it actually reconfigures the peak picker.
    speaks.maxFrequency = hpcp.paramValue('maxFrequency')

    signal_spectrum = spectrum(samples)

    frame_spectra = []
    frame_chromas = []
    for frame in FrameGenerator(samples, frameSize=frameSize, hopSize=hopSize):
        current = spectrum(w(frame))
        frame_spectra.append(current)
        peak_freqs, peak_mags = speaks(current)
        frame_chromas.append(hpcp(peak_freqs, peak_mags))

    spectrogram = essentia.array(frame_spectra).T
    chromagram = essentia.array(frame_chromas).T
    return (
        chromagram,
        spectrogram,
        signal_spectrum,
        np.mean(chromagram, axis=1),
        np.median(chromagram, axis=1),
    )
def extract_features(x, M=WINDOW_SIZE, N=FFT_SIZE, H=HOP_SIZE, fs=SR, window_type=WINDOW_TYPE):
    '''
    Extract magnitude spectra from the input vector and apply
    power-law compression (exponent 2/3).

    x is cut into frames of M samples with hop H (starting from zero),
    each frame is windowed with ``window_type`` and passed to a Spectrum
    algorithm of size N. ``fs`` is accepted but not used here.
    '''
    samples = essentia.array(x)
    to_spectrum = ess.Spectrum(size=N)
    apply_window = ess.Windowing(size=M, type=window_type)

    # STFT: one magnitude spectrum per frame
    frames = ess.FrameGenerator(samples, frameSize=M, hopSize=H, startFromZero=True)
    magnitudes = essentia.array([to_spectrum(apply_window(frame)) for frame in frames])

    # power law compression
    return np.power(magnitudes, 2. / 3.)
class TestCrossSimilarityMatrix(TestCase):
    """Regression tests for the CrossSimilarityMatrix algorithm."""

    # HPCP matrix of a short query song segment (2 frames), computed with
    # the essentia HPCP algorithm.
    query_feature = array([
        [0.3218126, 0.00541916, 0.26444072, 0.36874822, 1., 0.10472599,
         0.05123469, 0.03934194, 0.07354275, 0.646091, 0.55201685, 0.03270169],
        [0.07695414, 0.04679213, 0.56867135, 1., 0.10247268, 0.03653419,
         0.03635696, 0.2443251, 0.2396715, 0.1190474, 0.8045795, 0.41822678],
    ])

    # HPCP matrix of a short reference song segment (3 frames), computed
    # with the essentia HPCP algorithm.
    reference_feature = array([
        [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
        [0.36084786, 0.37151814, 0.40913638, 0.15566002, 0.40571737, 1.,
         0.6263613, 0.65415925, 0.53127843, 0.7900088, 0.50427467, 0.51956046],
        [0.42861825, 0.36887613, 0.05665652, 0.20978431, 0.1992704, 0.14884946,
         1., 0.24148795, 0.43031794, 0.14265466, 0.17224492, 0.36498153],
    ])

    # Expected euclidean pairwise similarity matrix without binary
    # thresholding (pre-computed using a python script adopted from
    # https://github.com/albincorreya/ChromaCoverId/blob/master/cover_similarity_measures.py)
    expected_sim_matrix = [
        [1.432924, 1.5921365, 1.5593135],
        [1.5159905, 1.7596511, 1.5824637],
    ]

    # Expected euclidean pairwise similarity matrix with binary thresholding
    # where binarizePercentile=0.095, frameStackStride=1 and frameStackSize=1
    # (pre-computed using a python script adopted from
    # https://github.com/albincorreya/ChromaCoverId/blob/master/cover_similarity_measures.py)
    expected_sim_matrix_binary = [
        [1., 0., 0.],
        [0., 0., 0.],
    ]

    def testEmpty(self):
        # Empty query and reference features must make compute fail.
        self.assertComputeFails(CrossSimilarityMatrix(), [], [])

    def testRegressionStandard(self):
        # Non-binarized output is compared with the pre-computed matrix.
        algorithm = CrossSimilarityMatrix(binarize=False,
                                          frameStackStride=1,
                                          frameStackSize=1)
        computed = algorithm(self.query_feature, self.reference_feature)
        self.assertAlmostEqualMatrix(self.expected_sim_matrix, computed)

    def testRegressionBinary(self):
        # Binarized output is compared with the pre-computed binary matrix.
        algorithm = CrossSimilarityMatrix(binarize=True,
                                          binarizePercentile=0.095,
                                          frameStackStride=1,
                                          frameStackSize=1)
        computed = algorithm(self.query_feature, self.reference_feature)
        self.assertAlmostEqualMatrix(self.expected_sim_matrix_binary, computed)
def save_separated_audiofiles(self):
    """Write the percussive and harmonic signals to two audio files.

    The files are named ``<directory><filename>_percussive.<format>`` and
    ``<directory><filename>_harmonic.<format>``; ``self.directory`` is used
    as a plain string prefix, so it should end with "/" when it names a
    folder. When the directory contains a "/", every missing level of it
    is created first.

    A notice is printed when ``self.fsIsSpecified`` or
    ``self.formatIsSpecified`` equals the string "False".
    """
    # Check whether the directory exists and create it if needed.
    # os.makedirs creates all missing levels at once and, unlike splitting
    # on "/" and calling os.mkdir per level, also works for absolute paths
    # (whose first component is the empty string).
    if self.directory != "" and ("/" in self.directory):
        if not os.path.isdir(self.directory):
            os.makedirs(self.directory)

    # Notify user if sampling rate not specified
    if self.fsIsSpecified == "False":
        print(
            "Sample Rate not specified for writing the audio files. Assumed Fs (Hz) is "
            + str(self.fs))

    # Notify user if format not specified
    if self.formatIsSpecified == "False":
        print(
            "File format not specified for writing the audio files. Assumed format is "
            + str(self.format))

    # One writer per component, percussive first and harmonic second.
    components = (("_percussive.", self.x_p), ("_harmonic.", self.x_h))
    for suffix, signal in components:
        writer = es.MonoWriter(sampleRate=self.fs, format=self.format)
        writer.configure(filename=self.directory + self.filename + suffix + self.format)
        writer(array(signal))
def extract_features(x, M=Config.WINDOW_SIZE, N=Config.FFT_SIZE, H=Config.HOP_SIZE, fs=Config.FS,
                     window_type=Config.WINDOW_TYPE):
    '''
    Function that extracts spectrogram from an audio signal
    -----------------------
    Input: Samples, window size (int), FFT size (int), Hop size (int),
    Sampling rate, Window type (e.g. Hanning)

    Output: Spectrogram (one row per frame), power-law compressed with
    exponent 2/3 and cropped to the first N / 4 + 1 bins
    -----------------------
    '''
    # init functions and vectors
    x = essentia.array(x)
    spectrum = ess.Spectrum(size=N)
    window = ess.Windowing(size=M, type=window_type)

    # compute STFT: window each frame, then take its magnitude spectrum
    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True)
    SP = essentia.array([spectrum(window(frame)) for frame in frames])

    SP = np.power(SP, 2. / 3.)  # power law compression

    # Crop relative to the FFT size actually used (N), not the configured
    # default, so a caller-supplied N keeps the same proportion of bins.
    SP = SP[:, :int(N / 4 + 1)]
    return SP
def callback(in_data, frame_count, time_info, status):
    """Stream callback: buffer incoming audio and classify it when enabled.

    While ``identifying`` is false the raw chunk is returned untouched with
    ``pyaudio.paContinue``. Otherwise the chunk is read as 16-bit integers,
    scaled by 1/32767, optionally resampled, and appended to ``abuffer``.
    Once more than ``buffer_size`` chunks are buffered, the most recent
    ``buffer_size`` chunks are concatenated, MFCC statistics are computed
    and the prediction of ``model`` is printed.

    Returns:
        ``(in_data, pyaudio.paContinue)`` or, when the module-level ``stop``
        flag is set, ``(in_data, pyaudio.paComplete)``.
    """
    global abuffer, recording, identifying, resampling, \
        ncalls, act_status, collect, nturnoff, buffer_size
    global wf, ratei, odir, model, keys, filename

    # If there is nothing to do
    if not identifying:
        return (in_data, pyaudio.paContinue)

    # If there is something to do
    # np.frombuffer replaces the deprecated binary mode of np.fromstring.
    in_data = np.frombuffer(in_data, dtype=np.int16)
    in_data = in_data / 32767.0
    if resampling != 1.0:
        # Fixed: the original referenced the undefined name `resmpling`.
        # NOTE(review): the second argument is an array scaled by the
        # resampling factor; confirm what `resample` expects here (a sample
        # count or a rate seems more likely).
        in_data = resample(in_data, in_data * resampling)

    abuffer.append(in_data)
    if len(abuffer) > buffer_size:
        in_data = np.concatenate(abuffer[-buffer_size:])
        if identifying:
            # Here goes the code to:
            #   extract MFCCs
            #   extract statistics
            #   predict
            mfccs = []
            audio = essentia.array(in_data)
            for frame in FrameGenerator(audio, frameSize=2048, hopSize=1024):
                mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
                mfccs.append(mfcc_coeffs)
            mfccs = essentia.array(mfccs).T
            stats = Stats_Es(mfccs)
            print(stats.shape)
            print(model.predict(stats))
        abuffer.pop(len(abuffer) - buffer_size)

    if not stop:
        return (in_data, pyaudio.paContinue)
    else:
        return (in_data, pyaudio.paComplete)
def extract_features(x, M=WINDOW_SIZE, N=FFT_SIZE, H=HOP_SIZE, fs=SR, window_type=WINDOW_TYPE):
    '''
    Extract magnitude spectra from the input vector and apply
    power-law compression (exponent 2/3).

    x is framed with size M and hop H (starting from zero), each frame is
    windowed with ``window_type`` and transformed by a Spectrum of size N.
    ``fs`` is accepted but not used. The upper spectrum is currently NOT
    cut (that step is disabled below).
    '''
    # init functions and vectors
    x = essentia.array(x)
    spectrum = ess.Spectrum(size=N)
    # Honour the window_type argument; previously the module constant was
    # always used, silently ignoring what the caller passed.
    window = ess.Windowing(size=M, type=window_type)

    # compute STFT: window each frame, then take its magnitude spectrum
    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True)
    SP = essentia.array([spectrum(window(frame)) for frame in frames])

    SP = np.power(SP, 2. / 3.)  # power law compression
    #SP = SP[:,:int(FFT_SIZE/2+1)] #cut upper spectrum (above 4 khz)
    return SP
def extractHPCP(audiosignal, frameSize, hopSize, w, speaks, hpcp, signalname):
    """Return chromagram, spectrogram and summary statistics of a signal.

    ``w`` is the preconfigured windowing algorithm and ``hpcp`` the
    preconfigured HPCP algorithm. ``speaks`` is called on every frame
    spectrum and yields peak frequencies and magnitudes, which are fed to
    ``hpcp``. ``signalname`` is not used.

    Returns a 5-tuple: the transposed chromagram, the transposed
    spectrogram, the spectrum of the whole signal, and the mean and median
    of the transposed chromagram along axis 1.
    """
    signal = essentia.array(audiosignal)
    # TODO: not sure if this is necessary:
    odd_length = len(signal) % 2
    if odd_length:
        signal = signal[:-1]  # trim to an even number of samples

    spectrum = Spectrum()
    # NOTE(review): plain attribute assignment on the algorithm object;
    # verify that this really changes the peak picker's configuration.
    speaks.maxFrequency = hpcp.paramValue('maxFrequency')

    signal_spectrum = spectrum(signal)

    spectra, chromas = [], []
    for frame in FrameGenerator(signal, frameSize=frameSize, hopSize=hopSize):
        frame_spectrum = spectrum(w(frame))
        spectra.append(frame_spectrum)
        chromas.append(hpcp(*speaks(frame_spectrum)))

    spectrogram = essentia.array(spectra).T
    chromagram = essentia.array(chromas).T
    hpcp_mean = np.mean(chromagram, axis=1)
    hpcp_median = np.median(chromagram, axis=1)
    return chromagram, spectrogram, signal_spectrum, hpcp_mean, hpcp_median
def calc_chromagram(self):
    """Compute a 12-bin HPCP chromagram of ``self.audio``.

    Stores the per-frame HPCP vectors in ``self.chromagram`` (converted
    with ``array``) and the matching time axis, in seconds, in
    ``self.timeAxSec`` (frame index * hop_size / sample_rate).
    """
    self.chromagram = []

    hpcp_settings = dict(
        size=12,  # 12 bins; key estimation would need a higher resolution
        referenceFrequency=440,  # reference (tuning) frequency of 440
        bandPreset=False,
        weightType='cosine',
        nonLinear=False,
        windowSize=1.,
        sampleRate=self.sample_rate,
    )
    hpcp = es.HPCP(**hpcp_settings)
    spectrum = es.Spectrum(size=self.fft_size)
    spectral_peaks = es.SpectralPeaks(sampleRate=self.sample_rate)

    frame_source = es.FrameGenerator(self.audio,
                                     frameSize=self.frame_size,
                                     hopSize=self.hop_size,
                                     startFromZero=True)
    for raw_frame in frame_source:
        # Window the frame by multiplying with the stored window.
        windowed = array(raw_frame * self.window)
        peak_freqs, peak_mags = spectral_peaks(spectrum(windowed))
        self.chromagram.append(hpcp(peak_freqs, peak_mags))

    self.chromagram = array(self.chromagram)
    frame_indices = np.arange(len(self.chromagram))
    self.timeAxSec = frame_indices * self.hop_size / float(self.sample_rate)
def save_features(key, pool, mfcc, hpcp, tonnetz):
    """Saves the features into the specified pool under the given key.

    Every element of ``mfcc``, ``hpcp`` and ``tonnetz`` is converted with
    ``essentia.array`` and added to ``pool`` under ``key + ".mfcc"``,
    ``key + ".hpcp"`` and ``key + ".tonnetz"`` respectively, in that order.
    """
    # Plain loops instead of list comprehensions: the calls are made for
    # their side effect only, so no throwaway result lists are built.
    feature_groups = ((".mfcc", mfcc), (".hpcp", hpcp), (".tonnetz", tonnetz))
    for suffix, coefficients in feature_groups:
        for coefficient in coefficients:
            pool.add(key + suffix, essentia.array(coefficient))
def main():
    # Script entry point: load part of an audio file, apply the
    # transformations selected on the command line, and write CSV dumps
    # and plots under 'output/'.
    args = parse_args()
    metadata = get_metadata(args.filename)
    audio = load_partial_audio(args.filename, args.start_time, args.end_time)
    sample_seconds = get_duration(audio, metadata.sampleRate)
    audio = resample_audio(audio, metadata.sampleRate, PLOT_SAMPLE_RATE)
    # Optional: replace the samples with the second value returned by
    # energy_buckets.
    if args.energy_buckets:
        _, audio = energy_buckets(audio, sample_seconds)
        print "Created %d energy points" % len(audio)
        audio = essentia.array(audio)
    # Optional: use absolute values.
    if args.amplitude:
        print 'Amplitude'
        audio = essentia.array(numpy.absolute(audio))
    write_raw(audio, 'output/raw_audio.csv')
    # x axis: evenly spaced values from 0 to sample_seconds, one per point.
    x = essentia.array(linspace(0, sample_seconds, len(audio)))
    if args.spline:
        print 'Spline'
        f = get_spline_function(x, audio)
        # NOTE(review): the spline is evaluated at the values of `audio`,
        # not at the positions in `x` -- confirm this is intended.
        audio = essentia.array(numpy.vectorize(lambda x: f(x)[0])(audio))
    if args.moving_max:
        print 'Moving Max'
        audio = essentia.array(moving_max(audio, window_size=50))
    if args.moving_average:
        print 'Moving Average'
        audio = moving_average(audio, size=6)
    plot(x, audio)
    if args.find_peaks:
        # print beat ticks
        print 'Find Peaks'
        # find_peaks output is scaled by the largest x value before being
        # drawn as short red vertical lines.
        ticks = find_peaks(audio) * max(x)
        for tick in ticks:
            axvline(tick, ymin=0, ymax=0.1, color='red')
    if args.gradient:
        print 'Gradient'
        gradient = get_gradient(audio)
        write_raw(gradient, 'output/gradient.csv')
        plot(x, gradient, color="yellow")
    # Save and close the waveform figure before the optional spectrum plot.
    savefig('output/waveform')
    close()
    if args.spectrum:
        print 'Spectrum'
        spec = spectrum(audio)
        write_raw(spec, 'output/raw_spec.csv')
        plot(arange(len(spec)), spec)
        savefig('output/spectrum')
        close()
def compute(audio):
    """Pass the audio through a ``Whitener`` built from the options.

    Sample rate, frame size and hop size are read from ``conf.opts``; the
    number of peaks comes from ``opts["numPeaks"]``. The whitened signal
    is returned as an essentia array.
    """
    signal = essentia.array(audio)

    settings = conf.opts
    sampleRate = int(settings['sampleRate'])
    frameSize = int(settings['frameSize'])
    hopSize = int(settings['hopSize'])
    # The three values below are read and computed but not used further.
    zeroPadding = int(settings['zeroPadding'])
    windowType = settings['windowType']
    frameRate = float(sampleRate) / float(hopSize)

    whitener = Whitener(sampleRate=sampleRate,
                        peaksNumber=opts["numPeaks"],
                        hopSize=hopSize,
                        frameSize=frameSize)
    whitened = whitener(signal)
    return essentia.array(whitened)
def extractEnvelopeSegments(audio):
    """Split ``audio`` into two slices around its first detected peak.

    Peaks are ordered by amplitude; the first returned position is
    multiplied by the signal duration to obtain the split time. The first
    slice runs from 0 to the split time, the second from the split time to
    the end. Both slices are passed through ``ensureEven`` before being
    returned.
    """
    peak_detector = PeakDetection(orderBy='amplitude')
    duration = Duration()

    peak_positions, _ = peak_detector(audio)
    split_time = peak_positions[0] * duration(audio)
    total_time = duration(audio)

    start_times = essentia.array([0, split_time])
    end_times = essentia.array([split_time, total_time])
    segments = Slicer(startTimes=start_times, endTimes=end_times)(audio)

    # XXX: ugly
    head = ensureEven(segments[0])
    tail = ensureEven(segments[1])
    return head, tail
def compute(audio, pool, options):
    """Compute the relative inter-onset-interval (IOI) distribution.

    Reads ``rhythm.bpm`` and ``rhythm.onset_times`` from ``pool``. The
    intervals between each onset and the 1st to 4th preceding onset are
    quantised to 1/32 of a beat and histogrammed. Two descriptors are added
    to ``pool`` under the module-level ``namespace``:

    * ``relative_ioi``: (position in beats, normalised count) pairs for the
      first 5 beats of the histogram;
    * ``relative_ioi_peaks``: up to 5 (position in beats, normalised
      magnitude) pairs found by peak detection on the histogram.

    When the tempo is not positive or fewer than two onsets are available,
    both descriptors are set to ``[0.0]``.

    ``audio`` and ``options`` are accepted for interface compatibility but
    not used.
    """
    INFO('Computing Inter Onsets Intervals...')

    bpm = pool.value('rhythm.bpm')
    onsets = pool.value('rhythm.onset_times')

    # special case (bpm == 0 is included: it would divide by zero below)
    if bpm <= 0 or len(onsets) < 2:
        pool.add(namespace + '.' + 'relative_ioi_peaks', [float()])
        pool.add(namespace + '.' + 'relative_ioi', [float()])
        INFO('100% done...')
        return

    # 32th note interval; kept as a float so that p / interp below is a
    # true division on every Python version
    interp = 32.
    interval = (60. / bpm) / interp

    # Quantised intervals between each onset and its 1st..4th predecessor.
    # int() is required because bincount only accepts integer input.
    riois = []
    for lag in range(1, 5):
        for i in range(lag, len(onsets)):
            riois.append(int(round((onsets[i] - onsets[i - lag]) / interval)))

    ioidist = essentia.array(bincount(riois))
    total = sum(ioidist)  # hoisted: previously recomputed for every bin

    fullioidist = essentia.array(
        [[p / interp, ioi / total] for p, ioi in enumerate(ioidist)])
    # keep the first 5 beats; the slice bound must be an integer
    fullioidist = fullioidist[0:int(interp * 5)]

    peak_detection = essentia.PeakDetection(minPosition=0.,
                                            maxPosition=len(ioidist),
                                            maxPeaks=5,
                                            range=len(ioidist) - 1.,
                                            interpolate=True,
                                            orderBy='amplitude')
    pos, mags = peak_detection(ioidist)

    # scale back to 1 beat
    pos = [p / interp for p in pos]

    # ratio across whole distribution surface
    mags = [mag / total for mag in mags]

    # add to pool (explicit list so it also works where zip is lazy)
    pool.add(namespace + '.' + 'relative_ioi_peaks',
             essentia.array(list(zip(pos, mags))))
    pool.add(namespace + '.' + 'relative_ioi', fullioidist)

    INFO('100% done...')
def handle(self, audio: np.array):
    """Scale the samples where the two stereo channels differ.

    ``audio`` is split with ``self.to_stereo``. A per-sample gain is built
    as ``self.mul * (equal - 1) + 1``, where ``equal`` is the element-wise
    comparison of the two channels: the gain is 1 where the channels
    match and ``1 - self.mul`` where they do not. Both channels are
    multiplied by that gain, converted with ``essentia.array`` and merged
    with ``self.to_mono``.
    """
    left, right = self.to_stereo(audio)

    channels_equal = (left == right)
    base = np.array([self.mul] * len(audio))
    gain = (base * (channels_equal - 1)) + 1

    scaled_left = essentia.array(left * gain)
    scaled_right = essentia.array(right * gain)
    return self.to_mono(scaled_left, scaled_right)
def extract_mfcc(audio):
    """Return the MFCC coefficients of ``audio``, one column per frame.

    Frames of 2048 samples with a hop of 1024 are windowed with a
    'blackmanharris62' window, transformed to a spectrum and passed to
    MFCC; only the coefficients (second output) are kept. The stacked
    result is transposed before being returned.
    """
    apply_window = Windowing(type='blackmanharris62')
    to_spectrum = Spectrum()
    mfcc_algo = essentia.standard.MFCC()

    samples = essentia.array(audio)
    coefficients = []
    for frame in FrameGenerator(samples, frameSize=2048, hopSize=1024):
        _, frame_coeffs = mfcc_algo(to_spectrum(apply_window(frame)))
        coefficients.append(frame_coeffs)

    return essentia.array(coefficients).T
def extract_mfcc(audio):
    """Compute frame-wise MFCC coefficients and return them transposed.

    The signal is cut into 2048-sample frames (hop 1024); every frame goes
    through a 'blackmanharris62' window, a Spectrum and an MFCC algorithm.
    The band energies returned by MFCC are discarded.
    """
    window = Windowing(type='blackmanharris62')
    spectrum = Spectrum()
    mfcc = essentia.standard.MFCC()

    signal = essentia.array(audio)
    frames = FrameGenerator(signal, frameSize=2048, hopSize=1024)
    # Index 1 of the MFCC output holds the coefficients.
    per_frame = [mfcc(spectrum(window(frame)))[1] for frame in frames]
    return essentia.array(per_frame).T
def compute(features, opt):
    """Return the onset times found in one or several detection functions.

    The frame rate is ``opt["sampleRate"] / opt["hopSize"]``; the delay is
    half of ``conf.opts["doubleOnsetT"]`` expressed in frames. ``alpha``
    and the silence threshold come from the module-level ``opts``.

    If the first element of ``features`` is a list or ndarray, ``features``
    is treated as a collection of detection functions, each with weight 1;
    otherwise it is treated as a single detection function.
    """
    frameRate = opt["sampleRate"] / opt["hopSize"]
    delay = int(conf.opts["doubleOnsetT"] / 2.0 * frameRate)
    onsets = Onsets(frameRate=frameRate,
                    alpha=opts["alpha"],
                    delay=delay,
                    silenceThreshold=opts["silenceThresh"])

    several_functions = isinstance(features[0], (list, np.ndarray))
    if several_functions:
        weights = essentia.array([1] * len(features))
        matrix = essentia.array(features)
    else:
        matrix = essentia.array([features])
        weights = essentia.array([1])

    return list(onsets(matrix, weights))
def compute(audio):
    """ compress/expand the audio with the configured wave shaper

    The look-up table points come from opts["LUTx"] / opts["LUTy"]; the
    "normalize" and "spline" options are passed on as booleans.
    """
    lut_x = essentia.array(opts["LUTx"])
    lut_y = essentia.array(opts["LUTy"])
    shaper = waveshaper(xPoints=lut_x,
                        yPoints=lut_y,
                        normalize=bool(opts["normalize"]),
                        spline=bool(opts["spline"]))
    return shaper(essentia.array(audio))
def extractOnsets(audio):
    """Slice ``audio`` at its HFC onsets and write the first slice to disk.

    Onset detection functions ('hfc' and 'complex') are computed on
    1024-sample frames with a hop of 512. The HFC onsets are used as slice
    start times; each slice ends at the next onset, the last one at the
    end of the signal. The first slice is written to ``out<counter>.wav``
    (1 channel, 44100 Hz) and the module-level ``counter`` is incremented.

    Returns:
        The slices produced by ``Slicer``.
    """
    global counter

    od1 = OnsetDetection(method='hfc')
    od2 = OnsetDetection(method='complex')

    # let's also get the other algorithms we will need, and a pool to store the results
    w = Windowing(type='hann')
    fft = FFT()  # this gives us a complex FFT
    c2p = CartesianToPolar()  # and this turns it into a pair (magnitude, phase)
    pool = essentia.Pool()

    # Phase 1: compute the onset detection functions
    for frame in FrameGenerator(audio, frameSize=1024, hopSize=512):
        mag, phase, = c2p(fft(w(frame)))
        pool.add('features.hfc', od1(mag, phase))
        pool.add('features.complex', od2(mag, phase))

    # Phase 2: compute the actual onsets locations
    onsets = Onsets()
    # this algo expects a matrix, not a vector; weights must be given, but
    # as there is only a single function the value does not matter
    onsets_hfc = onsets(array([pool['features.hfc']]), [1])

    # The complex-domain onsets are computed as an example only; they are
    # not used for slicing.
    onsets_complex = onsets(array([pool['features.complex']]), [1])

    startTimes = onsets_hfc
    endTimes = np.append(onsets_hfc[1:], Duration()(audio))

    slicer = Slicer(startTimes=array(startTimes), endTimes=array(endTimes))
    frames = slicer(audio)

    out_file = Sndfile('out' + str(counter) + '.wav', 'w', Format('wav'), 1, 44100)
    counter = counter + 1
    try:
        out_file.write_frames(np.asarray(frames[0]))
    finally:
        # Always release the file handle, even if writing fails.
        out_file.close()

    return frames
def compute(audio):
    """ filters out maxs values corresponding to harmonic part

    When opts['ratio'] is 0 the input is returned unchanged (as an essentia
    array). Otherwise the signal is analysed frame by frame (hop = a
    quarter of the frame size), converted to magnitude/phase and back, and
    overlap-added into the output.

    NOTE(review): the line that would zero the bins above the
    ratio-derived floor is commented out, so the spectrum is currently
    resynthesised unmodified.
    """
    audio = essentia.array(audio)
    if opts['ratio'] == 0:
        return audio

    frameSize = int(conf.opts['frameSize'])
    # Integer division: the hop size is used as a frame step and as a
    # slice index, so it must stay an int under true division as well.
    hopSize = frameSize // 4
    zeroPadding = int(conf.opts['zeroPadding'])
    windowType = conf.opts['windowType']

    frames = FrameGenerator(audio=audio, frameSize=frameSize, hopSize=hopSize)
    window = Windowing(size=frameSize, zeroPadding=zeroPadding, type=windowType)
    fft = FFT()
    ifft = IFFT()
    cartesian2polar = CartesianToPolar()
    polar2cartesian = PolarToCartesian()

    audioout = np.zeros(len(audio))
    start_of_frame = 0
    for frame in frames:
        complex_fft = fft(window(frame))
        (spectrum, phase) = cartesian2polar(complex_fft)

        # Magnitude floor for the (currently disabled) thresholding below.
        sortedS = np.sort(spectrum)
        minFloor = sortedS[int(len(sortedS) * (1. - opts["ratio"]))]
        #spectrum = essentia.array([0 if x>minFloor else x for x in spectrum ])

        complex_fft = polar2cartesian(spectrum, phase)
        outf = ifft(complex_fft) * .5 * hopSize

        # Overlap-add; the first frame and frames reaching the end of the
        # signal are skipped.
        if start_of_frame + frameSize < len(audio) and start_of_frame > 0:
            audioout[start_of_frame:start_of_frame + frameSize] += window(outf)
        start_of_frame += hopSize

    return essentia.array(audioout)
def extractEnvelopeSegments(audio):
    """Cut ``audio`` in two at the first peak reported by PeakDetection.

    Peak positions are ordered by amplitude; the first one, multiplied by
    the duration of the signal, is the cut time. Returns the two slices
    (before and after the cut), each passed through ``ensureEven``.
    """
    find_peaks = PeakDetection(orderBy='amplitude')
    duration = Duration()

    positions, _ = find_peaks(audio)
    cut_time = positions[0] * duration(audio)
    end_time = duration(audio)

    slicer = Slicer(startTimes=essentia.array([0, cut_time]),
                    endTimes=essentia.array([cut_time, end_time]))
    pieces = slicer(audio)

    # XXX: ugly
    return ensureEven(pieces[0]), ensureEven(pieces[1])
def compute(audio):
    """ compress/expand

    Builds a wave shaper from the look-up table in opts["LUTx"] and
    opts["LUTy"], with the "normalize" and "spline" options coerced to
    booleans, and applies it to the audio.
    """
    shaper_settings = {
        "xPoints": essentia.array(opts["LUTx"]),
        "yPoints": essentia.array(opts["LUTy"]),
        "normalize": bool(opts["normalize"]),
        "spline": bool(opts["spline"]),
    }
    shape = waveshaper(**shaper_settings)
    shaped = shape(essentia.array(audio))
    return shaped
def compute(audio):
    """ filters out maxs values corresponding to harmonic part

    Returns the input unchanged (as an essentia array) when opts['ratio']
    is 0. Otherwise each frame (hop = frameSize // 4) is windowed,
    transformed to magnitude/phase, transformed back and overlap-added
    into an output buffer of the same length as the input.

    NOTE(review): the thresholding of the spectrum against ``minFloor`` is
    commented out, so no bins are actually removed at the moment.
    """
    audio = essentia.array(audio)
    if opts['ratio'] == 0:
        return audio

    frameSize = int(conf.opts['frameSize'])
    # Floor division keeps the hop an integer; a float hop cannot be used
    # as a slice index or frame step when "/" is true division.
    hopSize = frameSize // 4
    zeroPadding = int(conf.opts['zeroPadding'])
    windowType = conf.opts['windowType']

    frames = FrameGenerator(audio=audio, frameSize=frameSize, hopSize=hopSize)
    window = Windowing(size=frameSize, zeroPadding=zeroPadding, type=windowType)
    fft = FFT()
    ifft = IFFT()
    cartesian2polar = CartesianToPolar()
    polar2cartesian = PolarToCartesian()

    audioout = np.zeros(len(audio))
    start_of_frame = 0
    for frame in frames:
        windowed_frame = window(frame)
        (spectrum, phase) = cartesian2polar(fft(windowed_frame))

        # Floor used by the disabled thresholding step below.
        sortedS = np.sort(spectrum)
        minFloor = sortedS[int(len(sortedS) * (1. - opts["ratio"]))]
        #spectrum = essentia.array([0 if x>minFloor else x for x in spectrum ])

        outf = ifft(polar2cartesian(spectrum, phase)) * .5 * hopSize

        # Only frames that lie strictly inside the signal are accumulated.
        fits_inside = start_of_frame + frameSize < len(audio)
        if fits_inside and start_of_frame > 0:
            audioout[start_of_frame:start_of_frame + frameSize] += window(outf)
        start_of_frame += hopSize

    return essentia.array(audioout)
def getOnsetFunctions(fname):
    """Compute band-wise spectral-flux onset functions for an audio file.

    The file is loaded with ``MonoLoader`` and cut into frames of
    ``params.frmSize`` samples (hop ``params.hop``). Each frame is
    zero-padded to ``params.Nfft``, Hamming-windowed and transformed; the
    positive part of the magnitude difference to the previous frame is
    summed inside each band of ``params.fBands``.

    Returns:
        Array with one row per frame: the frame centre time followed by one
        flux value per band.
    """
    logger = log.get_logger("rhythm")
    zeropadLen = params.Nfft - params.frmSize
    zz = np.zeros((zeropadLen, ), dtype='float32')
    frameCounter = 0
    # Floor division: an array shape must be an integer.
    bufferFrame = np.zeros((params.Nfft // 2 + 1, ))

    logger.info('Reading audio file...')
    audio = ess.MonoLoader(filename=fname)()
    fft = ess.FFT(size=params.Nfft)  # this gives us a complex FFT
    c2p = ess.CartesianToPolar()  # and this turns it into a pair (magnitude, phase)
    pool = es.Pool()
    w = ess.Windowing(type="hamming")
    fTicks = params.fTicks
    poolName = 'features.flux'

    logger.info('Extracting Onset functions...')
    for frame in ess.FrameGenerator(audio,
                                    frameSize=params.frmSize,
                                    hopSize=params.hop):
        # Centre time of the frame in seconds; float() guards against
        # integer division truncating hop / Fs to zero.
        frmTime = (params.hop / float(params.Fs) * frameCounter
                   + params.frmSize / (2.0 * params.Fs))
        zpFrame = np.hstack((frame, zz))
        mag, phase, = c2p(fft(w(zpFrame)))
        magFlux = mag - bufferFrame
        bufferFrame = np.copy(mag)  # Copying for the next iteration to compute flux

        for bands in range(params.numBands):
            chosenInd = ((fTicks >= params.fBands[bands, 0])
                         & (fTicks <= params.fBands[bands, 1]))
            magFluxBand = magFlux[chosenInd]
            # Half-wave rectification: keep only increases in magnitude.
            magFluxBand = (magFluxBand + abs(magFluxBand)) / 2
            oFn = magFluxBand.sum()
            if (math.isnan(oFn)):
                print("NaN found here")
            pool.add(poolName + str(bands), oFn)

        pool.add('features.time', frmTime)
        frameCounter += 1
        if not np.mod(frameCounter, 10000):
            logger.info(
                str(frameCounter) + '/' + str(audio.size / params.hop) + '...')

    logger.info('Total frames processed = ' + str(frameCounter))

    # Stack the time stamps and every band's flux as rows, then transpose
    # so that each row corresponds to one frame.
    all_feat = es.array([pool['features.time']])
    for bands in range(params.numBands):
        feat_flux = es.array([pool[poolName + str(bands)]])
        all_feat = np.vstack((all_feat, feat_flux))
    return np.transpose(all_feat)
def main():
    """Plot the energy curve of every file given on the command line.

    Each file is partially loaded, resampled to PLOT_SAMPLE_RATE and
    reduced with ``energy_buckets``; the resulting (x, energy) pairs are
    handed to ``plotting.plot_lines`` together with the file names and the
    output argument.
    """
    args = parse_args()
    curves = []
    for path in args.filename:
        info = get_metadata(path)
        samples = load_partial_audio(path, args.start_time, args.end_time)
        seconds = get_duration(samples, info.sampleRate)
        samples = resample_audio(samples, info.sampleRate, PLOT_SAMPLE_RATE)
        _, energy = energy_buckets(samples, seconds)
        # Evenly spaced x values from 0 to the duration, one per point.
        time_axis = essentia.array(linspace(0, seconds, len(energy)))
        print("Created %d energy points" % len(energy))
        curves.append((time_axis, essentia.array(energy)))
    plotting.plot_lines(curves, args.filename, args.output)
def algorithm_durations(sound):
    """
    Returns the duration of a file according to its length in number of
    samples and according to an envelope computation (See FFont ismir
    paper TODO: cite correctly).

    The effective duration spans from the first to the last sample whose
    envelope is at least 5% of the envelope maximum.

    :param sound: sound dictionary from dataset
    :return: dictionary with results per different methods
    """
    sample_rate = 44100
    n_channels = 1

    # NOTE: load_audio_file will resample to the given sample_rate and downmix to mono
    audio = load_audio_file(file_path=sound[SOUND_FILE_KEY], sample_rate=sample_rate)
    length_samples = len(audio)
    duration = float(length_samples) / (sample_rate * n_channels)

    # Effective duration
    envelope = estd.Envelope(attackTime=10, releaseTime=10)(essentia.array(audio))
    threshold = envelope.max() * 0.05
    indices_above = np.where(envelope >= threshold)[0]
    first_above = indices_above[0]
    last_above = indices_above[-1]

    durations = {
        'duration': duration,
        'length_samples': length_samples,
        'length_samples_effective_duration': last_above - first_above,
        'start_effective_duration': first_above,
        'end_effective_duration': last_above,
    }
    return {'durations': durations}
def serra_cover_similarity_measures(input_crp, disOnset=0.5, disExtension=0.5, simType='qmax'):
    """
    Computes the cover song similarity distance using Smith-Waterman local
    alignment on a cross recurrence plot, as described in [1] (qmax) and
    [2] (dmax).

    [1]. Serra, J., Serra, X., & Andrzejak, R. G. (2009). Cross recurrence
         quantification for cover song identification. New Journal of
         Physics, 11.
    [2]. Chen, N., Li, W., & Xiao, H. (2017). Fusing similarity functions
         for cover song identification. Multimedia Tools and Applications.

    Input:
        input_crp: 2-d binary matrix of the cross recurrence plot (x-axis
                   query song and y-axis reference song)
    Params:
        disOnset: penalty for a disruption onset
        disExtension: penalty for a disruption extension
        simType: ['qmax', 'dmax']
    Return:
        cover similarity distance: sqrt(number of columns of input_crp)
        divided by the maximum of the alignment score matrix

    NOTE: CoverSongSimilarity algo will be available soon in the new
    essentia release
    """
    aligner = CoverSongSimilarity(disOnset=disOnset,
                                  disExtension=disExtension,
                                  simType=simType)
    score_matrix = aligner.compute(array(input_crp))
    length_norm = np.sqrt(input_crp.shape[1])
    best_score = np.max(score_matrix)
    return np.divide(length_norm, best_score)
def compute_essentia_descriptors(audio_segment, actual_bar_beg, actual_bar_end):
    """
    Computes the values of selected descriptors in the given audio segment.

    Returns a tuple ``(mfccs_bar, bark_vector, onset_rate, bar_dissonance)``:
    the per-frame MFCC coefficients, the 27 bark-band values averaged over
    all frames, the number of HFC onsets divided by
    ``actual_bar_end - actual_bar_beg``, and the mean frame dissonance.
    """
    frames = FrameGenerator(audio_segment, frameSize=frameSize, hopSize=hopSize)
    mfccs_bar = []
    bark_vector = [0] * 27
    pool = essentia.Pool()
    total_frames = frames.num_frames()

    for frame in frames:
        frame_windowed = window(frame)
        frame_spectrum = spectrum(frame_windowed)
        (frame_frequencies, frame_magnitudes) = spectralPeaks(frame_spectrum)

        # Onset detection function from the complex FFT of the same frame.
        mag, phase, = c2p(fft(frame_windowed))
        pool.add('onsets.hfc', od(mag, phase))

        pool.add('dissonance', dissonance(frame_frequencies, frame_magnitudes))
        # pool.add('zerocrossingrate', zerocrossingrate(frame))

        # Reuse the spectrum computed above instead of windowing and
        # transforming the frame a second time.
        mfcc_bands, mfcc_coeffs = mfcc(frame_spectrum)
        mfccs_bar.append(mfcc_coeffs)

        # Running average of the bark bands over all frames.
        frame_barkbands = barkbands(frame_spectrum)
        for i in range(27):
            bark_vector[i] += frame_barkbands[i] / total_frames

    onsets_hfc = onsets(essentia.array([pool['onsets.hfc']]), [1])
    onset_rate = float(len(onsets_hfc)) / (actual_bar_end - actual_bar_beg)
    bar_dissonance = mean(pool["dissonance"])
    return mfccs_bar, bark_vector, onset_rate, bar_dissonance
def tuningSystemFeatures(pool, namespace=''):
    """Derive tuning-system descriptors from HPCPs stored in ``pool``.

    Expects the tonal descriptors ``hpcp_highres`` and ``hpcp`` to be in
    the pool under ``tonal.`` (or ``<namespace>.tonal.`` when a namespace
    is given). Sets, under the same prefix:

    * ``tuning_diatonic_strength``
    * ``tuning_equal_tempered_deviation``
    * ``tuning_nontempered_energy_ratio``
    * ``thpcp``: the normalised mean HPCP rotated so that its maximum bin
      comes first.
    """
    tonalspace = 'tonal.'
    if namespace:
        tonalspace = namespace + '.tonal.'

    # 1- diatonic strength
    hpcp_highres = normalize(numpy.mean(pool[tonalspace + 'hpcp_highres'], 0))
    _, _, strength, _ = standard.Key(profileType='diatonic')(hpcp_highres)
    pool.set(tonalspace + 'tuning_diatonic_strength', strength)

    # 2- high resolution features
    eqTempDeviation, ntEnergy, _ = standard.HighResolutionFeatures()(hpcp_highres)
    pool.set(tonalspace + 'tuning_equal_tempered_deviation', eqTempDeviation)
    pool.set(tonalspace + 'tuning_nontempered_energy_ratio', ntEnergy)

    # 3- THPCP
    hpcp = normalize(numpy.mean(pool[tonalspace + 'hpcp'], 0))
    idx = numpy.argmax(hpcp)
    # Rotate left by idx so the strongest bin lands at index 0. numpy.roll
    # always returns a new array, so the result is correct whether
    # `normalize` returns a list or an ndarray (for an ndarray, `hpcp[:]`
    # is a view and an in-place rotation would read overwritten values).
    thpcp = numpy.roll(hpcp, -idx)
    pool.set(tonalspace + 'thpcp', essentia.array(thpcp))
def process(self, frames, eod=False):
    """Append the dissonance of ``frames`` to ``self.dissonance``.

    Unless ``eod`` is set, the squeezed frame is windowed, its spectrum and
    spectral peaks are computed, and the dissonance of those peaks is
    stored. ``frames`` and ``eod`` are always returned unchanged.
    """
    if eod:
        return frames, eod

    windowed = self.windower(essentia.array(frames.squeeze()))
    peak_freqs, peak_mags = self.spec_peaks_alg(self.spec_alg(windowed))
    self.dissonance.append(self.dissonance_alg(peak_freqs, peak_mags))
    return frames, eod
def callback(data):
    """Refresh the audio buffer from ``data`` and run the network.

    ``data`` is unpacked as ``bufferSize`` floats (struct format 'f') and
    written in place into the module-level ``buffer``; ``vimp`` is then
    reset and run.
    """
    # update audio buffer
    sample_format = 'f' * bufferSize
    samples = unpack(sample_format, data)
    buffer[:] = array(samples)

    # generate predictions
    reset(vimp)
    run(vimp)
def extractBeats(self, fileName):
    """Use a beattracker to return beat locations

    :param fileName: the file to load and extract beats from
    :return:
        ticks: the times in the file (None when no file name is given)
        slices: the segmented audio units (None when no file name is given)
        fileName: pass out the filename again
    """
    if not fileName:
        return None, None, fileName

    beatTracker = essentia.standard.BeatTrackerDegara()
    audio = self.loadAudio(fileName)
    ticks = beatTracker(audio)
    # Segmentation is delegated to self.slice; the end times that were
    # once prepared for an essentia Slicer are no longer needed.
    slices = self.slice(ticks, audio)
    return ticks, slices, fileName
def loadAudio(self, filename):
    """Load audio from a filename and return the audio vector

    :param filename: input filename
    :return: audio signal as an essentia array, or None when no filename
        is given
    """
    if not filename:
        return None

    # Essentia's MonoLoader has a bug that doesn't close files, which
    # causes problems when processing a large number of files, so the
    # file is read with librosa instead (sr=None is passed to the loader).
    samples, _ = librosa.load(filename, sr=None)
    return essentia.array(samples)
def mean_scope(self, scopeFrom, scopeTo):
    """Average every descriptor of the current namespace over a scope.

    For each descriptor, the values whose scope lies entirely inside
    ``[scopeFrom, scopeTo]`` are averaged along axis 0. A descriptor with
    exactly one value is treated as global and returned as is. Descriptors
    with no value in the scope are left out of the result. When the values
    are not numeric, the first value in the scope is returned instead.

    Returns:
        dict mapping descriptor name to its mean (or representative) value.
    """
    descriptors_mean = {}
    # Iterate the descriptors of the current namespace. The original
    # iterated the top-level keys of self.descriptors (the namespaces) and
    # then used them to index inside the current namespace.
    namespace_descriptors = self.descriptors.get(self.__currentNamespace, {})
    for key, descriptor in namespace_descriptors.items():
        values = descriptor['values']

        # Global descriptor (should also check that scope spans the entire file)
        if len(values) == 1:
            descriptors_mean[key] = values[0]
            continue

        values_in_scope = [
            value
            for scope, value in zip(descriptor['scopes'], values)
            if scope[0] >= scopeFrom and scope[1] <= scopeTo
        ]
        if not values_in_scope:
            continue

        try:
            descriptors_mean[key] = essentia.array(
                numpy.mean(values_in_scope, axis=0))
        except TypeError:
            # values are not numeric
            descriptors_mean[key] = values_in_scope[0]

    return descriptors_mean
def get_onsets(self, in_filename):
    """Detect onsets in an audio file using the 'rms' detection function.

    The file's sample rate (as reported by ``AudioLoader``) is stored in
    ``self.sampleRate``. The number of onsets is printed and the onset
    positions returned by ``Onsets`` are handed back to the caller.
    """
    # Load the audio (in mono); AudioLoader is only used for the sample rate
    # NOTE(review): MonoLoader is called without a sampleRate argument --
    # confirm the loaded audio matches the rate stored on self.
    _, file_sample_rate, _ = AudioLoader(filename=in_filename)()
    audio = MonoLoader(filename=in_filename)()
    self.sampleRate = file_sample_rate

    # 1) Compute onset detection functions
    detect = OnsetDetection(method='rms')
    window = Windowing(type='hann')
    fft = FFT()
    to_polar = CartesianToPolar()
    pool_features = Pool()

    frames = FrameGenerator(audio, frameSize=self.frame_size, hopSize=self.hop_size)
    for frame in frames:
        magnitude, phase = to_polar(fft(window(frame)))
        pool_features.add('features.rms', detect(magnitude, phase))

    # 2) Compute the onset locations
    locate_onsets = Onsets(silenceThreshold=0.14, delay=10)
    detection_matrix = array([pool_features['features.rms']])
    onsets_rms = locate_onsets(detection_matrix, [1])

    print("Num onsets: " + str(len(onsets_rms)))
    return onsets_rms
def extractor(filename):
    """Compute HTK-style MFCCs for an audio file and display them.

    The file is loaded as mono at 44100 Hz and scaled by 2**15. Each
    1102-sample frame (hop 441) is mean-removed, pre-emphasised,
    Hamming-windowed with zero padding up to 2048 samples, and passed
    through Essentia's ``Spectrum`` and ``MFCC``. The coefficients are
    plotted with one column per frame, leaving out the first coefficient.

    :param filename: path of the audio file to analyse
    :return: None; the result is only plotted
    """
    PREEMPH = 0.97
    fs = 44100
    audio = ess.MonoLoader(filename=filename, sampleRate=fs)()
    # dynamic range expansion as done in HTK implementation
    audio = audio * 2 ** 15

    frameSize = 1102  # corresponds to htk default WINDOWSIZE = 250000.0
    hopSize = 441  # corresponds to htk default TARGETRATE = 100000.0
    fftSize = 2048
    spectrumSize = fftSize // 2 + 1
    zeroPadding = fftSize - frameSize

    w = ess.Windowing(type='hamming',  # corresponds to htk default USEHAMMING = T
                      size=frameSize,
                      zeroPadding=zeroPadding,
                      normalized=False,
                      zeroPhase=False)
    spectrum = ess.Spectrum(size=fftSize)
    mfcc_htk = ess.MFCC(inputSize=spectrumSize,
                        type='magnitude',  # htk uses mel filterbank magnitude
                        warpingFormula='htkMel',  # htk's mel warping formula
                        weighting='linear',  # filter weights computed in the Hz domain
                        highFrequencyBound=8000,  # corresponds to htk default
                        lowFrequencyBound=0,  # corresponds to htk default
                        numberBands=26,  # corresponds to htk default NUMCHANS = 26
                        numberCoefficients=13,
                        normalize='unit_max',  # htk filter normalisation to constant height = 1
                        dctType=3,  # htk uses DCT type III
                        logType='log',
                        liftering=22)  # corresponds to htk default CEPLIFTER = 22

    # First-order pre-emphasis y[n] = x[n] - PREEMPH * x[n-1]. The previous
    # numerator [1 - PREEMPH] was a single tap, i.e. a plain gain of 0.03.
    preemph_filter = ess.IIR(numerator=[1, -PREEMPH])

    mfccs = []
    # startFromZero = True, validFrameThresholdRatio = 1 : the way htk
    # computes windows
    for frame in ess.FrameGenerator(audio, frameSize=frameSize,
                                    hopSize=hopSize, startFromZero=True,
                                    validFrameThresholdRatio=1):
        frame = frame - np.mean(frame)  # if ENORMALISE = T
        # if PREEMPHASIS needed: duplicate the first sample so the first
        # sample kept after filtering becomes (1 - PREEMPH) * frame[0]; the
        # filter output for the duplicate itself is discarded.
        frame_doubled_first = np.insert(frame, 0, frame[0])
        preemph_frame = preemph_filter(frame_doubled_first)
        frame = preemph_frame[1:]

        spect = spectrum(w(frame))
        mel_bands, mfcc_coeffs = mfcc_htk(spect)
        mfccs.append(mfcc_coeffs)

    # transpose to have it in a better shape
    # we need to convert the list to an essentia.array first
    # (== numpy.array of floats)
    mfccs = essentia.array(mfccs).T

    # and plot
    plt.imshow(mfccs[1:, :], aspect='auto', interpolation='none')  # ignore energy
    plt.show()  # unnecessary if you started "ipython --pylab"
def file_to_hpcp(loop):
    """Return the mean HPCP of a signal, scaled so its largest bin is 1.

    The signal is analysed in 1024-sample frames with a hop of 512: each
    frame is windowed (blackmanharris62), turned into a spectrum, reduced
    to its spectral peaks and converted to an HPCP vector. The per-frame
    vectors are averaged and divided by their maximum.

    :param loop: audio samples (anything ``e.array`` accepts)
    :return: the normalised mean HPCP vector
    """
    samples = e.array(loop)
    window = es.Windowing(type='blackmanharris62')
    spectrum = es.Spectrum()
    peak_picker = es.SpectralPeaks(orderBy='magnitude',
                                   magnitudeThreshold=0.001,
                                   maxPeaks=20,
                                   minFrequency=20,
                                   maxFrequency=8000)
    hpcp = es.HPCP(maxFrequency=8000)

    frame_hpcps = []
    for frame in es.FrameGenerator(samples, frameSize=1024, hopSize=512):
        frequencies, magnitudes = peak_picker(spectrum(window(frame)))
        frame_hpcps.append(hpcp(frequencies, magnitudes))

    averaged = np.mean(np.array(frame_hpcps).T, axis=1)
    # normalize to 1
    return averaged / averaged.max()
def clicke(length):
    """Build a click: a unit impulse shaped by an Essentia ``Envelope``.

    The buffer holds ``int(44100 * (length / 1000.))`` zeros with the first
    sample set to 1.0 (presumably ``length`` is in milliseconds at
    44100 Hz; confirm with callers). The envelope uses ``attackTime=0`` and
    ``releaseTime=length / 6``.

    :param length: duration value controlling buffer size and release time
    :return: the enveloped impulse
    """
    sample_count = int(44100 * (length / 1000.))
    impulse = np.zeros(sample_count)
    impulse[0] = 1.0

    shaper = Envelope()
    shaper.configure(attackTime=0, releaseTime=length / 6)
    return shaper(essentia.array(impulse))
def detect_peaks(self, min_peak_ratio=0.15):
    """Find the peaks of the distribution stored in ``self.vals``.

    The peaks are treated as tonic candidates in higher order functions.

    :param min_peak_ratio: minimum ratio between a peak's value and the
        highest peak's value for the peak to be kept; must lie in [0, 1]
    :return: tuple ``(peak indices into self.bins, peak values)``
    """
    assert 1 >= min_peak_ratio >= 0, \
        'min_peak_ratio should be between 0 (keep all peaks) and ' \
        '1 (keep only the highest peak)'

    # Peak detection is delegated to Essentia.
    peak_bins, peak_vals = std.PeakDetection()(essentia.array(self.vals))

    # Essentia reports positions normalised to 1; scale them back to
    # integer indices into self.bins.
    last_bin = len(self.bins) - 1
    peak_inds = np.array([int(round(pos * last_bin)) for pos in peak_bins])

    # A pcd is circular, so a peak at index 0 is expected to show up again
    # at the end: keep whichever of the first/last detected peaks is higher.
    if self.is_pcd() and peak_inds[0] == 0:
        if peak_vals[0] >= peak_vals[-1]:
            kept = slice(None, -1)
        else:
            kept = slice(1, None)
        peak_inds = peak_inds[kept]
        peak_vals = peak_vals[kept]

    # Discard peaks that are too small relative to the highest one.
    strong_enough = peak_vals / max(peak_vals) >= min_peak_ratio
    return peak_inds[strong_enough], peak_vals[strong_enough]
def extractPredominantMelody(audio_URI, frameSize=None, hopSize=None):
    """Extract the predominant melody of an audio file with Melodia.

    Copied (to reduce dependencies) from
    https://github.com/georgid/AlignmentDuration/blob/noteOnsets/src/align/FeatureExtractor.py

    :param audio_URI: full file URI, including the extension
    :param frameSize: frame size handed to ``PredominantPitchMelodia`` and
        to ``calc_TimeStamps``
    :param hopSize: hop size handed to ``PredominantPitchMelodia``
    :return: array built from ``(timestamp, f0)`` pairs, one row per pitch
        estimate
    """
    from essentia.standard import PredominantPitchMelodia

    fs = 44100
    vTol = 1.4

    audioSamples = essentia.standard.MonoLoader(filename=audio_URI)()

    # extract f0 using ESSENTIA
    signal = essentia.array(audioSamples)
    pitchTracker = PredominantPitchMelodia(frameSize=frameSize,
                                           hopSize=hopSize,
                                           sampleRate=fs,
                                           voicingTolerance=vTol,
                                           voiceVibrato=False,
                                           filterIterations=10,
                                           peakDistributionThreshold=0.9,
                                           guessUnvoiced=True)
    # The pitch confidence returned alongside f0 is not used.
    f0, _ = pitchTracker(signal)

    timestamps = calc_TimeStamps(audioSamples, f0, frameSize, fs)
    # zip() is lazy on Python 3; materialise it so NumPy builds rows of
    # pairs instead of a 0-d object array.
    return np.array(list(zip(timestamps, f0)))
def compute(features, opt):
    """Turn one or several onset detection functions into onset times.

    ``features`` is either a single detection function (a flat sequence of
    numbers) or a sequence of detection functions (lists / NumPy arrays).
    In both cases every function gets a weight of 1.

    NOTE(review): the settings come from three places: the ``opt``
    argument, ``conf.opts`` and a name ``opts`` that is not defined in this
    function (presumably a module-level dict; confirm).

    :param features: detection function(s) as described above
    :param opt: mapping providing ``'sampleRate'`` and ``'hopSize'``
    :return: list of onset times as returned by Essentia's ``Onsets``
    """
    frameRate = opt['sampleRate'] / opt['hopSize']
    delay = int(conf.opts["doubleOnsetT"] / 2. * frameRate)
    onsets = Onsets(frameRate=frameRate,
                    alpha=opts["alpha"],
                    delay=delay,
                    silenceThreshold=opts["silenceThresh"])

    if isinstance(features[0], (list, np.ndarray)):
        # Several detection functions: weight each of them equally.
        weights = essentia.array([1] * len(features))
        matrix = essentia.array(features)
    else:
        # A single detection function: wrap it as a one-row matrix.
        matrix = essentia.array([features])
        weights = essentia.array([1])
    return list(onsets(matrix, weights))
def detect_peaks(self):
    """Locate the peaks of ``self.vals`` with Essentia's ``PeakDetection``.

    When the first detected peak maps to index 0, the last detected peak
    is removed from both returned sequences.

    :return: tuple ``(peak indices into self.bins, peak values)``
    """
    find_peaks = std.PeakDetection()
    peak_bins, peak_vals = find_peaks(essentia.array(self.vals))

    # Essentia normalizes the positions to 1; rescale them to bin indices.
    top_index = len(self.bins) - 1
    peak_idxs = [round(position * top_index) for position in peak_bins]

    if peak_idxs[0] == 0:
        last_idx = len(peak_idxs) - 1
        last_val = len(peak_vals) - 1
        peak_idxs = np.delete(peak_idxs, [last_idx])
        peak_vals = np.delete(peak_vals, [last_val])
    return peak_idxs, peak_vals
def compute_all_features(audio_file, audio_beats=False):
    """Compute (or load from cache) the beat-synchronous features of a file.

    Results are cached as ``<basename>.json`` inside ``OUTPUT_FEATURES``;
    when that file already exists it is read back and nothing is
    recomputed.

    Parameters
    ----------
    audio_file : str
        Path of the audio file to analyse.
    audio_beats : bool
        When true, also write ``beats.wav``: the audio with a beep marking
        every beat.

    Returns
    -------
    features
        The result of ``list_to_array`` applied to a dict with the keys
        ``"beats"``, ``"mfcc"``, ``"hpcp"`` and ``"tonnetz"``.
    """
    # Make sure the output features folder exists.
    utils.ensure_dir(OUTPUT_FEATURES)
    cache_path = os.path.join(OUTPUT_FEATURES,
                              os.path.basename(audio_file) + ".json")

    # Already precomputed: read and return.
    if os.path.exists(cache_path):
        with open(cache_path, "r") as cached:
            stored = json.load(cached)
        return list_to_array(stored)

    # Load the audio.
    logging.info("Loading audio file %s" % os.path.basename(audio_file))
    audio = ES.MonoLoader(filename=audio_file, sampleRate=SAMPLE_RATE)()
    duration = len(audio) / float(SAMPLE_RATE)

    # Estimate the beats, then add the first and last instants of the file.
    ticks, _ = compute_beats(audio)
    ticks = np.concatenate(([0], ticks, [duration]))
    ticks = essentia.array(np.unique(ticks))
    features = {"beats": ticks.tolist()}

    # Beat-synchronous features.
    mfcc, hpcp, tonnetz = compute_beatsync_features(ticks, audio)
    features["mfcc"] = mfcc
    features["hpcp"] = hpcp
    features["tonnetz"] = tonnetz

    # Optionally render the beats on top of the audio.
    if audio_beats:
        logging.info("Saving Beats as an audio file")
        marker = ES.AudioOnsetsMarker(onsets=ticks, type='beep',
                                      sampleRate=SAMPLE_RATE)
        writer = ES.MonoWriter(filename='beats.wav', sampleRate=SAMPLE_RATE)
        writer(marker(audio))

    # Cache the features.
    with open(cache_path, "w") as out:
        json.dump(features, out)

    return list_to_array(features)
def getOnsetFunctions(fname):
    """Compute band-wise spectral-flux onset functions for an audio file.

    Every frame is zero-padded to ``params.Nfft``, Hamming-windowed and
    transformed. The magnitude difference with the previous frame is
    half-wave rectified and summed inside each band of ``params.fBands``.

    :param fname: path of the audio file to analyse
    :return: array with one row per frame; column 0 holds the frame centre
        time and the following columns hold the flux of each band
    """
    logger = log.get_logger("rhythm")
    zeropadLen = params.Nfft - params.frmSize
    zero_pad = np.zeros((zeropadLen,), dtype='float32')
    # Integer division: the spectrum size must be an int on Python 3 too.
    previous_mag = np.zeros((params.Nfft // 2 + 1,))

    logger.info('Reading audio file...')
    audio = ess.MonoLoader(filename=fname)()
    fft = ess.FFT(size=params.Nfft)  # this gives us a complex FFT
    c2p = ess.CartesianToPolar()  # turns it into a pair (magnitude, phase)
    pool = es.Pool()
    w = ess.Windowing(type="hamming")
    poolName = 'features.flux'

    # The bin selection of each band does not depend on the frame, so it is
    # built once here instead of once per frame.
    fTicks = params.fTicks
    band_masks = [
        (fTicks >= params.fBands[band, 0]) & (fTicks <= params.fBands[band, 1])
        for band in range(params.numBands)
    ]

    logger.info('Extracting Onset functions...')
    frameCounter = 0
    for frame in ess.FrameGenerator(audio, frameSize=params.frmSize,
                                    hopSize=params.hop):
        # float() keeps the hop duration fractional even when hop and Fs
        # are both integers on Python 2.
        frmTime = (float(params.hop) / params.Fs * frameCounter
                   + params.frmSize / (2.0 * params.Fs))
        zpFrame = np.hstack((frame, zero_pad))
        mag, phase = c2p(fft(w(zpFrame)))
        magFlux = mag - previous_mag
        previous_mag = np.copy(mag)  # kept for the next frame's difference

        for band, mask in enumerate(band_masks):
            magFluxBand = magFlux[mask]
            # Half-wave rectification: keep only increases in magnitude.
            magFluxBand = (magFluxBand + abs(magFluxBand)) / 2
            oFn = magFluxBand.sum()
            if math.isnan(oFn):
                print("NaN found here")
            pool.add(poolName + str(band), oFn)

        pool.add('features.time', frmTime)
        frameCounter += 1
        if not np.mod(frameCounter, 10000):
            logger.info(str(frameCounter) + '/'
                        + str(audio.size // params.hop) + '...')

    logger.info('Total frames processed = ' + str(frameCounter))

    # Stack time and band rows in one go, then put frames on the rows.
    rows = [es.array([pool['features.time']])]
    rows.extend(es.array([pool[poolName + str(band)]])
                for band in range(params.numBands))
    return np.transpose(np.vstack(rows))
def getLPF(signalNeeded):
    """Return low-pass filtered white noise, divided by its maximum.

    The last two characters of ``signalNeeded`` are parsed as a number
    ``n`` and the cutoff is ``440 * semitone ** n``. ``semitone``, ``fs``
    and ``getWhiteNoise`` are taken from the enclosing module.

    :param signalNeeded: string whose last two characters encode the note
        number
    :return: the filtered noise scaled so that its maximum is 1
    """
    note_number = int(float(signalNeeded[-2:]))
    cutoff = 440. * np.power(semitone, note_number)
    low_pass = LowPass(sampleRate=fs, cutoffFrequency=cutoff)
    filtered = essentia.array(low_pass(getWhiteNoise()))
    # Normalization
    return filtered / max(filtered)
def detect_essentia(arquivo_audio, selected):
    """Detect onsets with one of Essentia's onset detection functions.

    NOTE(review): the samples are read from the module-level ``audio``
    variable; ``arquivo_audio`` is accepted but never used to load anything.

    :param arquivo_audio: audio file name (currently unused, see note)
    :param selected: detection method code: 3 = hfc, 4 = complex,
        5 = melflux, 6 = complex_phase, 7 = rms
    :return: list of onset positions converted with ``SecToFrames`` and
        truncated to int; negative onset times are dropped
    :raises ValueError: if ``selected`` is not one of the supported codes
        (previously this surfaced later as an unbound-variable error)
    """
    global audio

    methods = {
        3: 'hfc',
        4: 'complex',
        5: 'melflux',
        6: 'complex_phase',
        7: 'rms',
    }
    if selected not in methods:
        raise ValueError(
            'unsupported onset detection method code: %r' % (selected,))

    # Phase 1: compute the onset detection function
    od = OnsetDetection(method=methods[selected])
    w = Windowing(type='hann')
    fft = FFT()  # this gives us a complex FFT
    c2p = CartesianToPolar()  # turns it into a pair (magnitude, phase)
    pool = Pool()

    print('Computing onset detection functions...')
    for frame in FrameGenerator(audio, frameSize=1024, hopSize=512):
        mag, phase = c2p(fft(w(frame)))
        pool.add('features.method', od(mag, phase))

    # Phase 2: compute the actual onset locations
    onsets = Onsets()
    print('Computing onset times...')
    onsets_method = onsets(array([pool['features.method']]), [1])

    # Convert to a plain list, then from seconds to frames.
    return [int(SecToFrames(x)) for x in onsets_method.tolist() if x >= 0]
def SMDetect(l_deriv, alph, sTh):
    """Report whether more than one onset is detected in ``l_deriv``.

    ``l_deriv`` is handed to Essentia's ``Onsets`` (frame rate 172) as a
    single detection function with weight 1.

    :param l_deriv: detection function values
    :param alph: ``alpha`` parameter of ``Onsets``
    :param sTh: ``silenceThreshold`` parameter of ``Onsets``
    :return: 1 when more than one onset is found, otherwise 0 (also 0 when
        ``l_deriv`` has fewer than two values)
    """
    count = len(l_deriv)
    if count <= 1:
        return 0

    detector = ess.Onsets(alpha=alph, silenceThreshold=sTh, frameRate=172)
    single_row = essentia.array(l_deriv).reshape(1, count)
    found = detector(single_row, [1])
    return 1 if len(found) > 1 else 0
def extract_mfcc(self, fname):
    """Compute the MFCCs of an audio file, frame by frame.

    Frames of 2048 samples (hop 1024) are windowed with blackmanharris62
    and passed through Essentia's ``Spectrum`` and ``MFCC``.

    :param fname: path of the audio file, loaded with ``EasyLoader``
    :return: transposed Essentia array of the per-frame coefficients
    """
    window = Windowing(type='blackmanharris62')
    to_spectrum = Spectrum()
    mfcc = essentia.standard.MFCC()
    audio = EasyLoader(filename=fname).compute()

    coefficients = []
    for frame in FrameGenerator(audio, frameSize=2048, hopSize=1024):
        # MFCC also returns the mel band energies; only the coefficients
        # are kept.
        _, frame_coeffs = mfcc(to_spectrum(window(frame)))
        coefficients.append(frame_coeffs)
    return essentia.array(coefficients).T
def frequenciesComp(bands, fmin, fmax, ffts, sampleRate=44100, a=440):
    """Return logarithmically spaced frequencies snapped to spectrogram bins.

    Starting from the reference frequency ``a``, frequencies are generated
    upwards until one exceeds ``fmax`` and downwards until one falls below
    ``fmin``. The one frequency beyond each limit is kept on purpose, since
    it serves as the outer corner of a (triangular) filter. They are then
    rounded to the nearest bin, de-duplicated, and converted back to Hz.

    Using 12 bands per octave and a=440 corresponds to the MIDI notes.

    :param bands: number of filter bands per octave
    :param fmin: the minimum frequency [Hz]
    :param fmax: the maximum frequency [Hz], capped at ``sampleRate / 2``
    :param ffts: number of spectrogram bins spanning 0 .. sampleRate / 2
    :param sampleRate: sample rate of the analysed signal [Hz]
    :param a: reference frequency the scale is built around [Hz]
    :returns: Essentia array of bin-aligned frequencies [Hz]
    """
    nyquist = sampleRate / 2
    if fmax > nyquist:
        fmax = nyquist

    # ratio between two neighbouring frequencies
    ratio = 2.0 ** (1.0 / bands)

    # walk upwards from the reference ...
    frequencies = [a]
    current = a
    while current <= fmax:
        current *= ratio
        frequencies.append(current)

    # ... then downwards from the reference
    current = a
    while current >= fmin:
        current /= ratio
        frequencies.append(current)

    frequencies.sort()

    # width of one spectrogram bin in Hz
    hz_per_bin = (sampleRate / 2.0) / ffts
    # map to bins and keep each bin only once
    bins = np.round(np.asarray(frequencies) / hz_per_bin).astype(int)
    bins = np.unique(bins)
    # drop bins outside the spectrogram and go back to Hz
    in_range = [index * hz_per_bin for index in bins if index < ffts]
    return essentia.array(in_range)
def compute(audio):
    """Compute a per-frame crest value of the spectrum of each frame's NSDF.

    For every frame the NSDF is computed, its spectrum is taken and the
    crest of that spectrum is stored. From the fourth frame on, a frame
    whose instant power is below ``opts["minthresh"]`` repeats the previous
    frame's value instead.

    Frame and hop sizes are read from ``conf.opts``.

    NOTE(review): earlier revisions also windowed every frame but never
    used the result; the NSDF has always been fed the raw frame. That dead
    work and the unused counters were removed. Confirm that windowing was
    not meant to be applied.

    :param audio: audio samples (anything ``essentia.array`` accepts)
    :return: NumPy array with one value per frame
    """
    audio = essentia.array(audio)
    frameSize = int(conf.opts['frameSize'])
    hopSize = int(conf.opts['hopSize'])

    INFO('Computing Onset Detection...')

    frames = FrameGenerator(audio=audio, frameSize=frameSize,
                            hopSize=hopSize)
    nsdff = Nsdf()
    fftf = Spectrum()
    crestf = Crest()
    instPowf = InstantPower()

    # Kept in case constructing the progress reporter has visible effects;
    # it is never updated in this function.
    Progress(total=frames.num_frames())

    cr = []
    for frame in frames:
        cr.append(crestf(fftf(nsdff(frame))))
        frame_power = instPowf(frame)
        if len(cr) > 2 and frame_power < opts["minthresh"]:
            # Too quiet to be reliable: hold the previous value.
            cr[-1] = cr[-2]

    return np.array(cr)
def compute(audio):
    """Compute HFC and complex-domain onset detection functions.

    Frames are windowed, transformed with an FFT and converted to
    magnitude/phase before being fed to two ``OnsetDetection`` instances
    (``hfc`` and ``complex``). Analysis settings come from ``conf.opts``.

    The HFC curve is divided by its maximum. When that maximum is 0 (for
    example on silent input) the curve is left as is; previously this case
    divided by zero.

    :param audio: audio samples (anything ``essentia.array`` accepts)
    :return: NumPy array built from two rows: normalised HFC values and
        complex-domain values, one entry per frame
    """
    audio = essentia.array(audio)
    sampleRate = int(conf.opts["sampleRate"])
    frameSize = int(conf.opts["frameSize"])
    hopSize = int(conf.opts["hopSize"])
    zeroPadding = int(conf.opts["zeroPadding"])
    windowType = conf.opts["windowType"]

    INFO("Computing Ess Detection...")

    frames = FrameGenerator(audio=audio, frameSize=frameSize,
                            hopSize=hopSize)
    window = Windowing(size=frameSize, zeroPadding=zeroPadding,
                       type=windowType)
    fft = FFT()
    cartesian2polar = CartesianToPolar()
    onsetdetectionHFC = OnsetDetection(method="hfc", sampleRate=sampleRate)
    onsetdetectionComplex = OnsetDetection(method="complex",
                                           sampleRate=sampleRate)

    progress = Progress(total=frames.num_frames())

    hfc = []
    complex_odf = []
    maxhfc = 0
    for n_frames, frame in enumerate(frames):
        spectrum, phase = cartesian2polar(fft(window(frame)))
        hfc.append(onsetdetectionHFC(spectrum, phase))
        maxhfc = max(hfc[-1], maxhfc)
        complex_odf.append(onsetdetectionComplex(spectrum, phase))
        # display of progress report
        progress.update(n_frames)

    # Normalise the HFC curve to a maximum of 1 when there is any energy.
    if maxhfc > 0:
        hfc = [value / maxhfc for value in hfc]
    return np.array([hfc, complex_odf])
def getBPF(signalNeeded):
    """Return band-pass filtered white noise, divided by its maximum.

    The last two characters of ``signalNeeded`` are parsed as a number
    ``n``. The band runs from ``440 * semitone ** n`` to
    ``440 * semitone ** (n + 25)`` (two octaves plus one step). ``semitone``,
    ``fs`` and ``getWhiteNoise`` are taken from the enclosing module.

    :param signalNeeded: string whose last two characters encode the note
        number
    :return: the filtered noise scaled so that its maximum is 1
    """
    note_number = int(float(signalNeeded[-2:]))
    low_edge = 440. * np.power(semitone, note_number)
    high_edge = 440. * np.power(semitone, note_number + 2*12 + 1)  # 2 octaves
    band_pass = BandPass(bandwidth=high_edge - low_edge,
                         cutoffFrequency=(low_edge + high_edge) / 2.,
                         sampleRate=fs)
    filtered = essentia.array(band_pass(getWhiteNoise()))
    # Normalization
    return filtered / max(filtered)
def detect_peaks(self):
    """Find the peak indices and values of the distribution.

    The peaks are treated as tonic candidates in higher order functions.
    When the first detected peak maps to index 0, the last detected peak
    is dropped from both results.

    :return: tuple ``(peak indices into self.bins, peak values)``
    """
    # Peak detection is handled by Essentia.
    peak_bins, peak_vals = std.PeakDetection()(essentia.array(self.vals))

    # Essentia normalizes the positions to 1; convert them back to actual
    # index values usable with self.bins.
    span = len(self.bins) - 1
    peak_idxs = [round(normalised * span) for normalised in peak_bins]

    if peak_idxs[0] != 0:
        return peak_idxs, peak_vals

    trimmed_idxs = np.delete(peak_idxs, [len(peak_idxs) - 1])
    trimmed_vals = np.delete(peak_vals, [len(peak_vals) - 1])
    return trimmed_idxs, trimmed_vals
def compute(inputFilename, audio, pool, options):
    """Segment the audio and store the segment timestamps in the pool.

    Segmentation is skipped (an empty list is stored) when the pool's
    ``metadata.duration_processed`` is shorter than the configured
    ``minimumSegmentsLength``. The settings are read from
    ``options[namespace]``, where ``namespace`` is a module-level name.

    :param inputFilename: forwarded to ``doSegmentation``
    :param audio: forwarded to ``doSegmentation``
    :param pool: pool read for the duration and updated with
        ``<namespace>.timestamps``
    :param options: nested options mapping
    :return: the segments, or an empty list when none were computed
    """
    INFO('Doing segmentation...')

    settings = options[namespace]
    # 'type' and 'thumbnailing' are looked up but not used any further here.
    segmentation_type = settings['type']
    minimumLength = settings['minimumSegmentsLength']
    thumbnailing = settings['thumbnailing']

    too_short = pool.value('metadata.duration_processed') < minimumLength
    if too_short:
        segments = []
        INFO('No segments found!')
    else:
        segments = doSegmentation(inputFilename, audio, pool, options)

    pool.add(namespace + '.timestamps', essentia.array(segments))
    return segments
def extractor(filename):
    """Plot a mel spectrum smoothed by truncating its MFCCs.

    For each 1024-sample frame (hop 512) the MFCCs are computed, turned
    back into mel band values with an inverse DCT and exponentiated to undo
    the log taken by the MFCC computation. The smoothed bands of all frames
    are displayed with ``plt.imshow``.

    :param filename: path of the audio file, loaded as mono at 44100 Hz
    :return: None; the result is only plotted
    """
    frameSize = 1024
    hopSize = 512
    fs = 44100

    # These must be identical for the MFCC and the IDCT computation.
    NUM_BANDS = 26
    DCT_TYPE = 2
    LIFTERING = 0
    NUM_MFCCs = 13

    audio = ess.MonoLoader(filename=filename, sampleRate=fs)()
    w = ess.Windowing(type='hamming', normalized=False)
    spectrum = ess.Spectrum()
    mfcc = ess.MFCC(
        numberBands=NUM_BANDS,
        # Only the first N coefficients are kept: the fewer, the more lossy
        # (blurry) the smoothed mel spectrum will be.
        numberCoefficients=NUM_MFCCs,
        weighting='linear',  # filter weights computed in the Hz domain (optional)
        normalize='unit_max',  # htk filter normalisation, constant height = 1 (optional)
        dctType=DCT_TYPE,
        logType='log',
        liftering=LIFTERING,
    )
    idct = ess.IDCT(inputSize=NUM_MFCCs, outputSize=NUM_BANDS,
                    dctType=DCT_TYPE, liftering=LIFTERING)

    smoothed_frames = []
    for frame in ess.FrameGenerator(audio, frameSize=frameSize,
                                    hopSize=hopSize):
        melbands, mfcc_coeffs = mfcc(spectrum(w(frame)))
        # exp() inverts the log taken in the MFCC computation
        smoothed_frames.append(np.exp(idct(mfcc_coeffs)))

    # Convert the list to an essentia.array (== numpy.array of floats) and
    # transpose it to have it in a better shape.
    all_melbands_smoothed = essentia.array(smoothed_frames).T

    # and plot
    plt.imshow(all_melbands_smoothed, aspect='auto', interpolation='none')
    plt.show()  # unnecessary if you started "ipython --pylab"