def compute_essentia_descriptors(audio_segment, actual_bar_beg, actual_bar_end):
    """Compute the selected descriptors for the given audio segment.

    The frame settings (``frameSize``, ``hopSize``) and the analysis
    callables (``window``, ``spectrum``, ``spectralPeaks``, ``fft``, ``c2p``,
    ``od``, ``dissonance``, ``mfcc``, ``barkbands``, ``onsets``, ``mean``) are
    read from the enclosing module scope.

    Args:
        audio_segment: samples to analyse.
        actual_bar_beg: start of the segment (presumably seconds -- confirm
            against the caller).
        actual_bar_end: end of the segment, same unit as ``actual_bar_beg``.

    Returns:
        A tuple ``(mfccs_bar, bark_vector, onset_rate, bar_dissonance)``:
        the per-frame MFCC coefficients, the 27 bark-band values averaged
        over all frames, the number of detected onsets divided by
        ``actual_bar_end - actual_bar_beg``, and the mean frame dissonance.
    """
    frames = FrameGenerator(audio_segment, frameSize=frameSize, hopSize=hopSize)
    mfccs_bar = []
    bark_vector = [0] * 27
    pool = essentia.Pool()
    total_frames = frames.num_frames()

    for frame in frames:
        frame_windowed = window(frame)
        frame_spectrum = spectrum(frame_windowed)
        frame_frequencies, frame_magnitudes = spectralPeaks(frame_spectrum)

        mag, phase = c2p(fft(frame_windowed))
        pool.add('onsets.hfc', od(mag, phase))

        pool.add('dissonance', dissonance(frame_frequencies, frame_magnitudes))

        # Reuse the spectrum computed above instead of windowing and
        # transforming the same frame a second time.
        _, mfcc_coeffs = mfcc(frame_spectrum)
        mfccs_bar.append(mfcc_coeffs)

        frame_barkbands = barkbands(frame_spectrum)
        for i in range(27):
            # Dividing each contribution by the frame count accumulates the
            # mean directly.
            bark_vector[i] += frame_barkbands[i] / total_frames

    onsets_hfc = onsets(essentia.array([pool['onsets.hfc']]), [1])
    onset_rate = float(len(onsets_hfc)) / (actual_bar_end - actual_bar_beg)
    bar_dissonance = mean(pool["dissonance"])
    return mfccs_bar, bark_vector, onset_rate, bar_dissonance
def essFalsestereoDetector(x: list, frameSize=1024, hopSize=512, correlationThreshold=0.98, percentageThreshold=90, channels=2, **kwargs):
    """Measure how many frames of a stereo signal are flagged as false stereo.

    Args:
        x: (list) input signal, passed to ``StereoDemuxer``
        frameSize: (int) frame size for the analysis in FalseStereoDetector
        hopSize: (int) hop size for the analysis in FalseStereoDetector
        correlationThreshold: (float) forwarded to ``FalseStereoDetector``
        percentageThreshold: (number) percentage of flagged frames above
            which the whole signal is reported as false stereo
        channels: (int) number of channels of ``x``; with fewer than two
            nothing is analysed
        **kwargs: forwarded to ``FalseStereoDetector``

    Returns:
        ``(conf, is_false_stereo, skipped)``. ``conf`` is the fraction of
        frames flagged by the detector, ``is_false_stereo`` is
        ``conf > percentageThreshold / 100``. For ``channels < 2`` the fixed
        tuple ``(1, False, True)`` is returned; otherwise the last element
        is ``False``.
    """
    if channels < 2:
        return 1, False, True

    rx, lx = StereoDemuxer()(x)
    mux = StereoMuxer()
    falseStereoDetector = FalseStereoDetector(
        correlationThreshold=correlationThreshold, **kwargs)

    lfg = FrameGenerator(lx, frameSize=frameSize, hopSize=hopSize, startFromZero=True)
    rfg = FrameGenerator(rx, frameSize=frameSize, hopSize=hopSize, startFromZero=True)

    # The first detector output is the per-frame flag; adding the flags up
    # gives the number of problematic frames.
    problematicFrames = sum(
        falseStereoDetector(mux(frameL, frameR))[0]
        for frameL, frameR in zip(lfg, rfg)
    )
    falseStereoDetector.reset()

    # problematicFrames is already a count: calling sum() on it again raised
    # "TypeError: 'int' object is not iterable".
    conf = float(problematicFrames) / float(lfg.num_frames())
    return conf, conf > percentageThreshold / 100, False
def __call__(self, audio, SR, sumThreshold=1e-5):
    """Estimate a cut-off frequency for ``audio`` from its framewise spectra.

    Every frame whose spectral energy is at least 10% of the loudest frame
    votes for the highest FFT bin ``j`` at which ``sum(frameFft[j:] / j)``
    still reaches ``sumThreshold``. The votes are stored in ``self.hist``
    (weighted by frame energy) and reduced by ``self.__computeMeanFc``.

    Args:
        audio: numpy array of samples; arrays with more than one dimension
            are collapsed along axis 1 first.
        SR: sample rate, used to convert the selected bin to a frequency.
        sumThreshold: minimum tail sum for a bin to be accepted.

    Returns:
        ``(self.mostLikelyBin * SR / self.frameSize, conf, binary)`` where
        ``conf`` and ``binary`` come from ``self.__computeMeanFc``.
    """
    self.__reset__()
    if audio.ndim > 1:
        # NOTE(review): this divides by ndim (2 for any 2-D input), not by the
        # channel count audio.shape[1]; the two only agree for stereo input.
        # Confirm intent.
        audio = np.sum(audio, axis=1) / audio.ndim
    fcIndexArr = []
    self.hist = np.zeros(int(self.frameSize / 2 + 1))
    fft = FFT(size=self.frameSize)  # declare FFT function
    window = Windowing(size=self.frameSize, type="hann")  # declare windowing function
    self.avgFrames = np.zeros(int(self.frameSize / 2) + 1)
    # First pass: energy of the loudest frame, used as reference below.
    maxNrg = max([
        sum(abs(fft(window(frame)))**2)
        for frame in FrameGenerator(audio, frameSize=self.frameSize, hopSize=self.hopSize, startFromZero=True)
    ])
    # Second pass: collect one cut-off bin per sufficiently loud frame.
    for i, frame in enumerate(
            FrameGenerator(audio, frameSize=self.frameSize, hopSize=self.hopSize, startFromZero=True)):
        frame = window(frame)  # apply window to the frame
        frameFft = abs(fft(frame))
        nrg = sum(frameFft**2)
        if nrg >= 0.1 * maxNrg:
            # Scan from the highest bin downwards and stop at the first hit.
            # NOTE(review): the scan reaches j == 0, where the division is by
            # zero -- confirm this is intended.
            for j in reversed(range(len(frameFft))):
                if sum(frameFft[j:] / j) >= sumThreshold:
                    fcIndexArr.append(j)
                    self.hist[j] += nrg
                    break
        self.avgFrames = self.avgFrames + frameFft
    if len(fcIndexArr) == 0:
        # No frame voted: fall back to one past the last bin index.
        fcIndexArr.append(int(self.frameSize / 2) + 1)
        self.hist[int(self.frameSize / 2)] += 1
    # i is the index of the last frame, so i + 1 is the number of frames.
    self.avgFrames /= (i + 1)
    self.mostLikelyBin, conf, binary = self.__computeMeanFc(
        fcIndexArr, np.arange(int(self.frameSize / 2) + 2), hist=self.hist)
    return self.mostLikelyBin * SR / self.frameSize, conf, binary
def analyzer(samples):
    """Return the log-scaled mel features of every frame of ``samples``.

    Frames are cut with a frame size of 256 and a hop size of 160. Each one
    is passed through the ``window``, ``spectrum`` and ``mel`` callables from
    the enclosing scope, offset by 1e-16 and log-scaled. The per-frame
    results are stacked with ``np.array``.
    """
    log_mel_frames = [
        np.log(mel(spectrum(window(chunk))) + 1e-16)
        for chunk in FrameGenerator(samples, 256, 160)
    ]
    return np.array(log_mel_frames)
def essNoiseburstDetector(x: list, frameSize=1024, hopSize=512, detectionThreshold=0.05, percentageThrehold=5, **kwargs):
    """Run NoiseBurstDetector frame by frame over x and summarise the result.

    Args:
        x: (list) input signal
        frameSize: (int) frame size for the analysis in Noise Burst Detector
        hopSize: (int) hopSize for the analysis in Noise Burst Detector
        detectionThreshold: (float) a frame is counted as problematic when
            the detector reports more than ``int(detectionThreshold * frameSize)``
            positions for it
        percentageThrehold: (number) percentage of problematic frames above
            which the last returned value is True
        **kwargs: forwarded to ``NoiseBurstDetector``

    Returns:
        ``(idxs, percentage, flag)``: the collected positions offset by
        ``hopSize * frame_index``, the percentage of problematic frames
        rounded to two decimals, and ``percentage > percentageThrehold``.
    """
    noiseBurstDetector = NoiseBurstDetector(**kwargs)
    idxs = []
    count = 0
    total = 0
    for i, frame in enumerate(FrameGenerator(x, frameSize=frameSize, hopSize=hopSize, startFromZero=True)):
        corrupt_samples = noiseBurstDetector(frame)
        # Add the frame's offset to the positions the detector returned.
        corrupt_samples = hopSize * i + corrupt_samples
        if len(corrupt_samples) > int(detectionThreshold*frameSize):
            count += 1
            for s in corrupt_samples:
                idxs.append(s)
        total += 1
    # NOTE(review): total is 0 when no frame is produced, which makes this
    # division fail -- confirm callers never pass an empty signal.
    percentage = round(100*count/total, 2)
    # del noiseBurstDetector_algo; del frame; del corrupt_samples; del x;
    return idxs, percentage, percentage > percentageThrehold
def extract_mel_feats(audio_fp, analyzers, fs=44100.0, nhop=512, nffts=(1024, 2048, 4096), log_scale=True):
    """Extract multi-resolution mel features from an audio file.

    Args:
        audio_fp: path of the audio file, loaded with ``MonoLoader``.
        analyzers: one ``(window, spectrum, mel)`` triple of callables per
            entry of ``nffts``.
        fs: sample rate requested from the loader.
        nhop: hop size used for every resolution.
        nffts: frame sizes, one feature channel each. The default is a tuple
            so that no mutable object is shared between calls.
        log_scale: when true, return ``log(features + 1e-16)``.

    Returns:
        Array obtained by stacking the channels and moving the channel axis
        from position 0 to position 2.
    """
    loader = MonoLoader(filename=audio_fp, sampleRate=fs)
    samples = loader()

    feat_channels = []
    for nfft, (window, spectrum, mel) in zip(nffts, analyzers):
        feat_channels.append([
            mel(spectrum(window(frame)))
            for frame in FrameGenerator(samples, nfft, nhop)
        ])

    # Transpose to move channels to axis 2 instead of axis 0
    feat_channels = np.transpose(np.stack(feat_channels), (1, 2, 0))

    # Apply numerically-stable log-scaling
    # Value 1e-16 comes from inspecting histogram of raw values and picking
    # some epsilon >2 std dev left of mean
    if log_scale:
        feat_channels = np.log(feat_channels + 1e-16)

    return feat_channels
def file_to_hpcp(filename):
    """Return the HPCP vector of an audio file averaged over all its frames.

    The file is loaded with ``MonoLoader`` and cut into frames of 1024
    samples with a hop of 512. Each frame is windowed
    (``blackmanharris62``), its spectral peaks are extracted and fed to
    ``HPCP``.

    Args:
        filename: path of the audio file.

    Returns:
        The mean of the per-frame HPCP vectors.
    """
    audio = MonoLoader(filename=filename)()
    windowing = Windowing(type='blackmanharris62')
    spectrum = Spectrum()
    spectral_peaks = SpectralPeaks(orderBy='magnitude',
                                   magnitudeThreshold=0.001,
                                   maxPeaks=20,
                                   minFrequency=20,
                                   maxFrequency=8000)
    # TODO: check whether passing normalized='unitSum' makes sense here.
    hpcp = HPCP(maxFrequency=8000)

    # Only the HPCP vectors are kept; the spectra were previously collected
    # as well but never read.
    hpcp_group = []
    for frame in FrameGenerator(audio, frameSize=1024, hopSize=512):
        frequencies, magnitudes = spectral_peaks(spectrum(windowing(frame)))
        hpcp_group.append(hpcp(frequencies, magnitudes))

    mean_hpcp = np.mean(np.array(hpcp_group).T, axis=1)
    return mean_hpcp
def hfc(filename):
    """Detect onsets in an audio file from the HFC onset detection function.

    Args:
        filename: path of the audio file, loaded at 44100 Hz.

    Returns:
        The output of Essentia's ``Onsets`` algorithm for the single HFC
        detection function (weight 1).
    """
    audio = MonoLoader(filename=filename, sampleRate=44100)()

    # Build the algorithms once instead of once per frame.
    windowing = Windowing(type='hann')
    fft = FFT()
    to_polar = CartesianToPolar()
    onset_detection = OnsetDetection(method='hfc')

    features = []
    for frame in FrameGenerator(audio, frameSize=1024, hopSize=512):
        mag, phase = to_polar(fft(windowing(frame)))
        features.append(onset_detection(mag, phase))
    return Onsets()(array([features]), [1])
def noveltycurve(filename):
    """Detect onsets in an audio file from its novelty curve.

    Args:
        filename: path of the audio file, loaded at 44100 Hz.

    Returns:
        The output of Essentia's ``Onsets`` algorithm applied to the novelty
        curve of the per-frame frequency-band energies (weight 1).
    """
    audio = MonoLoader(filename=filename, sampleRate=44100)()

    # Build the algorithms once instead of once per frame.
    windowing = Windowing(type='hann')
    fft = FFT()
    to_polar = CartesianToPolar()
    frequency_bands = FrequencyBands()

    band_energy = []
    for frame in FrameGenerator(audio, frameSize=1024, hopSize=512):
        # Only the magnitude is needed; the phase is discarded.
        mag, _ = to_polar(fft(windowing(frame)))
        band_energy.append(frequency_bands(mag))

    novelty = NoveltyCurve()(band_energy)
    return Onsets()(np.array([novelty]), [1])
def outofPhaseDetector(x: list, frameSize=1024, hopSize=512, correlationThreshold=-0.8, percentageThreshold=90, channels=2, **kwargs):
    """Measure how many frames of a stereo signal have out-of-phase channels.

    A frame counts as problematic when the correlation reported for it by
    ``FalseStereoDetector`` is below ``correlationThreshold``.

    Args:
        x: (list) input signal, passed to ``StereoDemuxer``
        frameSize: (int) frame size used to cut both channels
        hopSize: (int) hop size used to cut both channels
        correlationThreshold: (float) correlations below this value mark a
            frame as problematic
        percentageThreshold: (number) percentage of problematic frames above
            which the signal is flagged
        channels: (int) number of channels of ``x``; with fewer than two
            nothing is analysed
        **kwargs: forwarded to ``FalseStereoDetector``

    Returns:
        ``(conf, flagged, skipped)``. ``conf`` is the fraction of problematic
        frames and ``flagged`` is ``conf > percentageThreshold / 100``. For
        ``channels < 2`` the fixed tuple ``(1, False, True)`` is returned;
        otherwise the last element is ``False``.
    """
    if channels < 2:
        return 1, False, True

    rx, lx = StereoDemuxer()(x)
    remux = StereoMuxer()
    detector = FalseStereoDetector(**kwargs)

    framing = dict(frameSize=frameSize, hopSize=hopSize, startFromZero=True)
    lfg = FrameGenerator(lx, **framing)
    rfg = FrameGenerator(rx, **framing)

    # Second detector output is the correlation; count the frames below the
    # threshold.
    out_of_phase = sum(
        detector(remux(left, right))[1] < correlationThreshold
        for left, right in zip(lfg, rfg)
    )
    detector.reset()

    conf = out_of_phase / lfg.num_frames()
    return conf, conf > percentageThreshold / 100, False
def shared_main(source, dest, display_result):
    """Load two audio sources, frame them and compare them.

    Both inputs are loaded with ``_loader`` and cut into frames of 2048
    samples with a hop of 512. The frame generators are handed to
    ``compare`` together with the analysis algorithms.

    Args:
        source: first input, as accepted by ``_loader``.
        dest: second input, as accepted by ``_loader``.
        display_result: forwarded unchanged to ``compare``.

    Returns:
        ``(min_cost, match_result)`` as produced by ``compare``.
    """
    source_audio = _loader(source)
    destination_audio = _loader(dest)

    source_frame = FrameGenerator(source_audio, frameSize=2048, hopSize=512)
    destination_frame = FrameGenerator(destination_audio, frameSize=2048, hopSize=512)

    window = Windowing(type='hann')  # window function
    spectrum = Spectrum()  # spectrum function
    pitch_yin_fft = PitchYinFFT()  # pitch extractor
    loudness = Loudness()

    # draw_plot(source_frame, window, spectrum, pitch_yin_fft)
    min_cost, match_result = compare(source_frame, destination_frame, window,
                                     spectrum, pitch_yin_fft, 5, 1, 1,
                                     display_result, loudness)
    return min_cost, match_result
def calculateDownbeats(self, audio, bpm, phase):
    """Pick which of the four beat offsets carries the downbeats.

    A complex-domain onset detection curve is computed for ``audio``,
    reduced by its adaptive mean and half-wave rectified. For each of the
    four possible offsets the curve is averaged at every fourth beat; the
    offset with the largest average wins. The curve and the candidate
    positions are also plotted with matplotlib and the four scores printed.

    Args:
        audio: samples to analyse.
        bpm: tempo, passed to ``self.numFramesPerBeat``.
        phase: beat phase; converted to frames assuming 44100 Hz and a hop
            size of 512.

    Returns:
        ``1.0 * self.beats[downbeatIndex::4]`` -- every fourth entry of
        ``self.beats`` starting at the winning offset.
    """
    # Step 0: calculate the CSD (Complex Spectral Difference) features
    # and the associated onset detection function ON LOWPASSED SIGNAL
    # (the low-pass step itself is currently commented out below)
    spec = Spectrum(size=self.FRAME_SIZE)
    w = Windowing(type='hann')
    fft = FFT()
    c2p = CartesianToPolar()
    od_csd = OnsetDetection(method='complex')
    lowpass = LowPass(cutoffFrequency=1500)
    pool = Pool()
    # TODO test faster (numpy) way
    #audio = lowpass(audio)
    for frame in FrameGenerator(audio, frameSize=self.FRAME_SIZE, hopSize=self.HOP_SIZE):
        mag, ph = c2p(fft(w(frame)))
        pool.add('onsets.complex', od_csd(mag, ph))
    # Step 1: normalise the data using an adaptive mean threshold
    novelty_mean = self.adaptive_mean(pool['onsets.complex'], 16.0)
    # Step 2: half-wave rectify the result
    novelty_hwr = (pool['onsets.complex'] - novelty_mean).clip(min=0)
    # Step 7 (experimental): Determine downbeat locations as subsequence with highest complex spectral difference
    for i in range(4):
        # NOTE(review): sample rate and hop size are hard-coded here while the
        # framing above uses self.HOP_SIZE -- confirm they agree.
        phase_frames = (phase * 44100.0) / (512.0)
        frames = (
            np.round(
                np.arange(phase_frames + i * self.numFramesPerBeat(bpm),
                          np.size(novelty_hwr),
                          4 * self.numFramesPerBeat(bpm))).astype('int')
        )[: -1]  # Discard last value to prevent reading beyond array (last value rounded up for example)
        pool.add('output.downbeat', np.sum(novelty_hwr[frames]) / np.size(frames))
        plt.subplot(4, 1, i + 1)
        plt.plot(novelty_hwr)
        for f in frames:
            plt.axvline(x=f)
    print pool['output.downbeat']
    downbeatIndex = np.argmax(pool['output.downbeat'])
    plt.show()
    # experimental
    return 1.0 * self.beats[downbeatIndex::4]
def essClickDetector(x, frameSize=1024, hopSize=512, percentageThrehold=1, **kwargs):
    """Run ClickDetector over x frame by frame and collect what it reports.

    Args:
        x: (list) input signal
        frameSize: (int) frame size for the analysis in Click Detector
        hopSize: (int) hopSize for the analysis in Click Detector
        percentageThrehold: (number) percentage of affected frames above
            which the last returned value is True

    Kwargs:
        forwarded to ClickDetector

    Returns:
        starts: start indexes reported by the detector
        ends: end indexes reported by the detector
        percentage of frames for which the detector reported anything,
            rounded to two decimals
        whether that percentage exceeds percentageThrehold
    """
    detector = ClickDetector(frameSize=frameSize, hopSize=hopSize, **kwargs)
    starts, ends = [], []
    affected = 0
    seen = 0

    chunks = FrameGenerator(x, frameSize=frameSize, hopSize=hopSize, startFromZero=True)
    for chunk in chunks:
        chunk_starts, chunk_ends = detector(chunk)
        starts.extend(chunk_starts)
        ends.extend(chunk_ends)
        seen += 1
        if len(chunk_starts) + len(chunk_ends) == 0:
            continue
        affected += 1

    percentage = round(100 * affected / seen, 2)
    return starts, ends, percentage, percentage > percentageThrehold
def rms_centroids(filename, frameSize=1024, hopSize=512, sampleRate=44100):
    """Compute the per-frame RMS and centroid of an audio file's spectrum.

    Args:
        filename: path of the audio file.
        frameSize: frame size used to cut the audio.
        hopSize: hop size used to cut the audio.
        sampleRate: rate the file is loaded at; also sets the centroid range
            to ``int(sampleRate / 2)``.

    Returns:
        ``(rmss, centroids)`` as two numpy arrays with one value per frame.
        Both values are computed from the frame's spectrum.
    """
    # Load at the requested rate so that it matches the centroid range below.
    # The loader previously ignored the parameter and always used 44100.
    audio = MonoLoader(filename=filename, sampleRate=sampleRate)()

    # create the necessary algorithms
    w = Windowing()
    spec = Spectrum()
    rms = RMS()
    centroid = Centroid(range=int(sampleRate / 2))

    cs = []
    rmss = []
    # compute the centroid and RMS of the spectrum for all frames in our audio
    for frame in FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize):
        sf = spec(w(frame))
        cs.append(centroid(sf))
        rmss.append(rms(sf))

    return np.array(rmss), np.array(cs)
def hcdf(filename):
    """Print the windowed frame and the 12-bin chromagram of each frame.

    The file is loaded with ``MonoLoader`` and cut into frames of 32768
    samples with a hop of 4096. Nothing is returned; the results are only
    printed.
    """
    samples = MonoLoader(filename=filename)()
    hann = Windowing(type='hann')

    for chunk in FrameGenerator(samples, frameSize=32768, hopSize=4096):
        print('window', hann(chunk))

        # ConstantQ transform
        # constant_q = ConstantQ(binsPerOctave=36, minFrequency=110, maxFrequency=3520, sampleRate=11025)
        # kk = constant_q(windowed)

        # 12 bin tuned Chromagram
        # (the maxFrequency=3520 option is not set: it still has to be
        # requested from the maintainer)
        chromagram = Chromagram(numberBins=12,
                                binsPerOctave=36,
                                minFrequency=110,
                                windowType='hann')
        # The chromagram receives the raw frame, not the windowed one printed
        # above.
        print('pitch_class_vectors', chromagram(chunk))
def ninos(filename, gamma=0.94):
    """Detect onsets with the NINOS spectral sparsity measure.

    reference: Mounir, M., Karsmakers, P., & Van Waterschoot, T. (2016).
    Guitar note onset detection based on a spectral sparsity measure.
    European Signal Processing Conference.
    https://doi.org/10.1109/EUSIPCO.2016.7760394

    Args:
        filename: path of the audio file, loaded at 44100 Hz.
        gamma: fraction that sets how many magnitude bins are kept per
            frame, ``J = int(N * gamma / 2)`` with ``N = 2048``.

    Returns:
        The output of ``OnsetPeakPickingProcessor`` (threshold 0.03) applied
        to the detection function.
    """
    N = 2048
    hopSize = int(N / 10)
    J = int(N * gamma / 2)

    audio = MonoLoader(filename=filename, sampleRate=44100)()

    # Build the algorithms once instead of once per frame.
    windowing = Windowing(type='hann')
    fft = FFT()
    to_polar = CartesianToPolar()

    mag = []
    for frame in FrameGenerator(audio, frameSize=N, hopSize=hopSize):
        m = np.asarray(to_polar(fft(windowing(frame)))[0])
        # Keep the J largest magnitudes, in descending order.
        mag.append(np.sort(m)[::-1][:J])

    mag = np.asarray(mag)
    x2 = mag * mag
    inos = np.sum(x2, axis=1) / (np.sum(x2 * x2, axis=1) ** 0.25)
    # Named so that the local does not shadow this function.
    detection_function = inos / (J ** 0.25)
    return OnsetPeakPickingProcessor(threshold=0.03, fps=44100 / hopSize)(detection_function)
def run(self, audio):
    """Track tempo, phase and beats of ``audio`` and store them on self.

    Sets ``self.fft_mag_1024_512``, ``self.fft_phase_1024_512``,
    ``self.bpm``, ``self.phase``, ``self.beats`` and ``self.onset_curve``.
    Nothing is returned.
    """
    hann = Windowing(type='hann')
    melflux = OnsetDetection(method='melflux')
    pool = Pool()

    # Collect all windowed frames first so the FFT can run on them at once.
    for chunk in FrameGenerator(audio, frameSize=self.FRAME_SIZE, hopSize=self.HOP_SIZE):
        pool.add('audio.windowed_frames', hann(chunk))

    spectra = np.fft.fft(pool['audio.windowed_frames']).astype('complex64')
    magnitudes = np.absolute(spectra)
    angles = np.angle(spectra)
    self.fft_mag_1024_512 = magnitudes
    self.fft_phase_1024_512 = angles

    # Melflux onset detection function, one value per frame.
    for frame_mag, frame_angle in zip(magnitudes, angles):
        pool.add('onsets.complex', melflux(frame_mag, frame_angle))

    # Given the ODF, calculate the tempo and the phase
    tempo, _tempo_curve, phase, _phase_curve = BeatTracker.get_tempo_and_phase_from_odf(
        pool['onsets.complex'], self.HOP_SIZE)

    # Calculate the beat annotations
    seconds_per_beat = 60. / tempo
    last_beat = (np.size(audio) / self.SAMPLE_RATE) - seconds_per_beat + phase
    beats = np.arange(phase, last_beat, seconds_per_beat).astype('single')

    # Store all the results
    self.bpm = tempo
    self.phase = phase
    self.beats = beats
    self.onset_curve = BeatTracker.hwr(pool['onsets.complex'])
def f_essentia_extract(Audio):
    """Return what Essentia's ``Onsets`` algorithm detects in ``Audio``.

    The complex-domain onset detection function is computed on frames of
    1024 samples with a hop of 512 and handed to ``Onsets`` as its only
    detection function (weight 1).
    """
    # Library algorithms that locate where each note occurs in time.
    hann = Windowing(type='hann')
    to_complex_spectrum = FFT()  # complex FFT
    to_polar = CartesianToPolar()  # complex -> (magnitude, phase)
    complex_odf = OnsetDetection(method='complex')
    results = essentia.Pool()

    # Compute the onset detection function frame by frame.
    for chunk in FrameGenerator(Audio, frameSize=1024, hopSize=512):
        magnitude, angle = to_polar(to_complex_spectrum(hann(chunk)))
        results.add('features.complex', complex_odf(magnitude, angle))

    # Start of each "note".
    odf_matrix = essentia.array([results['features.complex']])
    return Onsets()(odf_matrix, [1])
def detectBW(audio: list, SR: float, frame_size=256, hop_size=128, floor_db=-90, oversample_f=1):
    """Estimate the bandwidth cut-off frequency of ``audio``.

    For every frame the dB magnitude spectrum is turned into a linear
    spectral envelope, floored at ``floor_db`` and passed to ``compute_fc``.
    Bins accepted by ``energy_verification`` are collected and reduced by
    ``compute_mean_fc``.

    Args:
        audio: input samples.
        SR: sample rate.
        frame_size: analysis frame size before oversampling.
        hop_size: hop size between frames.
        floor_db: floor applied to the envelope.
        oversample_f: factor the frame size is multiplied by.

    Returns:
        ``(fc_bin * SR / frame_size, conf, binary)`` where ``frame_size``
        already includes the oversampling factor.
    """
    # if an oversample factor is desired, apply it
    frame_size = frame_size * oversample_f

    fft = FFT(size=frame_size)
    window = Windowing(size=frame_size, type="hann")

    cutoff_bins = []
    for frame in FrameGenerator(audio, frameSize=frame_size, hopSize=hop_size, startFromZero=True):
        frame_fft = abs(fft(window(frame)))
        magnitudes_db = 20 * np.log10(frame_fft + eps)

        # linear interpolation between the maxima of the spectrum
        envelope = compute_spectral_envelope(magnitudes_db, "linear")
        envelope = modify_floor(envelope, floor_db, log=True)

        fc_index = compute_fc(envelope)
        if energy_verification(frame_fft, fc_index):
            cutoff_bins.append(fc_index)

    if not cutoff_bins:
        cutoff_bins = [frame_size]

    # frame_fft is the spectrum of the last frame; only its length is used.
    fc_bin, conf, binary = compute_mean_fc(cutoff_bins, np.arange(len(frame_fft)), SR)
    return fc_bin * SR / frame_size, conf, binary
def essSaturationDetector(x: list, frameSize=1024, hopSize=512, percentageThrehold=5, **kwargs):
    """Run SaturationDetector over x frame by frame and collect what it reports.

    Args:
        x: (list) input signal
        frameSize: (int) frame size for the analysis in Saturation Detector
        hopSize: (int) hopSize for the analysis in Saturation Detector
        percentageThrehold: (int) percentage of affected frames above which
            the last returned value is True

    Kwargs:
        forwarded to SaturationDetector

    Returns:
        starts: start indexes reported by the detector
        ends: end indexes reported by the detector
        percentage of frames for which the detector reported anything,
            rounded to two decimals
        whether that percentage exceeds percentageThrehold
    """
    detector = SaturationDetector(frameSize=frameSize, hopSize=hopSize, **kwargs)
    starts = []
    ends = []
    affected_frames = 0
    frame_total = 0

    for chunk in FrameGenerator(x, frameSize=frameSize, hopSize=hopSize, startFromZero=True):
        chunk_starts, chunk_ends = detector(chunk)
        starts.extend(chunk_starts)
        ends.extend(chunk_ends)
        if len(chunk_starts) + len(chunk_ends) != 0:
            affected_frames += 1
        frame_total += 1

    percentage = round(100 * affected_frames / frame_total, 2)
    flagged = percentage > percentageThrehold
    return starts, ends, percentage, flagged
# Fragment: computes framewise MFCCs of the phase-aligned audio and displays
# their cosine self-similarity matrix. The names beats, frames_per_second,
# FRAME_SIZE, HOP_SIZE, phase, bpm and audio come from code outside this
# fragment.
beats = beats * frames_per_second
spec = Spectrum(size=FRAME_SIZE - FRAME_SIZE % 2)
w = Windowing(type='hann')
spectrum = Spectrum()  # FFT would return complex FFT, we only want magnitude
mfcc = MFCC()
pool = Pool()
# Step 0: align audio with phase
beats = beats - 0.5
start_sample = int((phase) * (44100.0 * 60 / bpm))
# Step 1: Calculate framewise MFCC
for frame in FrameGenerator(audio[start_sample:], frameSize=FRAME_SIZE, hopSize=HOP_SIZE):
    # The frame is truncated to an even number of samples before the
    # spectrum is taken.
    mfcc_bands, mfcc_coeffs = mfcc(
        spectrum(w(frame[:FRAME_SIZE - (FRAME_SIZE % 2)])))
    pool.add('lowlevel.mfcc', mfcc_coeffs)
    pool.add('lowlevel.mfcc_bands', mfcc_bands)
# Step 2: correlate
print np.shape(pool['lowlevel.mfcc'])
# 1 - cosine distance, i.e. cosine similarity between every pair of frames.
matrix = 1 - pairwise_distances(pool['lowlevel.mfcc'], metric='cosine')
# Colour range clipped to the 1st..99th percentile of the matrix values.
plt.imshow(matrix,
           aspect='auto',
           interpolation='nearest',
           vmin=np.percentile(matrix, 1.0),
           vmax=np.percentile(matrix, 99.0))
int(progress), '%', '\t Time elapsed: ', current - start, ' seconds') ainp = MonoLoader(filename=f, sampleRate=params['fs'])() fHz = dict_fmap[f.split('/')[-1].split('_')[0]] ccoeff = params['fs'] / (2 * fHz) params_ceps['ceps_coeffs'] = (int)(ccoeff) # Extract relevant information(name,midi) name = f.split('/')[-1][:-4] midival = (int)(69 + 12 * np.log2(fHz / 440)) # Select the relevant portion of audio to compute cc using essentia's silence detection function s_3 = StartStopSilence(threshold=-30) for frame in FrameGenerator(ainp, frameSize=params['N'], hopSize=params['H']): sss = s_3(frame) start_frame = (int)(sss[0] * params['H']) stop_frame = (int)(sss[1] * params['H']) ainp = ainp[start_frame:stop_frame] # # Condition to ensure that each has at least num_frames! # if(ainp.shape[0] < num_frames): # continue # Compute the cc's op = cc_calc(ainp, params, params_ceps) # Store cc'c + other relevant parameters in dict results[name] = {}
def run(self, audio):
    """Estimate tempo, phase, beats and downbeats of ``audio``.

    A complex-domain onset detection curve is computed, reduced by its
    adaptive mean, half-wave rectified and autocorrelated. The tempo is the
    candidate between ``self.minBpm`` and ``self.maxBpm`` (step
    ``self.stepBpm``) with the largest mean autocorrelation at its beat
    lags. The phase is the offset, searched in 1 ms steps over one beat,
    with the largest mean novelty at its beat positions.

    Sets ``self.bpm``, ``self.phase``, ``self.beats`` and
    ``self.downbeats``; prints the selected phase. Nothing is returned.
    """
    # TODO put this in some util class
    # Step 0: calculate the CSD (Complex Spectral Difference) features
    # and the associated onset detection function
    spec = Spectrum(size=self.FRAME_SIZE)
    w = Windowing(type='hann')
    fft = FFT()
    c2p = CartesianToPolar()
    od_csd = OnsetDetection(method='complex')
    pool = Pool()
    # TODO test faster (numpy) way
    for frame in FrameGenerator(audio, frameSize=self.FRAME_SIZE, hopSize=self.HOP_SIZE):
        mag, phase = c2p(fft(w(frame)))
        pool.add('onsets.complex', od_csd(mag, phase))
    # Step 1: normalise the data using an adaptive mean threshold
    novelty_mean = self.adaptive_mean(pool['onsets.complex'], 16.0)
    # Step 2: half-wave rectify the result
    novelty_hwr = (pool['onsets.complex'] - novelty_mean).clip(min=0)
    # Step 3: then calculate the autocorrelation of this signal
    novelty_autocorr = self.autocorr(novelty_hwr)
    # Step 4: Sum over constant intervals to detect most likely BPM
    valid_bpms = np.arange(self.minBpm, self.maxBpm, self.stepBpm)
    for bpm in valid_bpms:
        frames = (
            np.round(
                np.arange(0, np.size(novelty_autocorr),
                          self.numFramesPerBeat(bpm))).astype('int')
        )[: -1]  # Discard last value to prevent reading beyond array (last value rounded up for example)
        pool.add('output.bpm', np.sum(novelty_autocorr[frames]) / np.size(frames))
    bpm = valid_bpms[np.argmax(pool['output.bpm'])]
    # Step 5: Calculate phase information
    valid_phases = np.arange(0.0, 60.0 / bpm, 0.001)  # Valid phases in SECONDS
    for phase in valid_phases:
        # Convert phase from seconds to frames
        # NOTE(review): sample rate and hop size are hard-coded here while the
        # framing above uses self.HOP_SIZE -- confirm they agree.
        phase_frames = (phase * 44100.0) / (512.0)
        frames = (
            np.round(
                np.arange(phase_frames, np.size(novelty_hwr),
                          self.numFramesPerBeat(bpm))).astype('int')
        )[: -1]  # Discard last value to prevent reading beyond array (last value rounded up for example)
        pool.add('output.phase', np.sum(novelty_hwr[frames]) / np.size(frames))
    phase = valid_phases[np.argmax(pool['output.phase'])]
    print 'PHASE', phase
    # Step 6: Determine the beat locations
    spb = 60. / bpm  #seconds per beat
    beats = (np.arange(phase, (np.size(audio) / 44100) - spb + phase,
                       spb).astype('single'))
    # Store all the results
    self.bpm = bpm
    self.phase = phase
    self.beats = beats
    self.downbeats = self.calculateDownbeats(audio, bpm, phase)
def __iter__(self) -> Iterable['AudioSequence']:
    """Iterate over the audio frame by frame, wrapping each frame with ``self.new``.

    Frames are cut from ``self.audio`` with frame size ``self.fs`` and hop
    size ``self.hs``.
    """
    frame_source = FrameGenerator(self.audio, frameSize=self.fs, hopSize=self.hs)
    return (self.new(chunk) for chunk in frame_source)
def run(self, audio):
    """Estimate tempo, phase and beat positions of ``audio``.

    A melflux onset detection curve is computed, reduced by its adaptive
    mean, half-wave rectified and autocorrelated. The tempo is the candidate
    between ``self.minBpm`` and ``self.maxBpm`` (step ``self.stepBpm``) with
    the largest mean autocorrelation at its beat lags. The phase is the
    offset, searched in 1 ms steps over one beat, with the largest mean
    novelty at its beat positions.

    Sets ``self.bpm``, ``self.phase`` and ``self.beats``. Nothing is
    returned.
    """

    def numFramesPerBeat(bpm):
        # Number of analysis frames (hops) in one beat at the given tempo.
        return (60.0 * self.SAMPLE_RATE) / (self.HOP_SIZE * bpm)

    def autocorr(x):
        # Keep the non-negative lags only. Floor division keeps the slice
        # index an int; a float index raises TypeError on Python 3.
        result = np.correlate(x, x, mode='full')
        return result[result.size // 2:]

    def adaptive_mean(x, N):
        # Moving sum over N samples divided by N.
        return np.convolve(x, [1.0] * int(N), mode='same') / N

    # Step 0: calculate the onset detection function
    w = Windowing(type='hann')
    fft = np.fft.fft
    od_csd = OnsetDetection(method='melflux')
    pool = Pool()
    for frame in FrameGenerator(audio, frameSize=self.FRAME_SIZE, hopSize=self.HOP_SIZE):
        pool.add('audio.windowed_frames', w(frame))
    fft_result = fft(pool['audio.windowed_frames']).astype('complex64')
    fft_result_mag = np.absolute(fft_result)
    fft_result_ang = np.angle(fft_result)
    for mag, phase in zip(fft_result_mag, fft_result_ang):
        pool.add('onsets.complex', od_csd(mag, phase))

    # Step 1: normalise the data using an adaptive mean threshold
    novelty_mean = adaptive_mean(pool['onsets.complex'], 16.0)

    # Step 2: half-wave rectify the result
    novelty_hwr = (pool['onsets.complex'] - novelty_mean).clip(min=0)

    # Step 3: then calculate the autocorrelation of this signal
    novelty_autocorr = autocorr(novelty_hwr)

    # Step 4: Sum over constant intervals to detect most likely BPM
    valid_bpms = np.arange(self.minBpm, self.maxBpm, self.stepBpm)
    for bpm in valid_bpms:
        frames = (
            np.round(
                np.arange(0, np.size(novelty_autocorr),
                          numFramesPerBeat(bpm))).astype('int')
        )[:-1]  # Discard last value to prevent reading beyond array (last value rounded up for example)
        pool.add('output.bpm', np.sum(novelty_autocorr[frames]) / np.size(frames))
    bpm = valid_bpms[np.argmax(pool['output.bpm'])]

    # Step 5: Calculate phase information
    valid_phases = np.arange(0.0, 60.0 / bpm, 0.001)  # Valid phases in SECONDS
    for phase in valid_phases:
        # Convert phase from seconds to frames
        # NOTE(review): 44100 and 512 are hard-coded here while
        # numFramesPerBeat uses self.SAMPLE_RATE and self.HOP_SIZE -- confirm
        # they agree.
        phase_frames = (phase * 44100.0) / (512.0)
        frames = (
            np.round(
                np.arange(phase_frames, np.size(novelty_hwr),
                          numFramesPerBeat(bpm))).astype('int')
        )[:-1]  # Discard last value to prevent reading beyond array (last value rounded up for example)
        pool.add('output.phase', np.sum(novelty_hwr[frames]) / np.size(frames))
    phase = valid_phases[np.argmax(pool['output.phase'])]

    # Step 6: Determine the beat locations
    spb = 60. / bpm  # seconds per beat
    beats = (np.arange(phase, (np.size(audio) / 44100) - spb + phase,
                       spb).astype('single'))

    # Store all the results
    self.bpm = bpm
    self.phase = phase
    self.beats = beats
def to_frames(self, audio: np.ndarray) -> list:
    """Cut ``audio`` into frames and return them as a list.

    Frames are produced by ``FrameGenerator`` with frame size ``self.fs``
    and hop size ``self.hs``.
    """
    # The annotation is np.ndarray: np.array is a factory function, not a
    # type.
    return list(FrameGenerator(audio, frameSize=self.fs, hopSize=self.hs))
# Plot the smoothed energy of the song given on the command line.
s1 = Song(sys.argv[1])
s1.open()
s1.openAudio()
audio = s1.audio

# One frame spans half a beat at the song's tempo; frames overlap by half.
FRAME_SIZE = int(44100 * (60.0 / s1.tempo) / 2)
# Floor division keeps the hop size an int on Python 3 as well.
HOP_SIZE = FRAME_SIZE // 2


def adaptive_mean(x, N):
    """Return the moving sum of ``x`` over ``int(N)`` samples divided by ``N``."""
    return np.convolve(x, [1.0] * int(N), mode='same') / N


pool = Pool()
for frame in FrameGenerator(audio, frameSize=FRAME_SIZE, hopSize=HOP_SIZE):
    # Mean squared amplitude of the frame.
    pool.add('lowlevel.rms', np.average(frame**2))

adaptive_mean_rms = adaptive_mean(
    pool['lowlevel.rms'], 64)  # Mean of rms in window of [-4 dbeats, + 4 dbeats]
mean_rms = np.mean(adaptive_mean_rms)
adaptive_mean_odf = adaptive_mean(s1.onset_curve,
                                  int((44100 * 60 / s1.tempo) / 512) * 4)  # -4 dbeats, +4 dbeats
adaptive_mean_odf_2 = adaptive_mean(adaptive_mean_odf, 8)
mean_odf = np.mean(adaptive_mean_odf)

# Smoothed RMS normalised to its maximum, on a 0..1 time axis.
plt.plot(np.linspace(0.0, 1.0, adaptive_mean_rms.size),
         adaptive_mean_rms / max(adaptive_mean_rms),
         c='r')
def run(self):
    """Stream the audio file named on the command line and send beat times over UDP.

    The file is cut into non-overlapping chunks of ``CHUNK`` samples. For
    each chunk the onset and beat worker threads are triggered through
    their queues and their result pools are read back. On every
    ``N_BEATS``-th beat two messages of the form
    ``"<time>,<frame energy>,<onset rate>"`` are sent to every
    ``(UDP_IP, UDP_PORT[i])``. The loop stops early when
    ``self.stoprequest`` is set.
    """
    # Despite its name, loader holds the decoded samples, not the algorithm.
    loader = essentia.standard.MonoLoader(filename=sys.argv[1])()
    msg = "0"
    socks = [
        socket.socket(socket.AF_INET, socket.SOCK_DGRAM),
        socket.socket(socket.AF_INET, socket.SOCK_DGRAM),
        socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    ]
    beat_count = 0
    N_BEATS = 4
    # The current chunk is published through this module-level name.
    global frame_g
    frame_q_o = Queue.Queue()
    result_q_o = Queue.Queue()
    frame_q_b = Queue.Queue()
    result_q_b = Queue.Queue()
    ot = OnsetThread(frame_q_o, result_q_o)
    bt = BeatThread(frame_q_b, result_q_b)
    ot.daemon = True
    bt.daemon = True
    ot.start()
    bt.start()
    for frame in FrameGenerator(loader, frameSize=CHUNK, hopSize=CHUNK, startFromZero=True):
        frame_g = frame
        if self.stoprequest.isSet():
            break
        else:
            # Blocks until an item is available on play_started.
            self.play_started.get()
            start_time = time.time()
            # Trigger both workers, then wait for their results.
            frame_q_b.put(True)
            frame_q_o.put(True)
            energy = Energy()
            frame_energy = energy(frame)
            pool2 = result_q_o.get()
            pool = result_q_b.get()
            bpm = pool['Rhythm.bpm']
            spb = 60.0 / bpm if bpm > 0 else 0.0
            look_ahead_n = 16
            beats = pool['Rhythm.ticks']
            onset = pool2['Rhythm.onsetRate']
            for i, b in enumerate(beats):
                beat_count += 1
                if (beat_count % N_BEATS == 0):
                    # Timestamps look_ahead_n / 2 and look_ahead_n beats
                    # after this beat.
                    half_beat = start_time + b + spb * (look_ahead_n / 2)
                    next_beat = start_time + b + spb * look_ahead_n
                    for i, sock in enumerate(socks):
                        sock.sendto(
                            str(half_beat) + "," + str(frame_energy) + "," +
                            str(onset), (UDP_IP, UDP_PORT[i]))
                        sock.sendto(
                            str(next_beat) + "," + str(frame_energy) + "," +
                            str(onset), (UDP_IP, UDP_PORT[i]))
            ''' print "beats: ", pool['Rhythm.ticks'] print "energy: ", frame_energy print "onset: ", pool2['Rhythm.onsetRate'] print "bpm: ", pool['Rhythm.bpm'] print "spb: ", spb print "bar started: ", start_time print "time now: ", time.time() print "next_beat: ", next_beat '''
            # Signal that this chunk has been processed.
            self.extract_done.put(True)
    ot.stop()
    bt.stop()
def feature_allframes(audio, beats, frame_indexer = None):
    """Compute onset-curve features for the beats selected by ``frame_indexer``.

    A spectral-flux onset detection curve is computed for ``audio``,
    reduced by its adaptive mean, half-wave rectified and divided by its
    average. For every selected beat ``i`` the 21 feature columns hold:
    columns 0-3, the maximum 'valid' correlation of the curve between beat
    segments; columns 4-5, differences of half-beat curve integrals around
    the beat; columns 6-20, integrals of the following 15 half-beats
    relative to the current one.

    Args:
        audio: samples to analyse.
        beats: beat positions; converted to frame indices assuming 44100 Hz
            and a hop size of 512 (presumably seconds -- confirm).
        frame_indexer: indices into ``beats`` to compute features for;
            defaults to ``range(4, len(beats) - 1)``.

    Returns:
        ``preprocessing.scale`` applied to the selected feature rows.
    """
    # Initialise the algorithms
    FRAME_SIZE = 1024
    HOP_SIZE = 512
    spec = Spectrum(size = FRAME_SIZE)
    w = Windowing(type = 'hann')
    fft = np.fft.fft
    od_csd = OnsetDetection(method = 'complex')
    od_hfc = OnsetDetection(method = 'flux')  # flux method, despite the variable name
    pool = Pool()
    # Calculate onset detection curve on audio
    for frame in FrameGenerator(audio, frameSize = FRAME_SIZE, hopSize = HOP_SIZE):
        pool.add('windowed_frames', w(frame))
    fft_result = fft(pool['windowed_frames']).astype('complex64')
    fft_result_mag = np.absolute(fft_result)
    fft_result_ang = np.angle(fft_result)
    for mag,phase in zip(fft_result_mag, fft_result_ang):
        pool.add('onsets.flux', od_hfc(mag, phase))
    # Normalize and half-rectify onset detection curve
    def adaptive_mean(x, N):
        return np.convolve(x, [1.0]*int(N), mode='same')/N
    novelty_mean = adaptive_mean(pool['onsets.flux'], 16.0)
    novelty_hwr = (pool['onsets.flux'] - novelty_mean).clip(min=0)
    novelty_hwr = novelty_hwr / np.average(novelty_hwr)
    # For every frame in frame_indexer,
    if frame_indexer is None:
        frame_indexer = list(range(4,len(beats) - 1)) # Exclude first frame, because it has no predecessor to calculate difference with
    # Feature: correlation between current frame onset detection f and of previous frame
    # Feature: correlation between current frame onset detection f and of next frame
    # Feature: diff between correlation between current frame onset detection f and corr cur and next
    onset_integrals = np.zeros((2 * len(beats), 1))
    frame_i = (np.array(beats) * 44100.0/ HOP_SIZE).astype('int')
    onset_correlations = np.zeros((len(beats), 21))
    # Integrate the curve over both halves of every beat that is selected or
    # lies up to 1 before / 7 after a selected beat.
    # NOTE(review): frame_i[i+1] is read for every chosen i, and the loop
    # below reads up to frame_i[i+4] and onset_integrals[2*i + 15]; indices
    # near the end of ``beats`` (including the default frame_indexer) look
    # like they can run out of range -- confirm against callers.
    for i in [i for i in range(len(beats)) if (i in frame_indexer) or (i+1 in frame_indexer) or (i-1 in frame_indexer) or (i-2 in frame_indexer) or (i-3 in frame_indexer) or (i-4 in frame_indexer) or (i-5 in frame_indexer) or (i-6 in frame_indexer) or (i-7 in frame_indexer)]:
        half_i = int((frame_i[i] + frame_i[i+1]) / 2)
        cur_frame_1st_half = novelty_hwr[frame_i[i] : half_i]
        cur_frame_2nd_half = novelty_hwr[half_i : frame_i[i+1]]
        onset_integrals[2*i] = np.sum(cur_frame_1st_half)
        onset_integrals[2*i + 1] = np.sum(cur_frame_2nd_half)
    # Step 2: Calculate the cosine distance between the MFCC values
    # NOTE(review): the comment above looks stale -- the code below correlates
    # the onset curve and uses no MFCCs.
    for i in frame_indexer:
        onset_correlations[i][0] = max(np.correlate(novelty_hwr[frame_i[i-1] : frame_i[i]], novelty_hwr[frame_i[i] : frame_i[i+1]], mode='valid')) # Only 1 value
        onset_correlations[i][1] = max(np.correlate(novelty_hwr[frame_i[i] : frame_i[i+1]], novelty_hwr[frame_i[i+1] : frame_i[i+2]], mode='valid')) # Only 1 value
        onset_correlations[i][2] = max(np.correlate(novelty_hwr[frame_i[i] : frame_i[i+1]], novelty_hwr[frame_i[i+2] : frame_i[i+3]], mode='valid')) # Only 1 value
        onset_correlations[i][3] = max(np.correlate(novelty_hwr[frame_i[i] : frame_i[i+1]], novelty_hwr[frame_i[i+3] : frame_i[i+4]], mode='valid')) # Only 1 value
        # Difference in integrals of novelty curve between frames
        # Quantifies the difference in number and prominence of onsets in this frame
        onset_correlations[i][4] = onset_integrals[2*i] - onset_integrals[2*i-1]
        onset_correlations[i][5] = onset_integrals[2*i+2] + onset_integrals[2*i+3] - onset_integrals[2*i-1] - onset_integrals[2*i-2]
        for j in range(1,16):
            onset_correlations[i][5 + j] = onset_integrals[2*i + j] - onset_integrals[2*i]
    # Include the MFCC coefficients as features
    # NOTE(review): no MFCC features are added here; only the onset features
    # are returned.
    result = onset_correlations[frame_indexer]
    return preprocessing.scale(result)