def get_sines_per_frame(audio, sr=44100, onlyfrecuencies=False, nsines=20):
    """
    Perform a frame-wise sinusoidal model analysis of an audio signal.

    The audio is cut into frames of 4096 samples with a hop of 2048 samples.
    For every frame the sinusoids are sorted by ascending frequency.

    :param audio: Audio either mono or stereo. Multi-channel input is mixed
        down to mono with ``std.MonoMixer``.
    :param sr: Sample rate of ``audio``. It is forwarded to the sinusoidal
        analysis; previously it was silently ignored and the analysis always
        ran with the algorithm's default sample rate.
    :param onlyfrecuencies: When true, only the frequencies are returned.
    :param nsines: Maximum number of sinusoids extracted per frame.
    :return: ``frequencies`` when ``onlyfrecuencies`` is true, otherwise the
        tuple ``(frequencies, magnitudes)``. Each array has one row per frame
        (shape ``N x nsines``). Audio that yields no frame gives arrays with
        zero rows.
    """
    if audio.ndim > 1:
        audio = std.MonoMixer()(audio, audio.shape[1])

    fft_algo = std.FFT()
    sine_anal = std.SineModelAnal(sampleRate=sr,
                                  maxnSines=nsines,
                                  orderBy='frequency',
                                  minFrequency=1)

    # Single pass over the frames: results are collected in a list instead of
    # generating every frame twice (once only to count them).
    per_frame = []
    for frame in std.FrameGenerator(audio, frameSize=4096, hopSize=2048):
        freqs, mags, _ = sine_anal(fft_algo(frame))
        order = np.argsort(freqs)
        per_frame.append([freqs[order], mags[order]])

    if per_frame:
        sines = np.asarray(per_frame, dtype=np.float32)
    else:
        # No frames at all: keep the documented N x 2 x nsines layout, N = 0.
        sines = np.zeros([0, 2, nsines], dtype=np.float32)

    if onlyfrecuencies:
        return sines[:, 0, :]
    return sines[:, 0, :], sines[:, 1, :]
def analysisSynthesis(params, signal):
    """
    Run an STFT analysis / synthesis round trip over ``signal``.

    Pipeline per frame: Hann windowing > FFT > IFFT > overlap-add. No
    spectral transformation is applied; the function is a template for one.

    :param params: Dict providing at least ``'frameSize'`` and ``'hopSize'``
        (it is also handed to ``cutFrames``).
    :param signal: Input samples. Half a frame of zeros is appended before
        framing.
    :return: 1-D array with the concatenated overlap-add output. The first
        ``frameSize / (2 * hopSize)`` frames are still processed but not
        emitted. An empty array is returned when nothing is emitted.
    """
    frame_size = params['frameSize']
    hop_size = params['hopSize']

    # Integer division: a float length is rejected by zeros() on Python 3.
    signal = numpy.append(signal, zeros(int(frame_size // 2)))
    frames = cutFrames(params, signal)

    w = std.Windowing(type="hann")
    fft = std.FFT(size=frame_size)
    ifft = std.IFFT(size=frame_size)
    overl = std.OverlapAdd(frameSize=frame_size,
                           hopSize=hop_size,
                           gain=1. / frame_size)

    # Number of leading frames that are processed but not emitted.
    skipped_frames = frame_size / (2 * hop_size)

    # Collect the output frames and join them once. The previous version
    # seeded the output with array(0), which put a spurious 0 sample in front
    # of the signal, and re-copied the whole output on every frame.
    chunks = []
    for counter, f in enumerate(frames):
        # STFT analysis
        infft = fft(w(f))
        # here we could apply spectral transformations
        outfft = infft
        # STFT synthesis; every frame goes through the overlap-add, including
        # the skipped ones.
        outframe = overl(ifft(outfft))
        if counter >= skipped_frames:
            chunks.append(outframe)

    if not chunks:
        return numpy.array([])
    return numpy.concatenate(chunks)
def get_hpeaks_per_frame(audio, sr=44100, onlyfrecuencies=False, nsines=20):
    """
    Get the harmonic peaks of an audio signal, frame by frame.

    The audio is cut into frames of 4096 samples with a hop of 2048 samples.
    For every frame a pitch is estimated with ``std.PitchYin``, sinusoids are
    extracted with ``std.SineModelAnal``, and both are handed to
    ``std.HarmonicPeaks``.

    :param audio: Audio either mono or stereo. Multi-channel input is mixed
        down to mono with ``std.MonoMixer``.
    :param sr: Sample rate of ``audio``. It is forwarded to the pitch
        estimator and the sinusoidal analysis; previously it was ignored.
    :param onlyfrecuencies: When true, only the frequencies are returned.
    :param nsines: Maximum number of sinusoids extracted per frame before the
        harmonic peak selection.
    :return: ``frequencies`` when ``onlyfrecuencies`` is true, otherwise the
        tuple ``(frequencies, magnitudes)``, one row per frame. Audio that
        yields no frame gives empty arrays instead of raising.
    """
    if audio.ndim > 1:
        audio = std.MonoMixer()(audio, audio.shape[1])

    fft_algo = std.FFT()
    pyin = std.PitchYin(sampleRate=sr)
    hpeaks = std.HarmonicPeaks()
    sine_anal = std.SineModelAnal(sampleRate=sr,
                                  maxnSines=nsines,
                                  orderBy='frequency',
                                  minFrequency=1)

    sines = []
    for frame in std.FrameGenerator(audio, frameSize=4096, hopSize=2048):
        pitch, _ = pyin(frame)
        freqs, mags, _ = sine_anal(fft_algo(frame))

        # Sort by ascending frequency, then drop the zero-frequency entries
        # before the harmonic peak selection.
        order = np.argsort(freqs)
        freqs, mags = freqs[order], mags[order]
        non_zero = freqs != 0
        freqs, mags = freqs[non_zero], mags[non_zero]

        freqs, mags = hpeaks(freqs, mags, pitch)
        sines.append([freqs, mags])

    if not sines:
        # np.array([]) is 1-D, so the three-index slicing below would fail.
        empty = np.zeros((0, 0), dtype=np.float32)
        return empty if onlyfrecuencies else (empty, empty.copy())

    sines = np.array(sines)
    if onlyfrecuencies:
        return sines[:, 0, :]
    return sines[:, 0, :], sines[:, 1, :]
def get_onsets(self, _audio=None):
    """
    Detect onsets in an audio signal.

    The signal is analysed in frames of 1024 samples with a hop of 512
    samples. Each frame is windowed with ``self.winType``, and the detection
    function selected by ``self.onsetMethod`` is computed from its spectrum.

    :param _audio: Samples to analyse. When omitted, ``None`` or empty,
        ``self.audio`` is used instead (an explicit ``[]`` keeps working).
    :return: The output of ``es.Onsets`` for the computed detection function.
    """
    # `_audio != []` compared an array with a list element-wise; an explicit
    # None / length test avoids that and the shared mutable default.
    if _audio is None or len(_audio) == 0:
        audio = self.audio
    else:
        audio = _audio

    W = es.Windowing(type=self.winType)
    c2p = es.CartesianToPolar()
    fft = es.FFT()
    onsetDetection = es.OnsetDetection(method=self.onsetMethod,
                                       sampleRate=44100)
    onsets = es.Onsets(alpha=.2)

    pool = Pool()
    for frame in es.FrameGenerator(audio, frameSize=1024, hopSize=512):
        mag, phase = c2p(fft(W(frame)))
        # The detector is already configured with self.onsetMethod above.
        # Re-configuring it on every frame was redundant.
        # NOTE(review): reconfiguring presumably also resets the detector's
        # frame-to-frame state -- confirm against the Essentia sources.
        pool.add("onsetFunction", onsetDetection(mag, phase))

    # Onsets takes a list of detection functions and one weight per function.
    return onsets([pool["onsetFunction"]], [1])
def analysisSynthesis(params, signal):
    """
    Run an STFT analysis / synthesis round trip over ``signal``.

    Pipeline: frame cutter > windowing > FFT > IFFT > overlap-add. No
    spectral transformation is applied; the function is a template for one.

    :param params: Dict providing at least ``'frameSize'`` and ``'hopSize'``
        (it is also handed to ``cutFrames``).
    :param signal: Input samples.
    :return: 1-D array with the concatenated overlap-add output. The first
        ``frameSize / (2 * hopSize)`` frames are still processed but not
        emitted. An empty array is returned when nothing is emitted.
    """
    frame_size = params['frameSize']
    hop_size = params['hopSize']

    frames = cutFrames(params, signal)

    w = std.Windowing(type="hann")
    fft = std.FFT(size=frame_size)
    ifft = std.IFFT(size=frame_size)
    overl = std.OverlapAdd(frameSize=frame_size, hopSize=hop_size)

    # Number of leading frames that are processed but not emitted.
    skipped_frames = frame_size / (2 * hop_size)

    # Collect the output frames and join them once. The previous version
    # seeded the output with array(0), which put a spurious 0 sample in front
    # of the signal, and re-copied the whole output on every frame.
    chunks = []
    for counter, f in enumerate(frames):
        # STFT analysis
        infft = fft(w(f))
        # here we could apply spectral transformations
        outfft = infft
        # STFT synthesis; every frame goes through the overlap-add, including
        # the skipped ones.
        outframe = overl(ifft(outfft))
        if counter >= skipped_frames:
            chunks.append(outframe)

    if not chunks:
        return numpy.array([])
    return numpy.concatenate(chunks)
def segment(audio, hopSize, frameSize, rms_onset_threshold,
            mel_onset_threshold, flux_onset_threshold, onset_threshold):
    """
    Locate the onset frame and the start-of-sustain frame in ``audio``.

    The onset is the first frame where the flux or the mel-flux detection
    value exceeds its threshold while the RMS detection value and the
    loudness exceed theirs. The sustain starts at the first later frame where
    the pitch confidence is above 0.5 and both the RMS and mel-flux detection
    values have dropped below a fraction (5 % and 30 %) of their value at the
    onset.

    :param audio: Mono audio samples.
    :param hopSize: Hop size in samples.
    :param frameSize: Frame size in samples.
    :param rms_onset_threshold: Threshold on the 'rms' detection value.
    :param mel_onset_threshold: Threshold on the 'melflux' detection value.
    :param flux_onset_threshold: Threshold on the 'flux' detection value.
    :param onset_threshold: Threshold on the frame loudness.
    :return: Tuple ``(onset, sustain)`` of frame indices; each is ``None``
        when the corresponding event was not found.
    """
    # init algorithms
    o_mel = estd.OnsetDetection(method='melflux')
    o_rms = estd.OnsetDetection(method='rms')
    o_flux = estd.OnsetDetection(method='flux')
    fft = estd.FFT()
    c2p = estd.CartesianToPolar()
    w = estd.Windowing(type='hann')
    yin = estd.PitchYinFFT(frameSize=frameSize,
                           minFrequency=40,
                           maxFrequency=2500,
                           interpolate=True)
    spectrum = estd.Spectrum()
    loudness = estd.Loudness()
    frame_generator = estd.FrameGenerator(audio,
                                          frameSize=frameSize,
                                          hopSize=hopSize)

    # control parameters
    attack = False
    detection = True
    mel_onset_value = 0
    rms_onset_value = 0

    # output variables
    onset = None
    sustain = None

    for index, frame in enumerate(frame_generator):
        windowed = w(frame)  # windowed once, shared by both analyses
        mag, phase = c2p(fft(windowed))
        _, conf = yin(spectrum(windowed))
        loud = loudness(frame)

        mel_onset = o_mel(mag, phase)
        rms_onset = o_rms(mag, phase)
        flux_onset = o_flux(mag, phase)

        # condition for onset
        if detection and (flux_onset > flux_onset_threshold
                          or mel_onset > mel_onset_threshold) \
                and rms_onset > rms_onset_threshold and loud > onset_threshold:
            onset = index
            attack = True
            detection = False
            mel_onset_value = mel_onset
            rms_onset_value = rms_onset

        # condition for beginning of sustain
        if attack and conf > 0.5 and rms_onset < rms_onset_value * .05 \
                and mel_onset < mel_onset_value * .3:
            sustain = index
            # Both results are final from here on (neither condition can fire
            # again), so the remaining frames need not be analysed.
            break

    return onset, sustain
def stft(audio, params):
    """
    Compute the magnitude spectrogram of ``audio``.

    :param audio: Mono audio samples.
    :param params: Sequence ``(hopSize, frameSize, wtype)``.
    :return: Tuple ``(magnitudes, hopSize)`` where ``magnitudes`` has one row
        per frame.
    """
    # TODO: add fft size
    # NOTE(review): `wtype` is unpacked but no window is applied to the
    # frames -- confirm whether windowing was intended before changing it.
    hopSize, frameSize, wtype = params

    # One FFT algorithm reused for every frame instead of a new one per frame.
    fft = ess.FFT()
    result = [
        fft(frame)
        for frame in ess.FrameGenerator(audio,
                                        frameSize=frameSize,
                                        hopSize=hopSize)
    ]
    return np.abs(np.asarray(result)), hopSize
def nSinesRead(audio_vector):
    """
    Extract up to 20 sinusoids per frame from ``audio_vector``.

    Frames are generated with ``estd.FrameGenerator(audio_vector, 2048,
    1024)`` and analysed with a 2048-point FFT followed by
    ``estd.SineModelAnal``.

    :param audio_vector: Mono audio samples.
    :return: Tuple ``(freqs, mags)`` of arrays with one row per frame. Audio
        that yields no frame gives arrays with zero rows instead of raising.
    """
    max_sines = 20
    sineanal = estd.SineModelAnal(maxnSines=max_sines)
    fft_calc = estd.FFT(size=2048)

    freq_rows = []
    mag_rows = []
    for frame in estd.FrameGenerator(audio_vector, 2048, 1024):
        # The third output of the analysis is not part of the result, so it
        # is no longer stacked into the intermediate array.
        frame_freqs, frame_mags, _ = sineanal(fft_calc(frame))
        freq_rows.append(frame_freqs)
        mag_rows.append(frame_mags)

    if not freq_rows:
        empty = np.zeros((0, max_sines), dtype=np.float32)
        return empty, empty.copy()

    return np.array(freq_rows), np.array(mag_rows)
def calculate_function(self):
    """
    Compute the onset detection function of ``self.signal``.

    Every frame (``self.frameSize`` / ``self.hopSize``) is windowed with
    ``self.window``, converted to magnitude and phase, and passed to
    ``self.calcOnsetFunc``. The results are stored in ``self.onsetFunction``
    (float32) and the matching time axis in ``self.onsetTime``: frame index
    times ``hopSize / sampleRate``, shifted back by one hop.
    """
    to_polar = es.CartesianToPolar()
    transform = es.FFT()
    frames = es.FrameGenerator(self.signal,
                               frameSize=self.frameSize,
                               hopSize=self.hopSize)

    values = []
    for frame in frames:
        magnitude, phase = to_polar(transform(self.window(frame)))
        values.append(self.calcOnsetFunc(magnitude, phase))

    self.onsetFunction = np.array(values, dtype=np.float32)

    # Time axis: one hop per frame, then shifted back by one hop.
    hop_duration = self.hopSize / self.sampleRate
    self.onsetTime = np.arange(len(values)) * hop_duration
    self.onsetTime -= hop_duration
def getOnsetFunctions(fname):
    """
    Compute band-wise spectral-flux onset functions for an audio file.

    Each frame is zero-padded to ``params.Nfft`` samples, Hamming-windowed
    and transformed. The positive part of the magnitude difference to the
    previous frame is summed over every frequency band in ``params.fBands``.

    :param fname: Path of the audio file to load.
    :return: Array with one row per frame: the frame time in the first
        column, then one flux value per band.
    """
    logger = log.get_logger("rhythm")
    zeropadLen = params.Nfft - params.frmSize
    zz = np.zeros((zeropadLen, ), dtype='float32')
    frameCounter = 0
    # Integer division: a float shape is rejected by np.zeros on Python 3.
    bufferFrame = np.zeros((int(params.Nfft // 2) + 1, ))

    logger.info('Reading audio file...')
    audio = ess.MonoLoader(filename=fname)()
    fft = ess.FFT(size=params.Nfft)  # this gives us a complex FFT
    c2p = ess.CartesianToPolar()  # and this turns it into (magnitude, phase)
    pool = es.Pool()
    w = ess.Windowing(type="hamming")
    fTicks = params.fTicks
    poolName = 'features.flux'

    logger.info('Extracting Onset functions...')
    for frame in ess.FrameGenerator(audio,
                                    frameSize=params.frmSize,
                                    hopSize=params.hop):
        # Time of the frame centre.
        frmTime = params.hop / params.Fs * frameCounter + params.frmSize / (
            2.0 * params.Fs)
        zpFrame = np.hstack((frame, zz))
        mag, phase, = c2p(fft(w(zpFrame)))
        magFlux = mag - bufferFrame
        bufferFrame = np.copy(mag)  # kept for the next iteration's flux

        for bands in range(params.numBands):
            chosenInd = (fTicks >= params.fBands[bands, 0]) & (
                fTicks <= params.fBands[bands, 1])
            magFluxBand = magFlux[chosenInd]
            # Half-wave rectification: keep only the increases in magnitude.
            magFluxBand = (magFluxBand + abs(magFluxBand)) / 2
            oFn = magFluxBand.sum()
            if math.isnan(oFn):
                print("NaN found here")
            pool.add(poolName + str(bands), oFn)

        pool.add('features.time', frmTime)
        frameCounter += 1
        if not np.mod(frameCounter, 10000):
            logger.info(
                str(frameCounter) + '/' + str(audio.size / params.hop) + '...')

    logger.info('Total frames processed = ' + str(frameCounter))

    # One row for the time stamps, then one per band; stacked in one call.
    rows = [es.array([pool['features.time']])]
    for bands in range(params.numBands):
        rows.append(es.array([pool[poolName + str(bands)]]))
    all_feat = np.vstack(rows)
    return np.transpose(all_feat)
def OnsetsSegmentation(audio,
                       frame_size=1024,
                       frame_hop=512,
                       windowing_type='hann',
                       onsets_method='hfc'):
    """
    Slice ``audio`` into segments delimited by its detected onsets.

    :param audio: Mono audio samples.
    :param frame_size: Analysis frame size in samples.
    :param frame_hop: Analysis hop size in samples.
    :param windowing_type: Window type given to the windowing algorithm.
    :param onsets_method: Method given to the onset detection algorithm.
    :return: Tuple ``(segments, onsets)``: the slices of ``audio`` produced
        by the slicer, and the detected onsets used as slice boundaries.
    """
    # Algorithms used below.
    windowing = es_mode.Windowing(type=windowing_type)
    spectrum_algo = es_mode.Spectrum()
    fft = es_mode.FFT()
    to_polar = es_mode.CartesianToPolar()
    onset_detection = es_mode.OnsetDetection(method=onsets_method)
    pool = es.Pool()

    # Function executed on every frame.
    def process_frame(frame):
        windowed = windowing(frame)
        spectrum = spectrum_algo(windowed)
        _, phase = to_polar(fft(windowed))
        pool.add('features.spectrum', spectrum)
        pool.add('features.', phase)
        pool.add('features.onsetdetection', onset_detection(spectrum, phase))

    # Builds the continuous onset function, one value per frame.
    frame_count = inFrames(audio=audio,
                           algorithm=process_frame,
                           frameSize=frame_size,
                           hopSize=frame_hop)

    audio_duration = es_mode.Duration()(audio)
    onset_picker = es_mode.Onsets(frameRate=frame_count / audio_duration)

    # Matrix of the onset detection functions that were computed.
    detection_matrix = es.array([pool['features.onsetdetection']])
    # The second argument is the weight vector, one weight per function.
    onsets = onset_picker(detection_matrix, [1])

    # Every segment starts at 0 or at an onset, and ends at the next onset or
    # at the end of the audio.
    start_times = es.array(np.append([0], onsets))
    end_times = es.array(np.append(onsets, audio_duration))
    segments = es_mode.Slicer(endTimes=end_times,
                              startTimes=start_times,
                              timeUnits="seconds")(audio)
    return segments, onsets
def analysisSynthesisStandard(params, signal):
    """
    Run an STFT analysis / synthesis round trip over ``signal``.

    Pipeline per frame: Hann windowing > FFT > IFFT > overlap-add.

    :param params: Dict providing at least ``'frameSize'`` and ``'hopSize'``
        (it is also handed to ``cutFrames``).
    :param signal: Input samples.
    :return: 1-D float64 array with the overlap-add output, minus its first
        ``2 * hopSize`` samples.
    """
    frame_size = params['frameSize']
    hop_size = params['hopSize']

    w = std.Windowing(type="hann")
    fft = std.FFT(size=frame_size)
    ifft = std.IFFT(size=frame_size)
    overl = std.OverlapAdd(frameSize=frame_size,
                           hopSize=hop_size,
                           gain=1. / frame_size)

    # Add half a window of zeros to the input so the output reaches the same
    # length. Integer division: zeros() rejects a float length on Python 3.
    signal = numpy.append(signal, zeros(int(frame_size // 2)))
    frames = cutFrames(params, signal)

    # Collect the output frames and join them once rather than re-copying the
    # growing output on every frame.
    chunks = [overl(ifft(fft(w(f)))) for f in frames]
    if chunks:
        # float64 matches what appending to an empty list produced before.
        outsignal = numpy.concatenate(chunks).astype(numpy.float64)
    else:
        outsignal = numpy.array([])

    return outsignal[2 * hop_size:]
def __detect_onsets(self, file, frame_size, hop_size, windowfnc,
                    normalize) -> None:
    """
    Detect the onsets of ``file.audio`` and store them in ``self.onsets``.

    :param file: Object exposing the samples as ``file.audio``.
    :param frame_size: Frame size, also used as window and FFT size.
    :param hop_size: Hop size between frames.
    :param windowfnc: Object whose ``value`` is the window type name.
    :param normalize: Passed to the windowing algorithm as ``normalized``.
    """
    feature_key = "features." + self.algo

    windowing = estd.Windowing(size=frame_size,
                               type=windowfnc.value,
                               normalized=normalize)
    transform = estd.FFT(size=frame_size)
    to_polar = estd.CartesianToPolar()
    detector = estd.OnsetDetection(method=self.algo)
    features = es.Pool()

    frames = estd.FrameGenerator(file.audio,
                                 frameSize=frame_size,
                                 hopSize=hop_size)
    for frame in frames:
        magnitude, phase = to_polar(transform(windowing(frame)))
        features.add(feature_key, detector(magnitude, phase))

    # The onsets algorithm expects a matrix of features (one row per
    # detection function) together with one weight per row.
    detection_matrix = es.array([features[feature_key]])
    self.onsets = estd.Onsets()(detection_matrix, [1])
def detect_onset(audio, index):
    """
    Detect onsets in ``audio`` with the 'complex' detection function.

    Start and end of the processing are reported on stdout together with the
    elapsed time.

    :param audio: Mono audio samples.
    :param index: Identifier of the calling subprocess; used only in the
        printed messages.
    :return: The output of the Essentia ``Onsets`` algorithm.
    """
    # should be able to fetch the module from cache
    import essentia.standard as ess_std
    from essentia import array

    print("Subprocess {} starts".format(index))
    started_at = time()

    detector = ess_std.OnsetDetection(method="complex")
    windowing = ess_std.Windowing(type="hann")
    transform = ess_std.FFT()
    to_polar = ess_std.CartesianToPolar()
    onset_picker = ess_std.Onsets()

    detection_values = []
    for frame in ess_std.FrameGenerator(audio, frameSize=1024, hopSize=512):
        magnitude, phase = to_polar(transform(windowing(frame)))
        detection_values.append(detector(magnitude, phase))

    # One detection function (a single matrix row) with weight 1.
    onsets_array = onset_picker(array([detection_values]), [1])

    elapsed = time() - started_at
    print("Subprocess {} finished. Elapsed time: {:.2}s".format(
        index, elapsed))
    return onsets_array
def analysis_synthesis_spr_model_standard(self, params, signal):
    """
    Split ``signal`` into a sinusoidal part and a residual part.

    Analysis: every frame is Hann-windowed and transformed, its FFT is kept,
    and its sinusoids are extracted. Tracks shorter than
    ``params['minSineDur']`` seconds are then removed with
    ``self.cleaningSineTracks``. Synthesis: the sinusoids of every frame are
    re-synthesised in the frequency domain; the residual is the original FFT
    minus the synthesised one. Both are brought back to the time domain by
    IFFT and overlap-add.

    :param params: Dict with the keys ``'fftSize'``, ``'frameSize'``,
        ``'hopSize'``, ``'sampleRate'``, ``'maxnSines'``,
        ``'magnitudeThreshold'``, ``'freqDevOffset'``, ``'freqDevSlope'`` and
        ``'minSineDur'``.
    :param signal: Mono audio samples.
    :return: Tuple ``(signal, sines, residual)`` of Essentia arrays. The two
        synthesised signals are trimmed to the last ``len(signal)`` samples.
    """
    pool = essentia.Pool()

    # Algorithms for the sine model analysis
    w = es.Windowing(type="hann")
    fft = es.FFT(size=params['fftSize'])
    smanal = es.SineModelAnal(
        sampleRate=params['sampleRate'],
        maxnSines=params['maxnSines'],
        magnitudeThreshold=params['magnitudeThreshold'],
        freqDevOffset=params['freqDevOffset'],
        freqDevSlope=params['freqDevSlope'])

    # Algorithms for the sine model synthesis
    # NOTE(review): the synthesis uses params['frameSize'] as FFT size while
    # the analysis FFT uses params['fftSize']; the residual subtraction below
    # presumably requires both to be equal -- confirm.
    smsyn = es.SineModelSynth(sampleRate=params['sampleRate'],
                              fftSize=params['frameSize'],
                              hopSize=params['hopSize'])
    ifft = es.IFFT(size=params['frameSize'])
    overlSine = es.OverlapAdd(frameSize=params['frameSize'],
                              hopSize=params['hopSize'],
                              gain=1. / params['frameSize'])
    overlres = es.OverlapAdd(frameSize=params['frameSize'],
                             hopSize=params['hopSize'],
                             gain=1. / params['frameSize'])

    # analysis
    fft_original = []
    for frame in es.FrameGenerator(signal,
                                   frameSize=params["frameSize"],
                                   hopSize=params["hopSize"]):
        frame_fft = fft(w(frame))
        fft_original.append(frame_fft)
        freqs, mags, phases = smanal(frame_fft)
        pool.add("frequencies", freqs)
        pool.add("magnitudes", mags)
        pool.add("phases", phases)

    # remove short tracks
    minFrames = int(params['minSineDur'] * params['sampleRate'] /
                    params['hopSize'])
    pool = self.cleaningSineTracks(pool, minFrames)

    # synthesis
    # The pool is read once, and the output frames are collected and joined
    # once, instead of looking the descriptors up and re-copying the growing
    # outputs on every frame.
    frequencies = pool["frequencies"]
    magnitudes = pool["magnitudes"]
    phases = pool["phases"]

    sine_chunks = []
    res_chunks = []
    for frame_ix, _ in enumerate(frequencies):
        sine_frame_fft = smsyn(magnitudes[frame_ix], frequencies[frame_ix],
                               phases[frame_ix])
        res_frame_fft = fft_original[frame_ix] - sine_frame_fft
        sine_chunks.append(overlSine(ifft(sine_frame_fft)))
        res_chunks.append(overlres(ifft(res_frame_fft)))

    sineTracksAudio = (np.concatenate(sine_chunks)
                       if sine_chunks else np.array([]))
    resTracksAudio = (np.concatenate(res_chunks)
                      if res_chunks else np.array([]))

    sineTracksAudio = sineTracksAudio.flatten()[-len(signal):]
    resTracksAudio = resTracksAudio.flatten()[-len(signal):]

    return essentia.array(signal), essentia.array(
        sineTracksAudio), essentia.array(resTracksAudio)
def __init__(self, params, fsm=None):
    """
    Set up the thresholds, Essentia algorithms and the state machine.

    :param params: Dict with the keys ``'onset_threshold'``,
        ``'offset_threshold'``, ``'max_attack_time'``, ``'max_release_time'``,
        ``'attack_slope_ratio'``, ``'release_slope_ratio'``,
        ``'flux_threshold'``, ``'mel_threshold'``, ``'rms_threshold'``,
        ``'conf_threshold'``, ``'ratio_mel'``, ``'ratio_rms'``, ``'fs'`` and
        ``'hop_size'``.
    :param fsm: Optional external state machine to send events to.
    """
    # Detection thresholds and ratios, copied from the configuration.
    self.onset_threshold = params['onset_threshold']
    self.offset_threshold = params['offset_threshold']
    self.max_attack_time = params['max_attack_time']
    self.max_release_time = params['max_release_time']
    self.attack_slope_ratio = params['attack_slope_ratio']
    self.release_slope_ratio = params['release_slope_ratio']
    self.flux_threshold = params['flux_threshold']
    self.mel_threshold = params['mel_threshold']
    self.rms_threshold = params['rms_threshold']
    self.conf_threshold = params['conf_threshold']
    self.ratio_mel = params['ratio_mel']
    self.ratio_rms = params['ratio_rms']
    self.rms_threshold_value = 0
    # NOTE(review): "vale" looks like a typo of "value"; the attribute name
    # is kept because other code may read it under this spelling.
    self.mel_threshold_vale = 0

    # Time limits converted from seconds to analysis frames.
    self.fs = params['fs']
    self.hop_size = params['hop_size']
    self.max_attack_frames = seconds2frames(self.max_attack_time,
                                            fs=self.fs,
                                            hop_size=self.hop_size)
    self.max_release_frames = seconds2frames(self.max_release_time,
                                             fs=self.fs,
                                             hop_size=self.hop_size)

    self.ext_fsm = fsm  # external state machine to send events to
    self.buffer = []
    self.was_onset = False
    self.was_offset = False
    self.onset_counter = None
    self.offset_counter = None
    self.onset_samples = 2  # number of consecutive samples to be above threshold
    self.offset_samples = 3  # number of consecutive samples to be below threshold
    self.peak_detect = GrowingSlopeEnd(max_frames=self.max_attack_frames,
                                       m=self.attack_slope_ratio)

    # essentia algorithms initialization
    self.o_mel = estd.OnsetDetection(method='melflux')
    self.o_rms = estd.OnsetDetection(method='rms')
    self.o_hfc = estd.OnsetDetection(method='hfc')
    self.o_flux = estd.OnsetDetection(method='flux')
    self.o_complex = estd.OnsetDetection(method='complex')
    self.fft = estd.FFT()
    self.c2p = estd.CartesianToPolar()
    self.w = estd.Windowing(type='hann')

    # STATE MACHINE
    # detecting -> attack -> sustain -> detecting, plus a 'reset' event that
    # leads back to 'detecting' from any state.
    transitions = [
        {'name': 'onset', 'src': 'detecting', 'dst': 'attack'},
        {'name': 'peak', 'src': 'attack', 'dst': 'sustain'},
        {'name': 'offset', 'src': 'sustain', 'dst': 'detecting'},
        {
            'name': 'reset',
            'src': ['detecting', 'attack', 'sustain'],
            'dst': 'detecting'
        },
    ]
    callbacks = {
        'ondetecting': self.on_detecting,
        'onattack': self.on_attack,
        'onsustain': self.on_sustain,
        'onbeforeonset': self.on_onset,
        'onbeforepeak': self.on_peak,
        'onbeforeoffset': self.on_offset
    }
    # Built last, after every attribute above is set.
    self.fsm = Fysom({
        'initial': 'detecting',
        'events': transitions,
        'callbacks': callbacks
    })
def detectBW(fpath: str, frame_size: float, hop_size: float, floor_db: float,
             oversample_f: int):
    """
    Estimate the cut-off frequency of a wav file and plot the evidence.

    For every Hann-windowed frame the magnitude spectrum is converted to dB,
    its envelope is computed (``compute_spectral_envelope``), floored
    (``modify_floor``) and a cut-off index is derived from it
    (``compute_fc``). The per-frame indices are summarised with
    ``compute_histogram`` and ``compute_mean_fc``. The result is printed and
    three plots are shown.

    :param fpath: Path of the audio file; must have a ``.wav`` extension.
    :param frame_size: Frame size in samples, before oversampling.
    :param hop_size: Hop size in samples.
    :param floor_db: Floor handed to ``modify_floor``.
    :param oversample_f: Factor the frame size is multiplied by; must be
        accepted by ``is_power2``.
    :raises ValueError: If the file is not a wav file or the oversample
        factor is rejected.
    """
    # check if the file has a wav extension, else: raise error
    if os.path.splitext(fpath)[1] != ".wav":
        raise ValueError("file must be wav")
    # check if the oversample factor is a power of two
    if not is_power2(oversample_f):
        raise ValueError("oversample factor can only be 1, 2 or 4")

    # audio loader returns x, sample_rate, number_channels, md5, bit_rate,
    # codec, of which only the first two are used here
    audio, SR = estd.AudioLoader(filename=fpath)()[:2]
    if audio.shape[1] != 1:
        audio = (audio[:, 0] + audio[:, 1]) / 2  # if stereo: downmix to mono

    frame_size *= oversample_f  # if an oversample factor is desired, apply it
    n_bins = int(frame_size / 2) + 1
    f = np.arange(n_bins) / frame_size * SR  # frequency vector / x ticks
    fc_index_arr = []
    interpolated_spectrum = np.zeros(n_bins)
    fft = estd.FFT(size=frame_size)
    window = estd.Windowing(size=frame_size, type="hann")

    n_frames = 0
    for frame in estd.FrameGenerator(audio,
                                     frameSize=frame_size,
                                     hopSize=hop_size,
                                     startFromZero=True):
        n_frames += 1
        frame = window(frame)  # apply window to the frame
        frame_fft = abs(fft(frame))
        frame_fft_db = 20 * np.log10(frame_fft + eps)  # frame spectrum in dB
        # linear interpolation between the values of the maxima of the
        # spectrum
        interp_frame = compute_spectral_envelope(frame_fft_db, f, "linear")
        interp_frame = modify_floor(interp_frame, floor_db, log=True)
        fc_index = compute_fc(interp_frame)
        # NOTE(review): the index is appended unconditionally and then a
        # second time when the energy check passes, so verified frames count
        # twice. Behaviour kept as is -- confirm whether this weighting is
        # intended or the first append is a leftover.
        fc_index_arr.append(fc_index)
        if energy_verification(frame_fft, fc_index):
            fc_index_arr.append(fc_index)
        interpolated_spectrum += interp_frame  # accumulate for the average

    # Average over the frames. Guarded: with no frames the previous code
    # failed on an undefined loop index before reaching the fallback below.
    if n_frames:
        interpolated_spectrum /= n_frames

    if len(fc_index_arr) == 0:
        fc_index_arr = [frame_size]

    hist = compute_histogram(fc_index_arr, f)
    fc, conf, binary = compute_mean_fc(hist, fc_index_arr, f, SR)
    print("filename: ", fpath, "mean_fc: ", fc, " conf: ", conf,
          " binary_result: ", binary)

    fig, ax = plt.subplots(3, 1, figsize=(15, 9))
    ax[0].plot(fc_index_arr, "x")
    ax[1].stem(f, hist)
    ax[2].plot(f, interpolated_spectrum)
    ax[2].axvline(x=fc, color="r")
    plt.show()
import matplotlib.pyplot as plt
import os
import numpy as np

# Plot the dB magnitude spectrum of every wav file in DIR and save it as a
# png next to the audio file.
DIR = "../Dataset/BW detection/"

for file in os.listdir(DIR):
    fpath = os.path.join(DIR, file)
    name, extension = os.path.splitext(file)
    print(file)
    if extension != ".wav":
        continue

    x, SR, channels, _, br, _ = estd.AudioLoader(filename=fpath)()
    channels = x.shape[1]
    if channels != 1:
        # More than one channel: average the first two.
        x = (x[:, 0] + x[:, 1]) / 2
    print(x.shape, SR, channels, br)

    # Window the whole file at once ...
    window = estd.Windowing(size=len(x), type="hann")
    x = window(x)

    # ... then zero-pad it up to the next power of two.
    N = int(2**(np.ceil(np.log2(len(x)))))
    padding = np.zeros(N - len(x))
    x = esarr(np.append(x, padding))

    # Magnitude spectrum in dB.
    tfX = estd.FFT()(x)
    tfX = 20 * np.log10(abs(tfX))

    n_bins = int(len(x) / 2) + 1
    f = np.arange(n_bins) / len(x) * SR
    plt.plot(f, tfX[:n_bins])
    plt.savefig(os.path.join(DIR, name + ".png"))
    plt.clf()
def analyze_hp(filename, segment_duration=20):
    """
    Compute harmonic and percussive mel-band features for an audio file.

    A segment of ``segment_duration`` seconds, centred in the file and with
    replay gain applied, is transformed to an STFT (frame size 2048, hop size
    1024). The STFT is separated into harmonic and percussive parts with
    ``librosa.decompose.hpss``, and the log-normalised mel bands of each part
    are stored frame by frame.

    :param filename: Path of the audio file.
    :param segment_duration: Duration in seconds of the analysed segment.
    :return: Pool with the descriptors ``'melbands_harmonic'`` and
        ``'melbands_percussive'``.
    :raises ValueError: If the file is shorter than ``segment_duration``.
    """
    frame_size = 2048
    hop_size = 1024

    # Replay gain and duration come from the entire file; only the segment
    # centred in time is analysed afterwards.
    full_audio = es.MonoLoader(filename=filename)()
    gain = es.ReplayGain()(full_audio)

    segment_start = (len(full_audio) / 44100 - segment_duration) / 2
    segment_end = segment_start + segment_duration
    if segment_start < 0 or segment_end > len(full_audio) / 44100:
        raise ValueError(
            'Segment duration is larger than the input audio duration')

    load_segment = es.EasyLoader(filename=filename,
                                 replayGain=gain,
                                 startTime=segment_start,
                                 endTime=segment_end)
    windowing = es.Windowing(type='blackmanharris62')
    transform = es.FFT()

    segment_audio = load_segment()
    frame_spectra = [
        transform(windowing(frame))
        for frame in es.FrameGenerator(segment_audio,
                                       frameSize=frame_size,
                                       hopSize=hop_size)
    ]

    # Librosa requires bins x frames format
    stft_matrix = np.array(frame_spectra).T
    harmonic, percussive = librosa.decompose.hpss(stft_matrix, margin=8)
    percussive_magnitude, _ = librosa.magphase(percussive)
    harmonic_magnitude, _ = librosa.magphase(harmonic)

    # Processing for Mel bands
    melbands = es.MelBands(numberBands=96,
                           lowFrequencyBound=0,
                           highFrequencyBound=11025)
    # Normalize Mel bands: log10(1+x*10000)
    norm = es.UnaryOperator(type='identity', shift=1, scale=10000)
    log10 = es.UnaryOperator(type='log10')

    def log_mel(spectrum_frame):
        return log10(norm(melbands(spectrum_frame)))

    # Transposed back to Essentia format (frames x bins) for the iteration.
    features = essentia.Pool()
    for spectrum_frame in harmonic_magnitude.T:
        features.add('melbands_harmonic', log_mel(spectrum_frame))
    for spectrum_frame in percussive_magnitude.T:
        features.add('melbands_percussive', log_mel(spectrum_frame))
    return features
def detectBW(fpath: str, frame_size: float, hop_size: float, floor_db: float,
             oversample_f: int):
    """
    Estimate the cut-off frequency of a wav file and plot the evidence.

    Frames whose energy is at least 10 % of the loudest frame's energy vote
    for a cut-off bin: the highest bin ``j`` for which
    ``sum(spectrum[j:]) / j`` reaches 1e-5. Votes are accumulated, weighted
    by frame energy, in a 129-bin histogram that is summarised by
    ``compute_mean_fc``. The result is printed and two plots are shown.

    :param fpath: Path of the audio file; must have a ``.wav`` extension.
    :param frame_size: Frame size in samples, before oversampling.
    :param hop_size: Hop size in samples.
    :param floor_db: Not used by this implementation.
    :param oversample_f: Factor the frame size is multiplied by; must be
        accepted by ``is_power2``.
    :raises ValueError: If the file is not a wav file or the oversample
        factor is rejected.
    """
    # check if the file has a wav extension, else: raise error
    if os.path.splitext(fpath)[1] != ".wav":
        raise ValueError("file must be wav")
    # check if the oversample factor is a power of two
    if not is_power2(oversample_f):
        raise ValueError("oversample factor can only be 1, 2 or 4")

    # audio loader returns x, sample_rate, number_channels, md5, bit_rate,
    # codec, of which only the first two are used here
    audio, SR = estd.AudioLoader(filename=fpath)()[:2]
    # if stereo: downmix to mono
    if audio.shape[1] != 1:
        audio = (audio[:, 0] + audio[:, 1]) / 2

    frame_size *= oversample_f  # if an oversample factor is desired, apply it
    fc_index_arr = []
    hist = np.zeros(129)
    fft = estd.FFT(size=frame_size)
    window = estd.Windowing(size=frame_size, type="hann")
    avg_frames = np.zeros(int(frame_size / 2) + 1)

    def frames():
        return estd.FrameGenerator(audio,
                                   frameSize=frame_size,
                                   hopSize=hop_size,
                                   startFromZero=True)

    # First pass: energy of the loudest frame. default=0.0 keeps audio
    # without frames from raising inside max().
    max_nrg = max(
        (float(np.sum(abs(fft(window(frame)))**2)) for frame in frames()),
        default=0.0)

    n_frames = 0
    for frame in frames():
        n_frames += 1
        frame_fft = abs(fft(window(frame)))
        nrg = float(np.sum(frame_fft**2))
        if nrg >= 0.1 * max_nrg:
            # tail[j] == sum(frame_fft[j:]); computed for all bins at once
            # rather than re-summing the tail for every candidate bin.
            tail = np.cumsum(frame_fft[::-1], dtype=np.float64)[::-1]
            # Bin 0 is excluded: the criterion divides by the bin index.
            bins = np.arange(1, len(frame_fft))
            hits = np.nonzero(tail[1:] / bins >= 1e-5)[0]
            if hits.size:
                cutoff_bin = int(hits[-1]) + 1  # highest bin that qualifies
                hist_bin = int(cutoff_bin / frame_size * 128)
                fc_index_arr.append(hist_bin)
                hist[hist_bin] += nrg
        # NOTE(review): the nesting is ambiguous in the collapsed source;
        # every frame is accumulated here (not only the energetic ones),
        # matching the division by the total frame count below -- confirm.
        avg_frames += frame_fft

    if len(fc_index_arr) == 0:
        fc_index_arr.append(128)
        hist[128] += 1

    if n_frames:
        avg_frames /= n_frames

    most_likely_bin, conf, binary = compute_mean_fc(avg_frames,
                                                    fc_index_arr, [],
                                                    SR,
                                                    hist=hist)
    most_likely_bin *= int(frame_size / 128)
    print("f={:0=2f}, conf={:0=2f}, problem={}".format(
        most_likely_bin * SR / frame_size, conf, str(binary)))

    fig, ax = plt.subplots(2, 1, figsize=(15, 9))
    ax[0].plot(20 * np.log10(avg_frames + eps))
    ax[0].axvline(x=most_likely_bin, color='r')
    ax[0].set_ylim(bottom=-120)
    ax[1].stem(hist)
    plt.show()
# Script fragment: STFT analysis / synthesis round trip in 'standard' mode.
# NOTE(review): `mode`, `inputFilename`, `outputFilename`, `framesize` and
# `hopsize` are defined outside this fragment, and neither `counter` nor
# `awrite` is used within it -- the fragment presumably continues beyond the
# visible part of the file.
counter = 0
import matplotlib.pylab as plt
if mode == 'standard':
    # create an audio loader and import audio file
    loader = std.MonoLoader(filename = inputFilename, sampleRate = 44100)
    audio = loader()
    print("Duration of the audio sample [sec]:")
    print(len(audio)/44100.0)
    # Algorithms of the chain: windowing > FFT > IFFT > overlap-add, plus a
    # writer for the output file.
    w = std.Windowing(type = "hann");
    fft = std.FFT(size = framesize);
    ifft = std.IFFT(size = framesize);
    overl = std.OverlapAdd (frameSize = framesize, hopSize = hopsize);
    awrite = std.MonoWriter (filename = outputFilename, sampleRate = 44100);
    for frame in std.FrameGenerator(audio, frameSize = framesize, hopSize = hopsize):
        # STFT analysis
        infft = fft(w(frame))
        # here we could apply spectral transformations
        outfft = infft
        # STFT synthesis
        ifftframe = ifft(outfft)
        # `out` holds the overlap-add output for the current frame only; it
        # is overwritten on the next iteration.
        out = overl(ifftframe)