def get_beat_chunks(filename, bpm_restrict=None):
    audio = std.MonoLoader(filename=filename)()
    hpcp = std.HPCP()
    spectrum = std.Spectrum()
    speaks = std.SpectralPeaks()
    large_speaks = std.SpectralPeaks(maxPeaks=2000)
    tivs = []
    sr = 44100
    bpm = get_tempo(filename)
    tivs_framewise = []
    if bpm_restrict is not None and bpm_restrict != bpm:
        raise ValueError("Track tempo does not match bpm_restrict")
    sec_beat = 60 / bpm
    beats = np.arange(0, len(audio) / sr, sec_beat)
    beats = np.append(beats, len(audio) / sr)
    for i in range(1, len(beats)):
        segmented_audio = audio[int(beats[i - 1] * sr):int(beats[i] * sr)]
        cutter = std.FrameGenerator(segmented_audio)
        for sec in cutter:
            spec = spectrum(sec)
            freq, mag = speaks(spec)
            chroma = hpcp(freq, mag)
            tivs_framewise.append(chroma)
        np2_seg_audio = zeropad_next_power_2(segmented_audio)
        spec = spectrum(np2_seg_audio)
        freq, mag = speaks(spec)
        chroma = hpcp(freq, mag)
        tivs.append(chroma)
    # Calculate the whole TIV
    np2_whole = zeropad_next_power_2(audio)
    spec = spectrum(np2_whole)
    freq, mag = large_speaks(spec)
    chroma_whole = hpcp(freq, mag)
    return (mt.TIVCollection.from_pcp(np.array(tivs).T),
            mt.TIV.from_pcp(chroma_whole),
            mt.TIVCollection.from_pcp(np.array(tivs_framewise).T))
def calc_chromagram(self):
    # save the results in the stft_pool
    self.chromagram = []
    hpcp = es.HPCP(
        size=12,  # increase for higher resolution, e.g. for key estimation
        referenceFrequency=440,  # assume tuning frequency is 440 Hz
        bandPreset=False,
        weightType='cosine',
        nonLinear=False,
        windowSize=1.,
        sampleRate=self.sample_rate)
    spectrum = es.Spectrum(size=self.fft_size)
    spectral_peaks = es.SpectralPeaks(sampleRate=self.sample_rate)
    for frame in es.FrameGenerator(self.audio,
                                   frameSize=self.frame_size,
                                   hopSize=self.hop_size,
                                   startFromZero=True):
        frame = array(frame * self.window)
        freqs, mags = spectral_peaks(spectrum(frame))
        chroma = hpcp(freqs, mags)
        self.chromagram.append(chroma)
    self.chromagram = array(self.chromagram)
    self.timeAxSec = np.arange(len(self.chromagram)) * self.hop_size / float(self.sample_rate)
def getHPCPEssentia(XAudio, Fs, winSize, hopSize, squareRoot=False,
                    NChromaBins=36, NHarmonics=0):
    """
    Wrap around the essentia library to compute HPCP features
    :param XAudio: A flat array of raw audio samples
    :param Fs: Sample rate
    :param winSize: Window size of each STFT window
    :param hopSize: Hop size between STFT windows
    :param squareRoot: Do square root compression?
    :param NChromaBins: How many chroma bins (default 36)
    :param NHarmonics: How many harmonics to use in the HPCP weighting (default 0)
    :returns H: An (NChromaBins x NWindows) matrix of all \
        chroma windows
    """
    import essentia
    from essentia import Pool, array
    import essentia.standard as ess
    spectrum = ess.Spectrum()
    window = ess.Windowing(size=winSize, type='hann')
    spectralPeaks = ess.SpectralPeaks()
    hpcp = ess.HPCP(size=NChromaBins, harmonics=NHarmonics)
    H = []
    for frame in ess.FrameGenerator(array(XAudio), frameSize=winSize,
                                    hopSize=hopSize, startFromZero=True):
        S = spectrum(window(frame))
        freqs, mags = spectralPeaks(S)
        H.append(hpcp(freqs, mags))
    H = np.array(H)
    H = H.T
    if squareRoot:
        H = sqrtCompress(H)
    return H
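# Usage sketch (not from the original source; "audio.wav" is a placeholder
# path, and sqrtCompress is defined elsewhere in this module, so squareRoot
# stays off here):
import essentia.standard as ess

Fs = 44100
XAudio = ess.MonoLoader(filename="audio.wav", sampleRate=Fs)()
H = getHPCPEssentia(XAudio, Fs, winSize=4096, hopSize=2048, NChromaBins=36)
print(H.shape)  # (36, NWindows)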
def file_to_hpcp(loop):
    loop = e.array(loop)
    windowing = es.Windowing(type='blackmanharris62')
    spectrum = es.Spectrum()
    spectral_peaks = es.SpectralPeaks(orderBy='magnitude',
                                      magnitudeThreshold=0.001,
                                      maxPeaks=20,
                                      minFrequency=20,
                                      maxFrequency=8000)
    hpcp = es.HPCP(maxFrequency=8000)
    spec_group = []
    hpcp_group = []
    for frame in es.FrameGenerator(loop, frameSize=1024, hopSize=512):
        windowed = windowing(frame)
        fft = spectrum(windowed)
        frequencies, magnitudes = spectral_peaks(fft)
        final_hpcp = hpcp(frequencies, magnitudes)
        spec_group.append(fft)
        hpcp_group.append(final_hpcp)
    mean_hpcp = np.mean(np.array(hpcp_group).T, axis=1)
    # normalize to 1
    mean_hpcp = mean_hpcp / mean_hpcp.max()
    return mean_hpcp
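# Usage sketch (assumed inputs, not from the original source): compare two
# loops by the cosine similarity of their mean HPCP vectors; `loop_a` and
# `loop_b` stand for 1-D sample arrays, e.g. loaded with es.MonoLoader.
import numpy as np

hpcp_a = file_to_hpcp(loop_a)
hpcp_b = file_to_hpcp(loop_b)
cos_sim = np.dot(hpcp_a, hpcp_b) / (np.linalg.norm(hpcp_a) * np.linalg.norm(hpcp_b))
print(f"tonal similarity: {cos_sim:.3f}")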
def _extract_pitch_contours(self, audio):
    # Hann window with x4 zero padding
    run_windowing = estd.Windowing(  # pylint: disable-msg=E1101
        zeroPadding=3 * self.frame_size)
    run_spectrum = estd.Spectrum(  # pylint: disable-msg=E1101
        size=self.frame_size * 4)
    run_spectral_peaks = estd.SpectralPeaks(  # pylint: disable-msg=E1101
        minFrequency=self.min_frequency,
        maxFrequency=self.max_frequency,
        magnitudeThreshold=self.magnitude_threshold,
        sampleRate=self.sample_rate,
        orderBy='magnitude')

    # convert unit to cents, PitchSalienceFunction takes 55 Hz as the
    # default reference
    run_pitch_salience_function = \
        estd.PitchSalienceFunction(  # pylint: disable-msg=E1101
            binResolution=self.bin_resolution)
    run_pitch_salience_function_peaks = \
        estd.PitchSalienceFunctionPeaks(  # pylint: disable-msg=E1101
            binResolution=self.bin_resolution,
            minFrequency=self.min_frequency,
            maxFrequency=self.max_frequency)
    run_pitch_contours = estd.PitchContours(  # pylint: disable-msg=E1101
        hopSize=self.hop_size,
        binResolution=self.bin_resolution,
        peakDistributionThreshold=self.peak_distribution_threshold)

    # compute frame by frame
    pool = Pool()
    for frame in estd.FrameGenerator(audio,  # pylint: disable-msg=E1101
                                     frameSize=self.frame_size,
                                     hopSize=self.hop_size):
        frame = run_windowing(frame)
        spectrum = run_spectrum(frame)
        peak_frequencies, peak_magnitudes = run_spectral_peaks(spectrum)
        salience = run_pitch_salience_function(peak_frequencies, peak_magnitudes)
        salience_peaks_bins, salience_peaks_contour_saliences = \
            run_pitch_salience_function_peaks(salience)
        if not np.size(salience_peaks_bins):
            salience_peaks_bins = np.array([0])
        if not np.size(salience_peaks_contour_saliences):
            salience_peaks_contour_saliences = np.array([0])
        pool.add('allframes_salience_peaks_bins', salience_peaks_bins)
        pool.add('allframes_salience_peaks_contourSaliences',
                 salience_peaks_contour_saliences)

    # post-processing: contour tracking
    contours_bins, contour_saliences, contours_start_times, duration = \
        run_pitch_contours(
            [f.tolist() for f in pool['allframes_salience_peaks_bins']],
            [f.tolist() for f in pool['allframes_salience_peaks_contourSaliences']])
    return contours_bins, contours_start_times, contour_saliences, duration
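# Hedged follow-up sketch (not from the original source): the contour data
# returned above is typically passed to essentia's PitchContoursMelody to
# select a single predominant-melody trajectory. `extractor` stands for an
# instance of the surrounding class; the parameter values must match the ones
# used inside _extract_pitch_contours.
import essentia.standard as estd

contours_bins, contours_start_times, contour_saliences, duration = \
    extractor._extract_pitch_contours(audio)
run_contours_melody = estd.PitchContoursMelody(
    hopSize=extractor.hop_size, binResolution=extractor.bin_resolution)
pitch, pitch_salience = run_contours_melody(
    contours_bins, contour_saliences, contours_start_times, duration)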
def FeatureExtraction_Recording(recording, params):
    numBins = params.numbins
    fs = params.fs

    # LOAD audio file
    Audio = ess.MonoLoader(filename=recording.path, sampleRate=fs)()
    Audio = ess.DCRemoval()(Audio)  # PREPROCESSING / DC removal
    Audio = ess.EqualLoudness()(Audio)  # PREPROCESSING - Equal Loudness Filter

    # Windowing parameters (first converting from msec to number of samples),
    # assuring windowSize and hopSize are even
    windowSize = round(fs * params.windowSize / 1000)
    windowSize = int(windowSize / 2) * 2
    hopSize = round(fs * params.hopSize / 1000)
    hopSize = int(hopSize / 2) * 2

    tonic = float(recording.tonic)

    # FRAME-BASED spectral analysis
    hpcp = []
    for frame in ess.FrameGenerator(Audio, frameSize=windowSize,
                                    hopSize=hopSize, startFromZero=True):
        frame = ess.Windowing(size=windowSize, type=params.windowFunction)(frame)
        mX = ess.Spectrum(size=windowSize)(frame)
        mX[mX < np.finfo(float).eps] = np.finfo(float).eps
        # EXTRACT frequency and magnitude information of the harmonic spectral peaks
        freq, mag = ess.SpectralPeaks()(mX)
        # harmonic pitch-class profiles
        hpcp.append(ess.HPCP(normalized='unitSum',
                             referenceFrequency=tonic,
                             size=numBins,
                             windowSize=12 / numBins)(freq, mag))
    recording.chroma_framebased = np.array(hpcp)

    # FEATURE SUMMARIZATION
    mean_chroma = []  # global mean of HPCP vectors
    std_chroma = []   # global standard deviation of HPCP vectors
    for j in range(numBins):
        tmp = []
        for i in range(len(recording.chroma_framebased)):
            tmp.append(recording.chroma_framebased[i][j])
        mean_chroma.append(np.mean(tmp))
        std_chroma.append(np.std(tmp))
    recording.chroma_mean = mean_chroma
    recording.chroma_std = std_chroma
def compute_features(self, audio):
    """Computes the specified Essentia features from the audio array."""
    features = []
    for frame in ES.FrameGenerator(audio, frameSize=self.frame_size,
                                   hopSize=self.hop_size):
        if self.feature.name() == "MFCC":
            bands, coeffs = self.feature(self.spectrum(self.w(frame)))
        elif self.feature.name() == "HPCP":
            spectral_peaks = ES.SpectralPeaks()
            freqs, mags = spectral_peaks(self.spectrum(self.w(frame)))
            coeffs = self.feature(freqs, mags)
        features.append(coeffs)

    # Convert to Essentia Numpy array
    features = essentia.array(features)

    if self.beats != []:
        framerate = self.sample_rate / float(self.hop_size)
        tframes = np.arange(features.shape[0]) / float(framerate)
        features = utils.resample_mx(features.T, tframes, self.beats).T

    return features
def hpcp(self,
         frameSize=4096,
         windowType='blackmanharris62',
         harmonicsPerPeak=8,
         magnitudeThreshold=0,
         maxPeaks=100,
         whitening=True,
         referenceFrequency=440,
         minFrequency=100,
         maxFrequency=3500,
         nonLinear=False,
         numBins=12,
         display=False):
    """
    Compute Harmonic Pitch Class Profiles (HPCP) for the input audio files
    using essentia standard mode with the default parameters as mentioned
    in [1]. Please refer to the following paper for a detailed explanation
    of the algorithm.

    [1] Gómez, E. (2006). Tonal Description of Polyphonic Audio for Music
    Content Processing.

    For the full list of parameters of essentia standard mode HPCP please
    refer to http://essentia.upf.edu/documentation/reference/std_HPCP.html

    Returns
        hpcp: ndarray(n_frames, numBins)
            The HPCP coefficients at each time frame
    """
    audio = array(self.audio_vector)
    frameGenerator = estd.FrameGenerator(audio, frameSize=frameSize,
                                         hopSize=self.hop_length)
    # framecutter = estd.FrameCutter(frameSize=frameSize, hopSize=self.hop_length)
    windowing = estd.Windowing(type=windowType)
    spectrum = estd.Spectrum()
    # Refer http://essentia.upf.edu/documentation/reference/streaming_SpectralPeaks.html
    spectralPeaks = estd.SpectralPeaks(magnitudeThreshold=magnitudeThreshold,
                                       maxFrequency=maxFrequency,
                                       minFrequency=minFrequency,
                                       maxPeaks=maxPeaks,
                                       orderBy="frequency",
                                       sampleRate=self.fs)
    # http://essentia.upf.edu/documentation/reference/streaming_SpectralWhitening.html
    spectralWhitening = estd.SpectralWhitening(maxFrequency=maxFrequency,
                                               sampleRate=self.fs)
    # http://essentia.upf.edu/documentation/reference/streaming_HPCP.html
    hpcp = estd.HPCP(sampleRate=self.fs,
                     maxFrequency=maxFrequency,
                     minFrequency=minFrequency,
                     referenceFrequency=referenceFrequency,
                     nonLinear=nonLinear,
                     harmonics=harmonicsPerPeak,
                     size=numBins)
    pool = Pool()

    # compute hpcp for each frame and add the results to the pool
    for frame in frameGenerator:
        spectrum_mag = spectrum(windowing(frame))
        frequencies, magnitudes = spectralPeaks(spectrum_mag)
        if whitening:
            w_magnitudes = spectralWhitening(spectrum_mag, frequencies, magnitudes)
            hpcp_vector = hpcp(frequencies, w_magnitudes)
        else:
            hpcp_vector = hpcp(frequencies, magnitudes)
        pool.add('tonal.hpcp', hpcp_vector)

    if display:
        display_chroma(pool['tonal.hpcp'].T, self.hop_length)

    return pool['tonal.hpcp']
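# Hypothetical usage (not from the original source): `AudioFeatures` stands
# for the surrounding class, whose constructor is assumed to set
# self.audio_vector, self.fs and self.hop_length (the attributes this method
# reads); the class name and constructor signature are illustrative only.
extractor = AudioFeatures("track.wav", fs=44100, hop_length=2048)
chroma = extractor.hpcp(frameSize=4096, numBins=12, whitening=True)
print(chroma.shape)  # (n_frames, 12)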
bandpreset = False
normalize = False

"""
DSP: FrameGenerator -> Windowing -> Spectrum -> Spectral Peaks -> HPCP
"""
# window type = {hamming, hann, triangular, square, blackmanharris62,
#                blackmanharris70, blackmanharris74, blackmanharris92}
# perhaps HERE convert linear units to dB's??
loader = esst.MonoLoader(filename=filename, sampleRate=samplerate)
window = esst.Windowing(type='blackmanharris92', size=framesize)
rfft = esst.Spectrum(size=framesize)
peaks = esst.SpectralPeaks(minFrequency=minfreq,
                           maxFrequency=maxfreq,
                           maxPeaks=maxpeaks,
                           magnitudeThreshold=magthres,
                           sampleRate=samplerate,
                           orderBy=orderby)
hpcp = esst.HPCP(bandPreset=bandpreset,
                 harmonics=partials,
                 normalized=normalize,
                 minFrequency=minfreq,
                 maxFrequency=maxfreq,
                 sampleRate=samplerate,
                 weightType=weight)
audio = loader()
peakF = []
peakA = []
chroma = []
def hpcpgram(audio,
             sampleRate=44100,
             frameSize=4096,
             hopSize=2048,
             numBins=12,
             windowType='blackmanharris62',
             minFrequency=100,
             maxFrequency=4000,
             whitening=False,
             maxPeaks=100,
             magnitudeThreshold=1e-05,
             **kwargs):
    """
    Compute Harmonic Pitch Class Profile (HPCP) grams for overlapped frames
    of a given input audio signal.

    For the additional list of parameters of essentia standard mode HPCP
    please refer to http://essentia.upf.edu/documentation/reference/std_HPCP.html

    References:
    [1] Gómez, E. (2006). Tonal Description of Polyphonic Audio for Music
    Content Processing.

    Inputs:
        audio (1d vector): audio signal

    Parameters:
        sampleRate (real ∈ (0, ∞), default = 44100):
            the sampling rate of the audio signal [Hz]
        frameSize (integer ∈ [1, ∞), default = 4096):
            the output frame size
        hopSize (integer ∈ [1, ∞), default = 2048):
            the hop size between frames
        numBins (integer ∈ [12, ∞), default = 12):
            the size of the output HPCP (must be a positive nonzero multiple of 12)
        windowType (string ∈ {hamming, hann, hannnsgcq, triangular, square,
            blackmanharris62, blackmanharris70, blackmanharris74,
            blackmanharris92}, default = blackmanharris62):
            the window type, which can be 'hamming', 'hann', 'triangular',
            'square' or 'blackmanharrisXX'
        maxFrequency (real ∈ (0, ∞), default = 4000):
            the maximum frequency that contributes to the SpectralPeaks and
            HPCP algorithms computation [Hz] (the difference between the max
            and split frequencies must not be less than 200.0 Hz)
        minFrequency (real ∈ (0, ∞), default = 100):
            the minimum frequency that contributes to the SpectralPeaks and
            HPCP algorithms computation [Hz] (the difference between the min
            and split frequencies must not be less than 200.0 Hz)
        maxPeaks (integer ∈ [1, ∞), default = 100):
            the maximum number of returned peaks while calculating SpectralPeaks
        magnitudeThreshold (real ∈ (-∞, ∞), default = 1e-05):
            peaks below this given threshold are not outputted while
            calculating SpectralPeaks
        whitening (boolean (True, False), default = False):
            optional step of applying spectral whitening to the SpectralPeaks
            magnitudes
        kwargs: additional keyword arguments to parameterize the HPCP
            algorithm; see standard mode HPCP
            (http://essentia.upf.edu/documentation/reference/std_HPCP.html)

    Returns:
        hpcpgram of overlapped frames of the input audio signal (2D vector)
    """
    frameGenerator = es.FrameGenerator(array(audio), frameSize=frameSize,
                                       hopSize=hopSize)
    window = es.Windowing(type=windowType)
    spectrum = es.Spectrum()
    # Refer http://essentia.upf.edu/documentation/reference/std_SpectralPeaks.html
    spectralPeaks = es.SpectralPeaks(magnitudeThreshold=magnitudeThreshold,
                                     maxFrequency=maxFrequency,
                                     minFrequency=minFrequency,
                                     maxPeaks=maxPeaks,
                                     sampleRate=sampleRate)
    # http://essentia.upf.edu/documentation/reference/std_SpectralWhitening.html
    spectralWhitening = es.SpectralWhitening(maxFrequency=maxFrequency,
                                             sampleRate=sampleRate)
    # http://essentia.upf.edu/documentation/reference/std_HPCP.html
    hpcp = es.HPCP(sampleRate=sampleRate,
                   maxFrequency=maxFrequency,
                   minFrequency=minFrequency,
                   size=numBins,
                   **kwargs)
    pool = Pool()

    # compute hpcp for each frame and add the results to the pool
    for frame in frameGenerator:
        spectrum_mag = spectrum(window(frame))
        frequencies, magnitudes = spectralPeaks(spectrum_mag)
        if whitening:
            w_magnitudes = spectralWhitening(spectrum_mag, frequencies, magnitudes)
            hpcp_vector = hpcp(frequencies, w_magnitudes)
        else:
            hpcp_vector = hpcp(frequencies, magnitudes)
        pool.add('tonal.hpcp', hpcp_vector)

    return pool['tonal.hpcp']
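# Usage sketch ("query.mp3" is a placeholder path):
import essentia.standard as es

audio = es.MonoLoader(filename="query.mp3", sampleRate=44100)()
hpcp_frames = hpcpgram(audio, sampleRate=44100, frameSize=4096, hopSize=2048)
print(hpcp_frames.shape)  # (n_frames, 12)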
def extractFeatures(audio_data):
    """
    Receives a vector of reals representing an audio signal, computes its
    features, aggregates them into an essentia Pool() and returns that Pool.
    """
    from numpy import ndarray
    assert (type(audio_data) is ndarray)
    assert ("float" in str(audio_data.dtype))

    # Initialize Pool()
    output_pool = es.Pool()

    # Compute the signal spectrum
    output_pool.set(pk_spectrum, es_mode.Spectrum()(audio_data))

    # Compute EnergyBandRatio
    energy_band_ratio = es_mode.EnergyBandRatio()(output_pool[pk_spectrum])
    output_pool.set(pk_energy_band_ratio, energy_band_ratio)

    # Compute MaxMagFreq
    max_mag_freq = es_mode.MaxMagFreq()(output_pool[pk_spectrum])
    output_pool.set(pk_max_mag_freq, max_mag_freq)

    # Compute SpectralCentroidTime
    spectral_centroid_time = es_mode.SpectralCentroidTime()(audio_data)
    output_pool.set(pk_spectral_centroid_time, spectral_centroid_time)

    # Compute SpectralComplexity
    spectral_complexity = es_mode.SpectralComplexity()(output_pool[pk_spectrum])
    output_pool.set(pk_spectral_complexity, spectral_complexity)

    # Compute StrongPeak
    strong_peak = es_mode.StrongPeak()(output_pool[pk_spectrum])
    output_pool.set(pk_strong_peak, strong_peak)

    # Compute SpectralPeaks
    sp_freq, sp_mag = es_mode.SpectralPeaks()(output_pool[pk_spectrum])
    # remove the DC component, if present, as required by HarmonicPeaks
    if sp_freq[0] == 0:
        sp_freq = sp_freq[1:]
        sp_mag = sp_mag[1:]
    output_pool.set(pk_spectral_peaks_freq, sp_freq)
    output_pool.set(pk_spectral_peaks_mag, sp_mag)

    ######################################
    #         For Inharmonicity          #
    ######################################
    # Compute PitchYinFFT
    pitch_yin_fft, pitch_prob_yin_fft = es_mode.PitchYinFFT()(
        output_pool[pk_spectrum])
    output_pool.set(pk_pitch, pitch_yin_fft)
    output_pool.set(pk_pitch_prob, pitch_prob_yin_fft)

    # Compute HarmonicPeaks
    hp_freq, hp_mag = es_mode.HarmonicPeaks()(output_pool[pk_spectral_peaks_freq],
                                              output_pool[pk_spectral_peaks_mag],
                                              output_pool[pk_pitch])
    output_pool.set(pk_harmonic_peaks_freq, hp_freq)
    output_pool.set(pk_harmonic_peaks_mag, hp_mag)

    # Compute Inharmonicity
    inharmonicity = es_mode.Inharmonicity()(output_pool[pk_harmonic_peaks_freq],
                                            output_pool[pk_harmonic_peaks_mag])
    output_pool.set(pk_inharmonicity, inharmonicity)
    # End of Inharmonicity ###################################

    # Compute SpectralContrast
    frame_size = 2 * (output_pool[pk_spectrum].size - 1)
    spectral_contrast, spectral_valley = \
        es_mode.SpectralContrast(frameSize=frame_size)(output_pool[pk_spectrum])
    output_pool.set(pk_spectral_contrast, spectral_contrast)
    output_pool.set(pk_spectral_valley, spectral_valley)

    # Compute SpectralWhitening
    spectral_whitening = \
        es_mode.SpectralWhitening()(output_pool[pk_spectrum],
                                    output_pool[pk_spectral_peaks_freq],
                                    output_pool[pk_spectral_peaks_mag])
    output_pool.set(pk_spectral_whitening, spectral_whitening)

    return output_pool
def estimate_key(input_audio_file, output_text_file=None, key_profile=None):
    """
    This function estimates the overall key of an audio track, optionally with
    extra modal information.
    :type input_audio_file: str
    :type output_text_file: str
    """
    if key_profile is not None:
        global USE_THREE_PROFILES
        global WITH_MODAL_DETAILS
        global KEY_PROFILE
        KEY_PROFILE = key_profile
        USE_THREE_PROFILES = False
        WITH_MODAL_DETAILS = False
    loader = estd.MonoLoader(filename=input_audio_file, sampleRate=SAMPLE_RATE)
    cut = estd.FrameCutter(frameSize=WINDOW_SIZE, hopSize=HOP_SIZE)
    window = estd.Windowing(size=WINDOW_SIZE, type=WINDOW_SHAPE)
    rfft = estd.Spectrum(size=WINDOW_SIZE)
    sw = estd.SpectralWhitening(maxFrequency=MAX_HZ, sampleRate=SAMPLE_RATE)
    speaks = estd.SpectralPeaks(magnitudeThreshold=SPECTRAL_PEAKS_THRESHOLD,
                                maxFrequency=MAX_HZ,
                                minFrequency=MIN_HZ,
                                maxPeaks=SPECTRAL_PEAKS_MAX,
                                sampleRate=SAMPLE_RATE)
    hpcp = estd.HPCP(bandPreset=HPCP_BAND_PRESET,
                     # bandSplitFrequency=HPCP_SPLIT_HZ,
                     harmonics=HPCP_HARMONICS,
                     maxFrequency=MAX_HZ,
                     minFrequency=MIN_HZ,
                     nonLinear=HPCP_NON_LINEAR,
                     normalized=HPCP_NORMALIZE,
                     referenceFrequency=HPCP_REFERENCE_HZ,
                     sampleRate=SAMPLE_RATE,
                     size=HPCP_SIZE,
                     weightType=HPCP_WEIGHT_TYPE,
                     windowSize=HPCP_WEIGHT_WINDOW_SEMITONES,
                     maxShifted=HPCP_SHIFT)
    if HIGHPASS_CUTOFF is not None:
        hpf = estd.HighPass(cutoffFrequency=HIGHPASS_CUTOFF,
                            sampleRate=SAMPLE_RATE)
        audio = hpf(hpf(hpf(loader())))
    else:
        audio = loader()
    duration = len(audio)
    n_slices = 1 + (duration // HOP_SIZE)
    chroma = np.empty([n_slices, HPCP_SIZE], dtype='float64')
    for slice_n in range(n_slices):
        spek = rfft(window(cut(audio)))
        p1, p2 = speaks(spek)
        if SPECTRAL_WHITENING:
            p2 = sw(spek, p1, p2)
        pcp = hpcp(p1, p2)
        if not DETUNING_CORRECTION or DETUNING_CORRECTION_SCOPE == 'average':
            chroma[slice_n] = pcp
        elif DETUNING_CORRECTION and DETUNING_CORRECTION_SCOPE == 'frame':
            pcp = shift_pcp(pcp, HPCP_SIZE)
            chroma[slice_n] = pcp
        else:
            raise NameError("SHIFT_SCOPE must be set to 'frame' or 'average'.")
    chroma = np.sum(chroma, axis=0)
    if PCP_THRESHOLD is not None:
        chroma = normalize_pcp_peak(chroma)
        chroma = pcp_gate(chroma, PCP_THRESHOLD)
    if DETUNING_CORRECTION and DETUNING_CORRECTION_SCOPE == 'average':
        chroma = shift_pcp(chroma, HPCP_SIZE)
    chroma = np.roll(chroma, -3)  # Adjust to essentia's HPCP calculation starting on A...
    if USE_THREE_PROFILES:
        estimation_1 = template_matching_3(chroma, KEY_PROFILE)
    else:
        estimation_1 = template_matching_2(chroma, KEY_PROFILE)
    key_1 = estimation_1[0] + '\t' + estimation_1[1]
    correlation_value = estimation_1[2]
    if WITH_MODAL_DETAILS:
        estimation_2 = template_matching_modal(chroma)
        key_2 = estimation_2[0] + '\t' + estimation_2[1]
        key_verbose = key_1 + '\t' + key_2
        key = key_verbose.split('\t')
        # Assign monotonic tracks to minor:
        if key[3] == 'monotonic' and key[0] == key[2]:
            key = '{0}\tminor'.format(key[0])
        else:
            key = key_1
    else:
        key = key_1
    if output_text_file is not None:
        with open(output_text_file, 'w') as textfile:
            textfile.write(key + '\t' + str(correlation_value) + '\n')
    return key, correlation_value
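# Usage sketch (placeholder paths; the module-level constants such as
# SAMPLE_RATE, HPCP_SIZE and KEY_PROFILE are assumed to be configured
# elsewhere in this module):
key, strength = estimate_key("track.wav", output_text_file="track.key")
print(key, strength)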
def reComputeDescriptors(inputAudioFile, outputJsonFile):
    """
    :param inputAudioFile:
    :param outputJsonFile:
    :return:
    """
    # help(ess.SpectralContrast)

    """
    orig
    M = 1024
    N = 1024
    H = 512
    fs = 44100
    W = 'hann'
    """

    """
    freesound
    Real sampleRate = 44100;
    int frameSize = 2048;
    int hopSize = 1024;
    int zeroPadding = 0;
    string silentFrames = "noise";
    string windowType = "blackmanharris62";

    // Silence Rate
    Real thresholds_dB[] = { -20, -30, -60 };
    vector<Real> thresholds(ARRAY_SIZE(thresholds_dB));
    for (uint i=0; i<thresholds.size(); i++) {
        thresholds[i] = db2lin(thresholds_dB[i]/2.0);
    }
    """

    M = 2048
    N = 2048
    H = 1024
    fs = 44100
    W = 'blackmanharris62'
    # silentFrames = "noise"
    # thresholds_dB = np.array([-20, -30, -60])
    # thresholds = np.power(10.0, thresholds_dB / 20)

    # spectrum = ess.Spectrum(size=N)
    spectrum = ess.Spectrum()
    # window = ess.Windowing(size=M, type=W)
    window = ess.Windowing(type=W)
    # mfcc = ess.MFCC(numberCoefficients=12, inputSize=N/2+1)
    mfcc = ess.MFCC()
    spectral_peaks = ess.SpectralPeaks(minFrequency=1,
                                       maxFrequency=20000,
                                       maxPeaks=100,
                                       sampleRate=fs,
                                       magnitudeThreshold=0,
                                       orderBy="magnitude")
    dissonance = ess.Dissonance()
    # pitch_detection = ess.PitchYinFFT(frameSize=M, sampleRate=fs)
    pitch_detection = ess.PitchYinFFT()
    harmonic_peaks = ess.HarmonicPeaks()
    inharmonicity = ess.Inharmonicity()
    # spectral_contrast = ess.SpectralContrast(sampleRate=fs)
    spectral_contrast = ess.SpectralContrast()
    centroid = ess.Centroid()
    log_attack_time = ess.LogAttackTime()
    hfc = ess.HFC()
    energy = ess.Energy()

    x = ess.MonoLoader(filename=inputAudioFile, sampleRate=fs)()
    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True)

    pool = es.Pool()
    for frame in frames:
        mX = spectrum(window(frame))
        mfcc_bands, mfcc_coeffs = mfcc(mX)
        pool.add('lowlevel.mfcc', mfcc_coeffs)
        pool.add('lowlevel.mfcc_bands', mfcc_bands)

        pfreq, pmag = spectral_peaks(mX)
        inds = pfreq.argsort()
        pfreq_sorted = pfreq[inds]
        pmag_sorted = pmag[inds]
        diss = dissonance(pfreq_sorted, pmag_sorted)
        pool.add('lowlevel.dissonance', diss)

        pitch, pitch_confidence = pitch_detection(mX)
        phfreq, phmag = harmonic_peaks(pfreq_sorted, pmag_sorted, pitch)
        if len(phfreq) > 1:
            inharm = inharmonicity(phfreq, phmag)
            pool.add('sfx.inharmonicity', inharm)

        sc_coeffs, sc_valleys = spectral_contrast(mX)
        pool.add('lowlevel.spectral_contrast', sc_coeffs)

        c = centroid(mX)
        pool.add('lowlevel.spectral_centroid', c)

        lat = log_attack_time(frame)
        pool.add('sfx.logattacktime', lat)

        h = hfc(mX)
        pool.add('lowlevel.hfc', h)

    calc_Mean_Var = ess.PoolAggregator(defaultStats=['mean', 'var'])
    aggrPool = calc_Mean_Var(pool)
    features = makeFeatures(aggrPool)
    json.dump(features, open(outputJsonFile, 'w'))
plt.show()

audio = es.MonoLoader(filename=filename)()
rhythm_extractor = es.RhythmExtractor2013(method="multifeature")
bpm, beats, beats_confidence, _, beats_intervals = rhythm_extractor(audio)
print("bpm", bpm)
bps = 2  # bpm / 60
print("bps", bps)

hpcps = []
for b in range(int(track_length_sec * bps)):
    parts = 5
    spectrum = None
    for i in range(parts):
        # one part of one beat
        frame = audio[int(b * fs_rate + i * fs_rate / bps / parts):
                      int(b * fs_rate + (i + 1) * fs_rate / bps / parts)]
        # frame = audio[s * fs_rate:(s + 1) * fs_rate]
        if spectrum is None:
            spectrum = es.Spectrum()(frame)
        else:
            spectrum += es.Spectrum()(frame)
    es_frequencies, es_magnitudes = es.SpectralPeaks()(spectrum)
    hpcp = es.HPCP()(es_frequencies, es_magnitudes)
    hpcps.append(hpcp)

for h in hpcps:
    names = ["a", "b", "h", "c", "cis", "d", "dis", "e", "f", "fis", "g", "gis"]
    print([f"{name}-{v:0.2}" for name, v in zip(names, h) if v > 0.1])

chords = es.ChordsDetection()(essentia.array(hpcps))
print(chords)
def get_spectral_info(frame):
    """Gets spectrum frequencies and their magnitudes for a single frame"""
    spectrum = es.Spectrum(size=samples_per_frame)(frame)
    freqs, mags = es.SpectralPeaks(**peak_params)(spectrum)
    mags = es.SpectralWhitening()(spectrum, freqs, mags)
    return spectrum, freqs, mags
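# Context sketch (assumption, not from the original source): this helper reads
# two module-level names that are defined elsewhere; plausible definitions
# would look like the following.
samples_per_frame = 4096  # frame length fed to es.Spectrum
peak_params = {"magnitudeThreshold": 1e-5, "maxPeaks": 100,
               "sampleRate": 44100}  # keyword arguments for es.SpectralPeaks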
def compute(audio, pool, options):
    INFO('Computing SFX descriptors...')

    # analysis parameters
    sampleRate = options['sampleRate']
    frameSize = options['frameSize']
    hopSize = options['hopSize']
    windowType = options['windowType']

    # frame algorithms
    frames = ess.FrameGenerator(audio=audio, frameSize=frameSize, hopSize=hopSize)
    window = ess.Windowing(size=frameSize, zeroPadding=0, type=windowType)
    spectrum = ess.Spectrum(size=frameSize)

    # pitch algorithm
    pitch_detection = ess.PitchYinFFT(frameSize=2048, sampleRate=sampleRate)

    # sfx descriptors
    spectral_peaks = ess.SpectralPeaks(sampleRate=sampleRate, orderBy='frequency')
    harmonic_peaks = ess.HarmonicPeaks()
    inharmonicity = ess.Inharmonicity()
    odd2evenharmonicenergyratio = ess.OddToEvenHarmonicEnergyRatio()
    tristimulus = ess.Tristimulus()

    # used for a nice progress display
    total_frames = frames.num_frames()
    n_frames = 0
    start_of_frame = -frameSize * 0.5
    progress = Progress(total=total_frames)

    for frame in frames:
        frameScope = [start_of_frame / sampleRate,
                      (start_of_frame + frameSize) / sampleRate]
        # pool.setCurrentScope(frameScope)

        if options['skipSilence'] and es.isSilent(frame):
            total_frames -= 1
            start_of_frame += hopSize
            continue

        frame_windowed = window(frame)
        frame_spectrum = spectrum(frame_windowed)

        # pitch descriptors
        frame_pitch, frame_pitch_confidence = pitch_detection(frame_spectrum)

        # spectral peaks based descriptors
        frame_frequencies, frame_magnitudes = spectral_peaks(frame_spectrum)

        # ERROR CORRECTION - hoinx 2015-12
        errIdx = np.where(frame_frequencies < 1)
        frame_frequencies = np.delete(frame_frequencies, errIdx)
        frame_magnitudes = np.delete(frame_magnitudes, errIdx)

        (frame_harmonic_frequencies, frame_harmonic_magnitudes) = \
            harmonic_peaks(frame_frequencies, frame_magnitudes, frame_pitch)
        if len(frame_harmonic_frequencies) > 1:
            frame_inharmonicity = inharmonicity(frame_harmonic_frequencies,
                                                frame_harmonic_magnitudes)
            pool.add(namespace + '.' + 'inharmonicity', frame_inharmonicity)
            frame_tristimulus = tristimulus(frame_harmonic_frequencies,
                                            frame_harmonic_magnitudes)
            pool.add(namespace + '.' + 'tristimulus', frame_tristimulus)
            frame_odd2evenharmonicenergyratio = odd2evenharmonicenergyratio(
                frame_harmonic_frequencies, frame_harmonic_magnitudes)
            pool.add(namespace + '.' + 'odd2evenharmonicenergyratio',
                     frame_odd2evenharmonicenergyratio)

        # display of progress report
        progress.update(n_frames)
        n_frames += 1
        start_of_frame += hopSize

    envelope = ess.Envelope()
    file_envelope = envelope(audio)

    # temporal statistics
    decrease = ess.Decrease()
    pool.add(namespace + '.' + 'temporal_decrease', decrease(file_envelope))  # , pool.GlobalScope)

    centralmoments = ess.CentralMoments()
    file_centralmoments = centralmoments(file_envelope)
    distributionshape = ess.DistributionShape()
    (file_spread, file_skewness, file_kurtosis) = distributionshape(file_centralmoments)
    pool.add(namespace + '.' + 'temporal_spread', file_spread)  # , pool.GlobalScope)
    pool.add(namespace + '.' + 'temporal_skewness', file_skewness)  # , pool.GlobalScope)
    pool.add(namespace + '.' + 'temporal_kurtosis', file_kurtosis)  # , pool.GlobalScope)

    centroid = ess.Centroid()
    pool.add(namespace + '.' + 'temporal_centroid', centroid(file_envelope))  # , pool.GlobalScope)

    # effective duration
    effectiveduration = ess.EffectiveDuration()
    pool.add(namespace + '.' + 'effective_duration', effectiveduration(file_envelope))  # , pool.GlobalScope)

    # log attack time
    logattacktime = ess.LogAttackTime()
    pool.add(namespace + '.' + 'logattacktime', logattacktime(audio))  # , pool.GlobalScope)

    # strong decay
    strongdecay = ess.StrongDecay()
    pool.add(namespace + '.' + 'strongdecay', strongdecay(file_envelope))  # , pool.GlobalScope)

    # dynamic profile
    flatness = ess.FlatnessSFX()
    pool.add(namespace + '.' + 'flatness', flatness(file_envelope))  # , pool.GlobalScope)

    """
    # onsets number
    onsets_number = len(pool['rhythm.onset_times'][0])
    pool.add(namespace + '.' + 'onsets_number', onsets_number)  # , pool.GlobalScope)
    """

    # morphological descriptors
    max_to_total = ess.MaxToTotal()
    pool.add(namespace + '.' + 'max_to_total', max_to_total(file_envelope))  # , pool.GlobalScope)

    tc_to_total = ess.TCToTotal()
    pool.add(namespace + '.' + 'tc_to_total', tc_to_total(file_envelope))  # , pool.GlobalScope)

    derivativeSFX = ess.DerivativeSFX()
    (der_av_after_max, max_der_before_max) = derivativeSFX(file_envelope)
    pool.add(namespace + '.' + 'der_av_after_max', der_av_after_max)  # , pool.GlobalScope)
    pool.add(namespace + '.' + 'max_der_before_max', max_der_before_max)  # , pool.GlobalScope)

    # pitch profile
    """
    pitch = pool['lowlevel.pitch']
    if len(pitch) > 1:
        pool.add(namespace + '.' + 'pitch_max_to_total', max_to_total(pitch))  # , pool.GlobalScope)
        min_to_total = ess.MinToTotal()
        pool.add(namespace + '.' + 'pitch_min_to_total', min_to_total(pitch))  # , pool.GlobalScope)
        pitch_centroid = ess.Centroid(range=len(pitch) - 1)
        pool.add(namespace + '.' + 'pitch_centroid', pitch_centroid(pitch))  # , pool.GlobalScope)
        pitch_after_max_to_before_max_energy_ratio = ess.AfterMaxToBeforeMaxEnergyRatio()
        pool.add(namespace + '.' + 'pitch_after_max_to_before_max_energy_ratio',
                 pitch_after_max_to_before_max_energy_ratio(pitch))  # , pool.GlobalScope)
    else:
        pool.add(namespace + '.' + 'pitch_max_to_total', 0.0)  # , pool.GlobalScope)
        pool.add(namespace + '.' + 'pitch_min_to_total', 0.0)  # , pool.GlobalScope)
        pool.add(namespace + '.' + 'pitch_centroid', 0.0)  # , pool.GlobalScope)
        pool.add(namespace + '.' + 'pitch_after_max_to_before_max_energy_ratio', 0.0)  # , pool.GlobalScope)
    """

    progress.finish()
def key_ecir(input_audio_file, output_text_file, **kwargs):
    if not kwargs:
        kwargs = KEY_SETTINGS
    loader = estd.MonoLoader(filename=input_audio_file,
                             sampleRate=kwargs["SAMPLE_RATE"])
    cut = estd.FrameCutter(frameSize=kwargs["WINDOW_SIZE"],
                           hopSize=kwargs["HOP_SIZE"])
    window = estd.Windowing(size=kwargs["WINDOW_SIZE"],
                            type=kwargs["WINDOW_SHAPE"])
    rfft = estd.Spectrum(size=kwargs["WINDOW_SIZE"])
    sw = estd.SpectralWhitening(maxFrequency=kwargs["MAX_HZ"],
                                sampleRate=kwargs["SAMPLE_RATE"])
    speaks = estd.SpectralPeaks(magnitudeThreshold=kwargs["SPECTRAL_PEAKS_THRESHOLD"],
                                maxFrequency=kwargs["MAX_HZ"],
                                minFrequency=kwargs["MIN_HZ"],
                                maxPeaks=kwargs["SPECTRAL_PEAKS_MAX"],
                                sampleRate=kwargs["SAMPLE_RATE"])
    hpcp = estd.HPCP(bandPreset=kwargs["HPCP_BAND_PRESET"],
                     splitFrequency=kwargs["HPCP_SPLIT_HZ"],
                     harmonics=kwargs["HPCP_HARMONICS"],
                     maxFrequency=kwargs["MAX_HZ"],
                     minFrequency=kwargs["MIN_HZ"],
                     nonLinear=kwargs["HPCP_NON_LINEAR"],
                     normalized=kwargs["HPCP_NORMALIZE"],
                     referenceFrequency=kwargs["HPCP_REFERENCE_HZ"],
                     sampleRate=kwargs["SAMPLE_RATE"],
                     size=kwargs["HPCP_SIZE"],
                     weightType=kwargs["HPCP_WEIGHT_TYPE"],
                     windowSize=kwargs["HPCP_WEIGHT_WINDOW_SEMITONES"],
                     maxShifted=kwargs["HPCP_SHIFT"])
    key = estd.Key(numHarmonics=kwargs["KEY_HARMONICS"],
                   pcpSize=kwargs["HPCP_SIZE"],
                   profileType=kwargs["KEY_PROFILE"],
                   slope=kwargs["KEY_SLOPE"],
                   usePolyphony=kwargs["KEY_POLYPHONY"],
                   useThreeChords=kwargs["KEY_USE_THREE_CHORDS"])
    audio = loader()
    if kwargs["HIGHPASS_CUTOFF"] is not None:
        hpf = estd.HighPass(cutoffFrequency=kwargs["HIGHPASS_CUTOFF"],
                            sampleRate=kwargs["SAMPLE_RATE"])
        audio = hpf(hpf(hpf(audio)))
    if kwargs["DURATION"] is not None:
        audio = audio[(kwargs["START_TIME"] * kwargs["SAMPLE_RATE"]):
                      (kwargs["DURATION"] * kwargs["SAMPLE_RATE"])]
    duration = len(audio)
    number_of_frames = int(duration / kwargs["HOP_SIZE"])
    chroma = []
    for bang in range(number_of_frames):
        spek = rfft(window(cut(audio)))
        p1, p2 = speaks(spek)  # p1 = frequencies; p2 = magnitudes
        if kwargs["SPECTRAL_WHITENING"]:
            p2 = sw(spek, p1, p2)
        vector = hpcp(p1, p2)
        sum_vector = np.sum(vector)
        if sum_vector > 0:
            if not kwargs["DETUNING_CORRECTION"] or \
                    kwargs["DETUNING_CORRECTION_SCOPE"] == 'average':
                chroma.append(vector)
            elif kwargs["DETUNING_CORRECTION"] and \
                    kwargs["DETUNING_CORRECTION_SCOPE"] == 'frame':
                vector = _detuning_correction(vector, kwargs["HPCP_SIZE"])
                chroma.append(vector)
            else:
                print("SHIFT_SCOPE must be set to 'frame' or 'average'")
    chroma = np.mean(chroma, axis=0)
    if kwargs["DETUNING_CORRECTION"] and \
            kwargs["DETUNING_CORRECTION_SCOPE"] == 'average':
        chroma = _detuning_correction(chroma, kwargs["HPCP_SIZE"])
    key = key(chroma.tolist())
    confidence = (key[2], key[3])
    key = key[0] + '\t' + key[1]
    with open(output_text_file, 'w') as textfile:
        textfile.write(key + '\n')
    return key, confidence
def compute(audio, pool, options):
    # analysis parameters
    sampleRate = options['sampleRate']
    frameSize = options['frameSize']
    hopSize = options['hopSize']
    windowType = options['windowType']

    # temporal descriptors
    lpc = ess.LPC(order=10, type='warped', sampleRate=sampleRate)
    zerocrossingrate = ess.ZeroCrossingRate()

    # frame algorithms
    frames = ess.FrameGenerator(audio=audio, frameSize=frameSize, hopSize=hopSize)
    window = ess.Windowing(size=frameSize, zeroPadding=0, type=windowType)
    spectrum = ess.Spectrum(size=frameSize)

    # spectral algorithms
    barkbands = ess.BarkBands(sampleRate=sampleRate)
    centralmoments = ess.CentralMoments()
    crest = ess.Crest()
    centroid = ess.Centroid()
    decrease = ess.Decrease()
    spectral_contrast = ess.SpectralContrast(frameSize=frameSize,
                                             sampleRate=sampleRate,
                                             numberBands=6,
                                             lowFrequencyBound=20,
                                             highFrequencyBound=11000,
                                             neighbourRatio=0.4,
                                             staticDistribution=0.15)
    distributionshape = ess.DistributionShape()
    energy = ess.Energy()
    # energyband_bass, energyband_middle and energyband_high parameters come
    # from "standard" hi-fi equalizers
    energyband_bass = ess.EnergyBand(startCutoffFrequency=20.0,
                                     stopCutoffFrequency=150.0,
                                     sampleRate=sampleRate)
    energyband_middle_low = ess.EnergyBand(startCutoffFrequency=150.0,
                                           stopCutoffFrequency=800.0,
                                           sampleRate=sampleRate)
    energyband_middle_high = ess.EnergyBand(startCutoffFrequency=800.0,
                                            stopCutoffFrequency=4000.0,
                                            sampleRate=sampleRate)
    energyband_high = ess.EnergyBand(startCutoffFrequency=4000.0,
                                     stopCutoffFrequency=20000.0,
                                     sampleRate=sampleRate)
    flatnessdb = ess.FlatnessDB()
    flux = ess.Flux()
    harmonic_peaks = ess.HarmonicPeaks()
    hfc = ess.HFC()
    mfcc = ess.MFCC()
    rolloff = ess.RollOff()
    rms = ess.RMS()
    strongpeak = ess.StrongPeak()

    # pitch algorithms
    pitch_detection = ess.PitchYinFFT(frameSize=frameSize, sampleRate=sampleRate)
    pitch_salience = ess.PitchSalience()

    # dissonance
    spectral_peaks = ess.SpectralPeaks(sampleRate=sampleRate, orderBy='frequency')
    dissonance = ess.Dissonance()

    # spectral complexity
    # magnitudeThreshold = 0.005 is hardcoded for a "blackmanharris62" frame
    spectral_complexity = ess.SpectralComplexity(magnitudeThreshold=0.005)

    INFO('Computing Low-Level descriptors...')

    # used for a nice progress display
    total_frames = frames.num_frames()
    n_frames = 0
    start_of_frame = -frameSize * 0.5

    pitches, pitch_confidences = [], []

    progress = Progress(total=total_frames)

    # scPool = es.Pool()  # pool for spectral contrast

    for frame in frames:
        frameScope = [start_of_frame / sampleRate,
                      (start_of_frame + frameSize) / sampleRate]
        # pool.setCurrentScope(frameScope)

        # silence rate
        # pool.add(namespace + '.' + 'silence_rate_60dB', es.isSilent(frame))
        pool.add(namespace + '.' + 'silence_rate_60dB', is_silent_threshold(frame, -60))
        pool.add(namespace + '.' + 'silence_rate_30dB', is_silent_threshold(frame, -30))
        pool.add(namespace + '.' + 'silence_rate_20dB', is_silent_threshold(frame, -20))

        if options['skipSilence'] and es.isSilent(frame):
            total_frames -= 1
            start_of_frame += hopSize
            continue

        # temporal descriptors
        pool.add(namespace + '.' + 'zerocrossingrate', zerocrossingrate(frame))
        (frame_lpc, frame_lpc_reflection) = lpc(frame)
        pool.add(namespace + '.' + 'temporal_lpc', frame_lpc)

        frame_windowed = window(frame)
        frame_spectrum = spectrum(frame_windowed)

        # spectrum-based descriptors
        power_spectrum = frame_spectrum ** 2
        pool.add(namespace + '.' + 'spectral_centroid', centroid(power_spectrum))
        pool.add(namespace + '.' + 'spectral_decrease', decrease(power_spectrum))
        pool.add(namespace + '.' + 'spectral_energy', energy(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_low', energyband_bass(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_middle_low', energyband_middle_low(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_middle_high', energyband_middle_high(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_energyband_high', energyband_high(frame_spectrum))
        pool.add(namespace + '.' + 'hfc', hfc(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_rms', rms(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_flux', flux(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_rolloff', rolloff(frame_spectrum))
        pool.add(namespace + '.' + 'spectral_strongpeak', strongpeak(frame_spectrum))

        # central moments descriptors
        frame_centralmoments = centralmoments(power_spectrum)
        (frame_spread, frame_skewness, frame_kurtosis) = distributionshape(frame_centralmoments)
        pool.add(namespace + '.' + 'spectral_kurtosis', frame_kurtosis)
        pool.add(namespace + '.' + 'spectral_spread', frame_spread)
        pool.add(namespace + '.' + 'spectral_skewness', frame_skewness)

        # dissonance
        (frame_frequencies, frame_magnitudes) = spectral_peaks(frame_spectrum)
        frame_dissonance = dissonance(frame_frequencies, frame_magnitudes)
        pool.add(namespace + '.' + 'dissonance', frame_dissonance)

        # mfcc
        (frame_melbands, frame_mfcc) = mfcc(frame_spectrum)
        pool.add(namespace + '.' + 'mfcc', frame_mfcc)

        # spectral contrast
        (sc_coeffs, sc_valleys) = spectral_contrast(frame_spectrum)
        # scPool.add(namespace + '.' + 'sccoeffs', sc_coeffs)
        # scPool.add(namespace + '.' + 'scvalleys', sc_valleys)
        pool.add(namespace + '.' + 'spectral_contrast', sc_coeffs)

        # barkbands-based descriptors
        frame_barkbands = barkbands(frame_spectrum)
        pool.add(namespace + '.' + 'barkbands', frame_barkbands)
        pool.add(namespace + '.' + 'spectral_crest', crest(frame_barkbands))
        pool.add(namespace + '.' + 'spectral_flatness_db', flatnessdb(frame_barkbands))
        barkbands_centralmoments = ess.CentralMoments(range=len(frame_barkbands) - 1)
        (barkbands_spread, barkbands_skewness, barkbands_kurtosis) = \
            distributionshape(barkbands_centralmoments(frame_barkbands))
        pool.add(namespace + '.' + 'barkbands_spread', barkbands_spread)
        pool.add(namespace + '.' + 'barkbands_skewness', barkbands_skewness)
        pool.add(namespace + '.' + 'barkbands_kurtosis', barkbands_kurtosis)

        # pitch descriptors
        frame_pitch, frame_pitch_confidence = pitch_detection(frame_spectrum)
        if 0 < frame_pitch <= 20000.:
            pool.add(namespace + '.' + 'pitch', frame_pitch)
            pitches.append(frame_pitch)
            pitch_confidences.append(frame_pitch_confidence)
        pool.add(namespace + '.' + 'pitch_instantaneous_confidence', frame_pitch_confidence)

        frame_pitch_salience = pitch_salience(frame_spectrum[:-1])
        pool.add(namespace + '.' + 'pitch_salience', frame_pitch_salience)

        # spectral complexity
        pool.add(namespace + '.' + 'spectral_complexity', spectral_complexity(frame_spectrum))

        # display of progress report
        progress.update(n_frames)
        n_frames += 1
        start_of_frame += hopSize

    # if no 'temporal_zerocrossingrate' it means that this is a silent file
    if 'zerocrossingrate' not in descriptorNames(pool.descriptorNames(), namespace):
        raise ess.EssentiaError('This is a silent file!')

    # spectralContrastPCA(scPool, pool)

    # build pitch value histogram
    from math import log
    from numpy import bincount

    # convert from Hz to midi notes
    midipitches = []
    unknown = 0
    for freq in pitches:
        if freq > 0. and freq <= 12600:
            midipitches.append(12 * (log(freq / 6.875) / 0.69314718055995) - 3.)
        else:
            unknown += 1

    if len(midipitches) > 0:
        # compute histogram (bincount expects non-negative integers, so the
        # MIDI values are rounded first)
        midipitchhist = bincount([int(round(p)) for p in midipitches])
        # set 0 midi pitch to be the number of pruned values
        midipitchhist[0] = unknown
        # normalise
        midipitchhist = [val / float(sum(midipitchhist)) for val in midipitchhist]
        # zero pad
        for i in range(128 - len(midipitchhist)):
            midipitchhist.append(0.0)
    else:
        midipitchhist = [0.] * 128
        midipitchhist[0] = 1.

    # pitchhist = ess.array(zip(range(len(midipitchhist)), midipitchhist))
    pool.add(namespace + '.' + 'spectral_pitch_histogram', midipitchhist)  # , pool.GlobalScope)
    # the code below is the same as the one above:
    # for note in midipitchhist:
    #     pool.add(namespace + '.' + 'spectral_pitch_histogram_values', note)
    #     print("midi note:", note)

    pitch_centralmoments = ess.CentralMoments(range=len(midipitchhist) - 1)
    (pitch_histogram_spread, pitch_histogram_skewness, pitch_histogram_kurtosis) = \
        distributionshape(pitch_centralmoments(midipitchhist))
    pool.add(namespace + '.' + 'spectral_pitch_histogram_spread',
             pitch_histogram_spread)  # , pool.GlobalScope)

    progress.finish()
def key_aes(input_audio_file, output_text_file, **kwargs):
    """
    This function estimates the overall key of an audio track, optionally with
    extra modal information.
    :type input_audio_file: str
    :type output_text_file: str
    """
    if not kwargs:
        kwargs = KEY_SETTINGS
    loader = estd.MonoLoader(filename=input_audio_file,
                             sampleRate=kwargs["SAMPLE_RATE"])
    cut = estd.FrameCutter(frameSize=kwargs["WINDOW_SIZE"],
                           hopSize=kwargs["HOP_SIZE"])
    window = estd.Windowing(size=kwargs["WINDOW_SIZE"],
                            type=kwargs["WINDOW_SHAPE"])
    rfft = estd.Spectrum(size=kwargs["WINDOW_SIZE"])
    sw = estd.SpectralWhitening(maxFrequency=kwargs["MAX_HZ"],
                                sampleRate=kwargs["SAMPLE_RATE"])
    speaks = estd.SpectralPeaks(magnitudeThreshold=kwargs["SPECTRAL_PEAKS_THRESHOLD"],
                                maxFrequency=kwargs["MAX_HZ"],
                                minFrequency=kwargs["MIN_HZ"],
                                maxPeaks=kwargs["SPECTRAL_PEAKS_MAX"],
                                sampleRate=kwargs["SAMPLE_RATE"])
    hpcp = estd.HPCP(bandPreset=kwargs["HPCP_BAND_PRESET"],
                     splitFrequency=kwargs["HPCP_SPLIT_HZ"],
                     harmonics=kwargs["HPCP_HARMONICS"],
                     maxFrequency=kwargs["MAX_HZ"],
                     minFrequency=kwargs["MIN_HZ"],
                     nonLinear=kwargs["HPCP_NON_LINEAR"],
                     normalized=kwargs["HPCP_NORMALIZE"],
                     referenceFrequency=kwargs["HPCP_REFERENCE_HZ"],
                     sampleRate=kwargs["SAMPLE_RATE"],
                     size=kwargs["HPCP_SIZE"],
                     weightType=kwargs["HPCP_WEIGHT_TYPE"],
                     windowSize=kwargs["HPCP_WEIGHT_WINDOW_SEMITONES"],
                     maxShifted=kwargs["HPCP_SHIFT"])
    audio = loader()
    if kwargs["HIGHPASS_CUTOFF"] is not None:
        hpf = estd.HighPass(cutoffFrequency=kwargs["HIGHPASS_CUTOFF"],
                            sampleRate=kwargs["SAMPLE_RATE"])
        audio = hpf(hpf(hpf(audio)))
    if kwargs["DURATION"] is not None:
        audio = audio[(kwargs["START_TIME"] * kwargs["SAMPLE_RATE"]):
                      (kwargs["DURATION"] * kwargs["SAMPLE_RATE"])]
    duration = len(audio)
    number_of_frames = int(duration / kwargs["HOP_SIZE"])
    chroma = []
    for bang in range(number_of_frames):
        spek = rfft(window(cut(audio)))
        p1, p2 = speaks(spek)
        if kwargs["SPECTRAL_WHITENING"]:
            p2 = sw(spek, p1, p2)
        pcp = hpcp(p1, p2)
        if np.sum(pcp) > 0:
            if not kwargs["DETUNING_CORRECTION"] or \
                    kwargs["DETUNING_CORRECTION_SCOPE"] == 'average':
                chroma.append(pcp)
            elif kwargs["DETUNING_CORRECTION"] and \
                    kwargs["DETUNING_CORRECTION_SCOPE"] == 'frame':
                pcp = _detuning_correction(pcp, kwargs["HPCP_SIZE"])
                chroma.append(pcp)
            else:
                raise NameError("SHIFT_SCOPE must be set to 'frame' or 'average'.")
    if not chroma:
        return 'Silence'
    chroma = np.sum(chroma, axis=0)
    chroma = norm_peak(chroma)
    if kwargs["PCP_THRESHOLD"] is not None:
        chroma = vector_threshold(chroma, kwargs["PCP_THRESHOLD"])
    if kwargs["DETUNING_CORRECTION"] and \
            kwargs["DETUNING_CORRECTION_SCOPE"] == 'average':
        chroma = _detuning_correction(chroma, kwargs["HPCP_SIZE"])
    # Adjust to essentia's HPCP calculation starting on A (pc = 9)
    chroma = np.roll(chroma, -3 * (kwargs["HPCP_SIZE"] // 12))
    estimation_1 = estimate_key(chroma,
                                kwargs["KEY_PROFILE"],
                                kwargs["PROFILE_INTERPOLATION"],
                                conf_thres=kwargs["NOKEY_THRESHOLD"],
                                vocabulary=kwargs["KEY_VOCABULARY"])
    key_1 = estimation_1[0]
    correlation_value = estimation_1[1]
    if kwargs["WITH_MODAL_DETAILS"]:
        estimation_2 = _key7(chroma, kwargs["PROFILE_INTERPOLATION"])
        key_2 = estimation_2[0] + '\t' + estimation_2[1]
        key_verbose = key_1 + '\t' + key_2
        key = key_verbose.split('\t')
        # Assign monotonic tracks to minor:
        if key[3] == 'monotonic' and key[0] == key[2]:
            key = '{0}\tminor'.format(key[0])
        else:
            key = key_1
    else:
        key = key_1
    with open(output_text_file, 'w') as textfile:
        textfile.write(key)
    return key, correlation_value
# retrieve filenames from folder:
soundfiles = os.listdir(audio_folder)
if '.DS_Store' in soundfiles:
    soundfiles.remove('.DS_Store')

# ANALYSIS
print("\nANALYSIS...")
for item in soundfiles:
    loader = estd.MonoLoader(filename=audio_folder + '/' + item,
                             sampleRate=sample_rate)
    window = estd.Windowing(size=window_size, type="blackmanharris62")
    rfft = estd.Spectrum(size=window_size)
    speaks = estd.SpectralPeaks(orderBy="magnitude",
                                magnitudeThreshold=magnitude_threshold,
                                minFrequency=min_frequency,
                                maxFrequency=max_frequency,
                                maxPeaks=max_peaks,
                                sampleRate=sample_rate)
    hpcp = estd.HPCP(bandPreset=band_preset,
                     harmonics=harmonics,
                     minFrequency=min_frequency,
                     maxFrequency=max_frequency,
                     nonLinear=non_linear,
                     normalized=normalize,
                     sampleRate=sample_rate,
                     weightType=weight_type,
                     windowSize=weight_window_size)
    key = estd.Key(numHarmonics=harmonics_key,
                   slope=slope,
                   usePolyphony=polyphony,
                   useThreeChords=three_chords,
def reComputeDescriptors(inputAudioFile, outputJsonFile):
    """
    :param inputAudioFile:
    :param outputJsonFile:
    :return:
    """
    M = 2048
    N = 2048
    H = 1024
    fs = 44100
    W = 'blackmanharris62'

    # spectrum = ess.Spectrum(size=N)
    spectrum = ess.Spectrum()
    # window = ess.Windowing(size=M, type=W)
    window = ess.Windowing(type=W)
    # mfcc = ess.MFCC(numberCoefficients=12, inputSize=N/2+1)
    mfcc = ess.MFCC()
    spectral_peaks = ess.SpectralPeaks(minFrequency=1,
                                       maxFrequency=20000,
                                       maxPeaks=100,
                                       sampleRate=fs,
                                       magnitudeThreshold=0,
                                       orderBy="magnitude")
    dissonance = ess.Dissonance()
    # pitch_detection = ess.PitchYinFFT(frameSize=M, sampleRate=fs)
    pitch_detection = ess.PitchYinFFT()
    harmonic_peaks = ess.HarmonicPeaks()
    inharmonicity = ess.Inharmonicity()
    # spectral_contrast = ess.SpectralContrast(sampleRate=fs)
    spectral_contrast = ess.SpectralContrast()
    centroid = ess.Centroid()
    log_attack_time = ess.LogAttackTime()
    hfc = ess.HFC()
    # magnitudeThreshold = 0.005 is hardcoded for a "blackmanharris62" frame, see lowlevel.py
    spectral_complexity = ess.SpectralComplexity(magnitudeThreshold=0.005)
    energy = ess.Energy()

    x = ess.MonoLoader(filename=inputAudioFile, sampleRate=fs)()

    # first pass: frame energies, to normalize against the loudest frame
    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True)
    E = []
    numFrames = 0
    for frame in frames:
        numFrames += 1
        E_frame = energy(frame)
        E.append(E_frame)
    E_max = np.max(E)

    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True)
    pools = [(t, es.Pool()) for t in dscr.threshold]
    for frame in frames:
        eNorm = energy(frame) / E_max
        threshPools = []
        for t, pool in pools:
            if eNorm >= t:
                threshPools.append(pool)

        mX = spectrum(window(frame))
        mfcc_bands, mfcc_coeffs = mfcc(mX)
        [pool.add('lowlevel.mfcc', mfcc_coeffs) for pool in threshPools]
        # [pool.add('lowlevel.mfcc_bands', mfcc_bands) for pool in threshPools]

        pfreq, pmag = spectral_peaks(mX)
        inds = pfreq.argsort()
        pfreq_sorted = pfreq[inds]
        pmag_sorted = pmag[inds]
        diss = dissonance(pfreq_sorted, pmag_sorted)
        [pool.add('lowlevel.dissonance', diss) for pool in threshPools]

        pitch, pitch_confidence = pitch_detection(mX)
        phfreq, phmag = harmonic_peaks(pfreq_sorted, pmag_sorted, pitch)
        if len(phfreq) > 1:
            inharm = inharmonicity(phfreq, phmag)
            [pool.add('sfx.inharmonicity', inharm) for pool in threshPools]

        sc_coeffs, sc_valleys = spectral_contrast(mX)
        [pool.add('lowlevel.spectral_contrast', sc_coeffs) for pool in threshPools]

        c = centroid(mX)
        [pool.add('lowlevel.spectral_centroid', c) for pool in threshPools]

        lat = log_attack_time(frame)
        [pool.add('sfx.logattacktime', lat) for pool in threshPools]

        h = hfc(mX)
        [pool.add('lowlevel.hfc', h) for pool in threshPools]

        spec_complx = spectral_complexity(mX)
        [pool.add('lowlevel.spectral_complexity', spec_complx) for pool in threshPools]

    # calc_Mean_Var = ess.PoolAggregator(defaultStats=['mean', 'var'])
    calc_Mean_Var = ess.PoolAggregator(defaultStats=['mean'])
    aggrPools = [calc_Mean_Var(pool) for t, pool in pools]
    features = {}
    [appendFeatures(features, aggrPools[i], ("ethc" + str(dscr.thresholdSelect[i])))
     for i in range(len(aggrPools))]
    json.dump(features, open(outputJsonFile, 'w'))
def key_detector():
    reloj()
    # create a directory to write the results with a unique time id:
    if results_to_file or results_to_csv:
        uniqueTime = str(int(tiempo()))
        wd = os.getcwd()
        temp_folder = wd + '/KeyDetection_' + uniqueTime
        os.mkdir(temp_folder)
    if results_to_csv:
        import csv
        csvFile = open(temp_folder + '/Estimation_&_PCP.csv', 'w')
        lineWriter = csv.writer(csvFile, delimiter=',')
    # retrieve files and filenames according to the desired settings:
    if analysis_mode == 'title':
        allfiles = os.listdir(audio_folder)
        if '.DS_Store' in allfiles:
            allfiles.remove('.DS_Store')
        for item in collection:
            collection[collection.index(item)] = ' > ' + item + '.'
        for item in genre:
            genre[genre.index(item)] = ' < ' + item + ' > '
        for item in modality:
            modality[modality.index(item)] = ' ' + item + ' < '
        analysis_files = []
        for item in allfiles:
            if any(e1 for e1 in collection if e1 in item):
                if any(e2 for e2 in genre if e2 in item):
                    if any(e3 for e3 in modality if e3 in item):
                        analysis_files.append(item)
        song_instances = len(analysis_files)
        print(song_instances, 'songs matching the selected criteria:')
        print(collection, genre, modality)
        if limit_analysis == 0:
            pass
        elif limit_analysis < song_instances:
            analysis_files = sample(analysis_files, limit_analysis)
            print("taking", limit_analysis, "random samples...\n")
    else:
        analysis_files = os.listdir(audio_folder)
        if '.DS_Store' in analysis_files:
            analysis_files.remove('.DS_Store')
        print(len(analysis_files), '\nsongs in folder.\n')
        groundtruth_files = os.listdir(groundtruth_folder)
        if '.DS_Store' in groundtruth_files:
            groundtruth_files.remove('.DS_Store')

    # ANALYSIS
    # ========
    if verbose:
        print("ANALYSING INDIVIDUAL SONGS...")
        print("=============================")
    if confusion_matrix:
        matrix = 24 * 24 * [0]
    mirex_scores = []
    for item in analysis_files:
        # INSTANTIATE ESSENTIA ALGORITHMS
        # ===============================
        loader = estd.MonoLoader(filename=audio_folder + '/' + item,
                                 sampleRate=sample_rate)
        cut = estd.FrameCutter(frameSize=window_size, hopSize=hop_size)
        window = estd.Windowing(size=window_size, type=window_type)
        rfft = estd.Spectrum(size=window_size)
        sw = estd.SpectralWhitening(maxFrequency=max_frequency,
                                    sampleRate=sample_rate)
        speaks = estd.SpectralPeaks(magnitudeThreshold=magnitude_threshold,
                                    maxFrequency=max_frequency,
                                    minFrequency=min_frequency,
                                    maxPeaks=max_peaks,
                                    sampleRate=sample_rate)
        hpcp = estd.HPCP(bandPreset=band_preset,
                         harmonics=harmonics,
                         maxFrequency=max_frequency,
                         minFrequency=min_frequency,
                         nonLinear=non_linear,
                         normalized=normalize,
                         referenceFrequency=reference_frequency,
                         sampleRate=sample_rate,
                         size=hpcp_size,
                         splitFrequency=split_frequency,
                         weightType=weight_type,
                         windowSize=weight_window_size)
        key = estd.Key(numHarmonics=num_harmonics,
                       pcpSize=hpcp_size,
                       profileType=profile_type,
                       slope=slope,
                       usePolyphony=use_polyphony,
                       useThreeChords=use_three_chords)
        # ACTUAL ANALYSIS
        # ===============
        audio = loader()
        duration = len(audio)
        if skip_first_minute and duration > (sample_rate * 60):
            audio = audio[sample_rate * 60:]
            duration = len(audio)
        if first_n_secs > 0:
            if duration > (first_n_secs * sample_rate):
                audio = audio[:first_n_secs * sample_rate]
                duration = len(audio)
        if avoid_edges > 0:
            initial_sample = (avoid_edges * duration) // 100
            final_sample = duration - initial_sample
            audio = audio[initial_sample:final_sample]
            duration = len(audio)
        number_of_frames = duration // hop_size
        chroma = []
        for bang in range(number_of_frames):
            spek = rfft(window(cut(audio)))
            p1, p2 = speaks(spek)  # p1 are frequencies; p2 magnitudes
            if spectral_whitening:
                p2 = sw(spek, p1, p2)
            vector = hpcp(p1, p2)
            sum_vector = np.sum(vector)
            if sum_vector > 0:
                if not shift_spectrum or shift_scope == 'average':
                    chroma.append(vector)
                elif shift_spectrum and shift_scope == 'frame':
                    vector = shift_vector(vector, hpcp_size)
                    chroma.append(vector)
                else:
                    print("shift_scope must be set to 'frame' or 'average'")
        chroma = np.mean(chroma, axis=0)
        if shift_spectrum and shift_scope == 'average':
            chroma = shift_vector(chroma, hpcp_size)
        estimation = key(chroma.tolist())
        result = estimation[0] + ' ' + estimation[1]
        confidence = estimation[2]
        if results_to_csv:
            chroma = list(chroma)

        # MIREX EVALUATION:
        # =================
        if analysis_mode == 'title':
            ground_truth = item[item.find(' = ') + 3:item.rfind(' < ')]
            if verbose and confidence < confidence_threshold:
                print(item[:item.rfind(' = ')])
                print('G:', ground_truth, '|| P:', end=' ')
            if results_to_csv:
                title = item[:item.rfind(' = ')]
                # chroma is a plain list here, so the 36 HPCP bins can be
                # written in one row without enumerating each index
                lineWriter.writerow([title, ground_truth] + chroma + [result])
            ground_truth = key_to_list(ground_truth)
            estimation = key_to_list(result)
            score = mirex_score(ground_truth, estimation)
            mirex_scores.append(score)
        else:
            filename_to_match = item[:item.rfind('.')] + '.txt'
            print(filename_to_match)
            if filename_to_match in groundtruth_files:
                groundtruth_file = open(groundtruth_folder + '/' + filename_to_match, 'r')
                ground_truth = groundtruth_file.readline()
                if "\t" in ground_truth:
                    ground_truth = re.sub("\t", " ", ground_truth)
                if results_to_csv:
                    lineWriter.writerow([filename_to_match] + chroma + [result])
                ground_truth = key_to_list(ground_truth)
                estimation = key_to_list(result)
                score = mirex_score(ground_truth, estimation)
                mirex_scores.append(score)
            else:
                print("FILE NOT FOUND... Skipping it from evaluation.\n")
                continue

        # CONFUSION MATRIX:
        # =================
        if confusion_matrix:
            xpos = (ground_truth[0] + (ground_truth[0] * 24)) + \
                   (-1 * (ground_truth[1] - 1) * 24 * 12)
            ypos = ((estimation[0] - ground_truth[0]) +
                    (-1 * (estimation[1] - 1) * 12))
            matrix[xpos + ypos] += 1
        if verbose and confidence < confidence_threshold:
            print(result, '(%.2f)' % confidence, '|| SCORE:', score, '\n')

        # WRITE RESULTS TO FILE:
        # ======================
        if results_to_file:
            with open(temp_folder + '/' + item[:-3] + 'txt', 'w') as textfile:
                textfile.write(result)
    if results_to_csv:
        csvFile.close()
    print(len(mirex_scores), "files analysed in", reloj(), "secs.\n")
    if confusion_matrix:
        matrix = np.matrix(matrix)
        matrix = matrix.reshape(24, 24)
        print(matrix)
        if results_to_file:
            np.savetxt(temp_folder + '/_confusion_matrix.csv',
                       matrix,
                       fmt='%i',
                       delimiter=',',
                       header='C,C#,D,Eb,E,F,F#,G,G#,A,Bb,B,'
                              'Cm,C#m,Dm,Ebm,Em,Fm,F#m,Gm,G#m,Am,Bbm,Bm')

    # MIREX RESULTS
    # =============
    evaluation_results = mirex_evaluation(mirex_scores)

    # WRITE INFO TO FILE
    # ==================
    if results_to_file:
        settings = ("SETTINGS\n========"
                    "\nAvoid edges ('%' of duration disregarded at both ends (0 = complete)) = " + str(avoid_edges) +
                    "\nfirst N secs = " + str(first_n_secs) +
                    "\nshift spectrum to fit tempered scale = " + str(shift_spectrum) +
                    "\nspectral whitening = " + str(spectral_whitening) +
                    "\nsample rate = " + str(sample_rate) +
                    "\nwindow size = " + str(window_size) +
                    "\nhop size = " + str(hop_size) +
                    "\nmagnitude threshold = " + str(magnitude_threshold) +
                    "\nminimum frequency = " + str(min_frequency) +
                    "\nmaximum frequency = " + str(max_frequency) +
                    "\nmaximum peaks = " + str(max_peaks) +
                    "\nband preset = " + str(band_preset) +
                    "\nsplit frequency = " + str(split_frequency) +
                    "\nharmonics = " + str(harmonics) +
                    "\nnon linear = " + str(non_linear) +
                    "\nnormalize = " + str(normalize) +
                    "\nreference frequency = " + str(reference_frequency) +
                    "\nhpcp size = " + str(hpcp_size) +
                    "\nweight type = " + weight_type +
                    "\nweight window size in semitones = " + str(weight_window_size) +
                    "\nharmonics key = " + str(num_harmonics) +
                    "\nslope = " + str(slope) +
                    "\nprofile = " + profile_type +
                    "\npolyphony = " + str(use_polyphony) +
                    "\nuse three chords = " + str(use_three_chords))
        results_for_file = ("\n\nEVALUATION RESULTS\n=================="
                            "\nCorrect: " + str(evaluation_results[0]) +
                            "\nFifth: " + str(evaluation_results[1]) +
                            "\nRelative: " + str(evaluation_results[2]) +
                            "\nParallel: " + str(evaluation_results[3]) +
                            "\nError: " + str(evaluation_results[4]) +
                            "\nWeighted: " + str(evaluation_results[5]))
        write_to_file = open(temp_folder + '/_SUMMARY.txt', 'w')
        write_to_file.write(settings)
        write_to_file.write(results_for_file)
        if analysis_mode == 'title':
            corpus = ("\n\nANALYSIS CORPUS\n===============\n" +
                      str(collection) + '\n' + str(genre) + '\n' + str(modality) +
                      '\n\n' + str(len(mirex_scores)) + " files analysed.\n")
            write_to_file.write(corpus)
        write_to_file.close()
# Temporal descriptors
power = es.InstantPower()
log_attack_time = es.LogAttackTime()
effective_duration = es.EffectiveDuration()
auto_correlation = es.AutoCorrelation()
zero_crossing_rate = es.ZeroCrossingRate()

# Spectral descriptors
peak_freq = es.MaxMagFreq()
roll_off = es.RollOff()
flux = es.Flux()
flatness = es.Flatness()

# Harmonic descriptors
pitch = es.PitchYin(frameSize=1024)
spectral_peaks = es.SpectralPeaks(minFrequency=1e-5)
harmonic_peaks = es.HarmonicPeaks()
inharmonicity = es.Inharmonicity()
oer = es.OddToEvenHarmonicEnergyRatio()
tristimulus = es.Tristimulus()

# MFCC
mfcc = es.MFCC(inputSize=513)


class Audio:
    def __init__(self, path):
        self.audio = es.MonoLoader(filename=str(path))()
        self.name = path.name
        self.pool = essentia.Pool()
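# A hedged sketch (not from the original source) showing how the algorithm
# instances above could be applied: per-frame descriptors go into a pool,
# whole-signal descriptors are computed once. The function name
# extract_features and the frame/hop sizes are assumptions for illustration.
import essentia
import essentia.standard as es


def extract_features(audio, frame_size=1024, hop_size=512):
    pool = essentia.Pool()
    window = es.Windowing(type='hann')
    spectrum = es.Spectrum()
    for frame in es.FrameGenerator(audio, frameSize=frame_size,
                                   hopSize=hop_size, startFromZero=True):
        # temporal, per frame
        pool.add('temporal.power', power(frame))
        pool.add('temporal.zcr', zero_crossing_rate(frame))
        # spectral, on the 513-bin magnitude spectrum of a 1024-sample frame
        spec = spectrum(window(frame))
        pool.add('spectral.peak_freq', peak_freq(spec))
        pool.add('spectral.roll_off', roll_off(spec))
        pool.add('spectral.flux', flux(spec))
        pool.add('spectral.flatness', flatness(spec))
        # harmonic, guarded: HarmonicPeaks needs a voiced f0 and some peaks
        f0, _ = pitch(frame)
        freqs, mags = spectral_peaks(spec)
        if f0 > 0 and len(freqs) > 0:
            h_freqs, h_mags = harmonic_peaks(freqs, mags, f0)
            pool.add('harmonic.inharmonicity', inharmonicity(h_freqs, h_mags))
            pool.add('harmonic.oer', oer(h_freqs, h_mags))
            pool.add('harmonic.tristimulus', tristimulus(h_freqs, h_mags))
        # MFCC (inputSize=513 matches the spectrum size)
        _, mfcc_coeffs = mfcc(spec)
        pool.add('mfcc', mfcc_coeffs)
    # whole-signal descriptor, computed once
    pool.set('temporal.effective_duration', effective_duration(audio))
    return pool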
import os

import numpy as np
from scipy.signal import medfilt
import essentia.standard as ES

# NOTE: InterpolateSilence is assumed to be defined elsewhere in this module.


def computeLoudness(audioFile, outputExt='.loudness', f0=-1, HopSize=0.01,
                    FrameSize=0.04643990929, BinResolution=10, GuessUnvoiced=True,
                    VoicingTolerance=0.2, MaxFrequency=20000, interpolateLoudness=0,
                    maxSilDurIntp=0.25, smoothLoudness=0):
    """
    Compute the loudness (represented by energy) of the predominant source.
    If a pitch sequence for the predominant melodic source is provided it is
    used directly; if f0=-1, Essentia's Melodia is used to estimate the
    predominant pitch, and harmonic detection is then used to compute the
    energy (treated as loudness).

    Any sudden gap in the harmonic magnitudes (undetected harmonics) spanning
    a continuous duration < maxSilDurIntp is interpolated. Set this value to
    exactly what was used when interpolating the pitch sequence, to account
    for short intra-pattern pauses.
    """
    # read the audio file
    fs = 44100.0  # ES.AudioLoader(filename=audioFile)()[1]
    audio = ES.MonoLoader(filename=audioFile, sampleRate=fs)()

    # obtain just the file name, splitting off the extension
    fname, ext = os.path.splitext(audioFile)

    frameNSamples = np.round(FrameSize * fs).astype(int)
    frameNSamples = frameNSamples + np.mod(frameNSamples, 2)  # force an even frame size

    # handle the possible types of the input parameter f0
    if type(f0) == int:
        # an integer (i.e. the user provided no input and it is -1): run
        # predominant melody estimation to obtain the pitch estimate
        pitch = ES.PredominantPitchMelodia(hopSize=np.round(HopSize * fs).astype(int),
                                           frameSize=frameNSamples,
                                           binResolution=BinResolution,
                                           guessUnvoiced=GuessUnvoiced,
                                           voicingTolerance=VoicingTolerance,
                                           maxFrequency=MaxFrequency,
                                           minFrequency=60)(audio)[0]
    if type(f0) == str:
        # a string: the name of a pitch file stored as <time stamp><pitch value>
        pitch = np.loadtxt(f0)[:, 1]
    if type(f0) == np.ndarray:
        # an ndarray: the pitch sequence to be used for loudness computation
        pitch = f0

    # algorithm objects used for harmonic detection in each audio frame
    NFFT = (2 ** np.ceil(np.log2(frameNSamples) + 1)).astype(int)
    WIN = ES.Windowing()
    SPECTRUM = ES.Spectrum()
    EQUALLOUD = ES.EqualLoudness()
    SPECPEAKS = ES.SpectralPeaks(sampleRate=fs, maxFrequency=8000)
    HARMDET = ES.HarmonicPeaks(maxHarmonics=30)

    audio_in = EQUALLOUD(audio)

    cnt = 0
    harmWghts = []
    for frame in ES.FrameGenerator(audio_in, frameSize=frameNSamples,
                                   hopSize=np.round(HopSize * fs).astype(int)):
        if cnt >= len(pitch):
            break
        spec = SPECTRUM(WIN(frame))
        peaks = SPECPEAKS(spec)
        # sometimes the first frequency peak corresponds to 0 Hz (DC offset);
        # correct for that before harmonic detection
        p_freq = peaks[0]
        p_mags = peaks[1]
        if len(p_freq) > 0 and p_freq[0] == 0:
            p_freq = p_freq[1:]
            p_mags = p_mags[1:]
        wghtsLocal = HARMDET(p_freq, p_mags, pitch[cnt])[1]
        harmWghts.append(wghtsLocal)
        cnt += 1

    if interpolateLoudness == 1:
        # interpolate the harmonic weights
        harmWghts = np.array(harmWghts)
        harmWghtsIntrp = np.zeros(harmWghts.shape)
        for ii in range(harmWghts.shape[1]):
            harmWghts_temp = InterpolateSilence(harmWghts[:, ii], 0, HopSize, maxSilDurIntp)
            harmWghtsIntrp[:, ii] = harmWghts_temp
    else:
        harmWghtsIntrp = harmWghts

    loudness = []
    for wghtsLocal in harmWghtsIntrp:
        indValid = np.where(wghtsLocal > 0)[0]
        loudness.append(np.sqrt(np.sum(np.power(wghtsLocal[indValid], 2))))

    if interpolateLoudness == 1:
        loudness = InterpolateSilence(loudness, 0, HopSize, maxSilDurIntp)
    if smoothLoudness == 1:
        loudness = medfilt(loudness, np.round(50.0 / (HopSize * 1000)).astype(int))

    # generate time stamps (the sequence is equally hopped)
    TStamps = np.array(range(0, len(loudness))) * float(HopSize)

    dump = np.array([TStamps, loudness]).transpose()
    np.savetxt(fname + outputExt, dump, delimiter="\t")
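# A hedged usage sketch for computeLoudness: the file names below are
# placeholders. With f0=-1 (the default) the predominant pitch is estimated
# internally with Melodia; alternatively a pitch file or array can be passed.
computeLoudness('example.wav')                        # estimate pitch internally
# computeLoudness('example.wav', f0='example.pitch')  # reuse a stored pitch track
# The result is written next to the input as 'example.loudness',
# one "<time stamp>\t<loudness>" pair per line.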
def chroma_hpcp(self, frameSize=4096, hopSize=2048, windowType='blackmanharris62',
                harmonicsPerPeak=8, magnitudeThreshold=1e-05, maxPeaks=1000,
                whitening=True, referenceFrequency=440, minFrequency=40,
                maxFrequency=5000, nonLinear=False, numBins=12, display=False):
    '''
    Compute Harmonic Pitch Class Profiles (HPCP) for the input audio files using
    essentia standard mode, with the default parameters as mentioned in [1].
    Please refer to the following paper for a detailed explanation of the algorithm.

    [1] Gómez, E. (2006). Tonal Description of Polyphonic Audio for Music Content Processing.

    For the full list of parameters of essentia standard mode HPCP please refer to
    http://essentia.upf.edu/documentation/reference/std_HPCP.html

    Parameters
        harmonicsPerPeak : (integer ∈ [0, ∞), default = 8) :
            number of harmonics for frequency contribution; 0 indicates exclusive
            fundamental frequency contribution
        maxFrequency : (real ∈ (0, ∞), default = 5000) :
            the maximum frequency that contributes to the HPCP [Hz] (the difference
            between the max and split frequencies must not be less than 200.0 Hz)
        minFrequency : (real ∈ (0, ∞), default = 40) :
            the minimum frequency that contributes to the HPCP [Hz] (the difference
            between the min and split frequencies must not be less than 200.0 Hz)
        nonLinear : (bool ∈ {true, false}, default = false) :
            apply non-linear post-processing to the output (use with
            normalized='unitMax'); boosts values close to 1, decreases values close to 0
        normalized : (string ∈ {none, unitSum, unitMax}, default = unitMax) :
            whether to normalize the HPCP vector
        referenceFrequency : (real ∈ (0, ∞), default = 440) :
            the reference frequency for semitone index calculation, corresponding to A3 [Hz]
        sampleRate : (real ∈ (0, ∞), default = 44100) :
            the sampling rate of the audio signal [Hz]
        numBins : (integer ∈ [12, ∞), default = 12) :
            the size of the output HPCP (must be a positive nonzero multiple of 12)
        whitening : (boolean (True, False), default = True) :
            optional step of applying spectral whitening to the spectral peak magnitudes
    '''
    audio = array(self.audio_vector)

    frameGenerator = estd.FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize)
    window = estd.Windowing(type=windowType)
    spectrum = estd.Spectrum()
    # Refer http://essentia.upf.edu/documentation/reference/std_SpectralPeaks.html
    spectralPeaks = estd.SpectralPeaks(magnitudeThreshold=magnitudeThreshold,  # was hard-coded to 0, silently ignoring the parameter
                                       maxFrequency=maxFrequency,
                                       minFrequency=minFrequency,
                                       maxPeaks=maxPeaks,
                                       orderBy="frequency",
                                       sampleRate=self.fs)
    # http://essentia.upf.edu/documentation/reference/std_SpectralWhitening.html
    spectralWhitening = estd.SpectralWhitening(maxFrequency=maxFrequency,
                                               sampleRate=self.fs)
    # http://essentia.upf.edu/documentation/reference/std_HPCP.html
    hpcp = estd.HPCP(sampleRate=self.fs,
                     maxFrequency=maxFrequency,
                     minFrequency=minFrequency,
                     referenceFrequency=referenceFrequency,
                     nonLinear=nonLinear,
                     harmonics=harmonicsPerPeak,
                     size=numBins)
    pool = Pool()

    # compute hpcp for each frame and add the results to the pool
    for frame in frameGenerator:
        spectrum_mag = spectrum(window(frame))
        frequencies, magnitudes = spectralPeaks(spectrum_mag)
        if whitening:
            w_magnitudes = spectralWhitening(spectrum_mag, frequencies, magnitudes)
            hpcp_vector = hpcp(frequencies, w_magnitudes)
        else:
            hpcp_vector = hpcp(frequencies, magnitudes)
        pool.add('tonal.hpcp', hpcp_vector)

    if display:
        display_chroma(np.swapaxes(pool['tonal.hpcp'], 0, 1))  # the axes belong inside swapaxes
    return pool['tonal.hpcp']
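# A minimal usage sketch (hedged): `extractor` stands for an instance of the
# surrounding class, which is assumed to expose audio_vector and fs. Averaging
# the frame-wise HPCPs gives a single global chroma vector, as in the other
# snippets in this collection.
hpcps = extractor.chroma_hpcp(frameSize=4096, hopSize=2048, whitening=True)
global_chroma = np.mean(hpcps, axis=0)          # hpcps has shape (n_frames, numBins)
global_chroma = global_chroma / global_chroma.max()  # normalize to 1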
import essentia.standard as ess
# matplotlib without any blocking GUI
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt
import numpy as np

M = 1024
N = 1024
H = 512
fs = 44100

spectrum = ess.Spectrum(size=N)
window = ess.Windowing(size=M, type='hann')
spectralPeaks = ess.SpectralPeaks()
hpcp = ess.HPCP()

x = ess.MonoLoader(filename='../../../sounds/cello-double.wav', sampleRate=fs)()

hpcps = []
for frame in ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True):
    mX = spectrum(window(frame))
    spectralPeaks_freqs, spectralPeaks_mags = spectralPeaks(mX)
    hpcp_vals = hpcp(spectralPeaks_freqs, spectralPeaks_mags)
    hpcps.append(hpcp_vals)
hpcps = np.array(hpcps)

plt.figure(1, figsize=(9.5, 7))
plt.subplot(2, 1, 1)
plt.plot(np.arange(x.size) / float(fs), x, 'b')

# The snippet was truncated here; a plausible completion plots the HPCP
# matrix in the second panel and writes the figure to disk (Agg backend).
plt.subplot(2, 1, 2)
plt.imshow(hpcps.T, aspect='auto', origin='lower', interpolation='nearest',
           extent=[0, x.size / float(fs), 0, hpcps.shape[1]])
plt.xlabel('time (sec)')
plt.ylabel('HPCP bin')
plt.tight_layout()
plt.savefig('hpcp.png')
import essentia
import essentia.standard as ess
from pylab import *
from numpy import *
from smst.models import stft

filename = '../../../sounds/carnatic.wav'

hopSize = 128
frameSize = 2048
sampleRate = 44100
guessUnvoiced = True

run_windowing = ess.Windowing(type='hann', zeroPadding=3 * frameSize)  # Hann window, zero-padded to 4x the frame size
run_spectrum = ess.Spectrum(size=frameSize * 4)
run_spectral_peaks = ess.SpectralPeaks(minFrequency=50,
                                       maxFrequency=10000,
                                       maxPeaks=100,
                                       sampleRate=sampleRate,
                                       magnitudeThreshold=0,
                                       orderBy="magnitude")
run_pitch_salience_function = ess.PitchSalienceFunction(magnitudeThreshold=60)
run_pitch_salience_function_peaks = ess.PitchSalienceFunctionPeaks(minFrequency=90, maxFrequency=800)
run_pitch_contours = ess.PitchContours(hopSize=hopSize, peakFrameThreshold=0.7)
run_pitch_contours_melody = ess.PitchContoursMelody(guessUnvoiced=guessUnvoiced, hopSize=hopSize)

pool = essentia.Pool()

audio = ess.MonoLoader(filename=filename)()
audio = ess.EqualLoudness()(audio)

# The original loop body was truncated after the windowing step; the rest
# follows the standard Essentia melody-extraction chain set up above.
for frame in ess.FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize):
    frame = run_windowing(frame)
    spectrum = run_spectrum(frame)
    peak_frequencies, peak_magnitudes = run_spectral_peaks(spectrum)
    salience = run_pitch_salience_function(peak_frequencies, peak_magnitudes)
    salience_peaks_bins, salience_peaks_saliences = run_pitch_salience_function_peaks(salience)
    pool.add('allframes_salience_peaks_bins', salience_peaks_bins)
    pool.add('allframes_salience_peaks_saliences', salience_peaks_saliences)

# aggregate the per-frame salience peaks into pitch contours, then pick the melody
contours_bins, contours_saliences, contours_start_times, duration = run_pitch_contours(
    [f.tolist() for f in pool['allframes_salience_peaks_bins']],
    [f.tolist() for f in pool['allframes_salience_peaks_saliences']])
pitch, confidence = run_pitch_contours_melody(contours_bins,
                                              contours_saliences,
                                              contours_start_times,
                                              duration)
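# Hedged follow-up (not in the original snippet): attach equally spaced time
# stamps to the extracted melody, hopSize/sampleRate seconds apart, and save
# it in the same <time stamp>\t<pitch value> format used elsewhere in this
# collection. The output file name is a placeholder.
times = arange(len(pitch)) * float(hopSize) / sampleRate
savetxt('carnatic-melody.txt', array([times, pitch]).T, delimiter='\t')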
def __init__(self, bassline_filename, drum_filename, grid, frame_size, hop_size,
             fft_size, sample_rate, xlim, beats=[], onsets=[], bassline_onsets=[],
             midi_tracks=[], drum_analysisResults=[], parent=None,
             group_title="TRANSCRIBER",
             prefix_text="",  # was prefix_text=[], which breaks prefix_text + ".txt" below
             PYIN_midi=[], YIN_times=[]):
    self._width = 12
    self._height = 7
    self._dpi = 100
    self.drum_analysisResults = drum_analysisResults  # restored: tested below, but was commented out
    self.grid = grid

    # Create QGroupBox and set the parent canvas (if any)
    QtWidgets.QGroupBox.__init__(self, group_title)
    self.setParent(parent)

    self.tabs_widget = QtWidgets.QTabWidget(self)
    if bassline_filename:
        self.bassline_widget = QtWidgets.QWidget(self)
        self.chroma_widget = QtWidgets.QWidget(self)
        self.tabs_widget.addTab(self.bassline_widget, "Spectrogram")
        self.tabs_widget.addTab(self.chroma_widget, "Chroma")
        self.saveFileName = os.path.join(os.path.dirname(bassline_filename),
                                         prefix_text + ".txt")
    if drum_filename:
        self.drum_spectrogram_widget = QtWidgets.QWidget(self)
        self.drum_widget = QtWidgets.QWidget(self)
        self.tabs_widget.addTab(self.drum_spectrogram_widget, "drums spectrogram")
        self.tabs_widget.addTab(self.drum_widget,
                                "drums transcription (onsets in frequency bands)")
        self.saveFileName = os.path.join(os.path.dirname(drum_filename),
                                         prefix_text + ".txt")

    self.main_layout = QtWidgets.QGridLayout(self)
    self.main_layout.setAlignment(Qt.AlignCenter)
    self.resize(1200, 800)
    self.main_layout.addWidget(self.tabs_widget, 0, 0)
    self.main_layout.setColumnStretch(0, 1)
    self.main_layout.setRowStretch(0, 1)

    # ------ --------- --------- Drum Spectrogram ------ STARTS HERE
    if drum_filename:
        options = {
            "filename": drum_filename,
            "fft_size": fft_size,
            "frame_size": frame_size,
            "hop_size": hop_size,
            "sample_rate": sample_rate,
            "xlim": xlim,
            "ylim": (20, 500),
            "width": self._width,
            "height": self._height,
            "dpi": self._dpi,
            "y_isHz": False,
            "playable": False,
            "saveFilename": self.saveFileName
        }
        self.DrumSpectrogramCanvas = SpectrogramCanvas(
            parent=self.drum_spectrogram_widget, **options)
        # Draw Beats
        for beat in beats:
            self.DrumSpectrogramCanvas.get_stft_ax().axvline(x=beat, ymin=0, ymax=1000, color='g')
        # Draw Onsets
        for onset in onsets:
            self.DrumSpectrogramCanvas.get_stft_ax().scatter(onset, 50, c='red', marker='o')
        # ------ --------- --------- Drum Spectrogram ------ ENDS HERE

        # ------ --------- --------- Drum Transcription ------ STARTS HERE
        self.drum_onset_dots = []
        self.DrumCanvas = DrumCanvas(parent=self.drum_widget, **options)
        # connect save button
        self.key_pressed_cid = self.DrumCanvas.get_fig().canvas.mpl_connect(
            'key_press_event', self.on_drum_key_press)
        if self.drum_analysisResults:
            self.draw_drum_results(self.DrumCanvas.get_fig(), self.DrumCanvas.get_ax())
        for grid_line in self.grid:
            self.DrumCanvas.get_ax().axvline(x=grid_line, ymin=0, ymax=1000, color='b')
        for beat in beats:
            self.DrumCanvas.get_ax().axvline(x=beat, ymin=0, ymax=1000, color='g')
        for onset in onsets:
            self.DrumCanvas.get_ax().scatter(onset, 0.5, c='red', marker='o')
    self.bassline_filename = None
    # ------ --------- --------- Drum Transcription ------ ENDS HERE

    # ------ --------- --------- Bassline Transcription ------ STARTS HERE
    if bassline_filename:
        options = {
            "filename": bassline_filename,
            "fft_size": fft_size,
            "frame_size": frame_size,
            "hop_size": hop_size,
            "sample_rate": sample_rate,
            "xlim": xlim,
            "ylim": (20, 500),
            "width": self._width,
            "height": self._height,
            "dpi": self._dpi,
            "y_isHz": False,
            "playable": False,
            "saveFilename": self.saveFileName
        }
        self.BasslineCanvas = SpectrogramCanvas(parent=self.bassline_widget, **options)
        self.chromaCanvas = ChromaCanvas(parent=self.chroma_widget, **options)

        # Load audio for the chroma calculations
        loader = es.MonoLoader(filename=bassline_filename, sampleRate=sample_rate)
        self.audio = loader()
        xvals = np.arange(len(self.audio)) / float(sample_rate)
        xlim = [0, max(xvals) + .25]
        self.chromaCanvas.get_ax().set_xlim(xlim)

        # Calculate Chromagram
        self.chromagram = []
        hpcp = es.HPCP(
            size=12,  # 12 pitch classes (key estimation would need a higher resolution)
            referenceFrequency=440,  # assume the tuning frequency is 440 Hz
            bandPreset=False,
            weightType='cosine',
            nonLinear=False,
            windowSize=1.,
            sampleRate=sample_rate)
        spectrum = es.Spectrum(size=fft_size)
        spectral_peaks = es.SpectralPeaks(sampleRate=sample_rate)
        for frame in es.FrameGenerator(self.audio, frameSize=8192,
                                       hopSize=hop_size, startFromZero=True):
            frame = array(frame * get_window("hann", 8192))  # window length matches the 8192-sample frames
            freqs, mags = spectral_peaks(spectrum(frame))
            chroma = hpcp(freqs, mags)
            self.chromagram.append(chroma)
        self.chromagram = array(self.chromagram)
        self.timeAxSec = np.arange(len(self.chromagram)) * hop_size / float(sample_rate)

        # plot chromagram
        pitchClasses = ["A", "A#", "B", "C", "C#", "D", "D#", "E", "F", "F#", "G", "G#"]
        self.chromaCanvas.get_ax().cla()
        self.chromaCanvas.get_ax().set_xlim(xlim)
        self.chromaCanvas.get_ax().set_ylim(-1, 13)
        y_ax = np.arange(13)
        self.chromaCanvas.get_ax().set_yticks(y_ax[:12] + .5)
        self.chromaCanvas.get_ax().set_yticklabels(pitchClasses)
        self.chromaCanvas.get_ax().pcolormesh(self.timeAxSec, y_ax, self.chromagram.T)
        self.chromaCanvas.get_ax().set_ylabel("Pitch Class")
        self.chromaCanvas.get_fig().canvas.draw()

        self.InteractiveCanvas = MidiCanvas(
            parent=self.bassline_widget,
            ax=self.BasslineCanvas.get_stft_ax(),
            fig=self.BasslineCanvas.get_stft_fig(),
            horizontal_snap_grid=grid,
            snapVerticallyFlag=True,
            snap_offset_flag=True,
            doubleClickColor="y",
            xlim=xlim,
            ylim=(20, 500),
            width=self._width,
            height=self._height,
            dpi=self._dpi,
            x_sensitivity=.02,
            y_sensitivity=5,
            standalone=False,
            y_isHz=False,
            midi_tracks=midi_tracks,
            filename=bassline_filename,
            ax_chroma=self.chromaCanvas.get_ax(),
            fig_chroma=self.chromaCanvas.get_fig(),
            saveFileName=self.saveFileName)

        # Draw Beats
        for beat in beats:
            self.BasslineCanvas.get_stft_ax().axvline(x=beat, ymin=0, ymax=1000, color='g')
            self.chromaCanvas.get_ax().axvline(x=beat, ymin=0, ymax=1000, color='g')
        # Draw Onsets
        for onset in bassline_onsets:
            self.BasslineCanvas.get_stft_ax().scatter(onset, 50, c='red', marker='o')

        if PYIN_midi != []:
            self.BasslineCanvas.get_stft_ax().plot(YIN_times, PYIN_midi)

        self.BasslineCanvas.get_stft_ax().set_title("Green: Beats, Red: Onsets, Blue: Grid")
        self.chromaCanvas.get_ax().set_title("Green: Beats")

        # show canvases
        self.BasslineCanvas.get_stft_fig().canvas.show()
        self.chromaCanvas.get_fig().canvas.show()
    # ------ --------- --------- Bassline Transcription ------ ENDS HERE

    self.show()