def hpcpgram(audio, sampleRate=44100, frameSize=4096, hopSize=2048, numBins=12, windowType='blackmanharris62', minFrequency=100, maxFrequency=4000, whitening=False, maxPeaks=100, magnitudeThreshold=1e-05, **kwargs):
    """Compute a Harmonic Pitch Class Profile (HPCP) gram over overlapping frames.

    For the full parameter list of the essentia standard-mode HPCP algorithm see
    http://essentia.upf.edu/documentation/reference/std_HPCP.html

    References:
        [1] Gómez, E. (2006). Tonal Description of Polyphonic Audio for
        Music Content Processing.

    Parameters
    ----------
    audio : array-like
        Input audio signal.
    sampleRate : real, default 44100
        Sampling rate of the audio signal [Hz].
    frameSize : int, default 4096
        Analysis frame size.
    hopSize : int, default 2048
        Hop size between consecutive frames.
    numBins : int, default 12
        Size of the output HPCP (positive nonzero multiple of 12).
    windowType : str, default 'blackmanharris62'
        Window applied before the FFT.
    minFrequency, maxFrequency : real
        Frequency range [Hz] used by both SpectralPeaks and HPCP.
    whitening : bool, default False
        If True, apply spectral whitening to the peak magnitudes before HPCP.
    maxPeaks : int, default 100
        Maximum number of spectral peaks per frame.
    magnitudeThreshold : real, default 1e-05
        Peaks below this magnitude are discarded.
    **kwargs
        Extra keyword arguments forwarded to the HPCP algorithm.

    Returns
    -------
    2D vector: one HPCP row per analysed frame.
    """
    frames = es.FrameGenerator(array(audio), frameSize=frameSize, hopSize=hopSize)
    win = es.Windowing(type=windowType)
    spec = es.Spectrum()
    # http://essentia.upf.edu/documentation/reference/std_SpectralPeaks.html
    peaks = es.SpectralPeaks(magnitudeThreshold=magnitudeThreshold,
                             maxFrequency=maxFrequency,
                             minFrequency=minFrequency,
                             maxPeaks=maxPeaks,
                             sampleRate=sampleRate)
    # http://essentia.upf.edu/documentation/reference/std_SpectralWhitening.html
    whiten = es.SpectralWhitening(maxFrequency=maxFrequency,
                                  sampleRate=sampleRate)
    # http://essentia.upf.edu/documentation/reference/std_HPCP.html
    compute_hpcp = es.HPCP(sampleRate=sampleRate,
                           maxFrequency=maxFrequency,
                           minFrequency=minFrequency,
                           size=numBins,
                           **kwargs)
    pool = Pool()
    # Accumulate one HPCP vector per frame into the pool.
    for frame in frames:
        mag_spectrum = spec(win(frame))
        freqs, mags = peaks(mag_spectrum)
        if whitening:
            mags = whiten(mag_spectrum, freqs, mags)
        pool.add('tonal.hpcp', compute_hpcp(freqs, mags))
    return pool['tonal.hpcp']
analysis_files.remove('.DS_Store') print len(analysis_files), '\nsongs in folder.\n' groundtruth_files = os.listdir(groundtruth_folder) if '.DS_Store' in groundtruth_files: groundtruth_files.remove('.DS_Store') # ANALYSIS # ======== song_chromas = [] for item in analysis_files: loader = estd.MonoLoader(filename=audio_folder + '/' + item, sampleRate=sample_rate) cut = estd.FrameCutter(frameSize=window_size, hopSize=hop_size) window = estd.Windowing(size=window_size, type=window_type) rfft = estd.Spectrum(size=window_size) sw = estd.SpectralWhitening(maxFrequency=max_frequency, sampleRate=sample_rate) speaks = estd.SpectralPeaks(magnitudeThreshold=magnitude_threshold, maxFrequency=max_frequency, minFrequency=min_frequency, maxPeaks=max_peaks, sampleRate=sample_rate) hpcp = estd.HPCP(bandPreset=band_preset, harmonics=harmonics, maxFrequency=max_frequency, minFrequency=min_frequency, nonLinear=non_linear, normalized=normalize, referenceFrequency=reference_frequency, sampleRate=sample_rate, size=hpcp_size, splitFrequency=split_frequency,
def chroma_hpcp(self, frameSize=4096, hopSize=2048, windowType='blackmanharris62', harmonicsPerPeak=8, magnitudeThreshold=1e-05, maxPeaks=1000, whitening=True, referenceFrequency=440, minFrequency=40, maxFrequency=5000, nonLinear=False, numBins=12, display=False):
    '''Compute Harmonic Pitch Class Profiles (HPCP) for the input audio
    using essentia standard mode with the default parameters of [1].

    [1] Gómez, E. (2006). Tonal Description of Polyphonic Audio for Music
    Content Processing.

    For the full parameter list of essentia standard-mode HPCP see
    http://essentia.upf.edu/documentation/reference/std_HPCP.html

    Parameters
    ----------
    harmonicsPerPeak : int, default 8
        Number of harmonics for frequency contribution (0 = fundamental only).
    magnitudeThreshold : real, default 1e-05
        Spectral peaks below this magnitude are discarded.
    maxFrequency / minFrequency : real
        Frequency range contributing to the HPCP [Hz].
    nonLinear : bool, default False
        Apply non-linear post-processing (use with normalized='unitMax').
    referenceFrequency : real, default 440
        Reference frequency for semitone index calculation [Hz].
    numBins : int, default 12
        Size of the output HPCP (positive nonzero multiple of 12).
    whitening : bool, default True
        Apply spectral whitening to the peak magnitudes before HPCP.
    display : bool, default False
        If True, plot the resulting chromagram.

    Returns
    -------
    2D vector: one HPCP row per analysed frame.
    '''
    audio = array(self.audio_vector)
    frameGenerator = estd.FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize)
    window = estd.Windowing(type=windowType)
    spectrum = estd.Spectrum()
    # Refer http://essentia.upf.edu/documentation/reference/std_SpectralPeaks.html
    # FIX: honour the magnitudeThreshold parameter (was hard-coded to 0,
    # silently ignoring the caller's setting).
    spectralPeaks = estd.SpectralPeaks(magnitudeThreshold=magnitudeThreshold,
                                       maxFrequency=maxFrequency,
                                       minFrequency=minFrequency,
                                       maxPeaks=maxPeaks,
                                       orderBy="frequency",
                                       sampleRate=self.fs)
    # http://essentia.upf.edu/documentation/reference/std_SpectralWhitening.html
    spectralWhitening = estd.SpectralWhitening(maxFrequency=maxFrequency,
                                               sampleRate=self.fs)
    # http://essentia.upf.edu/documentation/reference/std_HPCP.html
    hpcp = estd.HPCP(sampleRate=self.fs,
                     maxFrequency=maxFrequency,
                     minFrequency=minFrequency,
                     referenceFrequency=referenceFrequency,
                     nonLinear=nonLinear,
                     harmonics=harmonicsPerPeak,
                     size=numBins)
    pool = Pool()
    # Compute an HPCP vector for each frame and add the results to the pool.
    for frame in frameGenerator:
        spectrum_mag = spectrum(window(frame))
        frequencies, magnitudes = spectralPeaks(spectrum_mag)
        if whitening:
            w_magnitudes = spectralWhitening(spectrum_mag, frequencies, magnitudes)
            hpcp_vector = hpcp(frequencies, w_magnitudes)
        else:
            hpcp_vector = hpcp(frequencies, magnitudes)
        pool.add('tonal.hpcp', hpcp_vector)
    if display:
        # FIX: the swap axes belong inside the np.swapaxes call; the original
        # `np.swapaxes(pool['tonal.hpcp']), 0, 1` raised a TypeError.
        display_chroma(np.swapaxes(pool['tonal.hpcp'], 0, 1))
    return pool['tonal.hpcp']
def key_aes(input_audio_file, output_text_file, **kwargs):
    """Estimate the overall key of an audio track, optionally with extra
    modal information, and write the result to a text file.

    :type input_audio_file: str
    :type output_text_file: str
    :param kwargs: analysis settings; falls back to KEY_SETTINGS when empty.
    :returns: (key string, correlation value), or the string 'Silence' when
        no frame produced energy.
    """
    if not kwargs:
        kwargs = KEY_SETTINGS
    # Instantiate the essentia analysis chain.
    loader = estd.MonoLoader(filename=input_audio_file,
                             sampleRate=kwargs["SAMPLE_RATE"])
    cut = estd.FrameCutter(frameSize=kwargs["WINDOW_SIZE"],
                           hopSize=kwargs["HOP_SIZE"])
    window = estd.Windowing(size=kwargs["WINDOW_SIZE"],
                            type=kwargs["WINDOW_SHAPE"])
    rfft = estd.Spectrum(size=kwargs["WINDOW_SIZE"])
    sw = estd.SpectralWhitening(maxFrequency=kwargs["MAX_HZ"],
                                sampleRate=kwargs["SAMPLE_RATE"])
    speaks = estd.SpectralPeaks(
        magnitudeThreshold=kwargs["SPECTRAL_PEAKS_THRESHOLD"],
        maxFrequency=kwargs["MAX_HZ"],
        minFrequency=kwargs["MIN_HZ"],
        maxPeaks=kwargs["SPECTRAL_PEAKS_MAX"],
        sampleRate=kwargs["SAMPLE_RATE"])
    hpcp = estd.HPCP(bandPreset=kwargs["HPCP_BAND_PRESET"],
                     splitFrequency=kwargs["HPCP_SPLIT_HZ"],
                     harmonics=kwargs["HPCP_HARMONICS"],
                     maxFrequency=kwargs["MAX_HZ"],
                     minFrequency=kwargs["MIN_HZ"],
                     nonLinear=kwargs["HPCP_NON_LINEAR"],
                     normalized=kwargs["HPCP_NORMALIZE"],
                     referenceFrequency=kwargs["HPCP_REFERENCE_HZ"],
                     sampleRate=kwargs["SAMPLE_RATE"],
                     size=kwargs["HPCP_SIZE"],
                     weightType=kwargs["HPCP_WEIGHT_TYPE"],
                     windowSize=kwargs["HPCP_WEIGHT_WINDOW_SEMITONES"],
                     maxShifted=kwargs["HPCP_SHIFT"])
    audio = loader()
    if kwargs["HIGHPASS_CUTOFF"] is not None:
        hpf = estd.HighPass(cutoffFrequency=kwargs["HIGHPASS_CUTOFF"],
                            sampleRate=kwargs["SAMPLE_RATE"])
        # Triple pass for a steeper effective roll-off.
        audio = hpf(hpf(hpf(audio)))
    if kwargs["DURATION"] is not None:
        audio = audio[(kwargs["START_TIME"] * kwargs["SAMPLE_RATE"]):(kwargs["DURATION"] * kwargs["SAMPLE_RATE"])]
    duration = len(audio)
    number_of_frames = int(duration / kwargs["HOP_SIZE"])
    chroma = []
    for bang in range(number_of_frames):
        spek = rfft(window(cut(audio)))
        p1, p2 = speaks(spek)  # p1 = frequencies; p2 = magnitudes
        if kwargs["SPECTRAL_WHITENING"]:
            p2 = sw(spek, p1, p2)
        pcp = hpcp(p1, p2)
        # Only keep frames with some spectral energy.
        if np.sum(pcp) > 0:
            if not kwargs["DETUNING_CORRECTION"] or kwargs[
                    "DETUNING_CORRECTION_SCOPE"] == 'average':
                chroma.append(pcp)
            elif kwargs["DETUNING_CORRECTION"] and kwargs[
                    "DETUNING_CORRECTION_SCOPE"] == 'frame':
                pcp = _detuning_correction(pcp, kwargs["HPCP_SIZE"])
                chroma.append(pcp)
            else:
                # FIX: corrected "musts" typo in the error message.
                raise NameError(
                    "SHIFT_SCOPE must be set to 'frame' or 'average'.")
    if not chroma:
        return 'Silence'
    chroma = np.sum(chroma, axis=0)
    chroma = norm_peak(chroma)
    if kwargs["PCP_THRESHOLD"] is not None:
        chroma = vector_threshold(chroma, kwargs["PCP_THRESHOLD"])
    if kwargs["DETUNING_CORRECTION"] and kwargs[
            "DETUNING_CORRECTION_SCOPE"] == 'average':
        chroma = _detuning_correction(chroma, kwargs["HPCP_SIZE"])
    # Adjust to essentia's HPCP calculation starting on A (pc = 9).
    chroma = np.roll(chroma, -3 * (kwargs["HPCP_SIZE"] // 12))
    estimation_1 = estimate_key(chroma,
                                kwargs["KEY_PROFILE"],
                                kwargs["PROFILE_INTERPOLATION"],
                                conf_thres=kwargs["NOKEY_THRESHOLD"],
                                vocabulary=kwargs["KEY_VOCABULARY"])
    key_1 = estimation_1[0]
    correlation_value = estimation_1[1]
    if kwargs["WITH_MODAL_DETAILS"]:
        estimation_2 = _key7(chroma, kwargs["PROFILE_INTERPOLATION"])
        key_2 = estimation_2[0] + '\t' + estimation_2[1]
        key_verbose = key_1 + '\t' + key_2
        key = key_verbose.split('\t')
        # Assign monotonic track to minor:
        if key[3] == 'monotonic' and key[0] == key[2]:
            key = '{0}\tminor'.format(key[0])
        else:
            key = key_1
    else:
        key = key_1
    # FIX: use a context manager so the file handle is always released.
    with open(output_text_file, 'w') as textfile:
        textfile.write(key)
    return key, correlation_value
def estimate_key(input_audio_file, output_text_file=None, key_profile=None):
    """Estimate the overall key of an audio track, optionally with extra
    modal information.

    :type input_audio_file: str
    :type output_text_file: str
    :param key_profile: when given, overrides the module-level KEY_PROFILE
        and disables three-profile matching and modal details.
    :returns: (key string, correlation value)
    """
    if key_profile is not None:
        # NOTE: mutates module-level configuration for the rest of the run.
        global USE_THREE_PROFILES
        global WITH_MODAL_DETAILS
        global KEY_PROFILE
        KEY_PROFILE = key_profile
        USE_THREE_PROFILES = False
        WITH_MODAL_DETAILS = False
    loader = estd.MonoLoader(filename=input_audio_file,
                             sampleRate=SAMPLE_RATE)
    cut = estd.FrameCutter(frameSize=WINDOW_SIZE, hopSize=HOP_SIZE)
    window = estd.Windowing(size=WINDOW_SIZE, type=WINDOW_SHAPE)
    rfft = estd.Spectrum(size=WINDOW_SIZE)
    sw = estd.SpectralWhitening(maxFrequency=MAX_HZ, sampleRate=SAMPLE_RATE)
    speaks = estd.SpectralPeaks(magnitudeThreshold=SPECTRAL_PEAKS_THRESHOLD,
                                maxFrequency=MAX_HZ,
                                minFrequency=MIN_HZ,
                                maxPeaks=SPECTRAL_PEAKS_MAX,
                                sampleRate=SAMPLE_RATE)
    hpcp = estd.HPCP(
        bandPreset=HPCP_BAND_PRESET,
        #bandSplitFrequency=HPCP_SPLIT_HZ,
        harmonics=HPCP_HARMONICS,
        maxFrequency=MAX_HZ,
        minFrequency=MIN_HZ,
        nonLinear=HPCP_NON_LINEAR,
        normalized=HPCP_NORMALIZE,
        referenceFrequency=HPCP_REFERENCE_HZ,
        sampleRate=SAMPLE_RATE,
        size=HPCP_SIZE,
        weightType=HPCP_WEIGHT_TYPE,
        windowSize=HPCP_WEIGHT_WINDOW_SEMITONES,
        maxShifted=HPCP_SHIFT)
    if HIGHPASS_CUTOFF is not None:
        hpf = estd.HighPass(cutoffFrequency=HIGHPASS_CUTOFF,
                            sampleRate=SAMPLE_RATE)
        # Triple pass for a steeper effective roll-off.
        audio = hpf(hpf(hpf(loader())))
    else:
        audio = loader()
    duration = len(audio)
    n_slices = 1 + (duration // HOP_SIZE)
    chroma = np.empty([n_slices, HPCP_SIZE], dtype='float64')
    for slice_n in range(n_slices):
        spek = rfft(window(cut(audio)))
        p1, p2 = speaks(spek)
        if SPECTRAL_WHITENING:
            p2 = sw(spek, p1, p2)
        pcp = hpcp(p1, p2)
        if not DETUNING_CORRECTION or DETUNING_CORRECTION_SCOPE == 'average':
            chroma[slice_n] = pcp
        elif DETUNING_CORRECTION and DETUNING_CORRECTION_SCOPE == 'frame':
            pcp = shift_pcp(pcp, HPCP_SIZE)
            chroma[slice_n] = pcp
        else:
            raise NameError("SHIFT_SCOPE must be set to 'frame' or 'average'.")
    chroma = np.sum(chroma, axis=0)
    if PCP_THRESHOLD is not None:
        chroma = normalize_pcp_peak(chroma)
        chroma = pcp_gate(chroma, PCP_THRESHOLD)
    if DETUNING_CORRECTION and DETUNING_CORRECTION_SCOPE == 'average':
        chroma = shift_pcp(chroma, HPCP_SIZE)
    chroma = np.roll(
        chroma, -3)  # Adjust to essentia's HPCP calculation starting on A...
    if USE_THREE_PROFILES:
        estimation_1 = template_matching_3(chroma, KEY_PROFILE)
    else:
        estimation_1 = template_matching_2(chroma, KEY_PROFILE)
    key_1 = estimation_1[0] + '\t' + estimation_1[1]
    correlation_value = estimation_1[2]
    if WITH_MODAL_DETAILS:
        estimation_2 = template_matching_modal(chroma)
        key_2 = estimation_2[0] + '\t' + estimation_2[1]
        key_verbose = key_1 + '\t' + key_2
        key = key_verbose.split('\t')
        # Assign monotonic tracks to minor:
        if key[3] == 'monotonic' and key[0] == key[2]:
            key = '{0}\tminor'.format(key[0])
        else:
            key = key_1
    else:
        key = key_1
    if output_text_file is not None:
        # FIX: use a context manager so the file handle is always released.
        with open(output_text_file, 'w') as textfile:
            textfile.write(key + '\t' + str(correlation_value) + '\n')
    return key, correlation_value
def key_ecir(input_audio_file, output_text_file, **kwargs):
    """Estimate the overall key of an audio track with essentia's Key
    algorithm and write the result to a text file.

    :type input_audio_file: str
    :type output_text_file: str
    :param kwargs: analysis settings; falls back to KEY_SETTINGS when empty.
    :returns: (key string, (strength, first-to-second relative strength))
    """
    if not kwargs:
        kwargs = KEY_SETTINGS
    # Instantiate the essentia analysis chain.
    loader = estd.MonoLoader(filename=input_audio_file,
                             sampleRate=kwargs["SAMPLE_RATE"])
    cut = estd.FrameCutter(frameSize=kwargs["WINDOW_SIZE"],
                           hopSize=kwargs["HOP_SIZE"])
    window = estd.Windowing(size=kwargs["WINDOW_SIZE"],
                            type=kwargs["WINDOW_SHAPE"])
    rfft = estd.Spectrum(size=kwargs["WINDOW_SIZE"])
    sw = estd.SpectralWhitening(maxFrequency=kwargs["MAX_HZ"],
                                sampleRate=kwargs["SAMPLE_RATE"])
    speaks = estd.SpectralPeaks(
        magnitudeThreshold=kwargs["SPECTRAL_PEAKS_THRESHOLD"],
        maxFrequency=kwargs["MAX_HZ"],
        minFrequency=kwargs["MIN_HZ"],
        maxPeaks=kwargs["SPECTRAL_PEAKS_MAX"],
        sampleRate=kwargs["SAMPLE_RATE"])
    hpcp = estd.HPCP(bandPreset=kwargs["HPCP_BAND_PRESET"],
                     splitFrequency=kwargs["HPCP_SPLIT_HZ"],
                     harmonics=kwargs["HPCP_HARMONICS"],
                     maxFrequency=kwargs["MAX_HZ"],
                     minFrequency=kwargs["MIN_HZ"],
                     nonLinear=kwargs["HPCP_NON_LINEAR"],
                     normalized=kwargs["HPCP_NORMALIZE"],
                     referenceFrequency=kwargs["HPCP_REFERENCE_HZ"],
                     sampleRate=kwargs["SAMPLE_RATE"],
                     size=kwargs["HPCP_SIZE"],
                     weightType=kwargs["HPCP_WEIGHT_TYPE"],
                     windowSize=kwargs["HPCP_WEIGHT_WINDOW_SEMITONES"],
                     maxShifted=kwargs["HPCP_SHIFT"])
    key = estd.Key(numHarmonics=kwargs["KEY_HARMONICS"],
                   pcpSize=kwargs["HPCP_SIZE"],
                   profileType=kwargs["KEY_PROFILE"],
                   slope=kwargs["KEY_SLOPE"],
                   usePolyphony=kwargs["KEY_POLYPHONY"],
                   useThreeChords=kwargs["KEY_USE_THREE_CHORDS"])
    audio = loader()
    if kwargs["HIGHPASS_CUTOFF"] is not None:
        hpf = estd.HighPass(cutoffFrequency=kwargs["HIGHPASS_CUTOFF"],
                            sampleRate=kwargs["SAMPLE_RATE"])
        # Triple pass for a steeper effective roll-off.
        audio = hpf(hpf(hpf(audio)))
    if kwargs["DURATION"] is not None:
        audio = audio[(kwargs["START_TIME"] * kwargs["SAMPLE_RATE"]):(kwargs["DURATION"] * kwargs["SAMPLE_RATE"])]
    duration = len(audio)
    number_of_frames = int(duration / kwargs["HOP_SIZE"])
    chroma = []
    for bang in range(number_of_frames):
        spek = rfft(window(cut(audio)))
        p1, p2 = speaks(spek)  # p1 = frequencies; p2 = magnitudes
        if kwargs["SPECTRAL_WHITENING"]:
            p2 = sw(spek, p1, p2)
        vector = hpcp(p1, p2)
        sum_vector = np.sum(vector)
        # Only keep frames with some spectral energy.
        if sum_vector > 0:
            # FIX: idiomatic truthiness test instead of `== False`.
            if not kwargs["DETUNING_CORRECTION"] or kwargs[
                    "DETUNING_CORRECTION_SCOPE"] == 'average':
                chroma.append(vector)
            elif kwargs["DETUNING_CORRECTION"] and kwargs[
                    "DETUNING_CORRECTION_SCOPE"] == 'frame':
                vector = _detuning_correction(vector, kwargs["HPCP_SIZE"])
                chroma.append(vector)
            else:
                print("SHIFT_SCOPE must be set to 'frame' or 'average'")
    chroma = np.mean(chroma, axis=0)
    if kwargs["DETUNING_CORRECTION"] and kwargs[
            "DETUNING_CORRECTION_SCOPE"] == 'average':
        chroma = _detuning_correction(chroma, kwargs["HPCP_SIZE"])
    key = key(chroma.tolist())
    confidence = (key[2], key[3])
    key = key[0] + '\t' + key[1]
    # FIX: use a context manager so the file handle is always released.
    with open(output_text_file, 'w') as textfile:
        textfile.write(key + '\n')
    return key, confidence
def hpcp(self, frameSize=4096, windowType='blackmanharris62', harmonicsPerPeak=8, magnitudeThreshold=0, maxPeaks=100, whitening=True, referenceFrequency=440, minFrequency=100, maxFrequency=3500, nonLinear=False, numBins=12, display=False):
    """Compute Harmonic Pitch Class Profiles (HPCP) for the loaded audio
    using essentia standard mode with the default parameters of [1].

    [1] Gómez, E. (2006). Tonal Description of Polyphonic Audio for Music
    Content Processing.

    For the full parameter list of essentia standard-mode HPCP see
    http://essentia.upf.edu/documentation/reference/std_HPCP.html

    Returns
    hpcp: ndarray(n_frames, 12)
        The HPCP coefficients at each time frame
    """
    signal = array(self.audio_vector)
    frames = estd.FrameGenerator(signal, frameSize=frameSize,
                                 hopSize=self.hop_length)
    win = estd.Windowing(type=windowType)
    spec = estd.Spectrum()
    # Refer http://essentia.upf.edu/documentation/reference/streaming_SpectralPeaks.html
    find_peaks = estd.SpectralPeaks(magnitudeThreshold=magnitudeThreshold,
                                    maxFrequency=maxFrequency,
                                    minFrequency=minFrequency,
                                    maxPeaks=maxPeaks,
                                    orderBy="frequency",
                                    sampleRate=self.fs)
    # http://essentia.upf.edu/documentation/reference/streaming_SpectralWhitening.html
    whiten = estd.SpectralWhitening(maxFrequency=maxFrequency,
                                    sampleRate=self.fs)
    # http://essentia.upf.edu/documentation/reference/streaming_HPCP.html
    compute_hpcp = estd.HPCP(sampleRate=self.fs,
                             maxFrequency=maxFrequency,
                             minFrequency=minFrequency,
                             referenceFrequency=referenceFrequency,
                             nonLinear=nonLinear,
                             harmonics=harmonicsPerPeak,
                             size=numBins)
    pool = Pool()
    # Accumulate one HPCP vector per frame into the pool.
    for frame in frames:
        mag_spectrum = spec(win(frame))
        freqs, mags = find_peaks(mag_spectrum)
        if whitening:
            mags = whiten(mag_spectrum, freqs, mags)
        pool.add('tonal.hpcp', compute_hpcp(freqs, mags))
    if display:
        display_chroma(pool['tonal.hpcp'].T, self.hop_length)
    return pool['tonal.hpcp']
def get_spectral_info(frame):
    """Return the magnitude spectrum of a single frame together with its
    spectral peak frequencies and whitened peak magnitudes."""
    compute_spectrum = es.Spectrum(size=samples_per_frame)
    find_peaks = es.SpectralPeaks(**peak_params)
    whiten = es.SpectralWhitening()
    spectrum = compute_spectrum(frame)
    freqs, mags = find_peaks(spectrum)
    mags = whiten(spectrum, freqs, mags)
    return spectrum, freqs, mags
def extractFeatures(audio_data):
    """Compute spectral features of a real-valued audio signal and return
    them aggregated in an essentia Pool().

    :param audio_data: 1-D numpy float array containing the audio signal.
    :returns: essentia Pool() with spectrum, spectral peaks, pitch,
        harmonic peaks, inharmonicity, contrast and whitening features.
    """
    from numpy import ndarray
    assert (type(audio_data) is ndarray)
    assert ("float" in str(audio_data.dtype))
    # Initialise the output Pool().
    output_pool = es.Pool()
    # Compute the signal spectrum.
    output_pool.set(pk_spectrum, es_mode.Spectrum()(audio_data))
    # Compute EnergyBandRatio.
    energy_band_ratio = es_mode.EnergyBandRatio()(output_pool[pk_spectrum])
    output_pool.set(pk_energy_band_ratio, energy_band_ratio)
    # Compute MaxMagFreq.
    max_mag_freq = es_mode.MaxMagFreq()(output_pool[pk_spectrum])
    output_pool.set(pk_max_mag_freq, max_mag_freq)
    # Compute SpectralCentroidTime.
    spectral_centroid_time = es_mode.SpectralCentroidTime()(audio_data)
    output_pool.set(pk_spectral_centroid_time, spectral_centroid_time)
    # Compute SpectralComplexity.
    spectral_complexity = es_mode.SpectralComplexity()(
        output_pool[pk_spectrum])
    output_pool.set(pk_spectral_complexity, spectral_complexity)
    # Compute StrongPeak.
    strong_peak = es_mode.StrongPeak()(output_pool[pk_spectrum])
    output_pool.set(pk_strong_peak, strong_peak)
    # Compute SpectralPeaks.
    sp_freq, sp_mag = es_mode.SpectralPeaks()(output_pool[pk_spectrum])
    # Drop the DC component, if present, as requested by HarmonicPeaks.
    # FIX: guard against an empty peak list before indexing element 0.
    if len(sp_freq) > 0 and sp_freq[0] == 0:
        sp_freq = sp_freq[1:]
        sp_mag = sp_mag[1:]
    output_pool.set(pk_spectral_peaks_freq, sp_freq)
    output_pool.set(pk_spectral_peaks_mag, sp_mag)
    ######################################
    #        For Inharmonicity           #
    ######################################
    # Compute PitchYinFFT.
    pitch_yin_fft, pitch_prob_yin_fft = es_mode.PitchYinFFT()(
        output_pool[pk_spectrum])
    output_pool.set(pk_pitch, pitch_yin_fft)
    output_pool.set(pk_pitch_prob, pitch_prob_yin_fft)
    # Compute HarmonicPeaks.
    hp_freq, hp_mag = es_mode.HarmonicPeaks()(output_pool[pk_spectral_peaks_freq],
                                              output_pool[pk_spectral_peaks_mag],
                                              output_pool[pk_pitch])
    output_pool.set(pk_harmonic_peaks_freq, hp_freq)
    output_pool.set(pk_harmonic_peaks_mag, hp_mag)
    # Compute Inharmonicity.
    inharmonicity = es_mode.Inharmonicity()(output_pool[pk_harmonic_peaks_freq],
                                            output_pool[pk_harmonic_peaks_mag])
    output_pool.set(pk_inharmonicity, inharmonicity)
    # End of Inharmonicity ##################################
    # Compute SpectralContrast; frame size recovered from spectrum length.
    frame_size = 2 * (output_pool[pk_spectrum].size - 1)
    spectral_contrast, spectral_valley = \
        es_mode.SpectralContrast(frameSize=frame_size)(output_pool[pk_spectrum])
    output_pool.set(pk_spectral_contrast, spectral_contrast)
    output_pool.set(pk_spectral_valley, spectral_valley)
    # Compute SpectralWhitening.
    spectral_whitening = \
        es_mode.SpectralWhitening()(output_pool[pk_spectrum],
                                    output_pool[pk_spectral_peaks_freq],
                                    output_pool[pk_spectral_peaks_mag])
    output_pool.set(pk_spectral_whitening, spectral_whitening)
    return output_pool
def key_detector():
    """Batch key-estimation experiment: analyse every matching audio file,
    compare against ground truth (MIREX scoring), and optionally write
    per-song results, a CSV of estimations+PCP, a confusion matrix and a
    summary file. All settings are read from module-level globals.
    NOTE: Python 2 code (print statements, integer division semantics)."""
    reloj()  # start the wall-clock timer
    # create directory to write the results with an unique time id:
    if results_to_file or results_to_csv:
        uniqueTime = str(int(tiempo()))
        wd = os.getcwd()
        temp_folder = wd + '/KeyDetection_' + uniqueTime
        os.mkdir(temp_folder)
    if results_to_csv:
        import csv
        csvFile = open(temp_folder + '/Estimation_&_PCP.csv', 'w')
        lineWriter = csv.writer(csvFile, delimiter=',')
    # retrieve files and filenames according to the desired settings:
    if analysis_mode == 'title':
        # Ground truth is embedded in the filename; filter by the
        # collection / genre / modality markers expected in the name.
        allfiles = os.listdir(audio_folder)
        if '.DS_Store' in allfiles:
            allfiles.remove('.DS_Store')
        # NOTE: these loops rewrite the module-level filter lists in place,
        # wrapping each entry with the separator tokens used in filenames.
        for item in collection:
            collection[collection.index(item)] = ' > ' + item + '.'
        for item in genre:
            genre[genre.index(item)] = ' < ' + item + ' > '
        for item in modality:
            modality[modality.index(item)] = ' ' + item + ' < '
        analysis_files = []
        for item in allfiles:
            if any(e1 for e1 in collection if e1 in item):
                if any(e2 for e2 in genre if e2 in item):
                    if any(e3 for e3 in modality if e3 in item):
                        analysis_files.append(item)
        song_instances = len(analysis_files)
        print song_instances, 'songs matching the selected criteria:'
        print collection, genre, modality
        if limit_analysis == 0:
            pass  # 0 means "analyse everything"
        elif limit_analysis < song_instances:
            analysis_files = sample(analysis_files, limit_analysis)
            print "taking", limit_analysis, "random samples...\n"
    else:
        # Ground truth lives in separate .txt files in groundtruth_folder.
        analysis_files = os.listdir(audio_folder)
        if '.DS_Store' in analysis_files:
            analysis_files.remove('.DS_Store')
        print len(analysis_files), '\nsongs in folder.\n'
        groundtruth_files = os.listdir(groundtruth_folder)
        if '.DS_Store' in groundtruth_files:
            groundtruth_files.remove('.DS_Store')
    # ANALYSIS
    # ========
    if verbose:
        print "ANALYSING INDIVIDUAL SONGS..."
        print "============================="
    if confusion_matrix:
        matrix = 24 * 24 * [0]  # flat 24x24 confusion matrix
    mirex_scores = []
    for item in analysis_files:
        # INSTANTIATE ESSENTIA ALGORITHMS
        # ===============================
        loader = estd.MonoLoader(filename=audio_folder + '/' + item,
                                 sampleRate=sample_rate)
        cut = estd.FrameCutter(frameSize=window_size, hopSize=hop_size)
        window = estd.Windowing(size=window_size, type=window_type)
        rfft = estd.Spectrum(size=window_size)
        sw = estd.SpectralWhitening(maxFrequency=max_frequency,
                                    sampleRate=sample_rate)
        speaks = estd.SpectralPeaks(magnitudeThreshold=magnitude_threshold,
                                    maxFrequency=max_frequency,
                                    minFrequency=min_frequency,
                                    maxPeaks=max_peaks,
                                    sampleRate=sample_rate)
        hpcp = estd.HPCP(bandPreset=band_preset,
                         harmonics=harmonics,
                         maxFrequency=max_frequency,
                         minFrequency=min_frequency,
                         nonLinear=non_linear,
                         normalized=normalize,
                         referenceFrequency=reference_frequency,
                         sampleRate=sample_rate,
                         size=hpcp_size,
                         splitFrequency=split_frequency,
                         weightType=weight_type,
                         windowSize=weight_window_size)
        key = estd.Key(numHarmonics=num_harmonics,
                       pcpSize=hpcp_size,
                       profileType=profile_type,
                       slope=slope,
                       usePolyphony=use_polyphony,
                       useThreeChords=use_three_chords)
        # ACTUAL ANALYSIS
        # ===============
        audio = loader()
        duration = len(audio)
        # Optional trimming of the analysed segment:
        if skip_first_minute and duration > (sample_rate * 60):
            audio = audio[sample_rate * 60:]
            duration = len(audio)
        if first_n_secs > 0:
            if duration > (first_n_secs * sample_rate):
                audio = audio[:first_n_secs * sample_rate]
                duration = len(audio)
        if avoid_edges > 0:
            # avoid_edges is a percentage cut from each end.
            initial_sample = (avoid_edges * duration) / 100
            final_sample = duration - initial_sample
            audio = audio[initial_sample:final_sample]
            duration = len(audio)
        number_of_frames = duration / hop_size  # Py2 integer division
        chroma = []
        for bang in range(number_of_frames):
            spek = rfft(window(cut(audio)))
            p1, p2 = speaks(spek)  # p1 are frequencies; p2 magnitudes
            if spectral_whitening:
                p2 = sw(spek, p1, p2)
            vector = hpcp(p1, p2)
            sum_vector = np.sum(vector)
            # only frames with some spectral energy contribute:
            if sum_vector > 0:
                if shift_spectrum == False or shift_scope == 'average':
                    chroma.append(vector)
                elif shift_spectrum and shift_scope == 'frame':
                    vector = shift_vector(vector, hpcp_size)
                    chroma.append(vector)
                else:
                    print "shift_scope must be set to 'frame' or 'average'"
        chroma = np.mean(chroma, axis=0)
        if shift_spectrum and shift_scope == 'average':
            chroma = shift_vector(chroma, hpcp_size)
        estimation = key(chroma.tolist())
        result = estimation[0] + ' ' + estimation[1]
        confidence = estimation[2]
        if results_to_csv:
            chroma = list(chroma)
        # MIREX EVALUATION:
        # ================
        if analysis_mode == 'title':
            # Ground-truth key is embedded between ' = ' and ' < ' markers.
            ground_truth = item[item.find(' = ') + 3:item.rfind(' < ')]
            if verbose and confidence < confidence_threshold:
                print item[:item.rfind(' = ')]
                print 'G:', ground_truth, '|| P:',
            if results_to_csv:
                title = item[:item.rfind(' = ')]
                # assumes hpcp_size == 36 (chroma[0]..chroma[35]) — TODO confirm
                lineWriter.writerow([
                    title, ground_truth, chroma[0], chroma[1], chroma[2],
                    chroma[3], chroma[4], chroma[5], chroma[6], chroma[7],
                    chroma[8], chroma[9], chroma[10], chroma[11], chroma[12],
                    chroma[13], chroma[14], chroma[15], chroma[16],
                    chroma[17], chroma[18], chroma[19], chroma[20],
                    chroma[21], chroma[22], chroma[23], chroma[24],
                    chroma[25], chroma[26], chroma[27], chroma[28],
                    chroma[29], chroma[30], chroma[31], chroma[32],
                    chroma[33], chroma[34], chroma[35], result
                ])
            ground_truth = key_to_list(ground_truth)
            estimation = key_to_list(result)
            score = mirex_score(ground_truth, estimation)
            mirex_scores.append(score)
        else:
            # Look up the matching ground-truth .txt file by basename.
            filename_to_match = item[:item.rfind('.')] + '.txt'
            print filename_to_match
            if filename_to_match in groundtruth_files:
                groundtruth_file = open(
                    groundtruth_folder + '/' + filename_to_match, 'r')
                ground_truth = groundtruth_file.readline()
                if "\t" in ground_truth:
                    ground_truth = re.sub("\t", " ", ground_truth)
                if results_to_csv:
                    # assumes hpcp_size == 36 — TODO confirm
                    lineWriter.writerow([
                        filename_to_match, chroma[0], chroma[1], chroma[2],
                        chroma[3], chroma[4], chroma[5], chroma[6],
                        chroma[7], chroma[8], chroma[9], chroma[10],
                        chroma[11], chroma[12], chroma[13], chroma[14],
                        chroma[15], chroma[16], chroma[17], chroma[18],
                        chroma[19], chroma[20], chroma[21], chroma[22],
                        chroma[23], chroma[24], chroma[25], chroma[26],
                        chroma[27], chroma[28], chroma[29], chroma[30],
                        chroma[31], chroma[32], chroma[33], chroma[34],
                        chroma[35], result
                    ])
                ground_truth = key_to_list(ground_truth)
                estimation = key_to_list(result)
                score = mirex_score(ground_truth, estimation)
                mirex_scores.append(score)
            else:
                print "FILE NOT FOUND... Skipping it from evaluation.\n"
                continue
        # CONFUSION MATRIX:
        # ================
        if confusion_matrix:
            # Index into the flat 24x24 matrix from (tonic, mode) pairs.
            xpos = (ground_truth[0] +
                    (ground_truth[0] * 24)) + (-1 *
                                               (ground_truth[1] - 1) * 24 * 12)
            ypos = ((estimation[0] - ground_truth[0]) +
                    (-1 * (estimation[1] - 1) * 12))
            # NOTE(review): '= +matrix[...] + 1' is numerically the same as
            # '+= 1' here; the '=+' looks like a typo for '+='.
            matrix[(xpos + ypos)] = +matrix[(xpos + ypos)] + 1
        if verbose and confidence < confidence_threshold:
            print result, '(%.2f)' % confidence, '|| SCORE:', score, '\n'
        # WRITE RESULTS TO FILE:
        # =====================
        if results_to_file:
            with open(temp_folder + '/' + item[:-3] + 'txt', 'w') as textfile:
                textfile.write(result)
                textfile.close()  # redundant inside 'with', kept as-is
    if results_to_csv:
        csvFile.close()
    print len(mirex_scores), "files analysed in", reloj(), "secs.\n"
    if confusion_matrix:
        matrix = np.matrix(matrix)
        matrix = matrix.reshape(24, 24)
        print matrix
        if results_to_file:
            np.savetxt(
                temp_folder + '/_confusion_matrix.csv',
                matrix,
                fmt='%i',
                delimiter=',',
                header=
                'C,C#,D,Eb,E,F,F#,G,G#,A,Bb,B,Cm,C#m,Dm,Ebm,Em,Fm,F#m,Gm,G#m,Am,Bbm,Bm'
            )
    # MIREX RESULTS
    # =============
    evaluation_results = mirex_evaluation(mirex_scores)
    # WRITE INFO TO FILE
    # ==================
    if results_to_file:
        # Dump all analysis settings alongside the evaluation summary.
        settings = "SETTINGS\n========\nAvoid edges ('%' of duration disregarded at both ends (0 = complete)) = " + str(
            avoid_edges
        ) + "\nfirst N secs = " + str(
            first_n_secs
        ) + "\nshift spectrum to fit tempered scale = " + str(
            shift_spectrum
        ) + "\nspectral whitening = " + str(
            spectral_whitening
        ) + "\nsample rate = " + str(sample_rate) + "\nwindow size = " + str(
            window_size
        ) + "\nhop size = " + str(hop_size) + "\nmagnitude threshold = " + str(
            magnitude_threshold
        ) + "\nminimum frequency = " + str(
            min_frequency
        ) + "\nmaximum frequency = " + str(
            max_frequency
        ) + "\nmaximum peaks = " + str(max_peaks) + "\nband preset = " + str(
            band_preset
        ) + "\nsplit frequency = " + str(
            split_frequency
        ) + "\nharmonics = " + str(harmonics) + "\nnon linear = " + str(
            non_linear
        ) + "\nnormalize = " + str(
            normalize
        ) + "\nreference frequency = " + str(
            reference_frequency
        ) + "\nhpcp size = " + str(
            hpcp_size
        ) + "\nweigth type = " + weight_type + "\nweight window size in semitones = " + str(
            weight_window_size
        ) + "\nharmonics key = " + str(num_harmonics) + "\nslope = " + str(
            slope) + "\nprofile = " + profile_type + "\npolyphony = " + str(
                use_polyphony) + "\nuse three chords = " + str(
                    use_three_chords)
        results_for_file = "\n\nEVALUATION RESULTS\n==================\nCorrect: " + str(
            evaluation_results[0]) + "\nFifth: " + str(
                evaluation_results[1]) + "\nRelative: " + str(
                    evaluation_results[2]) + "\nParallel: " + str(
                        evaluation_results[3]) + "\nError: " + str(
                            evaluation_results[4]) + "\nWeighted: " + str(
                                evaluation_results[5])
        write_to_file = open(temp_folder + '/_SUMMARY.txt', 'w')
        write_to_file.write(settings)
        write_to_file.write(results_for_file)
        if analysis_mode == 'title':
            corpus = "\n\nANALYSIS CORPUS\n===============\n" + str(
                collection) + '\n' + str(
                    genre) + '\n' + str(modality) + '\n\n' + str(
                        len(mirex_scores)) + " files analysed.\n"
            write_to_file.write(corpus)
        write_to_file.close()