def get_hpeaks_per_frame(audio, sr=44100, onlyfrecuencies=False, nsines=20):
    """Get harmonic peaks per analysis frame of an audio signal.

    :param audio: audio samples, mono or multichannel; multichannel input is
        mixed down to mono before analysis.
    :param sr: sample rate of the audio. NOTE(review): currently unused — the
        essentia algorithms below run on their default sample rate; confirm
        whether it should be forwarded to them.
    :param onlyfrecuencies: if True, return only the frequency matrix.
    :param nsines: maximum number of sinusoidal tracks extracted per frame.
    :return: if ``onlyfrecuencies``, an (N, nsines) array of frequencies;
        otherwise a tuple ``(frequencies, magnitudes)``, each of shape
        (N, nsines), where N is the number of analysis frames.
    """
    if audio.ndim > 1:
        # Mix down to mono: the frame-based analysis below expects 1-D audio.
        audio = std.MonoMixer()(audio, audio.shape[1])
    fft_algo = std.FFT()
    pyin = std.PitchYin()
    hpeaks = std.HarmonicPeaks()
    sine_anal = std.SineModelAnal(maxnSines=nsines, orderBy='frequency',
                                  minFrequency=1)
    sines = []
    for frame in std.FrameGenerator(audio, frameSize=4096, hopSize=2048):
        pitch, _ = pyin(frame)
        fft = fft_algo(frame)
        freqs, mags, _ = sine_anal(fft)
        # Sort peaks by ascending frequency, as HarmonicPeaks expects.
        sorting_indexes = np.argsort(freqs)
        freqs = freqs[sorting_indexes]
        mags = mags[sorting_indexes]
        # Drop empty (zero-frequency) sine slots before harmonic matching.
        non_zero_freqs = np.where(freqs != 0)
        freqs = freqs[non_zero_freqs]
        mags = mags[non_zero_freqs]
        freqs, mags = hpeaks(freqs, mags, pitch)
        sines.append([freqs, mags])
    sines = np.array(sines)
    if onlyfrecuencies:
        return sines[:, 0, :]
    return sines[:, 0, :], sines[:, 1, :]
def get_Yin_Pitch(self):
    """Estimate per-frame pitch with the YIN algorithm.

    :return: tuple ``(pitch, confidence, frame_times)`` where ``pitch`` is an
        ``np.ndarray`` of per-frame pitch estimates in Hz, ``confidence`` is a
        list of per-frame YIN confidence values, and ``frame_times`` is a list
        of frame start times in seconds. Returns ``(None, None, None)`` when
        no audio is loaded.
    """
    # BUGFIX: the original guard `self.audio != []` compares a numpy array
    # element-wise against a list, whose truth value is ambiguous/deprecated.
    # Test emptiness explicitly instead.
    if self.audio is None or len(self.audio) == 0:
        return None, None, None
    pitchDetect = es.PitchYin(frameSize=self.frameSize,
                              sampleRate=self.sampleRate)
    estPitch = []
    pitchConfidence = []
    frame_times = []
    for counter, frame in enumerate(
            es.FrameGenerator(self.audio, frameSize=self.frameSize,
                              hopSize=self.hopSize)):
        f, conf = pitchDetect(frame)
        estPitch.append(f)
        pitchConfidence.append(conf)
        # Frame start time in seconds, derived from the hop size.
        frame_times.append(counter * self.hopSize / self.sampleRate)
    return np.array(estPitch), pitchConfidence, frame_times
# Temporal descriptors power = es.InstantPower() log_attack_time = es.LogAttackTime() effective_duration = es.EffectiveDuration() auto_correlation = es.AutoCorrelation() zero_crossing_rate = es.ZeroCrossingRate() # Spectral descriptors peak_freq = es.MaxMagFreq() roll_off = es.RollOff() flux = es.Flux() flatness = es.Flatness() # Harmonic descriptors pitch = es.PitchYin(frameSize=1024) spectral_peaks = es.SpectralPeaks(minFrequency=1e-5) harmonic_peaks = es.HarmonicPeaks() inharmonicity = es.Inharmonicity() oer = es.OddToEvenHarmonicEnergyRatio() tristimulus = es.Tristimulus() # MFCC mfcc = es.MFCC(inputSize=513) class Audio: def __init__(self, path): self.audio = es.MonoLoader(filename=str(path))() self.name = path.name self.pool = essentia.Pool()
def pitchCalculation(audio, start_end_samples, frameSize, sampleRate, maxFrequency):
    """Estimate the pitch of one audio segment with the YIN algorithm.

    :param audio: full audio signal (indexable by sample).
    :param start_end_samples: (start, end) sample indices delimiting the segment.
    :param frameSize: YIN analysis frame size in samples.
    :param sampleRate: sample rate of the audio in Hz.
    :param maxFrequency: upper bound of the pitch search range in Hz.
    :return: ``(pitch, pitchConfidence)`` as produced by essentia's PitchYin.
    """
    segment = audio[start_end_samples[0]:start_end_samples[1]]
    yin = ess.PitchYin(frameSize=frameSize, sampleRate=sampleRate,
                       maxFrequency=maxFrequency)
    return yin(segment)
import os
import matplotlib.pyplot as plt

# Recording to analyse, identified by its MusicBrainz ID; the audio is
# expected at '<mbid>/<mbid>-voice.mp3'.
mbid = 'ead85d20-ce7d-4ed0-a00d-0ae199b94d12'
hopSize = 128
loader = es.MonoLoader(filename=os.path.join(mbid, mbid + '-voice.mp3'))
track = loader()
# track[track<0.000001] = 0
print(len(track) / 44100.)  # track duration in seconds — assumes 44.1 kHz; TODO confirm
# pY = es.PitchYin(minFrequency=55, maxFrequency=900, tolerance=0.06)
pY = es.PitchYin(minFrequency=55, maxFrequency=600, tolerance=0.03)
rms = es.RMS()
pitch = []
loudness = []
print('Computing pitch and loudness')
for frame in es.FrameGenerator(track, frameSize=2048, hopSize=hopSize,
                               startFromZero=True):
    # f is (pitch_hz, confidence); keep only confident estimates.
    f = pY(frame)
    if f[1] >= 0.8:
        pitch.append(f[0])
        loudness.append(rms(frame))
    else:
import essentia.standard as ess
import numpy as np

# STFT / analysis parameters.
M = 1024      # window size in samples
N = 1024      # FFT size
H = 512       # hop size in samples
fs = 44100    # sample rate in Hz

x = ess.MonoLoader(filename='output3.wav', sampleRate=fs)()
ess.AudioLoader()  # NOTE(review): instantiated and discarded — looks like a leftover; confirm and remove
spectrum = ess.Spectrum(size=N)
window = ess.Windowing(size=M, type='hann')
pitchYin = ess.PitchYin()
hpcp = ess.HPCP()
hpcps = []
spectralPeaks = ess.SpectralPeaks()
pitches = []
pitchConfidences = []
for frame in ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True):
    pitch, pitchConfidence = pitchYin(frame)
    # Low-confidence frames are recorded as unvoiced (None).
    if pitchConfidence < 0.9:
        pitches.append(None)
def algorithm_pitch_note_essentia(sound):
    """Estimate the note of a given audio file.

    Runs two essentia pitch estimators (time-domain PitchYin and spectral
    PitchYinFFT) over the file and reports the median-pitch note for each.

    :param sound: sound dictionary from dataset
    :return: dictionary with results per different methods, keyed
        'EssentiaPitchYin' and 'EssentiaPitchYinFFT'; each value has
        'note', 'midi_note' and 'pitch' entries.
    """
    results = dict()
    audio = load_audio_file(file_path=sound[SOUND_FILE_KEY], sample_rate=44100)
    frameSize = 1024
    hopsize = frameSize

    # Estimate pitch using PitchYin (time domain).
    pitchDetect = estd.PitchYin(frameSize=frameSize, sampleRate=44100)
    pitches = [pitchDetect(frame)[0]
               for frame in estd.FrameGenerator(audio, frameSize=frameSize,
                                                hopSize=hopsize)]
    results['EssentiaPitchYin'] = _note_result_from_pitches(pitches)

    # Estimate pitch using PitchYinFFT (spectral domain).
    pitchDetect = estd.PitchYinFFT(frameSize=frameSize, sampleRate=44100)
    win = estd.Windowing(type='hann')
    spectrum = estd.Spectrum()  # hoisted: was re-instantiated every frame
    pitches = [pitchDetect(spectrum(win(frame)))[0]
               for frame in estd.FrameGenerator(audio, frameSize=frameSize,
                                                hopSize=hopsize)]
    results['EssentiaPitchYinFFT'] = _note_result_from_pitches(pitches)

    return results


def _note_result_from_pitches(pitches):
    """Reduce per-frame pitch estimates to a note/midi/pitch result dict.

    Non-positive (unvoiced) frames are discarded; when nothing remains, a
    small positive placeholder (0.1 Hz) keeps the midi conversion defined.
    """
    voiced = [p for p in pitches if p > 0]
    pitch_median = median(voiced) if voiced else 0.1
    midi_note = frequency_to_midi_note(pitch_median)
    note = midi_note_to_note(midi_note)
    return {'note': note, 'midi_note': midi_note, 'pitch': pitch_median}