def __init__(self, audio_file, mono=True, hop_length=512, sample_rate=44100,
             normalize_gain=False, verbose=False):
    """Load an audio file into a numpy vector using Essentia loaders.

    Arguments:
        audio_file {str} -- path of the audio file to load

    Keyword Arguments:
        mono {bool} -- downmix to mono with MonoLoader (default: {True})
        hop_length {int} -- hop size used by later analysis steps (default: {512})
        sample_rate {int} -- target sample rate in Hz (default: {44100})
        normalize_gain {bool} -- apply replay-gain normalization via EasyLoader (default: {False})
        verbose {bool} -- print a summary of the loaded audio (default: {False})
    """
    self.hop_length = hop_length
    self.fs = sample_rate
    self.audio_file = audio_file
    if normalize_gain:
        self.audio_vector = estd.EasyLoader(filename=audio_file,
                                            sampleRate=self.fs,
                                            replayGain=-9)()
    elif mono:
        self.audio_vector = estd.MonoLoader(filename=audio_file,
                                            sampleRate=self.fs)()
    if verbose:
        print("== Audio vector of %s loaded with shape %s and sample rate %s =="
              % (audio_file, self.audio_vector.shape, self.fs))
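The two branches above differ only in gain handling: EasyLoader applies a replay-gain correction (here a fixed -9 dB target) while MonoLoader performs a plain mono downmix and resample. A minimal standalone sketch of the same two calls, assuming a hypothetical file `test.wav`:

import essentia.standard as estd

fs = 44100
# plain mono load at the target sample rate
audio_plain = estd.MonoLoader(filename='test.wav', sampleRate=fs)()

# same file with replay gain applied; EasyLoader also downmixes to mono
audio_norm = estd.EasyLoader(filename='test.wav', sampleRate=fs, replayGain=-9)()

print(audio_plain.shape, audio_norm.shape)  # same length, different gain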
def transcribe_from_paths(audio_path, topath, sr=22050, cuda=False):
    """
    Load the audio at `audio_path`, transcribe it and write the result as a
    MIDI file to `topath`. `cuda` currently has no effect.
    """
    audio = esst.EasyLoader(filename=audio_path, sampleRate=sr)()
    mat = transcribe(audio, sr, cuda=cuda)
    mat2midipath(mat, topath)
def read_audio(self, audio_file):
    self.set_audio_file(audio_file)
    if self.normalize_gain:
        self.audio_vector = estd.EasyLoader(filename=audio_file,
                                            sampleRate=self.fs,
                                            replayGain=-9)()
    elif self.mono:
        self.audio_vector = estd.MonoLoader(filename=audio_file,
                                            sampleRate=self.fs)()
def load_audio_file(file_path, sample_rate=44100):
    """
    Load an audio file using essentia's EasyLoader class

    :param file_path: audio file path
    :param sample_rate: audio sample rate
    :return: audio data (numpy.ndarray of float32)
    """
    audio_file = estd.EasyLoader(filename=file_path, sampleRate=sample_rate)
    audio = audio_file.compute()
    return audio
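A minimal usage sketch, assuming the function above is importable and the file path is hypothetical; note that a configured essentia.standard loader can be invoked either through `.compute()` (as above) or by calling the instance directly:

# load at the default 44100 Hz; the returned vector is mono float32
audio = load_audio_file('/path/to/clip.wav')
print(audio.dtype, audio.shape)

# resample to 22050 Hz while loading
audio_22k = load_audio_file('/path/to/clip.wav', sample_rate=22050)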
def __init__(self, audio_file, mono=True, hop_length=512, sample_rate=44100,
             normalize_gain=False, verbose=False):
    """Load an audio file into a numpy vector, optionally with replay-gain normalization."""
    self.hop_length = hop_length
    self.fs = sample_rate
    self.audio_file = audio_file
    if normalize_gain:
        self.audio_vector = estd.EasyLoader(filename=audio_file,
                                            sampleRate=self.fs,
                                            replayGain=-9)()
    elif mono:
        self.audio_vector = estd.MonoLoader(filename=audio_file,
                                            sampleRate=self.fs)()
    if verbose:
        print("== Audio vector of %s loaded with shape %s and sample rate %s =="
              % (audio_file, self.audio_vector.shape, self.fs))
def analyze_misc(filename, segment_duration=20):
    # Compute replay gain and duration on the entire file, then load the
    # segment that is centered in time with replay gain applied
    audio = es.MonoLoader(filename=filename)()
    replaygain = es.ReplayGain()(audio)

    segment_start = (len(audio) / 44100 - segment_duration) / 2
    segment_end = segment_start + segment_duration

    if segment_start < 0 or segment_end > len(audio) / 44100:
        raise ValueError(
            'Segment duration is larger than the input audio duration')

    loader = es.EasyLoader(filename=filename, replayGain=replaygain,
                           startTime=segment_start, endTime=segment_end)
    windowing = es.Windowing(type='blackmanharris62')
    spectrum = es.Spectrum()
    powerspectrum = es.PowerSpectrum()
    centroid = es.Centroid()
    zcr = es.ZeroCrossingRate()
    rms = es.RMS()
    hfc = es.HFC()
    pool = essentia.Pool()

    audio = loader()
    for frame in es.FrameGenerator(audio, frameSize=2048, hopSize=1024):
        frame_spectrum = spectrum(windowing(frame))
        pool.add('rms', rms(frame))
        pool.add('rms_spectrum', rms(frame_spectrum))
        pool.add('hfc', hfc(frame_spectrum))
        pool.add('spectral_centroid', centroid(frame_spectrum))
        pool.add('zcr', zcr(frame))

    audio_st, sr, _, _, _, _ = es.AudioLoader(filename=filename)()

    # Ugly hack because we don't have a StereoResample
    left, right = es.StereoDemuxer()(audio_st)
    resampler = es.Resample(inputSampleRate=sr, outputSampleRate=44100)
    left = resampler(left)
    right = resampler(right)
    audio_st = es.StereoMuxer()(left, right)
    audio_st = es.StereoTrimmer(startTime=segment_start,
                                endTime=segment_end)(audio_st)
    ebu_momentary, _, _, _ = es.LoudnessEBUR128(hopSize=1024 / 44100,
                                                startAtZero=True)(audio_st)
    pool.set('ebu_momentary', ebu_momentary)

    return pool
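A minimal sketch of consuming the returned pool, assuming a hypothetical file 'track.wav'; the descriptor names match the pool.add()/pool.set() calls above:

import numpy as np

pool = analyze_misc('track.wav')
print(pool.descriptorNames())          # e.g. ['ebu_momentary', 'hfc', 'rms', ...]

rms_frames = np.asarray(pool['rms'])   # one value per 2048-sample frame
print('mean RMS over the segment: %.4f' % rms_frames.mean())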
def __init__(self, audio_file, mono=True, sample_rate=44100, normalize_gain=False):
    """Load an audio file into a numpy vector, optionally with replay-gain normalization."""
    self.fs = sample_rate
    if normalize_gain:
        self.audio_vector = estd.EasyLoader(filename=audio_file,
                                            sampleRate=self.fs,
                                            replayGain=-9)()
    else:
        self.audio_vector = estd.MonoLoader(filename=audio_file,
                                            sampleRate=self.fs)()
    print("== Audio vector of %s loaded with shape %s and sample rate %s ==" % (
        audio_file, self.audio_vector.shape, self.fs))
def analyze_loops(db_file="../loopermanscrapy/loops.db"):
    loop_data = {}
    conn = sqlite3.connect(db_file)
    cur = conn.cursor()
    sample_rate = 44100

    # `loops`, `bpm_percival` and `confidence` are expected to be defined in
    # the enclosing scope (the list of loop file names and the estimated BPM
    # and confidence values).
    for loop in loops:
        cur.execute("SELECT BPM FROM LOOPS WHERE OUTPUT_FILE LIKE '%s'" % loop.rstrip())
        bpm = cur.fetchone()[0]

        audio_file = estd.EasyLoader(filename="./drum_loops/" + loop.rstrip(),
                                     sampleRate=sample_rate)
        audio = audio_file.compute()

        # Confidence that the annotated BPM is consistent with the loop length:
        # 1.0 if the loop lasts an exact number of beats, decaying linearly to
        # 0 once the mismatch exceeds half a beat.
        beat_duration = (60.0 * sample_rate) / int(bpm)
        L = [beat_duration * n for n in range(1, 128)]
        thr_lambda = 0.5 * beat_duration
        la = audio.shape[0]
        delta = min([abs(l - la) for l in L])
        if delta > thr_lambda:
            ann_confidence = 0.0
        else:
            ann_confidence = 1.0 - float(delta) / thr_lambda

        duration = la / sample_rate
        loop_data[loop.rstrip()] = {
            "bpm_annotated": bpm,
            "duration": duration,
            "annotated_confidence": ann_confidence,
            "bpm_percival": bpm_percival,
            "confidence": confidence
        }

    conn.close()
    json.dump(loop_data, open("loop_data.json", 'w'))
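The function above references `bpm_percival` and `confidence` without computing them. Purely as an assumption, the estimate could come from Essentia's PercivalBpmEstimator, with the same half-beat tolerance reused as a confidence value; the helper name `estimate_loop_bpm` below is hypothetical and not part of the original script:

import essentia.standard as estd

def estimate_loop_bpm(audio, sample_rate=44100):
    """Hypothetical helper: estimate BPM and a length-based confidence for a loop."""
    bpm = estd.PercivalBpmEstimator(sampleRate=sample_rate)(audio)

    # Same idea as the annotated-confidence computation above, but with the
    # estimated BPM: how close the loop length is to a whole number of beats.
    beat_duration = (60.0 * sample_rate) / bpm
    thr = 0.5 * beat_duration
    delta = min(abs(beat_duration * n - audio.shape[0]) for n in range(1, 128))
    conf = 0.0 if delta > thr else 1.0 - delta / thr
    return bpm, conf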
def load_audio_files_in_dir(path):
    vsts = []
    original = []
    paths = []
    for root, dirs, files in os.walk(path):
        FOUND_VST = False
        vst = []
        for file in files:
            if file.endswith(AUDIO_EXTS) and 'scales' not in file:
                if 'target' in file:
                    audios = original
                else:
                    FOUND_VST = True
                    audios = vst
                audios.append(
                    esst.EasyLoader(filename=os.path.join(root, file),
                                    sampleRate=SR)())
        if FOUND_VST:
            vsts.append(vst)
            paths.append(root)
    return vsts, original, paths
def load_audio_excerpts(path=AUDIO_PATH, num_features=9):
    """
    Extracts `num_features + 1` MFCC coefficients from each audio file and
    discards the first coefficient (tied to energy).
    """
    targets = np.zeros((3, 5, num_features))
    out = np.zeros((3, 5, 4, num_features))
    for file in tqdm(os.listdir(path)):
        if file.endswith(excerpt_search.FORMAT):
            audio = esst.EasyLoader(filename=os.path.join(path, file),
                                    sampleRate=SR)()
            if audio.shape[0] % 2 == 1:
                audio = audio[:-1]
            spectrum = esst.Spectrum(size=audio.shape[0])(audio)
            _bands, features = esst.MFCC(inputSize=spectrum.shape[0],
                                         sampleRate=SR,
                                         numberCoefficients=num_features + 1)(spectrum)
            splits = file.replace('.flac', '').split('_')
            question = int(splits[0][1])
            _fill_out_targets(out[question], targets[question], features[1:],
                              splits, 'target')

    return out - targets[..., np.newaxis, :]
def transcribe_from_paths(audio_path,
                          data,
                          velocity_model,
                          midi_score_path=None,
                          tofile='out.mid'):
    """
    Load a MIDI file and an audio file and call `transcribe`. If `tofile` is
    not empty, it will also write a new MIDI file to the provided path. The
    output MIDI file will contain only one track with piano (program 0).
    """
    import essentia.standard as esst
    audio = esst.EasyLoader(filename=audio_path, sampleRate=SR)()
    if midi_score_path:
        score = midipath2mat(midi_score_path)
    else:
        score = None

    new_score, _, _, _ = transcribe(audio,
                                    data,
                                    score=score,
                                    velocity_model=velocity_model)

    # writing to midi
    mat2midipath(new_score, tofile)
    return new_score
# wavFiles = sys.argv[2:]
# the mbid corresponding to the title of the wav file will be used to name the
# resulting pitch track file
mbid_all = {
    "D_X-1": "61dd663f-77b8-423b-8ce9-4c40fee8b014",
    "D_X-2": "a00666b5-07ef-40c8-8117-78b871569354",
    "D_E-1": "f6339039-e01a-4a70-82ff-001c64fecddd",
    "J_X-1": "bc86b730-b7bb-46ba-a771-2d0e6304b459",
    "J_X-2": "a9896439-46ce-4a37-a6ed-3a5c998bd6cf",
    "J_E-1": "b19d1921-9119-4a57-8c78-540ca09f09b9",
    "J_E-2": "720e93d6-21b5-4026-8c50-5c415cca53ec",
    "LD_X-1": "5593350d-4a2d-4288-b3c6-b57b7f3c8dd4",
    "LD_X-2": "afd9da3d-418b-4dfc-85d4-6d5829b9c7e2",
    "LD_E-1": "f581a2b2-ff94-4e03-91f8-d278b01d6fc4",
    "LD_E-2": "67069b89-fe41-4509-9aba-5cc33d9c8e53",
    "LS_X-1": "cdbf88f4-7a55-412e-b4b2-351f4c8aab49",
    "LS_X-2": "4a0fc23c-9c62-4b2e-a38a-124796c6ac5d",
    "LS_E-1": "999917df-2282-49d4-ad95-e45d176fba64",
    "LS_E-2": "26054c12-3578-4d64-a251-f8f694fbc41c",
    "XS_X-1": "e9794b6f-a797-4d7f-842a-a0c73639c2a5",
    "XS_X-2": "206d833f-b194-40cc-92d1-7a632fbef603",
    "XS_E-1": "aec9929b-69b4-4ca5-b998-f4d54ac426cb",
    "XS_E-2": "01661eb4-114d-4671-af82-14b8fead56e1"
}

# print(pitchDir)
hopSize = 128
frameSize = 2048
sampleRate = 44100
guessUnvoiced = False

for f in wavFiles:
    if ".wav" not in f.lower():
        continue
    # mbid = f.split("/")[-1][:-4]
    # if exists(pitchDir + "/" + mbid + ".txt"): continue

    # find the mbid to be used to name the resulting pitch track
    name = f.split("_")[0] + "_" + f.split("_")[1]
    print(name)
    mbid = mbid_all[name]
    print(mbid)

    loader = es.EasyLoader(filename=f, sampleRate=44100)
    equalLoudness = es.EqualLoudness(sampleRate=44100)
    audio = loader()
    audioDL = equalLoudness(audio)

    pitchPolyphonic = es.PredominantMelody(binResolution=1,
                                           guessUnvoiced=guessUnvoiced,
                                           hopSize=hopSize,
                                           minFrequency=100,
                                           maxFrequency=1200,
                                           voicingTolerance=1.2)
    res = pitchPolyphonic(audioDL)

    t = np.linspace(0, len(res[0]) * 128.0 / 44100, len(res[0]))
    data = np.array(list(zip(t, res[0], res[1])))
    np.savetxt(mbid + ".txt", data, delimiter="\t")
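Each saved file contains three tab-separated columns: frame time, predominant pitch in Hz, and the pitch salience returned by PredominantMelody. A minimal sketch of reading one of the saved pitch tracks back (the mbid used is one of the keys above):

import numpy as np

track = np.loadtxt("61dd663f-77b8-423b-8ce9-4c40fee8b014.txt", delimiter="\t")
times, pitch_hz, salience = track[:, 0], track[:, 1], track[:, 2]
print(times.shape, pitch_hz.max())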
def analyze_hp(filename, segment_duration=20):
    lowlevelFrameSize = 2048
    lowlevelHopSize = 1024
    tonalFrameSize = 4096
    tonalHopSize = 1024

    # Compute replay gain and duration on the entire file, then load the
    # segment that is centered in time with replay gain applied
    audio = es.MonoLoader(filename=filename)()
    replaygain = es.ReplayGain()(audio)

    segment_start = (len(audio) / 44100 - segment_duration) / 2
    segment_end = segment_start + segment_duration

    if segment_start < 0 or segment_end > len(audio) / 44100:
        raise ValueError(
            'Segment duration is larger than the input audio duration')

    loader = es.EasyLoader(filename=filename, replayGain=replaygain,
                           startTime=segment_start, endTime=segment_end)
    window = es.Windowing(type='blackmanharris62')
    fft = es.FFT()

    stft = []
    audio = loader()
    for frame in es.FrameGenerator(audio, frameSize=lowlevelFrameSize,
                                   hopSize=lowlevelHopSize):
        stft.append(fft(window(frame)))

    # Librosa requires bins x frames format
    stft = np.array(stft).T

    D_harmonic, D_percussive = librosa.decompose.hpss(stft, margin=8)
    D_percussive_magnitude, _ = librosa.magphase(D_percussive)
    D_harmonic_magnitude, _ = librosa.magphase(D_harmonic)

    # Convert back to Essentia format (frames x bins)
    spectrum_harmonic = D_harmonic_magnitude.T
    spectrum_percussive = D_percussive_magnitude.T

    # Processing for Mel bands
    melbands = es.MelBands(numberBands=96,
                           lowFrequencyBound=0,
                           highFrequencyBound=11025)

    # Normalize Mel bands: log10(1 + x * 10000)
    norm = es.UnaryOperator(type='identity', shift=1, scale=10000)
    log10 = es.UnaryOperator(type='log10')

    p = essentia.Pool()
    for spectrum_frame in spectrum_harmonic:
        p.add('melbands_harmonic', log10(norm(melbands(spectrum_frame))))
    for spectrum_frame in spectrum_percussive:
        p.add('melbands_percussive', log10(norm(melbands(spectrum_frame))))
    return p
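A minimal sketch of inspecting the result, assuming a hypothetical file 'track.wav'; the descriptor names match the pool.add() calls above:

import numpy as np
import matplotlib.pyplot as plt

pool = analyze_hp('track.wav')
harmonic = np.array(pool['melbands_harmonic'])      # frames x 96 mel bands
percussive = np.array(pool['melbands_percussive'])

fig, (ax1, ax2) = plt.subplots(2, 1, sharex=True)
ax1.imshow(harmonic.T, aspect='auto', origin='lower')
ax1.set_title('harmonic mel bands')
ax2.imshow(percussive.T, aspect='auto', origin='lower')
ax2.set_title('percussive mel bands')
plt.show()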
def create_excerpt(audio_path, time, name):
    """
    Given an audio path and times, transcribe it and create new MIDI and wav
    files for the given excerpts. `name` is the file name without extension
    and transcription number.
    """
    full_audio = esst.EasyLoader(filename=audio_path, sampleRate=SR)()
    start_audio, _ = find_start_stop(full_audio, sample_rate=SR, seconds=True)
    original = midipath2mat(audio_path[:-4] + '.mid')

    # compute score path
    score_path = './my_scores/' + os.path.basename(audio_path)[:-8] + '.mid'
    score = midipath2mat(score_path)

    # transcribe
    data = pickle.load(open(TEMPLATE_PATH, 'rb'))
    transcription_0, _, _, _ = proposed.transcribe(full_audio, data, score=score)
    transcription_1 = magenta_transcription.transcribe(full_audio, SR)
    # transcription_2, _, _, _ = proposed.transcribe(full_audio,
    #                                                data,
    #                                                score=None)

    # choose the other interpretation
    performance = '01'
    if audio_path[-6:-4] == '01':
        performance = '02'
    other = midipath2mat(audio_path[:-6] + performance + '.mid')

    # segment all the scores and audios
    full_audio = esst.EasyLoader(filename=audio_path, sampleRate=OUT_SR)()
    original_audio = full_audio[round(time[0][0] * OUT_SR):round(time[0][1] * OUT_SR)]
    other_time = remap_original_in_other(original, other, time[0])
    original = segment_mat(original, time[0][0], time[0][1], start_audio)
    other = segment_mat(other, other_time[0], other_time[1], start_audio)
    transcription_0 = segment_mat(transcription_0, time[0][0], time[0][1],
                                  start_audio)
    transcription_1 = segment_mat(transcription_1, time[0][0], time[0][1],
                                  start_audio)
    # transcription_2 = segment_mat(transcription_2, time[0][0], time[0][1],
    #                               start_audio)

    # write scores to `to_be_synthesized` and audios to `excerpts`
    if not os.path.exists('to_be_synthesized'):
        os.mkdir('to_be_synthesized')
    midi_path = os.path.join('to_be_synthesized', name)
    mat2midipath(original, midi_path + 'orig.mid')
    mat2midipath(other, midi_path + 'other.mid')
    mat2midipath(transcription_0, midi_path + 'proposed.mid')
    mat2midipath(transcription_1, midi_path + 'magenta.mid')
    # mat2midipath(transcription_2, midi_path + 'vienna.mid')

    if not os.path.exists('audio'):
        os.mkdir('audio')
    audio_path = os.path.join('audio', name) + 'target.' + FORMAT

    # write audio
    if os.path.exists(audio_path):
        os.remove(audio_path)
    esst.MonoWriter(filename=audio_path,
                    sampleRate=OUT_SR,
                    format=FORMAT,
                    bitrate=256)(original_audio)
def main():
    import essentia.standard as esst
    spec = esst.SpectrumCQ(numberBins=BINS, sampleRate=SR, windowType='hann')

    print("Loading midi")
    notes = pm.PrettyMIDI(midi_file=SCALE_PATH[0]).instruments[0].notes

    print("Loading audio")
    audio = esst.EasyLoader(filename=SCALE_PATH[1], sampleRate=SR)()

    # template = np.zeros((FRAME_SIZE // 2 + 1, 128, BASIS))
    template = np.zeros((BINS, 128, BASIS))
    counter = np.zeros((128, BASIS))

    maxpitch = 0
    minpitch = 128

    for i in trange(len(notes)):
        note = notes[i]
        if maxpitch < note.pitch:
            maxpitch = note.pitch
        if minpitch > note.pitch:
            minpitch = note.pitch

        # start and end frame
        start = int(np.round(note.start * SR))
        end = int(np.round(note.end * SR))
        ENDED = False

        spd = np.zeros((BINS, BASIS))
        frames = esst.FrameGenerator(audio[start:end],
                                     frameSize=FRAME_SIZE,
                                     hopSize=HOP_SIZE)

        # attack
        for a in range(ATTACK):
            try:
                frame = next(frames)
            except StopIteration:
                print("Error: notes timing not correct")
                print(f"note: {start}, {end}, {len(audio)}")
                sys.exit(99)
            spd[:, 0] += spec(frame)
        counter[note.pitch, 0] += ATTACK

        # other basis except the last one
        for b in range(1, BASIS - 1):
            if not ENDED:
                for a in range(BASIS_L):
                    try:
                        frame = next(frames)
                    except StopIteration:
                        # note is shorter than the number of basis
                        ENDED = True
                        break
                    spd[:, b] += spec(frame)
                    counter[note.pitch, b] += 1

        # last basis
        if not ENDED:
            for frame in frames:
                spd[:, BASIS - 1] += spec(frame)
                counter[note.pitch, BASIS - 1] += 1

        template[:, note.pitch, :] += spd

    idx = np.nonzero(counter)
    template[:, idx[0], idx[1]] /= counter[idx]

    # collapsing basis and pitch dimension
    template = template.reshape((-1, 128 * BASIS), order='C')

    # plot template
    fig = go.Figure(data=go.Heatmap(z=template))
    fig.show()

    # saving template
    pickle.dump((template, minpitch, maxpitch), open(TEMPLATE_PATH, 'wb'))
def transcribe_from_paths(audio_path, topath, sr=44100, cuda=False):
    audio = esst.EasyLoader(filename=audio_path, sampleRate=sr)()
    mat = transcribe(audio, sr, cuda=cuda)
    mat2midipath(mat, topath)
def plot_segmentation(audio, onset, sustain, release=None, offset=None):
    plt.figure()
    plt.vlines(onset, 0, 1, label='attack_init', color='g')
    plt.vlines(sustain, 0, 1, label='sustain_init', color='b')
    plt.vlines(release, 0, 1, label='sustain_end', color='r')
    plt.vlines(offset, 0, 1, label='offset', color='g')
    time = np.linspace(0, len(audio) / 512, num=len(audio))
    plt.plot(time, audio)
    plt.show()


# INIT PARAMETERS
# path to the audio file
file_path = settings.DATA_PATH + '/47_58_2.wav'
fs = 44100
hopSize = 512
frameSize = 2048
rms_onset_threshold = 1E-5
mel_onset_threshold = 80
flux_onset_threshold = 0.1
onset_threshold = 0.1
max_attack_time = seconds2samples(0.5, fs)

audio_file = estd.EasyLoader(filename=file_path, sampleRate=fs)
audio = audio_file.compute()
onset, sustain = segment(audio, hopSize, frameSize, rms_onset_threshold,
                         mel_onset_threshold, flux_onset_threshold,
                         onset_threshold)
plot_segmentation(audio, onset, sustain)