def create_dissonances_examples(audio_target_file, file_compatibilities, audios_folder, n_examples=3, sr=44100):
    """
    Given a target audio and a file with its compatibilities, create the mixes.
    :param audio_target_file: The target audio filename
    :param file_compatibilities: The compatibility file for the target audio
    :param audios_folder: The folder where all the audios are located
    :param n_examples: Number of examples to generate
    :param sr: Sample rate of the final mix
    :return: A list where each element is a mix
    """
    df = pd.read_csv(file_compatibilities)
    listoreturn = []
    df_sorted = df.sort_values(by=['compatibility_framewise'], ascending=True).iloc[:n_examples, :]
    for idx, candidate in df_sorted.iterrows():
        cand_f = candidate['filename'].split('/')[-1]
        pshift = candidate['pitch_shift_framewise']
        audio_target = std.MonoLoader(filename=os.path.join(audios_folder, audio_target_file), sampleRate=sr)()
        audio_candidate = std.MonoLoader(filename=os.path.join(audios_folder, cand_f), sampleRate=sr)()
        # use the requested sample rate (this call previously hardcoded 44100)
        audio_candidate = pitch_shift(audio_candidate, sr, pshift).astype(np.float32)
        audio = mix(audio_target, audio_candidate, sr=sr)
        listoreturn.append(audio)
    return listoreturn

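# A minimal usage sketch for create_dissonances_examples: the filenames and
# CSV path are hypothetical, and it assumes the helpers used above (mix,
# pitch_shift) are importable; soundfile is used here to write the results.
import soundfile as sf

mixes = create_dissonances_examples(audio_target_file='target_loop.wav',
                                    file_compatibilities='target_loop_compat.csv',
                                    audios_folder='loops/',
                                    n_examples=3,
                                    sr=44100)
for i, m in enumerate(mixes):
    sf.write('dissonance_example_%d.wav' % i, m, 44100)
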
def mix(self, filename, synthesized_voice):
    # Get instrument lineup
    filename_violin = filename.replace("vocal.wav", "violin.wav")
    filename_mridangam_right = filename.replace("vocal.wav", "mridangam_right.wav")
    filename_mridangam_left = filename.replace("vocal.wav", "mridangam_left.wav")
    filename_tanpura = filename.replace("vocal.wav", "tanpura.wav")

    # Load audios and trim to synthesized voice length
    violin_mono = estd.MonoLoader(filename=filename_violin)()
    violin_mono_processed = np.array(violin_mono[:len(synthesized_voice) + 1], dtype='float64')
    violin_mono_processed_filt = self.filter_audio(audio=violin_mono_processed, coef=0.00075)
    mridangam_right_mono = estd.MonoLoader(filename=filename_mridangam_right)()
    mridangam_right_mono_processed = np.array(mridangam_right_mono[:len(synthesized_voice) + 1], dtype='float64')
    mridangam_right_mono_processed_filt = self.filter_audio(audio=mridangam_right_mono_processed, coef=0.001)
    mridangam_left_mono = estd.MonoLoader(filename=filename_mridangam_left)()
    mridangam_left_mono_processed = np.array(mridangam_left_mono[:len(synthesized_voice) + 1], dtype='float64')
    mridangam_left_mono_processed_filt = self.filter_audio(audio=mridangam_left_mono_processed, coef=0.001)
    tanpura_mono = estd.MonoLoader(filename=filename_tanpura)()
    tanpura_mono_processed = np.array(tanpura_mono[:len(synthesized_voice) + 1], dtype='float64')

    # Assign weights
    if self.mixing_weights:
        weight_voice = self.mixing_weights['voice']
        weight_violin = self.mixing_weights['violin']
        weight_mridangam_right = self.mixing_weights['mridangam_right']
        weight_mridangam_left = self.mixing_weights['mridangam_left']
        weight_tanpura = self.mixing_weights['tanpura']
    else:
        # Predefined weights in case no weight dict is provided
        weight_voice = 5.25
        weight_violin = 4
        weight_mridangam_right = 1
        weight_mridangam_left = 1
        weight_tanpura = 33.5

    # Get mix as a per-sample weighted sum of the five stems
    synthesized_audio_mix = [
        x * weight_voice + y * weight_violin + z * weight_mridangam_right
        + w * weight_mridangam_left + t * weight_tanpura
        for x, y, z, w, t in zip(
            synthesized_voice,
            violin_mono_processed_filt,
            mridangam_right_mono_processed_filt,
            mridangam_left_mono_processed_filt,
            tanpura_mono_processed)
    ]
    return synthesized_audio_mix

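# The per-sample list comprehension above is slow for long recordings; this is
# a vectorized sketch of the same weighted sum, assuming the five stems have
# already been trimmed to equal length (names here are illustrative).
import numpy as np

def mix_stems_vectorized(stems, weights):
    # stems: list of five equal-length 1-D arrays
    # (voice, violin, mridangam_right, mridangam_left, tanpura)
    # weights: five floats in the same order
    return np.asarray(weights) @ np.stack(stems)
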
def get_melspecs(audio_file: Path, algorithms: dict) -> Optional[dict[str, np.ndarray]]:
    # loading file
    audio = ess.MonoLoader(filename=str(audio_file), sampleRate=SAMPLE_RATE)()

    # precompute melspecs
    melspecs_all = {}
    for algorithm_name in algorithms:
        parameters = algorithms[algorithm_name]
        melspec_extractor = getattr(ess, parameters['melspec-algorithm'])()
        melspecs = []
        for frame in ess.FrameGenerator(audio, frameSize=parameters['frame-size'],
                                        hopSize=parameters['hop-size']):
            melspecs.append(melspec_extractor(frame))
        melspecs = np.array(melspecs)

        # reshape melspecs into tensor batches and discard the remainder
        discard = melspecs.shape[0] % parameters['patch-size']
        if discard != 0:
            melspecs = melspecs[:-discard, :]
        melspecs = np.reshape(melspecs, [-1, parameters['patch-size'], parameters['number-bands']])
        batch = np.expand_dims(melspecs, 2)
        melspecs_all[algorithm_name] = batch
    return melspecs_all

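# A hedged usage sketch for get_melspecs: the dict layout is inferred from the
# lookups inside the function; the values shown are the usual MusiCNN settings,
# not ones taken from the original source.
from pathlib import Path

algorithms = {
    'musicnn': {
        'melspec-algorithm': 'TensorflowInputMusiCNN',  # assumed extractor name
        'frame-size': 512,
        'hop-size': 256,
        'patch-size': 187,
        'number-bands': 96,
    },
}
batches = get_melspecs(Path('song.mp3'), algorithms)
# each value has shape [n_patches, patch-size, 1, number-bands]
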
def chop_folder(folder):
    count = 0
    for loop in os.listdir(folder):
        try:
            loop_path = folder + loop
            audio_file = es.MonoLoader(filename=loop_path, sampleRate=sampleRate)
            audio = audio_file.compute()
            count += 1
            if count % 50 == 0:
                print(count)
            chops = range(math.ceil(len(audio) / four_bar_length))
            for chop in chops:
                if chop == chops[-1]:
                    # zero-pad the last chop to a full four bars
                    audio_to_save = np.zeros(four_bar_length)
                    audio_to_save[:len(audio[chop * four_bar_length:])] = audio[chop * four_bar_length:]
                else:
                    audio_to_save = audio[chop * four_bar_length:(chop + 1) * four_bar_length]
                sf.write(FLSD_CHOPPED_AUDIO_DIR + str(chop + 1) + loop, audio_to_save, sampleRate)
        except Exception:  # was a bare except, which also swallows KeyboardInterrupt
            print(loop + " failed!")

def energyThresholdAudio(soundfilesList):
    for sound in soundfilesList:
        RMS = esst.RMS()
        audioLoader = esst.MonoLoader(filename=sound)
        audio = audioLoader()
        start = 0
        end = 0
        thresh = 0.05
        rms_vals = []
        for frame in esst.FrameGenerator(audio, frameSize=2048, hopSize=1024, startFromZero=True):
            rms = RMS(frame)
            rms_vals.append(float(rms))
        rms_vals = np.array(rms_vals)
        higher = np.where(rms_vals >= thresh)[0]
        if len(higher) > 1:
            start = higher[0]
            end = higher[-1]
        else:
            continue
        # trim to the first/last frame whose RMS exceeds the threshold
        newAudio = audio[start * 1024:end * 1024]
        writer = esst.MonoWriter(filename=sound, format="mp3")
        writer(newAudio)
        print(sound)

def extract_predominant_vocal_melody(audio_filename, hopSize, frameSize, pYinInst, end_ts=None):
    '''
    extract predominant vocal pitch contour;
    as a workaround, intersect the extracted pitch with the vocal annotation

    Parameters
    -----------------------
    end_ts: extract until this timestamp, disregard the rest of the audio

    Returns
    -------------------
    list of estimated pitch values in Hz; at non-vocal frames the value is <= 0
    '''
    if WITH_MELODIA:
        if WITH_MAKAM:
            #### use predominant melody tailored to makam
            path_Alignment_duration = os.path.join(parentDir, 'AlignmentDuration')
            if path_Alignment_duration not in sys.path:
                sys.path.append(path_Alignment_duration)
            from src.align.FeatureExtractor import extractPredominantMelodyMakam
            estimatedPitch_andTs = extractPredominantMelodyMakam(
                audio_filename[:-4], frameSize, hopSize,
                jointAnalysis=False, musicbrainzid=None, preload=True)
        else:  # use melodia
            estimatedPitch_andTs = extractPredominantMelody(audio_filename, frameSize, hopSize)
    else:  ######### pYIN
        audio = ess.MonoLoader(filename=audio_filename, sampleRate=fs)()
        for frame in ess.FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize):
            featureSet = pYinInst.process(frame)
        estimatedPitch = pYinInst.decodePitchTrack()  # pitch extraction
        ts = []  ### generated timestamps
        for onset_frame_number, frame in enumerate(estimatedPitch):
            ts.append(frame_to_ts(onset_frame_number, float(hopSize / fs)))
        estimatedPitch_andTs = np.vstack((np.array(ts), estimatedPitch)).T

    if end_ts is not None:
        idx_end_ts = np.searchsorted(estimatedPitch_andTs[:, 0], end_ts)  # until end_ts
        estimatedPitch_andTs = estimatedPitch_andTs[:min(idx_end_ts + 1, estimatedPitch_andTs.shape[0]), :]

    if MonoNoteParameters.WITH_VOCAL_SEGMENTS:  # vocal segments given
        estimatedPitch_andTs = intersect_vocal_segments(audio_filename, estimatedPitch_andTs)

    return estimatedPitch_andTs[:, 1]

def loadaudio(args):
    """util.loadaudio

    Load data from an audio file of any format supported by MonoLoader.
    """
    loader = estd.MonoLoader(filename=args.file, sampleRate=args.samplerate)
    return loader()

def BatchProcess_TonicIdentification(RootDir, tonicExt='.tonic', FileExt2Proc=".wav", overwrite=0):
    audiofilenames = GetFileNamesInDir(RootDir, FileExt2Proc)
    for audiofilename in audiofilenames:
        print("processing %s" % audiofilename)  # was a Python 2 print statement
        path, fileN = os.path.split(audiofilename)
        fname, ext = os.path.splitext(audiofilename)
        tonic_filename = fname + tonicExt
        if overwrite == 0 and os.path.isfile(tonic_filename):
            continue
        audio = ES.MonoLoader(filename=audiofilename)()
        tonic = ES.TonicIndianArtMusic()(audio)
        MBID = audiofilename.split("/")[-1].strip()
        print(MBID)
        with open(tonic_filename, 'w') as tonic_file:
            tonic_file.write(str(tonic) + "\n")

def featureExtraction(soundfiles):
    # extractor = esst.LowLevelSpectralExtractor()
    extractor = esst.Extractor(dynamics=False,
                               dynamicsFrameSize=88200,
                               dynamicsHopSize=44100,
                               highLevel=False,
                               lowLevel=True,
                               lowLevelFrameSize=2048,
                               lowLevelHopSize=1024,
                               midLevel=True,
                               namespace="",
                               relativeIoi=False,
                               rhythm=False,
                               sampleRate=44100,
                               tonalFrameSize=4096,
                               tonalHopSize=2048,
                               tuning=True)
    for file in soundfiles:
        path1 = '/Users/helena/Desktop/SMC/ASP/sms-tools/workspace/A10/code/downloaded/'
        # NOTE: this slice assumes a fixed-length path prefix of 70 characters
        name = file[70:-4] + '_features.json'
        outPath = path1 + 'features/' + name
        print(file)  # was a Python 2 print statement
        audioLoader = esst.MonoLoader(filename=file)
        audio = audioLoader()
        pool = essentia.Pool()
        pool = extractor(audio)
        aggPool = esst.PoolAggregator()(pool)
        output = esst.YamlOutput(filename=outPath, format='json')
        output(aggPool)
        print(outPath + ' exported')

def extract_for_one(wavDataDir, lineList, filename, FILE_EXT_WAV):
    filename_wav = os.path.join(wavDataDir, filename + FILE_EXT_WAV)
    filename_wav_silence_removed = os.path.join(wavDataDir + '_silence_removed', 'temp' + FILE_EXT_WAV)

    ##-- remove the silence from audio
    sr = 44100
    audio = ess.MonoLoader(filename=filename_wav, downmix='left', sampleRate=sr)()
    audio_remove_silence = removeSilence(audio, sr, lineList)
    wavfile.write(filename_wav_silence_removed, sr, audio_remove_silence)

    ##-- process the silence removed audio
    loader = essentia.streaming.EqloudLoader(filename=filename_wav_silence_removed)
    fEx = FeatureExtractor(frameSize=2048, hopSize=1024, sampleRate=loader.paramValue('sampleRate'))
    p = essentia.Pool()

    loader.audio >> fEx.signal
    for desc, output in fEx.outputs.items():
        output >> (p, desc)
    essentia.run(loader)

    # convert pitch from hz to cents
    for i in range(len(p['pitch_instantaneous_pitch'])):
        p['pitch_instantaneous_pitch'][i] = hz2cents(p['pitch_instantaneous_pitch'][i])

    stats = ['mean', 'var', 'dmean', 'dvar']
    statsPool = essentia.standard.PoolAggregator(defaultStats=stats)(p)
    return statsPool

def getFeatSequence(inputFile, pulsePos):
    audio = ess.MonoLoader(filename=inputFile, sampleRate=params.Fs)()
    frameCounter = 0
    pool = es.Pool()
    pool.add('samples', audio)
    for frame in ess.FrameGenerator(audio, frameSize=params.frmSize, hopSize=params.hop):
        ts = params.hop / params.Fs * frameCounter + params.frmSize / float(2 * params.Fs)
        zpFrame = np.hstack((frame, zz))  # zz: module-level zero-padding array
        mag = spec(window(zpFrame))
        mfccBands, mfccSeq = genmfcc(mag)
        pool.add('rms', rms(mag))
        pool.add('mfcc', mfccSeq)
        pool.add('time', ts)
        frameCounter += 1
    if pulsePos is not None:
        pulsePos = np.append(pulsePos, len(audio) / params.Fs)
        for tp in range(len(pulsePos) - 1):  # was Python 2 xrange
            pool.add('pst', pulsePos[tp])
            pool.add('pet', pulsePos[tp + 1])
            temp1 = np.where(pool['time'] >= pulsePos[tp])[0]
            temp2 = np.where(pool['time'] < pulsePos[tp + 1])[0]
            binIndices = np.intersect1d(temp1, temp2)
            pool.add('pmfcc', np.mean(pool['mfcc'][binIndices, :], axis=0))
            pool.add('prms', np.mean(pool['rms'][binIndices]))
    else:
        pool.add('pst', 0.0)
        pool.add('pet', len(audio) / params.Fs)
        pool.add('pmfcc', np.mean(pool['mfcc'], axis=0))
        pool.add('prms', np.mean(pool['rms'], axis=0))
    return pool

def extract(fname, outpath, fs=22050, fsize=1024, hsize=512):
    """
    extract(fname, outpath, fs, fsize, hsize) computes the MFCCs of the
    audio file fname.

    Inputs:
        fname   -- name of the audio file.
        outpath -- output path for processed files.
        fs      -- sampling frequency (Hz).
        fsize   -- size of each frame.
        hsize   -- hop size between frames.
    Outputs:
        np.ndarray of MFCC coefficients, one row per frame.
    """
    loader = es.MonoLoader(filename=fname, sampleRate=fs)
    w = es.Windowing(type='hann')
    spectrum = es.Spectrum()
    mfcc = es.MFCC(inputSize=fsize // 2 + 1, numberCoefficients=20)
    mfccs = []
    audio = loader()
    # use the fsize/hsize parameters (these were hardcoded to 1024/512)
    for frame in es.FrameGenerator(audio, frameSize=fsize, hopSize=hsize):
        mfcc_bands, mfcc_coeffs = mfcc(spectrum(w(frame)))
        mfccs.append(mfcc_coeffs)
    mfccs = np.array(mfccs)
    return mfccs  # was `return mfcc`, which returned the algorithm object

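# Quick usage sketch for extract; the function currently ignores outpath, so
# saving is left to the caller here (paths hypothetical, np/os as above).
mfccs = extract('clip.wav', outpath='features/')
np.save(os.path.join('features', 'clip_mfcc.npy'), mfccs)
print(mfccs.shape)  # (n_frames, 20)
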
def __init__(self, audio_file, mono=True, hop_length=512, sample_rate=44100, normalize_gain=False, verbose=False):
    """Load an audio file into an audio vector.

    Arguments:
        audio_file {str} -- path to the audio file to load

    Keyword Arguments:
        mono {bool} -- load as a mono signal (default: {True})
        hop_length {int} -- hop length for downstream analysis (default: {512})
        sample_rate {int} -- target sample rate (default: {44100})
        normalize_gain {bool} -- apply replay-gain normalization (default: {False})
        verbose {bool} -- print loading details (default: {False})
    """
    self.hop_length = hop_length
    self.fs = sample_rate
    self.audio_file = audio_file
    if normalize_gain:
        self.audio_vector = estd.EasyLoader(filename=audio_file, sampleRate=self.fs, replayGain=-9)()
    elif mono:
        self.audio_vector = estd.MonoLoader(filename=audio_file, sampleRate=self.fs)()
    # NOTE: if both normalize_gain and mono are False, audio_vector is never set
    if verbose:
        print("== Audio vector of %s loaded with shape %s and sample rate %s ==" %
              (audio_file, self.audio_vector.shape, self.fs))

def __init__(self, path):
    self.audio = es.MonoLoader(filename=str(path))()
    self.name = path.name
    self.pool = essentia.Pool()
    self._build_temporal_features()
    self._build_spectral_features()
    self._build_harmonic_features()
    self._build_mfcc()
    self._features = {
        'audio_correlation': 'AC',
        'audio_power': 'AP',
        'audio_waveform': 'AWF',
        'bandwidth': 'SB',
        'effective_duration': 'ED',
        'fundamental_freq': 'F0',
        'inharmonicity': 'INH',
        'log_attack_time': 'LAT',
        'max_freq': 'FMax',
        'mfcc': 'MFCC',
        'min_freq': 'FMin',
        'oer': 'OER',
        'peak_ampl': 'PA',
        'peak_freq': 'PF',
        'spectral_centroid': 'SC',
        'spectral_flatness': 'SF',
        'spectral_flux': 'SFX',
        'spectral_roll_off': 'SRO',
        'spectral_spread': 'SS',
        'temporal_centroid': 'TC',
        'tristimulus': 'T',
        'zcr': 'ZCR',
    }

def get_beat_chunks(filename, bpm_restrict=None):
    audio = std.MonoLoader(filename=filename)()
    hpcp = std.HPCP()
    spectrum = std.Spectrum()
    speaks = std.SpectralPeaks()
    large_speaks = std.SpectralPeaks(maxPeaks=2000)
    tivs = []
    sr = 44100
    bpm = get_tempo(filename)
    tivs_framewise = []
    if bpm_restrict is not None and bpm_restrict != bpm:
        raise ValueError("detected tempo %s does not match bpm_restrict %s" % (bpm, bpm_restrict))
    sec_beat = 60 / bpm
    beats = np.arange(0, len(audio) / sr, sec_beat)
    beats = np.append(beats, len(audio) / sr)
    for i in range(1, len(beats)):
        segmented_audio = audio[int(beats[i - 1] * sr):int(beats[i] * sr)]
        cutter = std.FrameGenerator(segmented_audio)
        for sec in cutter:
            spec = spectrum(sec)
            freq, mag = speaks(spec)
            chroma = hpcp(freq, mag)
            tivs_framewise.append(chroma)
        np2_seg_audio = zeropad_next_power_2(segmented_audio)
        spec = spectrum(np2_seg_audio)
        freq, mag = speaks(spec)
        chroma = hpcp(freq, mag)
        tivs.append(chroma)
    # Calculate the whole TIV
    np2_whole = zeropad_next_power_2(audio)
    spec = spectrum(np2_whole)
    freq, mag = large_speaks(spec)
    chroma_whole = hpcp(freq, mag)
    return (mt.TIVCollection.from_pcp(np.array(tivs).T),
            mt.TIV.from_pcp(chroma_whole),
            mt.TIVCollection.from_pcp(np.array(tivs_framewise).T))

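# Usage sketch for get_beat_chunks: unpack the per-beat TIVs, the whole-track
# TIV, and the framewise TIVs; a ValueError is raised when bpm_restrict does
# not match the detected tempo. The loop filename is hypothetical.
beat_tivs, whole_tiv, frame_tivs = get_beat_chunks('loop.wav', bpm_restrict=120)
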
def __get_signal__(self):
    """
    Reads the audio file at self.fpath and stores the signal and its
    length on the instance.
    """
    e_monoloader = e.MonoLoader(filename=self.fpath)
    self.signal = e_monoloader()
    self.signal_length = len(self.signal)

def computeEnergyHistogram(inputAudioFile, outputJsonFile, threshold, histograms):
    # NOTE: outputJsonFile is currently unused; histograms is updated in place
    M = 2048
    H = 1024
    fs = 44100
    energy = ess.Energy()
    x = ess.MonoLoader(filename=inputAudioFile, sampleRate=fs)()
    frames = ess.FrameGenerator(x, frameSize=M, hopSize=H, startFromZero=True)
    E = []
    numFrames = 0
    for frame in frames:
        numFrames += 1
        E_frame = energy(frame)
        E.append(E_frame)
    E = np.array(E)
    E_norm = E / np.max(E)
    for i in range(len(threshold)):
        t = threshold[i]
        # grow the histogram so it covers at least numFrames bins
        histograms[i] = np.append(histograms[i], [0] * (numFrames - len(histograms[i])))
        idx_threshold = np.where(E_norm > t)
        histograms[i][idx_threshold[0]] += 1

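# Usage sketch for computeEnergyHistogram: one histogram per threshold is
# accumulated in place across files, so the caller owns the list (filenames
# hypothetical).
thresholds = [0.1, 0.2, 0.5]
histograms = [np.array([]) for _ in thresholds]
for f in ['take1.wav', 'take2.wav']:
    computeEnergyHistogram(f, 'energy.json', thresholds, histograms)
# histograms[i][n] now counts how many files exceeded thresholds[i] at frame n
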
def featureExtraction(soundfiles):
    # extractor = esst.LowLevelSpectralExtractor()
    extractor = esst.Extractor(dynamics=True,
                               dynamicsFrameSize=88200,
                               dynamicsHopSize=44100,
                               highLevel=True,
                               lowLevel=True,
                               lowLevelFrameSize=2048,
                               lowLevelHopSize=1024,
                               midLevel=True,
                               namespace="",
                               relativeIoi=False,
                               rhythm=True,
                               sampleRate=44100,
                               tonalFrameSize=4096,
                               tonalHopSize=2048,
                               tuning=True)
    for file, outPath in soundfiles:
        audioLoader = esst.MonoLoader(filename=file)
        audio = audioLoader()
        pool = essentia.Pool()
        pool = extractor(audio)
        aggPool = esst.PoolAggregator()(pool)
        esst.YamlOutput(filename=outPath + 'features.json', format='json')(aggPool)
        print(file + ' exported')

def mfccFeature_audio(filename_wav, index_keep, feature_type='mfcc'):
    audio = ess.MonoLoader(downmix='left', filename=filename_wav, sampleRate=fs)()
    if feature_type == 'mfcc':
        feature = getFeature(audio)
    elif feature_type == 'mfccBands1D':
        feature = getMFCCBands1D(audio)
    elif feature_type == 'mfccBands2D':
        feature = getMFCCBands2D(audio, nbf=True)
    if feature_type == 'mfccBands1D' or feature_type == 'mfccBands2D':
        feature = np.log(100000 * feature + 1)
        scaler = pickle.load(open(kerasScaler_path, 'rb'))
        feature = scaler.transform(feature)
    # keep only the frames listed in index_keep
    feature_out = feature[index_keep[0], :]
    for index in index_keep[1:]:
        feature_out = np.vstack((feature_out, feature[index, :]))
    if feature_type == 'mfccBands2D':
        feature_out = featureReshape(feature_out)
    return feature_out

def load_audio(self):
    # loads the audio; note that MonoLoader does not apply the equal-loudness
    # filter recommended for PredominantPitchMelodia (EqloudLoader would)
    loader = es.MonoLoader(filename=self.filename, sampleRate=self.sample_rate)
    self.audio = loader()
    xvals = np.arange(len(self.audio)) / float(self.sample_rate)
    self.xlim = [0, max(xvals)]

def duration(infile):
    """
    Returns the duration of a song in seconds.
    """
    dur = standard.Duration()
    audio = standard.MonoLoader(filename=infile)()
    return dur(audio)

def get_number_beats(filename):
    audio = std.MonoLoader(filename=filename)()
    sr = 44100
    bpm = get_tempo(filename)
    sec_beat = 60 / bpm
    beats = np.arange(0, len(audio) / sr, sec_beat)
    beats = np.append(beats, len(audio) / sr)
    return len(beats)

def estimate_beats(infile):
    """
    Return the estimated beat onsets in seconds for an audio file.
    """
    audio = standard.MonoLoader(filename=infile)()
    bt = standard.BeatTrackerMultiFeature()
    beats, confidence = bt(audio)
    return beats

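# A small sketch combining estimate_beats() and duration() into a rough tempo
# estimate; the file path is hypothetical and np refers to numpy as elsewhere.
beats = estimate_beats('track.wav')
if len(beats) > 1:
    bpm = 60.0 / np.diff(beats).mean()  # mean inter-beat interval -> BPM
    print('%.1f BPM over %.1f s' % (bpm, duration('track.wav')))
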
def features(filename):
    wav, fs = soundfile.read(filename)
    audio = ess.MonoLoader(downmix='left', filename=filename, sampleRate=fs)()
    mrcg, d_MRCG, dd_MRCG = MRCG(audio, fs=fs)  # renamed from `features`, which shadowed the function
    print(mrcg)
    return mrcg, d_MRCG, dd_MRCG  # was a bare `return`, discarding the features

def extractor(filename):
    fs = 44100
    audio = ess.MonoLoader(filename=filename, sampleRate=fs)()
    # dynamic range expansion as done in HTK implementation
    audio = audio * 2**15

    frameSize = 1102  # corresponds to htk default WINDOWSIZE = 250000.0
    hopSize = 441     # corresponds to htk default TARGETRATE = 100000.0
    fftSize = 2048
    spectrumSize = fftSize // 2 + 1
    zeroPadding = fftSize - frameSize

    w = ess.Windowing(type='hamming',  # corresponds to htk default USEHAMMING = T
                      size=frameSize,
                      zeroPadding=zeroPadding,
                      normalized=False,
                      zeroPhase=False)
    spectrum = ess.Spectrum(size=fftSize)
    mfcc_htk = ess.MFCC(inputSize=spectrumSize,
                        type='magnitude',          # htk uses mel filterbank magnitude
                        warpingFormula='htkMel',   # htk's mel warping formula
                        weighting='linear',        # filter weights computed in the Hz domain
                        highFrequencyBound=8000,   # corresponds to htk default
                        lowFrequencyBound=0,       # corresponds to htk default
                        numberBands=26,            # corresponds to htk default NUMCHANS = 26
                        numberCoefficients=13,
                        normalize='unit_max',      # htk filter normalization: constant height = 1
                        dctType=3,                 # htk uses DCT type III
                        logType='log',
                        liftering=22)              # corresponds to htk default CEPLIFTER = 22

    mfccs = []
    # startFromZero=True, validFrameThresholdRatio=1: the way htk computes windows
    for frame in ess.FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize,
                                    startFromZero=True, validFrameThresholdRatio=1):
        spect = spectrum(w(frame))
        mel_bands, mfcc_coeffs = mfcc_htk(spect)
        mfccs.append(mfcc_coeffs)

    # transpose to have it in a better shape;
    # we need to convert the list to an essentia.array first (== numpy.array of floats)
    mfccs = essentia.array(mfccs).T

    # plot, ignoring the energy coefficient in row 0
    plt.imshow(mfccs[1:, :], aspect='auto', interpolation='none')
    plt.show()

def __init__(self, parent=None, **options):
    # Inputs:
    #   parent : parent QtCanvas
    # Options:
    #   width (int)  : width of plot
    #   height (int) : height of plot
    #   dpi (int)    : resolution of plot
    #   xlim (tuple) : (x0, x1)
    #   ylim (tuple) : (y0, y1)
    self._width = 5
    self._height = 5
    self._dpi = 100
    if "width" in options:
        self._width = options.get("width")
    if "height" in options:
        self._height = options.get("height")
    if "dpi" in options:
        self._dpi = options.get("dpi")
    if "filename" in options:
        self.filename = options.get("filename")
        self.audio = es.MonoLoader(filename=self.filename, sampleRate=44100)()
    else:
        self.filename = None
        self.audio = None
    self.is_playing = False
    self.play_rate = 44100  # use keys 0 to 9 to reduce speed from 100 to 90%

    # Set the x and y limits of the window
    self.xlim = (0, 20)
    self.ylim = (0, 20)

    # create figure and axes for plotting
    self.fig = Figure(figsize=(self._width, self._height), dpi=self._dpi)
    self.ax = self.fig.add_subplot(111)

    # Figure Canvas initialization
    FigureCanvas.__init__(self, self.fig)
    self.setParent(parent)
    self.setFocusPolicy(Qt.ClickFocus)
    self.setFocus()

    # initialize figure geometry
    FigureCanvas.setSizePolicy(self, QtWidgets.QSizePolicy.Expanding, QtWidgets.QSizePolicy.Expanding)
    FigureCanvas.updateGeometry(self)
    self.key_pressed_cid = self.fig.canvas.mpl_connect('key_press_event', self.on_key_press)
    self.show()

def pYINPtNote(filename1, fs=44100, frameSize=2048, hopSize=256):
    '''
    Given a filename, print the pitch track and note transcription track
    :param filename1:
    :param fs:
    :param frameSize:
    :param hopSize:
    :return:
    '''
    # initialise
    pYinInst = pYINmain.PyinMain()
    pYinInst.initialise(channels=1, inputSampleRate=fs, stepSize=hopSize, blockSize=frameSize,
                        lowAmp=0.25, onsetSensitivity=0.7, pruneThresh=0.1)

    # frame-wise calculation
    audio = ess.MonoLoader(filename=filename1, sampleRate=fs)()
    for frame in ess.FrameGenerator(audio, frameSize=frameSize, hopSize=hopSize):
        featureSet = pYinInst.process(frame)  # was assigned to `fs`, shadowing the sample rate

    # calculate smoothed pitch and mono note
    monoPitch = pYinInst.getSmoothedPitchTrack()

    # output smoothed pitch track
    print('pitch track')
    for ii in featureSet.m_oSmoothedPitchTrack:
        print(ii.values)
    print('\n')

    featureSet = pYinInst.getRemainingFeatures(monoPitch)

    # output of mono notes:
    # column 0: frame number
    # column 1: pitch in midi number (the decoded pitch)
    # column 2: attack 1, stable 2, silence 3
    print('mono note decoded pitch')
    for ii in featureSet.m_oMonoNoteOut:
        print(ii.frameNumber, ii.pitch, ii.noteState)
    print('\n')

    print('note pitch tracks')
    for ii in featureSet.m_oNotePitchTracks:
        print(ii)
    print('\n')

    # median pitch in Hz of the notes
    print('median note pitch')
    for ii in featureSet.m_oNotes:
        print(ii.values)
    print('\n')

def compute_all_features(audio_file, audio_beats=False):
    """Computes all the features for a specific audio file and its
    respective human annotations.

    Returns
    -------
    features : dict
        Dictionary with the following features:
            mfcc : np.array
                Mel Frequency Cepstral Coefficients representation
            hpcp : np.array
                Harmonic Pitch Class Profiles
            tonnetz : np.array
                Tonal Centroids (or Tonnetz)
    """
    # Makes sure the output features folder exists
    utils.ensure_dir(OUTPUT_FEATURES)
    features_file = os.path.join(OUTPUT_FEATURES, os.path.basename(audio_file) + ".json")

    # If already precomputed, read and return
    if os.path.exists(features_file):
        with open(features_file, "r") as f:
            features = json.load(f)
        return list_to_array(features)

    # Load Audio
    logging.info("Loading audio file %s" % os.path.basename(audio_file))
    audio = ES.MonoLoader(filename=audio_file, sampleRate=SAMPLE_RATE)()
    duration = len(audio) / float(SAMPLE_RATE)

    # Estimate Beats
    features = {}
    ticks, conf = compute_beats(audio)
    ticks = np.concatenate(([0], ticks, [duration]))  # Add first and last time
    ticks = essentia.array(np.unique(ticks))
    features["beats"] = ticks.tolist()

    # Compute Beat-sync features
    features["mfcc"], features["hpcp"], features["tonnetz"] = \
        compute_beatsync_features(ticks, audio)

    # Save output as audio file
    if audio_beats:
        logging.info("Saving Beats as an audio file")
        marker = ES.AudioOnsetsMarker(onsets=ticks, type='beep', sampleRate=SAMPLE_RATE)
        marked_audio = marker(audio)
        ES.MonoWriter(filename='beats.wav', sampleRate=SAMPLE_RATE)(marked_audio)

    # Save features
    with open(features_file, "w") as f:
        json.dump(features, f)

    return list_to_array(features)

def read_audio(self, audio_file):
    self.set_audio_file(audio_file)
    if self.normalize_gain:
        self.audio_vector = estd.EasyLoader(filename=audio_file, sampleRate=self.fs, replayGain=-9)()
    elif self.mono:
        self.audio_vector = estd.MonoLoader(filename=audio_file, sampleRate=self.fs)()

def piano_timing_features(anno_file, audio_file, latency, bpm, max_spectral_centroid=3500,
                          onset_threshold=2, series_delta=0.22, sample_rate=44100):
    bars, beats, events, chords = symbolic_analysis.rhythm_for_file(anno_file)
    beats = np.array(beats)
    events = np.array(events)
    is_defined = [x[0] != 'N' for x in chords]
    chords = chords[is_defined]
    events = events[is_defined]

    # LOAD AUDIO
    audio = ess.MonoLoader(filename=audio_file)()
    duration = float(len(audio)) / sample_rate
    half_ibi = (beats[1:] - beats[:-1]).mean() / 2
    start = max(events[0] - half_ibi, 0)
    end = min(events[-1] + half_ibi, duration)

    # ONSET DETECTION FUNCTION FROM AUDIO
    onset_func = ess.OnsetDetectionGlobal()(audio)

    # CHANGE SILENCE THRESHOLD DEPENDING ON THE BPM
    silence_th = 0.2
    if bpm >= 40 and bpm < 50:
        silence_th = 0.2
    elif bpm >= 50 and bpm < 60:
        silence_th = 0.15
    elif bpm >= 60 and bpm < 70:
        silence_th = 0.1
    elif bpm >= 70 and bpm < 80:
        silence_th = 0.05
    elif bpm >= 80:
        silence_th = 0.02

    # COMPUTE ONSETS FROM AUDIO
    onsets = np.array(list(ess.Onsets(alpha=1, silenceThreshold=silence_th)([onset_func], [1])))

    # COMPUTE DEVIATIONS BETWEEN ANNOTATION AND COMPUTED ONSETS
    devs = feature_extraction.attack_deviations(events, onsets, start, end)
    f, p, r = onset_measures(events, onsets, f_measure_threshold=0.25)
    features = {
        'onsets': onsets,
        'devs': devs,
        'f_measure': f,
        'precision': p,
        'recall': r
    }
    return features

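# Usage sketch for piano_timing_features; the annotation/audio paths and BPM
# value are hypothetical, and the keys follow the dict returned above.
feats = piano_timing_features('take1_anno.csv', 'take1.wav', latency=0.0, bpm=72)
print(feats['f_measure'], feats['precision'], feats['recall'])
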