def features(filename):
    # print '\t[1/5] loading audio'
    y, sr = librosa.load(filename, sr=SR)

    # print '\t[2/5] Separating harmonic and percussive signals'
    y_perc, y_harm = hp_sep(y)

    # print '\t[3/5] detecting beats'
    bpm, beats = get_beats(y=y_perc, sr=sr, hop_length=HOP_LENGTH)

    # print '\t[4/5] generating CQT'
    M1 = np.abs(librosa.cqt(y=y_harm,
                            sr=sr,
                            hop_length=HOP_LENGTH,
                            bins_per_octave=12,
                            fmin=librosa.midi_to_hz(24),
                            n_bins=72))
    M1 = librosa.logamplitude(M1 ** 2.0, ref_power=np.max)

    # print '\t[5/5] generating MFCC'
    S = librosa.feature.melspectrogram(y=y, sr=sr,
                                       hop_length=HOP_LENGTH,
                                       n_mels=N_MELS)
    M2 = librosa.feature.mfcc(S=librosa.logamplitude(S), n_mfcc=N_MFCC)

    n = min(M1.shape[1], M2.shape[1])
    beats = beats[beats < n]
    beats = np.unique(np.concatenate([[0], beats]))
    times = librosa.frames_to_time(beats, sr=sr, hop_length=HOP_LENGTH)
    times = np.concatenate([times, [float(len(y)) / sr]])

    M1 = librosa.feature.sync(M1, beats, aggregate=np.median)
    M2 = librosa.feature.sync(M2, beats, aggregate=np.mean)
    return (M1, M2), times
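# A minimal setup sketch for features() above: the constants and the
# hp_sep/get_beats helpers live elsewhere in this codebase, so the
# definitions below are illustrative assumptions, not the author's values.
import numpy as np
import librosa

SR = 22050        # assumed sample rate
HOP_LENGTH = 512  # assumed hop length
N_MELS = 128      # assumed number of mel bands
N_MFCC = 13       # assumed number of MFCCs

def hp_sep(y):
    # assumed wrapper around librosa's harmonic/percussive separation,
    # returning (percussive, harmonic) to match the call site above
    y_harm, y_perc = librosa.effects.hpss(y)
    return y_perc, y_harm

def get_beats(y, sr, hop_length):
    # assumed wrapper around librosa's beat tracker
    return librosa.beat.beat_track(y=y, sr=sr, hop_length=hop_length)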
def __test(y, top_db, ref, trim_duration):
    yt, idx = librosa.effects.trim(y, top_db=top_db, ref=ref)

    # Test for index position
    fidx = [slice(None)] * y.ndim
    fidx[-1] = slice(*idx.tolist())
    assert np.allclose(yt, y[tuple(fidx)])

    # Verify that the trimmed signal stays above the threshold
    rms = librosa.feature.rmse(y=librosa.to_mono(yt), center=False)
    logamp = librosa.logamplitude(rms**2, ref=ref, top_db=None)
    assert np.all(logamp > -top_db)

    # Verify that the regions outside the trim indices fall below it
    rms_all = librosa.feature.rmse(y=librosa.to_mono(y)).squeeze()
    logamp_all = librosa.logamplitude(rms_all**2, ref=ref, top_db=None)
    start = int(librosa.samples_to_frames(idx[0]))
    stop = int(librosa.samples_to_frames(idx[1]))
    assert np.all(logamp_all[:start] <= -top_db)
    assert np.all(logamp_all[stop:] <= -top_db)

    # Verify duration
    duration = librosa.get_duration(yt)
    assert np.allclose(duration, trim_duration, atol=1e-1), duration
def compute_features(audio, y_harmonic):
    """Computes the HPCP, MFCC, Tonnetz, CQT, and Tempogram features.

    Parameters
    ----------
    audio: np.array(N)
        Audio samples of the given input.
    y_harmonic: np.array(N)
        Harmonic part of the audio signal, in samples.

    Returns
    -------
    mfcc: np.array(N, msaf.Anal.mfcc_coeff)
        Mel-frequency Cepstral Coefficients.
    hpcp: np.array(N, 12)
        Pitch Class Profiles.
    tonnetz: np.array(N, 6)
        Tonal Centroid features.
    cqt: np.array(N, msaf.Anal.cqt_bins)
        Constant-Q log-scale features.
    tempogram: np.array(N, 192)
        Tempogram features.
    """
    logging.info("Computing Spectrogram...")
    S = librosa.feature.melspectrogram(audio,
                                       sr=msaf.Anal.sample_rate,
                                       n_fft=msaf.Anal.frame_size,
                                       hop_length=msaf.Anal.hop_size,
                                       n_mels=msaf.Anal.n_mels)

    logging.info("Computing Constant-Q...")
    cqt = librosa.logamplitude(np.abs(
        librosa.cqt(audio,
                    sr=msaf.Anal.sample_rate,
                    hop_length=msaf.Anal.hop_size,
                    n_bins=msaf.Anal.cqt_bins,
                    real=False)) ** 2,
        ref_power=np.max).T

    logging.info("Computing MFCCs...")
    log_S = librosa.logamplitude(S, ref_power=np.max)
    mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=msaf.Anal.mfcc_coeff).T

    logging.info("Computing HPCPs...")
    hpcp = librosa.feature.chroma_cqt(y=y_harmonic,
                                      sr=msaf.Anal.sample_rate,
                                      hop_length=msaf.Anal.hop_size,
                                      n_octaves=msaf.Anal.n_octaves,
                                      fmin=msaf.Anal.f_min).T

    logging.info("Computing Tonnetz...")
    tonnetz = utils.chroma_to_tonnetz(hpcp)

    logging.info("Computing Tempogram...")
    tempogram = librosa.feature.tempogram(audio,
                                          sr=msaf.Anal.sample_rate,
                                          hop_length=msaf.Anal.hop_size,
                                          win_length=192).T

    return mfcc, hpcp, tonnetz, cqt, tempogram
def do_cqt(src, track_id):
    SRC_cqt_L = librosa.logamplitude(
        librosa.cqt(src[0, :],
                    sr=CQT_CONST["sr"],
                    hop_length=CQT_CONST["hop_len"],
                    bins_per_octave=CQT_CONST["bins_per_octave"],
                    n_bins=CQT_CONST["n_bins"])**2,
        ref_power=1.0)
    SRC_cqt_R = librosa.logamplitude(
        librosa.cqt(src[1, :],
                    sr=CQT_CONST["sr"],
                    hop_length=CQT_CONST["hop_len"],
                    bins_per_octave=CQT_CONST["bins_per_octave"],
                    n_bins=CQT_CONST["n_bins"])**2,
        ref_power=1.0)
    np.save(PATH_CQT + str(track_id) + '.npy',
            np.dstack((SRC_cqt_L, SRC_cqt_R)))
    print "Done: %s" % str(track_id)
def process_one_file(audio_file, midi_file, output_midi_file, pair_file,
                     diagnostics_file):
    """
    Wrapper routine for loading in audio/MIDI data, aligning, and writing
    out the result.

    Parameters
    ----------
    audio_file, midi_file, output_midi_file, pair_file, diagnostics_file : str
        Paths to the audio file to align, MIDI file to align, and paths
        where to write the aligned MIDI, the synthesized pair file, and the
        DTW diagnostics file.
    """
    # Load in the audio data
    audio_data, _ = librosa.load(audio_file, sr=create_data.FS)
    # Compute the log-magnitude CQT of the data
    audio_cqt, audio_times = create_data.extract_cqt(audio_data)
    audio_cqt = librosa.logamplitude(audio_cqt, ref_power=audio_cqt.max()).T
    # Load and synthesize MIDI data
    midi_object = pretty_midi.PrettyMIDI(midi_file)
    midi_audio = midi_object.fluidsynth(fs=create_data.FS)
    # Compute log-magnitude CQT
    midi_cqt, midi_times = create_data.extract_cqt(midi_audio)
    midi_cqt = librosa.logamplitude(midi_cqt, ref_power=midi_cqt.max()).T
    # Compute cosine distance matrix
    distance_matrix = scipy.spatial.distance.cdist(
        midi_cqt, audio_cqt, 'cosine')
    # Get lowest cost path
    p, q, score = djitw.dtw(
        distance_matrix, GULLY, np.median(distance_matrix), inplace=False)
    # Normalize by path length
    score = score/len(p)
    # Normalize by distance matrix submatrix within path
    score = score/distance_matrix[p.min():p.max(), q.min():q.max()].mean()
    # Adjust the MIDI file
    midi_object.adjust_times(midi_times[p], audio_times[q])
    # Write the result
    midi_object.write(output_midi_file)
    # Synthesize aligned MIDI
    midi_audio_aligned = midi_object.fluidsynth(fs=create_data.FS)
    # Trim or zero-pad the synthesized audio to the same size as the audio
    if midi_audio_aligned.shape[0] > audio_data.shape[0]:
        midi_audio_aligned = midi_audio_aligned[:audio_data.shape[0]]
    else:
        pad_amount = audio_data.shape[0] - midi_audio_aligned.shape[0]
        midi_audio_aligned = np.append(midi_audio_aligned,
                                       np.zeros(pad_amount))
    # Stack one in each channel
    librosa.output.write_wav(
        pair_file, np.array([midi_audio_aligned, audio_data]),
        create_data.FS)
    # Write out diagnostics
    with open(diagnostics_file, 'wb') as f:
        json.dump({'p': list(p), 'q': list(q), 'score': score}, f)
def do_HPS_on_CQT(CQT, track_id):
    '''HPS on CQT

    input CQT: log-amplitude.
    '''
    CQT = 10**(0.05*CQT)  # log-amplitude --> linear magnitude (with ref_power=1.0)
    ret_H = np.zeros(CQT.shape)
    ret_P = np.zeros(CQT.shape)
    for depth_cqt in xrange(CQT.shape[2]):
        ret_H[:, :, depth_cqt], ret_P[:, :, depth_cqt] = \
            librosa.decompose.hpss(CQT[:, :, depth_cqt])
    np.save(PATH_CQT_H + str(track_id) + '.npy', librosa.logamplitude(ret_H))
    np.save(PATH_CQT_P + str(track_id) + '.npy', librosa.logamplitude(ret_P))
    print "Done: %d, HPS for CQT " % track_id
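# Sanity-check sketch for the dB round trip above: with ref_power=1.0,
# logamplitude of a power spectrogram is 10*log10(P), so 10**(0.05*dB)
# recovers the magnitude sqrt(P), which is what do_HPS_on_CQT feeds to
# hpss. Assumes an older librosa (<= 0.5) where logamplitude exists.
import numpy as np
import librosa

P = np.random.uniform(0.1, 1.0, (72, 100))  # fake power spectrogram
dB = librosa.logamplitude(P, ref_power=1.0, top_db=None)
assert np.allclose(10 ** (0.05 * dB), np.sqrt(P))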
def process_audio(infile):
    y, sr = librosa.load(infile, sr=SR)

    # 1. Compute magnitude spectrogram
    D = np.abs(librosa.stft(y, n_fft=N_FFT, hop_length=HOP))

    # 2. Compute HPSS
    Harm, Perc = hpss(y)

    # 3. Compute RPCA
    Lowrank, Sparse, _ = rpca.robust_pca(D, max_iter=RPCA_MAX_ITER)
    Lowrank = np.maximum(0.0, Lowrank)
    Sparse = np.maximum(0.0, Sparse)

    D = np.abs(D)**2
    Harm = np.abs(Harm)**2
    Perc = np.abs(Perc)**2
    Lowrank = np.abs(Lowrank)**2
    Sparse = np.abs(Sparse)**2

    S = librosa.feature.melspectrogram(S=librosa.logamplitude(D, ref_power=D.max()),
                                       sr=sr, n_mels=N_MELS, fmax=FMAX)
    Harm = librosa.feature.melspectrogram(S=librosa.logamplitude(Harm, ref_power=Harm.max()),
                                          sr=sr, n_mels=N_MELS, fmax=FMAX)
    Perc = librosa.feature.melspectrogram(S=librosa.logamplitude(Perc, ref_power=Perc.max()),
                                          sr=sr, n_mels=N_MELS, fmax=FMAX)
    Lowrank = librosa.feature.melspectrogram(S=librosa.logamplitude(Lowrank, ref_power=Lowrank.max()),
                                             sr=sr, n_mels=N_MELS, fmax=FMAX)
    Sparse = librosa.feature.melspectrogram(S=librosa.logamplitude(Sparse, ref_power=Sparse.max()),
                                            sr=sr, n_mels=N_MELS, fmax=FMAX)

    return S, Harm, Perc, Lowrank, Sparse
def compute_features(self):
    """Actual implementation of the features.

    Returns
    -------
    cqt: np.array(N, F)
        The features, each row representing a feature vector for a given
        time frame/beat.
    """
    linear_cqt = np.abs(librosa.cqt(self._audio,
                                    sr=self.sr,
                                    hop_length=self.hop_length,
                                    n_bins=self.n_bins,
                                    norm=self.norm,
                                    filter_scale=self.filter_scale,
                                    real=False)) ** 2
    cqt = librosa.logamplitude(linear_cqt, ref_power=self.ref_power).T
    return cqt
def compute_melgram(audio_path):
    '''Compute a mel-spectrogram and return it in a shape of
    (1, 1, 96, 1366), where 96 == #mel-bins and 1366 == #time frames.

    Parameters
    ----------
    audio_path: path for the audio file.
                Any format supported by audioread will work.
    More info: http://librosa.github.io/librosa/generated/librosa.core.load.html#librosa.core.load
    '''
    # mel-spectrogram parameters
    SR = 12000
    N_FFT = 512
    N_MELS = 96
    HOP_LEN = 256
    DURA = 29.12  # to make it 1366 frames

    src, sr = librosa.load(audio_path, sr=SR)  # whole signal
    n_sample = src.shape[0]
    n_sample_fit = int(DURA*SR)

    if n_sample < n_sample_fit:  # if too short
        src = np.hstack((src, np.zeros((int(DURA*SR) - n_sample,))))
    elif n_sample > n_sample_fit:  # if too long
        src = src[(n_sample-n_sample_fit)/2:(n_sample+n_sample_fit)/2]
    ret = librosa.logamplitude(librosa.feature.melspectrogram(y=src, sr=SR,
                                                              hop_length=HOP_LEN,
                                                              n_fft=N_FFT,
                                                              n_mels=N_MELS)**2,
                               ref_power=1.0)
    ret = ret[np.newaxis, np.newaxis, :]
    return ret
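# Usage sketch for compute_melgram(); 'example.mp3' is a placeholder path,
# not a file from this repository.
melgram = compute_melgram('example.mp3')
print(melgram.shape)  # expected: (1, 1, 96, 1366)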
def transform_audio(audio, n_fft=2048, n_mels=40, sr=22050, hop_length=512,
                    fmin=None, fmax=None):
    # MIDI values of 24 (C2) and 120 (C10) are chosen, since humans typically
    # can't hear much beyond this range.
    if not fmin:
        fmin = librosa.midi_to_hz(24)
    if not fmax:
        fmax = librosa.midi_to_hz(120)

    # First stage is a mel-frequency spectrogram of bounded range.
    mel = librosa.feature.melspectrogram(audio, sr=sr, n_fft=n_fft,
                                         hop_length=hop_length,
                                         n_mels=n_mels, fmax=fmax, fmin=fmin)

    # Second stage is log-amplitude; power is relative to peak in the signal.
    log_amplitude = librosa.logamplitude(mel, ref_power=np.max)

    # Third stage transposes the data so that frames become samples.
    # Its shape is:
    # (length of audio / frame duration, number of mel bands)
    transpose = np.transpose(log_amplitude)

    return (transpose, {'n_fft': n_fft,
                        'n_mels': n_mels,
                        'sr': sr,
                        'hop_length': hop_length,
                        'fmin': fmin,
                        'fmax': fmax})
def get_beat(y, PARAMETERS):
    '''Estimate beat times and tempo'''
    # Compute a log-power mel spectrogram on the percussive component
    S_p = librosa.feature.melspectrogram(y=y,
                                         sr=PARAMETERS['load']['sr'],
                                         n_fft=PARAMETERS['stft']['n_fft'],
                                         hop_length=PARAMETERS['beat']['hop_length'],
                                         n_mels=PARAMETERS['mel']['n_mels'],
                                         fmax=PARAMETERS['mel']['fmax'])
    S_p = librosa.logamplitude(S_p, ref_power=S_p.max())

    # Compute the median onset aggregation
    odf = librosa.onset.onset_strength(S=S_p, aggregate=np.median)

    # Get beats
    tempo, beats = librosa.beat.beat_track(onset_envelope=odf,
                                           sr=PARAMETERS['load']['sr'],
                                           hop_length=PARAMETERS['beat']['hop_length'])

    beat_times = librosa.frames_to_time(beats,
                                        sr=PARAMETERS['load']['sr'],
                                        hop_length=PARAMETERS['beat']['hop_length'])

    return tempo, beat_times, odf
def amplitude_for_file(audio_path):
    y, sr = librosa.load(audio_path)
    # from http://bmcfee.github.io/librosa/librosa.html#librosa.core.logamplitude
    # Get a power spectrogram from a waveform y
    S = np.abs(librosa.stft(y)) ** 2
    log_S = librosa.logamplitude(S)
    return log_S
def decompose(y, n_components=8):
    # How about something more advanced? Let's decompose a spectrogram with
    # NMF, and then resynthesize an individual component
    D = librosa.stft(y)

    # Separate the magnitude and phase
    S, phase = librosa.magphase(D)

    # Decompose by NMF
    components, activations = librosa.decompose.decompose(S, n_components,
                                                          sort=True)

    plt.figure(figsize=(12, 4))
    plt.subplot(1, 2, 1)
    librosa.display.specshow(librosa.logamplitude(components**2.0,
                                                  ref_power=np.max),
                             y_axis='log')
    plt.xlabel('Component')
    plt.ylabel('Frequency')
    plt.title('Components')

    plt.subplot(1, 2, 2)
    librosa.display.specshow(activations)
    plt.xlabel('Time')
    plt.ylabel('Component')
    plt.title('Activations')

    plt.tight_layout()
    plt.savefig('components_activations.png')

    print('components', components.shape)
    print('activations', activations.shape)

    return components, activations, phase
def analyzeAudios():
    # librosa API reference: http://bmcfee.github.io/librosa/
    audioNumber = 4
    filename = sorted(glob.glob(outputDir+'/*.'+audioTargetFormat))[audioNumber]
    print('"'+filename+'"')
    # sys.exit(0)  # debug leftover: exiting here would make everything below unreachable

    y, sr = librosa.load(filename)
    onsets = librosa.onset.onset_detect(y, sr)

    fileoutName = filename.replace('.'+audioTargetFormat, '.png')
    fileoutName = 'test.png'

    #%matplotlib inline
    seaborn.set(style='ticks')
    S = librosa.feature.melspectrogram(y, sr=sr, n_mels=128)
    log_S = librosa.logamplitude(S, ref_power=np.max)

    fig = plt.figure(figsize=(12, 4))
    ax = fig.add_subplot(211)
    ax.contourf(log_S)
    plt.title('mel power spectrogram')
    #ax.annotate('$->$',xy=(2.,-1),xycoords='data',
    #            xytext=(-150, -140), textcoords='offset points',
    #            bbox=dict(boxstyle="round", fc="0.8"),
    #            arrowprops=dict(arrowstyle="->",patchB=el,
    #                            connectionstyle="angle,angleA=90,angleB=0,rad=10"),)
    ax = fig.add_subplot(212)
    ax.plot(onsets)
    #plt.colorbar(format='%+02.0f dB')
    plt.tight_layout()
    #plt.show()
    plt.savefig(fileoutName, format='png', dpi=900)
    print(fileoutName)
def post_process_features(gram, beats):
    ''' Apply processing to a feature matrix given the supplied param values

    Parameters
    ----------
    gram : np.ndarray
        Feature matrix, shape (n_features, n_samples)
    beats : np.ndarray
        Indices of beat locations in gram

    Returns
    -------
    gram : np.ndarray
        Feature matrix, shape (n_samples, n_features), post-processed
        according to the values in `params`
    '''
    # Convert to chroma
    if params['feature'] == 'chroma':
        gram = librosa.feature.chroma_cqt(
            C=gram, fmin=librosa.midi_to_hz(create_data.NOTE_START))
    # Beat-synchronize the feature matrix
    if params['beat_sync']:
        gram = librosa.feature.sync(gram, beats, pad=False)
    # Compute log magnitude
    gram = librosa.logamplitude(gram, ref_power=gram.max())
    # Normalize the feature vectors
    gram = librosa.util.normalize(gram, norm=params['norm'])
    # Standardize the feature vectors
    if params['standardize']:
        gram = scipy.stats.mstats.zscore(gram, axis=1)
    # Transpose it to (n_samples, n_features) and return it
    return gram.T
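# post_process_features() reads a module-level `params` dict that is not
# shown in this snippet; a hypothetical example of its shape, with
# illustrative values only:
params = {
    'feature': 'chroma',   # or another value to skip the chroma conversion
    'beat_sync': True,     # beat-synchronize before taking log magnitude
    'norm': 2,             # passed to librosa.util.normalize
    'standardize': False,  # z-score each feature dimension if True
}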
def feature_extraction(y=None, fs=None, statistics=True, include_mfcc0=True,
                       include_delta=True, include_acceleration=True,
                       mfcc_params=None, delta_params=None,
                       acceleration_params=None):
    # Extract features, Mel Frequency Cepstral Coefficients
    eps = numpy.spacing(1)

    # Windowing function
    if mfcc_params['window'] == 'hamming_asymmetric':
        window = scipy.signal.hamming(mfcc_params['n_fft'], sym=False)
    elif mfcc_params['window'] == 'hamming_symmetric':
        window = scipy.signal.hamming(mfcc_params['n_fft'], sym=True)
    elif mfcc_params['window'] == 'hann_asymmetric':
        window = scipy.signal.hann(mfcc_params['n_fft'], sym=False)
    elif mfcc_params['window'] == 'hann_symmetric':
        window = scipy.signal.hann(mfcc_params['n_fft'], sym=True)
    else:
        window = None

    # Calculate Static Coefficients
    magnitude_spectrogram = numpy.abs(librosa.stft(y + eps,
                                                   n_fft=mfcc_params['n_fft'],
                                                   win_length=mfcc_params['win_length'],
                                                   hop_length=mfcc_params['hop_length'],
                                                   window=window))**2
    mel_basis = librosa.filters.mel(sr=fs,
                                    n_fft=mfcc_params['n_fft'],
                                    n_mels=mfcc_params['n_mels'],
                                    fmin=mfcc_params['fmin'],
                                    fmax=mfcc_params['fmax'],
                                    htk=mfcc_params['htk'])
    mel_spectrum = numpy.dot(mel_basis, magnitude_spectrogram)
    mfcc = librosa.feature.mfcc(S=librosa.logamplitude(mel_spectrum))

    # Collect the feature matrix
    feature_matrix = mfcc
    if include_delta:
        # Delta coefficients
        mfcc_delta = librosa.feature.delta(mfcc, **delta_params)

        # Add Delta Coefficients to feature matrix
        feature_matrix = numpy.vstack((feature_matrix, mfcc_delta))

    if include_acceleration:
        # Acceleration coefficients (aka delta-delta)
        mfcc_delta2 = librosa.feature.delta(mfcc, order=2,
                                            **acceleration_params)

        # Add Acceleration Coefficients to feature matrix
        feature_matrix = numpy.vstack((feature_matrix, mfcc_delta2))

    if not include_mfcc0:
        # Omit mfcc0
        feature_matrix = feature_matrix[1:, :]

    feature_matrix = feature_matrix.T

    # Collect into data structure
    if statistics:
        return {
            'feat': feature_matrix,
            'stat': {
                'mean': numpy.mean(feature_matrix, axis=0),
                'std': numpy.std(feature_matrix, axis=0),
                'N': feature_matrix.shape[0],
                'S1': numpy.sum(feature_matrix, axis=0),
                'S2': numpy.sum(feature_matrix ** 2, axis=0),
            }
        }
    else:
        return {
            'feat': feature_matrix}
def onsets(D):
    S = librosa.logamplitude(D)
    o = np.diff(S, axis=1)    # frame-to-frame dB increase
    o = np.maximum(0, o)      # keep only positive flux
    o = np.median(o, axis=0)  # aggregate across frequency
    o = o / o.max()           # normalize to [0, 1]
    return o
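# Usage sketch for onsets(): feed it a magnitude spectrogram. Assumes an
# older librosa where logamplitude and the bundled example audio exist.
import numpy as np
import librosa

y, sr = librosa.load(librosa.util.example_audio_file())
D = np.abs(librosa.stft(y))
envelope = onsets(D)  # median-aggregated onset envelope in [0, 1]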
def delta_features(lowlevel):
    '''Log-mel power delta features'''
    M0 = librosa.logamplitude(lowlevel['mel_spectrogram'])
    M1 = librosa.feature.delta(M0)
    M2 = librosa.feature.delta(M1)

    return np.vstack([M0, M1, M2])
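# Sketch of the `lowlevel` dict expected by delta_features(); the
# 'mel_spectrogram' key comes from the function body, everything else here
# is an assumption.
import librosa

y, sr = librosa.load(librosa.util.example_audio_file())
lowlevel = {'mel_spectrogram': librosa.feature.melspectrogram(y=y, sr=sr)}
stacked = delta_features(lowlevel)  # shape: (3 * n_mels, n_frames)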
def analyze_frames(y, sr, debug=False):
    A = {}

    hop_length = 128

    # First, get the track duration
    A['duration'] = float(len(y)) / sr

    # Then, get the beats
    if debug: print "> beat tracking"
    tempo, beats = librosa.beat.beat_track(y, sr, hop_length=hop_length)

    # Push the last frame as a phantom beat
    A['tempo'] = tempo
    A['beats'] = librosa.frames_to_time(beats, sr,
                                        hop_length=hop_length).tolist()

    if debug: print "beats count: ", len(A['beats'])

    if debug: print "> spectrogram"
    S = librosa.feature.melspectrogram(y, sr,
                                       n_fft=2048,
                                       hop_length=hop_length,
                                       n_mels=80,
                                       fmax=8000)
    S = S / S.max()

    # A['spectrogram'] = librosa.logamplitude(librosa.feature.sync(S, beats)**2).T.tolist()

    # Let's make some beat-synchronous mfccs
    if debug: print "> mfcc"
    S = librosa.feature.mfcc(librosa.logamplitude(S), n_mfcc=40)
    A['timbres'] = librosa.feature.sync(S, beats).T.tolist()

    if debug: print "timbres count: ", len(A['timbres'])

    # And some chroma
    if debug: print "> chroma"
    S = N.abs(librosa.stft(y, hop_length=hop_length))

    # Grab the harmonic component
    H = librosa.decompose.hpss(S)[0]
    # H = librosa.hpss.hpss_median(S, win_P=31, win_H=31, p=1.0)[0]
    A['chroma'] = librosa.feature.sync(librosa.feature.chromagram(S=H, sr=sr),
                                       beats,
                                       aggregate=N.median).T.tolist()

    # Relative loudness
    S = S / S.max()
    S = S**2

    if debug: print "> dists"
    dists = structure(N.vstack([N.array(A['timbres']).T,
                                N.array(A['chroma']).T]))
    A['dense_dist'] = dists

    edge_lens = [A["beats"][i] - A["beats"][i - 1]
                 for i in xrange(1, len(A["beats"]))]
    A["avg_beat_duration"] = N.mean(edge_lens)

    return A
def process(self, filename):
    y, sr = librosa.load(filename, 16000)

    # Let's make and display a mel-scaled power (energy-squared) spectrogram.
    # We use a small hop length of 64 here so that the frames line up with
    # the beat tracker example below.
    S = librosa.feature.melspectrogram(y, sr=sr, n_fft=2048, hop_length=64,
                                       n_mels=128)

    # Convert to log scale (dB). We'll use the peak power as reference.
    log_S = librosa.logamplitude(S, ref_power=np.max)

    # Make a new figure
    plt.figure(figsize=(12, 4))

    # Display the spectrogram on a mel scale
    # sample rate and hop length parameters are used to render the time axis
    librosa.display.specshow(log_S, sr=sr, hop_length=64,
                             x_axis='time', y_axis='mel')

    # Put a descriptive title on the plot
    plt.title('mel power spectrogram')

    # draw a color bar
    plt.colorbar(format='%+02.0f dB')

    # Make the figure layout compact
    # plt.tight_layout()

    # Next, we'll extract the top 20 Mel-frequency cepstral coefficients (MFCCs)
    mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=20)

    # Let's pad on the first and second deltas while we're at it
    delta_mfcc = librosa.feature.delta(mfcc)
    delta2_mfcc = librosa.feature.delta(mfcc, order=2)

    # How do they look? We'll show each in its own subplot
    plt.figure(figsize=(12, 6))

    plt.subplot(3, 1, 1)
    librosa.display.specshow(mfcc)
    plt.ylabel('MFCC')
    plt.colorbar()

    plt.subplot(3, 1, 2)
    librosa.display.specshow(delta_mfcc)
    plt.ylabel('MFCC-$\Delta$')
    plt.colorbar()

    plt.subplot(3, 1, 3)
    librosa.display.specshow(delta2_mfcc, sr=sr, hop_length=64, x_axis='time')
    plt.ylabel('MFCC-$\Delta^2$')
    plt.colorbar()

    # plt.tight_layout()

    # For future use, we'll stack these together into one matrix
    M = np.vstack([mfcc, delta_mfcc, delta2_mfcc])

    plt.show()
    return mfcc
def show_feature_superimposed(sound_files, genre, feature, binsize=1024,
                              plot_on="waveform"):
    wavedata = sound_files[genre]["wavedata"]
    samplerate = sound_files[genre]["samplerate"]
    timestamps = sound_files[genre]["%s_timestamp" % (feature)]
    feature_data = sound_files[genre][feature]

    # TODO: debug scale and remove if possible
    if feature == "sc":
        scale = 250.0
    elif feature == "zcr":
        scale = 1000.0
    elif feature == "rms":
        scale = 1000.0
    elif feature == "sr":
        scale = 250.0
    elif feature == "sf":
        scale = 250.0

    # plot feature-data
    scaled_fd_y = timestamps * scale

    win = np.hanning(binsize)

    if len(wavedata.shape) > 1:
        wavedata = wavedata[:, 0]

    D = lr.core.stft(wavedata, n_fft=binsize, window=win)

    fig, ax = plt.subplots(2, 1, sharex=False,
                           figsize=(PLOT_WIDTH, 7), sharey=True)

    # show spectrogram
    plt.subplot(2, 1, 1)
    lr.display.specshow(lr.logamplitude(np.abs(D)**2, ref_power=np.max),
                        sr=samplerate*2, y_axis='log', x_axis='time')

    if plot_on == "spectrogram":
        scaled_fd_x = feature_data
        _ = plt.plot(scaled_fd_y, scaled_fd_x, color='r', linewidth=1)
        # ax = plt.gca().set_yscale("log")

    # show waveform
    plt.subplot(2, 1, 2)
    lr.display.waveplot(normalize_wav(wavedata), sr=samplerate, alpha=0.75)

    if plot_on == "waveform":
        scaled_fd_x = (feature_data / np.max(feature_data))
        _ = plt.plot(scaled_fd_y, scaled_fd_x, color='r', linewidth=1)

    ax = plt.gca()
    ax.axhline(y=0, c="green", linewidth=3, zorder=0)

    plt.tight_layout()
    plt.show()
    plt.clf()
def compute_features(audio_file, intervals, level):
    """Computes the subseg-sync cqt features from the given audio file, if
    they are not previously computed. Saves the results in the feat_dir
    folder.

    Parameters
    ----------
    audio_file : str
        Path to the audio file.
    intervals : np.array
        Intervals containing the estimated boundaries.
    level : str
        Level in the hierarchy.

    Returns
    -------
    cqgram : np.array
        Subseg-sync constant-Q power spectrogram.
    intframes : np.array
        The frame indices.
    """
    # Check if features have already been computed
    if level == "small_scale":
        features_file = os.path.join(
            features_dir,
            os.path.basename(audio_file).split('.')[0] + "_small_scale.mp3.pk")
    else:
        features_file = os.path.join(features_dir,
                                     os.path.basename(audio_file) + ".pk")
    if os.path.isfile(features_file):
        return read_features(features_file)

    y, sr = librosa.load(audio_file, sr=11025)

    # Default hopsize is 512
    hopsize = 512
    cqgram = librosa.logamplitude(
        librosa.cqt(y, sr=sr, hop_length=hopsize)**2, ref_power=np.max)

    # Track beats
    y_harmonic, y_percussive = librosa.effects.hpss(y)
    tempo, beats = librosa.beat.beat_track(y=y_percussive, sr=sr,
                                           hop_length=hopsize)

    # Synchronize
    cqgram = librosa.feature.sync(cqgram, beats, aggregate=np.median)

    intframes = None
    if intervals is not None:
        # Convert intervals to frames
        intframes = librosa.time_to_frames(intervals, sr=sr,
                                           hop_length=hopsize)

        # Match intervals to subseg points
        intframes = librosa.util.match_events(intframes, beats)

    # Save the features
    save_features(cqgram, intframes, beats, features_file)

    return cqgram, intframes
def single_file_featurization(wavfile):
    '''
    INPUT:
    wavfile - path to the audio sample (e.g. the 'audio_slice_name' column
    of the dataframe)

    OUTPUT:
    feature vector for the audio sample

    Function for dataframe apply for extracting each audio sample into a
    feature vector of MFCC coefficients
    '''
    try:
        # load the raw audio .wav file as a matrix using librosa
        wav_mat, sr = lr.load(wavfile, sr=sample_rate)

        # create the spectrogram using the predefined variables for MFCC extraction
        S = lr.feature.melspectrogram(wav_mat, sr=sr, n_mels=n_filters,
                                      fmax=sr/2, n_fft=window, hop_length=hop)

        # using the pre-defined spectrogram, extract the mfcc coefficients
        mfcc = lr.feature.mfcc(S=lr.logamplitude(S), n_mfcc=25)

        # calculate the first and second derivatives of the mfcc coefficients
        # to detect changes and patterns
        mfcc_delta = lr.feature.delta(mfcc)
        mfcc_delta = mfcc_delta.T
        mfcc_delta2 = lr.feature.delta(mfcc, order=2)
        mfcc_delta2 = mfcc_delta2.T
        mfcc = mfcc.T

        # combine the mfcc coefficients and their derivatives in a column stack
        total_mfcc = np.column_stack((mfcc, mfcc_delta, mfcc_delta2))

        # use the average of each column to condense into a feature vector;
        # this makes each sample uniform regardless of the length of the
        # original audio sample
        # the following features are extracted:
        # - avg of mfcc, first derivative, second derivative
        # - var of mfcc, first derivative, second derivative
        # - max of mfcc
        # - min of mfcc
        # - median of mfcc
        # - skew of mfcc
        # - kurtosis of mfcc
        avg_mfcc = np.mean(total_mfcc, axis=0)
        var_mfcc = np.var(total_mfcc, axis=0)
        max_mfcc = np.max(mfcc, axis=0)
        min_mfcc = np.min(mfcc, axis=0)
        med_mfcc = np.median(mfcc, axis=0)
        skew_mfcc = skew(mfcc, axis=0)
        kurt_mfcc = kurtosis(mfcc, axis=0)  # was skew(); fixed to match the comment above

        # combine into one vector and append to the total feature matrix
        return np.concatenate((avg_mfcc, var_mfcc, max_mfcc, min_mfcc,
                               med_mfcc, skew_mfcc, kurt_mfcc))
    except Exception:
        print "Uhmmm something bad happened"
        # the zero vector must match the feature length:
        # 2 * 75 (avg, var over mfcc + deltas) + 5 * 25 (max, min, med, skew, kurt)
        return np.zeros(275)
def plot_spect(spec):
    plt.figure(figsize=(12, 8))
    nb = len(spec)
    i = 0
    for s in spec:
        i += 1
        plt.subplot(nb, 1, i)
        D = librosa.logamplitude(np.abs(s)**2, ref_power=np.max)
        librosa.display.specshow(D, y_axis='log', x_axis='time')
    plt.show()
def wiener_enhance(target, accomp, thresh=-6, transit=3, n_fft=2048):
    '''
    Given a noisy signal and a signal which approximates the noise, try to
    remove the noise.

    Input:
        target - Noisy signal
        accomp - Approximate noise
        thresh - Sigmoid threshold, default -6
        transit - Sigmoid transition, default 3
        n_fft - FFT length, default 2048 (hop is always n_fft/4)
    Output:
        filtered - Target, Wiener filtered to try to remove noise
    '''
    target_spec = librosa.stft(target, n_fft=n_fft, hop_length=n_fft/4)
    accomp_spec = librosa.stft(accomp, n_fft=n_fft, hop_length=n_fft/4)
    spec_ratio = librosa.logamplitude(target_spec) - librosa.logamplitude(accomp_spec)
    spec_ratio = (spec_ratio - thresh)/transit
    mask = 0.5 + 0.5*(spec_ratio/np.sqrt(1 + spec_ratio**2))
    return librosa.istft(target_spec*mask, hop_length=n_fft/4)
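# Usage sketch for wiener_enhance(): suppress an accompaniment estimate
# from a mixture. The file names are placeholders.
import librosa

mix, sr = librosa.load('mixture.wav')
accomp, _ = librosa.load('accompaniment.wav', sr=sr)
n = min(len(mix), len(accomp))       # align lengths before filtering
vocals = wiener_enhance(mix[:n], accomp[:n])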
def get_beat_mfccs(filename):
    y, sr = librosa.load(filename)

    S = librosa.feature.melspectrogram(y, sr, n_fft=2048, hop_length=64,
                                       n_mels=128, fmax=8000)

    tempo, beats = librosa.beat.beat_track(y=y, sr=sr, hop_length=64)

    M = librosa.feature.mfcc(librosa.logamplitude(S), n_mfcc=32)
    M = librosa.feature.sync(M, beats)

    return M
def export_image(args):
    feat, tpl_info, h_key, layer = args
    harmonic_ins = tpl_info.harmonic_ins
    harmonic_chords = tpl_info.harmonic_chords

    sub_folder = '%d_%d/' % (layer, feat)
    path_deconv_results = 'results/'
    path_img_results = 'images/'
    if not os.path.exists(path_img_results):
        os.makedirs(path_img_results)
    path_img_out = '%s%s/' % (path_img_results, sub_folder)
    path_img_out2 = '%slayer-%d/' % (path_img_results, layer)
    if not os.path.exists(path_img_out):
        os.makedirs(path_img_out)
    if not os.path.exists(path_img_out2):
        os.makedirs(path_img_out2)

    img_name = '%d_%d_%s.png' % (layer, feat, h_key)
    if os.path.exists(path_img_out2 + img_name):
        return
    wav_name_suffix = '_deconved_from_depth_%d_feature_%d' % (layer, feat)

    fig, axes = plt.subplots(nrows=len(harmonic_ins),
                             ncols=len(harmonic_chords),
                             sharex='col', sharey='row')
    for inst_idx, h_inst in enumerate(harmonic_ins):
        for chord_idx, h_chord in enumerate(harmonic_chords):
            ax = axes[inst_idx][chord_idx]
            segment_name = '%s_%s_%s' % (h_key, h_inst, h_chord)
            path_wav = '%s%s/' % (path_deconv_results, segment_name)
            filename_wav = segment_name + wav_name_suffix
            src_here, sr = librosa.load(path_wav + filename_wav + '.wav',
                                        sr=SAMPLE_RATE, mono=True)
            SRC = librosa.stft(src_here, n_fft=N_FFT, hop_length=N_FFT/2)
            ax.imshow(librosa.logamplitude(np.flipud(np.abs(SRC))),
                      aspect=200)
            ax.set_xticks([], [])
            ax.set_yticks([], [])
            ax.axis('auto')
            if chord_idx == 0:
                ax.set_ylabel(harmonic_ins[inst_idx][:6])
            if inst_idx == len(harmonic_ins) - 1:
                ax.set_xlabel(harmonic_chords[chord_idx])

    fig.savefig(os.path.join(path_img_out, img_name), dpi=200,
                bbox_inches='tight')
    fig.savefig(os.path.join(path_img_out2, img_name), dpi=200,
                bbox_inches='tight')
    plt.close(fig)
    print '%s: done' % img_name
    return
def chroma(y):
    # Build the wrapper
    CQT = np.abs(librosa.cqt(y,
                             sr=SR,
                             resolution=NOTE_RES,
                             hop_length=HOP_LENGTH,
                             fmin=NOTE_MIN,
                             n_bins=NOTE_NUM))

    C_to_Chr = librosa.filters.cq_to_chroma(CQT.shape[0], n_chroma=N_CHROMA)

    return librosa.logamplitude(librosa.util.normalize(C_to_Chr.dot(CQT)))
def analyzeAudios2():
    filenames = sorted(glob.glob(outputDir+'/*.'+audioTargetFormat))
    for filename in filenames:
        # skip files that are themselves analysis outputs
        # (the original `continue` only affected the inner suffix loop,
        # so these files were never actually skipped)
        if any(re.search(r'\.'+isuffix+r'\.'+audioTargetFormat+'$', filename)
               for isuffix in ['harmonic', 'percussive', 'mfcc']):
            continue
        print(filename)
        y, sr = librosa.load(filename)
        #lenY=len(y)
        #idx1=min(int(20*sr),lenY)
        #idx2=min(int(24*sr),lenY)
        #y = y[idx1:idx2]
        #y_harmonic, y_percussive = librosa.effects.hpss(y)
        S = librosa.feature.melspectrogram(y, sr=sr, n_mels=128)
        log_S = librosa.logamplitude(S, ref_power=np.max)

        seaborn.set(style='ticks')
        fileoutName = filename.replace('.'+audioTargetFormat, '.melpower.png')
        plt.figure(figsize=(12, 4))
        librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel')
        plt.title('mel power spectrogram')
        plt.colorbar(format='%+02.0f dB')
        plt.tight_layout()
        plt.savefig(fileoutName, format='png', dpi=300)

        # Next, we'll extract the top 13 Mel-frequency cepstral coefficients (MFCCs)
        mfcc = librosa.feature.mfcc(S=log_S, n_mfcc=13)
        delta_mfcc = librosa.feature.delta(mfcc)
        delta2_mfcc = librosa.feature.delta(mfcc, order=2)

        fileoutName = filename.replace('.'+audioTargetFormat, '.melcoeff.png')
        plt.figure(figsize=(12, 6))
        plt.subplot(3, 1, 1)
        librosa.display.specshow(mfcc)
        plt.ylabel('MFCC')
        plt.colorbar()
        plt.subplot(3, 1, 2)
        librosa.display.specshow(delta_mfcc)
        plt.ylabel('MFCC-$\Delta$')
        plt.colorbar()
        plt.subplot(3, 1, 3)
        librosa.display.specshow(delta2_mfcc, sr=sr, x_axis='time')
        plt.ylabel('MFCC-$\Delta^2$')
        plt.colorbar()
        plt.tight_layout()
        plt.savefig(fileoutName, format='png', dpi=300)

        # For future use, we'll stack these together into one matrix
        M = np.vstack([mfcc, delta_mfcc, delta2_mfcc])
def get_mfcc(y):
    # Generate a mel-spectrogram
    S = librosa.feature.melspectrogram(y, sr,
                                       n_fft=N_FFT,
                                       hop_length=HOP_LENGTH,
                                       n_mels=N_MELS,
                                       fmax=FMAX).astype(np.float32)

    # Put on a log scale
    S = librosa.logamplitude(S, ref_power=S.max())

    return librosa.feature.mfcc(S=S, n_mfcc=N_MFCC)
def example_librosa():
    import matplotlib.pyplot as plt
    import specplotting
    import librosa

    audio_path = "./scratch/lab1-resources/gas_station.wav"
    sample_rate, s_in = read_wav_audio(audio_path)
    lr_y, lr_sr = librosa.load(audio_path, 16000)
    print("Librosa sr: ", lr_sr)

    samples = len(s_in)
    print("The file is %d samples long" % samples)
    print('The sample rate is %d Hz' % sample_rate)
    ms_per_sec = 1000.0
    milliseconds = samples / sample_rate * ms_per_sec
    print('The file is %d milliseconds long' % milliseconds)

    inp = np.reshape(s_in, [1, s_in.shape[0], 1])
    inplens = np.array([s_in.shape[0]])

    g = tf.Graph()
    with g.as_default():
        raw_waveforms = tf.placeholder(tf.float64, [None, None, 1],
                                       name="raw_waveforms")
        raw_waveform_lengths = tf.placeholder(tf.int32, [None],
                                              name="raw_waveform_lengths")
        N_fft = 512
        audio = AudioPreprocessing(raw_waveforms, raw_waveform_lengths,
                                   16000, 25.0, 10.0, N_fft=N_fft,
                                   channels=1)

        print(audio.frame_length_py)
        print(audio.N_fft_py)
        print(audio.frame_shift_py)

        S = librosa.core.stft(lr_y, n_fft=N_fft,
                              hop_length=audio.frame_shift_py,
                              win_length=audio.frame_length_py,
                              window="hamming", center=True,
                              pad_mode="constant")
        print(S.shape)
        # S = librosa.feature.melspectrogram(S=S, sr=lr_sr, n_mels=23)
        ref_fbank = librosa.logamplitude(S, amin=10**(-50)).T

        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            feed_dict = {
                g.get_tensor_by_name("raw_waveforms:0"): inp,
                g.get_tensor_by_name("raw_waveform_lengths:0"): inplens,
            }
            out = sess.run(
                {
                    "filterbank": audio.s_pe.log_magnitude_spectrogram,
                    # audio.s_pe.log_mel_fbank_features,
                },
                feed_dict=feed_dict)

    print(out["filterbank"][0, :, :, 0].shape)
    specplotting.plot_spec(out["filterbank"][0, :, :, 0],
                           sample_rate=sample_rate,
                           title="Mel Filterbank Energies (dB)")
    plt.ylabel("Feature")
    plt.show()
    specplotting.plot_spec(ref_fbank, sample_rate=sample_rate,
                           title="Librosa Ref Mel Filterbank Energies (dB)")
    plt.ylabel("Feature")
    plt.show()
def build_datasets(train_percentage=0.8, preproc=False):
    if preproc:
        path = "Preproc3/"
    else:
        path = "../Music/"

    # TODO : replace by csv.get_tags("annotations_subset.csv")
    # class_names = get_class_names(path=path)
    # print("class_names = ", class_names)
    class_names = csv.get_tags()
    print("class_names = ", class_names)

    # TODO : rewrite get_total_files
    # total_files, total_train, total_test = get_total_files(path=path, train_percentage=train_percentage)
    # print("total files = ", total_files)
    #
    # nb_classes = len(class_names)

    # pre-allocate memory for speed (old method used np.concatenate, slow)
    mel_dims = get_sample_dimensions(path=path)  # Find out the 'shape' of each data file

    filelist = csv.get_total_files()  # TODO : return file list
    filelist_train = filelist[1:1000]
    filelist_test = filelist[1000:1100]
    filelist_train_test = filelist[1:1100]
    total_train = len(filelist_train)
    total_test = len(filelist_test)
    nb_classes = len(csv.get_tags())

    X_train = np.zeros((total_train, mel_dims[1], mel_dims[2], mel_dims[3]))
    Y_train = np.zeros((total_train, nb_classes))
    X_test = np.zeros((total_test, mel_dims[1], mel_dims[2], mel_dims[3]))
    Y_test = np.zeros((total_test, nb_classes))
    paths_train = []
    paths_test = []

    train_count = 0
    test_count = 0
    for idx, file in enumerate(filelist_train_test):
        this_Y = np.array(csv.get_tag_np_vector(idx))  # TODO: return np.array (dim = tag number)
        audio_path = path + csv.get_file_path(idx)
        n_files = len(filelist_train_test)
        n_load = n_files
        n_train = int(train_percentage * n_load)
        printevery = 100

        if (0 == idx % printevery):
            print('\r Loading file: {:14s} ({:2d} of {:2d} classes)'.format(
                file, idx + 1, nb_classes),
                ", file ", idx + 1, " of ", n_load, ": ", audio_path, sep="")
        # start = timer()
        if (preproc):
            melgram = np.load(audio_path + ".npy")
            sr = 44100
        else:
            aud, sr = librosa.load(audio_path, mono=mono, sr=None)
            melgram = librosa.logamplitude(
                librosa.feature.melspectrogram(aud, sr=sr, n_mels=96),
                ref_power=1.0)[np.newaxis, np.newaxis, :, :]

        # just in case files are different sizes: clip to first file size
        melgram = melgram[:, :, :, 0:mel_dims[3]]
        # end = timer()
        # print("time = ", end - start)

        if (idx < total_train):
            # concatenate is SLOW for big datasets; use pre-allocated instead
            # X_train = np.concatenate((X_train, melgram), axis=0)
            # Y_train = np.concatenate((Y_train, this_Y), axis=0)
            X_train[train_count, :, :] = melgram
            Y_train[train_count, :] = this_Y
            paths_train.append(audio_path)  # list-appending is still fast. (??)
            train_count += 1
        else:
            X_test[test_count, :, :] = melgram
            Y_test[test_count, :] = this_Y
            # X_test = np.concatenate((X_test, melgram), axis=0)
            # Y_test = np.concatenate((Y_test, this_Y), axis=0)
            paths_test.append(audio_path)
            test_count += 1

    print("Shuffling order of data...")
    X_train, Y_train, paths_train = shuffle_XY_paths(X_train, Y_train,
                                                     paths_train)
    X_test, Y_test, paths_test = shuffle_XY_paths(X_test, Y_test, paths_test)

    return X_train, Y_train, paths_train, X_test, Y_test, paths_test, \
        class_names, sr
def mfcc(self, audio_raw, plot=False):
    """Static MFCC

    Parameters
    ----------
    audio_raw : numpy.ndarray
        Audio data

    Returns
    -------
    numpy.ndarray
        Feature matrix for the first audio channel (multi-channel
        processing is currently commented out).
    """
    window = self._window_function(
        N=self.parameters['general'].get('win_length_samples'),
        window_type=self.parameters['mfcc'].get('window'))
    mel_basis = librosa.filters.mel(
        sr=self.parameters['general'].get('fs'),
        n_fft=self.parameters['mfcc'].get('n_fft'),
        n_mels=self.parameters['mfcc'].get('n_mels'),
        fmin=self.parameters['mfcc'].get('fmin'),
        fmax=self.parameters['mfcc'].get('fmax'),
        htk=self.parameters['mfcc'].get('htk'))

    if self.parameters['mfcc'].get('normalize_mel_bands'):
        mel_basis /= numpy.max(mel_basis, axis=-1)[:, None]

    # feature_matrix = []
    # for channel in range(0, audio_raw.shape[0]):
    channel = 0

    # Calculate Static Coefficients
    spectrogram_ = self._spectrogram(
        y=audio_raw[channel, :],
        n_fft=self.parameters['mfcc'].get('n_fft'),
        win_length_samples=self.parameters['general'].get('win_length_samples'),
        hop_length_samples=self.parameters['general'].get('hop_length_samples'),
        spectrogram_type=self.parameters['mfcc'].get('spectrogram_type')
        if 'spectrogram_type' in self.parameters['mfcc'] else 'power',
        center=True,
        window=window)
    mel_spectrum = numpy.dot(mel_basis, spectrogram_)  # shape=(d, t)
    mfcc = librosa.feature.mfcc(
        S=librosa.logamplitude(mel_spectrum),
        n_mfcc=self.parameters['mfcc'].get('n_mfcc'))

    mfcc = mfcc.T
    # feature_matrix.append(mfcc.T)

    if plot:
        import matplotlib.pyplot as plt
        plt.subplot(1, 2, 1)
        plt.imshow(numpy.reshape(librosa.logamplitude(mel_spectrum),
                                 (self.parameters['mfcc'].get('n_mels'), -1)))
        plt.subplot(1, 2, 2)
        plt.imshow(numpy.reshape(mfcc,
                                 (self.parameters['mfcc'].get('n_mfcc'), -1)))
        plt.show()

    return mfcc
abs2_stft = (stft.real * stft.real) + (stft.imag * stft.imag)

# Gather frequency bins according to the Mel scale.
melspec = librosa.feature.melspectrogram(
    y=None,
    S=abs2_stft,
    sr=logmelspec_settings["sr"],
    n_fft=logmelspec_settings["n_fft"],
    n_mels=logmelspec_settings["n_mels"],
    htk=True,
    fmin=logmelspec_settings["fmin"],
    fmax=logmelspec_settings["fmax"])

# Apply pointwise base-10 logarithm.
# The multiplication by 0.5 is to compensate for magnitude squaring.
logmelspec = 0.5 * librosa.logamplitude(melspec, ref=1.0)

# Convert to single floating-point precision.
logmelspec = logmelspec.astype('float32')

# Write to HDF5 dataset.
# hop_start is an integer because chunk_start is both a multiple
# of sample_rate and lms_hop_length = chunk_duration.
hop_start = int((chunk_start * lms_sr) / (sample_rate * lms_hop_length))
n_hops_in_chunk = logmelspec.shape[1]
hop_stop = min(hop_start + n_hops_in_chunk, n_hops)
lms_dataset[:, hop_start:hop_stop] = logmelspec

# Close file.
out_file.close()
def __init__(self,
             path, suffix='',   # required data file parameters
             subjects='all',    # optional selector (list) or 'all'
             start_sample=0,
             stop_sample=None,  # optional for selection of sub-sequences
             frame_size=-1,
             hop_size=-1,       # values > 0 will lead to windowing
             label_mode='tempo',
             name='',           # optional name
             n_fft=0,
             n_freq_bins=None,
             save_matrix_path=None,
             channels=None,
             resample=None,
             stimulus_id_filter=None,
             keep_metadata=False,
             spectrum_log_amplitude=False,
             spectrum_normalization_mode=None,
             ):
    '''
    Constructor
    '''
    self.name = name
    self.spectrum_normalization_mode = spectrum_normalization_mode
    self.spectrum_log_amplitude = spectrum_log_amplitude

    self.datafiles = []
    subject_paths = glob.glob(os.path.join(path, 'Sub*'))
    for path in subject_paths:
        dataset_filename = os.path.join(path, 'dataset' + suffix + '.pklz')
        if os.path.isfile(dataset_filename):
            log.debug('adding {}'.format(dataset_filename))
            self.datafiles.append(dataset_filename)
        else:
            log.warn('file does not exist {}'.format(dataset_filename))
    self.datafiles.sort()

    if subjects == 'all':
        subjects = np.arange(0, len(self.datafiles))
    assert subjects is not None and len(subjects) > 0

    self.label_mode = label_mode
    self.label_converter = LabelConverter()

    if stimulus_id_filter is None:
        stimulus_id_filter = []
    self.stimulus_id_filter = stimulus_id_filter

    self.subject_partitions = []   # used to keep track of original subjects
    self.sequence_partitions = []  # used to keep track of original sequences
    self.trial_partitions = []     # keeps track of original trials

    # metadata: [subject, trial_no, stimulus, channel, start, ]
    self.metadata = []

    sequences = []
    labels = []
    n_sequences = 0
    last_raw_label = -1
    for i in xrange(len(self.datafiles)):
        if i in subjects:
            with log_timing(log, 'loading data from {}'.format(self.datafiles[i])):
                self.subject_partitions.append(n_sequences)  # save start of next subject

                subject_sequences, subject_labels, channel_meta = load(self.datafiles[i])

                subject_trial_no = -1
                for j in xrange(len(subject_sequences)):
                    l = subject_labels[j]  # get raw label

                    if l in stimulus_id_filter:
                        # log.debug('skipping stimulus {}'.format(l))
                        continue

                    c = channel_meta[j][0]
                    if channels is not None and not c in channels:  # apply optional channel filter
                        log.debug('skipping channel {}'.format(c))
                        continue

                    self.sequence_partitions.append(n_sequences)  # save start of next sequence

                    if l != last_raw_label:  # if raw label changed...
                        self.trial_partitions.append(n_sequences)  # ...save start of next trial
                        subject_trial_no += 1  # increment subject_trial_no counter

                    last_raw_label = l
                    l = self.label_converter.get_label(l[0], self.label_mode)  # convert to label_mode view

                    s = subject_sequences[j]
                    s = s[start_sample:stop_sample]  # get sub-sequence in original space

                    # down-sample if requested
                    if resample is not None and resample[0] != resample[1]:
                        s = librosa.resample(s, resample[0], resample[1])

                    if n_fft is not None and n_fft > 0:
                        # Optionally: transform to spectrogram
                        hop_length = n_fft / 4

                        '''
                        from http://theremin.ucsd.edu/~bmcfee/librosadoc/librosa.html
                        >>> # Get a power spectrogram from a waveform y
                        >>> S = np.abs(librosa.stft(y)) ** 2
                        >>> log_S = librosa.logamplitude(S)
                        '''
                        s = np.abs(librosa.core.stft(s,
                                                     n_fft=n_fft,
                                                     hop_length=hop_length))**2

                        if n_freq_bins is not None:
                            # Optionally: cut off high bands
                            s = s[0:n_freq_bins, :]

                        if self.spectrum_log_amplitude:
                            s = librosa.logamplitude(s)

                        '''
                        NOTE on normalization:
                        It depends on the structure of a neural network
                        and (even more) on the properties of data. There
                        is no best normalization algorithm because if
                        there were one, it would be used everywhere by
                        default...

                        In theory, there is no requirement for the data
                        to be normalized at all. This is a purely
                        practical thing, because in practice convergence
                        could take forever if your input is spread out
                        too much. The simplest approach would be to
                        normalize it by scaling your data to (-1,1) (or
                        (0,1), depending on the activation function), and
                        in most cases this does work. If your algorithm
                        converges well, then this is your answer. If not,
                        there are too many possible problems and methods
                        to outline here without knowing the actual data.
                        '''

                        ## normalize to mean 0, std 1
                        if self.spectrum_normalization_mode == 'mean0_std1':
                            # s = preprocessing.scale(s, axis=0)
                            mean = np.mean(s)
                            std = np.std(s)
                            s = (s - mean) / std

                        ## normalize by linear transform to [0,1]
                        elif self.spectrum_normalization_mode == 'linear_0_1':
                            s = s / np.max(s)

                        ## normalize by linear transform to [-1,1]
                        elif self.spectrum_normalization_mode == 'linear_-1_1':
                            s = -1 + 2 * (s - np.min(s)) / (np.max(s) - np.min(s))

                        elif self.spectrum_normalization_mode is not None:
                            raise ValueError('unsupported spectrum normalization mode {}'.format(
                                self.spectrum_normalization_mode))

                        # print s.mean(axis=0)
                        # print s.std(axis=0)

                        # transpose to fit pylearn2 layout
                        s = np.transpose(s)
                    else:
                        # normalize to max amplitude 1
                        s = librosa.util.normalize(s)

                    s = np.asfarray(s, dtype='float32')

                    if frame_size > 0 and hop_size > 0:
                        s, l = self._split_sequence(s, l, frame_size, hop_size)

                    # print s.shape
                    n_sequences += len(s)
                    sequences.append(s)
                    labels.extend(l)

                    if keep_metadata:
                        self.metadata.append({
                            'subject': i,
                            'trial_no': subject_trial_no,
                            'stimulus': last_raw_label[0],
                            'channel': c,
                            'start': self.sequence_partitions[-1],
                            'stop': n_sequences
                        })

    # turn into numpy arrays
    sequences = np.vstack(sequences)
    # print sequences.shape

    labels = np.hstack(labels)

    one_hot_y = one_hot(labels)
    self.labels = labels  # save for later

    if n_fft > 0:
        sequences = np.array([sequences])

        # re-arrange dimensions
        sequences = sequences.swapaxes(0, 1).swapaxes(1, 2).swapaxes(2, 3)

        log.debug('final dataset shape: {} (b,0,1,c)'.format(sequences.shape))
        super(EEGDataset, self).__init__(topo_view=sequences,
                                         y=one_hot_y,
                                         axes=['b', 0, 1, 'c'])
    else:
        super(EEGDataset, self).__init__(X=sequences,
                                         y=one_hot_y,
                                         axes=['b', 0, 1, 'c'])

    log.debug('generated dataset "{}" with shape X={} y={} labels={} '.format(
        self.name, self.X.shape, self.y.shape, self.labels.shape))

    if save_matrix_path is not None:
        matrix = DenseDesignMatrix(X=sequences, y=one_hot_y)
        with log_timing(log, 'saving DenseDesignMatrix to {}'.format(save_matrix_path)):
            serial.save(save_matrix_path, matrix)
def prepare_set(dataset_name, set_name, normalize=True, with_factors=True,
                scaler=None):
    if not os.path.exists(common.PATCHES_DIR):
        os.makedirs(common.PATCHES_DIR)
    f = h5py.File(common.PATCHES_DIR+'/patches_%s_%s_%sx%s_tmp.hdf5'
                  % (set_name, dataset_name, N_SAMPLES, SECONDS), 'w')
    spec_folder = common.SPECTRO_PATH+SPECTRO_FOLDER+"/"
    items = open(common.DATASETS_DIR+'/items_index_%s_%s.tsv'
                 % (set_name, dataset_name)).read().splitlines()
    n_items = len(items) * N_SAMPLES
    print n_items
    x_dset = f.create_dataset("features", (n_items, 1, N_FRAMES, N_BINS),
                              dtype='f')
    i_dset = f.create_dataset("index", (n_items,), maxshape=(n_items,),
                              dtype='S18')
    if with_factors:
        factors = np.load(common.DATASETS_DIR+'/y_%s_%s_%s.npy'
                          % (set_name, Y_PATH, dataset_name))
        y_dset = f.create_dataset("targets", (n_items, factors.shape[1]),
                                  dtype='f')
    k = 0
    itemset = []
    itemset_index = []
    for t, track_id in enumerate(items):
        if MSD:
            msd_folder = track_id[2]+"/"+track_id[3]+"/"+track_id[4]+"/"
        else:
            msd_folder = ""
        file = spec_folder+msd_folder+track_id+".pk"
        try:
            spec = pickle.load(open(file))
            spec = librosa.logamplitude(np.abs(spec) ** 2,
                                        ref_power=np.max).T
            for i in range(0, N_SAMPLES):
                try:
                    sample = sample_patch(spec, N_FRAMES)
                    x_dset[k, :, :, :] = sample.reshape(-1, sample.shape[0],
                                                        sample.shape[1])
                    if with_factors:
                        y_dset[k, :] = factors[t]
                    i_dset[k] = track_id
                    itemset.append(track_id)
                    itemset_index.append(t)
                    k += 1
                except Exception as e:
                    print 'Error', e
                    print file
        except Exception as e:
            print 'Error1', e
        if t % 1000 == 0:
            print t
    print x_dset.shape

    # Clean empty spectrograms
    print "Cleaning empty spectrograms"
    f2 = h5py.File(common.PATCHES_DIR+'/patches_%s_%s_%sx%s.hdf5'
                   % (set_name, dataset_name, N_SAMPLES, SECONDS), 'w')
    index = f['index'][:]
    index_clean = np.where(index != "")[0]
    n_items = len(index_clean)
    x_dset2 = f2.create_dataset("features", (n_items, 1, N_FRAMES, N_BINS),
                                dtype='f')
    i_dset2 = f2.create_dataset("index", (n_items,), maxshape=(n_items,),
                                dtype='S18')
    for i in range(0, len(index_clean)):
        x_dset2[i] = x_dset[index_clean[i]]
        i_dset2[i] = i_dset[index_clean[i]]
    f.close()
    os.remove(common.PATCHES_DIR+'/patches_%s_%s_%sx%s_tmp.hdf5'
              % (set_name, dataset_name, N_SAMPLES, SECONDS))

    # Normalize
    if normalize:
        print "Normalizing"
        block_step = 10000
        for i in range(0, len(itemset), block_step):
            x_block = x_dset2[i:min(len(itemset), i+block_step)]
            x_norm, scaler = scale(x_block, scaler)
            x_dset2[i:min(len(itemset), i+block_step)] = x_norm
        scaler_file = common.PATCHES_DIR+'/scaler_%s_%sx%s.pk' \
            % (DATASET_NAME, N_SAMPLES, SECONDS)
        pickle.dump(scaler, open(scaler_file, 'wb'))
    return scaler
# imports assumed from context: logamplitude, melspectrogram, and the DCT
# filter bank all come from librosa in the versions this corpus targets
import numpy as np
from librosa import logamplitude
from librosa.feature import melspectrogram
from librosa.filters import dct


def mfcc(data, sr=22050, n_mfcc=20, **kwargs):
    # Log-mel spectrogram followed by a DCT basis projection
    S = logamplitude(melspectrogram(y=data, sr=sr, **kwargs))
    return np.dot(dct(n_mfcc, S.shape[0]), S)
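# Quick equivalence check (sketch): in older librosa versions,
# librosa.feature.mfcc is exactly a DCT of the log-mel spectrogram, so the
# one-liner above should match it. Assumes logamplitude and the bundled
# example audio file are available.
import librosa

y, sr = librosa.load(librosa.util.example_audio_file())
assert np.allclose(mfcc(y, sr=sr),
                   librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20))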
    feats, beat_times = extractFeature(file_path, file_ext, feature,
                                       scale=1, round_to=0, normalize=0,
                                       beat_sync=beat_sync, transpose=False,
                                       save=True)
else:
    feats = dd.io.load(save_path)
    beat_times = feats['beat_times']
    feats = feats[feature]

# Convert to dB
feats_log = librosa.logamplitude(feats, ref_power=feats.max())

# L2-normalize the columns, force features to lie on a sphere!
feats_log_normed = librosa.util.normalize(feats_log, norm=2., axis=0)

savemat(save_path[:-3] + '.mat', dict(feats_log=feats_log))

fig, axes = plt.subplots(3, 1, figsize=(18, 6))
axes[0].set_title(feature)
axes[1].set_title('dB Feature')
axes[2].set_title('Normed(dB Feature)')
axes[0].imshow(feats, aspect='auto', origin='low',
               interpolation='nearest', cmap=plt.cm.plasma)
axes[1].imshow(feats_log, aspect='auto',
def getSampleSSMs():
    Kappa = 0.1
    hopSize = 512
    TempoBias1 = 180
    TempoBias2 = 180

    DPixels = 400
    BeatsPerBlock = 8
    p = np.arange(DPixels)
    [I, J] = np.meshgrid(p, p)

    FeatureParams = {'MFCCBeatsPerBlock': BeatsPerBlock,
                     'MFCCSamplesPerBlock': 200,
                     'DPixels': DPixels,
                     'ChromaBeatsPerBlock': 20,
                     'ChromasPerBlock': 40}

    CSMTypes = {'MFCCs': 'Euclidean',
                'SSMs': 'Euclidean',
                'CurvsSS': 'Euclidean',
                'TorsSS': 'Euclidean',
                'D2s': 'EMD1D',
                'Chromas': 'CosineOTI'}

    fin = open('covers32k/list1.list', 'r')
    files1 = [f.strip() for f in fin.readlines()]
    fin.close()
    fin = open('covers32k/list2.list', 'r')
    files2 = [f.strip() for f in fin.readlines()]
    fin.close()

    cmap = 'Spectral'

    # 67 is a good male/female example
    for index in [11]:
        fileprefix = "Covers80%i" % index
        filename1 = "covers32k/" + files1[index] + ".mp3"
        filename2 = "covers32k/" + files2[index] + ".mp3"
        artist1 = getCovers80ArtistName(files1[index])
        artist2 = getCovers80ArtistName(files2[index])
        songName = getCovers80SongName(files1[index])

        print("Getting features for %s..." % filename1)
        (XAudio1, Fs1) = getAudio(filename1)
        (tempo, beats1) = getBeats(XAudio1, Fs1, TempoBias1, hopSize)
        (Features1, O1) = getBlockWindowFeatures(
            (XAudio1, Fs1, tempo, beats1, hopSize, FeatureParams))
        bRatio1 = float(Fs1) / hopSize

        print("Getting features for %s..." % filename2)
        (XAudio2, Fs2) = getAudio(filename2)
        (tempo, beats2) = getBeats(XAudio2, Fs2, TempoBias2, hopSize)
        (Features2, O2) = getBlockWindowFeatures(
            (XAudio2, Fs2, tempo, beats2, hopSize, FeatureParams))
        bRatio2 = float(Fs2) / hopSize

        # Make SSM CSM
        plt.figure()
        CSM = getCSM(Features1['SSMs'], Features2['SSMs'])
        idx = plotCSM(CSM, artist1, artist2, songName)
        plt.savefig("DissertationFigures/CSM%i_SSM.svg" % index,
                    bbox_inches='tight')

        D1 = np.zeros((DPixels, DPixels))
        D1[I < J] = Features1['SSMs'][idx[0]]
        D1 = D1 + D1.T
        t1l = beats1[idx[0]] / bRatio1
        t1r = beats1[idx[0] + BeatsPerBlock] / bRatio1
        s1 = beats1[idx[0]] * hopSize
        s2 = beats1[idx[0] + BeatsPerBlock] * hopSize
        x1 = XAudio1[s1:s2]
        scipy.io.wavfile.write("DissertationFigures/%i_1.wav" % index, Fs1, x1)

        D2 = np.zeros((DPixels, DPixels))
        D2[I < J] = Features2['SSMs'][idx[1]]
        D2 = D2 + D2.T
        t2l = beats2[idx[1]] / bRatio2
        t2r = beats2[idx[1] + BeatsPerBlock] / bRatio2
        s1 = beats2[idx[1]] * hopSize
        s2 = beats2[idx[1] + BeatsPerBlock] * hopSize
        x2 = XAudio2[s1:s2]
        scipy.io.wavfile.write("DissertationFigures/%i_2.wav" % index, Fs2, x2)

        # Plot spectrograms
        plt.clf()
        plt.figure(figsize=(12, 5))
        plt.subplot(211)
        S1 = librosa.logamplitude(np.abs(librosa.stft(x1)))
        # librosa.display.specshow(S1, x_axis='time', y_axis='log')
        plt.subplot(212)
        S2 = librosa.logamplitude(np.abs(librosa.stft(x2)))
        # librosa.display.specshow(S2, x_axis='time', y_axis='log')
        plt.savefig("DissertationFigures/Spectrograms%i.svg" % index,
                    bbox_inches='tight')

        # Plot SSMs
        plt.clf()
        plt.subplot(121)
        plt.title(artist1)
        plt.imshow(D1, interpolation='nearest', cmap=cmap,
                   extent=(t1l, t1r, t1r, t1l))
        plt.xlabel("Time (sec)")
        plt.ylabel("Time (sec)")
        plt.subplot(122)
        plt.title(artist2)
        plt.imshow(D2, interpolation='nearest', cmap=cmap,
                   extent=(t2l, t2r, t2r, t2l))
        plt.xlabel("Time (sec)")
        plt.ylabel("Time (sec)")
        plt.savefig("DissertationFigures/SSMs%i.svg" % index,
                    bbox_inches='tight')
    audioclip *= normalization_factor  # how Karol does it
    s = 0
    while True:
        window_wav = audioclip[(s * STEP_SIZE):(s * STEP_SIZE + TIME_WINDOW_SIZE)]  # how Karol does it
        s += 1
        if len(window_wav) < TIME_WINDOW_SIZE:
            break
        window_spcgm = librosa.feature.melspectrogram(window_wav,
                                                      hop_length=512,
                                                      n_fft=fft_window_len,
                                                      sr=sampl_freq_Hz,
                                                      n_mels=img_height)  # how Karol does it
        # for some reason melspectrogram returns a width of 42,
        # so we have to trim it to 41
        window_spcgm = window_spcgm[:, :img_width]
        window_spcgm = librosa.logamplitude(window_spcgm)  # how Karol does it

        if np.mean(window_spcgm) <= silence_threshold:  # That's what Karol said
            too_quiet_ctr += 1
        else:
            observations_spcgm = np.vstack((observations_spcgm,
                                            [window_spcgm]))
            observations_wav = np.vstack((observations_wav, window_wav))
            labels = np.hstack((labels, label_for_file))  # *np.ones(1, int)
            classPriorsAfterWindowing[label_for_file] += 1

tooShortList = classPriorsRaw - classPriorsBeforeWindowing
tooShortList_mat[fold_num - 1] = tooShortList
classPriorsRaw /= N
classPriorsRaw_mat[fold_num - 1] = classPriorsRaw
classPriorsBeforeWindowing /= N - np.sum(tooShortList)
def harmonic_index(
        sourcefile,
        offset=0.0,
        duration=120.0,
        key=None,
        output_dir=None,
        n_fft=4096,
        hop_length=1024,
        pitch_median=5,  # how many frames for running medians?
        high_pass_f=40.0,
        low_pass_f=4000.0,
        debug=False,
        cached=True,
        n_peaks=16,
        **kwargs):
    """
    Index spectral peaks
    """
    if debug:
        from librosa.display import specshow
        import matplotlib.pyplot as plt
    # args that will make a difference to content,
    # apart from the sourcefile itself
    argset = dict(
        analysis="harmonic_index",
        # sourcefile=sourcefile,
        offset=offset,
        duration=duration,
        n_fft=n_fft,
        hop_length=hop_length,
        high_pass_f=high_pass_f,
        low_pass_f=low_pass_f,
        pitch_median=pitch_median,
        n_peaks=n_peaks,
    )
    sourcefile = Path(sourcefile).resolve()
    if output_dir is None:
        output_dir = sourcefile.parent
    output_dir = Path(output_dir)

    if key is None:
        key = str(sourcefile.stem) + "___" + sfio.safeish_hash(argset)

    metadatafile = (output_dir / key).with_suffix(".json")
    if cached and metadatafile.exists():
        return json.load(metadatafile.open("r"))

    metadata = dict(key=key, metadatafile=str(metadatafile), **argset)
    y, sr = sfio.load(str(sourcefile), sr=None, mono=True,
                      offset=offset, duration=duration)

    if high_pass_f is not None:
        y = basicfilter.high_passed(y, sr, high_pass_f)

    dur = librosa.get_duration(y=y, sr=sr)
    metadata["dur"] = dur
    metadata["sr"] = sr

    # convert to spectral frames
    D = librosa.stft(y, n_fft=n_fft, hop_length=hop_length)
    y_rms = librosa.feature.rmse(S=D)
    # Separate into harmonic and percussive. I think this preserves phase?
    H, P = librosa.decompose.hpss(D)
    # Resynthesize the harmonic component as waveforms
    y_harmonic = librosa.istft(H)
    harmonicfile = str(output_dir / key) + ".harmonic.wav"
    sfio.save(harmonicfile, y_harmonic, sr=sr, norm=True)
    metadata["harmonicfile"] = harmonicfile

    # Now, power spectrogram
    H_mag, H_phase = librosa.magphase(H)
    H_peak_f, H_peak_mag = librosa.piptrack(S=H_mag, sr=sr,
                                            fmin=high_pass_f,
                                            fmax=low_pass_f)
    # First we smooth to use inter-bin information
    H_peak_f = median_filter(H_peak_f, size=(1, pitch_median))
    H_peak_mag = median_filter(H_peak_mag, size=(1, pitch_median))
    H_peak_power = np.real(H_peak_mag**2)
    H_rms = librosa.feature.rmse(S=H_peak_mag)

    if debug:
        plt.figure()
        specshow(librosa.logamplitude(H_peak_f, ref_power=np.max),
                 y_axis='log', sr=sr)
        plt.title('Peak Freqs')
        plt.figure()
        specshow(librosa.logamplitude(H_peak_power, ref_power=np.max),
                 y_axis='log', sr=sr)
        plt.title('Peak amps')
        plt.figure()

    # Now we pack down to the biggest few peaks:
    H_peak_f, H_peak_power = compress_peaks(H_peak_f, H_peak_power, n_peaks)

    if debug:
        plt.figure()
        specshow(librosa.logamplitude(H_peak_f, ref_power=np.max),
                 y_axis='log', sr=sr)
        plt.title('Peak Freqs packed')
        plt.figure()
        specshow(librosa.logamplitude(H_peak_power, ref_power=np.max),
                 y_axis='log', sr=sr)
        plt.title('Peak amps packed')
        # plt.figure()
        # plt.scatter(
        #     librosa.logamplitude(H_peak_power, ref_power=np.max),
        #     y_axis='log',
        #     sr=sr)
        # plt.title('Compressed')

    return dict(
        metadata=metadata,
        peak_f=H_peak_f,
        peak_power=H_peak_power,
        rms=y_rms,
        harm_rms=H_rms,
    )
def doSpect(trackL=None, saveDir=None):
    # Use global S as counter for saved spects
    global S

    # Do nothing if test complete
    if S >= 5 and TEST:
        return False  # Do nothing

    # Do we have a track path and genre?
    if trackL == None:
        print 'Missing Track information: [trackPath, genre]'
        return False

    fpath = str(trackL[0])  # File path
    genre = str(trackL[1])  # Track Genre

    # Split up the path string and get
    # the file name and extension
    tmp = fpath.split('/')
    tmp2 = str(tmp[-1]).split('.')
    fullFileName = tmp[-1]        # filename.mp3
    fileName = str(int(tmp2[0]))  # filename (minus leading zeros)
    fileExt = tmp2[1]             # .mp3/.png

    # Verify the file exists and is accessible
    if not os.path.isfile(fpath):
        # File doesn't exist or isn't accessible.
        print 'File: ' + fullFileName + ' does not exist or is not accessible\n'
    else:
        # Create Spectrogram (Modified from Joseph Kotva's Code)
        # Setup the save path
        if saveDir == None:
            savePath = 'sorted/spect/' + genre + '/' + fileName + '.png'
        else:
            savePath = saveDir + '/' + fileName + '.png'

        # Does the spectrogram already exist? Save time, skip it then
        if not os.path.exists(savePath):
            # Try to load the audio file using librosa
            print 'Attempting to load: ' + fpath
            try:
                data, sr = librosa.load(fpath, mono=True)  # mono (1 channel)
            except IOError:
                print 'Unable to load: ' + fpath + '\nSkipping...'
                # no S increment here because we didn't make the spectrogram!
                return False  # Failure
                # continue  # restart loop at next index, skip this file

            # Was the audio file somehow loaded yet has no data points?
            if data.size == 0:
                print 'Unable to load: ' + fpath + '\nFile was opened but there was no data! Corrupted?\nSkipping...'
                return False  # Failure
                # continue  # restart loop at next index, skip this file

            # Some calculations on the audio sample points
            stft = np.abs(librosa.stft(data, n_fft=2048, hop_length=512))
            mel = librosa.feature.melspectrogram(sr=sr, S=stft**2)
            log_mel = librosa.logamplitude(mel)

            # print 'Generating Spectrogram for: ' + fpath
            # Create the spectrogram image
            librosa.display.specshow(log_mel, sr=sr, hop_length=512)
            plt.axis("normal")  # axis limits auto scaled to make image sit well in plot box.
            plt.margins(0, 0)   # remove margins
            plt.gca().xaxis.set_major_locator(plt.NullLocator())  # remove x axis locator
            plt.gca().yaxis.set_major_locator(plt.NullLocator())  # remove y axis locator

            # Save the plotted figure (image) using "SortedVersion" dir structure;
            # the image can/will be copied later into a "DataVersion" dir set.
            plt.savefig(savePath, dpi=100, frameon='false',
                        bbox_inches="tight", pad_inches=0.0)
            plt.clf()  # Clear the current figure (possibly helps with speed)

            S += 1  # Increment counter
            print 'Finished spectrogram(' + str(S) + '): ' + savePath
            if S == 5 and TEST:
                print 'Stopping spectrograms here, spect test done!'
        else:
            # The spectrogram already exists, skip it
            print savePath + ' already exists, skipping...'
            if not TEST:
                S += 1  # Keep counting though!

    return True
count += 1
if count == 0:
    continue
print count
if not os.path.exists('spectrograms/' + row[7]):
    os.makedirs('spectrograms/' + row[7])
y, sr = librosa.load("audio/fold" + str(row[5]) + "/" + str(row[0]))

# Let's make and display a mel-scaled power (energy-squared) spectrogram
S = librosa.feature.melspectrogram(y, sr=sr, n_mels=128)

# Convert to log scale (dB). We'll use the peak power as reference.
log_S = librosa.logamplitude(S, ref_power=np.max)

# Make a new figure
fig = plt.figure(figsize=(12, 4))
ax = plt.Axes(fig, [0., 0., 1., 1.])
ax.set_axis_off()
fig.add_axes(ax)

# Display the spectrogram on a mel scale
# sample rate and hop length parameters are used to render the time axis
librosa.display.specshow(log_S, sr=sr, x_axis='time', y_axis='mel')

# Save the figure, then close it so figures don't accumulate across iterations
#plt.show()
plt.savefig('spectrograms/' + row[7] + '/' + row[0] + '.png')
plt.close(fig)
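The fragment above starts mid-loop. Judging by the indexing (row[0] = slice file name, row[5] = fold, row[7] = class), it iterates over the UrbanSound8K metadata CSV; a plausible enclosing context (an assumption, including the CSV path) is:

import csv

count = -1
with open('metadata/UrbanSound8K.csv') as f:
    for row in csv.reader(f):
        # body of the fragment above goes here, starting with count += 1
        ...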
def dataset(modalities=0, forcetempTime=4, contactmicTime=0.2, leaveObjectOut=False, verbose=False): materials = ['plastic', 'glass', 'fabric', 'metal', 'wood', 'ceramic'] X = [] y = [] objects = dict() for m, material in enumerate(materials): if verbose: print 'Processing', material sys.stdout.flush() with open( 'data_processed/processed_0.1sbefore_%s_times_%.2f_%.2f.pkl' % (material, forcetempTime, contactmicTime), 'rb') as f: allData = pickle.load(f) for j, (objName, objData) in enumerate(allData.iteritems()): if leaveObjectOut: objects[objName] = {'x': [], 'y': []} X = objects[objName]['x'] y = objects[objName]['y'] for i in xrange(len(objData['temperature'])): y.append(m) if modalities > 2: # Mel-scaled power (energy-squared) spectrogram sr = 48000 S = librosa.feature.melspectrogram(np.array( objData['contact'][i]), sr=sr, n_mels=128) # Convert to log scale (dB) log_S = librosa.logamplitude(S, ref_power=np.max) if modalities == 0: X.append(objData['force0'][i] + objData['force1'][i]) elif modalities == 1: X.append(objData['temperature'][i]) elif modalities == 2: X.append(objData['temperature'][i] + objData['force0'][i] + objData['force1'][i]) elif modalities == 3: X.append(log_S.flatten()) elif modalities == 4: X.append(objData['temperature'][i] + log_S.flatten().tolist()) elif modalities == 5: X.append(objData['temperature'][i] + objData['force0'][i] + objData['force1'][i] + log_S.flatten().tolist()) elif modalities == 6: X.append(objData['force0'][i] + objData['force1'][i] + log_S.flatten().tolist()) if leaveObjectOut: return objects else: X = np.array(X) y = np.array(y) if verbose: print 'X:', np.shape(X), 'y:', np.shape(y) return X, y
A = np.sin(2*np.pi*np.arange(20992)*400/20000)  # about 400 Hz
B = np.sin(2*np.pi*np.arange(20992)*200/20000)  # about 200 Hz
C = A + B
D = np.hstack((np.sin(2*np.pi*np.arange(10992)*200/20000), np.sin(2*np.pi*np.arange(10000)*400/20000)))
D[:3500] = 0
D[14500:18000] = 0
E = np.sin(2*np.pi*np.arange(20992)*20/20000*150/20992*np.arange(1, 20993))  # linear chirp
#wav_signals = np.vstack((A, B, C, D, E))
wav_signals = scipy.io.loadmat("data/fold10_RANDOM_OBS")['picked_obs']

all_spcgm = np.zeros((0, 60, 41), np.float64)
for j in range(wav_signals.shape[0]):
    spcgm = librosa.feature.melspectrogram(wav_signals[j], hop_length=512, n_fft=1024, sr=22050, n_mels=60)  # how Karol does it
    spcgm = spcgm[:, :41]  # melspectrogram returns 42 frames here, so trim to 41
    spcgm = librosa.logamplitude(spcgm)  # how Karol does it
    all_spcgm = np.vstack((all_spcgm, [spcgm]))

wav_signals = np.expand_dims(wav_signals, -1)
synth = {x_pl_1: wav_signals}
sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_opts))
sess.run(tf.global_variables_initializer())
if STORE_TEST_ERROR:
    # eat_this is assumed to be a feed dict defined earlier in the script
    res = sess.run(fetches=fetches_test, feed_dict=eat_this)
activations_of_wav_signals = sess.run(a2, synth)
sess.close()
if STORE_TEST_ERROR:
    y_test_pred = res[0]
    test_loss = res[1]
    test_accuracy = res[2]
def __call__(self, S): return librosa.logamplitude(S, **self.__dict__)
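This one-liner forwards an instance's attributes as keyword arguments. A hypothetical enclosing class (the name and fields are illustrative, not from the source) makes the pattern concrete:

import numpy as np
import librosa

class LogAmplitude(object):
    """Callable that applies librosa.logamplitude with frozen keyword arguments."""
    def __init__(self, **kwargs):
        self.__dict__.update(kwargs)  # e.g. ref_power=np.max, top_db=80.0

    def __call__(self, S):
        return librosa.logamplitude(S, **self.__dict__)

# to_db = LogAmplitude(ref_power=np.max)
# log_S = to_db(S)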
def feature_extraction(y, fs=44100, statistics=True, include_mfcc0=True, include_delta=True, include_acceleration=True, mfcc_params=None, delta_params=None, acceleration_params=None): """Feature extraction, MFCC based features Outputs features in dict, format: { 'feat': feature_matrix [shape=(frame count, feature vector size)], 'stat': { 'mean': numpy.mean(feature_matrix, axis=0), 'std': numpy.std(feature_matrix, axis=0), 'N': feature_matrix.shape[0], 'S1': numpy.sum(feature_matrix, axis=0), 'S2': numpy.sum(feature_matrix ** 2, axis=0), } } Parameters ---------- y: numpy.array [shape=(signal_length, )] Audio fs: int > 0 [scalar] Sample rate (Default value=44100) statistics: bool Calculate feature statistics for extracted matrix (Default value=True) include_mfcc0: bool Include 0th MFCC coefficient into static coefficients. (Default value=True) include_delta: bool Include delta MFCC coefficients. (Default value=True) include_acceleration: bool Include acceleration MFCC coefficients. (Default value=True) mfcc_params: dict or None Parameters for extraction of static MFCC coefficients. delta_params: dict or None Parameters for extraction of delta MFCC coefficients. acceleration_params: dict or None Parameters for extraction of acceleration MFCC coefficients. Returns ------- result: dict Feature dict """ eps = numpy.spacing(1) # Windowing function if mfcc_params['window'] == 'hamming_asymmetric': window = scipy.signal.hamming(mfcc_params['n_fft'], sym=False) elif mfcc_params['window'] == 'hamming_symmetric': window = scipy.signal.hamming(mfcc_params['n_fft'], sym=True) elif mfcc_params['window'] == 'hann_asymmetric': window = scipy.signal.hann(mfcc_params['n_fft'], sym=False) elif mfcc_params['window'] == 'hann_symmetric': window = scipy.signal.hann(mfcc_params['n_fft'], sym=True) else: window = None # Calculate Static Coefficients magnitude_spectrogram = numpy.abs(librosa.stft(y + eps, n_fft=mfcc_params['n_fft'], win_length=mfcc_params['win_length'], hop_length=mfcc_params['hop_length'], center=True, window=window)) ** 2 mel_basis = librosa.filters.mel(sr=fs, n_fft=mfcc_params['n_fft'], n_mels=mfcc_params['n_mels'], fmin=mfcc_params['fmin'], fmax=mfcc_params['fmax'], htk=mfcc_params['htk']) mel_spectrum = numpy.dot(mel_basis, magnitude_spectrogram) mfcc = librosa.feature.mfcc(S=librosa.logamplitude(mel_spectrum), n_mfcc=mfcc_params['n_mfcc']) # Collect the feature matrix feature_matrix = mfcc if include_delta: # Delta coefficients mfcc_delta = librosa.feature.delta(mfcc, **delta_params) # Add Delta Coefficients to feature matrix feature_matrix = numpy.vstack((feature_matrix, mfcc_delta)) if include_acceleration: # Acceleration coefficients (aka delta) mfcc_delta2 = librosa.feature.delta(mfcc, order=2, **acceleration_params) # Add Acceleration Coefficients to feature matrix feature_matrix = numpy.vstack((feature_matrix, mfcc_delta2)) if not include_mfcc0: # Omit mfcc0 feature_matrix = feature_matrix[1:, :] feature_matrix = feature_matrix.T # Collect into data structure if statistics: return { 'feat': feature_matrix, 'stat': { 'mean': numpy.mean(feature_matrix, axis=0), 'std': numpy.std(feature_matrix, axis=0), 'N': feature_matrix.shape[0], 'S1': numpy.sum(feature_matrix, axis=0), 'S2': numpy.sum(feature_matrix ** 2, axis=0), } } else: return { 'feat': feature_matrix}
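A hypothetical call, with parameter values chosen purely for illustration (the source does not fix defaults for the parameter dicts), assuming y is a mono audio array:

mfcc_params = dict(window='hamming_asymmetric', n_fft=2048, win_length=2048,
                   hop_length=1024, n_mels=40, n_mfcc=20,
                   fmin=0, fmax=22050, htk=False)
result = feature_extraction(y, fs=44100,
                            mfcc_params=mfcc_params,
                            delta_params={'width': 9},
                            acceleration_params={'width': 9})
print(result['feat'].shape)  # (frame count, feature vector size)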
def extract_features(parent_dir, sub_dirs, file_ext="*.wav", bands=60, frames=101, output=""):
    window_size = 512 * (frames - 1)
    log_specgrams = []
    labels = []
    # 90%
    """
    (0, 60, 101, 2)
    (0, 60, 101, 2)
    (0, 60, 101, 2)
    (0, 60, 101, 2)
    (0, 60, 101, 2)
    (0, 60, 101, 2)
    (0, 60, 101, 2)
    (0, 60, 101, 2)
    (0, 60, 101, 2)
    (0, 60, 101, 2)
    (0, 60, 101, 2)
    """
    # 50%
    """
    (0, 60, 41, 2)
    (13173, 60, 41, 2)
    (13021, 60, 41, 2)
    (14168, 60, 41, 2)
    (14606, 60, 41, 2)
    (13727, 60, 41, 2)
    (12279, 60, 41, 2)
    (12769, 60, 41, 2)
    (11955, 60, 41, 2)
    (12371, 60, 41, 2)
    (12610, 60, 41, 2)
    """
    for l, sub_dir in enumerate(sub_dirs):
        for fn in glob.glob(os.path.join(parent_dir, sub_dir, file_ext)):
            sound_clip, s = librosa.load(fn)
            label = fn.split('\\')[3].split('-')[1]  # UrbanSound8K/audio/fold1/7061-6-0-0.wav
            for (start, end) in windows(sound_clip, window_size):
                if (len(sound_clip[start:end]) == window_size):
                    signal = sound_clip[start:end]
                    melspec = librosa.feature.melspectrogram(signal, n_mels=bands)
                    logspec = librosa.logamplitude(melspec)
                    logspec = logspec.T.flatten()[:, np.newaxis].T
                    # Use [:, np.newaxis] to add a dimension to the same array.
                    # logspec = (60, 41)
                    # logspec.T.flatten() = (41, 60) -> (2460,) -> (2460, 1) -> (1, 2460)
                    log_specgrams.append(logspec)
                    labels.append(label)

    log_specgrams = np.asarray(log_specgrams).reshape(len(log_specgrams), bands, frames, 1)
    features = np.concatenate((log_specgrams, np.zeros(np.shape(log_specgrams))), axis=3)
    # features: (5446, 60, 41, 2)
    for i in range(len(features)):
        features[i, :, :, 1] = librosa.feature.delta(features[i, :, :, 0])

    print(features.shape)
    np.savez("Extraction/audio" + output, features=features, labels=labels)
    return np.array(features), np.array(labels)
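windows() is referenced above but not defined in the snippet. A common implementation from the UrbanSound8K tutorials (an assumption here) yields half-overlapping index pairs:

def windows(data, window_size):
    # Yield (start, end) sample indices with 50% overlap.
    start = 0
    while start < len(data):
        yield int(start), int(start + window_size)
        start += window_size / 2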
def build_datasets(train_percentage=0.8, preproc=False): ''' So we make the training & testing datasets here, and we do it separately. Why not just make one big dataset, shuffle, and then split into train & test? because we want to make sure statistics in training & testing are as similar as possible ''' if (preproc): path = ROOT + "Preproc/" else: path = ROOT + "Samples/" class_names = get_class_names(path=path) print("class_names = ", class_names) total_files, total_train, total_test = get_total_files( path=path, train_percentage=train_percentage) print("total files = ", total_files) nb_classes = len(class_names) mel_dims = get_sample_dimensions(path=path) # pre-allocate memory for speed (old method used np.concatenate, slow) X_train = np.zeros((total_train, mel_dims[1], mel_dims[2], mel_dims[3])) Y_train = np.zeros((total_train, nb_classes)) X_test = np.zeros((total_test, mel_dims[1], mel_dims[2], mel_dims[3])) Y_test = np.zeros((total_test, nb_classes)) paths_train = [] paths_test = [] train_count = 0 test_count = 0 for idx, classname in enumerate(class_names): this_Y = np.array(encode_class(classname, class_names)) this_Y = this_Y[np.newaxis, :] class_files = os.listdir(path + classname) n_files = len(class_files) n_load = n_files n_train = int(train_percentage * n_load) printevery = 100 print("") for idx2, infilename in enumerate(class_files[0:n_load]): audio_path = path + classname + '/' + infilename if (0 == idx2 % printevery): print( '\r Loading class: {:14s} ({:2d} of {:2d} classes)'.format( classname, idx + 1, nb_classes), ", file ", idx2 + 1, " of ", n_load, ": ", audio_path, sep="") #start = timer() if (preproc): melgram = np.load(audio_path) sr = 44100 else: aud, sr = librosa.load(audio_path, mono=mono, sr=None) melgram = librosa.logamplitude( librosa.feature.melspectrogram(aud, sr=sr, n_mels=96), ref_power=1.0)[np.newaxis, np.newaxis, :, :] #end = timer() #print("time = ",end - start) melgram = melgram[:, :, :, 0:mel_dims[ 3]] # just in case files are differnt sizes: clip to first file size if (idx2 < n_train): # concatenate is SLOW for big datasets; use pre-allocated instead #X_train = np.concatenate((X_train, melgram), axis=0) #Y_train = np.concatenate((Y_train, this_Y), axis=0) X_train[train_count, :, :] = melgram Y_train[train_count, :] = this_Y paths_train.append( audio_path) # list-appending is still fast. (??) train_count += 1 else: X_test[test_count, :, :] = melgram Y_test[test_count, :] = this_Y #X_test = np.concatenate((X_test, melgram), axis=0) #Y_test = np.concatenate((Y_test, this_Y), axis=0) paths_test.append(audio_path) test_count += 1 print("") print("Shuffling order of data...") X_train, Y_train, paths_train = shuffle_XY_paths(X_train, Y_train, paths_train) X_test, Y_test, paths_test = shuffle_XY_paths(X_test, Y_test, paths_test) return X_train, Y_train, paths_train, X_test, Y_test, paths_test, class_names, sr
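A minimal driver, assuming ROOT, mono, and the helpers referenced above (get_class_names, get_total_files, get_sample_dimensions, encode_class, shuffle_XY_paths) are defined elsewhere in the module:

X_train, Y_train, paths_train, X_test, Y_test, paths_test, class_names, sr = \
    build_datasets(train_percentage=0.8, preproc=True)
print(X_train.shape, Y_train.shape, len(class_names))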
def create_spectrogram_plots(label_folder='electronic_music/Trance_label/Train/',
                             sr=44100, n_mels=128, n_fft=2048, hop_length=512,
                             song_duration=180.0, n_classes=4):
    """
    Create a spectrogram from a randomly selected song for each artist and plot it.
    :param label_folder:
    :param sr:
    :param n_mels:
    :param n_fft:
    :param hop_length:
    :param song_duration:
    :param n_classes:
    :return:
    """
    # get list of all artists
    labels = os.listdir(label_folder)

    fig, ax = plt.subplots(nrows=2, ncols=int(n_classes / 2), figsize=(14, 12),
                           sharex=True, sharey=True)
    row = 0
    col = 0

    # iterate through labels and random songs and plot a spectrogram on a grid
    for label in labels:
        # Randomly select album and song
        label_path = os.path.join(label_folder, label)
        label_songs = os.listdir(label_path)
        song = random.choice(label_songs)
        song_path = os.path.join(label_path, song)

        # Create mel spectrogram; only 5 seconds from the middle of the song
        # are loaded, to keep the plot readable
        audio = MP3(song_path)
        audio_length = int(audio.info.length)
        audio_middle = (audio_length - int(song_duration)) / 2
        y, sr = librosa.load(song_path, sr=sr, offset=audio_middle, duration=5)
        S = librosa.feature.melspectrogram(y, sr=sr, n_mels=n_mels, n_fft=n_fft,
                                           hop_length=hop_length)
        log_S = librosa.logamplitude(S, ref_power=1.0)

        # Plot on grid
        plt.axes(ax[row, col])
        librosa.display.specshow(log_S, sr=sr)
        plt.title(label)
        col += 1
        if col == int(n_classes / 2):
            row += 1
            col = 0
    fig.tight_layout()
# compute mean mean = np.mean(region, axis=1) # subtract mean out[:, frame] = X[:, frame] - mean # store noise noise[:, frame] = mean # zero negative values out[out < 0] = 0.0 # plot spectrum plt.figure() librosa.display.specshow(librosa.logamplitude(X), sr=sr, y_axis='linear') plt.title('before') plt.show() # plot noise reduced spectrogram plt.figure() librosa.display.specshow(librosa.logamplitude(out), sr=sr, y_axis='linear') plt.title('signal') plt.show() # plot mean / noise plt.figure() librosa.display.specshow(librosa.logamplitude(noise), sr=sr, y_axis='linear') plt.title('noise') plt.show()
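The fragment above begins mid-loop. A self-contained sketch of the running-mean spectral subtraction it appears to implement (the input file and window width are assumptions):

import numpy as np
import librosa

y, sr = librosa.load('noisy.wav')   # assumed input
X = np.abs(librosa.stft(y))         # magnitude spectrogram
out = np.zeros_like(X)
noise = np.zeros_like(X)
width = 20                          # frames of context per side (assumption)

for frame in range(X.shape[1]):
    region = X[:, max(0, frame - width):min(X.shape[1], frame + width + 1)]
    mean = np.mean(region, axis=1)   # local noise estimate
    out[:, frame] = X[:, frame] - mean
    noise[:, frame] = mean
out[out < 0] = 0.0                   # zero negative values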
def _compute_mfcc(self, audio) -> None:
    # Assumes module-level imports such as
    # `from librosa.feature import melspectrogram, mfcc` and
    # `from librosa import logamplitude`.
    self.melspec = melspectrogram(audio.raw, sr=Clip.RATE, hop_length=Clip.FRAME)
    self.logamp = logamplitude(self.melspec)
    self.mfcc = mfcc(S=self.logamp, n_mfcc=13).transpose()
def get_gfb(filelist, config): # Read the filelist fp = open(filelist, 'r') flist = fp.read().splitlines() flist = filter(None, flist) # Create output directory if non-existant opdir = os.path.dirname(flist[0].split(',')[1]) if not os.path.exists(opdir): os.makedirs(opdir) # Read the relevant configs from the configfile framelen = float(config['framelen']) frameshift = float(config['frameshift']) wintype = config['wintype'] if wintype == 'rectangular': winfun = np.ones else: winfun = getattr(np, wintype) # Number of channels for gammatone filterbank if 'nbanks' in config: nbanks = int(config['nbanks']) else: raise ConfigError('nbanks parameter not set in config file') # Min frequency of Gammatone filterbank if 'min_freq' in config: min_freq = float(config['min_freq']) else: min_freq = 0 mvn = config['mvn'] mvn = mvn.upper() == 'TRUE' if 'std_frac' in config: std_frac = float(config['std_frac']) else: std_frac = 1.0 del1_flag = config['delta1'] del2_flag = config['delta2'] del1_flag = del1_flag.upper() == 'TRUE' del2_flag = del2_flag.upper() == 'TRUE' # Iterate over the filelist to extract features if mvn: feats_list = [] for iter1, fline in enumerate(flist): infnm = fline.split(',')[0] opfnm = fline.split(',')[1] sig, fs = librosa.load(infnm, sr=None) sig = sig / max(abs(sig)) dither = 1e-6 * np.random.rand(sig.shape[0]) sig = sig + dither win_length = int(fs * framelen * 0.001) hop_length = int(fs * frameshift * 0.001) feats = gtgram.gtgram(sig, fs, framelen * 0.001, frameshift * 0.001, nbanks, min_freq) # Code for amplitude range compression if config['compression'] == 'log': feats = librosa.logamplitude(feats) elif config['compression'][0:4] == 'root': rootval = float(config['compression'].split('_')[1]) feats = np.sign(feats) * (np.abs(feats)**(1 / rootval)) if np.sum(np.isnan(feats)): print('NaN Error in root compression for file: %s' % infnm) exit() if del1_flag: feats_del1 = librosa.feature.delta(feats, order=1, axis=1) if del2_flag: feats_del2 = librosa.feature.delta(feats, order=2, axis=1) if del1_flag: feats = np.concatenate((feats, feats_del1), axis=0) if del2_flag: feats = np.concatenate((feats, feats_del2), axis=0) feats_list.append(feats) all_feats = np.concatenate(feats_list, axis=1) f_mean = np.mean(all_feats, axis=1)[:, None] f_std = np.std(all_feats, axis=1)[:, None] opdir = os.path.dirname(opfnm) mvn_params = np.concatenate((f_mean, f_std), axis=1) postfix = os.path.basename(filelist).split('.')[0] np.save(opdir + '/mvn_params_' + postfix + '.npy', mvn_params) for iter1, fline in enumerate(flist): infnm = fline.split(',')[0] opfnm = fline.split(',')[1] sig, fs = librosa.load(infnm, sr=None) sig = sig / max(abs(sig)) dither = 1e-6 * np.random.rand(sig.shape[0]) sig = sig + dither win_length = int(fs * framelen * 0.001) hop_length = int(fs * frameshift * 0.001) feats = gtgram.gtgram(sig, fs, framelen * 0.001, frameshift * 0.001, nbanks, min_freq) if config['compression'] == 'log': feats = librosa.logamplitude(feats) elif config['compression'][0:4] == 'root': rootval = float(config['compression'].split('_')[1]) feats = np.sign(feats) * (np.abs(feats)**(1 / rootval)) if np.sum(np.isnan(feats)): print('NaN Error in root compression for file: %s' % infnm) exit() if del1_flag: feats_del1 = librosa.feature.delta(feats, order=1, axis=1) if del2_flag: feats_del2 = librosa.feature.delta(feats, order=2, axis=1) if del1_flag: feats = np.concatenate((feats, feats_del1), axis=0) if del2_flag: feats = np.concatenate((feats, feats_del2), axis=0) if mvn: feats = mvnormalize(feats, 
mvn_params, std_frac) writehtk(feats.T, frameshift, opfnm) fp.close()
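mvnormalize() and writehtk() are referenced but not defined in this snippet; writehtk presumably writes the features in HTK format. A plausible mvnormalize (an assumption, reading the mean/std columns saved in mvn_params above and applying the std_frac scaling from the config) would be:

def mvnormalize(feats, mvn_params, std_frac=1.0):
    # feats: (bands, frames); mvn_params column 0 = mean, column 1 = std
    mean = mvn_params[:, 0:1]
    std = mvn_params[:, 1:2]
    return (feats - mean) / (std_frac * std)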
def create_melspectrogram_dataset(label_folder='electronic_music/Trance_label/Train/',
                                  save_folder='song_mel_label_data',
                                  sr=44100, n_mels=128, n_fft=2048, hop_length=512,
                                  song_duration=180.0, create_data=False):
    """
    This function creates the dataset given a folder with the correct structure
    (artist_folder/artists/albums/*.mp3) and saves it to a specified folder.
    :param label_folder:
    :param save_folder:
    :param sr:
    :param n_mels:
    :param n_fft:
    :param hop_length:
    :param song_duration:
    :param create_data:
    :return:
    """
    if create_data:
        # get list of all labels
        os.makedirs(save_folder, exist_ok=True)
        labels = [path for path in os.listdir(label_folder)
                  if os.path.isdir(os.path.join(label_folder, path))]

        # iterate through all labels, songs and find mel spectrogram
        for label in labels:
            print('{} \n'.format(label))
            label_path = os.path.join(label_folder, label)
            label_songs = os.listdir(label_path)

            for song in label_songs:
                print(song)
                song_path = os.path.join(label_path, song)

                # Create mel spectrogram for song_duration in the middle of the
                # song and convert it to the log scale
                audio = MP3(song_path)
                audio_length = int(audio.info.length)
                audio_middle = (audio_length - int(song_duration)) / 2
                y, sr = librosa.load(song_path, sr=sr, offset=audio_middle,
                                     duration=song_duration)
                S = librosa.feature.melspectrogram(y, sr=sr, n_mels=n_mels,
                                                   n_fft=n_fft, hop_length=hop_length)
                log_S = librosa.logamplitude(S, ref_power=1.0)
                data = (label, log_S, song)

                # Save each song
                save_name = label + '_%%-%%_' + song
                with open(os.path.join(save_folder, save_name), 'wb') as fp:
                    dill.dump(data, fp)
############# STFT #########################

# audio parameters
sample_rate = 16000
n_fft = 512     # 32 ms frame (like in paper)
hop_size = 128  # 75% overlap

## training parameters
gamma = 2

## fft preprocessing
wn_stft = librosa.core.stft(wn, n_fft, hop_size)
y_stft = librosa.core.stft(y, n_fft, hop_size)
x_stft = librosa.core.stft(x, n_fft, hop_size)
log_spec_wn = librosa.logamplitude(np.abs(wn_stft))**gamma
log_spec_y = librosa.logamplitude(np.abs(y_stft))**gamma
log_spec_x = librosa.logamplitude(np.abs(x_stft))**gamma

## plot spectrograms
# plt.figure(figsize=(12, 8))
# plt.subplot(1, 2, 1)
# librosa.display.specshow(log_spec_wn, sample_rate, hop_size, x_axis="time", y_axis="log")
# plt.subplot(1, 2, 2)
# librosa.display.specshow(log_spec_y, sample_rate, hop_size, x_axis="time", y_axis="log")
# plt.show()

## LOAD DICTIONARY
# Load previously computed dictionary:
D = pickle.load(open("Dictionary_4atoms_10it.npy", "rb"))
def specgram(audio, n_fft=512, hop_length=None, mask=True, log_mag=True, re_im=False, dphase=True, mag_only=False): """Spectrogram using librosa. Args: audio: 1-D array of float32 sound samples. n_fft: Size of the FFT. hop_length: Stride of FFT. Defaults to n_fft/2. mask: Mask the phase derivative by the magnitude. log_mag: Use the logamplitude. re_im: Output Real and Imag. instead of logMag and dPhase. dphase: Use derivative of phase instead of phase. mag_only: Don't return phase. Returns: specgram: [n_fft/2 + 1, audio.size / hop_length, 2]. The first channel is the logamplitude and the second channel is the derivative of phase. """ if not hop_length: hop_length = int(n_fft / 2.) fft_config = dict(n_fft=n_fft, win_length=n_fft, hop_length=hop_length, center=True) spec = librosa.stft(audio, **fft_config) if re_im: re = spec.real[:, :, np.newaxis] im = spec.imag[:, :, np.newaxis] spec_real = np.concatenate((re, im), axis=2) else: mag, phase = librosa.core.magphase(spec) phase_angle = np.angle(phase) # Magnitudes, scaled 0-1 if log_mag: mag = (librosa.logamplitude( mag**2, amin=1e-13, top_db=120., ref_power=np.max) / 120.) + 1 else: mag /= mag.max() if dphase: # Derivative of phase phase_unwrapped = np.unwrap(phase_angle) p = phase_unwrapped[:, 1:] - phase_unwrapped[:, :-1] p = np.concatenate([phase_unwrapped[:, 0:1], p], axis=1) / np.pi else: # Normal phase p = phase_angle / np.pi # Mask the phase if log_mag and mask: p = mag * p # Return Mag and Phase p = p.astype(np.float32)[:, :, np.newaxis] mag = mag.astype(np.float32)[:, :, np.newaxis] if mag_only: spec_real = mag[:, :, np.newaxis] else: spec_real = np.concatenate((mag, p), axis=2) return spec_real
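Example use, assuming 16 kHz mono input (the file name is illustrative):

import numpy as np
import librosa

audio, sr = librosa.load('example.wav', sr=16000)
spec = specgram(audio.astype(np.float32), n_fft=512, hop_length=128)
print(spec.shape)  # (257, n_frames, 2): scaled log magnitude + phase derivative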
def __init__(self, path, name = '', # optional name # selectors subjects='all', # optional selector (list) or 'all' trial_types='all', # optional selector (list) or 'all' trial_numbers='all', # optional selector (list) or 'all' conditions='all', # optional selector (list) or 'all' partitioner = None, channel_filter = NoChannelFilter(), # optional channel filter, default: keep all channel_names = None, # optional channel names (for metadata) label_map = None, # optional conversion of labels remove_dc_offset = False, # optional subtraction of channel mean, usually done already earlier resample = None, # optional down-sampling # optional sub-sequences selection start_sample = 0, stop_sample = None, # optional for selection of sub-sequences # optional signal filter to by applied before spitting the signal signal_filter = None, # windowing parameters frame_size = -1, hop_size = -1, # values > 0 will lead to windowing hop_fraction = None, # alternative to specifying absolute hop_size # optional spectrum parameters, n_fft = 0 keeps raw data n_fft = 0, n_freq_bins = None, spectrum_log_amplitude = False, spectrum_normalization_mode = None, include_phase = False, flatten_channels=False, layout='tf', # (0,1)-axes layout tf=time x features or ft=features x time save_matrix_path = None, keep_metadata = False, ): ''' Constructor ''' # save params self.params = locals().copy() del self.params['self'] # print( self.params) # TODO: get the whole filtering into an extra class datafiles_metadata, metadb = load_datafiles_metadata(path) # print( datafiles_metadata) def apply_filters(filters, node): if isinstance(node, dict): filtered = [] keepkeys = filters[0] for key, value in node.items(): if keepkeys == 'all' or key in keepkeys: filtered.extend(apply_filters(filters[1:], value)) return filtered else: return node # [node] # keep only files that match the metadata filters self.datafiles = apply_filters([subjects,trial_types,trial_numbers,conditions], datafiles_metadata) # copy metadata for retained files self.metadb = {} for datafile in self.datafiles: self.metadb[datafile] = metadb[datafile] # print( self.datafiles) # print( self.metadb) self.name = name if partitioner is not None: self.datafiles = partitioner.get_partition(self.name, self.metadb) self.include_phase = include_phase self.spectrum_normalization_mode = spectrum_normalization_mode self.spectrum_log_amplitude = spectrum_log_amplitude self.sequence_partitions = [] # used to keep track of original sequences # metadata: [subject, trial_no, stimulus, channel, start, ] self.metadata = [] sequences = [] labels = [] n_sequences = 0 if frame_size > 0 and hop_size == -1 and hop_fraction is not None: hop_size = np.ceil(frame_size / hop_fraction) for i in xrange(len(self.datafiles)): with log_timing(log, 'loading data from {}'.format(self.datafiles[i])): # save start of next sequence self.sequence_partitions.append(n_sequences) data, metadata = load(os.path.join(path, self.datafiles[i])) label = metadata['label'] if label_map is not None: label = label_map[label] multi_channel_frames = [] # process 1 channel at a time for channel in xrange(data.shape[1]): # filter channels if not channel_filter.keep_channel(channel): continue samples = data[:, channel] # subtract channel mean if remove_dc_offset: samples -= samples.mean() # down-sample if requested if resample is not None and resample[0] != resample[1]: samples = librosa.resample(samples, resample[0], resample[1]) # apply optional signal filter after down-sampling -> requires lower order if signal_filter is 
not None: samples = signal_filter.process(samples) # get sub-sequence in resampled space # log.info('using samples {}..{} of {}'.format(start_sample,stop_sample, samples.shape)) samples = samples[start_sample:stop_sample] if n_fft is not None and n_fft > 0: # Optionally: ### frequency spectrum branch ### # transform to spectogram hop_length = n_fft / 4; ''' from http://theremin.ucsd.edu/~bmcfee/librosadoc/librosa.html >>> # Get a power spectrogram from a waveform y >>> S = np.abs(librosa.stft(y)) ** 2 >>> log_S = librosa.logamplitude(S) ''' S = librosa.core.stft(samples, n_fft=n_fft, hop_length=hop_length) # mag = np.abs(S) # magnitude spectrum mag = np.abs(S)**2 # power spectrum # include phase information if requested if self.include_phase: # phase = np.unwrap(np.angle(S)) phase = np.angle(S) # Optionally: cut off high bands if n_freq_bins is not None: mag = mag[0:n_freq_bins, :] if self.include_phase: phase = phase[0:n_freq_bins, :] if self.spectrum_log_amplitude: mag = librosa.logamplitude(mag) s = mag # for normalization ''' NOTE on normalization: It depends on the structure of a neural network and (even more) on the properties of data. There is no best normalization algorithm because if there would be one, it would be used everywhere by default... In theory, there is no requirement for the data to be normalized at all. This is a purely practical thing because in practice convergence could take forever if your input is spread out too much. The simplest would be to just normalize it by scaling your data to (-1,1) (or (0,1) depending on activation function), and in most cases it does work. If your algorithm converges well, then this is your answer. If not, there are too many possible problems and methods to outline here without knowing the actual data. ''' ## normalize to mean 0, std 1 if self.spectrum_normalization_mode == 'mean0_std1': # s = preprocessing.scale(s, axis=0); mean = np.mean(s) std = np.std(s) s = (s - mean) / std ## normalize by linear transform to [0,1] elif self.spectrum_normalization_mode == 'linear_0_1': s = s / np.max(s) ## normalize by linear transform to [-1,1] elif self.spectrum_normalization_mode == 'linear_-1_1': s = -1 + 2 * (s - np.min(s)) / (np.max(s) - np.min(s)) elif self.spectrum_normalization_mode is not None: raise ValueError( 'unsupported spectrum normalization mode {}'.format( self.spectrum_normalization_mode) ) #print( s.mean(axis=0)) #print( s.std(axis=0)) # include phase information if requested if self.include_phase: # normalize phase to [-1.1] phase = phase / np.pi s = np.vstack([s, phase]) # transpose to fit pylearn2 layout s = np.transpose(s) # print( s.shape) ### end of frequency spectrum branch ### else: ### raw waveform branch ### # normalize to max amplitude 1 s = librosa.util.normalize(samples) # add 2nd data dimension s = s.reshape(s.shape[0], 1) # print( s.shape) ### end of raw waveform branch ### s = np.asfarray(s, dtype='float32') if frame_size > 0 and hop_size > 0: s = s.copy() # FIXME: THIS IS NECESSARY IN MultiChannelEEGSequencesDataset - OTHERWISE, THE FOLLOWING OP DOES NOT WORK!!!! 
frames = frame(s, frame_length=frame_size, hop_length=hop_size) else: frames = s del s # print( frames.shape) if flatten_channels: # add artificial channel dimension frames = frames.reshape((frames.shape[0], frames.shape[1], frames.shape[2], 1)) # print( frames.shape) sequences.append(frames) # increment counter by new number of frames n_sequences += frames.shape[0] if keep_metadata: # determine channel name channel_name = None if channel_names is not None: channel_name = channel_names[channel] elif 'channels' in metadata: channel_name = metadata['channels'][channel] self.metadata.append({ 'subject' : metadata['subject'], # subject 'trial_type': metadata['trial_type'], # trial_type 'trial_no' : metadata['trial_no'], # trial_no 'condition' : metadata['condition'], # condition 'channel' : channel, # channel 'channel_name' : channel_name, 'start' : self.sequence_partitions[-1], # start 'stop' : n_sequences # stop }) for _ in xrange(frames.shape[0]): labels.append(label) else: multi_channel_frames.append(frames) ### end of channel iteration ### if not flatten_channels: # turn list into array multi_channel_frames = np.asfarray(multi_channel_frames, dtype='float32') # [channels x frames x time x freq] -> cb01 # [channels x frames x time x 1] -> cb0. # move channel dimension to end multi_channel_frames = np.rollaxis(multi_channel_frames, 0, 4) # print( multi_channel_frames.shape) # log.debug(multi_channel_frames.shape) sequences.append(multi_channel_frames) # increment counter by new number of frames n_sequences += multi_channel_frames.shape[0] if keep_metadata: self.metadata.append({ 'subject' : metadata['subject'], # subject 'trial_type': metadata['trial_type'], # trial_type 'trial_no' : metadata['trial_no'], # trial_no 'condition' : metadata['condition'], # condition 'channel' : 'all', # channel 'start' : self.sequence_partitions[-1], # start 'stop' : n_sequences # stop }) for _ in xrange(multi_channel_frames.shape[0]): labels.append(label) ### end of datafile iteration ### # turn into numpy arrays sequences = np.vstack(sequences) # print( sequences.shape;) labels = np.hstack(labels) # one_hot_y = one_hot(labels) one_hot_formatter = OneHotFormatter(labels.max() + 1) # FIXME! one_hot_y = one_hot_formatter.format(labels) self.labels = labels if layout == 'ft': # swap axes to (batch, feature, time, channels) sequences = sequences.swapaxes(1, 2) log.debug('final dataset shape: {} (b,0,1,c)'.format(sequences.shape)) super(MultiChannelEEGDataset, self).__init__(topo_view=sequences, y=one_hot_y, axes=['b', 0, 1, 'c']) log.info('generated dataset "{}" with shape X={}={} y={} labels={} '. format(self.name, self.X.shape, sequences.shape, self.y.shape, self.labels.shape)) if save_matrix_path is not None: matrix = DenseDesignMatrix(topo_view=sequences, y=one_hot_y, axes=['b', 0, 1, 'c']) with log_timing(log, 'saving DenseDesignMatrix to {}'.format(save_matrix_path)): serial.save(save_matrix_path, matrix)
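The spectrum-normalization branch buried in the constructor above, factored into a standalone helper for reference (a sketch; the mode names come from the source):

import numpy as np

def normalize_spectrum(s, mode):
    if mode is None:
        return s
    if mode == 'mean0_std1':
        return (s - np.mean(s)) / np.std(s)
    if mode == 'linear_0_1':
        return s / np.max(s)
    if mode == 'linear_-1_1':
        return -1 + 2 * (s - np.min(s)) / (np.max(s) - np.min(s))
    raise ValueError('unsupported spectrum normalization mode {}'.format(mode))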
print(S.shape)
# make pictures name
save_path = 'part1_spectrogram.jpg'

# S = pd.read_csv(fp)
# Note: here S is a DataFrame of per-frame features, not a spectrogram array.
filter_col = [col for col in S if col.startswith('mel')]
S_dat = S[filter_col]
print(S.onset)

# Convert to log scale (dB). We'll use the peak power as reference.
log_S = librosa.logamplitude(np.array(S_dat).T, ref_power=np.max)

# Make a new figure
plt.figure(figsize=(12, 4))

# Display the spectrogram on a mel scale
# sample rate and hop length parameters are used to render the time axis
librosa.display.specshow(log_S, x_axis='time', y_axis='mel')

# Put a descriptive title on the plot
plt.title('mel power spectrogram')

# draw a color bar
plt.colorbar(format='%+02.0f dB')

# Make the figure layout compact and save it
plt.tight_layout()
plt.savefig(save_path)
def analyze_frames(y, sr, debug=False): A = {} hop_length = 128 # First, get the track duration A['duration'] = float(len(y)) / sr # Then, get the beats if debug: print "> beat tracking" tempo, beats = librosa.beat.beat_track(y, sr, hop_length=hop_length) # Push the last frame as a phantom beat A['tempo'] = tempo A['beats'] = librosa.frames_to_time(beats, sr, hop_length=hop_length).tolist() if debug: print "beats count: ", len(A['beats']) if debug: print "> spectrogram" S = librosa.feature.melspectrogram(y, sr, n_fft=2048, hop_length=hop_length, n_mels=80, fmax=8000) S = S / S.max() # A['spectrogram'] = librosa.logamplitude(librosa.feature.sync(S, beats)**2).T.tolist() # Let's make some beat-synchronous mfccs if debug: print "> mfcc" S = librosa.feature.mfcc(S=librosa.logamplitude(S), n_mfcc=40) A['timbres'] = librosa.feature.sync(S, beats).T.tolist() if debug: print "timbres count: ", len(A['timbres']) # And some chroma if debug: print "> chroma" S = np.abs(librosa.stft(y, hop_length=hop_length)) # Grab the harmonic component H = librosa.decompose.hpss(S)[0] # H = librosa.hpss.hpss_median(S, win_P=31, win_H=31, p=1.0)[0] A['chroma'] = librosa.feature.sync(librosa.feature.chromagram(S=H, sr=sr), beats, aggregate=np.median).T.tolist() # Relative loudness S = S / S.max() S = S**2 if debug: print "> dists" dists = structure( np.vstack([np.array(A['timbres']).T, np.array(A['chroma']).T])) A['dense_dist'] = dists edge_lens = [ A["beats"][i] - A["beats"][i - 1] for i in xrange(1, len(A["beats"])) ] A["avg_beat_duration"] = np.mean(edge_lens) A["med_beat_duration"] = np.median(edge_lens) return A
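structure() is referenced above but not defined. A plausible stand-in (an assumption) builds a pairwise distance matrix over the stacked beat-synchronous features:

import numpy as np
from scipy.spatial.distance import pdist, squareform

def structure(F):
    # F: (features, beats); distances between beat columns
    return squareform(pdist(F.T, metric='sqeuclidean')).tolist()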
def CQT(y, sr=44100, cqt_hop=1024, seconds=2.0, n_bins=30, bins_per_octave=4,
        fmin=27.5, use_han=False):
    """
    Get the constant-q transform of the audio file.

    Takes ((seconds*sr)//cqt_hop) * cqt_hop sample long chunks of the audio
    file before doing the cqt computation. Hop length between these chunks is
    frame_length - cqt_hop, where frame_length is the size of the chunks of
    the audio file. These chunks are necessary because librosa's cqt function
    can only handle short duration audio files in a reasonable amount of time.

    Parameters
    ----------
    cqt_hop : integer. The hop length between adjacent frames when extracting
        the cqt feature.
    seconds : float. The time window to initially chunk the audio file into
        before feeding into the librosa cqt function.
    n_bins : integer. The number of cqt frequency bands to extract.
    bins_per_octave : integer. The number of cqt frequency bands that comprise
        an octave. The number of octaves is n_bins/float(bins_per_octave).
    fmin : integer. The lowest frequency in the range of frequencies covered
        by the constant q transform.
    use_han : boolean. True, window each frame with a hanning window before
        extracting CQT.

    Returns
    -------
    CQTlog : np.ndarray [shape=(n_bins, n)]
        The time series of the constant-q transform of the audio file.

    Notes
    -----
    As of 06/22/2016, librosa's util.frame() function already applies a
    hanning window.

    Examples
    --------
    >>> # Load a file
    >>> y, sr = librosa.load('file.mp3')
    >>> # Calculate the constant q transform of a time-series
    >>> CQTlog = extractor.CQT(y, sr=sr, ...)
    """
    # Cast to int so framing and padding receive integer sample counts.
    frame_length = int(seconds * sr)
    frame_length = (frame_length // cqt_hop) * cqt_hop
    frame_hop = frame_length - cqt_hop
    padded_y = np.append(y, np.zeros(frame_length))
    y_frames = librosa.util.frame(padded_y, frame_length=frame_length,
                                  hop_length=frame_hop)
    if use_han:
        han_win = signal.hanning(frame_length)

    CQT_frames = []
    for frame in range(y_frames.shape[1]):
        if not use_han:
            sig = y_frames[:, frame]
        else:
            sig = y_frames[:, frame] * han_win
        CQTf = np.abs(librosa.cqt(sig, sr=sr, n_bins=n_bins,
                                  hop_length=cqt_hop,
                                  bins_per_octave=bins_per_octave,
                                  fmin=fmin))
        CQT_frames.append(CQTf[:, 1:-1])
    CQT = np.hstack(CQT_frames)
    CQTlog = librosa.logamplitude(CQT**2, ref_power=np.max)
    return CQTlog