def process(self):
    """Run the segmentation on the PCP + MFCC features of the track.

    Returns
    -------
    est_idxs : np.array(N) or list
        Estimated segment boundaries, in frame indices.
        List if hierarchical segmentation.
    est_labels : np.array(N-1) or list
        Estimated labels for the segments.
        List if hierarchical segmentation.
    F : tuple
        The ``(pcp, mfcc)`` feature matrices, each transposed, exactly as
        they were handed to ``main.do_segmentation``.
    """
    # The algorithm only works on the PCP + MFCC combination, so both are
    # requested with the very same selection arguments.
    selector_args = (self.file_struct, self.annot_beats, self.framesync)
    pcp_obj = Features.select_features("pcp", *selector_args)
    mfcc_obj = Features.select_features("mfcc", *selector_args)

    # Both feature sets must share one time grid.
    frame_times = pcp_obj.frame_times
    assert np.array_equal(frame_times, mfcc_obj.frame_times)

    # The segmenter expects the transposed matrices, PCP first.
    F = (pcp_obj.features.T, mfcc_obj.features.T)

    est_idxs, est_labels = main.do_segmentation(
        F, frame_times, self.config, self.in_bound_idxs)
    return est_idxs, est_labels, F
def process(self):
    """Main process: segment the track from its PCP and MFCC features.

    Returns
    -------
    est_idxs : np.array(N) or list
        Estimated segment boundaries, in frame indices.
        List if hierarchical segmentation.
    est_labels : np.array(N-1) or list
        Estimated labels for the segments.
        List if hierarchical segmentation.
    F : tuple
        Transposed PCP and MFCC matrices (in that order) that were fed to
        the segmenter.
    """
    # Only this specific pair of features is supported by the algorithm.
    feature_names = ("pcp", "mfcc")
    feats = {}
    for feat_name in feature_names:
        feats[feat_name] = Features.select_features(
            feat_name, self.file_struct, self.annot_beats, self.framesync)

    # Sanity check: PCP and MFCC must be sampled at the same frame times.
    frame_times = feats["pcp"].frame_times
    assert np.array_equal(frame_times, feats["mfcc"].frame_times)

    # Transposed on purpose: that is the layout the segmenter works with.
    F = tuple(feats[feat_name].features.T for feat_name in feature_names)

    # Do the actual segmentation.
    boundaries, labels = main.do_segmentation(
        F, frame_times, self.config, self.in_bound_idxs)
    return boundaries, labels, F
def import_data(file_struct, rootpath, output_path, annot_beats):
    """Load the training data of one track, computing and caching it if needed.

    The result is cached as a pickle under ``<output_path>/features/``; when
    that file already exists it is loaded instead of recomputing anything.

    Arguments:
        file_struct  -- object exposing ``audio_file`` (path of the track);
                        also forwarded to ``features`` and
                        ``Features.select_features``
        rootpath     -- forwarded to ``get_annotation`` together with the
                        audio path
        output_path  -- directory in which the ``features`` cache folder lives
                        (created if missing)
        annot_beats  -- forwarded to the feature extractors and formatted with
                        ``%d`` into the cache file name

    Returns:
        dict with keys 'features', 'beats', 'filename', 'segment_times',
        'segment_labels' and 'segments', or None when either the features or
        the aligned segmentation could not be computed.
    """
    msaf.utils.ensure_dir(output_path)
    msaf.utils.ensure_dir(os.path.join(output_path, "features"))

    track_name = os.path.splitext(
        os.path.basename(file_struct.audio_file))[0]
    data_file = '%s/features/%s_annotbeatsE%d.pickle' % (
        output_path, track_name, annot_beats)

    if os.path.exists(data_file):
        # Pickles are binary data, so the file must be opened in binary mode.
        # NOTE(review): unpickling runs arbitrary code; this is only safe as
        # long as the cache directory holds files written by this function.
        with open(data_file, 'rb') as f:
            Data = pickle.load(f)
        # Single-argument form prints the same text on Python 2 and 3.
        print('%s cached!' % (file_struct.audio_file,))
        return Data

    X, _ = features(file_struct, annot_beats)
    if X is None:
        # Bail out before doing any further feature work.
        return None

    pcp_obj = Features.select_features(
        "pcp", file_struct, annot_beats, framesync=False)
    # Keep only as many time stamps as there are feature frames.
    B = pcp_obj.frame_times[:pcp_obj.features.shape[0]]

    Y, T, L = align_segmentation(
        get_annotation(file_struct.audio_file, rootpath),
        B,
        file_struct.audio_file)
    if Y is None:
        return None

    Data = {
        'features': X,
        'beats': B,
        'filename': file_struct.audio_file,
        'segment_times': T,
        'segment_labels': L,
        'segments': Y,
    }
    print('%s processed!' % (file_struct.audio_file,))

    with open(data_file, 'wb') as f:
        pickle.dump(Data, f)
    return Data
def import_data(file_struct, rootpath, output_path, annot_beats):
    """Return the (possibly cached) training data dictionary of one track.

    A pickle named ``<track>_annotbeatsE<annot_beats>.pickle`` inside
    ``<output_path>/features/`` acts as the cache: if it exists it is loaded,
    otherwise the data is computed and written there.

    Arguments:
        file_struct  -- object exposing ``audio_file`` (path of the track);
                        also handed to ``features`` and
                        ``Features.select_features``
        rootpath     -- handed to ``get_annotation`` along with the audio path
        output_path  -- base directory of the cache (created if missing)
        annot_beats  -- handed to the feature extractors; also part of the
                        cache file name (formatted with ``%d``)

    Returns:
        dict with keys 'features', 'beats', 'filename', 'segment_times',
        'segment_labels' and 'segments'; None if ``features`` yields no
        feature matrix or ``align_segmentation`` yields no segments.
    """
    msaf.utils.ensure_dir(output_path)
    msaf.utils.ensure_dir(os.path.join(output_path, "features"))

    audio_path = file_struct.audio_file
    base_name = os.path.splitext(os.path.basename(audio_path))[0]
    data_file = '%s/features/%s_annotbeatsE%d.pickle' % (
        output_path, base_name, annot_beats)

    if not os.path.exists(data_file):
        X, _ = features(file_struct, annot_beats)
        if X is None:
            # Nothing to align against; skip the remaining feature work.
            return None

        pcp_obj = Features.select_features(
            "pcp", file_struct, annot_beats, framesync=False)
        # Trim the time stamps to the number of feature frames.
        B = pcp_obj.frame_times[:pcp_obj.features.shape[0]]

        annotation = get_annotation(file_struct.audio_file, rootpath)
        Y, T, L = align_segmentation(annotation, B, file_struct.audio_file)
        if Y is None:
            return None

        Data = {'features': X,
                'beats': B,
                'filename': file_struct.audio_file,
                'segment_times': T,
                'segment_labels': L,
                'segments': Y}
        # Same output text as the original Python 2 print statement.
        print('%s processed!' % (file_struct.audio_file,))

        # pickle streams are bytes: write them in binary mode.
        with open(data_file, 'wb') as f:
            pickle.dump(Data, f)
        return Data

    # NOTE(review): pickle.load can execute arbitrary code; the cache folder
    # must only ever contain files produced by this function.
    with open(data_file, 'rb') as f:
        Data = pickle.load(f)
    print('%s cached!' % (file_struct.audio_file,))
    return Data
def features(file_struct, annot_beats=False, framesync=False):
    """Feature-extraction for audio segmentation.

    Arguments:
        file_struct -- msaf.io.FileStruct
            paths to the input files in the Segmentation dataset
        annot_beats -- forwarded to ``Features.select_features``
        framesync   -- forwarded to ``Features.select_features``

    Returns:
        - X -- ndarray, or None
            feature matrix with one column per frame, stacked row-wise as:
              MFCC
              Chroma (shifted, column-normalized, log-scaled)
              Latent timbre repetition
              Latent chroma repetition
              Frame times, and frame times divided by the duration
              Frame index, and frame index divided by the frame count
            None when a repetition feature could not be computed (no
            non-empty rows were left in the lag matrix).
        - dur -- float
            duration of the track in seconds
    """

    def compress_data(X, k):
        """Project X onto (at most) k leading eigenvectors of X X^T."""
        Xtemp = X.dot(X.T)
        if len(Xtemp) == 0:
            return None
        e_vals, e_vecs = np.linalg.eig(Xtemp)
        # X X^T is symmetric PSD in exact arithmetic; discard numerical
        # imaginary parts and negative eigenvalues.
        e_vals = np.maximum(0.0, np.real(e_vals))
        e_vecs = np.real(e_vecs)

        # Sort by decreasing eigenvalue.
        idx = np.argsort(e_vals)[::-1]
        e_vals = e_vals[idx]
        e_vecs = e_vecs[:, idx]

        # Truncate to k dimensions
        if k < len(e_vals):
            e_vals = e_vals[:k]
            e_vecs = e_vecs[:, :k]

        # Normalize by the leading singular value of X
        Z = np.sqrt(e_vals.max())
        if Z > 0:
            e_vecs = e_vecs / Z
        return e_vecs.T.dot(X)

    def repetition(X, metric='euclidean'):
        """Latent factor repetition features of X (None if degenerate)."""
        R = librosa.segment.recurrence_matrix(
            X,
            k=2 * int(np.ceil(np.sqrt(X.shape[1]))),
            width=REP_WIDTH,
            metric=metric,
            sym=False).astype(np.float32)
        P = scipy.signal.medfilt2d(
            librosa.segment.recurrence_to_lag(R), [1, REP_FILTER])
        # Discard empty rows.
        # This should give an equivalent SVD, but resolves some numerical
        # instabilities.
        P = P[P.any(axis=1)]
        return compress_data(P, N_REP)

    pcp_obj = Features.select_features(
        "pcp", file_struct, annot_beats, framesync)
    mfcc_obj = Features.select_features(
        "mfcc", file_struct, annot_beats, framesync)
    chroma = pcp_obj.features
    mfcc = mfcc_obj.features
    beats = pcp_obj.frame_times
    dur = pcp_obj.dur

    # One time stamp per chroma frame.
    B = beats[:chroma.shape[0]]

    M = mfcc.T

    # Shift, normalize and log-scale the chroma. `chroma.T` is a view of the
    # feature object's own array, so the shift is done out-of-place to avoid
    # silently modifying `pcp_obj.features`.
    C = chroma.T
    C = C + (C.min() + 0.1)
    C = C / C.max(axis=0)
    # Original author's note: "Normalize from -80 to 0".
    C = 80 * np.log10(C)

    # Time-stamp features
    N = np.arange(float(chroma.shape[0]))

    # TODO: This might fail if audio file (or number of beats) is too small
    R_timbre = repetition(librosa.feature.stack_memory(M))
    R_chroma = repetition(librosa.feature.stack_memory(C))
    if R_timbre is None or R_chroma is None:
        return None, dur

    # These arrays are local, so in-place scaling is safe here.
    R_timbre += R_timbre.min()
    R_timbre /= R_timbre.max()
    R_chroma += R_chroma.min()
    R_chroma /= R_chroma.max()

    # Stack it all up
    X = np.vstack([M, C, R_timbre, R_chroma,
                   B, B / dur,
                   N, N / float(chroma.shape[0])])
    return X, dur
def features(file_struct, annot_beats=False, framesync=False):
    """Feature-extraction for audio segmentation.

    Arguments:
        file_struct -- msaf.io.FileStruct
            paths to the input files in the Segmentation dataset
        annot_beats -- forwarded to ``Features.select_features``
        framesync   -- forwarded to ``Features.select_features``

    Returns:
        - X -- ndarray, or None
            feature matrix with one column per frame, stacked row-wise as:
              MFCC
              Chroma (shifted, column-normalized, log-scaled)
              Latent timbre repetition
              Latent chroma repetition
              Frame times, and frame times divided by the duration
              Frame index, and frame index divided by the frame count
            None when a repetition feature could not be computed (no
            non-empty rows were left in the lag matrix).
        - dur -- float
            duration of the track in seconds
    """

    def compress_data(X, k):
        """Project X onto (at most) k leading eigenvectors of X X^T."""
        Xtemp = X.dot(X.T)
        if len(Xtemp) == 0:
            return None
        e_vals, e_vecs = np.linalg.eig(Xtemp)
        # Drop numerical imaginary parts / negative eigenvalues.
        e_vals = np.maximum(0.0, np.real(e_vals))
        e_vecs = np.real(e_vecs)

        # Largest eigenvalues first.
        idx = np.argsort(e_vals)[::-1]
        e_vals = e_vals[idx]
        e_vecs = e_vecs[:, idx]

        # Truncate to k dimensions
        if k < len(e_vals):
            e_vals = e_vals[:k]
            e_vecs = e_vecs[:, :k]

        # Normalize by the leading singular value of X
        Z = np.sqrt(e_vals.max())
        if Z > 0:
            e_vecs = e_vecs / Z
        return e_vecs.T.dot(X)

    def repetition(X, metric='euclidean'):
        """Latent factor repetition features of X (None if degenerate)."""
        R = librosa.segment.recurrence_matrix(
            X,
            k=2 * int(np.ceil(np.sqrt(X.shape[1]))),
            width=REP_WIDTH,
            metric=metric,
            sym=False).astype(np.float32)
        # NOTE(review): librosa renamed `structure_feature` to
        # `recurrence_to_lag`; prefer the new name when the installed
        # release has it and fall back to the old one otherwise.
        to_lag = getattr(librosa.segment, 'recurrence_to_lag', None)
        if to_lag is None:
            to_lag = librosa.segment.structure_feature
        P = scipy.signal.medfilt2d(to_lag(R), [1, REP_FILTER])
        # Discard empty rows.
        # This should give an equivalent SVD, but resolves some numerical
        # instabilities.
        P = P[P.any(axis=1)]
        return compress_data(P, N_REP)

    pcp_obj = Features.select_features(
        "pcp", file_struct, annot_beats, framesync)
    mfcc_obj = Features.select_features(
        "mfcc", file_struct, annot_beats, framesync)
    chroma = pcp_obj.features
    mfcc = mfcc_obj.features
    beats = pcp_obj.frame_times
    dur = pcp_obj.dur

    # One time stamp per chroma frame.
    B = beats[:chroma.shape[0]]

    M = mfcc.T

    # Shift, normalize and log-scale the chroma. The shift is applied
    # out-of-place because `chroma.T` is a view: an in-place `+=` would
    # write straight into `pcp_obj.features`.
    C = chroma.T
    C = C + (C.min() + 0.1)
    C = C / C.max(axis=0)
    # Original author's note: "Normalize from -80 to 0".
    C = 80 * np.log10(C)

    # Time-stamp features
    N = np.arange(float(chroma.shape[0]))

    # TODO: This might fail if audio file (or number of beats) is too small
    R_timbre = repetition(librosa.feature.stack_memory(M))
    R_chroma = repetition(librosa.feature.stack_memory(C))
    if R_timbre is None or R_chroma is None:
        return None, dur

    # Local arrays: in-place scaling cannot leak outside this function.
    R_timbre += R_timbre.min()
    R_timbre /= R_timbre.max()
    R_chroma += R_chroma.min()
    R_chroma /= R_chroma.max()

    # Stack it all up
    X = np.vstack(
        [M, C, R_timbre, R_chroma, B, B / dur, N,
         N / float(chroma.shape[0])])
    return X, dur