def run_algorithms(audio_file, boundaries_id, labels_id, config,
                   annotator_id=0):
    """Runs the algorithms with the specified identifiers on the audio_file.

    Parameters
    ----------
    audio_file: str
        Path to the audio file to segment.
    boundaries_id: str
        Identifier of the boundaries algorithm to use ("gt" for ground
        truth).
    labels_id: str
        Identifier of the labels algorithm to use (None for not labeling).
    config: dict
        Dictionary containing the custom parameters of the algorithms to use.
    annotator_id: int
        Annotator identifier in the ground truth.

    Returns
    -------
    est_times: np.array or list
        List of estimated times for the segment boundaries.
        If `list`, it will be a list of np.arrays, sorted by segmentation
        layer.
    est_labels: np.array or list
        List of all the labels associated with the segments.
        If `list`, it will be a list of np.arrays, sorted by segmentation
        layer.
    """
    # Features should have already been computed, let's read them
    features = io.get_features(audio_file, config["annot_beats"],
                               config["framesync"])
    config["features"] = features

    # Check that there are enough audio frames
    if features["hpcp"].shape[0] <= msaf.minimum_frames:
        logging.warning("Audio file too short, or too few beats "
                        "estimated. Returning empty estimations.")
        return np.asarray([0, features["anal"]["dur"]]), \
            np.asarray([0], dtype=int)

    # Get the corresponding modules
    bounds_module = get_boundaries_module(boundaries_id)
    labels_module = get_labels_module(labels_id)

    # Get the correct frame times
    frame_times = features["beats"]
    if config["framesync"]:
        frame_times = utils.get_time_frames(features["anal"]["dur"],
                                            features["anal"])

    # Segment audio based on type of segmentation
    run_fun = run_hierarchical if config["hier"] else run_flat
    est_times, est_labels = run_fun(audio_file, bounds_module, labels_module,
                                    frame_times, config, annotator_id)

    return est_times, est_labels
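# A minimal usage sketch (not part of the original source), assuming this
# module's imports and MSAF's registered algorithm ids. "sf" and "fmc2d"
# are example boundaries/labels identifiers; the config keys mirror the
# ones read by `run_algorithms` above. This illustrates the call shape
# only; `msaf.process` is the usual public entry point.
def _example_run_algorithms():
    config = {
        "annot_beats": False,  # use estimated rather than annotated beats
        "framesync": False,    # beat-synchronous features
        "hier": False,         # flat (non-hierarchical) segmentation
    }
    est_times, est_labels = run_algorithms(
        "datasets/Example/audio/track01.mp3",  # illustrative path
        boundaries_id="sf", labels_id="fmc2d",
        config=config, annotator_id=0)
    return est_times, est_labels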
def _preprocess(self, valid_features=["hpcp", "tonnetz", "mfcc"],
                normalize=True):
    """This method obtains the actual features, their frame times, and the
    boundary indices in these features, if needed."""
    # Read features
    if self.features is None:
        # Features stored in a json file
        self.hpcp, self.mfcc, self.tonnetz, beats, dur, anal = \
            io.get_features(self.audio_file, annot_beats=self.annot_beats,
                            framesync=self.framesync)
    else:
        # Features passed as parameters
        feat_prefix = ""
        if not self.framesync:
            feat_prefix = "bs_"
        self.hpcp = self.features["%shpcp" % feat_prefix]
        self.mfcc = self.features["%smfcc" % feat_prefix]
        self.tonnetz = self.features["%stonnetz" % feat_prefix]
        beats = self.features["beats"]
        dur = self.features["anal"]["dur"]
        anal = self.features["anal"]

    # Store analysis parameters
    self.anal = anal

    # Use correct frames to find times
    frame_times = beats
    if self.framesync:
        frame_times = U.get_time_frames(dur, anal)

    # Read input bounds if necessary
    bound_idxs = None
    if self.in_bound_times is not None:
        bound_idxs = io.align_times(self.in_bound_times, frame_times)
        bound_idxs = np.unique(bound_idxs)

    # Use specific feature
    if self.feature_str not in valid_features:
        raise RuntimeError("Feature %s is not valid for algorithm: %s "
                           "(valid features are %s)." %
                           (self.feature_str, __name__, valid_features))
    else:
        try:
            # Attribute lookup by name; safer than eval()
            F = getattr(self, self.feature_str)
        except AttributeError:
            raise RuntimeError("Feature %s is not supported by MSAF" %
                               self.feature_str)

    # Normalize if needed
    if normalize:
        F = U.lognormalize_chroma(F)

    return F, frame_times, dur, bound_idxs
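# A self-contained sketch (hypothetical class and helper, not MSAF code)
# of the design choice in `_preprocess` above: looking a feature up by
# name with `getattr` instead of `eval("self." + name)`, which would
# evaluate arbitrary code if `name` were ever attacker-controlled.
import numpy as np

class FeatureBag(object):
    def __init__(self):
        self.hpcp = np.zeros((100, 12))
        self.mfcc = np.zeros((100, 13))

def pick_feature(obj, name, valid=("hpcp", "tonnetz", "mfcc")):
    if name not in valid:
        raise RuntimeError("Feature %s is not valid." % name)
    # Equivalent to eval("obj." + name) for plain attributes, but safe
    return getattr(obj, name)

# e.g. pick_feature(FeatureBag(), "hpcp").shape -> (100, 12)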
def run_algorithms(audio_file, boundaries_id, labels_id, config,
                   annotator_id=0):
    """Runs the algorithms with the specified identifiers on the audio_file.

    Parameters
    ----------
    audio_file: str
        Path to the audio file to segment.
    boundaries_id: str
        Identifier of the boundaries algorithm to use ("gt" for ground
        truth).
    labels_id: str
        Identifier of the labels algorithm to use (None for not labeling).
    config: dict
        Dictionary containing the custom parameters of the algorithms to use.
    annotator_id: int
        Annotator identifier in the ground truth.

    Returns
    -------
    est_times: np.array or list
        List of estimated times for the segment boundaries.
        If `list`, it will be a list of np.arrays, sorted by segmentation
        layer.
    est_labels: np.array or list
        List of all the labels associated with the segments.
        If `list`, it will be a list of np.arrays, sorted by segmentation
        layer.
    """
    # At this point, features should have already been computed
    hpcp, mfcc, tonnetz, cqt, gmt, beats, dur, anal = \
        io.get_features(audio_file, config["annot_beats"],
                        config["framesync"],
                        pre_features=config["features"])

    # Check that there are enough audio frames
    if hpcp.shape[0] <= msaf.minimum_frames:
        logging.warning("Audio file too short, or too few beats "
                        "estimated. Returning empty estimations.")
        return np.asarray([0, dur]), np.asarray([0], dtype=int)

    # Get the corresponding modules
    bounds_module = get_boundaries_module(boundaries_id)
    labels_module = get_labels_module(labels_id)

    # Get the correct frame times
    frame_times = beats
    if config["framesync"]:
        frame_times = utils.get_time_frames(dur, anal)

    # Segment audio based on type of segmentation
    if config["hier"]:
        # Hierarchical segmentation
        if bounds_module is None:
            raise RuntimeError("A boundary algorithm is needed when using "
                               "hierarchical segmentation.")
        if labels_module is not None and \
                bounds_module.__name__ != labels_module.__name__:
            raise RuntimeError("The same algorithm for boundaries and labels "
                               "is needed when using hierarchical "
                               "segmentation.")
        S = bounds_module.Segmenter(audio_file, **config)
        est_idxs, est_labels = S.processHierarchical()

        # Make sure the first and last boundaries are included for each
        # level in the hierarchy
        est_times = []
        cleaned_est_labels = []
        for level in range(len(est_idxs)):
            est_level_times, est_level_labels = \
                utils.process_segmentation_level(est_idxs[level],
                                                 est_labels[level],
                                                 hpcp.shape[0],
                                                 frame_times, dur)
            est_times.append(est_level_times)
            cleaned_est_labels.append(est_level_labels)
        est_labels = cleaned_est_labels
    else:
        # Flat segmentation
        # Segment using the specified boundaries and labels

        # Case when boundaries and labels algorithms are the same
        if bounds_module is not None and labels_module is not None and \
                bounds_module.__name__ == labels_module.__name__:
            S = bounds_module.Segmenter(audio_file, **config)
            est_idxs, est_labels = S.processFlat()
        # Different boundary and label algorithms
        else:
            # Identify segment boundaries
            if bounds_module is not None:
                S = bounds_module.Segmenter(audio_file, in_labels=[],
                                            **config)
                est_idxs, est_labels = S.processFlat()
            else:
                try:
                    est_times, est_labels = io.read_references(
                        audio_file, annotator_id=annotator_id)
                    est_idxs = io.align_times(est_times, frame_times[:-1])
                    if est_idxs[0] != 0:
                        est_idxs = np.concatenate(([0], est_idxs))
                    if est_idxs[-1] != hpcp.shape[0] - 1:
                        est_idxs = np.concatenate((est_idxs,
                                                   [hpcp.shape[0] - 1]))
                except Exception:
                    logging.warning("No references found for file: %s" %
                                    audio_file)
                    return [], []

            # Label segments
            if labels_module is not None:
                if len(est_idxs) == 2:
                    est_labels = np.array([0])
                else:
                    S = labels_module.Segmenter(audio_file,
                                                in_bound_idxs=est_idxs,
                                                **config)
                    est_labels = S.processFlat()[1]

        # Make sure the first and last boundaries are included
        est_times, est_labels = utils.process_segmentation_level(
            est_idxs, est_labels, hpcp.shape[0], frame_times, dur)

    return est_times, est_labels
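# A small self-contained sketch of the boundary-padding pattern used in
# the flat branch above: guaranteeing that frame index 0 and the last
# frame index always appear in the estimated boundary indices.
# `pad_boundaries` is a hypothetical helper, not an MSAF function.
import numpy as np

def pad_boundaries(est_idxs, n_frames):
    est_idxs = np.asarray(est_idxs, dtype=int)
    if len(est_idxs) == 0 or est_idxs[0] != 0:
        est_idxs = np.concatenate(([0], est_idxs))
    if est_idxs[-1] != n_frames - 1:
        est_idxs = np.concatenate((est_idxs, [n_frames - 1]))
    return est_idxs

# e.g. pad_boundaries([10, 50], 100) -> array([ 0, 10, 50, 99])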
def get_features(audio_path, annot_beats=False, framesync=False):
    """
    Gets the features of an audio file given its path.

    Parameters
    ----------
    audio_path: str
        Path to the audio file.
    annot_beats: bool
        Whether to use annotated beats or not.
    framesync: bool
        Whether to use framesync features or not.

    Returns
    -------
    features : dict
        A dictionary with the following keys:
            "hpcp": np.array((N, 12)), Chromagram
            "mfcc": np.array((N, 13)), MFCC
            "tonnetz": np.array((N, 6)), Tonnetz
            "cqt": np.array((N, msaf.Anal.cqt_bins)), Constant-Q transform
            "beats": np.array(T), Beats in seconds
            "anal": dict, Parameters of analysis of track
                    (e.g. sampling rate)
    """
    # Dataset path
    ds_path = os.path.dirname(os.path.dirname(audio_path))

    # Read Estimations
    features_path = os.path.join(
        ds_path, msaf.Dataset.features_dir,
        os.path.basename(audio_path)[:-4] + msaf.Dataset.features_ext)
    with open(features_path, "r") as f:
        feats = json.load(f)

    # Beat Synchronous Feats
    if framesync:
        feat_str = "framesync"
        beats = None
    else:
        if annot_beats:
            # Read references
            try:
                annotation_path = os.path.join(
                    ds_path, msaf.Dataset.references_dir,
                    os.path.basename(audio_path)[:-4] +
                    msaf.Dataset.references_ext)
                # TODO: Better exception handling
                jam = jams.load(annotation_path, validate=False)
            except Exception:
                raise RuntimeError("No references found in file %s" %
                                   annotation_path)
            feat_str = "ann_beatsync"
            beats = []
            beat_data = jam.beats[0].data
            if beat_data == []:
                raise ValueError
            for data in beat_data:
                beats.append(data.time.value)
            beats = np.unique(beats)
        else:
            feat_str = "est_beatsync"
            beats = np.asarray(feats["beats"]["times"])

    # Build actual features dictionary
    features = {}
    features["hpcp"] = np.asarray(feats[feat_str]["hpcp"])
    features["mfcc"] = np.asarray(feats[feat_str]["mfcc"])
    features["tonnetz"] = np.asarray(feats[feat_str]["tonnetz"])
    features["cqt"] = np.asarray(feats[feat_str]["cqt"])
    features["beats"] = np.asarray(feats["beats"]["times"])
    features["anal"] = feats["analysis"]

    # Frame times might be shorter than the actual number of features.
    if framesync:
        frame_times = utils.get_time_frames(features["anal"]["dur"],
                                            features["anal"])
        features["hpcp"] = features["hpcp"][:len(frame_times)]
        features["mfcc"] = features["mfcc"][:len(frame_times)]
        features["tonnetz"] = features["tonnetz"][:len(frame_times)]
        features["cqt"] = features["cqt"][:len(frame_times)]

    return features
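# A hedged usage sketch for the dict-based `get_features` above, assuming
# the standard MSAF dataset layout (features/ and references/ folders next
# to audio/ under the dataset root). The path is illustrative only.
def _example_dict_features():
    feats = get_features("datasets/Example/audio/track01.mp3",
                         annot_beats=False, framesync=True)
    # With framesync=True, every feature matrix is trimmed to the same
    # number of frames, so the time axes stay aligned.
    assert feats["hpcp"].shape[0] == feats["mfcc"].shape[0]
    return feats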
def get_features(audio_path, annot_beats=False, framesync=False,
                 pre_features=None):
    """
    Gets the features of an audio file given its path.

    Parameters
    ----------
    audio_path: str
        Path to the audio file.
    annot_beats: bool
        Whether to use annotated beats or not.
    framesync: bool
        Whether to use framesync features or not.
    pre_features: dict
        Pre-computed features as a dictionary.
        `None` for reading them from the json file.

    Returns
    -------
    C: np.array((N, 12))
        (Beat-sync) Chromagram
    M: np.array((N, 13))
        (Beat-sync) MFCC
    T: np.array((N, 6))
        (Beat-sync) Tonnetz
    cqt: np.array((N, msaf.Anal.cqt_bins))
        (Beat-sync) Constant-Q transform
    G: np.array
        (Beat-sync) Gammatone features
    beats: np.array(T)
        Beats in seconds
    dur: float
        Song duration
    analysis: dict
        Parameters of analysis of track (e.g. sampling rate)
    """
    if pre_features is None:
        # Dataset path
        ds_path = os.path.dirname(os.path.dirname(audio_path))

        # Read Estimations
        features_path = os.path.join(
            ds_path, msaf.Dataset.features_dir,
            os.path.basename(audio_path)[:-4] + msaf.Dataset.features_ext)
        with open(features_path, "r") as f:
            feats = json.load(f)

        # Beat Synchronous Feats
        if framesync:
            feat_str = "framesync"
            beats = None
        else:
            if annot_beats:
                # Read references
                try:
                    annotation_path = os.path.join(
                        ds_path, msaf.Dataset.references_dir,
                        os.path.basename(audio_path)[:-4] +
                        msaf.Dataset.references_ext)
                    jam = jams2.load(annotation_path)
                except Exception:
                    raise RuntimeError("No references found in file %s" %
                                       annotation_path)
                feat_str = "ann_beatsync"
                beats = []
                beat_data = jam.beats[0].data
                if beat_data == []:
                    raise ValueError
                for data in beat_data:
                    beats.append(data.time.value)
                beats = np.unique(beats)
            else:
                feat_str = "est_beatsync"
                beats = np.asarray(feats["beats"]["times"])

        C = np.asarray(feats[feat_str]["hpcp"])
        M = np.asarray(feats[feat_str]["mfcc"])
        T = np.asarray(feats[feat_str]["tonnetz"])
        cqt = np.asarray(feats[feat_str]["cqt"])
        # Mi: added the Gammatone features
        G = np.asarray(feats[feat_str]["gmt"])
        analysis = feats["analysis"]
        dur = analysis["dur"]

        # Frame times might be shorter than the actual number of features.
        if framesync:
            frame_times = utils.get_time_frames(dur, analysis)
            C = C[:len(frame_times)]
            M = M[:len(frame_times)]
            T = T[:len(frame_times)]
            cqt = cqt[:len(frame_times)]
            G = G[:len(frame_times)]
    else:
        feat_prefix = ""
        if not framesync:
            feat_prefix = "bs_"
        C = pre_features["%shpcp" % feat_prefix]
        M = pre_features["%smfcc" % feat_prefix]
        T = pre_features["%stonnetz" % feat_prefix]
        cqt = pre_features["%scqt" % feat_prefix]
        G = pre_features["%sgmt" % feat_prefix]
        beats = pre_features["beats"]
        dur = pre_features["anal"]["dur"]
        analysis = pre_features["anal"]

    return C, M, T, cqt, G, beats, dur, analysis
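# A hedged sketch contrasting the two `get_features` signatures in this
# section: the tuple-returning variant above is unpacked positionally (and
# includes the Gammatone matrix G), whereas the dict variant keys features
# by name. The path is illustrative; `pre_features` may be a dict of
# already-computed features with "bs_"-prefixed keys for beat-sync data.
def _example_tuple_features(pre_features=None):
    C, M, T, cqt, G, beats, dur, analysis = get_features(
        "datasets/Example/audio/track01.mp3",
        annot_beats=False, framesync=False,
        pre_features=pre_features)
    # Beat-synchronous features: one row per beat interval
    return C.shape, G.shape, dur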