def add(self, name, parameters=None):
    """Add a processor to the processing chain.

    `name` can be (a) an already-initialised processor instance, (b) a dict
    containing a 'chain' key, (c) a list of such dicts, or (d) the name of a
    processor class as a str. String names are resolved first in
    dabstract.dataprocessor.processors and then, as a fallback, in
    os.environ['dabstract_CUSTOM_DIR'] + '.processors'.

    Arguments:
        name (processor/dict/list/str): processor (configuration) to add
        parameters (dict): init parameters when name is a str or a class

    Returns:
        self, so calls can be chained
    """
    # NOTE(fix): mutable default argument (dict()) replaced by None sentinel
    if parameters is None:
        parameters = {}
    if isinstance(name, processor):
        # already-initialised dabstract processor: use it directly
        self._info.append({
            'name': name.__class__.__name__,
            'parameters': name.__dict__
        })
        self._chain.append(name)
    elif isinstance(name, dict):
        # configuration dict: must contain a 'chain' entry
        assert 'chain' in name, 'Specify a chain in your configuration.'
        self.add(name['chain'])
    elif isinstance(name, list):
        # list of configuration dicts: add each item
        for item in name:
            self.add(**item)
    else:
        # NOTE(fix): was `name is not 'none'` — identity comparison with a
        # str literal is implementation-dependent; use equality instead
        if name != 'none':
            # get processor class
            if isinstance(name, str):
                module = safe_import_module(
                    'dabstract.dataprocessor.processors')
                if not hasattr(module, name):  # check customs
                    module = safe_import_module(
                        os.environ['dabstract_CUSTOM_DIR'] + '.processors')
                    assert hasattr(
                        module, name
                    ), 'Processor is not supported in both dabstract.dataprocessor.processors and dabstract.custom.processors. Please check'
                processor_class = getattr(module, name)
            else:
                processor_class = name
            # initialize
            self._info.append({'name': name, 'parameters': parameters})
            self._chain.append(processor_class(**parameters))
    return self
def add(
    self, name: Union[Processor, Dict, List, str], parameters: Dict = None
) -> tvProcessingChain:
    """Add a processor to the chain.

    `name` can be a Processor instance, a dict with a 'chain' key, a list of
    such dicts, the name of a processor class as a str (resolved in
    dabstract.dataprocessor.processors and in
    os.environ["dabstract_CUSTOM_DIR"] + ".processors"), a class to be
    initialised with `parameters`, a plain callable (wrapped in an
    ExternalProcessor), or None (no-op).

    Arguments:
        name: processor (configuration) to add
        parameters: init parameters when name is a str, a class or a callable

    Returns:
        self, so calls can be chained
    """
    # NOTE(fix): mutable default argument (dict()) replaced by None sentinel
    if parameters is None:
        parameters = {}
    # Add new processor
    if isinstance(name, Processor):
        # if it's a dabstract processor, directly use
        self._info.append({
            "name": name.__class__.__name__,
            "parameters": name.__dict__
        })
        self._chain.append(name)
    elif isinstance(name, dict):
        # if a dictionary, check if it matches the expected format
        assert "chain" in name, "Specify a chain in your configuration."
        self.add(name["chain"])
    elif isinstance(name, list):
        # if a list, iterate over it
        for item in name:
            self.add(**item)
    elif isinstance(name, str):
        # if str, search for processor in dabstract processors
        if name not in ("none", "None"):
            module = safe_import_module(
                "dabstract.dataprocessor.processors")
            if not hasattr(module, name):  # check customs
                module = safe_import_module(
                    os.environ["dabstract_CUSTOM_DIR"] + ".processors")
                assert hasattr(
                    module, name
                ), "Processor is not supported in both dabstract.dataprocessor.processors and dabstract.custom.processors. Please check"
            self.add(getattr(module, name)(**parameters))
    elif callable(name):
        if isinstance(name, type):
            # if it is a class to be initialised
            self.add(name(**parameters))
        else:
            # if it is some function which does y = f(x), wrap it in a dabstract processor
            self.add(ExternalProcessor(name, **parameters))
    elif name is None:
        # add None
        pass
    else:
        raise NotImplementedError(
            "Input that you provided does not work for ProcessingChain().")
    return self
def add_select(self, name, parameters=None, *arg, **kwargs):
    """Add a selection to the dataset.

    The selector can be given as (a) a function that does the selection,
    (b) a str referring to a function in dabstract.dataset.select or in
    os.environ["dabstract_CUSTOM_DIR"] / dataset / select.py, (c) a class
    to be initialised with `parameters`, or (d) directly as indices.
    A dict of the form {'name': ..., 'parameters': ...} is also accepted.

    Example:
        $ self.add_select(name, parameters=dict(), *arg, **kwargs)

    dabstract already has a set of build-in selectors in
    dabstract.dataset.select such that one can simply do:
        $ self.add_select(random_subsample, parameters=dict('ratio': 0.5))
    for random subsampling, and
        $ self.add_select(subsample_by_str, parameters=dict('key': ..., 'keep': ...))
    for selecting based on a key and a particular value.
    One can also use lambda functions such as:
        $ self.add_select((lambda x,k: x['data']['subdb'][k]))
    Or directly use indices:
        $ indices = np.array([0,1,2,3,4])
        $ self.add_select(indices)

    Arguments:
        name (function/str/indices): selector defined as a str (translated
            to fct internally) or function or indices
        parameters (dict): additional parameters in case name is a str to
            init the function/class
        arg/kwargs: additional param to provide to the function if needed
    """
    # NOTE(fix): mutable default argument (dict()) replaced by None sentinel
    if parameters is None:
        parameters = {}
    # get fct
    if isinstance(name, dict):
        if 'parameters' in name:
            parameters = name['parameters']
        if 'name' in name:
            name = name['name']
    if isinstance(name, str):
        module = selectm
        if not hasattr(module, name):
            module = safe_import_module(
                os.environ['dabstract_CUSTOM_DIR'] + '.dataset.select')
            # NOTE(fix): was `hasattr(module, selectm)` / `"Select " + selectm`,
            # which tested the module object itself and would TypeError when
            # building the message; check and report the requested name instead
            assert hasattr(module, name), \
                "Select " + name + " is not supported in both dabstract and custom selects. Please check"
        func = getattr(module, name)(**parameters)
    elif isinstance(name, type):
        # a class to be initialised with the given parameters
        func = name(**parameters)
    else:
        # a ready-made callable or plain indices
        func = name
    # apply selection: wrap every key of the current data in a SelectAbstract
    orig_data = copy.deepcopy(self._data)
    self._data = DictSeqAbstract()
    for key in orig_data.keys():
        self[key] = SelectAbstract(orig_data[key], func, *arg,
                                   eval_data=orig_data, **kwargs)
def set_xval(self, name, parameters=None, save_dir=None, overwrite=True):
    """Set the cross-validation folds.

    This works similarly to self.add_select(). `name` is either a str that
    refers to a function available in dabstract.dataset.xval OR in
    os.environ["dabstract_CUSTOM_DIR"] / dataset / xval.py, a class to be
    initialised with `parameters`, or a callable applied directly. When
    `save_dir` is given, the folds are pickled so the same xval can be reused
    across runs; `overwrite` controls whether an existing pickle is
    regenerated.

    Example:
        $ self.set_xval(group_random_kfold, parameters=dict('folds': 4, 'val_frac': 1/3, 'group_key': 'group'))
    for random crossvalidation with a group constraint, and
        $ self.set_xval(sequential_kfold, parameters=dict('folds': 4, 'val_frac': 1/3, 'group_key': 'group'))
    for sequential crossvalidation with a group constraint, and
        $ self.set_xval(stratified_kfold, parameters=dict('folds': 4, 'val_frac': 1/3))
    for stratified crossvalidation.

    Arguments:
        name (function/str): xval defined as a str (translated to fct
            internally) or function
        parameters (dict): additional parameters in case name is a str to
            init the function/class
        save_dir (str): filepath to where to pickle the xval folds
        overwrite (bool): overwrite the saved file

    Returns:
        dict of fold indices, minimally containing a 'test' key
    """
    assert name is not None
    # NOTE(fix): mutable default argument (dict()) replaced by None sentinel
    if parameters is None:
        parameters = {}
    test_only = np.array([k for k in self['test_only']])
    sel_vect_train = np.where((test_only == 0) & (test_only != -1))[0]
    sel_vect_test = np.where((test_only == 1) & (test_only != -1))[0]
    self_train = DataAbstract(SelectAbstract(self._data, sel_vect_train))
    # checks: reuse a previously saved xval unless overwriting
    get_xval = True
    if save_dir is not None:
        savefile_xval = os.path.join(save_dir, 'xval.pickle')
        if os.path.isfile(savefile_xval):
            get_xval = False
    # get
    if get_xval or overwrite:
        # get xval class
        if isinstance(name, str):
            module = xval
            if not hasattr(module, name):
                module = safe_import_module(
                    os.environ['dabstract_CUSTOM_DIR'] + '.dataset.xval')
                assert hasattr(module, name), \
                    "Xval " + name + " is not supported in both dabstract and custom xvals. Please check"
            func = getattr(module, name)(**parameters)
        elif isinstance(name, type):
            func = name(**parameters)
        elif callable(name):
            func = name
        else:
            # NOTE(fix): previously `func` stayed undefined for unsupported
            # inputs, raising a NameError below; fail explicitly instead
            raise NotImplementedError(
                "Input that you provided does not work for set_xval().")
        self.xval = func(self_train)
        assert 'test' in self.xval, "please return a dict with minimally a test key"
        if save_dir is not None:
            os.makedirs(os.path.split(savefile_xval)[0], exist_ok=True)
            with open(savefile_xval, 'wb') as f:
                pickle.dump(self.xval, f)
    elif save_dir is not None:
        with open(savefile_xval, "rb") as f:
            self.xval = pickle.load(f)  # load
    # sanity check
    keys = list(self.xval.keys())
    for key in keys:
        assert isinstance(self.xval[key], list), \
            'Crossvalidation indices should be formatted in a list (for each fold).'
        assert len(self.xval[keys[0]]) == len(self.xval[key]), \
            'Amount of folds (items in list) should be the same for each test phase (train/val/test).'
    # add other test data
    for k in range(len(self.xval['test'])):
        self.xval['test'][k] = np.append(self.xval['test'][k], sel_vect_test)
    # add info
    # NOTE(fix): only 'test' is guaranteed to exist (see assert above);
    # counting folds on 'train' could raise a KeyError
    self.xval['folds'] = len(self.xval['test'])
    return self.xval
def dataset_factory(name: (str, tvDataset, type) = None,
                    paths: Dict[str, str] = None,
                    xval: Optional[Dict[str, Union[str, int, Dict]]] = None,
                    split: Optional[Dict[str, Union[str, int, Dict]]] = None,
                    select: Optional[Dict[str, Union[str, int, Dict]]] = None,
                    test_only: Optional[bool] = 0,
                    **kwargs) -> tvDataset:
    """Dataset factory

    This function creates a dataset class from name and parameters.
    Specifically, this is used to search by name for that particular database
    class in

    - environment variable folder: os.environ["dabstract_CUSTOM_DIR"] = your_dir
    - dabstract.dataset.dbs folder

    If name is defined as a class object, than it uses this to init the
    dataset with the given kwargs. This function is mostly used by
    dataset_from_config(). One is advised to directly import the desired
    dataset class instead of using dataset_factory. This is only handy for
    configuration based experiments, which need a load from string.
    For example::

        $ data = dataset_factory(name='DCASE2020Task1B',
        $                        paths={'data': path_to_data,
        $                               'meta': path_to_meta,
        $                               'feat': path_to_feat},

    One is advised to check the examples in dabstract/examples/introduction
    on how to work with datasets.

    Parameters
    ----------
    select: Dict[str,Union[str,int, Dict]]
        selector configuration
    split: Dict[str,Union[str,int, Dict]]
        split configuration
    xval : Dict[str,Union[str,int, Dict]]
        xval configuration
    test_only : bool
        use the dataset for test (test_only=1) or both train and test (test_only=0)
    name : str/instance/object
        name of the class (or the class directly)
    paths : dict[str]
        dictionary containing paths to the data
    kwargs: ToDo, not defined as this should be used only by load_from_config()

    Returns
    -------
    dataset : Dataset class
    """
    from dabstract.dataset.dataset import Dataset

    # get dataset
    if isinstance(name, str):
        # get db class
        module = dbs
        if not hasattr(module, name):  # check customs
            module = safe_import_module(
                os.environ["dabstract_CUSTOM_DIR"] + ".dataset.dbs")
            assert hasattr(module, name), (
                "Database class is not supported in both dabstract.dataset.dbs "
                + os.environ["dabstract_CUSTOM_DIR"]
                + ".dataset.dbs. Please check")
        db = getattr(module, name)(paths=paths, test_only=test_only, **kwargs)
    elif isinstance(name, Dataset):
        db = name
    elif isinstance(name, type):
        try:
            db = name(paths=paths, test_only=test_only, **kwargs)
        except Exception as e:
            # NOTE(fix): was a bare `except:` which also swallowed
            # KeyboardInterrupt/SystemExit and hid the original error
            raise ValueError("Class is not a Dataset.") from e
    else:
        # NOTE(fix): previously `db` stayed undefined here, raising a
        # NameError further down; fail explicitly instead
        raise NotImplementedError(
            "name should be a str, a Dataset instance or a Dataset class.")
    # add other functionality
    if split is not None:
        if isinstance(split, (int, float)):
            db.add_split(split)
        elif isinstance(split, dict):
            db.add_split(**split)
        else:
            raise NotImplementedError
    if select is not None:
        if isinstance(select, list):
            for _select in select:
                db.add_select(_select)
        else:
            # NOTE(fix): a single (non-list) selector was silently ignored
            db.add_select(select)
    if xval is not None:
        db.set_xval(**xval)
    return db
def dataset_factory(name=None, paths=None, xval=None, select=None, test_only=0,
                    tmp_folder=None, **kwargs):
    """Dataset factory

    This function creates a dataset class from name and parameters.
    Specifically, this is used to search by name for that particular database
    class in

    - environment variable folder: os.environ["dabstract_CUSTOM_DIR"] = your_dir
    - dabstract.dataset.dbs folder

    If name is defined as a class object, than it uses this to init the
    dataset with the given kwargs. This function is mostly used by
    dataset_from_config(). One is advised to directly import the desired
    dataset class instead of using dataset_factory. This is only handy for
    configuration based experiments, which need a load from string.

    Example:
        $ data = dataset_factory(name='DCASE2020Task1B',
        $                        paths={'data': path_to_data,
        $                               'meta': path_to_meta,
        $                               'feat': path_to_feat},

    One is advised to check the examples in dabstract/examples/introduction
    on how to work with datasets.

    Arguments:
        name (str/class): name of the class (or the class directly)
        kwargs: ToDo, not defined as this should be used only by load_from_config()

    Returns:
        dataset class
    """
    # get dataset
    if isinstance(name, str):
        # get db class
        module = dbs
        if not hasattr(module, name):  # check customs
            module = safe_import_module(
                os.environ['dabstract_CUSTOM_DIR'] + '.dataset.dbs')
            assert hasattr(module, name), \
                'Database class is not supported in both dabstract.dataset.dbs ' + \
                os.environ['dabstract_CUSTOM_DIR'] + '.dataset.dbs. Please check'
        return getattr(module, name)(paths=paths, select=select,
                                     test_only=test_only, xval=xval,
                                     tmp_folder=tmp_folder, **kwargs)
    elif isinstance(name, DictSeqAbstract):
        # NOTE(fix): previously fell through with `pass`, implicitly
        # returning None; return the already-built dataset-like object
        return name
    # NOTE(fix): types.ClassType is Python 2 only and raises AttributeError
    # on Python 3; isinstance(name, type) covers classes
    elif isinstance(name, type):
        return name(paths=paths, select=select, test_only=test_only,
                    xval=xval, tmp_folder=tmp_folder, **kwargs)
def set_xval(
    self,
    name: Union[str, types.FunctionType, List[int], np.ndarray],
    parameters: Dict = None,
    save_path: str = None,
    overwrite: bool = True,
) -> Dict:
    """Set the cross-validation folds

    This function sets the crossvalidation folds. This works similar as with
    self.add_select(). You can either provide a name/parameters pair where
    name is a string that refers to a particular function available in either
    dabstract.dataset.xval OR os.environ["dabstract_CUSTOM_DIR"] / dataset /
    xval.py. The former is a build-in xval while the latter offers you to add
    a custom function, which might be added to dabstract later on if
    validated. An other option is to provide the function directly through
    'name'. Finally, it also offers to save your xval configuration such that
    it's identical to last experiment OR depending on where you save, use the
    same xval for different experiments.

    dabstract already has a set of build-in selectors in
    dabstract.dataset.xval such that one can simply do::

        $ self.set_xval(group_random_kfold, parameters=dict('folds': 4, 'val_frac': 1/3, 'group_key': 'group'))

    for random crossvalidation with a group constraint, and::

        $ self.set_xval(sequential_kfold, parameters=dict('folds': 4, 'val_frac': 1/3, 'group_key': 'group'))

    for sequential crossvalidation with a group constraint, and::

        $ self.set_xval(stratified_kfold, parameters=dict('folds': 4, 'val_frac': 1/3))

    for stratified crossvalidation.

    Parameters
    ----------
    name : Callable/xval_func/str
        xval defined as a str (translated to fct internally) or function
    parameters : dict
        additional parameters in case name is a str to init the function/class
    save_path : str
        filepath to where to pickle the xval folds
    overwrite : bool
        overwrite the saved file

    Returns
    -------
    dict of fold indices, minimally containing a 'test' key
    """
    assert name is not None
    # NOTE(fix): mutable default argument (dict()) replaced by None sentinel
    if parameters is None:
        parameters = {}
    test_only = np.array([k for k in self["test_only"]])
    sel_vect_train = np.where(test_only == 0)[0]
    sel_vect_test = np.where(test_only == 1)[0]
    self_train = Select(self._data, sel_vect_train)
    # checks: reuse a previously saved xval unless overwriting
    get_xval = True
    if save_path is not None:
        savefile_xval = os.path.join(save_path, "xval.pickle")
        if os.path.isfile(savefile_xval):
            get_xval = False
    # get
    if get_xval or overwrite:
        # get xval class
        if isinstance(name, str):
            module = xval
            if not hasattr(module, name):
                module = safe_import_module(
                    os.environ["dabstract_CUSTOM_DIR"] + ".dataset.xval"
                )
                assert hasattr(module, name), (
                    "Xval "
                    + name
                    + " is not supported in both dabstract and custom xvals. Please check"
                )
            func = getattr(module, name)(**parameters)
        elif isinstance(name, type):
            func = name(**parameters)
        elif isinstance(name, types.FunctionType):
            func = name
        else:
            # NOTE(fix): previously `func` stayed undefined for any other
            # input, raising a NameError below; fail explicitly instead
            raise NotImplementedError(
                "Input that you provided does not work for set_xval()."
            )
        self.xval = func(self_train)
        assert "test" in self.xval, "please return a dict with minimally a test key"
        if save_path is not None:
            os.makedirs(os.path.split(savefile_xval)[0], exist_ok=True)
            with open(savefile_xval, "wb") as f:
                pickle.dump(self.xval, f)
    elif save_path is not None:
        with open(savefile_xval, "rb") as f:
            self.xval = pickle.load(f)  # load
    # sanity check
    keys = list(self.xval.keys())
    for key in keys:
        assert isinstance(
            self.xval[key], list
        ), "Crossvalidation indices should be formatted in a list (for each fold)."
        assert len(self.xval[keys[0]]) == len(
            self.xval[key]
        ), "Amount of folds (items in list) should be the same for each test phase (train/val/test)."
    # update indices based on sel_vect_train
    # (the xval function only saw the train subset, so its indices are
    # relative to that subset and must be mapped back to the full dataset)
    for key in self.xval:
        for k in range(len(self.xval[key])):
            self.xval[key][k] = sel_vect_train[self.xval[key][k]]
    # add other test data
    for k in range(len(self.xval["test"])):
        self.xval["test"][k] = np.append(self.xval["test"][k], sel_vect_test)
    # add info
    self.xval["folds"] = len(self.xval["test"])
    return self.xval
def add_select(
    self,
    selector: Any,
    *arg,
    parameters: Optional[dict] = None,
    eval_data: Any = None,
    **kwargs
) -> None:
    """Add a selection to the dataset

    This function add a selector to the dataset. The input to this function
    can either be a function that does the selection or a name/parameter pair
    that is used to search for that function in dabstract.dataset.select AND
    in the specified os.environ["dabstract_CUSTOM_DIR"]. When defining custom
    selector functions, one can either provide this function directly OR
    place them in os.environ["dabstract_CUSTOM_DIR"] / dataset / select.py.
    Any usage for custom function uses the same directory structure as
    dabstract. Besides a function one can also directly provide indices.

    dabstract already has a set of build-in selectors in
    dabstract.dataset.select such that one can simply do::

        $ self.add_select(random_subsample, parameters=dict('ratio': 0.5))

    for random subsampling, and::

        $ self.add_select(subsample_by_str, parameters=dict('key': ..., 'keep': ...))

    for selecting based on a key and a particular value.
    One can also also use the lambda function such as::

        $ self.add_select((lambda x,k: x['data']['subdb'][k]))

    Or directly use indices such as::

        $ indices = np.array([0,1,2,3,4])
        $ self.add_select(indices)

    Parameters
    ----------
    selector : Callable/str/List[int]/np.ndarray
        selector defined as a str (translated to fct internally) or function
        or indices
    parameters : dict
        additional parameters in case name is a str to init the function/class
    eval_data : Any
        data which could be used to available selector on in case no indices
        but a function is used. Note that if no eval_data is selected it
        simply assumes the dataset itself to evaluate on.
    arg/kwargs:
        additional param to provide to the function if needed
    """
    # NOTE(fix): the default was the `dict` *type* (not an instance), which
    # made `**parameters` fail whenever the default was used
    if parameters is None:
        parameters = {}
    # get selector
    if isinstance(selector, dict):
        if "parameters" in selector:
            parameters = selector["parameters"]
        assert "name" in selector
        selector = selector["name"]
    if isinstance(selector, str):
        module = selectm
        if not hasattr(module, selector):
            module = safe_import_module(
                os.environ["dabstract_CUSTOM_DIR"] + ".dataset.select"
            )
            # NOTE(fix): was `hasattr(module, selectm)` with the module object
            # in the message (TypeError on str concatenation); use the
            # requested selector name instead
            assert hasattr(module, selector), (
                "Select "
                + selector
                + " is not supported in both dabstract and custom selects. Please check"
            )
        selector = getattr(module, selector)(**parameters)
    elif isinstance(selector, type):
        selector = selector(**parameters)
    # apply selection
    self._data.add_select(selector, *arg, eval_data=eval_data, **kwargs)