def add(self, name, parameters=None):
    """Append one or more processors to the chain.

    What is added depends on the type of ``name``:

    * ``processor`` instance: appended to the chain as is; its class name
      and ``__dict__`` are recorded in ``self._info``.
    * ``dict``: must contain a ``'chain'`` key, whose value is added
      recursively.
    * ``list``: every item is unpacked as keyword arguments of ``add``.
    * the string ``'none'``: nothing is added.
    * any other ``str``: resolved to an attribute of
      ``dabstract.dataprocessor.processors`` or, if absent there, of the
      ``processors`` module below ``os.environ['dabstract_CUSTOM_DIR']``,
      and then called with ``parameters``.
    * anything else: called directly with ``parameters``.

    Arguments:
        name (processor/dict/list/str/class): what to add, see above
        parameters (dict): keyword arguments used to build the processor
            when ``name`` is a str or a class. ``None`` means no arguments.
    Returns:
        self, so that calls can be chained
    """
    # Compare the 'none' sentinel by value. An identity test (``is not``)
    # against a string literal only works when both strings happen to be
    # the very same interned object.
    if isinstance(name, str) and name == 'none':
        return self

    # Use a fresh dict per call: it is stored in self._info below, so a
    # shared default instance would be aliased by every recorded entry.
    if parameters is None:
        parameters = {}

    if isinstance(name, processor):
        self._info.append({
            'name': name.__class__.__name__,
            'parameters': name.__dict__
        })
        self._chain.append(name)
    elif isinstance(name, dict):
        assert 'chain' in name, 'Specify a chain in your configuration.'
        self.add(name['chain'])
    elif isinstance(name, list):
        for item in name:
            self.add(**item)
    else:
        # get processor class
        if isinstance(name, str):
            module = safe_import_module(
                'dabstract.dataprocessor.processors')
            if not hasattr(module, name):  # check customs
                module = safe_import_module(
                    os.environ['dabstract_CUSTOM_DIR'] + '.processors')
                assert hasattr(
                    module, name
                ), 'Processor is not supported in both dabstract.dataprocessor.processors and dabstract.custom.processors. Please check'
            processor_class = getattr(module, name)
        else:
            processor_class = name
        # initialize
        self._info.append({'name': name, 'parameters': parameters})
        self._chain.append(processor_class(**parameters))
    return self
 def add(
     self,
     name: Union[Processor, Dict, List, str],
     parameters: Dict = dict()
 ) -> tvProcessingChain:
     """Add one or more processing steps to the chain and return the chain.

     ``name`` may be a Processor instance, a configuration dict holding a
     "chain" key, a list of keyword-argument dicts, the name of a processor,
     a class, any other callable, or None. ``parameters`` are the keyword
     arguments used whenever something still has to be instantiated.
     """
     # A ready-made processor instance: record it and store it as is.
     if isinstance(name, Processor):
         self._info.append({
             "name": name.__class__.__name__,
             "parameters": name.__dict__
         })
         self._chain.append(name)
         return self

     # A configuration dict: the actual content sits under "chain".
     if isinstance(name, dict):
         assert "chain" in name, "Specify a chain in your configuration."
         self.add(name["chain"])
         return self

     # A list: every entry is unpacked as keyword arguments of add().
     if isinstance(name, list):
         for entry in name:
             self.add(**entry)
         return self

     # A string: "none"/"None" adds nothing, anything else is looked up in
     # the built-in processors first and in the custom directory second.
     if isinstance(name, str):
         if name in ("none", "None"):
             return self
         source = safe_import_module("dabstract.dataprocessor.processors")
         if not hasattr(source, name):  # check customs
             source = safe_import_module(
                 os.environ["dabstract_CUSTOM_DIR"] + ".processors")
             assert hasattr(
                 source, name
             ), "Processor is not supported in both dabstract.dataprocessor.processors and dabstract.custom.processors. Please check"
         processor_cls = getattr(source, name)
         self.add(processor_cls(**parameters))
         return self

     # A class is instantiated; any other callable is wrapped in an
     # ExternalProcessor before being added.
     if callable(name):
         if isinstance(name, type):
             step = name(**parameters)
         else:
             step = ExternalProcessor(name, **parameters)
         self.add(step)
         return self

     # None is accepted and adds nothing.
     if name is None:
         return self

     raise NotImplementedError(
         "Input that you provided does not work for ProcessingChain().")
    def add_select(self, name, parameters=None, *arg, **kwargs):
        """Add a selection to the dataset

        This function adds a selector to the dataset. The input can either be a function that does the
        selection or a name/parameter pair that is used to search for that function in the built-in select module
        AND in os.environ["dabstract_CUSTOM_DIR"] + '.dataset.select'. When defining custom selector functions, one can
        either provide the function directly OR place it in os.environ["dabstract_CUSTOM_DIR"] / dataset / select.py.

        Besides a function one can also directly provide indices. Anything that is neither a dict, a str nor a
        class is handed to SelectAbstract unchanged.

        Example:
            Look up a selector by name and initialise it with parameters:
            $  self.add_select('random_subsample', parameters=dict(ratio=0.5))
            $  self.add_select('subsample_by_str', parameters=dict(key=..., keep=...))
            The same as a configuration dict:
            $  self.add_select({'name': 'random_subsample', 'parameters': {'ratio': 0.5}})
            One can also use a lambda function such as:
            $  self.add_select((lambda x,k: x['data']['subdb'][k]))
            Or directly use indices:
            $  indices = np.array([0, 1, 2, 3, 4])
            $  self.add_select(indices)

        Arguments:
            name (function/str/dict/class/indices): selector. A str is looked up and called with ``parameters``,
                a class is called with ``parameters``, a dict may hold 'name' and 'parameters' keys.
            parameters (dict): keyword arguments used to initialise the selector when name is a str or class.
                ``None`` means no arguments.
            arg/kwargs: additional arguments forwarded to SelectAbstract
        """

        # a configuration dict may carry both the selector and its parameters
        if isinstance(name, dict):
            if 'parameters' in name:
                parameters = name['parameters']
            if 'name' in name:
                name = name['name']
        if parameters is None:
            parameters = {}

        # get fct
        if isinstance(name, str):
            module = selectm
            if not hasattr(module, name):
                module = safe_import_module(os.environ['dabstract_CUSTOM_DIR'] + '.dataset.select')
                # check for the requested selector name (not for the built-in module object)
                assert hasattr(module, name), \
                    "Select " + name + " is not supported in both dabstract and custom selects. Please check"
            func = getattr(module, name)(**parameters)
        elif isinstance(name, type):
            func = name(**parameters)
        else:
            func = name

        # apply selection: every key of the original data is wrapped in a
        # SelectAbstract that evaluates the selector on the original data
        orig_data = copy.deepcopy(self._data)
        self._data = DictSeqAbstract()
        for key in orig_data.keys():
            self[key] = SelectAbstract(orig_data[key], func, *arg, eval_data=orig_data, **kwargs)
    def set_xval(self, name, parameters=None, save_dir=None, overwrite=True):
        """Set the cross-validation folds

        This function sets the crossvalidation folds. This works similar as with self.add_select().
        You can either provide a name/parameters pair where name is a string that refers to a particular function
        available in either the built-in xval module OR os.environ["dabstract_CUSTOM_DIR"] / dataset / xval.py.
        Another option is to provide a class (called with ``parameters``) or a callable directly through 'name'.
        Finally, the folds can be pickled to ``save_dir`` so that a later call reuses exactly the same folds.

        The resulting callable is applied to the examples for which 'test_only' equals 0. Examples for which
        'test_only' equals 1 are appended to the test set of every fold.

        Example:
            $  self.set_xval('group_random_kfold', parameters=dict(folds=4, val_frac=1/3, group_key='group'))
            for random crossvalidation with a group constraint, and,
            $  self.set_xval('sequential_kfold', parameters=dict(folds=4, val_frac=1/3, group_key='group'))
            for sequential crossvalidation with a group constraint, and,
            $  self.set_xval('stratified_kfold', parameters=dict(folds=4, val_frac=1/3))
            for stratified crossvalidation

        Arguments:
            name (str/class/callable): xval defined as a str (looked up and called with ``parameters``),
                a class (called with ``parameters``) or a callable that is used as is
            parameters (dict): keyword arguments used when name is a str or class. ``None`` means no arguments.
            save_dir (str): directory in which the folds are pickled as 'xval.pickle'
            overwrite (bool): recompute and overwrite the saved file even if it exists
        Returns:
            dict with one list of folds per phase, plus a 'folds' entry holding the amount of folds
        Raises:
            TypeError: if name is neither a str, a class nor a callable
        """

        assert name is not None
        if parameters is None:
            parameters = {}

        # examples flagged test_only == 1 never take part in the fold generation
        test_only = np.array([k for k in self['test_only']])
        sel_vect_train = np.where(test_only == 0)[0]
        sel_vect_test = np.where(test_only == 1)[0]

        self_train = DataAbstract(SelectAbstract(self._data, sel_vect_train))

        # checks: only reuse a saved file when it exists and overwriting is off
        savefile_xval = None
        reuse_saved = False
        if save_dir is not None:
            savefile_xval = os.path.join(save_dir, 'xval.pickle')
            reuse_saved = os.path.isfile(savefile_xval) and not overwrite

        # get
        if not reuse_saved:
            # get xval class
            if isinstance(name, str):
                module = xval
                if not hasattr(module, name):
                    module = safe_import_module(os.environ['dabstract_CUSTOM_DIR'] + '.dataset.xval')
                    assert hasattr(module, name), \
                        "Xval " + name + " is not supported in both dabstract and custom xvals. Please check"
                func = getattr(module, name)(**parameters)
            elif isinstance(name, type):
                func = name(**parameters)
            elif callable(name):
                func = name
            else:
                # previously this fell through and failed later on an unbound variable
                raise TypeError("Xval should be a str, a class or a callable, not " + type(name).__name__)

            self.xval = func(self_train)
            assert 'test' in self.xval, "please return a dict with minimally a test key"

            if savefile_xval is not None:
                os.makedirs(os.path.split(savefile_xval)[0], exist_ok=True)
                with open(savefile_xval, 'wb') as f:
                    pickle.dump(self.xval, f)
        else:
            # NOTE: unpickling can execute arbitrary code, only use a save_dir you trust
            with open(savefile_xval, "rb") as f:
                self.xval = pickle.load(f)  # load

        # sanity check
        keys = list(self.xval.keys())
        for key in keys:
            assert isinstance(self.xval[key], list), 'Crossvalidation indices should be formatted in a list (for each fold).'
            assert len(self.xval[keys[0]]) == len(self.xval[key]), 'Amount of folds (items in list) should be the same for each test phase (train/val/test).'

        # add other test data
        # NOTE(review): the folds come from func(self_train) while sel_vect_test indexes the
        # full dataset; confirm that consumers expect this mix of index spaces.
        test_folds = self.xval['test']
        for k, fold in enumerate(test_folds):
            test_folds[k] = np.append(fold, sel_vect_test)

        # add info ('test' is the only key that is guaranteed to be present)
        self.xval['folds'] = len(test_folds)

        return self.xval
Exemple #5
0
def dataset_factory(name: Union[str, tvDataset, type] = None,
                    paths: Dict[str, str] = None,
                    xval: Optional[Dict[str, Union[str, int, Dict]]] = None,
                    split: Optional[Union[int, float, Dict[str, Union[str, int, Dict]]]] = None,
                    select: Optional[Dict[str, Union[str, int, Dict]]] = None,
                    test_only: Optional[bool] = 0,
                    **kwargs) -> tvDataset:
    """Dataset factory

    This function creates a dataset class from name and parameters.
    Specifically, this is used to search by name for that particular database class in
    - dabstract.dataset.dbs folder
    - environment variable folder: os.environ["dabstract_CUSTOM_DIR"] = your_dir

    If name is defined as a class object, then it uses this to init the dataset with the given kwargs.
    If name is already a Dataset instance it is used as is.
    This function is mostly used by dataset_from_config(). One is advised to directly
    import the desired dataset class instead of using dataset_factory. This is only
    handy for configuration based experiments, which need a load from string.
    For example::
        $  data = dataset_factory(name='DCASE2020Task1B',
        $                         paths={'data': path_to_data,
        $                                'meta': path_to_meta,
        $                                'feat': path_to_feat})

    One is advised to check the examples in dabstract/examples/introduction on
    how to work with datasets

    Parameters
    ----------
    select: selector configuration or list of selector configurations
        every entry is handed to db.add_select()
    split: int/float/Dict
        split configuration; a number is passed positionally to db.add_split(),
        a dict as keyword arguments
    xval : Dict[str,Union[str,int, Dict]]
        xval configuration, passed as keyword arguments to db.set_xval()
    test_only : bool
        use the dataset for test (test_only=1) or both train and test (test_only=0)
    name : str/instance/object
        name of the class (or the class directly, or a Dataset instance)
    paths : dict[str]
        dictionary containing paths to the data
    kwargs: ToDo, not defined as this should be used only by load_from_config()

    Returns
    -------
    dataset : Dataset class

    Raises
    ------
    ValueError
        if name is a class that can not be initialised as a dataset
    TypeError
        if name is neither a str, a Dataset instance nor a class
    NotImplementedError
        if split is neither a number nor a dict
    """
    from dabstract.dataset.dataset import Dataset

    # get dataset
    if isinstance(name, str):
        # get db class
        module = dbs
        if not hasattr(module, name):  # check customs
            module = safe_import_module(os.environ["dabstract_CUSTOM_DIR"] +
                                        ".dataset.dbs")
            assert hasattr(module, name), (
                "Database class is not supported in both dabstract.dataset.dbs "
                + os.environ["dabstract_CUSTOM_DIR"] +
                ".dataset.dbs. Please check")
        db = getattr(module, name)(paths=paths, test_only=test_only, **kwargs)
    elif isinstance(name, Dataset):
        db = name
    elif isinstance(name, type):
        try:
            db = name(paths=paths, test_only=test_only, **kwargs)
        except Exception as err:
            # keep the original failure attached instead of hiding it
            raise ValueError("Class is not a Dataset.") from err
    else:
        # without this branch ``db`` stays unbound and fails further below
        raise TypeError(
            "name should be a str, a Dataset instance or a class, not " +
            type(name).__name__)

    # add other functionality
    if split is not None:
        if isinstance(split, (int, float)):
            db.add_split(split)
        elif isinstance(split, dict):
            db.add_split(**split)
        else:
            raise NotImplementedError
    if select is not None:
        # a single selector configuration is treated as a list of one
        # (it used to be silently ignored)
        selectors = select if isinstance(select, list) else [select]
        for _select in selectors:
            db.add_select(_select)
    if xval is not None:
        db.set_xval(**xval)

    return db
def dataset_factory(name=None,
                    paths=None,
                    xval=None,
                    select=None,
                    test_only=0,
                    tmp_folder=None,
                    **kwargs):
    """Dataset factory

    This function creates a dataset class from name and parameters.
    Specifically, this is used to search by name for that particular database class in
    - dabstract.dataset.dbs folder
    - environment variable folder: os.environ["dabstract_CUSTOM_DIR"] = your_dir

    If name is defined as a class object, then it uses this to init the dataset with the given kwargs.
    If name is already a DictSeqAbstract instance it is returned unchanged.
    This function is mostly used by dataset_from_config(). One is advised to directly
    import the desired dataset class instead of using dataset_factory. This is only
    handy for configuration based experiments, which need a load from string.

    Example:
        $  data = dataset_factory(name='DCASE2020Task1B',
        $                         paths={'data': path_to_data,
        $                                'meta': path_to_meta,
        $                                'feat': path_to_feat})

        One is advised to check the examples in dabstract/examples/introduction on
        how to work with datasets

    Arguments:
        name (str/class/instance): name of the class (or the class directly)
        paths, xval, select, test_only, tmp_folder: passed on as keyword arguments
            to the dataset class
        kwargs: ToDo, not defined as this should be used only by load_from_config()
    Returns:
        dataset class
    Raises:
        TypeError: if name is neither a str, a class nor a DictSeqAbstract instance
    """
    init_kwargs = dict(paths=paths,
                       select=select,
                       test_only=test_only,
                       xval=xval,
                       tmp_folder=tmp_folder,
                       **kwargs)

    # get dataset
    if isinstance(name, str):
        # get db class
        module = dbs
        if not hasattr(module, name):  # check customs
            module = safe_import_module(os.environ['dabstract_CUSTOM_DIR'] +
                                        '.dataset.dbs')
            assert hasattr(
                module, name
            ), 'Database class is not supported in both dabstract.dataset.dbs ' + os.environ[
                'dabstract_CUSTOM_DIR'] + '.dataset.dbs. Please check'
        return getattr(module, name)(**init_kwargs)

    # types.ClassType only exists in Python 2; in Python 3 every class is an
    # instance of ``type``
    if isinstance(name, type):
        return name(**init_kwargs)

    # an already initialised dataset is handed back as is
    if isinstance(name, DictSeqAbstract):
        return name

    raise TypeError('name should be a str, a class or a dataset instance, not ' +
                    type(name).__name__)
Exemple #7
0
    def set_xval(
        self,
        name: Union[str, type, types.FunctionType],
        parameters: Union[Dict, None] = None,
        save_path: Union[str, None] = None,
        overwrite: bool = True,
    ) -> Dict:
        """Set the cross-validation folds

        This function sets the crossvalidation folds. This works similar as with self.add_select().
        You can either provide a name/parameters pair where name is a string that refers to a particular function available
        in either the built-in xval module OR os.environ["dabstract_CUSTOM_DIR"] / dataset / xval.py.
        Another option is to provide a class (called with ``parameters``) or a callable directly through 'name'.
        Finally, the folds can be pickled below ``save_path`` so that a later call reuses exactly the same folds.

        The resulting callable is applied to the examples for which 'test_only' equals 0 and the indices
        it returns are mapped back to positions in the full dataset. Examples for which 'test_only'
        equals 1 are appended to the test set of every fold.

        One can for example do::

            $  self.set_xval('group_random_kfold', parameters=dict(folds=4, val_frac=1/3, group_key='group'))

        for random crossvalidation with a group constraint, and::

            $  self.set_xval('sequential_kfold', parameters=dict(folds=4, val_frac=1/3, group_key='group'))

        for sequential crossvalidation with a group constraint, and::

            $  self.set_xval('stratified_kfold', parameters=dict(folds=4, val_frac=1/3))

        for stratified crossvalidation.

        Parameters
        ----------
        name : str/class/Callable
            xval defined as a str (looked up and called with ``parameters``), a class
            (called with ``parameters``) or a callable that is used as is
        parameters : dict
            keyword arguments used when name is a str or class. ``None`` means no arguments.
        save_path : str
            directory in which the xval folds are pickled as 'xval.pickle'
        overwrite : bool
            recompute and overwrite the saved file even if it exists

        Returns
        -------
        xval : dict
            one list of folds per phase, plus a 'folds' entry holding the amount of folds

        Raises
        ------
        TypeError
            if name is neither a str, a class nor a callable
        """

        assert name is not None
        if parameters is None:
            parameters = {}

        # examples flagged test_only == 1 never take part in the fold generation
        test_only = np.array([k for k in self["test_only"]])
        sel_vect_train = np.where(test_only == 0)[0]
        sel_vect_test = np.where(test_only == 1)[0]

        self_train = Select(self._data, sel_vect_train)

        # checks: only reuse a saved file when it exists and overwriting is off
        savefile_xval = None
        reuse_saved = False
        if save_path is not None:
            savefile_xval = os.path.join(save_path, "xval.pickle")
            reuse_saved = os.path.isfile(savefile_xval) and not overwrite

        # get
        if not reuse_saved:
            # get xval class
            if isinstance(name, str):
                module = xval
                if not hasattr(module, name):
                    module = safe_import_module(
                        os.environ["dabstract_CUSTOM_DIR"] + ".dataset.xval"
                    )
                    assert hasattr(module, name), (
                        "Xval "
                        + name
                        + " is not supported in both dabstract and custom xvals. Please check"
                    )
                func = getattr(module, name)(**parameters)
            elif isinstance(name, type):
                func = name(**parameters)
            elif callable(name):
                func = name
            else:
                # previously this fell through and failed later on an unbound variable
                raise TypeError(
                    "Xval should be a str, a class or a callable, not "
                    + type(name).__name__
                )

            self.xval = func(self_train)
            assert "test" in self.xval, "please return a dict with minimally a test key"

            if savefile_xval is not None:
                os.makedirs(os.path.split(savefile_xval)[0], exist_ok=True)
                with open(savefile_xval, "wb") as f:
                    pickle.dump(self.xval, f)
        else:
            # NOTE: unpickling can execute arbitrary code, only use a save_path you trust
            with open(savefile_xval, "rb") as f:
                self.xval = pickle.load(f)  # load

        # sanity check
        keys = list(self.xval.keys())
        for key in keys:
            assert isinstance(
                self.xval[key], list
            ), "Crossvalidation indices should be formatted in a list (for each fold)."
            assert len(self.xval[keys[0]]) == len(
                self.xval[key]
            ), "Amount of folds (items in list) should be the same for each test phase (train/val/test)."

        # update indices based on sel_vect_train: the folds index the
        # train-only selection, map them back to the full dataset
        for folds in self.xval.values():
            for k, fold in enumerate(folds):
                folds[k] = sel_vect_train[fold]

        # add other test data
        test_folds = self.xval["test"]
        for k, fold in enumerate(test_folds):
            test_folds[k] = np.append(fold, sel_vect_test)

        # add info
        self.xval["folds"] = len(test_folds)

        return self.xval
Exemple #8
0
    def add_select(
        self,
        selector: Any,
        *arg,
        parameters: Optional[dict] = None,
        eval_data: Any = None,
        **kwargs
    ) -> None:
        """Add a selection to the dataset

        This function adds a selector to the dataset. The input can either be a function that does the
        selection or a name/parameter pair that is used to search for that function in the built-in select module
        AND in os.environ["dabstract_CUSTOM_DIR"] + ".dataset.select". When defining custom selector functions, one can
        either provide the function directly OR place it in os.environ["dabstract_CUSTOM_DIR"] / dataset / select.py.

        Besides a function one can also directly provide indices. Anything that is neither a dict, a str
        nor a class is forwarded to ``self._data.add_select`` unchanged.

        One can look up a selector by name and initialise it with parameters::

            $  self.add_select('random_subsample', parameters=dict(ratio=0.5))
            $  self.add_select('subsample_by_str', parameters=dict(key=..., keep=...))

        or provide the same as a configuration dict::

            $  self.add_select({'name': 'random_subsample', 'parameters': {'ratio': 0.5}})

        One can also use a lambda function such as::

            $  self.add_select((lambda x,k: x['data']['subdb'][k]))

        Or directly use indices such as::

            $  indices = np.array([0, 1, 2, 3, 4])
            $  self.add_select(indices)

        Parameters
        ----------
        selector : Callable/str/dict/class/List[int]/np.ndarray
            selector defined as a str (looked up and called with ``parameters``), a class (called
            with ``parameters``), a dict with a 'name' and optionally a 'parameters' key, or a
            function or indices that are used as is
        parameters : dict
            keyword arguments used to init the selector when it is a str or class.
            ``None`` means no arguments.
        eval_data : Any
            data forwarded to ``self._data.add_select`` as ``eval_data``
        arg/kwargs:
            additional arguments forwarded to ``self._data.add_select``
        """

        # get selector
        if isinstance(selector, dict):
            if "parameters" in selector:
                parameters = selector["parameters"]
            assert "name" in selector
            selector = selector["name"]
        # the default used to be the ``dict`` type itself, which can not be
        # unpacked with ** below
        if parameters is None:
            parameters = {}

        if isinstance(selector, str):
            module = selectm
            if not hasattr(module, selector):
                module = safe_import_module(
                    os.environ["dabstract_CUSTOM_DIR"] + ".dataset.select"
                )
                # check for the requested selector name (not for the built-in module object)
                assert hasattr(module, selector), (
                    "Select "
                    + selector
                    + " is not supported in both dabstract and custom selects. Please check"
                )
            selector = getattr(module, selector)(**parameters)
        elif isinstance(selector, type):
            selector = selector(**parameters)

        # apply selection
        self._data.add_select(selector, *arg, eval_data=eval_data, **kwargs)