Example #1
    def __init__(self, threshold, n_jobs=1, max_delete=1):
        self.max_delete = max_delete
        self.to_delete = []
        self.threshold = threshold
        self.n_jobs = n_jobs
        self._type = "DataFrame"
        self.logger = get_logger(self)
Example #2
    def init_data(self, random_state, data_manager: DataManager,
                  metric: Scorer, should_calc_all_metric: bool, splitter,
                  should_store_intermediate_result: bool,
                  resource_manager: ResourceManager):
        self.random_state = random_state
        if hasattr(splitter, "random_state"):
            setattr(splitter, "random_state", self.random_state)
        self.splitter = splitter
        self.data_manager = data_manager
        self.X_train = self.data_manager.X_train
        self.y_train = self.data_manager.y_train
        self.X_test = self.data_manager.X_test
        self.y_test = self.data_manager.y_test
        self.should_store_intermediate_result = should_store_intermediate_result
        self.metric = metric
        self.ml_task: MLTask = self.data_manager.ml_task

        self.should_calc_all_metric = should_calc_all_metric

        if self.ml_task.mainTask == "regression":
            self.predict_function = self._predict_regression
        else:
            self.predict_function = self._predict_proba

        self.logger = get_logger(self)
        self.resource_manager = resource_manager
Example #3
    def __init__(self):
        self.resource_manager = None
        self.estimator = None
        self.in_feature_groups = None
        self.out_feature_groups = None
        self.hyperparams = {}
        self.logger = get_logger(self)
Example #4
    def __init__(
        self,
        X_train: Union[pd.DataFrame, GenericDataFrame, np.ndarray,
                       None] = None,
        y_train: Union[pd.Series, np.ndarray, str, None] = None,
        X_test: Union[pd.DataFrame, GenericDataFrame, np.ndarray, None] = None,
        y_test: Union[pd.Series, np.ndarray, str, None] = None,
        dataset_metadata: Dict[str, Any] = frozenset(),
        column_descriptions: Dict[str, Union[List[str], str]] = None,
        highR_nan_threshold: float = 0.5,
    ):
        '''

        Parameters
        ----------
        X_train: :class:`numpy.ndarray` or :class:`pandas.DataFrame`
        y_train: :class:`numpy.ndarray`
        X_test: :class:`numpy.ndarray` or :class:`pandas.DataFrame`
        y_test: :class:`numpy.ndarray`
        dataset_metadata: dict
        column_descriptions: dict
            ``column_descriptions`` is a dict whose keys are ``feature_group`` names and whose
            values are a column (column name) or columns (list of column names).
            A usage sketch follows this example.

            Some frequently-used built-in ``feature_group`` values are listed below:
                * ``id``       - id of this table.
                * ``ignore``   - columns that contain irrelevant information.
                * ``target``   - the column your model will learn to predict.
                * ``nan``      - Not a Number, a column that contains missing values.
                * ``num``      - numerical features, such as [1, 2, 3].
                * ``cat``      - categorical features, such as ["a", "b", "c"].
                * ``num_nan``  - numerical features containing missing values, such as [1, 2, NaN].
                * ``cat_nan``  - categorical features containing missing values, such as ["a", "b", NaN].
                * ``highR_nan``  - high NaN ratio. See :class:`autoflow.hdl.hdl_constructor.HDL_Constructor` for an explanation.
                * ``lowR_nan``   - low NaN ratio. See :class:`autoflow.hdl.hdl_constructor.HDL_Constructor` for an explanation.
                * ``highR_cat``  - high-cardinality-ratio categorical. See :class:`autoflow.hdl.hdl_constructor.HDL_Constructor` for an explanation.
                * ``lowR_cat``   - low-cardinality-ratio categorical. See :class:`autoflow.hdl.hdl_constructor.HDL_Constructor` for an explanation.

        highR_nan_threshold: float
            High NaN-ratio threshold. See :class:`autoflow.hdl.hdl_constructor.HDL_Constructor` for examples and practice.
        '''
        self.logger = get_logger(self)
        dataset_metadata = dict(dataset_metadata)
        self.highR_nan_threshold = highR_nan_threshold
        self.dataset_metadata = dataset_metadata
        X_train = deepcopy(X_train)
        y_train = deepcopy(y_train)
        X_test = deepcopy(X_test)
        y_test = deepcopy(y_test)
        X_train, y_train, X_test, y_test, feature_groups, column2feature_groups = self.parse_column_descriptions(
            column_descriptions, X_train, y_train, X_test, y_test)
        self.feature_groups = feature_groups
        self.column2feature_groups = column2feature_groups
        self.ml_task: MLTask = get_ml_task_from_y(y_train)
        self.X_train = GenericDataFrame(X_train, feature_groups=feature_groups)
        self.y_train = y_train
        self.X_test = GenericDataFrame(
            X_test,
            feature_groups=feature_groups) if X_test is not None else None
        self.y_test = y_test if y_test is not None else None

        # todo: a user-defined validation set could be specified via RandomShuffle or mlxtend
        # fixme: multilabel is not supported
        if len(y_train.shape) > 2:
            raise ValueError('y must not have more than two dimensions, '
                             'but has %d.' % len(y_train.shape))

        if X_train.shape[0] != y_train.shape[0]:
            raise ValueError('X and y must have the same number of '
                             'datapoints, but have %d and %d.' %
                             (X_train.shape[0], y_train.shape[0]))
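
A minimal usage sketch for the constructor above, mirroring the Titanic-style CSV and feature grouping used in Example #6. The concrete column names and the cat_nan/num_nan grouping are illustrative assumptions, not part of the DataManager API.

import pandas as pd
from autoflow.manager.data_manager import DataManager

df = pd.read_csv("../examples/classification/train_classification.csv")
y = df.pop("Survived").values                 # target column, as in Example #6
df = df.loc[:, ["Sex", "Cabin", "Age"]]

# keys are feature_group names, values are a column name or a list of column names
column_descriptions = {
    "cat_nan": ["Sex", "Cabin"],              # illustrative grouping, mirroring Example #6
    "num_nan": "Age",
}
data_manager = DataManager(
    X_train=df,
    y_train=y,
    column_descriptions=column_descriptions,
    highR_nan_threshold=0.5,
)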
Example #5
from copy import deepcopy
from typing import List, Union

import numpy as np
import pandas as pd
from pandas._typing import FrameOrSeries
from pandas.core.generic import bool_t

from autoflow.utils.logging import get_logger

logger = get_logger(__name__)


class GenericDataFrame(pd.DataFrame):
    def __init__(self, *args, **kwargs):
        # pop custom kwargs before delegating to the pandas DataFrame constructor
        feature_groups = kwargs.pop("feature_groups", None)
        columns_metadata = kwargs.pop("columns_metadata", None)
        super(GenericDataFrame, self).__init__(*args, **kwargs)
        if feature_groups is None:
            logger.debug(
                "feature_groups is None, set it all to 'cat' feature group.")
            feature_groups = ["cat"] * self.shape[1]
        assert (len(feature_groups) == self.shape[1])
        self.set_feature_groups(pd.Series(feature_groups))
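
A short construction sketch for the class above, mirroring how Example #6 builds a GenericDataFrame. The column values are illustrative, and the module path of GenericDataFrame is not shown in these snippets, so it is assumed to be importable.

import pandas as pd

# GenericDataFrame is the pd.DataFrame subclass defined above; import it from its
# defining module in autoflow (the path is not shown in these examples).
df = pd.DataFrame({"Sex": ["male", "female"], "Age": [22.0, None]})
gdf = GenericDataFrame(df, feature_groups=["cat", "num_nan"])   # one group per column
# Omitting feature_groups falls back to tagging every column as "cat",
# as the debug branch in __init__ shows.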
Example #6
    def test_pipeline(self):
        self.logger = get_logger(self)
        df = pd.read_csv("../examples/classification/train_classification.csv")
        y = df.pop("Survived").values
        df = df.loc[:, ["Sex", "Cabin", "Age"]]
        feature_groups = ["cat_nan", "cat_nan", "num_nan"]
        df_train, df_test, y_train, y_test = train_test_split(df,
                                                              y,
                                                              test_size=0.2,
                                                              random_state=10)
        df_train = GenericDataFrame(df_train, feature_groups=feature_groups)
        df_test = GenericDataFrame(df_test, feature_groups=feature_groups)
        cv = KFold(n_splits=5, random_state=10, shuffle=True)
        train_ix, valid_ix = next(cv.split(df_train))

        df_train, df_valid = df_train.split([train_ix, valid_ix])
        y_valid = y_train[valid_ix]
        y_train = y_train[train_ix]

        fill_cat = FillCat()
        fill_cat.in_feature_groups = "cat_nan"
        fill_cat.out_feature_groups = "cat"
        fill_cat.update_hyperparams({"strategy": "<NULL>"})

        fill_num = FillNum()
        fill_num.in_feature_groups = "num_nan"
        fill_num.out_feature_groups = "num"
        fill_num.update_hyperparams({"strategy": "median"})

        ohe = OneHotEncoder()
        ohe.in_feature_groups = "cat"
        ohe.out_feature_groups = "num"

        sgd = SGD()
        sgd.in_feature_groups = "num"
        sgd.update_hyperparams({"loss": "log", "random_state": 10})

        pipeline = GenericPipeline([
            ("fill_cat", fill_cat),
            ("fill_num", fill_num),
            ("ohe", ohe),
            ("sgd", sgd),
        ])

        pipeline.fit(df_train, y_train, df_valid, y_valid, df_test, y_test)
        pred_train = pipeline.predict(df_train)
        pred_test = pipeline.predict(df_test)
        pred_valid = pipeline.predict(df_valid)
        score_valid = pipeline.predict_proba(df_valid)
        self.logger.info(accuracy_score(y_train, pred_train))
        self.logger.info(accuracy_score(y_valid, pred_valid))
        self.logger.info(accuracy_score(y_test, pred_test))
        result = pipeline.procedure(constants.binary_classification_task,
                                    df_train, y_train, df_valid, y_valid,
                                    df_test, y_test)
        pred_test = result["pred_test"]
        pred_valid = result["pred_valid"]
        self.logger.info(
            accuracy_score(y_valid, (pred_valid > .5).astype("int")[:, 1]))
        self.logger.info(
            accuracy_score(y_test, (pred_test > .5).astype("int")[:, 1]))

        pipeline = GenericPipeline([
            ("fill_cat", fill_cat),
            ("fill_num", fill_num),
            ("ohe", ohe),
        ])

        pipeline.fit(df_train, y_train, df_valid, y_valid, df_test, y_test)
        ret1 = pipeline.transform(df_train, df_valid, df_test)
        ret2 = pipeline.fit_transform(df_train, y_train, df_valid, y_valid,
                                      df_test, y_test)
        for key in ["X_train", "X_valid", "X_test"]:
            assert np.all(ret1[key] == ret2[key])

        pipeline = GenericPipeline([
            ("sgd", sgd),
        ])

        result = pipeline.procedure(constants.binary_classification_task,
                                    ret1["X_train"], y_train, ret1["X_valid"],
                                    y_valid, ret1["X_test"], y_test)
        pred_test = result["pred_test"]
        pred_valid = result["pred_valid"]
        self.logger.info(
            accuracy_score(y_valid, (pred_valid > .5).astype("int")[:, 1]))
        self.logger.info(
            accuracy_score(y_test, (pred_test > .5).astype("int")[:, 1]))
Example #7
    def __init__(self,
                 store_path="~/autoflow",
                 file_system="local",
                 file_system_params=frozendict(),
                 db_type="sqlite",
                 db_params=frozendict(),
                 redis_params=frozendict(),
                 max_persistent_estimators=50,
                 persistent_mode="fs",
                 compress_suffix="bz2"):
        '''

        Parameters
        ----------
        store_path: str
            A path to store files that belong to AutoFlow, such as metadata, model files, and database files.
        file_system: str
            Indicator string for which file system or storage system will be used.

            Available options are listed below:
                * ``local``
                * ``hdfs``
                * ``s3``

            ``local`` is the default value.
        file_system_params: dict
            Specific file_system configuration.
        db_type: str
            Indicator string for which database will be used.

            Available options are listed below:
                * ``sqlite``
                * ``postgresql``
                * ``mysql``

            ``sqlite`` is the default value.
        db_params: dict
            Specific database configuration.
        redis_params: dict
            Redis configuration.
        max_persistent_estimators: int
            Maximum number of models that can persist in a single task.

            If this number is exceeded, the worst-performing model file will be deleted,
            and the corresponding database record will also be deleted.
        persistent_mode: str
            Indicator string for which persistence mode will be used.

            Available options are listed below:
                * ``db`` - serialize the entity to bytes and store it directly in the database.
                * ``fs`` - serialize the entity to bytes, form a pickle file, and upload it to the storage system or save it locally.
        compress_suffix: str
            Suffix of the compressed file; the default is ``bz2``.
        '''
        # --logger-------------------
        self.logger = get_logger(self)
        # --preprocessing------------
        file_system_params = dict(file_system_params)
        db_params = dict(db_params)
        redis_params = dict(redis_params)
        # ---file_system------------
        directory = os.path.split(generic_fs.__file__)[0]
        file_system2cls = find_components(generic_fs.__package__, directory,
                                          FileSystem)
        self.file_system_type = file_system
        if file_system not in file_system2cls:
            raise Exception(f"Invalid file_system {file_system}")
        self.file_system: FileSystem = file_system2cls[file_system](
            **file_system_params)
        if self.file_system_type == "local":
            store_path = os.path.expandvars(os.path.expanduser(store_path))
        self.store_path = store_path
        # ---data_base------------
        assert db_type in ("sqlite", "postgresql", "mysql")
        self.db_type = db_type
        self.db_params = dict(db_params)
        if db_type == "sqlite":
            assert self.file_system_type == "local"
        # ---redis----------------
        self.redis_params = redis_params
        # ---max_persistent_model---
        self.max_persistent_estimators = max_persistent_estimators
        # ---persistent_mode-------
        self.persistent_mode = persistent_mode
        assert self.persistent_mode in ("fs", "db")
        # ---compress_suffix------------
        self.compress_suffix = compress_suffix
        # ---post_process------------
        self.store_path = store_path
        self.file_system.mkdir(self.store_path)
        self.is_init_experiments_db = False
        self.is_init_tasks_db = False
        self.is_init_hdls_db = False
        self.is_init_trials_db = False
        self.is_init_redis = False
        self.is_master = False
        # --some specific path based on file_system---
        self.datasets_dir = self.file_system.join(self.store_path, "datasets")
        self.databases_dir = self.file_system.join(self.store_path,
                                                   "databases")
        self.parent_trials_dir = self.file_system.join(self.store_path,
                                                       "trials")
        self.parent_experiments_dir = self.file_system.join(
            self.store_path, "experiments")
        for dir_path in [
                self.datasets_dir, self.databases_dir,
                self.parent_experiments_dir, self.parent_trials_dir
        ]:
            self.file_system.mkdir(dir_path)
        # --db-----------------------------------------
        self.Datebase = get_db_class_by_db_type(self.db_type)
        # --JSONField-----------------------------------------
        if self.db_type == "sqlite":
            from playhouse.sqlite_ext import JSONField
            self.JSONField = JSONField
        elif self.db_type == "postgresql":
            from playhouse.postgres_ext import JSONField
            self.JSONField = JSONField
        elif self.db_type == "mysql":
            from playhouse.mysql_ext import JSONField
            self.JSONField = JSONField
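
A minimal construction sketch for the manager above, using the local file system and SQLite; the argument values simply restate the documented defaults and are not a recommendation.

from autoflow.manager.resource_manager import ResourceManager

rm = ResourceManager(
    store_path="~/autoflow",          # expanded to the user's home dir because file_system is "local"
    file_system="local",
    db_type="sqlite",                 # sqlite requires the local file system (asserted in __init__)
    max_persistent_estimators=50,     # worst-performing models beyond this count are deleted
    persistent_mode="fs",             # "fs": pickle files on the file system; "db": bytes in the database
    compress_suffix="bz2",
)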
Example #8
    def __init__(self):
        self.ml_task = None
        self.logger = get_logger(__name__)
Example #9
    def __init__(self,
                 tuner: Union[Tuner, List[Tuner], None, dict] = None,
                 hdl_constructor: Union[HDL_Constructor, List[HDL_Constructor],
                                        None, dict] = None,
                 resource_manager: Union[ResourceManager, str] = None,
                 random_state=42,
                 log_file: str = None,
                 log_config: Optional[dict] = None,
                 highR_nan_threshold=0.5,
                 highR_cat_threshold=0.5,
                 **kwargs):
        '''
        Parameters
        ----------
        tuner: :class:`autoflow.tuner.tuner.Tuner` or None
            ``Tuner`` is a class that acts as an agent for an abstract search process.

        hdl_constructor: :class:`autoflow.hdl.hdl_constructor.HDL_Constructor` or None
            ``HDL`` is an abbreviation of Hyper-parameter Descriptions Language.

            It describes an abstract hyper-parameter space that is independent of any concrete implementation.

            ``HDL_Constructor`` is the class responsible for translating a dict-type ``DAG-workflow`` into ``HDL``.

        resource_manager: :class:`autoflow.manager.resource_manager.ResourceManager` or None
            ``ResourceManager`` is the class that manages computer resources, such as the ``file_system`` and the ``data_base``.

        random_state: int
            random state

        log_file: path
            File in which to store the log; if None, ``autoflow.log`` will be used.

        log_config: dict
            logging configuration

        highR_nan_threshold: float
            High NaN-ratio threshold. See :class:`autoflow.hdl.hdl_constructor.HDL_Constructor` for examples and practice.

        highR_cat_threshold: float
            High cardinality-ratio threshold for categorical features. See :class:`autoflow.hdl.hdl_constructor.HDL_Constructor` for examples and practice.

        kwargs: dict
            If parameters like ``tuner``, ``hdl_constructor`` or ``resource_manager`` are passed as None,
            you can pass kwargs to configure them implicitly. See the following example.

        Examples
        ---------
        This example shows a trick for seeding kwargs parameters without explicitly instantiating
        :class:`autoflow.hdl.hdl_constructor.HDL_Constructor` or other classes.

        In the following example, the user passes ``DAG_workflow`` and ``hdl_bank`` as keyword arguments,
        and ``hdl_constructor`` is instantiated from kwargs implicitly.

        >>> from autoflow import AutoFlowClassifier
        >>> classifier = AutoFlowClassifier(DAG_workflow={"num->target":["lightgbm"]},
        ...   hdl_bank={"classification":{"lightgbm":{"boosting_type":  {"_type": "choice", "_value":["gbdt","dart","goss"]}}}})
        AutoFlowClassifier(hdl_constructor=HDL_Constructor(
            DAG_workflow={'num->target': ['lightgbm']}
            hdl_bank_path=None
            hdl_bank={'classification': {'lightgbm': {'boosting_type': {'_type': 'choice', '_value': ['gbdt', 'dart', 'goss']}}}}
            included_classifiers=('adaboost', 'catboost', 'decision_tree', 'extra_trees', 'gaussian_nb', 'k_nearest_neighbors', 'liblinear_svc', 'lib...
        '''
        self.log_config = log_config
        self.highR_nan_threshold = highR_nan_threshold
        self.highR_cat_threshold = highR_cat_threshold

        # ---logger------------------------------------
        self.log_file = log_file
        setup_logger(self.log_file, self.log_config)
        self.logger = get_logger(self)
        # ---random_state-----------------------------------
        self.random_state = random_state
        # ---tuner-----------------------------------
        tuner = instancing(tuner, Tuner, kwargs)
        # ---tuners-----------------------------------
        self.tuners = sequencing(tuner, Tuner)
        self.tuner = self.tuners[0]
        # ---hdl_constructor--------------------------
        hdl_constructor = instancing(hdl_constructor, HDL_Constructor, kwargs)
        # ---hdl_constructors-------------------------
        self.hdl_constructors = sequencing(hdl_constructor, HDL_Constructor)
        self.hdl_constructor = self.hdl_constructors[0]
        # ---resource_manager-----------------------------------
        self.resource_manager = instancing(resource_manager, ResourceManager,
                                           kwargs)
        # ---member_variable------------------------------------
        self.estimator = None
        self.ensemble_estimator = None
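
Besides the implicit-kwargs trick shown in the docstring, the same constructor can be given explicitly instanced components; a sketch, assuming the import paths named in the parameter docs above.

from autoflow import AutoFlowClassifier
from autoflow.tuner.tuner import Tuner
from autoflow.hdl.hdl_constructor import HDL_Constructor
from autoflow.manager.resource_manager import ResourceManager

# Explicit instances instead of letting instancing() build them from kwargs.
classifier = AutoFlowClassifier(
    tuner=Tuner(search_method="smac", run_limit=100, initial_runs=20),
    hdl_constructor=HDL_Constructor(DAG_workflow={"num->target": ["lightgbm"]}),
    resource_manager=ResourceManager(store_path="~/autoflow"),
    random_state=42,
)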
Example #10
    def __init__(
            self,
            evaluator: Union[Callable, str] = "TrainEvaluator",
            search_method: str = "smac",
            run_limit: int = 100,
            initial_runs: int = 20,
            search_method_params: dict = frozendict(),
            n_jobs: int = 1,
            exit_processes: Optional[int] = None,
            limit_resource: bool = True,
            per_run_time_limit: float = 60,
            per_run_memory_limit: float = 3072,
            time_left_for_this_task: float = None,
            debug=False
    ):
        '''

        Parameters
        ----------
        evaluator: callable, str
            ``evaluator`` is a function, a callable class (implementing the magic method ``__call__``), or a string indicator.

            ``evaluator`` receives an shp (SMAC Hyper Param, :class:`ConfigSpace.ConfigurationSpace`)

            and returns a dict which contains the following keys:

                * ``loss``, which you can think of as a negative reward.
                * ``status``, a string: ``SUCCESS`` means fine, ``FAILED`` means crashed.

            A sketch of such a custom evaluator follows this example.

            By default, "TrainEvaluator" is the string indicator of :class:`autoflow.evaluation.train_evaluator.TrainEvaluator` .

        search_method: str
            Specific search method; ``random``, ``smac`` and ``grid`` are available.

                * ``random`` Random Search algorithm,
                * ``grid``   Grid   Search algorithm,
                * ``smac``   Bayesian search via the SMAC algorithm.

        run_limit: int
            Limit on the number of search steps.

        initial_runs: int
            If you choose the ``smac`` algorithm,

            keep in mind that SMAC has an initialization procedure:

            the algorithm needs enough initial runs to gather enough experience.

            This param will be ignored if ``random`` or ``grid`` is selected.

        search_method_params: dict
            Configuration for the specific search method.

        n_jobs: int
            Number of search processes to start.

        exit_processes: int
        limit_resource: bool
            If ``limit_resource = True``, a search trial will be killed if it uses too much CPU time or memory.

        per_run_time_limit: float
            Active only if ``limit_resource = True``.

            A search trial will be killed if it uses more CPU time than ``per_run_time_limit``.

        per_run_memory_limit: float
            Active only if ``limit_resource = True``.

            A search trial will be killed if it uses more memory than ``per_run_memory_limit``.

        time_left_for_this_task: float
            Active only if ``limit_resource = True``.

            The search task will be killed if its total run time exceeds ``time_left_for_this_task``.

        debug: bool
            Debug mode.

            Exceptions will be re-raised if ``debug = True``.
        '''
        self.debug = debug
        self.per_run_memory_limit = per_run_memory_limit
        self.time_left_for_this_task = time_left_for_this_task
        self.per_run_time_limit = per_run_time_limit
        self.limit_resource = limit_resource
        self.logger = get_logger(self)
        if self.debug and self.limit_resource:
            self.logger.warning(
                "Tuner.debug and Tuner.limit_resource cannot be both True. set Tuner.limit_resource to False.")
            self.limit_resource = False
        search_method_params = dict(search_method_params)
        if isinstance(evaluator, str):
            if evaluator == "TrainEvaluator":
                evaluator = TrainEvaluator
            elif evaluator == "EnsembleEvaluator":
                evaluator = EnsembleEvaluator
            else:
                raise NotImplementedError
        assert callable(evaluator)
        self.evaluator_prototype = evaluator
        if inspect.isfunction(evaluator):
            self.evaluator = evaluator
        else:
            self.evaluator = evaluator()
        self.evaluator.debug = self.debug
        self.search_method_params = search_method_params
        assert search_method in ("smac", "grid", "random")
        if search_method in ("grid", "random"):
            initial_runs = 0
        self.initial_runs = initial_runs
        self.run_limit = run_limit
        self.search_method = search_method
        self.random_state = 0
        self.addition_info = {}
        self.resource_manager = None
        self.ml_task = None
        self.data_manager = None
        self.n_jobs = parse_n_jobs(n_jobs)
        if exit_processes is None:
            exit_processes = max(self.n_jobs // 3, 1)
        self.exit_processes = exit_processes
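
A hypothetical custom evaluator following the protocol the docstring describes (receive an shp, return a dict with ``loss`` and ``status``); ``train_and_score`` is a placeholder for the user's own logic, not an autoflow helper.

def my_evaluator(shp):
    # shp: a configuration sampled from a ConfigSpace.ConfigurationSpace
    try:
        loss = train_and_score(shp)              # placeholder training/scoring routine
        return {"loss": loss, "status": "SUCCESS"}
    except Exception:
        return {"loss": float("inf"), "status": "FAILED"}

tuner = Tuner(evaluator=my_evaluator, search_method="random", run_limit=50)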
Example #11
    def __init__(
        self,
        DAG_workflow: Union[str, Dict[str, Any]] = "generic_recommend",
        hdl_bank_path=None,
        hdl_bank=None,
        included_classifiers=("adaboost", "catboost", "decision_tree",
                              "extra_trees", "gaussian_nb",
                              "k_nearest_neighbors", "liblinear_svc",
                              "libsvm_svc", "lightgbm", "logistic_regression",
                              "random_forest", "sgd"),
        included_regressors=("adaboost", "bayesian_ridge", "catboost",
                             "decision_tree", "elasticnet", "extra_trees",
                             "gaussian_process", "k_nearest_neighbors",
                             "kernel_ridge", "liblinear_svr", "lightgbm",
                             "random_forest", "sgd"),
        included_highR_nan_imputers=("operate.drop", {
            "_name": "operate.merge",
            "__rely_model": "boost_model"
        }),
        included_cat_nan_imputers=("impute.fill_cat", {
            "_name": "impute.fill_abnormal",
            "__rely_model": "boost_model"
        }),
        included_num_nan_imputers=("impute.fill_num", {
            "_name": "impute.fill_abnormal",
            "__rely_model": "boost_model"
        }),
        included_highR_cat_encoders=("operate.drop", "encode.label",
                                     "encode.cat_boost"),
        included_lowR_cat_encoders=("encode.one_hot", "encode.label",
                                    "encode.cat_boost"),
    ):
        '''

        Parameters
        ----------
        DAG_workflow: str or dict, default="generic_recommend"

            Directed acyclic graph (DAG) workflow describing the machine-learning procedure.

            By default, this value is "generic_recommend", which means HDL_Constructor will analyze the training data
            to recommend a valid DAG workflow.

            If you want to design the DAG workflow yourself, you can pass a dict (a sketch follows this example).

        hdl_bank_path: str, default=None

            ``hdl_bank`` is a JSON file which contains all the hyper-parameters of the algorithms.

            ``hdl_bank_path`` is this file's path. If it is None, ``autoflow/hdl/hdl_bank.json`` will be chosen.

        hdl_bank: dict, default=None

            If you pass ``hdl_bank_path=None`` and pass ``hdl_bank`` as a dict,
            the program will not load ``hdl_bank.json``; it uses the passed ``hdl_bank`` directly.

        included_classifiers: list or tuple

            Active if ``DAG_workflow="generic_recommend"``; all of the following params are active only in that situation.

            It decides which **classifiers** will be considered during algorithm selection.

        included_regressors: list or tuple

            It decides which **regressors** will be considered during algorithm selection.

        included_highR_nan_imputers: list or tuple

            ``highR_nan`` is a feature_group meaning ``NaN`` has a high ratio in a column.

            for example:

            >>> from numpy import NaN
            >>> column = [1, 2, NaN, NaN, NaN]    # nan ratio is 60% , more than 50% (default highR_nan_threshold)

            ``highR_nan_imputers`` algorithms handle such columns that contain a high ratio of missing values.

        included_cat_nan_imputers: list or tuple

            ``cat_nan`` is a feature_group meaning a categorical feature column contains ``NaN`` values.

            for example:

            >>> column = ["a", "b", "c", "d", NaN]

            ``cat_nan_imputers`` algorithms handle such columns.

        included_num_nan_imputers: list or tuple

            ``num_nan`` is a feature_group meaning a numerical feature column contains ``NaN`` values.

            for example:

            >>> column = [1, 2, 3, 4, NaN]

            ``num_nan_imputers`` algorithms handle such columns.

        included_highR_cat_encoders: list or tuple

            ``highR_cat`` is a feature_group meaning a categorical feature column has a high cardinality ratio.

            for example:

            >>> import numpy as np
            >>> column = ["a", "b", "c", "d", "a"]
            >>> rows = len(column)
            >>> np.unique(column).size / rows  # result is 0.8 , higher than 0.5 (default highR_cat_ratio)
            0.8

            ``highR_cat_encoders`` algorithms handle such columns.

        included_lowR_cat_encoders: list or tuple

            ``lowR_cat`` is a feature_group meaning a categorical feature column has a low cardinality ratio.

            for example:

            >>> import numpy as np
            >>> column = ["a", "a", "a", "d", "a"]
            >>> rows = len(column)
            >>> np.unique(column).size / rows  # result is 0.4 , lower than 0.5 (default lowR_cat_ratio)
            0.4

            ``lowR_cat_encoders`` algorithms handle such columns.

        Attributes
        ----------
        random_state: int

        ml_task: :class:`autoflow.utils.ml_task.MLTask`

        data_manager: :class:`autoflow.manager.data_manager.DataManager`

        hdl: dict
            construct by :meth:`run`

        Examples
        ----------
        >>> import numpy as np
        >>> from autoflow.manager.data_manager import DataManager
        >>> from autoflow.hdl.hdl_constructor import  HDL_Constructor
        >>> hdl_constructor = HDL_Constructor(DAG_workflow={"num->target":["lightgbm"]},
        ...   hdl_bank={"classification":{"lightgbm":{"boosting_type":  {"_type": "choice", "_value":["gbdt","dart","goss"]}}}})
        >>> data_manager = DataManager(X_train=np.random.rand(3,3), y_train=np.arange(3))
        >>> hdl_constructor.run(data_manager, 42, 0.5)
        >>> hdl_constructor.hdl
        {'preprocessing': {}, 'estimating(choice)': {'lightgbm': {'boosting_type': {'_type': 'choice', '_value': ['gbdt', 'dart', 'goss']}}}}

        '''

        self.included_lowR_cat_encoders = included_lowR_cat_encoders
        self.included_highR_cat_encoders = included_highR_cat_encoders
        self.included_num_nan_imputers = included_num_nan_imputers
        self.included_cat_nan_imputers = included_cat_nan_imputers
        self.included_highR_nan_imputers = included_highR_nan_imputers
        self.included_regressors = included_regressors
        self.included_classifiers = included_classifiers
        self.logger = get_logger(self)
        self.hdl_bank_path = hdl_bank_path
        self.DAG_workflow = DAG_workflow
        if hdl_bank is None:
            if hdl_bank_path:
                hdl_bank = get_hdl_bank(hdl_bank_path)
            else:
                hdl_bank = get_default_hdl_bank()
        if hdl_bank is None:
            hdl_bank = {}
            self.logger.warning("No hdl_bank, will use DAG_descriptions only.")
        self.hdl_bank = hdl_bank
        self.random_state = 42
        self.ml_task = None
        self.data_manager = None
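
As noted in the ``DAG_workflow`` docs above, the workflow can be passed as a dict instead of ``"generic_recommend"``; a sketch extrapolated from the ``"num->target"`` example in the docstrings. The component names come from the defaults above, but the edges other than ``"num->target"`` are assumptions about the key grammar.

from autoflow.hdl.hdl_constructor import HDL_Constructor

# Keys describe "source_feature_group -> target_feature_group" edges of the DAG;
# values list the candidate algorithms for that edge.
hdl_constructor = HDL_Constructor(DAG_workflow={
    "num_nan->num": ["impute.fill_num"],
    "cat_nan->cat": ["impute.fill_cat"],
    "cat->num": ["encode.one_hot"],
    "num->target": ["lightgbm", "random_forest"],
})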