Code Example #1
File: base.py Project: TomSirLiu/autoflow
    def __init__(self,
                 tuner: Union[Tuner, List[Tuner], None, dict] = None,
                 hdl_constructor: Union[HDL_Constructor, List[HDL_Constructor],
                                        None, dict] = None,
                 resource_manager: Union[ResourceManager, str] = None,
                 random_state=42,
                 log_file: str = None,
                 log_config: Optional[dict] = None,
                 highR_nan_threshold=0.5,
                 highR_cat_threshold=0.5,
                 should_store_intermediate_result=False,
                 should_finally_fit=False,
                 should_calc_all_metrics=True,
                 **kwargs):
        '''
        Parameters
        ----------
        tuner: :class:`autoflow.tuner.tuner.Tuner` or None
            ``Tuner`` is the class that drives the abstract search process.

        hdl_constructor: :class:`autoflow.hdl.hdl_constructor.HDL_Constructor` or None
            ``HDL`` is an abbreviation of Hyper-parameter Descriptions Language.

            It describes an abstract hyper-parameter space that is independent of any concrete implementation.

            ``HDL_Constructor`` is the class responsible for translating a dict-type ``DAG-workflow`` into ``HDL``.

        resource_manager: :class:`autoflow.manager.resource_manager.ResourceManager` or None
            ``ResourceManager`` is a class that manages computing resources such as the ``file_system`` and the ``data_base``.

        random_state: int
            seed of the random number generator

        log_file: path
            the file in which to store the log; if None, ``autoflow.log`` will be used.

        log_config: dict
            logging configuration

        highR_nan_threshold: float
            threshold for the ratio of NaN values in a column; you can find examples and usage in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`

        highR_cat_threshold: float
            cardinality threshold for high-ratio categorical features; you can find examples and usage in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`

        kwargs
            If parameters such as ``tuner``, ``hdl_constructor``, or ``resource_manager`` are passed as None,

            you can pass keyword arguments to have them constructed implicitly. See the following example.

        Examples
        ---------
        This example shows a trick for seeding parameters through kwargs without explicitly initializing
        :class:`autoflow.hdl.hdl_constructor.HDL_Constructor` or the other classes.

        In the following example, the user passes ``DAG_workflow`` and ``hdl_bank`` as keyword arguments,
        and ``hdl_constructor`` is instantiated from those kwargs implicitly.

        >>> from autoflow import AutoFlowClassifier
        >>> classifier = AutoFlowClassifier(DAG_workflow={"num->target":["lightgbm"]},
        ...   hdl_bank={"classification":{"lightgbm":{"boosting_type":  {"_type": "choice", "_value":["gbdt","dart","goss"]}}}})
        AutoFlowClassifier(hdl_constructor=HDL_Constructor(
            DAG_workflow={'num->target': ['lightgbm']}
            hdl_bank_path=None
            hdl_bank={'classification': {'lightgbm': {'boosting_type': {'_type': 'choice', '_value': ['gbdt', 'dart', 'goss']}}}}
            included_classifiers=('adaboost', 'catboost', 'decision_tree', 'extra_trees', 'gaussian_nb', 'k_nearest_neighbors', 'liblinear_svc', 'lib...
        '''
        self.should_finally_fit = should_finally_fit
        self.should_store_intermediate_result = should_store_intermediate_result
        self.should_calc_all_metrics = should_calc_all_metrics
        self.log_config = log_config
        self.highR_nan_threshold = highR_nan_threshold
        self.highR_cat_threshold = highR_cat_threshold

        # ---logger------------------------------------
        self.log_file = log_file
        setup_logger(self.log_file, self.log_config)
        self.logger = get_logger(self)
        # ---random_state-----------------------------------
        self.random_state = random_state
        # ---tuner-----------------------------------
        tuner = instancing(tuner, Tuner, kwargs)
        # ---tuners-----------------------------------
        self.tuners = sequencing(tuner, Tuner)
        self.tuner = self.tuners[0]
        # ---hdl_constructor--------------------------
        hdl_constructor = instancing(hdl_constructor, HDL_Constructor, kwargs)
        # ---hdl_constructors-------------------------
        self.hdl_constructors = sequencing(hdl_constructor, HDL_Constructor)
        self.hdl_constructor = self.hdl_constructors[0]
        # ---resource_manager-----------------------------------
        self.resource_manager = instancing(resource_manager, ResourceManager,
                                           kwargs)
        # ---member_variable------------------------------------
        self.estimator = None
        self.ensemble_estimator = None
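Since ``tuner`` accepts a single ``Tuner``, a list of tuners, a dict, or None, ``instancing`` and ``sequencing`` normalize it so that ``self.tuners`` always holds a list and ``self.tuner`` is its first element. A minimal usage sketch of that behavior (assuming ``Tuner`` is constructible with default arguments; the values below are hypothetical, not taken from the original example):

from autoflow import AutoFlowClassifier
from autoflow.tuner.tuner import Tuner

# Passing a list of tuners: `sequencing` keeps them all in `classifier.tuners`,
# and `classifier.tuner` points at the first element of that list.
classifier = AutoFlowClassifier(tuner=[Tuner(), Tuner()])
assert classifier.tuner is classifier.tuners[0]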
Code Example #2
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : qichun tang
# @Contact    : [email protected]
import json
import random
from pathlib import Path

from ConfigSpace import Configuration
from joblib import load

from autoflow.opt.config_generators.bocg import BayesianOptimizationConfigGenerator
from autoflow.opt.structure import Job
from autoflow.utils.logging_ import setup_logger

setup_logger()
config_space = load("config_space.bz2")
trial_records = json.loads(Path("trial.json").read_text())
bocg = BayesianOptimizationConfigGenerator(config_space, [0, 1 / 16],
                                           min_points_in_model=20,
                                           loss_transformer="log_scaled",
                                           config_transformer_params={
                                               "impute": -1,
                                               "ohe": False
                                           })
random.seed(10)
random.shuffle(trial_records)
# warm_start
# budget=0
configs = []
losses = []
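The script is truncated at this point. A plausible continuation of the warm-start loop, assuming each record in ``trial.json`` carries a ``config`` dict and a ``loss`` value (these field names are an assumption, not taken from the original):

# Hypothetical continuation: rebuild ConfigSpace Configuration objects and
# losses from the shuffled trial records collected in a previous run.
for record in trial_records:
    configs.append(Configuration(config_space, values=record["config"]))
    losses.append(record["loss"])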
Code Example #3
File: base.py Project: Kyrie-Long007/auto-flow
    def fit(
        self,
        X_train: Union[np.ndarray, pd.DataFrame, DataFrameContainer, str],
        y_train=None,
        X_test: Union[np.ndarray, pd.DataFrame, DataFrameContainer,
                      str] = None,
        y_test=None,
        groups=None,
        upload_type="fs",
        sub_sample_indexes=None,
        sub_feature_indexes=None,
        column_descriptions: Optional[Dict] = frozendict(),
        metric=None,
        splitter=None,
        specific_task_token="",
        dataset_metadata: dict = frozendict(),
        task_metadata: dict = frozendict(),
        additional_info: dict = frozendict(),
        fit_ensemble_params: Union[str, Dict[str, Any], None, bool] = "auto",
        is_not_realy_run=False,
    ):
        '''

        Parameters
        ----------
        X_train: :class:`numpy.ndarray` or :class:`pandas.DataFrame`
        y_train: :class:`numpy.ndarray` or :class:`pandas.Series` or str
        X_test: :class:`numpy.ndarray` or :class:`pandas.DataFrame` or None
        y_test: :class:`numpy.ndarray` or :class:`pandas.Series` or str
        column_descriptions: dict
            Description of each column's ``feature_group``; you can find the full definition in :class:`autoflow.manager.data_manager.DataManager`.
        dataset_metadata: dict
            Dataset's metadata
        metric: :class:`autoflow.metrics.Scorer` or None
            If ``metric`` is None:

            for a classification task, :obj:`autoflow.metrics.accuracy` will be used by default;

            for a regression task, :obj:`autoflow.metrics.r2` will be used by default.
        should_calc_all_metrics: bool
            If ``True``, all the metrics supported by the current task will be calculated, and the results will be stored in the database.
        splitter: object
            Defaults to a ``KFold(5, True, 42)`` object. You can pass a splitter defined by yourself or by another package,
            like :class:`sklearn.model_selection.StratifiedKFold`.
        specific_task_token: str
        should_store_intermediate_result: bool
        additional_info: dict
        fit_ensemble_params: str, dict, None, bool
            If this param is None, the program will not do ensembling.

            If this param is "auto" or True, the top 10 models will be combined by a stacking ensemble.
        Returns
        -------
        self
        '''
        setup_logger(self.log_path, self.log_config)
        self.input_experiment_data(X_train, y_train, X_test, y_test, groups,
                                   upload_type, sub_sample_indexes,
                                   sub_feature_indexes, column_descriptions,
                                   metric, splitter, specific_task_token,
                                   dataset_metadata, task_metadata)
        if is_not_realy_run:
            return self
        self.insert_experiment_record(additional_info, fit_ensemble_params)
        self.run_nameserver()
        self.run_evaluators()
        self.run_optimizer()
        self.optimizer.shutdown(shutdown_workers=True)
        self.NS.shutdown()
        self.start_final_step(fit_ensemble_params)
        self.resource_manager.finish_experiment(self.log_path, self)
        return self
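A minimal usage sketch of calling ``fit`` with a custom splitter, as the docstring suggests (``X_train``/``y_train`` are placeholders prepared by the caller, not part of the original example):

from sklearn.model_selection import StratifiedKFold

# Override the default KFold(5, True, 42) with a stratified splitter, then
# let the top-10 stacking ensemble run afterwards ("auto", per the docstring).
classifier.fit(
    X_train, y_train,
    splitter=StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    fit_ensemble_params="auto",
)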
Code Example #4
File: base.py Project: Kyrie-Long007/auto-flow
 def fit_ensemble(self,
                  task_id=None,
                  hdl_id=None,
                  trials_fetcher_cls="GetBestK",
                  trials_fetcher_params=frozendict(k=10),
                  ensemble_type="stack",
                  ensemble_params=frozendict(),
                  fit_ensemble_alone=True):
     # fixme: ensemble_params may run into a problem: content that cannot be serialized could be passed in
     trials_fetcher_params = dict(trials_fetcher_params)
     ensemble_params = dict(ensemble_params)
     kwargs = get_valid_params_in_kwargs(self.fit_ensemble, locals())
     if task_id is None:
         assert hasattr(
             self.resource_manager,
             "task_id") and self.resource_manager.task_id is not None
         task_id = self.resource_manager.task_id
     self.task_id = task_id
     self.resource_manager.task_id = task_id
     if hdl_id is not None:
         self.hdl_id = hdl_id
         self.resource_manager.hdl_id = hdl_id
     if fit_ensemble_alone:
         setup_logger(self.log_path, self.log_config)
         experiment_config = {"fit_ensemble_params": kwargs}
         self.resource_manager.insert_experiment_record(
             ExperimentType.ENSEMBLE, experiment_config, {})
         self.experiment_id = self.resource_manager.experiment_id
     from autoflow.ensemble import trials_fetcher
     assert hasattr(trials_fetcher, trials_fetcher_cls)
     trials_fetcher_cls = getattr(trials_fetcher, trials_fetcher_cls)
     trials_fetcher_inst: TrialsFetcher = trials_fetcher_cls(
         resource_manager=self.resource_manager,
         task_id=task_id,
         hdl_id=hdl_id,
         **trials_fetcher_params)
     trial_ids = trials_fetcher_inst.fetch()
     estimator_list, y_true_indexes_list, y_preds_list = TrainedDataFetcher(
         task_id, hdl_id, trial_ids, self.resource_manager).fetch()
     # todo: only the validation-set data is fetched here, not the test-set data; to be extended
     ml_task, y_true = self.resource_manager.get_ensemble_needed_info(
         task_id)
     if len(estimator_list) == 0:
         raise ValueError("Length of estimator_list must be >= 1.")
     elif len(estimator_list) == 1:
         self.logger.info(
             "Length of estimator_list == 1, skipping ensemble.")
         if ml_task.mainTask == "classification":
             ensemble_estimator = VoteClassifier(estimator_list[0])
         else:
             ensemble_estimator = MeanRegressor(estimator_list[0])
     else:
         ensemble_estimator_package_name = f"autoflow.ensemble.{ensemble_type}.{ml_task.role}"
         ensemble_estimator_package = import_module(
             ensemble_estimator_package_name)
         ensemble_estimator_class_name = get_class_name_of_module(
             ensemble_estimator_package_name)
         ensemble_estimator_class = getattr(ensemble_estimator_package,
                                            ensemble_estimator_class_name)
         # ensemble_estimator : EnsembleEstimator
         ensemble_estimator = ensemble_estimator_class(**ensemble_params)
         ensemble_estimator.fit_trained_data(estimator_list,
                                             y_true_indexes_list,
                                             y_preds_list, y_true)
     self.ensemble_estimator = ensemble_estimator
     if fit_ensemble_alone:
         self.estimator = self.ensemble_estimator
         self.resource_manager.finish_experiment(self.log_path, self)
     return self.ensemble_estimator
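A hypothetical call matching the signature above: ensemble the 5 best trials of a previously finished task (the ``task_id`` value is a placeholder):

ensemble = classifier.fit_ensemble(
    task_id="<task_id of a finished experiment>",  # placeholder value
    trials_fetcher_cls="GetBestK",   # resolved via getattr on autoflow.ensemble.trials_fetcher
    trials_fetcher_params={"k": 5},  # fetch only the 5 best trials
    ensemble_type="stack",           # imports autoflow.ensemble.stack.<ml_task.role>
)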
Code Example #5
File: base.py Project: Kyrie-Long007/auto-flow
 def __init__(self,
              resource_manager: Union[ResourceManager, str] = None,
              hdl_constructor: Union[HDL_Constructor, None, dict] = None,
              min_budget: Optional[float] = None,
              max_budget: Optional[float] = None,
              eta: Optional[float] = None,
              SH_only: bool = False,
              budget2kfold: Optional[Dict[float, int]] = None,
              algo2budget_mode: Optional[Dict[str, str]] = None,
              algo2weight_mode: Optional[Dict[str, str]] = None,
              algo2iter: Optional[Dict[str, int]] = None,
              specific_out_feature_groups_mapper: Dict[str, str] = frozendict(
                  {"encode.ordinal": "ordinal"}),
              only_use_subsamples_budget_mode: bool = False,
              n_folds: int = 5,
              holdout_test_size: float = 1 / 3,
              n_keep_samples: int = 30000,
              min_n_samples_for_SH: int = 1000,
              max_n_samples_for_CV: int = 5000,
              warm_start=True,
              config_generator: Union[str, Type] = "ET",
              config_generator_params: dict = frozendict(),
              ns_host: str = "127.0.0.1",
              ns_port: int = 9090,
              worker_host: str = "127.0.0.1",
              master_host: str = "127.0.0.1",
              n_workers: int = 1,
              n_iterations: Optional[int] = None,
              min_n_workers: int = 1,
              concurrent_type: str = "process",
              model_registry: Dict[str, Type] = None,
              n_jobs_in_algorithm: Optional[int] = None,
              random_state: int = 42,
              log_path: str = "autoflow.log",
              log_config: Optional[dict] = None,
              highR_nan_threshold: float = 0.5,
              highC_cat_threshold: int = 4,
              consider_ordinal_as_cat: bool = False,
              should_store_intermediate_result: bool = False,
              should_finally_fit: bool = False,
              should_calc_all_metrics: bool = True,
              should_stack_X: bool = True,
              debug_evaluator: bool = False,
              initial_points=None,
              imbalance_threshold=2,
              **kwargs):
     self.imbalance_threshold = imbalance_threshold
     self.initial_points = initial_points
     self.logger = get_logger(self)
     self.specific_out_feature_groups_mapper = specific_out_feature_groups_mapper
     self.warm_start = warm_start
     self.debug_evaluator = debug_evaluator
     self.only_use_subsamples_budget_mode = only_use_subsamples_budget_mode
     if algo2iter is None:
         algo2iter = get_default_algo2iter()
     if algo2weight_mode is None:
         algo2weight_mode = get_default_algo2weight_mode()
     if algo2budget_mode is None:
         algo2budget_mode = get_default_algo2budget_mode()
     if only_use_subsamples_budget_mode:
         algo2budget_mode = {
             key: SUBSAMPLES_BUDGET_MODE
             for key in algo2budget_mode
         }
     self.algo2iter = algo2iter
     self.algo2budget_mode = algo2budget_mode
     self.algo2weight_mode = algo2weight_mode
     self.budget2kfold = budget2kfold
     self.max_n_samples_for_CV = max_n_samples_for_CV
     self.min_n_samples_for_SH = min_n_samples_for_SH
     self.n_keep_samples = n_keep_samples
     self.config_generator_params = dict(config_generator_params)
     assert isinstance(n_folds, int) and n_folds >= 1
     # fixme: support int
     assert isinstance(holdout_test_size,
                       float) and 0 < holdout_test_size < 1
     self.holdout_test_size = holdout_test_size
     self.n_folds = n_folds
     self.min_n_workers = min_n_workers
     self.master_host = master_host
     if n_jobs_in_algorithm is None:
         assert isinstance(n_workers, int) and n_workers >= 1, ValueError(
             f"Invalid n_workers {n_workers}")
         n_jobs_in_algorithm = int(
             np.clip(mp.cpu_count() // n_workers, 1, mp.cpu_count()))
         self.logger.info(
             f"`n_jobs_in_algorithm` is parsed to {n_jobs_in_algorithm}")
     self.n_jobs_in_algorithm = n_jobs_in_algorithm
     self.n_iterations = n_iterations
     self.concurrent_type = concurrent_type
     self.n_workers = n_workers
     self.worker_host = worker_host
     self.ns_port = ns_port
     self.ns_host = ns_host
     self.config_generator = config_generator
     self.SH_only = SH_only
     self.eta = eta
     self.max_budget = max_budget
     self.min_budget = min_budget
     self.should_stack_X = should_stack_X
     self.consider_ordinal_as_cat = consider_ordinal_as_cat
     if model_registry is None:
         model_registry = {}
     assert isinstance(model_registry, dict)
     for key, value in model_registry.items():
         assert inspect.isclass(value)
     self.model_registry = model_registry
     self.should_finally_fit = should_finally_fit
     self.should_store_intermediate_result = should_store_intermediate_result
     self.should_calc_all_metrics = should_calc_all_metrics
     self.log_config = log_config
     self.highR_nan_threshold = highR_nan_threshold
     self.highC_cat_threshold = highC_cat_threshold
     # ---logger------------------------------------
     self.log_path = os.path.expandvars(os.path.expanduser(log_path))
     setup_logger(self.log_path, self.log_config)
     # ---random_state-----------------------------------
     self.random_state = random_state
     # ---hdl_constructor--------------------------
     self.hdl_constructor = instancing(hdl_constructor, HDL_Constructor,
                                       kwargs)
     # ---resource_manager-----------------------------------
     self.resource_manager: ResourceManager = instancing(
         resource_manager, ResourceManager, kwargs)
     # ---member_variable------------------------------------
     self.estimator = None
     self.ensemble_estimator = None
     self.evaluators = []
     self.data_manager = None
     self.NS = None
     self.optimizer = None
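The default for ``n_jobs_in_algorithm`` above splits the available CPUs among the workers. A standalone worked example of that computation (``n_workers = 3`` is hypothetical):

import multiprocessing as mp

import numpy as np

n_workers = 3  # hypothetical worker count
# On an 8-CPU machine: 8 // 3 = 2 jobs per algorithm, clipped into [1, 8].
n_jobs_in_algorithm = int(np.clip(mp.cpu_count() // n_workers, 1, mp.cpu_count()))
print(n_jobs_in_algorithm)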