def __init__(
        self,
        tuner: Union[Tuner, List[Tuner], None, dict] = None,
        hdl_constructor: Union[HDL_Constructor, List[HDL_Constructor], None, dict] = None,
        resource_manager: Union[ResourceManager, str] = None,
        random_state=42,
        log_file: str = None,
        log_config: Optional[dict] = None,
        highR_nan_threshold=0.5,
        highR_cat_threshold=0.5,
        should_store_intermediate_result=False,
        should_finally_fit=False,
        should_calc_all_metrics=True,
        **kwargs
):
    '''
    Parameters
    ----------
    tuner: :class:`autoflow.tuner.tuner.Tuner` or None
        ``Tuner`` is the class that drives the abstract search process.
    hdl_constructor: :class:`autoflow.hdl.hdl_constructor.HDL_Constructor` or None
        ``HDL`` is an abbreviation of Hyper-parameter Descriptions Language.
        It describes an abstract hyper-parameter space that is independent of any concrete implementation.

        ``HDL_Constructor`` is the class responsible for translating a dict-type ``DAG-workflow`` into ``H.D.L``.
    resource_manager: :class:`autoflow.manager.resource_manager.ResourceManager` or None
        ``ResourceManager`` is the class that manages computing resources such as the ``file_system`` and the ``data_base``.
    random_state: int
        Random state.
    log_file: path
        File in which to store the log. If None, ``autoflow.log`` will be used.
    log_config: dict
        Logging configuration.
    highR_nan_threshold: float
        High-ratio NaN threshold. You can find examples and practice in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`.
    highR_cat_threshold: float
        High-cardinality categorical feature threshold. You can find examples and practice in :class:`autoflow.hdl.hdl_constructor.HDL_Constructor`.
    kwargs
        If parameters such as ``tuner``, ``hdl_constructor`` or ``resource_manager`` are passed as None,
        you can pass keyword arguments to construct them implicitly. See the following example.

    Examples
    ---------
    In this example, you can see a trick to set kwargs parameters without explicitly initializing
    :class:`autoflow.hdl.hdl_constructor.HDL_Constructor` or other classes.

    The user passes ``DAG_workflow`` and ``hdl_bank`` as keyword arguments,
    and ``hdl_constructor`` is instantiated from kwargs implicitly.

    >>> from autoflow import AutoFlowClassifier
    >>> classifier = AutoFlowClassifier(DAG_workflow={"num->target":["lightgbm"]},
    ...     hdl_bank={"classification":{"lightgbm":{"boosting_type": {"_type": "choice", "_value":["gbdt","dart","goss"]}}}})
    AutoFlowClassifier(hdl_constructor=HDL_Constructor(
        DAG_workflow={'num->target': ['lightgbm']}
        hdl_bank_path=None
        hdl_bank={'classification': {'lightgbm': {'boosting_type': {'_type': 'choice', '_value': ['gbdt', 'dart', 'goss']}}}}
        included_classifiers=('adaboost', 'catboost', 'decision_tree', 'extra_trees', 'gaussian_nb', 'k_nearest_neighbors', 'liblinear_svc', 'lib...
    '''
    self.should_finally_fit = should_finally_fit
    self.should_store_intermediate_result = should_store_intermediate_result
    self.should_calc_all_metrics = should_calc_all_metrics
    self.log_config = log_config
    self.highR_nan_threshold = highR_nan_threshold
    self.highR_cat_threshold = highR_cat_threshold
    # ---logger------------------------------------
    self.log_file = log_file
    setup_logger(self.log_file, self.log_config)
    self.logger = get_logger(self)
    # ---random_state-----------------------------------
    self.random_state = random_state
    # ---tuner-----------------------------------
    tuner = instancing(tuner, Tuner, kwargs)
    # ---tuners-----------------------------------
    self.tuners = sequencing(tuner, Tuner)
    self.tuner = self.tuners[0]
    # ---hdl_constructor--------------------------
    hdl_constructor = instancing(hdl_constructor, HDL_Constructor, kwargs)
    # ---hdl_constructors-------------------------
    self.hdl_constructors = sequencing(hdl_constructor, HDL_Constructor)
    self.hdl_constructor = self.hdl_constructors[0]
    # ---resource_manager-----------------------------------
    self.resource_manager = instancing(resource_manager, ResourceManager, kwargs)
    # ---member_variable------------------------------------
    self.estimator = None
    self.ensemble_estimator = None
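# A minimal usage sketch of the constructor above. Passing ``Tuner()`` with no arguments
# is an assumption for illustration, not a documented default; ``DAG_workflow`` is consumed
# by the HDL_Constructor that is built implicitly from kwargs, as in the docstring example.
from autoflow import AutoFlowClassifier
from autoflow.tuner.tuner import Tuner

clf = AutoFlowClassifier(
    tuner=Tuner(),                               # may also be a list of Tuners, one per search stage
    DAG_workflow={"num->target": ["lightgbm"]},  # picked up by the implicitly built HDL_Constructor
    random_state=42,
)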
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Author  : qichun tang
# @Contact : [email protected]
import json
import random
from pathlib import Path

from ConfigSpace import Configuration
from joblib import load

from autoflow.opt.config_generators.bocg import BayesianOptimizationConfigGenerator
from autoflow.opt.structure import Job
from autoflow.utils.logging_ import setup_logger

setup_logger()
config_space = load("config_space.bz2")
trial_records = json.loads(Path("trial.json").read_text())
bocg = BayesianOptimizationConfigGenerator(
    config_space, [0, 1 / 16],
    min_points_in_model=20,
    loss_transformer="log_scaled",
    config_transformer_params={"impute": -1, "ohe": False}
)
random.seed(10)
random.shuffle(trial_records)
# warm_start
# budget=0
configs = []
losses = []
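# A hypothetical continuation of the warm-start block above. The record keys ("config", "loss")
# are assumptions about the schema of trial.json; adapt them to the actual record structure.
for record in trial_records:
    configs.append(Configuration(config_space, values=record["config"]))
    losses.append(record["loss"])
# How configs/losses are handed to ``bocg`` depends on its warm-start interface,
# which is not shown in this excerpt.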
def fit(
        self,
        X_train: Union[np.ndarray, pd.DataFrame, DataFrameContainer, str],
        y_train=None,
        X_test: Union[np.ndarray, pd.DataFrame, DataFrameContainer, str] = None,
        y_test=None,
        groups=None,
        upload_type="fs",
        sub_sample_indexes=None,
        sub_feature_indexes=None,
        column_descriptions: Optional[Dict] = frozendict(),
        metric=None,
        splitter=None,
        specific_task_token="",
        dataset_metadata: dict = frozendict(),
        task_metadata: dict = frozendict(),
        additional_info: dict = frozendict(),
        fit_ensemble_params: Union[str, Dict[str, Any], None, bool] = "auto",
        is_not_realy_run=False,
):
    '''
    Parameters
    ----------
    X_train: :class:`numpy.ndarray` or :class:`pandas.DataFrame`
    y_train: :class:`numpy.ndarray` or :class:`pandas.Series` or str
    X_test: :class:`numpy.ndarray` or :class:`pandas.DataFrame` or None
    y_test: :class:`numpy.ndarray` or :class:`pandas.Series` or str
    column_descriptions: dict
        Description of each column's feature_group. You can find the full definition in
        :class:`autoflow.manager.data_manager.DataManager`.
    dataset_metadata: dict
        Dataset's metadata.
    metric: :class:`autoflow.metrics.Scorer` or None
        If ``metric`` is None:

        if it's a classification task, :obj:`autoflow.metrics.accuracy` will be used by default.

        if it's a regression task, :obj:`autoflow.metrics.r2` will be used by default.
    should_calc_all_metrics: bool
        If ``True``, all the metrics supported in the current task will be calculated,
        and the results will be stored in the database.
    splitter: object
        Default is a ``KFold(5, True, 42)`` object. You can pass a splitter defined by yourself
        or by another package, like :class:`sklearn.model_selection.StratifiedKFold`.
    specific_task_token: str
    should_store_intermediate_result: bool
    additional_info: dict
    fit_ensemble_params: str, dict, None, bool
        If this param is None, the program will not do ensemble.

        If this param is "auto" or True, the top 10 models will be combined by a stacking ensemble.

    Returns
    -------
    self
    '''
    setup_logger(self.log_path, self.log_config)
    self.input_experiment_data(
        X_train, y_train, X_test, y_test, groups, upload_type, sub_sample_indexes,
        sub_feature_indexes, column_descriptions, metric, splitter, specific_task_token,
        dataset_metadata, task_metadata)
    if is_not_realy_run:
        return self
    self.insert_experiment_record(additional_info, fit_ensemble_params)
    self.run_nameserver()
    self.run_evaluators()
    self.run_optimizer()
    self.optimizer.shutdown(shutdown_workers=True)
    self.NS.shutdown()
    self.start_final_step(fit_ensemble_params)
    self.resource_manager.finish_experiment(self.log_path, self)
    return self
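# A usage sketch for ``fit`` under the documented defaults. The column names ("age", "sex",
# "label") and the DataFrame contents are hypothetical; the {"target": <column>} form of
# column_descriptions is an assumption, see DataManager for the full definition.
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from autoflow import AutoFlowClassifier

df = pd.DataFrame({"age": [23, 45, 31, 52], "sex": ["m", "f", "f", "m"], "label": [0, 1, 0, 1]})
clf = AutoFlowClassifier()
clf.fit(
    X_train=df,
    column_descriptions={"target": "label"},                      # which column is the target
    splitter=StratifiedKFold(5, shuffle=True, random_state=42),   # overrides the default KFold(5, True, 42)
    fit_ensemble_params="auto",                                   # stack the top 10 models after the search
)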
def fit_ensemble(
        self,
        task_id=None,
        hdl_id=None,
        trials_fetcher_cls="GetBestK",
        trials_fetcher_params=frozendict(k=10),
        ensemble_type="stack",
        ensemble_params=frozendict(),
        fit_ensemble_alone=True
):
    # fixme: ensemble_params may contain content that cannot be serialized
    trials_fetcher_params = dict(trials_fetcher_params)
    ensemble_params = dict(ensemble_params)
    kwargs = get_valid_params_in_kwargs(self.fit_ensemble, locals())
    if task_id is None:
        assert hasattr(self.resource_manager, "task_id") and self.resource_manager.task_id is not None
        task_id = self.resource_manager.task_id
    self.task_id = task_id
    self.resource_manager.task_id = task_id
    if hdl_id is not None:
        self.hdl_id = hdl_id
        self.resource_manager.hdl_id = hdl_id
    if fit_ensemble_alone:
        setup_logger(self.log_path, self.log_config)
        experiment_config = {"fit_ensemble_params": kwargs}
        self.resource_manager.insert_experiment_record(ExperimentType.ENSEMBLE, experiment_config, {})
        self.experiment_id = self.resource_manager.experiment_id
    from autoflow.ensemble import trials_fetcher
    assert hasattr(trials_fetcher, trials_fetcher_cls)
    trials_fetcher_cls = getattr(trials_fetcher, trials_fetcher_cls)
    trials_fetcher_inst: TrialsFetcher = trials_fetcher_cls(
        resource_manager=self.resource_manager, task_id=task_id, hdl_id=hdl_id,
        **trials_fetcher_params)
    trial_ids = trials_fetcher_inst.fetch()
    estimator_list, y_true_indexes_list, y_preds_list = TrainedDataFetcher(
        task_id, hdl_id, trial_ids, self.resource_manager).fetch()
    # todo: only the validation-set data is fetched here, not the test-set data. To be extended.
    ml_task, y_true = self.resource_manager.get_ensemble_needed_info(task_id)
    if len(estimator_list) == 0:
        raise ValueError("Length of estimator_list must be >= 1.")
    elif len(estimator_list) == 1:
        self.logger.info("Length of estimator_list == 1, don't do ensemble.")
        if ml_task.mainTask == "classification":
            ensemble_estimator = VoteClassifier(estimator_list[0])
        else:
            ensemble_estimator = MeanRegressor(estimator_list[0])
    else:
        ensemble_estimator_package_name = f"autoflow.ensemble.{ensemble_type}.{ml_task.role}"
        ensemble_estimator_package = import_module(ensemble_estimator_package_name)
        ensemble_estimator_class_name = get_class_name_of_module(ensemble_estimator_package_name)
        ensemble_estimator_class = getattr(ensemble_estimator_package, ensemble_estimator_class_name)
        # ensemble_estimator : EnsembleEstimator
        ensemble_estimator = ensemble_estimator_class(**ensemble_params)
        ensemble_estimator.fit_trained_data(estimator_list, y_true_indexes_list, y_preds_list, y_true)
    self.ensemble_estimator = ensemble_estimator
    if fit_ensemble_alone:
        self.estimator = self.ensemble_estimator
        self.resource_manager.finish_experiment(self.log_path, self)
    return self.ensemble_estimator
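# A usage sketch for running ``fit_ensemble`` alone on a finished task. The task_id value is a
# hypothetical placeholder; "GetBestK" and "stack" are the documented defaults, shown explicitly.
ensemble = clf.fit_ensemble(
    task_id="some_finished_task_id",    # hypothetical; defaults to resource_manager.task_id
    trials_fetcher_cls="GetBestK",      # fetch the k best trials recorded for the task
    trials_fetcher_params={"k": 5},     # stack only the top 5 instead of the default 10
    ensemble_type="stack",
)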
def __init__(
        self,
        resource_manager: Union[ResourceManager, str] = None,
        hdl_constructor: Union[HDL_Constructor, None, dict] = None,
        min_budget: Optional[float] = None,
        max_budget: Optional[float] = None,
        eta: Optional[float] = None,
        SH_only: bool = False,
        budget2kfold: Optional[Dict[float, int]] = None,
        algo2budget_mode: Optional[Dict[str, str]] = None,
        algo2weight_mode: Optional[Dict[str, str]] = None,
        algo2iter: Optional[Dict[str, int]] = None,
        specific_out_feature_groups_mapper: Dict[str, str] = frozendict({"encode.ordinal": "ordinal"}),
        only_use_subsamples_budget_mode: bool = False,
        n_folds: int = 5,
        holdout_test_size: float = 1 / 3,
        n_keep_samples: int = 30000,
        min_n_samples_for_SH: int = 1000,
        max_n_samples_for_CV: int = 5000,
        warm_start=True,
        config_generator: Union[str, Type] = "ET",
        config_generator_params: dict = frozendict(),
        ns_host: str = "127.0.0.1",
        ns_port: int = 9090,
        worker_host: str = "127.0.0.1",
        master_host: str = "127.0.0.1",
        n_workers: int = 1,
        n_iterations: Optional[int] = None,
        min_n_workers: int = 1,
        concurrent_type: str = "process",
        model_registry: Dict[str, Type] = None,
        n_jobs_in_algorithm: Optional[int] = None,
        random_state: int = 42,
        log_path: str = "autoflow.log",
        log_config: Optional[dict] = None,
        highR_nan_threshold: float = 0.5,
        highC_cat_threshold: int = 4,
        consider_ordinal_as_cat: bool = False,
        should_store_intermediate_result: bool = False,
        should_finally_fit: bool = False,
        should_calc_all_metrics: bool = True,
        should_stack_X: bool = True,
        debug_evaluator: bool = False,
        initial_points=None,
        imbalance_threshold=2,
        **kwargs
):
    self.imbalance_threshold = imbalance_threshold
    self.initial_points = initial_points
    self.logger = get_logger(self)
    self.specific_out_feature_groups_mapper = specific_out_feature_groups_mapper
    self.warm_start = warm_start
    self.debug_evaluator = debug_evaluator
    self.only_use_subsamples_budget_mode = only_use_subsamples_budget_mode
    if algo2iter is None:
        algo2iter = get_default_algo2iter()
    if algo2weight_mode is None:
        algo2weight_mode = get_default_algo2weight_mode()
    if algo2budget_mode is None:
        algo2budget_mode = get_default_algo2budget_mode()
    if only_use_subsamples_budget_mode:
        algo2budget_mode = {key: SUBSAMPLES_BUDGET_MODE for key in algo2budget_mode}
    self.algo2iter = algo2iter
    self.algo2budget_mode = algo2budget_mode
    self.algo2weight_mode = algo2weight_mode
    self.budget2kfold = budget2kfold
    self.max_n_samples_for_CV = max_n_samples_for_CV
    self.min_n_samples_for_SH = min_n_samples_for_SH
    self.n_keep_samples = n_keep_samples
    self.config_generator_params = dict(config_generator_params)
    assert isinstance(n_folds, int) and n_folds >= 1  # fixme: support int
    assert isinstance(holdout_test_size, float) and 0 < holdout_test_size < 1
    self.holdout_test_size = holdout_test_size
    self.n_folds = n_folds
    self.min_n_workers = min_n_workers
    self.master_host = master_host
    if n_jobs_in_algorithm is None:
        assert isinstance(n_workers, int) and n_workers >= 1, ValueError(f"Invalid n_workers {n_workers}")
        n_jobs_in_algorithm = int(np.clip(mp.cpu_count() // n_workers, 1, mp.cpu_count()))
        self.logger.info(f"`n_jobs_in_algorithm` is parsed to {n_jobs_in_algorithm}")
    self.n_jobs_in_algorithm = n_jobs_in_algorithm
    self.n_iterations = n_iterations
    self.concurrent_type = concurrent_type
    self.n_workers = n_workers
    self.worker_host = worker_host
    self.ns_port = ns_port
    self.ns_host = ns_host
    self.config_generator = config_generator
    self.SH_only = SH_only
    self.eta = eta
    self.max_budget = max_budget
    self.min_budget = min_budget
    self.should_stack_X = should_stack_X
    self.consider_ordinal_as_cat = consider_ordinal_as_cat
    if model_registry is None:
        model_registry = {}
    assert isinstance(model_registry, dict)
    for key, value in model_registry.items():
        assert inspect.isclass(value)
    self.model_registry = model_registry
    self.should_finally_fit = should_finally_fit
    self.should_store_intermediate_result = should_store_intermediate_result
    self.should_calc_all_metrics = should_calc_all_metrics
    self.log_config = log_config
    self.highR_nan_threshold = highR_nan_threshold
    self.highC_cat_threshold = highC_cat_threshold
    # ---logger------------------------------------
    self.log_path = os.path.expandvars(os.path.expanduser(log_path))
    setup_logger(self.log_path, self.log_config)
    # ---random_state-----------------------------------
    self.random_state = random_state
    # ---hdl_constructor--------------------------
    self.hdl_constructor = instancing(hdl_constructor, HDL_Constructor, kwargs)
    # ---resource_manager-----------------------------------
    self.resource_manager: ResourceManager = instancing(resource_manager, ResourceManager, kwargs)
    # ---member_variable------------------------------------
    self.estimator = None
    self.ensemble_estimator = None
    self.evaluators = []
    self.data_manager = None
    self.NS = None
    self.optimizer = None
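# A usage sketch for the budget-aware constructor above. Using AutoFlowClassifier as the
# concrete class is an assumption (the class name is not shown in this excerpt), and the
# parameter comments reflect the usual successive-halving semantics of these names.
from autoflow import AutoFlowClassifier

clf2 = AutoFlowClassifier(
    min_budget=1 / 16,          # smallest budget rung (e.g. a 1/16 subsample of the data)
    max_budget=1,               # full budget at the final rung
    eta=4,                      # keep roughly the best 1/4 of configurations at each rung
    SH_only=True,               # run successive halving only, without Hyperband-style brackets
    n_workers=2,                # number of evaluator workers
    config_generator="ET",      # documented default surrogate for the config generator
)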