def generate_trial(self, mode='fast', n_jobs=-1, time_left_for_this_task=3600,
                   per_run_time_limit=360, memory_limit=8192,
                   initial_configurations_via_metalearning=25,
                   ensemble_size=50, ensemble_nbest=50, seed=1,
                   include_estimators=None, exclude_estimators=None,
                   include_preprocessors=None, exclude_preprocessors=None,
                   resampling_strategy='cv', resampling_strategy_arguments=None,
                   tmp_folder="/tmp/autosklearn_tmp",
                   output_folder="/tmp/autosklearn_output",
                   delete_tmp_folder_after_terminate=True,
                   delete_output_folder_after_terminate=True,
                   disable_evaluator_output=False,
                   get_smac_object_callback=None, smac_scenario_args=None,
                   logging_config=None):
    """Create one auto-sklearn trial, attach its parameter dict, and register it.

    Builds the full auto-sklearn keyword set for a single trial (time budget,
    ensembling, estimator/preprocessor filters, per-trial working folders) and
    appends the trial to ``self.trial_list``.

    Known classifier names (for ``include_estimators``/``exclude_estimators``):
        "adaboost", "decision_tree", "extra_trees", "gradient_boosting",
        "k_nearest_neighbors", "libsvm_svc", "random_forest", "gaussian_nb",
        "xgradient_boosting", "LogisticRegressionSK", "LogisticRegressionSMAC"
    Known regressor names:
        "adaboost", "decision_tree", "extra_trees", "gaussian_process",
        "gradient_boosting", "k_nearest_neighbors", "random_forest",
        "xgradient_boosting"

    Keyword Arguments:
        mode {str} -- 'fast' (small budgets, quick smoke run) or 'big'
            (large budgets); any other value keeps the caller-supplied
            budgets unchanged (default: {'fast'})
        n_jobs {int} -- requested parallelism; normalized via
            ``basic.get_approp_n_jobs`` (default: {-1})
        include_estimators {list|None} -- estimator names to search over;
            ``None`` selects the historical default of
            ['random_forest', 'LogisticRegressionSK', 'LogisticRegressionSMAC']
        resampling_strategy_arguments {dict|None} -- ``None`` selects the
            historical default of ``{'folds': 5}``
        (remaining keywords are forwarded verbatim to auto-sklearn)

    Returns:
        the newly created trial object, with ``clf_params`` populated
    """
    # Re-materialize mutable defaults per call (the old list/dict defaults
    # were shared across every invocation of this method).
    if include_estimators is None:
        include_estimators = ['random_forest', 'LogisticRegressionSK',
                              'LogisticRegressionSMAC']
    if resampling_strategy_arguments is None:
        resampling_strategy_arguments = {'folds': 5}

    n_jobs = basic.get_approp_n_jobs(n_jobs)
    if mode == 'fast':
        # Small budgets for a quick smoke run.
        time_left_for_this_task = 120
        per_run_time_limit = 30
        memory_limit = 4096
        ensemble_size = 5
        ensemble_nbest = 2
    elif mode == 'big':
        # Long, memory-heavy search.
        ensemble_size = 50
        ensemble_nbest = 20
        memory_limit = 10240
        time_left_for_this_task = 14400
        per_run_time_limit = 1440
    # Any other mode: keep the caller-supplied budgets as-is.

    from pathlib import Path
    home_dir = str(Path.home())
    # makedirs(exist_ok=True) replaces the racy exists()+mkdir() pair.
    os.makedirs(home_dir + '/tmp', exist_ok=True)

    auto_sklearn_trial = create_trial(self)
    # Per-trial working folders, made unique by study id and trial number.
    train_folder = (home_dir + tmp_folder + "_" + str(self.study_id)
                    + "_" + str(auto_sklearn_trial.number))
    train_output_folder = (home_dir + output_folder + "_" + str(self.study_id)
                           + "_" + str(auto_sklearn_trial.number))
    # NOTE(review): historically the bare tmp_folder/output_folder were
    # created here even though auto-sklearn receives train_folder /
    # train_output_folder; behavior kept — confirm whether the per-trial
    # folders should be pre-created instead.
    os.makedirs(tmp_folder, exist_ok=True)
    os.makedirs(output_folder, exist_ok=True)
    # Log the folders actually handed to auto-sklearn (the old messages
    # printed the base folders, which are not the ones used).
    self.logger.info('The tmp result will saved in {}'.format(train_folder))
    self.logger.info('The output of classifier will save in {}'.format(
        train_output_folder))
    self.logger.info('And it will delete tmp folder after terminate.')

    metrics_func = self._get_metric(self.metrics)
    base_params = {'n_jobs': n_jobs,
                   "time_left_for_this_task": time_left_for_this_task,
                   "per_run_time_limit": per_run_time_limit,
                   "initial_configurations_via_metalearning":
                       initial_configurations_via_metalearning,
                   "ensemble_size": ensemble_size,
                   "ensemble_nbest": ensemble_nbest,
                   "seed": seed,
                   "memory_limit": memory_limit,
                   "include_estimators": include_estimators,
                   "exclude_estimators": exclude_estimators,
                   "include_preprocessors": include_preprocessors,
                   "exclude_preprocessors": exclude_preprocessors,
                   "resampling_strategy": resampling_strategy,
                   "resampling_strategy_arguments": resampling_strategy_arguments,
                   "tmp_folder": train_folder,
                   "output_folder": train_output_folder,
                   "delete_tmp_folder_after_terminate":
                       delete_tmp_folder_after_terminate,
                   "delete_output_folder_after_terminate":
                       delete_output_folder_after_terminate,
                   "disable_evaluator_output": disable_evaluator_output,
                   "get_smac_object_callback": get_smac_object_callback,
                   "smac_scenario_args": smac_scenario_args,
                   "logging_config": logging_config,
                   'metric': metrics_func}
    auto_sklearn_trial.clf_params = base_params
    self.trial_list.append(auto_sklearn_trial)
    return auto_sklearn_trial
def optimize(
        self,
        X_test,
        y_test,
        timeout=None,  # type: Optional[float]
        n_jobs=-1,  # type: int
        # type: Union[Tuple[()], Tuple[Type[Exception]]]
        catch=(Exception, ),
        precision=None,
):
    # type: (...) -> None
    """Run the study: prepare data, execute all trials, ensemble, export.

    Args:
        X_test: Held-out feature matrix; validated via ``check_X_y``.
        y_test: Held-out target vector.
        timeout: Stop the study after the given number of seconds.
            :obj:`None` means no time limit — the study runs until it
            receives a termination signal such as Ctrl+C or SIGTERM.
        n_jobs: Requested parallelism, normalized via
            ``basic.get_approp_n_jobs`` (default: -1).
        catch: Exception types a trial may raise without aborting the
            study. Default ``(Exception,)`` — all non-exit exceptions
            are handled by this logic.
        precision: dtype to cast ``X_test`` to; :obj:`None` falls back
            to ``self.precision``.
    """
    X_test, y_test = check_X_y(X_test, y_test)
    # Bug fix: an explicitly passed `precision` used to be silently
    # ignored (only the default self.precision cast ever ran). An
    # explicit precision now wins; otherwise fall back to self.precision.
    X_test = X_test.astype(dtype=precision if precision else self.precision,
                           copy=False)
    self.storage.set_test_storage(X_test, y_test)
    # Drop local references so the only copy lives in storage.
    del X_test
    del y_test
    gc.collect()

    # TODO Preprocess Trial
    if self.sample_method == 'lus':
        self.logger.info(
            'Sampling training dataset with lus. Origin data shape is {0}'.format(
                str(self.storage.X_train.shape)))
        self.storage.X_train, self.storage.y_train = self.sampler.fit_transform(
            self.storage.X_train, self.storage.y_train)
        self.logger.info('Sampling is done. Sampled data shape is {0}'.format(
            str(self.storage.X_train.shape)))

    if self.is_autobin:
        self.logger.info("begin to autobinning data by {} with method {}".format(
            type(self.binner), self.binner.binning_method))
        self.binner.fit(self.storage.X_train, self.storage.y_train)
        self.storage.X_train = self.binner.transform(self.storage.X_train)
        self.storage.X_test = self.binner.transform(self.storage.X_test)
        self.logger.warning(
            'Binning is done. Binning would transform test_data to new bin.')
        self._pipe_add(self.binner)

    n_jobs = basic.get_approp_n_jobs(n_jobs)
    # np.unique does not mutate its input, so no defensive copy is needed.
    self.classes_ = np.unique(self.storage.y_train)
    self.n_classes = len(self.classes_)

    if self.trial_list is None or self.trial_list == []:
        self.logger.warning('no trials, init by default params.')
        self.trial_list = self._init_trials(n_jobs)
    if self.metrics in ['logloss']:
        # Loss-style metrics are minimized, not maximized.
        self.storage.direction = basic.StudyDirection.MINIMIZE

    # Parallelism is handled inside each trial; trials run sequentially.
    import warnings
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        # do not generate clf in advanced.
        self._optimize_sequential(
            self.trial_list, timeout, catch, metrics=self.metrics)
        self._make_ensemble()
    self._pipe_add(self.best_trial.clf)
    self._export_model(self.export_model_path)