Example #1
    def generate_trial(self, mode='fast', n_jobs=-1, time_left_for_this_task=3600, per_run_time_limit=360,
                       memory_limit=8192, initial_configurations_via_metalearning=25, ensemble_size=50,
                       ensemble_nbest=50, seed=1,
                       include_estimators=['random_forest', 'LogisticRegressionSK', 'LogisticRegressionSMAC'],
                       exclude_estimators=None, include_preprocessors=None, exclude_preprocessors=None,
                       resampling_strategy='cv', resampling_strategy_arguments={'folds': 5},
                       tmp_folder="/tmp/autosklearn_tmp", output_folder="/tmp/autosklearn_output",
                       delete_tmp_folder_after_terminate=True, delete_output_folder_after_terminate=True,
                       disable_evaluator_output=False, get_smac_object_callback=None, smac_scenario_args=None,
                       logging_config=None):
        """ generate trial's base params
        estimators list:
        # Combinations of non-linear models with feature learning:
        classifiers_ = ["adaboost", "decision_tree", "extra_trees",
                        "gradient_boosting", "k_nearest_neighbors",
                        "libsvm_svc", "random_forest", "gaussian_nb",
                        "decision_tree", "xgradient_boosting", 
                        "LogisticRegressionSK", "LogisticRegressionSMAC"]

        # Combinations of tree-based models with feature learning:
        regressors_ = ["adaboost", "decision_tree", "extra_trees",
                       "gaussian_process", "gradient_boosting",
                       "k_nearest_neighbors", "random_forest", "xgradient_boosting"]


        Keyword Arguments:
            mode {str} -- [description] (default: {'fast'})
            n_jobs {int} -- [description] (default: {-1})
            mode {str} -- 
            estimators list

        Returns:
            [type] -- [description]
        """
        n_jobs = basic.get_approp_n_jobs(n_jobs)
        if mode == 'fast':
            # quick smoke-test preset: small budgets, tiny ensemble
            time_left_for_this_task = 120
            per_run_time_limit = 30
            memory_limit = 4096
            ensemble_size = 5
            ensemble_nbest = 2
        elif mode == 'big':
            # long-run preset: larger ensemble and a 4-hour budget
            ensemble_size = 50
            ensemble_nbest = 20
            memory_limit = 10240
            # ensemble_memory_limit = 4096
            time_left_for_this_task = 14400
            per_run_time_limit = 1440
        else:
            # any other mode keeps the explicitly passed arguments
            pass
        import os
        from pathlib import Path
        home_dir = str(Path.home())
        # ensure the parent of the per-trial folders exists under the home dir
        if not os.path.exists(home_dir + '/tmp'):
            os.mkdir(home_dir + '/tmp')
        
        # split into several trials, and ensemble them later
        # for est in include_estimators:
        auto_sklearn_trial = create_trial(self)
        # auto_sklearn_trial.storage.clean_storage()
        train_folder = home_dir + tmp_folder + "_" + str(self.study_id) + "_" + str(auto_sklearn_trial.number)
        train_output_folder = home_dir + output_folder + "_" + str(self.study_id) + "_" + str(auto_sklearn_trial.number)
        
        if not os.path.exists(tmp_folder):
            os.mkdir(tmp_folder)
        if not os.path.exists(output_folder):
            os.mkdir(output_folder)
        self.logger.info('The tmp results will be saved in {}'.format(train_folder))
        self.logger.info('The classifier output will be saved in {}'.format(train_output_folder))
        self.logger.info('The tmp folder will be deleted after termination.')

        metrics_func = self._get_metric(self.metrics)

        base_params = {'n_jobs': n_jobs,
                       'time_left_for_this_task': time_left_for_this_task,
                       'per_run_time_limit': per_run_time_limit,
                       'initial_configurations_via_metalearning': initial_configurations_via_metalearning,
                       'ensemble_size': ensemble_size,
                       'ensemble_nbest': ensemble_nbest,
                       # 'ensemble_memory_limit': ensemble_memory_limit,
                       'seed': seed,
                       'memory_limit': memory_limit,
                       'include_estimators': include_estimators,
                       'exclude_estimators': exclude_estimators,
                       'include_preprocessors': include_preprocessors,
                       'exclude_preprocessors': exclude_preprocessors,
                       'resampling_strategy': resampling_strategy,
                       'resampling_strategy_arguments': resampling_strategy_arguments,
                       'tmp_folder': train_folder,
                       'output_folder': train_output_folder,
                       'delete_tmp_folder_after_terminate': delete_tmp_folder_after_terminate,
                       'delete_output_folder_after_terminate': delete_output_folder_after_terminate,
                       # 'shared_mode': shared_mode,
                       'disable_evaluator_output': disable_evaluator_output,
                       'get_smac_object_callback': get_smac_object_callback,
                       'smac_scenario_args': smac_scenario_args,
                       'logging_config': logging_config,
                       'metric': metrics_func}
        # n_jobs ":  basic.get_approp_n_jobs(n_jobs)
        
        auto_sklearn_trial.clf_params = base_params
        self.trial_list.append(auto_sklearn_trial)
        return auto_sklearn_trial
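
A minimal usage sketch for generate_trial. This is a hypothetical caller: the Study class name, its constructor, and the metrics argument are assumptions and are not shown in the example above; only generate_trial itself comes from it.

    # Hypothetical caller -- Study(...) is an assumed stand-in for the class
    # that defines generate_trial.
    study = Study(metrics='logloss')

    # 'fast' overrides the time/memory/ensemble budgets for a quick smoke run
    trial = study.generate_trial(mode='fast', include_estimators=['random_forest'])

    print(trial.clf_params['time_left_for_this_task'])  # 120 under the 'fast' preset
    print(len(study.trial_list))  # the new trial is registered on the study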
Example #2
    def optimize(
            self, X_test, y_test,
            timeout=None,  # type: Optional[float]
            n_jobs=-1,  # type: int
            catch=(Exception, ),  # type: Union[Tuple[()], Tuple[Type[Exception]]]
            precision=None,
    ):
        # type: (...) -> None
        """Optimize an objective function.

        Args:
            func:
                A callable that implements objective function.
            n_jobs:
                default = 1; jobs to run trials.
            timeout:
                Stop study after the given number of second(s). If this argument is set to
                :obj:`None`, the study is executed without time limitation. If :obj:`n_trials` is
                also set to :obj:`None`, the study continues to create trials until it receives a
                termination signal such as Ctrl+C or SIGTERM.
            metrics:
                metrics to optimize study.
            catch:
                A study continues to run even when a trial raises one of exceptions specified in
                this argument. Default is (`Exception <https://docs.python.org/3/library/
                exceptions.html#Exception>`_,), where all non-exit exceptions are handled
                by this logic.

        """

        X_test, y_test = check_X_y(X_test, y_test)
        # fall back to the study's own precision when none is given
        if precision is None:
            precision = self.precision
        X_test = X_test.astype(dtype=precision, copy=False)
        self.storage.set_test_storage(X_test, y_test)
        # drop the local references; the data now lives in storage
        del X_test
        del y_test
        gc.collect()
        # TODO: preprocess trial
        if self.sample_method == 'lus':
            self.logger.info('Sampling training dataset with lus. Original data shape is {0}'.format(
                str(self.storage.X_train.shape)))
            self.storage.X_train, self.storage.y_train = self.sampler.fit_transform(
                self.storage.X_train, self.storage.y_train)
            self.logger.info(
                'Sampling is done. Sampled data shape is {0}'.format(str(self.storage.X_train.shape)))

        if self.is_autobin:
            self.logger.info('Begin autobinning data with {} using method {}'.format(
                type(self.binner), self.binner.binning_method))
            self.binner.fit(self.storage.X_train, self.storage.y_train)
            self.storage.X_train = self.binner.transform(self.storage.X_train)
            self.storage.X_test = self.binner.transform(self.storage.X_test)
            self.logger.warning(
                'Binning is done. Binning transforms the test data into the new bins.')
            self._pipe_add(self.binner)
        n_jobs = basic.get_approp_n_jobs(n_jobs)

        y = np.copy(self.storage.y_train)
        self.classes_ = np.unique(y)
        n_classes = len(self.classes_)
        self.n_classes = n_classes
        classes_ = self.classes_


        if self.trial_list is None or self.trial_list == []:
            self.logger.warning('No trials given; initializing with default params.')
            self.trial_list = self._init_trials(n_jobs)
        if self.metrics in ['logloss']:
            # logloss is minimized, unlike score metrics that are maximized
            self.storage.direction = basic.StudyDirection.MINIMIZE
        # Currently, multiprocessing is guaranteed inside each Trial.
        # if n_jobs == 1:
        #     self._optimize_sequential(self.trial_list, timeout, catch)
        # else:
        #     self._optimize_parallel(self.trial_list, timeout, n_jobs, catch)
        import warnings
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", category=RuntimeWarning)
            # do not generate the clf in advance
            self._optimize_sequential(
                self.trial_list, timeout, catch, metrics=self.metrics)
            self._make_ensemble()
            self._pipe_add(self.best_trial.clf)
            self._export_model(self.export_model_path)
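
A minimal end-to-end sketch combining both examples. The Study class and its constructor are assumptions; set_train_storage is only referenced in the commented-out code of Example #2, so treat it as a hypothetical API. Only generate_trial and optimize come from the examples above.

    # Hypothetical end-to-end usage -- Study(...) and set_train_storage(...)
    # are assumed stand-ins, not confirmed APIs of the source project.
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split

    X, y = load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

    study = Study(metrics='logloss')
    study.storage.set_train_storage(X_train, y_train)
    study.generate_trial(mode='fast')

    # optimize() validates the test set, runs each trial sequentially while
    # catching non-exit exceptions, builds an ensemble, and exports the model.
    study.optimize(X_test, y_test, timeout=600)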