min_impurity_decrease=self.min_impurity_decrease,
                random_state=self.random_state,
                n_jobs=self.n_jobs,
                class_weight=self.class_weight,
                warm_start=True)

        self.estimator.fit(X, y, sample_weight=sample_weight)
        return self

    def predict(self, X):
        """Return the predictions of the fitted estimator for ``X``.

        :param X: samples forwarded unchanged to ``self.estimator.predict``.
        :return: whatever the wrapped estimator's ``predict`` returns.
        :raises NotImplementedError: if ``self.estimator`` is still ``None``.
        """
        model = self.estimator
        if model is not None:
            return model.predict(X)
        raise NotImplementedError()


# Benchmark driver: for every requested dataset, run BO over the random-forest
# configuration space and pickle the resulting run history under logs/.
dataset_list = dataset_str.split(',')
check_datasets(dataset_list)
cs = get_cs()

# Cap the number of runs at 75% of the distinct configurations observed among
# 30000 samples of the space, and never exceed the requested run_count.
_run_count = min(int(len(set(cs.sample_configuration(30000))) * 0.75), run_count)
print(_run_count)

for dataset in dataset_list:
    node = load_data(dataset, '../soln-ml/', True, task_type=0)
    _x, _y = node.data[0], node.data[1]
    # Named `objective` (not `eval`) so the builtin eval() is not shadowed
    # for the remainder of the module.
    objective = partial(eval_func, x=_x, y=_y)
    bo = BO(objective, cs, max_runs=_run_count, time_limit_per_trial=600, sample_strategy=mode,
            rng=np.random.RandomState(1))
    bo.run()
    # NOTE: the file name carries the requested run_count, not the capped _run_count.
    with open('logs/%s-random_forest-%s-%d.pkl' % (dataset, mode, run_count), 'wb') as f:
        pickle.dump(bo.get_history().data, f)
# ---- Example no. 2 (original separator: "Esempio n. 2" / "0") ----
class BayesianOptimizationOptimizer(Optimizer):
    """Feature-engineering optimizer that searches a transformation pipeline with ``BO``.

    A configuration selects one option (or ``"empty"``) for each pipeline stage:
    preprocessor1, preprocessor2, balancer, rescaler, generator and selector.
    ``_parse`` turns such a configuration into a transformed data node.

    NOTE(review): ``self.root_node``, ``self.if_bal``, ``self.logger``,
    ``self.task_type`` and ``self.incumbent`` are not assigned in this class
    before use; presumably they are provided by the ``Optimizer`` base class.
    """

    def __init__(self, task_type, input_data: DataNode, evaluator: _BaseEvaluator,
                 model_id: str, time_limit_per_trans: int,
                 mem_limit_per_trans: int,
                 seed: int, n_jobs=1,
                 number_of_unit_resource=1,
                 time_budget=600, algo='smac'):
        """
            Set up the search space and the underlying BO optimizer.
        :param task_type: task identifier, compared against CLS_TASKS.
        :param input_data: data node handed to the base class.
        :param evaluator: callable used to score a transformed data node.
        :param model_id: name of the downstream model; also used as the BO task id.
        :param time_limit_per_trans: passed to BO as the per-trial time limit.
        :param mem_limit_per_trans: stored only; not used by this class.
        :param seed: seed for the BO random state.
        :param n_jobs: stored only; not used by this class.
        :param number_of_unit_resource: number of ``_iterate`` calls per ``iterate``.
        :param time_budget: seconds after which ``optimize`` stops iterating.
        :param algo: 'smac' or 'tpe'; selects how the search space is built.
        """
        super().__init__(str(__class__.__name__), task_type, input_data, seed)
        self.number_of_unit_resource = number_of_unit_resource
        self.iter_num_per_unit_resource = 10
        self.time_limit_per_trans = time_limit_per_trans
        self.mem_limit_per_trans = mem_limit_per_trans
        self.time_budget = time_budget
        self.evaluator = evaluator
        self.model_id = model_id

        self.incumbent_score = -np.inf
        self.fetch_incumbent = None
        self.baseline_score = -np.inf
        self.start_time = time.time()
        self.hp_config = None
        self.seed = seed
        self.n_jobs = n_jobs

        # Cache: configuration -> [parsed node, list of fitted transformations].
        self.node_dict = dict()

        self.early_stopped_flag = False
        self.is_finished = False
        self.iteration_id = 0

        self.evaluator.parse_needed = True
        # Prepare the hyperparameter space.
        self.hyperparameter_space = self._get_task_hyperparameter_space(optimizer=algo)
        if algo == 'smac':
            self.incumbent_config = self.hyperparameter_space.get_default_configuration()
        else:
            self.incumbent_config = None
        self.optimizer = BO(objective_function=self.evaluate_function,
                            config_space=self.hyperparameter_space,
                            max_runs=int(1e10),
                            task_id=self.model_id,
                            time_limit_per_trial=self.time_limit_per_trans,
                            rng=np.random.RandomState(self.seed))
        self.eval_dict = {}

    def evaluate_function(self, config, name='fe', data_subsample_ratio=1.0):
        """
            The config is the configuration that specifies the FE pipeline.
        :param name: accepted for interface compatibility; the evaluator is
            always invoked with name='fe'.
        :param data_subsample_ratio: forwarded to the evaluator.
        :param config: the FE pipeline configuration to evaluate.
        :return: the evaluation score.
        """
        input_node = self.root_node
        output_node = self._parse(input_node, config)
        output_node.config = config
        return self.evaluator(self.hp_config, data_node=output_node, name='fe',
                              data_subsample_ratio=data_subsample_ratio)

    def optimize(self):
        """
            Interface enables user to use this FE optimizer only.
            Iterates until the time budget (counted from construction time) is spent.
        :return: the incumbent data node.
        """
        self.is_finished = False
        while not self.is_finished:
            self.logger.debug('=' * 50)
            self.logger.debug('Start the ITERATION: %d' % self.iteration_id)
            self.logger.debug('=' * 50)
            self.iterate()
            if self.start_time + self.time_budget < time.time():
                self.is_finished = True
        return self.incumbent

    def iterate(self):
        """
            Run ``number_of_unit_resource`` optimization units.
        :return: the result of the last ``_iterate`` call, or None if none ran.
        """
        result = None
        for _ in range(self.number_of_unit_resource):
            result = self._iterate()
        return result

    def _iterate(self):
        """
            Run ``iter_num_per_unit_resource`` BO trials and refresh the incumbent.
        :return: (incumbent score, elapsed seconds, incumbent data node).
        """
        _start_time = time.time()
        for _ in range(self.iter_num_per_unit_resource):
            _, status, _, info = self.optimizer.iterate()
            if status == 1:
                print(info)

        runhistory = self.optimizer.get_history()
        # The run history stores values to be minimized; scores here are negated.
        self.eval_dict = {(fe_config, self.evaluator.hpo_config): -score for fe_config, score in
                          runhistory.data.items()}
        incumbents = runhistory.get_incumbents()
        iteration_cost = time.time() - _start_time
        # Guard against an empty incumbent list instead of failing with IndexError;
        # the previous incumbent is kept in that case.
        if len(incumbents) > 0:
            self.incumbent_config, iter_incumbent_score = incumbents[0]
            iter_incumbent_score = -iter_incumbent_score
            if iter_incumbent_score > self.incumbent_score:
                self.incumbent_score = iter_incumbent_score
                self.incumbent = self._parse(self.root_node, self.incumbent_config)

        # incumbent_score: the larger the better
        return self.incumbent_score, iteration_cost, self.incumbent

    @ignore_warnings([ConvergenceWarning, UserWarning, RuntimeWarning])
    def _parse(self, data_node: DataNode, config, record=False, skip_balance=False):
        """
            Transform the data node based on the pipeline specified by configuration.
            Stages are applied in the order: preprocessor1, preprocessor2, balancer,
            rescaler, generator, selector.
        :param data_node: the node to transform; a copy is transformed, not the input.
        :param config: configuration exposing ``get_dictionary()``.
        :param record: if True, also return the list of fitted transformations.
        :param skip_balance: if True, the balancer stage is treated as "empty".
        :return: the resulting data node, or (node, transformation list) when
            ``record`` is True. List order: [pre1, pre2, balancer, rescaler,
            generator, selector]; skipped stages are None.
        """
        # Remove the indicator in config_dict.
        config_dict = config.get_dictionary().copy()

        pre1_id = config_dict.pop('preprocessor1')
        pre2_id = config_dict.pop('preprocessor2')
        if skip_balance:
            # The 'balancer' key (if any) is deliberately left in config_dict here.
            bal_id = 'empty'
        else:
            bal_id = config_dict.pop('balancer', 'empty')
        gen_id = config_dict.pop('generator')
        res_id = config_dict.pop('rescaler')
        sel_id = config_dict.pop('selector')

        def tran_operate(tran_id, tran_set, params, node):
            """Instantiate transformation ``tran_id`` with its parameters and apply it."""
            if tran_id == "empty":
                return node, None
            # Collect the parameters whose key mentions this transformation;
            # the parameter name is the second ':'-separated field of the key.
            tran_params = {key.split(':')[1]: value
                           for key, value in params.items() if tran_id in key}
            tran = tran_set[tran_id](**tran_params)
            return tran.operate(node), tran

        _node = data_node.copy_()
        # Preprocessor1
        _node, pre1_tran = tran_operate(pre1_id, _preprocessor1, config_dict, _node)

        # Preprocessor2
        _node, pre2_tran = tran_operate(pre2_id, _preprocessor2, config_dict, _node)

        # Balancer
        _balancer = _bal_balancer if self.if_bal else _imb_balancer
        _node, bal_tran = tran_operate(bal_id, _balancer, config_dict, _node)

        # Rescaler
        _node, res_tran = tran_operate(res_id, _rescaler, config_dict, _node)

        # Generator
        _node, gen_tran = tran_operate(gen_id, _generator, config_dict, _node)

        # Selector
        _node, sel_tran = tran_operate(sel_id, _selector, config_dict, _node)

        _node.config = config
        if record:
            return _node, [pre1_tran, pre2_tran, bal_tran, res_tran, gen_tran, sel_tran]
        return _node

    def _get_task_hyperparameter_space(self, optimizer='tpe'):
        """
            Fetch the underlying hyperparameter space for feature engineering.
            Pipeline Space:
                1. preprocess: continuous_discretizer
                2. preprocess: discrete_categorizer,
                3. balancer: weight_balancer,
                             data_balancer.
                4. scaler: normalizer, scaler, quantile.
                5. generator: all generators.
                6. selector: all selectors.
        :param optimizer: 'smac' or 'tpe'.
        :return: hyper space.
        """
        if self.task_type in CLS_TASKS:
            self.trans_types = TRANS_CANDIDATES['classification'].copy()
        else:
            self.trans_types = TRANS_CANDIDATES['regression'].copy()

        # Avoid transformations, which would take too long
        # Combinations of non-linear models with feature learning.
        # feature_learning = ["kitchen_sinks", "kernel_pca", "nystroem_sampler"]
        if self.task_type in CLS_TASKS:
            classifier_set = {"adaboost", "decision_tree", "extra_trees",
                              "gradient_boosting", "k_nearest_neighbors",
                              "libsvm_svc", "random_forest", "gaussian_nb",
                              "lightgbm"}

            if self.model_id in classifier_set:
                for tran_id in [12, 13, 15]:
                    if tran_id in self.trans_types:
                        self.trans_types.remove(tran_id)

        generator_dict = self._get_configuration_space(_generator, self.trans_types, optimizer=optimizer)
        rescaler_dict = self._get_configuration_space(_rescaler, self.trans_types, optimizer=optimizer)
        selector_dict = self._get_configuration_space(_selector, self.trans_types, optimizer=optimizer)
        preprocessor1_dict = self._get_configuration_space(_preprocessor1, optimizer=optimizer)
        preprocessor2_dict = self._get_configuration_space(_preprocessor2, optimizer=optimizer)
        if self.task_type in CLS_TASKS:
            _balancer = _bal_balancer if self.if_bal else _imb_balancer
            balancer_dict = self._get_configuration_space(_balancer, optimizer=optimizer)
        else:
            # No balancer stage outside classification tasks.
            balancer_dict = None
        cs = self._build_hierachical_configspace(preprocessor1_dict, preprocessor2_dict,
                                                 generator_dict, rescaler_dict,
                                                 selector_dict, balancer_dict, optimizer=optimizer)
        return cs

    def _get_configuration_space(self, builtin_transformers, trans_type=None, optimizer='smac'):
        """
            Collect the search space of every admissible transformer.
        :param builtin_transformers: mapping from transformer key to transformer class.
        :param trans_type: admissible transformer type ids; None admits all.
        :param optimizer: 'smac' or 'tpe'; decides the empty fallback space.
        :return: dict mapping transformer key to its search space.
        """
        config_dict = dict()
        for tran_key in builtin_transformers:
            tran = builtin_transformers[tran_key]
            # The type id is read from a default-constructed instance.
            tran_id = tran().type
            if trans_type is None or tran_id in trans_type:
                try:
                    config_dict[tran_key] = tran.get_hyperparameter_search_space(optimizer=optimizer)
                except Exception:
                    # Fall back to an empty space when the transformer cannot provide one.
                    # `Exception` (not a bare except) so KeyboardInterrupt/SystemExit propagate.
                    if optimizer == 'smac':
                        config_dict[tran_key] = ConfigurationSpace()
                    elif optimizer == 'tpe':
                        config_dict[tran_key] = {}
        return config_dict

    def _build_hierachical_configspace(self, pre_config1, pre_config2, gen_config, res_config, sel_config,
                                       bal_config=None, optimizer='smac'):
        """
            Assemble the per-stage spaces into one hierarchical search space.
        :param bal_config: balancer spaces; the stage is omitted when None.
        :param optimizer: 'smac' builds a ConfigurationSpace, 'tpe' a hyperopt dict.
        :return: the combined space; None for any other ``optimizer`` value.
        """
        if optimizer == 'smac':
            cs = ConfigurationSpace()
            self._add_hierachical_configspace(cs, pre_config1, "preprocessor1")
            self._add_hierachical_configspace(cs, pre_config2, "preprocessor2")
            if bal_config is not None:
                self._add_hierachical_configspace(cs, bal_config, "balancer")
            self._add_hierachical_configspace(cs, gen_config, "generator")
            self._add_hierachical_configspace(cs, res_config, "rescaler")
            self._add_hierachical_configspace(cs, sel_config, "selector")
            return cs
        elif optimizer == 'tpe':
            from hyperopt import hp
            space = {}

            def dict2hi(dictionary):
                """Turn a dict into the list of (key, value) options hp.choice receives."""
                return list(dictionary.items())

            space['preprocessor1'] = hp.choice('preprocessor1', dict2hi(pre_config1))
            space['preprocessor2'] = hp.choice('preprocessor2', dict2hi(pre_config2))
            space['generator'] = hp.choice('generator', dict2hi(gen_config))
            if bal_config is not None:
                space['balancer'] = hp.choice('balancer', dict2hi(bal_config))
            space['rescaler'] = hp.choice('rescaler', dict2hi(res_config))
            space['selector'] = hp.choice('selector', dict2hi(sel_config))
            return space

    def _add_hierachical_configspace(self, cs, config, parent_name):
        """
            Add one pipeline stage to ``cs``: a categorical choice named
            ``parent_name`` (default "empty") plus each option's sub-space,
            conditioned on that option being selected.
        :param cs: the ConfigurationSpace to extend in place.
        :param config: dict mapping option name to its sub-space.
        :param parent_name: name of the stage hyperparameter.
        """
        config_cand = list(config.keys())
        config_cand.append("empty")
        config_option = CategoricalHyperparameter(parent_name, config_cand,
                                                  default_value=config_cand[-1])
        cs.add_hyperparameter(config_option)
        for config_item in config_cand:
            if config_item == 'empty':
                sub_configuration_space = ConfigurationSpace()
            else:
                sub_configuration_space = config[config_item]
            parent_hyperparameter = {'parent': config_option,
                                     'value': config_item}
            cs.add_configuration_space(config_item, sub_configuration_space,
                                       parent_hyperparameter=parent_hyperparameter)

    def fetch_nodes(self, n=5):
        """
            Re-parse a selection of evaluated configurations into data nodes.
            The selection holds up to ``n`` of the best-ranked configurations that
            beat the default configuration, plus the default and the incumbent.
        :param n: number of ranked configurations to pick before filtering.
        :return: list of data nodes; configurations that fail to parse or yield
            zero features are left out.
        """
        runhistory = self.optimizer.get_history()
        hist_dict = runhistory.data
        default_config = self.hyperparameter_space.get_default_configuration()

        if len(hist_dict) > 0:
            min_list = sorted(hist_dict.items(), key=lambda item: item[1])
        else:
            min_list = [(default_config, None)]

        if len(min_list) < 50:
            min_n = list(min_list[:n])
        else:
            # With a long history, take every third entry to diversify the selection.
            amplification_rate = 3
            chosen_idxs = np.arange(n) * amplification_rate
            # Skip indices beyond the history instead of raising IndexError.
            min_n = [min_list[idx] for idx in chosen_idxs if idx < len(min_list)]

        # Filter out the configurations that do not beat the default one.
        # `.get` avoids a KeyError when the default configuration was never evaluated.
        default_perf = hist_dict.get(default_config)
        if default_perf is not None:
            min_n = [item for item in min_n if item[1] < default_perf]

        if default_config not in [item[0] for item in min_n]:
            # Get default configuration.
            min_n.append((default_config, default_perf))

        # incumbent_config is None for non-'smac' runs until an iteration sets it.
        if (self.incumbent_config is not None
                and self.incumbent_config not in [item[0] for item in min_n]):
            min_n.append((self.incumbent_config, hist_dict.get(self.incumbent_config)))

        node_list = []
        for config, _ in min_n:
            if config in self.node_dict:
                node_list.append(self.node_dict[config][0])
                continue

            try:
                node, tran_list = self._parse(self.root_node, config, record=True)
                node.config = config
                if node.data[0].shape[1] == 0:
                    continue
                if self.fetch_incumbent is None:
                    self.fetch_incumbent = node  # Update incumbent node
                node_list.append(node)
                self.node_dict[config] = [node, tran_list]
            except Exception:
                # Best effort: a configuration that cannot be re-parsed is skipped.
                print("Re-parse failed on config %s" % str(config))
        return node_list

    def fetch_nodes_by_config(self, config_list):
        """
            Re-parse the given configurations into data nodes.
        :param config_list: configurations to parse; None stands for the default one.
        :return: list of data nodes. A configuration that fails to parse contributes
            None; one that yields zero features contributes nothing.
        """
        node_list = []
        self.logger.info("Re-parse %d configs" % len(config_list))
        for config in config_list:
            try:
                if config is None:
                    config = self.hyperparameter_space.get_default_configuration()

                if config in self.node_dict:
                    node_list.append(self.node_dict[config][0])
                    continue

                node, tran_list = self._parse(self.root_node, config, record=True)
                node.config = config
                if node.data[0].shape[1] == 0:
                    continue
                if self.fetch_incumbent is None:
                    # Ensure the config_list is sorted by performance
                    self.fetch_incumbent = node  # Update incumbent node
                node_list.append(node)
                self.node_dict[config] = [node, tran_list]
                assert None not in self.node_dict
            except Exception:
                node_list.append(None)
                print("Re-parse failed on config %s" % str(config))
        return node_list

    def apply(self, data_node: DataNode, ref_node: DataNode, phase='test'):
        """
            Apply the transformations recorded for ``ref_node`` to ``data_node``.
        :param data_node: the node to transform; a copy is transformed.
        :param ref_node: a node previously produced by ``fetch_nodes`` or
            ``fetch_nodes_by_config``; None means "no transformation".
        :param phase: when 'test', the balancer transformation is skipped.
        :return: the transformed node, or ``data_node`` itself if ``ref_node`` is None.
        :raises ValueError: if the configuration of ``ref_node`` is not cached.
        """
        # Check for None before touching ref_node.config.
        if ref_node is None:
            return data_node

        fe_config = ref_node.config
        if fe_config not in self.node_dict:
            self.logger.info("Ref node config:")
            self.logger.info(str(fe_config))
            self.logger.info("Node history in optimizer:")
            self.logger.info(str(self.node_dict))
            raise ValueError("Ref node not in history!")

        input_node = data_node.copy_()
        fe_trans_list = self.node_dict[fe_config][1]
        for i, tran in enumerate(fe_trans_list):
            if phase == 'test' and i == 2:  # Disable balancer (index 2 in the recorded list)
                continue
            if tran is not None:
                input_node = tran.operate(input_node)
        return input_node
# ---- Example no. 3 (original separator: "Esempio n. 3" / "0") ----
class SMACOptimizer(BaseHPOptimizer):
    """Hyperparameter optimizer that drives a ``BO`` instance over ``config_space``.

    The incumbent performance exposed by this class is ``1 - value``, where
    ``value`` is what the run history reports for the incumbent.

    NOTE(review): ``self.evaluator``, ``self.config_space``, ``self.seed``,
    ``self.logger`` and ``self.start_time`` are presumably set by the base class.
    """

    def __init__(self,
                 evaluator,
                 config_space,
                 time_limit=None,
                 evaluation_limit=None,
                 per_run_time_limit=600,
                 per_run_mem_limit=1024,
                 output_dir='./',
                 trials_per_iter=1,
                 seed=1,
                 n_jobs=1):
        """
        :param evaluator: objective handed to BO.
        :param config_space: the hyperparameter space to search.
        :param time_limit: seconds after which ``run`` stops; None disables it.
        :param evaluation_limit: trial count after which ``run`` stops; None disables it.
        :param per_run_time_limit: stored only; not forwarded to BO in this class.
        :param per_run_mem_limit: stored only.
        :param output_dir: stored only.
        :param trials_per_iter: number of BO trials per ``iterate`` call.
        :param seed: seed handed to the base class.
        :param n_jobs: accepted but unused.
        """
        super().__init__(evaluator, config_space, seed)
        self.time_limit = time_limit
        self.evaluation_num_limit = evaluation_limit
        self.trials_per_iter = trials_per_iter
        self.per_run_time_limit = per_run_time_limit
        self.per_run_mem_limit = per_run_mem_limit
        self.output_dir = output_dir

        self.optimizer = BO(objective_function=self.evaluator,
                            config_space=config_space,
                            max_runs=int(1e10),
                            task_id=None,
                            rng=np.random.RandomState(self.seed))

        # Number of BO trials executed through iterate().
        self.trial_cnt = 0
        self.configs = list()
        self.perfs = list()
        self.incumbent_perf = float("-INF")
        self.incumbent_config = self.config_space.get_default_configuration()
        # Estimate the size of the hyperparameter space.
        hp_num = len(self.config_space.get_hyperparameters())
        if hp_num == 0:
            self.config_num_threshold = 0
        else:
            # 75% of the distinct configurations seen among 10000 samples.
            _threshold = int(
                len(set(self.config_space.sample_configuration(10000))) * 0.75)
            self.config_num_threshold = _threshold
        self.logger.debug('HP_THRESHOLD is: %d' % self.config_num_threshold)
        self.maximum_config_num = min(600, self.config_num_threshold)
        self.early_stopped_flag = False

    def _update_incumbent(self):
        """Refresh the incumbent from the last entry of the run history's incumbents."""
        incumbents = self.optimizer.get_history().get_incumbents()
        # Keep the previous incumbent when the history has none yet.
        if len(incumbents) > 0:
            self.incumbent_config, raw_perf = incumbents[-1]
            self.incumbent_perf = 1 - raw_perf

    def run(self):
        """
            Iterate until the evaluation limit, the time limit or early stop is hit.
        :return: the best recorded performance; the incumbent performance when
            ``self.perfs`` is empty.
        """
        while True:
            # iterate() in this class records no per-trial results, so the
            # executed-trial counter is what can be compared against the limit.
            evaluation_num = max(len(self.perfs), self.trial_cnt)
            if self.evaluation_num_limit is not None and evaluation_num > self.evaluation_num_limit:
                break
            if self.time_limit is not None and time.time(
            ) - self.start_time > self.time_limit:
                break
            self.iterate()
            if self.early_stopped_flag:
                # Nothing more will be evaluated; avoid spinning forever.
                break
        # np.max raises ValueError on an empty sequence.
        return np.max(self.perfs) if self.perfs else self.incumbent_perf

    def iterate(self):
        """
            Run up to ``trials_per_iter`` BO trials and refresh the incumbent.
        :return: (incumbent performance, elapsed seconds, incumbent configuration).
        """
        _start_time = time.time()
        for _ in range(self.trials_per_iter):
            if max(len(self.configs), self.trial_cnt) >= self.maximum_config_num:
                self.early_stopped_flag = True
                # NOTE: the message says 70 while the threshold above uses 0.75.
                self.logger.warning(
                    'Already explored 70 percentage of the '
                    'hp space or maximum configuration number: %d!' %
                    self.maximum_config_num)
                break
            self.optimizer.iterate()
            self.trial_cnt += 1

        self._update_incumbent()
        iteration_cost = time.time() - _start_time
        return self.incumbent_perf, iteration_cost, self.incumbent_config

    def optimize(self):
        """
            Delegate the whole search to ``BO.optimize`` and refresh the incumbent.
        :return: (incumbent configuration, incumbent performance).
        """
        self.optimizer.optimize()
        self._update_incumbent()
        return self.incumbent_config, self.incumbent_perf
class SMACOptimizer(BaseHPOptimizer):
    """Hyperparameter optimizer that drives a ``BO`` instance over ``config_space``.

    The run history stores values to be minimized; performances kept by this
    class (``perfs``, ``incumbent_perf``, ``eval_dict``) are their negation,
    so larger is better.

    NOTE(review): ``self.evaluator``, ``self.config_space``, ``self.seed``,
    ``self.logger`` and ``self.start_time`` are presumably set by the base class.
    """

    def __init__(self, evaluator, config_space, time_limit=None, evaluation_limit=None,
                 per_run_time_limit=300, per_run_mem_limit=1024, output_dir='./',
                 trials_per_iter=1, seed=1, n_jobs=1):
        """
        :param evaluator: objective handed to BO.
        :param config_space: the hyperparameter space to search.
        :param time_limit: seconds after which ``run`` stops; None disables it.
        :param evaluation_limit: evaluation count after which ``run`` stops; None disables it.
        :param per_run_time_limit: forwarded to BO as the per-trial time limit.
        :param per_run_mem_limit: stored only.
        :param output_dir: stored only.
        :param trials_per_iter: number of BO trials per ``iterate`` call.
        :param seed: seed handed to the base class.
        :param n_jobs: accepted but unused.
        """
        super().__init__(evaluator, config_space, seed)
        self.time_limit = time_limit
        self.evaluation_num_limit = evaluation_limit
        self.trials_per_iter = trials_per_iter
        self.per_run_time_limit = per_run_time_limit
        self.per_run_mem_limit = per_run_mem_limit
        self.output_dir = output_dir

        self.optimizer = BO(objective_function=self.evaluator,
                            config_space=config_space,
                            max_runs=int(1e10),
                            task_id=None,
                            time_limit_per_trial=self.per_run_time_limit,
                            rng=np.random.RandomState(self.seed))

        self.trial_cnt = 0
        self.configs = list()
        self.perfs = list()
        self.incumbent_perf = float("-INF")
        self.incumbent_config = self.config_space.get_default_configuration()
        # Estimate the size of the hyperparameter space.
        hp_num = len(self.config_space.get_hyperparameters())
        if hp_num == 0:
            self.config_num_threshold = 0
        else:
            # 75% of the distinct configurations seen among 10000 samples.
            _threshold = int(len(set(self.config_space.sample_configuration(10000))) * 0.75)
            self.config_num_threshold = _threshold
        self.logger.debug('The maximum trial number in HPO is: %d' % self.config_num_threshold)
        self.maximum_config_num = min(600, self.config_num_threshold)
        self.early_stopped_flag = False
        self.eval_dict = {}

    def run(self):
        """
            Iterate until the evaluation limit, the time limit or early stop is hit.
        :return: the best recorded performance; the incumbent performance when
            no trial succeeded.
        """
        while True:
            evaluation_num = len(self.perfs)
            if self.evaluation_num_limit is not None and evaluation_num > self.evaluation_num_limit:
                break
            if self.time_limit is not None and time.time() - self.start_time > self.time_limit:
                break
            self.iterate()
            if self.early_stopped_flag:
                # Nothing more will be evaluated; avoid spinning forever.
                break
        # np.max raises ValueError on an empty sequence.
        return np.max(self.perfs) if self.perfs else self.incumbent_perf

    def iterate(self):
        """
            Run up to ``trials_per_iter`` BO trials, record the successful ones
            and refresh ``eval_dict`` and the incumbent.
        :return: (incumbent performance, elapsed seconds, incumbent configuration).
        """
        _start_time = time.time()
        for _ in range(self.trials_per_iter):
            if len(self.configs) >= self.maximum_config_num:
                self.early_stopped_flag = True
                # NOTE: the message says 70 while the threshold above uses 0.75.
                self.logger.warning('Already explored 70 percentage of the '
                                    'hyperspace or maximum configuration number met: %d!' % self.maximum_config_num)
                break
            _config, _status, _perf, _ = self.optimizer.iterate()
            if _status == SUCCESS:
                self.configs.append(_config)
                self.perfs.append(-_perf)

        runhistory = self.optimizer.get_history()
        self.eval_dict = {(None, hpo_config): -score for hpo_config, score in
                          runhistory.data.items()}
        incumbents = runhistory.get_incumbents()
        # Keep the previous incumbent when the history has none yet,
        # instead of failing with IndexError.
        if len(incumbents) > 0:
            self.incumbent_config, self.incumbent_perf = incumbents[0]
            self.incumbent_perf = -self.incumbent_perf
        iteration_cost = time.time() - _start_time
        # incumbent_perf: the larger the better
        return self.incumbent_perf, iteration_cost, self.incumbent_config
# ---- Example no. 5 (original separator: "Esempio n. 5" / "0") ----
class SMACOptimizer(BaseOptimizer):
    """Optimizer that drives a ``BO`` instance over ``config_space``.

    The run history stores values to be minimized; performances kept by this
    class (``perfs``, ``incumbent_perf``, ``eval_dict``) are their negation,
    so larger is better.

    NOTE(review): ``self.evaluator``, ``self.config_space``, ``self.name``,
    ``self.seed``, ``self.logger``, ``self.start_time``, ``self.init_hpo_iter_num``,
    ``self.early_stopped_flag`` and ``combine_tmp_config_path`` are presumably
    provided by the base class.
    """

    def __init__(self, evaluator, config_space, name, time_limit=None, evaluation_limit=None,
                 per_run_time_limit=300, per_run_mem_limit=1024, output_dir='./',
                 inner_iter_num_per_iter=1, seed=1, n_jobs=1):
        """
        :param evaluator: objective handed to BO.
        :param config_space: the space to search.
        :param name: optimizer name; 'hpo' changes how ``eval_dict`` is keyed.
        :param time_limit: seconds after which ``run`` stops; None disables it.
        :param evaluation_limit: evaluation count after which ``run`` stops; None disables it.
        :param per_run_time_limit: forwarded to BO as the per-trial time limit.
        :param per_run_mem_limit: stored only.
        :param output_dir: stored only.
        :param inner_iter_num_per_iter: number of BO trials per ``iterate`` call.
        :param seed: seed handed to the base class.
        :param n_jobs: accepted but unused.
        """
        super().__init__(evaluator, config_space, name, seed)
        self.time_limit = time_limit
        self.evaluation_num_limit = evaluation_limit
        self.inner_iter_num_per_iter = inner_iter_num_per_iter
        self.per_run_time_limit = per_run_time_limit
        self.per_run_mem_limit = per_run_mem_limit
        self.output_dir = output_dir
        self.optimizer = BO(objective_function=self.evaluator,
                            config_space=config_space,
                            max_runs=int(1e10),
                            task_id=None,
                            time_limit_per_trial=self.per_run_time_limit,
                            rng=np.random.RandomState(self.seed))

        self.trial_cnt = 0
        self.configs = list()
        self.perfs = list()
        # Maps the wall-clock time of each successful trial to (config, raw perf).
        self.exp_output = dict()
        self.incumbent_perf = float("-INF")
        self.incumbent_config = self.config_space.get_default_configuration()
        # Estimate the size of the hyperparameter space.
        hp_num = len(self.config_space.get_hyperparameters())
        if hp_num == 0:
            self.config_num_threshold = 0
        else:
            # Number of distinct configurations seen among 5000 samples.
            _threshold = int(len(set(self.config_space.sample_configuration(5000))))
            self.config_num_threshold = _threshold
        self.logger.debug('The maximum trial number in HPO is: %d' % self.config_num_threshold)
        self.maximum_config_num = min(600, self.config_num_threshold)
        self.eval_dict = {}

    def run(self):
        """
            Iterate until the evaluation limit, the time limit or early stop is hit.
        :return: the best recorded performance; the incumbent performance when
            no trial succeeded.
        """
        while True:
            evaluation_num = len(self.perfs)
            if self.evaluation_num_limit is not None and evaluation_num > self.evaluation_num_limit:
                break
            if self.time_limit is not None and time.time() - self.start_time > self.time_limit:
                break
            self.iterate()
            # The flag is not initialised in this class, hence getattr.
            if getattr(self, 'early_stopped_flag', False):
                # Nothing more will be evaluated; avoid spinning forever.
                break
        # np.max raises ValueError on an empty sequence.
        return np.max(self.perfs) if self.perfs else self.incumbent_perf

    def iterate(self, budget=MAX_INT):
        """
            Run a batch of BO trials, record the successful ones and refresh
            ``eval_dict`` and the incumbent.
        :param budget: seconds this call may spend before it stops starting new trials.
        :return: (incumbent performance, elapsed seconds, incumbent configuration).
        """
        _start_time = time.time()

        # The very first call may use a dedicated number of trials.
        if len(self.configs) == 0 and self.init_hpo_iter_num is not None:
            inner_iter_num = self.init_hpo_iter_num
            print('initial hpo trial num is set to %d' % inner_iter_num)
        else:
            inner_iter_num = self.inner_iter_num_per_iter

        for _ in range(inner_iter_num):
            if len(self.configs) >= self.maximum_config_num:
                self.early_stopped_flag = True
                self.logger.warning('Already explored 70 percentage of the '
                                    'hyperspace or maximum configuration number met: %d!' % self.maximum_config_num)
                break
            if time.time() - _start_time > budget:
                self.logger.warning('Time limit exceeded!')
                break
            _config, _status, _perf, _ = self.optimizer.iterate()
            if _status == SUCCESS:
                self.exp_output[time.time()] = (_config, _perf)
                self.configs.append(_config)
                self.perfs.append(-_perf)
                self.combine_tmp_config_path()

        runhistory = self.optimizer.get_history()
        # eval_dict keys are (fe_config, hpo_config) pairs. The half that this
        # optimizer does not search is read from the evaluator, if it has one.
        if self.name == 'hpo':
            fe_config = getattr(self.evaluator, 'fe_config', None)
            self.eval_dict = {(fe_config, hpo_config): [-score, time.time()] for hpo_config, score in
                              runhistory.data.items()}
        else:
            hpo_config = getattr(self.evaluator, 'hpo_config', None)
            self.eval_dict = {(fe_config, hpo_config): [-score, time.time()] for fe_config, score in
                              runhistory.data.items()}

        incumbents = runhistory.get_incumbents()
        if len(incumbents) > 0:
            self.incumbent_config, self.incumbent_perf = incumbents[0]
            self.incumbent_perf = -self.incumbent_perf
        iteration_cost = time.time() - _start_time
        # incumbent_perf: the larger the better
        return self.incumbent_perf, iteration_cost, self.incumbent_config