Example #1
    def evaluate_joint_solution(self):
        # Update joint incumbent from FE and HPO.
        _perf = None
        try:
            with time_limit(600):
                if self.task_type in CLS_TASKS:
                    _perf = ClassificationEvaluator(
                        self.local_inc['hpo'],
                        data_node=self.local_inc['fe'],
                        scorer=self.metric,
                        name='fe',
                        resampling_strategy=self.evaluation_type,
                        seed=self.seed)(self.local_inc['hpo'])
                else:
                    _perf = RegressionEvaluator(
                        self.local_inc['hpo'],
                        data_node=self.local_inc['fe'],
                        scorer=self.metric,
                        name='fe',
                        resampling_strategy=self.evaluation_type,
                        seed=self.seed)(self.local_inc['hpo'])
        except Exception as e:
            self.logger.error(str(e))
        # Update INC.
        if _perf is not None and np.isfinite(_perf) and _perf > self.incumbent_perf:
            self.inc['hpo'] = self.local_inc['hpo']
            self.inc['fe'] = self.local_inc['fe']
            self.incumbent_perf = _perf
def conduct_hpo(dataset='pc4', classifier_id='random_forest', iter_num=100, iter_mode=True):
    from autosklearn.pipeline.components.classification import _classifiers

    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)

    raw_data = load_data(dataset, datanode_returned=True)
    print(set(raw_data.data[1]))
    evaluator = ClassificationEvaluator(cs.get_default_configuration(), name='hpo', data_node=raw_data)

    if not iter_mode:
        optimizer = SMACOptimizer(evaluator, cs, evaluation_limit=600, output_dir='logs')
        inc, val = optimizer.optimize()
        print(inc, val)
    else:
        import time
        _start_time = time.time()
        optimizer = SMACOptimizer(
            evaluator, cs, trials_per_iter=1,
            output_dir='logs', per_run_time_limit=180
        )
        results = list()
        for _iter in range(iter_num):
            perf, _, _ = optimizer.iterate()
            print(_iter, perf)
            results.append(perf)
        print(results)
        print(time.time() - _start_time)
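
A minimal invocation of the helper above might look like the following (a sketch, not part of the original example; it assumes the solnml/autosklearn environment the snippet imports from is installed and that load_data can fetch the named dataset):

if __name__ == '__main__':
    # Run a short iterative HPO session with the defaults shown above.
    conduct_hpo(dataset='pc4', classifier_id='random_forest', iter_num=10, iter_mode=True)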
Example #3
def get_fe_cs(estimator_id, node, task_type=0):
    tmp_evaluator = ClassificationEvaluator(None)
    tmp_bo = AnotherBayesianOptimizationOptimizer(task_type, node,
                                                  tmp_evaluator, estimator_id,
                                                  1, 1, 1)
    cs = tmp_bo._get_task_hyperparameter_space('smac')
    return cs
def evaluate_ml_algorithm(dataset, algo, obj_metric, seed=1, task_type=None):
    print('EVALUATE-%s-%s-%s' % (dataset, algo, obj_metric))
    train_data = load_data(dataset,
                           task_type=task_type,
                           datanode_returned=True)
    print(set(train_data.data[1]))
    metric = get_metric(obj_metric)

    cs = _classifiers[algo].get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", algo)
    cs.add_hyperparameter(model)
    default_hpo_config = cs.get_default_configuration()
    hpo_evaluator = ClassificationEvaluator(default_hpo_config,
                                            scorer=metric,
                                            data_node=train_data,
                                            name='hpo',
                                            resampling_strategy='holdout',
                                            seed=seed)
    hpo_optimizer = SMACOptimizer(evaluator=hpo_evaluator,
                                  config_space=cs,
                                  per_run_time_limit=600,
                                  per_run_mem_limit=5120,
                                  output_dir='./logs',
                                  trials_per_iter=args.iter)
    hpo_optimizer.iterate()
    hpo_eval_dict = dict()
    for key, value in hpo_optimizer.eval_dict.items():
        hpo_eval_dict[key[1]] = value

    save_path = save_dir + '%s-%s-%s-hpo.pkl' % (dataset, algo, obj_metric)
    with open(save_path, 'wb') as f:
        pickle.dump(hpo_eval_dict, f)
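
evaluate_ml_algorithm above also depends on module-level names that are not shown here: args.iter (trials per SMAC iteration) and save_dir (where the pickle is written). A hedged sketch of how a caller might supply them (both values are assumptions inferred only from the attributes used above):

import argparse

save_dir = './data/'                # assumed output directory
args = argparse.Namespace(iter=10)  # stand-in for the parsed CLI arguments

evaluate_ml_algorithm('pc4', 'random_forest', 'bal_acc', seed=1)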
Example #5
def get_configspace():
    if benchmark == 'hpo':
        cs = _classifiers[algo_name].get_hyperparameter_search_space()
        model = UnParametrizedHyperparameter("estimator", algo_name)
        cs.add_hyperparameter(model)
        return cs

    train_data, test_data = load_train_test_data('splice',
                                                 task_type=MULTICLASS_CLS)
    cs = _classifiers[algo_name].get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", algo_name)
    cs.add_hyperparameter(model)
    default_hpo_config = cs.get_default_configuration()
    fe_evaluator = ClassificationEvaluator(default_hpo_config,
                                           scorer=metric,
                                           name='fe',
                                           resampling_strategy='holdout',
                                           seed=1)
    fe_optimizer = BayesianOptimizationOptimizer(task_type=CLASSIFICATION,
                                                 input_data=train_data,
                                                 evaluator=fe_evaluator,
                                                 model_id=algo_name,
                                                 time_limit_per_trans=600,
                                                 mem_limit_per_trans=5120,
                                                 number_of_unit_resource=10,
                                                 seed=1)
    hyper_space = fe_optimizer.hyperparameter_space
    return hyper_space
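
get_configspace reads the module-level globals benchmark and algo_name (and, in the feature-engineering branch, metric). A hedged usage sketch, assuming those globals are set for the simpler 'hpo' case:

benchmark = 'hpo'
algo_name = 'random_forest'

cs = get_configspace()
default_config = cs.get_default_configuration()  # default hyperparameters
random_config = cs.sample_configuration()        # one random candidate
print(default_config)
print(random_config)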
Example #6
    def evaluate(_config):
        _config = _config.get_dictionary()
        # print(_config)
        arm = None
        cs = ConfigurationSpace()
        for key in _config:
            key_str = key.split(":")
            if key_str[0] == 'classifier':
                if key_str[1] == '__choice__':
                    arm = _config[key]
                    cs.add_hyperparameter(UnParametrizedHyperparameter("estimator", _config[key]))
                else:
                    cs.add_hyperparameter(UnParametrizedHyperparameter(key_str[2], _config[key]))

        if arm in first_bandit.arms:
            transformed_node = apply_metalearning_fe(first_bandit.sub_bandits[arm].optimizer['fe'], _config)
            default_config = cs.sample_configuration(1)
            hpo_evaluator = ClassificationEvaluator(None,
                                                    data_node=transformed_node, name='hpo',
                                                    resampling_strategy=first_bandit.eval_type,
                                                    seed=first_bandit.seed)

            start_time = time.time()
            score1 = 1 - hpo_evaluator(default_config)
            time_cost1 = time.time() - start_time

            # Evaluate the default config
            start_time = time.time()
            score2 = 1 - hpo_evaluator(first_bandit.sub_bandits[arm].default_config)
            time_cost2 = time.time() - start_time
            transformed_node.score2 = max(score1, score2)

            return (arm, score1, default_config, transformed_node, time_cost1), (
                arm, score2, first_bandit.sub_bandits[arm].default_config, transformed_node, time_cost2)
Example #7
def evaluate_bo_optimizer(dataset, time_limit, run_id, seed):
    from solnml.components.fe_optimizers.bo_optimizer import BayesianOptimizationOptimizer
    # Prepare the configuration for random forest.
    from ConfigSpace.hyperparameters import UnParametrizedHyperparameter
    from autosklearn.pipeline.components.classification.random_forest import RandomForest
    cs = RandomForest.get_hyperparameter_search_space()
    clf_hp = UnParametrizedHyperparameter("estimator", 'random_forest')
    cs.add_hyperparameter(clf_hp)
    print(cs.get_default_configuration())
    evaluator = ClassificationEvaluator(cs.get_default_configuration(),
                                        name='fe',
                                        seed=seed,
                                        resampling_strategy='holdout')

    train_data, test_data = load_train_test_data(dataset)
    cls_task_type = BINARY_CLS if len(set(train_data.data[1])) == 2 else MULTICLASS_CLS
    optimizer = BayesianOptimizationOptimizer(cls_task_type,
                                              train_data,
                                              evaluator,
                                              'random_forest',
                                              300,
                                              10000,
                                              seed,
                                              time_budget=time_limit)
    optimizer.optimize()
    inc = optimizer.incumbent_config
    val_score = 1 - optimizer.evaluate_function(inc)
    print(val_score)
    print(optimizer.incumbent_score)

    optimizer.fetch_nodes(n=10)
    print("Refit finished!")

    final_train_data = optimizer.apply(train_data,
                                       optimizer.incumbent,
                                       phase='train')
    X_train, y_train = final_train_data.data
    final_test_data = optimizer.apply(test_data, optimizer.incumbent)
    X_test, y_test = final_test_data.data

    clf = fetch_predict_estimator(
        cls_task_type,
        cs.get_default_configuration(),
        X_train,
        y_train,
        weight_balance=final_train_data.enable_balance,
        data_balance=final_train_data.data_balance)
    y_pred = clf.predict(X_test)

    from solnml.components.metrics.cls_metrics import balanced_accuracy
    test_score = balanced_accuracy(y_test, y_pred)
    print('==> Test score', test_score)

    save_path = save_dir + 'bo_fe_%s_%d_%d.pkl' % (dataset, time_limit, run_id)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, val_score, test_score], f)
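
The function above writes its results via the module-level save_dir, which is not defined in the excerpt. A hedged invocation sketch (save_dir and the argument values are assumptions):

save_dir = './data/'  # assumed output directory for the pickled [dataset, val_score, test_score] record
evaluate_bo_optimizer('pc4', time_limit=600, run_id=0, seed=1)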
Example #8
    def evaluate_joint_solution(self):
        # Update joint incumbent from FE and HPO.
        _perf = None
        try:
            with time_limit(self.per_run_time_limit):
                if self.task_type in CLS_TASKS:
                    evaluator = ClassificationEvaluator(
                        self.local_inc['hpo'],
                        self.local_inc['fe'],
                        self.estimator_id,
                        data_node=self.original_data,
                        scorer=self.metric,
                        if_imbal=self.if_imbal,
                        name='hpo',
                        resampling_strategy=self.evaluation_type,
                        seed=self.seed,
                        output_dir=self.output_dir,
                        timestamp=self.timestamp)
                else:
                    evaluator = RegressionEvaluator(
                        self.local_inc['hpo'],
                        self.local_inc['fe'],
                        self.estimator_id,
                        data_node=self.original_data,
                        scorer=self.metric,
                        name='hpo',
                        resampling_strategy=self.evaluation_type,
                        seed=self.seed,
                        output_dir=self.output_dir,
                        timestamp=self.timestamp)
                _perf = -evaluator(self.local_inc['hpo'])
        except Exception as e:
            self.logger.error(str(e))

        # TODO: Need refactoring!
        sorted_list_path = evaluator.topk_model_saver.sorted_list_path
        path_list = os.path.split(sorted_list_path)
        tmp_path = 'tmp_' + path_list[-1]
        tmp_filepath = os.path.join(os.path.dirname(sorted_list_path),
                                    tmp_path)

        # TODO: How to merge when using multi-process
        if os.path.exists(tmp_filepath):
            self.logger.info('Temporary config path detected!')
            with open(tmp_filepath, 'rb') as f1:
                sorted_file_replica = pkl.load(f1)
            with open(sorted_list_path, 'wb') as f2:
                pkl.dump(sorted_file_replica, f2)
            self.logger.info('Temporary config path merged!')

        # Update INC.
        if _perf is not None and np.isfinite(_perf) and _perf > self.incumbent_perf:
            self.inc['hpo'] = self.local_inc['hpo']
            self.inc['fe'] = self.local_inc['fe']
            self.incumbent_perf = _perf
Example #9
def conduct_fe(dataset='pc4',
               classifier_id='random_forest',
               iter_num=100,
               run_id=0,
               seed=1):
    from autosklearn.pipeline.components.classification import _classifiers

    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)
    default_config = cs.get_default_configuration()

    raw_data, test_raw_data = load_train_test_data(dataset, random_state=seed)
    evaluator = ClassificationEvaluator(default_config,
                                        name='fe',
                                        data_node=raw_data,
                                        resampling_strategy='holdout',
                                        seed=seed)

    val_acc = evaluator(default_config)
    estimator = fetch_predict_estimator(default_config, raw_data.data[0],
                                        raw_data.data[1])
    pred = estimator.predict(test_raw_data.data[0])
    test_acc = balanced_accuracy(test_raw_data.data[1], pred)

    optimizer = EvaluationBasedOptimizer(task_type='classification',
                                         input_data=raw_data,
                                         evaluator=evaluator,
                                         model_id=classifier_id,
                                         time_limit_per_trans=240,
                                         mem_limit_per_trans=10000,
                                         seed=seed)

    task_id = 'fe-%s-%s-%d' % (dataset, classifier_id, iter_num)
    val_acc_list, test_acc_list = [], []

    val_acc_list.append(val_acc)
    test_acc_list.append(test_acc)

    for _iter in range(iter_num):
        perf, _, incumbent = optimizer.iterate()
        val_acc_list.append(perf)
        train_node = optimizer.apply(raw_data, incumbent)
        test_node = optimizer.apply(test_raw_data, incumbent)
        estimator = fetch_predict_estimator(default_config, train_node.data[0],
                                            train_node.data[1])
        pred = estimator.predict(test_node.data[0])
        test_perf = balanced_accuracy(test_node.data[1], pred)
        test_acc_list.append(test_perf)
        print(val_acc_list)
        print(test_acc_list)

    save_path = save_dir + '%s-%d.pkl' % (task_id, run_id)
    with open(save_path, 'wb') as f:
        pickle.dump([val_acc_list, test_acc_list], f)
Example #10
def conduct_hpo(dataset='pc4',
                classifier_id='random_forest',
                iter_num=100,
                run_id=0,
                seed=1):
    from autosklearn.pipeline.components.classification import _classifiers

    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)

    raw_data, test_raw_data = load_train_test_data(dataset, random_state=seed)
    evaluator = ClassificationEvaluator(cs.get_default_configuration(),
                                        name='hpo',
                                        data_node=raw_data,
                                        resampling_strategy='holdout',
                                        seed=seed)

    default_config = cs.get_default_configuration()
    val_acc = 1. - evaluator(default_config)
    estimator = fetch_predict_estimator(default_config, raw_data.data[0],
                                        raw_data.data[1])
    pred = estimator.predict(test_raw_data.data[0])
    test_acc = balanced_accuracy(test_raw_data.data[1], pred)

    optimizer = SMACOptimizer(evaluator,
                              cs,
                              trials_per_iter=2,
                              output_dir='logs',
                              per_run_time_limit=180)
    task_id = 'hpo-%s-%s-%d' % (dataset, classifier_id, iter_num)

    val_acc_list, test_acc_list = [], []
    val_acc_list.append(val_acc)
    test_acc_list.append(test_acc)

    for _iter in range(iter_num):
        perf, _, config = optimizer.iterate()
        val_acc_list.append(perf)
        estimator = fetch_predict_estimator(config, raw_data.data[0],
                                            raw_data.data[1])
        pred = estimator.predict(test_raw_data.data[0])
        test_perf = balanced_accuracy(test_raw_data.data[1], pred)
        test_acc_list.append(test_perf)
        print(val_acc_list)
        print(test_acc_list)

    save_path = save_dir + '%s-%d.pkl' % (task_id, run_id)
    with open(save_path, 'wb') as f:
        pickle.dump([val_acc_list, test_acc_list], f)
def evaluate_2rd_layered_bandit(run_id, mth='rb', dataset='pc4', algo='libsvm_svc',
                                cv='holdout', time_limit=120000, seed=1):
    train_data, test_data = load_train_test_data(dataset)
    bandit = SecondLayerBandit(algo, train_data, dataset_id=dataset, mth=mth, seed=seed, eval_type=cv)

    _start_time = time.time()
    _iter_id = 0
    stats = list()

    while True:
        if time.time() > time_limit + _start_time or bandit.early_stopped_flag:
            break
        res = bandit.play_once()
        print('Iteration %d - %.4f' % (_iter_id, res))
        stats.append([_iter_id, time.time() - _start_time, res])
        _iter_id += 1

    print(bandit.final_rewards)
    print(bandit.action_sequence)
    print(np.mean(bandit.evaluation_cost['fe']))
    print(np.mean(bandit.evaluation_cost['hpo']))

    fe_optimizer = bandit.optimizer['fe']
    final_train_data = fe_optimizer.apply(train_data, bandit.inc['fe'])
    assert final_train_data == bandit.inc['fe']
    final_test_data = fe_optimizer.apply(test_data, bandit.inc['fe'])
    config = bandit.inc['hpo']

    evaluator = ClassificationEvaluator(config, name='fe', seed=seed, resampling_strategy='holdout')
    val_score = evaluator(None, data_node=final_train_data)
    print('==> Best validation score', val_score, res)

    X_train, y_train = final_train_data.data
    clf = fetch_predict_estimator(config, X_train, y_train)
    X_test, y_test = final_test_data.data
    y_pred = clf.predict(X_test)
    test_score = balanced_accuracy(y_test, y_pred)
    print('==> Test score', test_score)

    # Alleviate overfitting.
    y_pred1 = bandit.predict(test_data.data[0])
    test_score1 = balanced_accuracy(y_test, y_pred1)
    print('==> Test score with average ensemble', test_score1)

    y_pred2 = bandit.predict(test_data.data[0], is_weighted=True)
    test_score2 = balanced_accuracy(y_test, y_pred2)
    print('==> Test score with weighted ensemble', test_score2)

    save_path = save_folder + '%s_%s_%d_%d_%s.pkl' % (mth, dataset, time_limit, run_id, algo)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, val_score, test_score, test_score1, test_score2], f)
Example #12
def conduct_hpo(optimizer='smac', dataset='pc4', classifier_id='random_forest', runcount_limit=100):
    from autosklearn.pipeline.components.classification import _classifiers

    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)

    raw_data = load_data(dataset, datanode_returned=True)
    print(set(raw_data.data[1]))
    evaluator = ClassificationEvaluator(cs.get_default_configuration(), name='hpo', data_node=raw_data)

    if optimizer == 'smac':
        optimizer = SMACOptimizer(evaluator, cs, evaluation_limit=runcount_limit, output_dir='logs')
    elif optimizer == 'psmac':
        optimizer = PSMACOptimizer(evaluator, cs, args.n, evaluation_limit=runcount_limit, output_dir='logs',
                                   trials_per_iter=args.trial)
    perf, cost, config = optimizer.iterate()
    print(perf, cost, config)
    perf, cost, config = optimizer.iterate()
    print(perf, cost, config)
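
The 'psmac' branch above reads args.n and args.trial from an argument parser that is not shown. A plausible sketch of how those might be wired up (flag names and defaults are assumptions based only on the attributes used above):

import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--n', type=int, default=4)      # number of parallel PSMAC workers
parser.add_argument('--trial', type=int, default=5)  # trials per PSMAC iteration
args = parser.parse_args()

conduct_hpo(optimizer='smac', dataset='pc4', classifier_id='random_forest', runcount_limit=50)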
Example #13
    def __init__(self,
                 scorer=None,
                 data_node=None,
                 task_type=0,
                 resampling_strategy='cv',
                 resampling_params=None,
                 seed=1):
        self.resampling_strategy = resampling_strategy
        self.resampling_params = resampling_params
        self.scorer = scorer if scorer is not None else balanced_accuracy_scorer
        self.data_node = data_node
        self.seed = seed
        self.eval_id = 0
        self.onehot_encoder = None
        self.logger = get_logger(self.__module__ + "." +
                                 self.__class__.__name__)
        self.continue_training = False

        tmp_evaluator = ClassificationEvaluator(None)
        self.tmp_bo = AnotherBayesianOptimizationOptimizer(
            task_type, data_node, tmp_evaluator, 'adaboost', 1, 1, 1)
def evaluate_fe_bugs(dataset, run_id, time_limit, seed):
    algorithms = [
        'lda', 'k_nearest_neighbors', 'libsvm_svc', 'sgd', 'adaboost',
        'random_forest', 'extra_trees', 'decision_tree'
    ]
    algo_id = np.random.choice(algorithms, 1)[0]
    task_id = '%s-fe-%s-%d' % (dataset, algo_id, run_id)
    print(task_id)

    # Prepare the configuration for random forest.
    clf_class = _classifiers[algo_id]
    cs = clf_class.get_hyperparameter_search_space()
    clf_hp = UnParametrizedHyperparameter("estimator", algo_id)
    cs.add_hyperparameter(clf_hp)
    evaluator = ClassificationEvaluator(cs.get_default_configuration(),
                                        name='fe',
                                        seed=seed,
                                        resampling_strategy='holdout')

    pipeline = FEPipeline(fe_enabled=True,
                          optimizer_type='eval_base',
                          time_budget=time_limit,
                          evaluator=evaluator,
                          seed=seed,
                          model_id=algo_id,
                          time_limit_per_trans=per_run_time_limit,
                          task_id=task_id)

    raw_data, test_raw_data = load_train_test_data(dataset)
    train_data = pipeline.fit_transform(raw_data.copy_())
    test_data = pipeline.transform(test_raw_data.copy_())
    train_data_new = pipeline.transform(raw_data.copy_())

    assert (train_data.data[0] == train_data_new.data[0]).all()
    assert (train_data.data[1] == train_data_new.data[1]).all()
    assert (train_data_new == train_data)

    score = evaluator(None, data_node=test_data)
    print('==> Test score', score)
Example #15
def evaluate(dataset):
    train_data, test_data = load_train_test_data(dataset, test_size=0.3, task_type=MULTICLASS_CLS)

    cs = _classifiers[algo_name].get_hyperparameter_search_space()
    default_hpo_config = cs.get_default_configuration()
    metric = get_metric('bal_acc')

    fe_cs = get_task_hyperparameter_space(0, algo_name)
    default_fe_config = fe_cs.get_default_configuration()

    evaluator = ClassificationEvaluator(default_hpo_config, default_fe_config, algo_name,
                                        data_node=train_data,
                                        scorer=metric,
                                        name='hpo',
                                        resampling_strategy='holdout',
                                        output_dir='./data/exp_sys',
                                        seed=1)

    from solnml.components.optimizers.tlbo_optimizer import TlboOptimizer

    optimizer = TlboOptimizer(evaluator, cs, time_limit=300, name='hpo')
    optimizer.run()
def evaluate(mode, dataset, run_id, metric):
    print(mode, dataset, run_id, metric)

    metric = get_metric(metric)
    train_data, test_data = load_train_test_data(dataset,
                                                 task_type=MULTICLASS_CLS)

    cs = _classifiers[algo_name].get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", algo_name)
    cs.add_hyperparameter(model)
    default_hpo_config = cs.get_default_configuration()

    fe_evaluator = ClassificationEvaluator(default_hpo_config,
                                           scorer=metric,
                                           name='fe',
                                           resampling_strategy='holdout',
                                           seed=1)

    hpo_evaluator = ClassificationEvaluator(default_hpo_config,
                                            scorer=metric,
                                            data_node=train_data,
                                            name='hpo',
                                            resampling_strategy='holdout',
                                            seed=1)

    fe_optimizer = BayesianOptimizationOptimizer(task_type=CLASSIFICATION,
                                                 input_data=train_data,
                                                 evaluator=fe_evaluator,
                                                 model_id=algo_name,
                                                 time_limit_per_trans=600,
                                                 mem_limit_per_trans=5120,
                                                 number_of_unit_resource=10,
                                                 seed=1)

    def objective_function(config):
        if benchmark == 'fe':
            return fe_optimizer.evaluate_function(config)
        else:
            return hpo_evaluator(config)

    meta_feature_vec = metafeature_dict[dataset]
    past_datasets = test_datasets.copy()
    if dataset in past_datasets:
        past_datasets.remove(dataset)
    past_history = load_runhistory(past_datasets)

    tlbo = TLBO_AF(objective_function,
                   config_space,
                   past_history,
                   dataset_metafeature=meta_feature_vec,
                   max_runs=max_runs,
                   acq_method='taff2')

    tlbo.run()
    print('TLBO result')
    print(tlbo.get_incumbent())
    runs = [tlbo.configurations, tlbo.perfs]
    perf = tlbo.history_container.incumbent_value

    file_saved = '%s_%s_result_%d_%d_%s.pkl' % (mode, dataset, max_runs,
                                                run_id, benchmark)
    with open(data_dir + file_saved, 'wb') as f:
        pk.dump([perf, runs], f)
Example #17
def evaluate_evaluation_based_fe(dataset, time_limit, run_id, seed):
    from solnml.components.fe_optimizers.evaluation_based_optimizer import EvaluationBasedOptimizer

    # Prepare the configuration for random forest.
    from ConfigSpace.hyperparameters import UnParametrizedHyperparameter
    from autosklearn.pipeline.components.classification.random_forest import RandomForest
    cs = RandomForest.get_hyperparameter_search_space()
    clf_hp = UnParametrizedHyperparameter("estimator", 'random_forest')
    cs.add_hyperparameter(clf_hp)
    print(cs.get_default_configuration())
    """
    Configuration:
      bootstrap, Value: 'True'
      criterion, Value: 'gini'
      estimator, Constant: 'random_forest'
      max_depth, Constant: 'None'
      max_features, Value: 0.5
      max_leaf_nodes, Constant: 'None'
      min_impurity_decrease, Constant: 0.0
      min_samples_leaf, Value: 1
      min_samples_split, Value: 2
      min_weight_fraction_leaf, Constant: 0.0
      n_estimators, Constant: 100
    """
    evaluator = ClassificationEvaluator(cs.get_default_configuration(),
                                        name='fe',
                                        seed=seed,
                                        resampling_strategy='holdout')

    train_data, test_data = load_train_test_data(dataset)
    optimizer = EvaluationBasedOptimizer(MULTICLASS_CLS,
                                         train_data,
                                         evaluator,
                                         'random_forest',
                                         300,
                                         10000,
                                         seed,
                                         trans_set=None)

    _start_time = time.time()
    _iter_id = 0
    while True:
        if time.time() > _start_time + time_limit or optimizer.early_stopped_flag:
            break
        score, iteration_cost, inc = optimizer.iterate()
        print('%d - %.4f' % (_iter_id, score))
        _iter_id += 1

    final_train_data = optimizer.apply(train_data, optimizer.incumbent)
    val_score = evaluator(None, data_node=final_train_data)
    print('==> Best validation score', val_score, score)

    final_test_data = optimizer.apply(test_data, optimizer.incumbent)
    X_train, y_train = final_train_data.data
    clf = fetch_predict_estimator(MULTICLASS_CLS,
                                  cs.get_default_configuration(), X_train,
                                  y_train)
    X_test, y_test = final_test_data.data
    y_pred = clf.predict(X_test)

    from solnml.components.metrics.cls_metrics import balanced_accuracy
    test_score = balanced_accuracy(y_test, y_pred)
    print('==> Test score', test_score)

    save_path = save_dir + 'hmab_fe_%s_%d_%d.pkl' % (dataset, time_limit,
                                                     run_id)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, val_score, test_score], f)
Example #18
    def evaluate_joint_perf(self):
        # Update joint incumbent from FE and HPO.
        _perf = None
        try:
            with time_limit(self.per_run_time_limit):
                if self.task_type in CLS_TASKS:
                    from solnml.components.evaluators.cls_evaluator import ClassificationEvaluator
                    evaluator = ClassificationEvaluator(
                        self.local_inc['fe'].copy(),
                        scorer=self.metric,
                        data_node=self.original_data,
                        if_imbal=self.if_imbal,
                        timestamp=self.timestamp,
                        seed=self.seed,
                        output_dir=self.output_dir,
                        resampling_strategy=self.eval_type,
                        resampling_params=self.resampling_params)
                else:
                    from solnml.components.evaluators.rgs_evaluator import RegressionEvaluator
                    evaluator = RegressionEvaluator(
                        self.local_inc['fe'].copy(),
                        scorer=self.metric,
                        data_node=self.original_data,
                        timestamp=self.timestamp,
                        seed=self.seed,
                        output_dir=self.output_dir,
                        resampling_strategy=self.eval_type,
                        resampling_params=self.resampling_params)
                _perf = -evaluator(self.local_inc['hpo'].copy())
        except Exception as e:
            self.logger.error(str(e))

        if _perf is not None and np.isfinite(_perf):
            _config = self.local_inc['fe'].copy()
            _config.update(self.local_inc['hpo'].copy())

            classifier_id = _config['algorithm']
            # -perf: The larger, the better.
            save_flag, model_path, delete_flag, model_path_deleted = self.topk_saver.add(
                _config, -_perf, classifier_id)
            # By default, the evaluator has already stored the models.
            if self.eval_type in ['holdout', 'partial']:
                if save_flag:
                    pass
                else:
                    os.remove(model_path)
                    self.logger.info("Model deleted from %s" % model_path)

                try:
                    if delete_flag:
                        os.remove(model_path_deleted)
                        self.logger.info("Model deleted from %s" %
                                         model_path_deleted)
                    else:
                        pass
                except:
                    pass
            self.eval_dict[(self.local_inc['fe'].copy(),
                            self.local_inc['hpo'].copy())] = [
                                _perf, time.time(), SUCCESS
                            ]
            self.topk_saver.save_topk_config()
        else:
            self.eval_dict[(self.local_inc['fe'].copy(),
                            self.local_inc['hpo'].copy())] = [
                                _perf, time.time(), FAILED
                            ]

        # Update INC.
        if _perf is not None and np.isfinite(_perf) and _perf > self.incumbent_perf:
            self.inc['hpo'] = self.local_inc['hpo']
            self.inc['fe'] = self.local_inc['fe']
            self.incumbent_perf = _perf
            _incumbent = dict()
            _incumbent.update(self.inc['fe'])
            _incumbent.update(self.inc['hpo'])
            self.incumbent = _incumbent.copy()
def evaluate(mth, dataset, run_id):
    print(mth, dataset, run_id)
    train_data, test_data = load_train_test_data(dataset,
                                                 test_size=0.3,
                                                 task_type=MULTICLASS_CLS)

    cs = _classifiers[algo_name].get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", algo_name)
    cs.add_hyperparameter(model)
    default_hpo_config = cs.get_default_configuration()
    metric = get_metric('bal_acc')

    fe_evaluator = ClassificationEvaluator(default_hpo_config,
                                           scorer=metric,
                                           name='fe',
                                           resampling_strategy='holdout',
                                           seed=1)
    fe_optimizer = BayesianOptimizationOptimizer(task_type=MULTICLASS_CLS,
                                                 input_data=train_data,
                                                 evaluator=fe_evaluator,
                                                 model_id=algo_name,
                                                 time_limit_per_trans=600,
                                                 mem_limit_per_trans=5120,
                                                 number_of_unit_resource=10,
                                                 seed=1)
    config_space = fe_optimizer.hyperparameter_space

    def objective_function(config):
        return fe_optimizer.evaluate_function(config)

    if mth == 'gp_bo':
        bo = BO(objective_function, config_space, max_runs=max_runs)
        bo.run()
        print('new BO result')
        print(bo.get_incumbent())
        perf_bo = bo.history_container.incumbent_value
    elif mth == 'lite_bo':
        from litebo.facade.bo_facade import BayesianOptimization
        bo = BayesianOptimization(objective_function,
                                  config_space,
                                  max_runs=max_runs)
        bo.run()
        print('lite BO result')
        print(bo.get_incumbent())
        perf_bo = bo.history_container.incumbent_value
    elif mth == 'smac':
        from smac.scenario.scenario import Scenario
        from smac.facade.smac_facade import SMAC
        # Scenario object
        scenario = Scenario({
            "run_obj": "quality",
            "runcount-limit": max_runs,
            "cs": config_space,
            "deterministic": "true"
        })
        smac = SMAC(scenario=scenario,
                    rng=np.random.RandomState(42),
                    tae_runner=objective_function)
        incumbent = smac.optimize()
        perf_bo = objective_function(incumbent)
        print('SMAC BO result')
        print(perf_bo)
    else:
        raise ValueError('Invalid method.')
    return perf_bo
eval_type = 'holdout'
output_dir = args.output_dir
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

for dataset in dataset_list:
    train_data, test_data = load_train_test_data(dataset)
    for algo in algorithms:
        cs = _estimators[algo].get_hyperparameter_search_space()
        model = UnParametrizedHyperparameter("estimator", algo)
        cs.add_hyperparameter(model)
        default_hpo_config = cs.get_default_configuration()

        if task == 'cls':
            fe_evaluator = ClassificationEvaluator(default_hpo_config, scorer=metric,
                                                   name='fe', resampling_strategy=eval_type,
                                                   seed=1)
            hpo_evaluator = ClassificationEvaluator(default_hpo_config, scorer=metric,
                                                    data_node=train_data, name='hpo',
                                                    resampling_strategy=eval_type,
                                                    seed=1)
        else:
            fe_evaluator = RegressionEvaluator(default_hpo_config, scorer=metric,
                                               name='fe', resampling_strategy=eval_type,
                                               seed=1)
            hpo_evaluator = RegressionEvaluator(default_hpo_config, scorer=metric,
                                                data_node=train_data, name='hpo',
                                                resampling_strategy=eval_type,
                                                seed=1)

        fe_optimizer = BayesianOptimizationOptimizer(task_type=CLASSIFICATION if task == 'cls' else REGRESSION,
Example #21
    def __init__(self, node_list, node_index,
                 task_type, timestamp,
                 fe_config_space: ConfigurationSpace,
                 cash_config_space: ConfigurationSpace,
                 data: DataNode,
                 fixed_config=None,
                 time_limit=None,
                 trial_num=0,
                 metric='acc',
                 ensemble_method='ensemble_selection',
                 ensemble_size=50,
                 per_run_time_limit=300,
                 output_dir="logs",
                 dataset_name='default_dataset',
                 eval_type='holdout',
                 resampling_params=None,
                 n_jobs=1,
                 seed=1):
        super(JointBlock, self).__init__(node_list, node_index, task_type, timestamp,
                                         fe_config_space, cash_config_space, data,
                                         fixed_config=fixed_config,
                                         time_limit=time_limit,
                                         trial_num=trial_num,
                                         metric=metric,
                                         ensemble_method=ensemble_method,
                                         ensemble_size=ensemble_size,
                                         per_run_time_limit=per_run_time_limit,
                                         output_dir=output_dir,
                                         dataset_name=dataset_name,
                                         eval_type=eval_type,
                                         resampling_params=resampling_params,
                                         n_jobs=n_jobs,
                                         seed=seed)

        self.fixed_config = fixed_config

        # Combine configuration space
        cs = ConfigurationSpace()
        if fe_config_space is not None:
            cs.add_hyperparameters(fe_config_space.get_hyperparameters())
            cs.add_conditions(fe_config_space.get_conditions())
            cs.add_forbidden_clauses(fe_config_space.get_forbiddens())
        if cash_config_space is not None:
            cs.add_hyperparameters(cash_config_space.get_hyperparameters())
            cs.add_conditions(cash_config_space.get_conditions())
            cs.add_forbidden_clauses(cash_config_space.get_forbiddens())
        self.joint_cs = cs

        # Define evaluator and optimizer
        if self.task_type in CLS_TASKS:
            from solnml.components.evaluators.cls_evaluator import ClassificationEvaluator
            self.evaluator = ClassificationEvaluator(
                fixed_config=fixed_config,
                scorer=self.metric,
                data_node=self.original_data,
                if_imbal=self.if_imbal,
                timestamp=self.timestamp,
                output_dir=self.output_dir,
                seed=self.seed,
                resampling_strategy=self.eval_type,
                resampling_params=self.resampling_params)
        else:
            from solnml.components.evaluators.rgs_evaluator import RegressionEvaluator
            self.evaluator = RegressionEvaluator(
                fixed_config=fixed_config,
                scorer=self.metric,
                data_node=self.original_data,
                timestamp=self.timestamp,
                output_dir=self.output_dir,
                seed=self.seed,
                resampling_strategy=self.eval_type,
                resampling_params=self.resampling_params)

        self.optimizer = build_hpo_optimizer(self.eval_type, self.evaluator, self.joint_cs,
                                             output_dir=self.output_dir,
                                             per_run_time_limit=self.per_run_time_limit,
                                             inner_iter_num_per_iter=1,
                                             timestamp=self.timestamp,
                                             seed=self.seed, n_jobs=self.n_jobs)
Example #22
    def __init__(self,
                 task_type,
                 estimator_id: str,
                 data: DataNode,
                 metric,
                 share_fe=False,
                 output_dir='logs',
                 per_run_time_limit=120,
                 per_run_mem_limit=5120,
                 dataset_id='default',
                 eval_type='holdout',
                 mth='rb',
                 sw_size=3,
                 n_jobs=1,
                 seed=1,
                 fe_algo='tree_based',
                 enable_intersection=True,
                 number_of_unit_resource=2,
                 total_resource=30):
        self.task_type = task_type
        self.metric = metric
        self.number_of_unit_resource = number_of_unit_resource
        # One unit of resource, that is, the number of trials per iteration.
        self.one_unit_of_resource = 5
        self.total_resource = total_resource
        self.per_run_time_limit = per_run_time_limit
        self.per_run_mem_limit = per_run_mem_limit
        self.estimator_id = estimator_id
        self.evaluation_type = eval_type
        self.original_data = data.copy_()
        self.share_fe = share_fe
        self.output_dir = output_dir
        self.n_jobs = n_jobs
        self.mth = mth
        self.seed = seed
        self.sliding_window_size = sw_size
        task_id = '%s-%d-%s' % (dataset_id, seed, estimator_id)
        self.logger = get_logger(self.__class__.__name__ + '-' + task_id)
        np.random.seed(self.seed)

        # Bandit settings.
        # self.arms = ['fe', 'hpo']
        self.arms = ['hpo', 'fe']
        self.rewards = dict()
        self.optimizer = dict()
        self.evaluation_cost = dict()
        self.update_flag = dict()
        # Global incumbent.
        self.inc = dict()
        self.local_inc = dict()
        self.local_hist = {'fe': [], 'hpo': []}
        for arm in self.arms:
            self.rewards[arm] = list()
            self.update_flag[arm] = False
            self.evaluation_cost[arm] = list()
        self.pull_cnt = 0
        self.action_sequence = list()
        self.final_rewards = list()
        self.incumbent_perf = float("-INF")
        self.early_stopped_flag = False
        self.enable_intersection = enable_intersection

        # Fetch hyperparameter space.
        if self.task_type in CLS_TASKS:
            from solnml.components.models.classification import _classifiers, _addons
            if estimator_id in _classifiers:
                clf_class = _classifiers[estimator_id]
            elif estimator_id in _addons.components:
                clf_class = _addons.components[estimator_id]
            else:
                raise ValueError("Algorithm %s not supported!" % estimator_id)
            cs = clf_class.get_hyperparameter_search_space()
            model = UnParametrizedHyperparameter("estimator", estimator_id)
            cs.add_hyperparameter(model)
        elif self.task_type in REG_TASKS:
            from solnml.components.models.regression import _regressors, _addons
            if estimator_id in _regressors:
                reg_class = _regressors[estimator_id]
            elif estimator_id in _addons.components:
                reg_class = _addons.components[estimator_id]
            else:
                raise ValueError("Algorithm %s not supported!" % estimator_id)
            cs = reg_class.get_hyperparameter_search_space()
            model = UnParametrizedHyperparameter("estimator", estimator_id)
            cs.add_hyperparameter(model)
        else:
            raise ValueError("Unknown task type %s!" % self.task_type)

        self.config_space = cs
        self.default_config = cs.get_default_configuration()
        self.config_space.seed(self.seed)

        # Build the Feature Engineering component.
        if self.task_type in CLS_TASKS:
            fe_evaluator = ClassificationEvaluator(
                self.default_config,
                scorer=self.metric,
                name='fe',
                resampling_strategy=self.evaluation_type,
                seed=self.seed)
            hpo_evaluator = ClassificationEvaluator(
                self.default_config,
                scorer=self.metric,
                data_node=self.original_data,
                name='hpo',
                resampling_strategy=self.evaluation_type,
                seed=self.seed)
        elif self.task_type in REG_TASKS:
            fe_evaluator = RegressionEvaluator(
                self.default_config,
                scorer=self.metric,
                name='fe',
                resampling_strategy=self.evaluation_type,
                seed=self.seed)
            hpo_evaluator = RegressionEvaluator(
                self.default_config,
                scorer=self.metric,
                data_node=self.original_data,
                name='hpo',
                resampling_strategy=self.evaluation_type,
                seed=self.seed)
        else:
            raise ValueError('Invalid task type!')

        self.fe_algo = fe_algo
        self.optimizer['fe'] = build_fe_optimizer(self.fe_algo,
                                                  self.evaluation_type,
                                                  self.task_type,
                                                  self.original_data,
                                                  fe_evaluator,
                                                  estimator_id,
                                                  per_run_time_limit,
                                                  per_run_mem_limit,
                                                  self.seed,
                                                  shared_mode=self.share_fe,
                                                  n_jobs=n_jobs)

        self.inc['fe'], self.local_inc['fe'] = self.original_data, self.original_data

        # Build the HPO component.
        # trials_per_iter = max(len(self.optimizer['fe'].trans_types), 20)
        trials_per_iter = self.one_unit_of_resource * self.number_of_unit_resource

        self.optimizer['hpo'] = build_hpo_optimizer(
            self.evaluation_type,
            hpo_evaluator,
            cs,
            output_dir=output_dir,
            per_run_time_limit=per_run_time_limit,
            trials_per_iter=trials_per_iter,
            seed=self.seed,
            n_jobs=n_jobs)

        self.inc['hpo'], self.local_inc['hpo'] = self.default_config, self.default_config
        self.init_config = cs.get_default_configuration()
        self.local_hist['fe'].append(self.original_data)
        self.local_hist['hpo'].append(self.default_config)
Example #23
    def __init__(self,
                 task_type,
                 estimator_id: str,
                 data: DataNode,
                 metric,
                 include_preprocessors=None,
                 share_fe=False,
                 output_dir='logs',
                 per_run_time_limit=120,
                 per_run_mem_limit=5120,
                 dataset_id='default',
                 eval_type='holdout',
                 mth='rb',
                 sw_size=3,
                 n_jobs=1,
                 seed=1,
                 enable_fe=True,
                 fe_algo='bo',
                 number_of_unit_resource=2,
                 total_resource=30,
                 timestamp=None):
        self.task_type = task_type
        self.metric = metric
        self.number_of_unit_resource = number_of_unit_resource
        # One unit of resource, that is, the number of trials per iteration.
        self.one_unit_of_resource = 5
        self.total_resource = total_resource
        self.per_run_time_limit = per_run_time_limit
        self.per_run_mem_limit = per_run_mem_limit
        self.estimator_id = estimator_id
        self.include_preprocessors = include_preprocessors
        self.evaluation_type = eval_type
        self.original_data = data.copy_()
        self.share_fe = share_fe
        self.output_dir = output_dir
        self.n_jobs = n_jobs
        self.mth = mth
        self.seed = seed
        self.sliding_window_size = sw_size
        task_id = '%s-%d-%s' % (dataset_id, seed, estimator_id)
        self.logger = get_logger(self.__class__.__name__ + '-' + task_id)

        # Bandit settings.
        # self.arms = ['fe', 'hpo']
        self.arms = ['hpo', 'fe']
        self.rewards = dict()
        self.optimizer = dict()
        self.evaluation_cost = dict()
        self.update_flag = dict()
        # Global incumbent.
        self.inc = dict()
        self.local_inc = dict()
        self.local_hist = {'fe': [], 'hpo': []}
        self.inc_record = {'fe': list(), 'hpo': list()}
        self.exp_output = dict()
        self.eval_dict = {'fe': dict(), 'hpo': dict()}
        for arm in self.arms:
            self.rewards[arm] = list()
            self.update_flag[arm] = False
            self.evaluation_cost[arm] = list()
            self.exp_output[arm] = dict()
        self.pull_cnt = 0
        self.action_sequence = list()
        self.final_rewards = list()
        self.incumbent_config = None
        self.incumbent_perf = float("-INF")
        self.early_stopped_flag = False
        self.first_start = True

        self.include_text = True if TEXT in self.original_data.feature_types else False
        self.include_image = True if IMAGE in self.original_data.feature_types else False

        # Fetch hyperparameter space.
        if self.task_type in CLS_TASKS:
            from solnml.components.models.classification import _classifiers, _addons
            _candidates = get_combined_candidtates(_classifiers, _addons)
            if estimator_id in _candidates:
                clf_class = _candidates[estimator_id]
            else:
                raise ValueError("Algorithm %s not supported!" % estimator_id)
            cs = clf_class.get_hyperparameter_search_space()
        elif self.task_type in RGS_TASKS:
            from solnml.components.models.regression import _regressors, _addons
            _candidates = get_combined_candidtates(_regressors, _addons)
            if estimator_id in _candidates:
                reg_class = _candidates[estimator_id]
            else:
                raise ValueError("Algorithm %s not supported!" % estimator_id)
            cs = reg_class.get_hyperparameter_search_space()
        else:
            raise ValueError("Unknown task type %s!" % self.task_type)

        self.config_space = cs
        self.default_config = cs.get_default_configuration()
        self.config_space.seed(self.seed)

        self.if_imbal = is_imbalanced_dataset(self.original_data)

        self.fe_config_space = get_task_hyperparameter_space(
            self.task_type,
            self.estimator_id,
            include_preprocessors=self.include_preprocessors,
            include_text=self.include_text,
            include_image=self.include_image,
            if_imbal=self.if_imbal)
        self.fe_default_config = self.fe_config_space.get_default_configuration()

        self.timestamp = timestamp
        # Build the Feature Engineering component.
        if self.task_type in CLS_TASKS:
            fe_evaluator = ClassificationEvaluator(
                self.default_config,
                self.fe_default_config,
                estimator_id,
                scorer=self.metric,
                data_node=self.original_data,
                name='fe',
                resampling_strategy=self.evaluation_type,
                if_imbal=self.if_imbal,
                seed=self.seed,
                output_dir=self.output_dir,
                timestamp=self.timestamp)
            hpo_evaluator = ClassificationEvaluator(
                self.default_config,
                self.fe_default_config,
                estimator_id,
                scorer=self.metric,
                data_node=self.original_data,
                name='hpo',
                resampling_strategy=self.evaluation_type,
                if_imbal=self.if_imbal,
                seed=self.seed,
                output_dir=self.output_dir,
                timestamp=self.timestamp)

        elif self.task_type in RGS_TASKS:
            fe_evaluator = RegressionEvaluator(
                self.default_config,
                self.fe_default_config,
                estimator_id,
                scorer=self.metric,
                data_node=self.original_data,
                name='fe',
                resampling_strategy=self.evaluation_type,
                seed=self.seed,
                output_dir=self.output_dir,
                timestamp=self.timestamp)
            hpo_evaluator = RegressionEvaluator(
                self.default_config,
                self.fe_default_config,
                estimator_id,
                scorer=self.metric,
                data_node=self.original_data,
                name='hpo',
                resampling_strategy=self.evaluation_type,
                seed=self.seed,
                output_dir=self.output_dir,
                timestamp=self.timestamp)
        else:
            raise ValueError('Invalid task type!')

        if self.mth != 'combined':
            self.enable_fe = enable_fe
            trials_per_iter = self.one_unit_of_resource * self.number_of_unit_resource

            self.optimizer['fe'] = build_fe_optimizer(
                self.evaluation_type,
                fe_evaluator,
                self.fe_config_space,
                per_run_time_limit=per_run_time_limit,
                per_run_mem_limit=per_run_mem_limit,
                inner_iter_num_per_iter=trials_per_iter,
                output_dir=output_dir,
                seed=self.seed,
                n_jobs=n_jobs)

            self.inc['fe'], self.local_inc['fe'] = self.fe_default_config, self.fe_default_config

            # Build the HPO component.
            # trials_per_iter = max(len(self.optimizer['fe'].trans_types), 20)
            trials_per_iter = self.one_unit_of_resource * self.number_of_unit_resource

            self.optimizer['hpo'] = build_hpo_optimizer(
                self.evaluation_type,
                hpo_evaluator,
                cs,
                output_dir=output_dir,
                per_run_time_limit=per_run_time_limit,
                inner_iter_num_per_iter=trials_per_iter,
                seed=self.seed,
                n_jobs=n_jobs)

            self.inc['hpo'], self.local_inc['hpo'] = self.default_config, self.default_config
            self.init_config = cs.get_default_configuration()
            self.local_hist['fe'].append(self.fe_default_config)
            self.local_hist['hpo'].append(self.default_config)

        else:
            self.rewards = list()
            self.evaluation_cost = list()
            self.eval_dict = {}
            trials_per_iter = self.one_unit_of_resource * self.number_of_unit_resource

            if self.task_type in CLS_TASKS:
                from solnml.utils.combined_cls_evaluator import get_combined_cs
                from solnml.utils.combined_cls_evaluator import CombinedClassificationEvaluator as CombinedEvaluator
            else:
                from solnml.utils.combined_rgs_evaluator import get_combined_cs
                from solnml.utils.combined_rgs_evaluator import CombinedRegressionEvaluator as CombinedEvaluator

            self.evaluator = CombinedEvaluator(
                estimator_id,
                scorer=self.metric,
                data_node=self.original_data,
                if_imbal=self.if_imbal,
                timestamp=self.timestamp,
                output_dir=self.output_dir,
                resampling_strategy=self.evaluation_type)
            cs = get_combined_cs(
                self.estimator_id,
                self.task_type,
                include_image=self.include_image,
                include_text=self.include_text,
                include_preprocessors=self.include_preprocessors,
                if_imbal=self.if_imbal)

            self.optimizer = build_hpo_optimizer(
                self.evaluation_type,
                self.evaluator,
                cs,
                output_dir=self.output_dir,
                per_run_time_limit=self.per_run_time_limit,
                inner_iter_num_per_iter=trials_per_iter,
                seed=self.seed,
                n_jobs=self.n_jobs)
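
In 'combined' mode the code above collapses feature engineering and hyperparameter tuning into one configuration space (via get_combined_cs) and hands it to a single optimizer, instead of maintaining separate 'fe' and 'hpo' optimizers. Below is a rough, hand-built illustration of what such a joint space looks like; it only assumes the ConfigSpace package that the surrounding code already uses, and the hyperparameter names are made up for illustration, not the ones get_combined_cs actually produces.

from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import (CategoricalHyperparameter,
                                         UniformIntegerHyperparameter,
                                         UnParametrizedHyperparameter)

# Joint space: feature-engineering choices and estimator hyperparameters live
# side by side, so one BO loop can search both at once.
cs = ConfigurationSpace(seed=1)
cs.add_hyperparameters([
    UnParametrizedHyperparameter('estimator', 'random_forest'),
    CategoricalHyperparameter('preprocessor', ['none', 'pca', 'feature_selection']),
    CategoricalHyperparameter('balancer', ['none', 'weight_balancer']),
    UniformIntegerHyperparameter('random_forest:n_estimators', 50, 500),
    UniformIntegerHyperparameter('random_forest:min_samples_leaf', 1, 20),
])
print(cs.sample_configuration())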
Beispiel #24
    def optimize(self):
        if self.inner_opt_algorithm in ['rb_hpo', 'fixed', 'alter_hpo', 'alter', 'combined']:
            self.optimize_explore_first()
        elif self.inner_opt_algorithm == 'equal':
            self.optimize_equal_resource()
        else:
            raise ValueError('Unsupported optimization method: %s!' % self.inner_opt_algorithm)

        scores = list()
        for _arm in self.arms:
            scores.append(self.sub_bandits[_arm].incumbent_perf)
        scores = np.array(scores)
        algo_idx = np.argmax(scores)
        self.optimal_algo_id = self.arms[algo_idx]
        self.incumbent_perf = scores[algo_idx]
        _threshold, _ensemble_size = self.incumbent_perf * 0.90, 5
        if self.incumbent_perf < 0.:
            _threshold = self.incumbent_perf / 0.9

        idxs = np.argsort(-scores)[:_ensemble_size]
        _algo_ids = [self.arms[idx] for idx in idxs]
        self.nbest_algo_ids = list()
        for _idx, _arm in zip(idxs, _algo_ids):
            if scores[_idx] >= _threshold:
                self.nbest_algo_ids.append(_arm)
        assert len(self.nbest_algo_ids) > 0

        self.logger.info('=' * 50)
        self.logger.info('Best_algo_perf:  %s' % str(self.incumbent_perf))
        self.logger.info('Best_algo_id:    %s' % str(self.optimal_algo_id))
        self.logger.info('Nbest_algo_ids:  %s' % str(self.nbest_algo_ids))
        self.logger.info('Arm candidates:  %s' % str(self.arms))
        self.logger.info('Best val scores: %s' % str(list(scores)))
        self.logger.info('=' * 50)

        if self.inner_opt_algorithm == 'combined':
            tmp_evaluator = ClassificationEvaluator(None)
            # A temporary optimizer used only to record FE transformations.
            self.tmp_bo = AnotherBayesianOptimizationOptimizer(0, self.original_data, tmp_evaluator, 'adaboost',
                                                               1, 1, 1)

            # Fit the best model
            best_config = self.sub_bandits[self.optimal_algo_id].incumbent_config
            self.best_node = self.tmp_bo.fetch_nodes_by_config([best_config])[0]
            best_estimator = fetch_predict_estimator(self.task_type, best_config, self.best_node.data[0],
                                                     self.best_node.data[1],
                                                     weight_balance=self.best_node.enable_balance,
                                                     data_balance=self.best_node.data_balance,
                                                     combined=True)
        else:
            # Fit the best model
            self.fe_optimizer = self.sub_bandits[self.optimal_algo_id].optimizer['fe']
            if self.fe_algo == 'bo':
                self.fe_optimizer.fetch_nodes(1)

            best_config = self.sub_bandits[self.optimal_algo_id].inc['hpo']
            best_estimator = fetch_predict_estimator(self.task_type, best_config, self.best_data_node.data[0],
                                                     self.best_data_node.data[1],
                                                     weight_balance=self.best_data_node.enable_balance,
                                                     data_balance=self.best_data_node.data_balance)

        with open(os.path.join(self.output_dir, '%s-best_model' % self.timestamp), 'wb') as f:
            pkl.dump(best_estimator, f)

        if self.ensemble_method is not None:
            if self.inner_opt_algorithm == 'combined':
                eval_dict = {key: self.sub_bandits[key].eval_dict for key in self.include_algorithms}
                stats = fetch_ensemble_members(self.nbest_algo_ids, self.seed, eval_dict, self.tmp_bo)
                from solnml.components.ensemble.combined_ensemble.ensemble_bulider import EnsembleBuilder
            else:
                # stats = self.fetch_ensemble_members_ano()
                stats = self.fetch_ensemble_members()

                from solnml.components.ensemble import EnsembleBuilder

            # Ensemble all intermediate/final models found in the optimization process above.
            self.es = EnsembleBuilder(stats=stats,
                                      ensemble_method=self.ensemble_method,
                                      ensemble_size=self.ensemble_size,
                                      task_type=self.task_type,
                                      metric=self.metric,
                                      output_dir=self.output_dir)
            self.es.fit(data=self.original_data)
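
The n-best selection in optimize() keeps at most five arms whose validation score is within 10% of the best arm, flipping the threshold ratio when the best score is negative (e.g. for negated losses). A standalone numpy sketch of that rule, with made-up algorithm names and scores:

import numpy as np

arms = ['random_forest', 'extra_trees', 'lightgbm', 'adaboost', 'liblinear_svc']
scores = np.array([0.91, 0.89, 0.93, 0.70, 0.88])   # hypothetical validation scores

best_idx = int(np.argmax(scores))
best_perf = scores[best_idx]
# Keep arms within 10% of the best; invert the ratio when the best score is negative.
threshold = best_perf * 0.90 if best_perf >= 0. else best_perf / 0.90
ensemble_size = 5

top_idxs = np.argsort(-scores)[:ensemble_size]
nbest_algo_ids = [arms[idx] for idx in top_idxs if scores[idx] >= threshold]
assert len(nbest_algo_ids) > 0
print(arms[best_idx], best_perf, nbest_algo_ids)
# lightgbm 0.93 ['lightgbm', 'random_forest', 'extra_trees', 'liblinear_svc']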
Beispiel #25
        cs = get_combined_cs(estimator_id)
        op[estimator_id] = SMACOptimizer(evaluator,
                                         cs,
                                         inner_iter_num_per_iter=10)

    # Iterate (modify search strategy here)
    for estimator_id in estimator_ids:
        op[estimator_id].iterate()

    # Fetch ensemble members
    eval_dict = dict()
    for estimator_id in estimator_ids:
        eval_dict[estimator_id] = op[estimator_id].eval_dict
    # Important: Specify n_best ids according to search strategy
    nbest_ids = estimator_ids
    tmp_evaluator = ClassificationEvaluator(None)
    # A temporary optimizer used only to record FE transformations.
    tmp_bo = BayesianOptimizationOptimizer(0, train_node, tmp_evaluator,
                                           'adaboost', 1, 1, 1)
    stats = fetch_ensemble_members(nbest_ids,
                                   seed,
                                   eval_dict,
                                   record_op=tmp_bo)

    es = EnsembleBuilder(stats,
                         'ensemble_selection',
                         50,
                         task_type=0,
                         metric=balanced_accuracy_scorer,
                         output_dir='logs/')
    es.fit(train_node)
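
The 'ensemble_selection' method requested above is essentially Caruana-style greedy selection: repeatedly add, with replacement, the model whose inclusion most improves the validation metric of the averaged predictions, and use the selection counts as weights. A minimal numpy sketch of that idea, independent of solnml's EnsembleBuilder and using random toy predictions:

import numpy as np

def greedy_ensemble_selection(val_preds, y_val, ensemble_size, metric):
    """val_preds: list of arrays of shape (n_samples, n_classes) on the validation set."""
    selected = []
    running_sum = np.zeros_like(val_preds[0])
    for _ in range(ensemble_size):
        best_score, best_idx = -np.inf, None
        for idx, pred in enumerate(val_preds):
            # Score of the ensemble if this model were added next.
            score = metric(y_val, (running_sum + pred) / (len(selected) + 1))
            if score > best_score:
                best_score, best_idx = score, idx
        selected.append(best_idx)
        running_sum += val_preds[best_idx]
    # Selection counts become the ensemble weights.
    return np.bincount(selected, minlength=len(val_preds)) / ensemble_size

# Hypothetical usage with an accuracy-style metric on class probabilities.
acc = lambda y, proba: float(np.mean(proba.argmax(axis=1) == y))
rng = np.random.RandomState(1)
y_val = rng.randint(0, 2, size=100)
val_preds = [rng.rand(100, 2) for _ in range(5)]
print(greedy_ensemble_selection(val_preds, y_val, ensemble_size=50, metric=acc))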
def evaluate_2rd_bandit(dataset, algo, time_limit, run_id, seed):
    print('HMAB-%s-%s: run_id=%d' % (dataset, algo, run_id))
    print('==> Start to Evaluate', dataset, 'Budget', time_limit)
    train_data, test_data = load_train_test_data(dataset)
    enable_intersect = True
    bandit = SecondLayerBandit(algo,
                               train_data,
                               per_run_time_limit=300,
                               seed=seed,
                               eval_type='holdout',
                               mth='alter_hpo',
                               enable_intersection=enable_intersect)
    mth_id = 'hmab' if enable_intersect else 'hmab0'
    _start_time = time.time()
    _iter_id = 0
    stats = list()

    while True:
        if time.time() > time_limit + _start_time or bandit.early_stopped_flag:
            break
        res = bandit.play_once()
        print('Iteration %d - %.4f' % (_iter_id, res))
        stats.append([_iter_id, time.time() - _start_time, res])
        _iter_id += 1

    print(bandit.final_rewards)
    print(bandit.action_sequence)
    print(np.mean(bandit.evaluation_cost['fe']))
    print(np.mean(bandit.evaluation_cost['hpo']))

    fe_optimizer = bandit.optimizer['fe']
    final_train_data = fe_optimizer.apply(train_data, bandit.inc['fe'])
    assert final_train_data == bandit.inc['fe']
    final_test_data = fe_optimizer.apply(test_data, bandit.inc['fe'])
    config = bandit.inc['hpo']

    evaluator = ClassificationEvaluator(config,
                                        name='fe',
                                        seed=seed,
                                        resampling_strategy='holdout')
    val_score = evaluator(None, data_node=final_train_data)
    print('==> Best validation score', val_score, res)

    X_train, y_train = final_train_data.data
    clf = fetch_predict_estimator(config, X_train, y_train)
    X_test, y_test = final_test_data.data
    y_pred = clf.predict(X_test)
    test_score = balanced_accuracy(y_test, y_pred)
    print('==> Test score', test_score)

    # Alleviate overfitting.
    y_pred1 = bandit.predict(test_data.data[0])
    test_score1 = balanced_accuracy(y_test, y_pred1)
    print('==> Test score with average ensemble', test_score1)

    y_pred2 = bandit.predict(test_data.data[0], is_weighted=True)
    test_score2 = balanced_accuracy(y_test, y_pred2)
    print('==> Test score with weighted ensemble', test_score2)

    save_path = save_dir + '%s_2rd_bandit_%s_%d_%d_%s.pkl' % (
        mth_id, dataset, time_limit, run_id, algo)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, val_score, test_score, test_score1, test_score2],
                    f)
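
The benchmark above follows a simple anytime protocol: keep calling play_once() until the wall-clock budget is exhausted or the bandit raises its early-stopping flag, recording (iteration, elapsed time, reward) triples along the way. A generic sketch of that loop with a dummy bandit standing in for SecondLayerBandit (the dummy and its improvement schedule are purely illustrative):

import time

class DummyBandit:
    """Stand-in with the same anytime interface as the real bandit."""
    def __init__(self):
        self.early_stopped_flag = False
        self._best = 0.0

    def play_once(self):
        time.sleep(0.01)                               # pretend one optimization round
        self._best = min(1.0, self._best + 0.05)       # toy monotone improvement
        self.early_stopped_flag = self._best >= 1.0
        return self._best

def run_anytime(bandit, time_limit):
    stats, start, _iter_id = [], time.time(), 0
    while time.time() - start < time_limit and not bandit.early_stopped_flag:
        reward = bandit.play_once()
        stats.append((_iter_id, time.time() - start, reward))
        _iter_id += 1
    return stats

print(run_anytime(DummyBandit(), time_limit=1.0)[-1])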
Beispiel #27
def evaluate(mode, dataset, run_id, metric):
    print(mode, dataset, run_id, metric)

    metric = get_metric(metric)
    train_data, test_data = load_train_test_data(dataset,
                                                 task_type=MULTICLASS_CLS)

    cs = _classifiers[algo_name].get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", algo_name)
    cs.add_hyperparameter(model)
    default_hpo_config = cs.get_default_configuration()

    fe_evaluator = ClassificationEvaluator(default_hpo_config,
                                           scorer=metric,
                                           name='fe',
                                           resampling_strategy='holdout',
                                           seed=1)

    hpo_evaluator = ClassificationEvaluator(default_hpo_config,
                                            scorer=metric,
                                            data_node=train_data,
                                            name='hpo',
                                            resampling_strategy='holdout',
                                            seed=1)

    fe_optimizer = BayesianOptimizationOptimizer(task_type=CLASSIFICATION,
                                                 input_data=train_data,
                                                 evaluator=fe_evaluator,
                                                 model_id=algo_name,
                                                 time_limit_per_trans=600,
                                                 mem_limit_per_trans=5120,
                                                 number_of_unit_resource=10,
                                                 seed=1)

    def objective_function(config):
        if benchmark == 'fe':
            return fe_optimizer.evaluate_function(config)
        else:
            return hpo_evaluator(config)

    if mode == 'bo':
        bo = BO(objective_function,
                config_space,
                max_runs=max_runs,
                surrogate_model='prob_rf')
        bo.run()
        print('BO result')
        print(bo.get_incumbent())
        perf = bo.history_container.incumbent_value
        runs = [bo.configurations, bo.perfs]
    elif mode == 'lite_bo':
        from litebo.facade.bo_facade import BayesianOptimization
        bo = BayesianOptimization(objective_function,
                                  config_space,
                                  max_runs=max_runs)
        bo.run()
        print('BO result')
        print(bo.get_incumbent())
        perf = bo.history_container.incumbent_value
        runs = [bo.configurations, bo.perfs]
    elif mode.startswith('tlbo'):
        _, gp_fusion = mode.split('_')
        meta_feature_vec = metafeature_dict[dataset]
        past_datasets = test_datasets.copy()
        if dataset in past_datasets:
            past_datasets.remove(dataset)
        past_history = load_runhistory(past_datasets)

        gp_models = [
            gp_models_dict[dataset_name] for dataset_name in past_datasets
        ]
        tlbo = TLBO(objective_function,
                    config_space,
                    past_history,
                    gp_models=gp_models,
                    dataset_metafeature=meta_feature_vec,
                    max_runs=max_runs,
                    gp_fusion=gp_fusion)
        tlbo.run()
        print('TLBO result')
        print(tlbo.get_incumbent())
        runs = [tlbo.configurations, tlbo.perfs]
        perf = tlbo.history_container.incumbent_value
    else:
        raise ValueError('Invalid mode.')
    file_saved = '%s_%s_%s_result_%d_%d_%s.pkl' % (mode, algo_name, dataset,
                                                   max_runs, run_id, benchmark)
    with open(data_dir + file_saved, 'wb') as f:
        pk.dump([perf, runs], f)
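
evaluate() wraps either the FE objective or the HPO evaluator into a single objective_function(config) before handing it to BO, LiteBO, or TLBO. The same wrapper pattern can be exercised without any of those optimizers by a plain random-search baseline over a ConfigSpace object; the sketch below assumes a toy quadratic objective and uses only basic ConfigSpace calls (add_hyperparameters, sample_configuration) that the surrounding code already relies on.

import numpy as np
from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import UniformFloatHyperparameter

# Toy search space and objective; the real code minimizes a validation loss
# returned by hpo_evaluator / fe_optimizer.evaluate_function instead.
config_space = ConfigurationSpace(seed=1)
config_space.add_hyperparameters([
    UniformFloatHyperparameter('x', -5., 5.),
    UniformFloatHyperparameter('y', -5., 5.),
])

def objective_function(config):
    return (config['x'] - 1.) ** 2 + (config['y'] + 2.) ** 2

def random_search(objective, cs, max_runs=50):
    configurations, perfs = [], []
    for _ in range(max_runs):
        config = cs.sample_configuration()
        configurations.append(config)
        perfs.append(objective(config))
    best = int(np.argmin(perfs))
    return configurations[best], perfs[best], [configurations, perfs]

inc_config, inc_perf, runs = random_search(objective_function, config_space, max_runs=50)
print(inc_config, inc_perf)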
Beispiel #28
    def prepare_optimizer(self, _arm):
        trials_per_iter = self.one_unit_of_resource * self.number_of_unit_resource
        if _arm == 'fe':
            # Build the Feature Engineering component.
            self.original_data._node_id = -1
            inc_hpo = copy.deepcopy(self.inc['hpo'])
            if self.task_type in CLS_TASKS:
                fe_evaluator = ClassificationEvaluator(
                    inc_hpo,
                    self.fe_default_config,
                    self.estimator_id,
                    data_node=self.original_data,
                    scorer=self.metric,
                    name='fe',
                    resampling_strategy=self.evaluation_type,
                    if_imbal=self.if_imbal,
                    seed=self.seed,
                    output_dir=self.output_dir,
                    timestamp=self.timestamp)
            elif self.task_type in RGS_TASKS:
                fe_evaluator = RegressionEvaluator(
                    inc_hpo,
                    self.fe_default_config,
                    self.estimator_id,
                    data_node=self.original_data,
                    scorer=self.metric,
                    name='fe',
                    resampling_strategy=self.evaluation_type,
                    seed=self.seed,
                    output_dir=self.output_dir,
                    timestamp=self.timestamp)
            else:
                raise ValueError('Invalid task type!')
            self.optimizer[_arm] = build_fe_optimizer(
                self.evaluation_type,
                fe_evaluator,
                self.fe_config_space,
                per_run_time_limit=self.per_run_time_limit,
                per_run_mem_limit=self.per_run_mem_limit,
                inner_iter_num_per_iter=trials_per_iter,
                output_dir=self.output_dir,
                seed=self.seed,
                n_jobs=self.n_jobs)
        else:
            # trials_per_iter = self.optimizer['fe'].evaluation_num_last_iteration // 2
            # trials_per_iter = max(20, trials_per_iter)
            inc_fe = copy.deepcopy(self.inc['fe'])
            if self.task_type in CLS_TASKS:
                hpo_evaluator = ClassificationEvaluator(
                    self.default_config,
                    inc_fe,
                    self.estimator_id,
                    scorer=self.metric,
                    data_node=self.original_data,
                    name='hpo',
                    resampling_strategy=self.evaluation_type,
                    if_imbal=self.if_imbal,
                    seed=self.seed,
                    output_dir=self.output_dir,
                    timestamp=self.timestamp)
            elif self.task_type in RGS_TASKS:
                hpo_evaluator = RegressionEvaluator(
                    self.default_config,
                    inc_fe,
                    self.estimator_id,
                    scorer=self.metric,
                    data_node=self.original_data,
                    name='hpo',
                    resampling_strategy=self.evaluation_type,
                    seed=self.seed,
                    output_dir=self.output_dir,
                    timestamp=self.timestamp)
            else:
                raise ValueError('Invalid task type!')

            self.optimizer[_arm] = build_hpo_optimizer(
                self.evaluation_type,
                hpo_evaluator,
                self.config_space,
                output_dir=self.output_dir,
                per_run_time_limit=self.per_run_time_limit,
                inner_iter_num_per_iter=trials_per_iter,
                seed=self.seed)

        self.logger.debug('=' * 30)
        self.logger.debug('UPDATE OPTIMIZER: %s' % _arm)
        self.logger.debug('=' * 30)
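
prepare_optimizer() rebuilds the optimizer for whichever arm ('fe' or 'hpo') is pulled next, freezing the other arm's incumbent inside the evaluator. A compact sketch of the alternating schedule that drives such calls, with stub step functions in place of the real optimizers (everything below is illustrative and not solnml's actual scheduling code):

import random

random.seed(0)

inc = {'fe': 'default_fe_config', 'hpo': 'default_hpo_config'}   # per-arm incumbents
inc_perf = float('-inf')                                         # best joint score so far

def pull_arm(arm, frozen_other_inc):
    """Stub for one optimizer iteration: propose a candidate for `arm` while the
    other arm's incumbent stays fixed, and return (candidate, validation_score)."""
    candidate = '%s_candidate_%02d' % (arm, random.randint(0, 99))
    return candidate, random.random()

for iteration in range(8):
    arm = 'fe' if iteration % 2 == 0 else 'hpo'          # 'alter'-style schedule
    other = 'hpo' if arm == 'fe' else 'fe'
    # This is where the real code would call prepare_optimizer(arm) so the
    # evaluator is rebuilt around inc[other] before iterating the chosen optimizer.
    candidate, perf = pull_arm(arm, inc[other])
    if perf > inc_perf:                                  # keep the joint incumbent
        inc[arm], inc_perf = candidate, perf
    print(iteration, arm, round(perf, 3), inc)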