def evaluate_joint_solution(self):
    # Update the joint incumbent from FE and HPO.
    _perf = None
    try:
        with time_limit(600):
            if self.task_type in CLS_TASKS:
                _perf = ClassificationEvaluator(
                    self.local_inc['hpo'], data_node=self.local_inc['fe'],
                    scorer=self.metric, name='fe',
                    resampling_strategy=self.evaluation_type,
                    seed=self.seed)(self.local_inc['hpo'])
            else:
                _perf = RegressionEvaluator(
                    self.local_inc['hpo'], data_node=self.local_inc['fe'],
                    scorer=self.metric, name='fe',
                    resampling_strategy=self.evaluation_type,
                    seed=self.seed)(self.local_inc['hpo'])
    except Exception as e:
        self.logger.error(str(e))

    # Update INC.
    if _perf is not None and np.isfinite(_perf) and _perf > self.incumbent_perf:
        self.inc['hpo'] = self.local_inc['hpo']
        self.inc['fe'] = self.local_inc['fe']
        self.incumbent_perf = _perf
def conduct_hpo(dataset='pc4', classifier_id='random_forest', iter_num=100, iter_mode=True):
    from autosklearn.pipeline.components.classification import _classifiers
    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)

    raw_data = load_data(dataset, datanode_returned=True)
    print(set(raw_data.data[1]))
    evaluator = ClassificationEvaluator(cs.get_default_configuration(), name='hpo', data_node=raw_data)

    if not iter_mode:
        optimizer = SMACOptimizer(evaluator, cs, evaluation_limit=600, output_dir='logs')
        inc, val = optimizer.optimize()
        print(inc, val)
    else:
        import time
        _start_time = time.time()
        optimizer = SMACOptimizer(evaluator, cs, trials_per_iter=1,
                                  output_dir='logs', per_run_time_limit=180)
        results = list()
        for _iter in range(iter_num):
            perf, _, _ = optimizer.iterate()
            print(_iter, perf)
            results.append(perf)
        print(results)
        print(time.time() - _start_time)
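# --- Usage sketch (not part of the original script) ---
# A minimal, hedged example of how `conduct_hpo` above might be invoked. The
# dataset name and classifier id are the function's own defaults; the
# `if __name__ == '__main__'` guard is an assumption so the run only starts
# when the file is executed directly.
if __name__ == '__main__':
    # Run a short iterative HPO session: one SMAC trial per call to iterate().
    conduct_hpo(dataset='pc4', classifier_id='random_forest', iter_num=10, iter_mode=True)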
def get_fe_cs(estimator_id, node, task_type=0):
    tmp_evaluator = ClassificationEvaluator(None)
    tmp_bo = AnotherBayesianOptimizationOptimizer(task_type, node, tmp_evaluator,
                                                  estimator_id, 1, 1, 1)
    cs = tmp_bo._get_task_hyperparameter_space('smac')
    return cs
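# --- Usage sketch (not part of the original code) ---
# Illustrates how the feature-engineering configuration space returned by
# `get_fe_cs` might be inspected. The `load_data` call and the 'random_forest'
# estimator id are assumptions borrowed from the surrounding snippets; the
# `get_default_configuration` / `sample_configuration` calls are standard
# ConfigSpace API.
node = load_data('pc4', datanode_returned=True)
fe_cs = get_fe_cs('random_forest', node, task_type=0)
print(fe_cs.get_default_configuration())         # default FE pipeline configuration
for candidate in fe_cs.sample_configuration(5):  # draw a few random FE candidates
    print(candidate)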
def evaluate_ml_algorithm(dataset, algo, obj_metric, seed=1, task_type=None):
    print('EVALUATE-%s-%s-%s' % (dataset, algo, obj_metric))
    train_data = load_data(dataset, task_type=task_type, datanode_returned=True)
    print(set(train_data.data[1]))
    metric = get_metric(obj_metric)

    cs = _classifiers[algo].get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", algo)
    cs.add_hyperparameter(model)
    default_hpo_config = cs.get_default_configuration()

    hpo_evaluator = ClassificationEvaluator(default_hpo_config, scorer=metric,
                                            data_node=train_data, name='hpo',
                                            resampling_strategy='holdout', seed=seed)
    hpo_optimizer = SMACOptimizer(evaluator=hpo_evaluator, config_space=cs,
                                  per_run_time_limit=600, per_run_mem_limit=5120,
                                  output_dir='./logs', trials_per_iter=args.iter)
    hpo_optimizer.iterate()

    hpo_eval_dict = dict()
    for key, value in hpo_optimizer.eval_dict.items():
        hpo_eval_dict[key[1]] = value

    save_path = save_dir + '%s-%s-%s-hpo.pkl' % (dataset, algo, obj_metric)
    with open(save_path, 'wb') as f:
        pickle.dump(hpo_eval_dict, f)
def get_configspace():
    if benchmark == 'hpo':
        cs = _classifiers[algo_name].get_hyperparameter_search_space()
        model = UnParametrizedHyperparameter("estimator", algo_name)
        cs.add_hyperparameter(model)
        return cs

    train_data, test_data = load_train_test_data('splice', task_type=MULTICLASS_CLS)
    cs = _classifiers[algo_name].get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", algo_name)
    cs.add_hyperparameter(model)
    default_hpo_config = cs.get_default_configuration()

    fe_evaluator = ClassificationEvaluator(default_hpo_config, scorer=metric, name='fe',
                                           resampling_strategy='holdout', seed=1)
    fe_optimizer = BayesianOptimizationOptimizer(task_type=CLASSIFICATION, input_data=train_data,
                                                 evaluator=fe_evaluator, model_id=algo_name,
                                                 time_limit_per_trans=600, mem_limit_per_trans=5120,
                                                 number_of_unit_resource=10, seed=1)
    hyper_space = fe_optimizer.hyperparameter_space
    return hyper_space
def evaluate(_config):
    _config = _config.get_dictionary()
    # print(_config)
    arm = None
    cs = ConfigurationSpace()
    for key in _config:
        key_str = key.split(":")
        if key_str[0] == 'classifier':
            if key_str[1] == '__choice__':
                arm = _config[key]
                cs.add_hyperparameter(UnParametrizedHyperparameter("estimator", _config[key]))
            else:
                cs.add_hyperparameter(UnParametrizedHyperparameter(key_str[2], _config[key]))

    if arm in first_bandit.arms:
        transformed_node = apply_metalearning_fe(first_bandit.sub_bandits[arm].optimizer['fe'], _config)
        default_config = cs.sample_configuration(1)
        hpo_evaluator = ClassificationEvaluator(None, data_node=transformed_node, name='hpo',
                                                resampling_strategy=first_bandit.eval_type,
                                                seed=first_bandit.seed)

        start_time = time.time()
        score1 = 1 - hpo_evaluator(default_config)
        time_cost1 = time.time() - start_time

        # Evaluate the default config.
        start_time = time.time()
        score2 = 1 - hpo_evaluator(first_bandit.sub_bandits[arm].default_config)
        time_cost2 = time.time() - start_time

        transformed_node.score2 = max(score1, score2)
        return (arm, score1, default_config, transformed_node, time_cost1), \
               (arm, score2, first_bandit.sub_bandits[arm].default_config, transformed_node, time_cost2)
def evaluate_bo_optimizer(dataset, time_limit, run_id, seed):
    from solnml.components.fe_optimizers.bo_optimizer import BayesianOptimizationOptimizer

    # Prepare the configuration for random forest.
    from ConfigSpace.hyperparameters import UnParametrizedHyperparameter
    from autosklearn.pipeline.components.classification.random_forest import RandomForest
    cs = RandomForest.get_hyperparameter_search_space()
    clf_hp = UnParametrizedHyperparameter("estimator", 'random_forest')
    cs.add_hyperparameter(clf_hp)
    print(cs.get_default_configuration())

    evaluator = ClassificationEvaluator(cs.get_default_configuration(), name='fe',
                                        seed=seed, resampling_strategy='holdout')

    train_data, test_data = load_train_test_data(dataset)
    cls_task_type = BINARY_CLS if len(set(train_data.data[1])) == 2 else MULTICLASS_CLS
    optimizer = BayesianOptimizationOptimizer(cls_task_type, train_data, evaluator,
                                              'random_forest', 300, 10000, seed,
                                              time_budget=time_limit)
    optimizer.optimize()
    inc = optimizer.incumbent_config
    val_score = 1 - optimizer.evaluate_function(inc)
    print(val_score)
    print(optimizer.incumbent_score)

    optimizer.fetch_nodes(n=10)
    print("Refit finished!")

    final_train_data = optimizer.apply(train_data, optimizer.incumbent, phase='train')
    X_train, y_train = final_train_data.data
    final_test_data = optimizer.apply(test_data, optimizer.incumbent)
    X_test, y_test = final_test_data.data

    clf = fetch_predict_estimator(cls_task_type, cs.get_default_configuration(),
                                  X_train, y_train,
                                  weight_balance=final_train_data.enable_balance,
                                  data_balance=final_train_data.data_balance)
    y_pred = clf.predict(X_test)

    from solnml.components.metrics.cls_metrics import balanced_accuracy
    test_score = balanced_accuracy(y_test, y_pred)
    print('==> Test score', test_score)

    save_path = save_dir + 'bo_fe_%s_%d_%d.pkl' % (dataset, time_limit, run_id)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, val_score, test_score], f)
def evaluate_joint_solution(self):
    # Update the joint incumbent from FE and HPO.
    _perf = None
    try:
        with time_limit(self.per_run_time_limit):
            if self.task_type in CLS_TASKS:
                evaluator = ClassificationEvaluator(
                    self.local_inc['hpo'], self.local_inc['fe'], self.estimator_id,
                    data_node=self.original_data, scorer=self.metric,
                    if_imbal=self.if_imbal, name='hpo',
                    resampling_strategy=self.evaluation_type,
                    seed=self.seed, output_dir=self.output_dir,
                    timestamp=self.timestamp)
            else:
                evaluator = RegressionEvaluator(
                    self.local_inc['hpo'], self.local_inc['fe'], self.estimator_id,
                    data_node=self.original_data, scorer=self.metric,
                    name='hpo', resampling_strategy=self.evaluation_type,
                    seed=self.seed, output_dir=self.output_dir,
                    timestamp=self.timestamp)
            _perf = -evaluator(self.local_inc['hpo'])
    except Exception as e:
        self.logger.error(str(e))

    # TODO: Needs refactoring!
    # Only merge the top-k config list when the evaluation above succeeded;
    # otherwise `evaluator` is undefined.
    if _perf is not None:
        sorted_list_path = evaluator.topk_model_saver.sorted_list_path
        path_list = os.path.split(sorted_list_path)
        tmp_path = 'tmp_' + path_list[-1]
        tmp_filepath = os.path.join(os.path.dirname(sorted_list_path), tmp_path)
        # TODO: How to merge when using multi-process?
        if os.path.exists(tmp_filepath):
            self.logger.info('Temporary config path detected!')
            with open(tmp_filepath, 'rb') as f1:
                sorted_file_replica = pkl.load(f1)
            with open(sorted_list_path, 'wb') as f2:
                pkl.dump(sorted_file_replica, f2)
            self.logger.info('Temporary config path merged!')

    # Update INC.
    if _perf is not None and np.isfinite(_perf) and _perf > self.incumbent_perf:
        self.inc['hpo'] = self.local_inc['hpo']
        self.inc['fe'] = self.local_inc['fe']
        self.incumbent_perf = _perf
def conduct_fe(dataset='pc4', classifier_id='random_forest', iter_num=100, run_id=0, seed=1):
    from autosklearn.pipeline.components.classification import _classifiers
    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)
    default_config = cs.get_default_configuration()

    raw_data, test_raw_data = load_train_test_data(dataset, random_state=seed)
    evaluator = ClassificationEvaluator(default_config, name='fe', data_node=raw_data,
                                        resampling_strategy='holdout', seed=seed)
    val_acc = evaluator(default_config)
    estimator = fetch_predict_estimator(default_config, raw_data.data[0], raw_data.data[1])
    pred = estimator.predict(test_raw_data.data[0])
    test_acc = balanced_accuracy(test_raw_data.data[1], pred)

    optimizer = EvaluationBasedOptimizer(task_type='classification', input_data=raw_data,
                                         evaluator=evaluator, model_id=classifier_id,
                                         time_limit_per_trans=240, mem_limit_per_trans=10000,
                                         seed=seed)

    task_id = 'fe-%s-%s-%d' % (dataset, classifier_id, iter_num)
    val_acc_list, test_acc_list = [], []
    val_acc_list.append(val_acc)
    test_acc_list.append(test_acc)

    for _iter in range(iter_num):
        perf, _, incumbent = optimizer.iterate()
        val_acc_list.append(perf)
        train_node = optimizer.apply(raw_data, incumbent)
        test_node = optimizer.apply(test_raw_data, incumbent)
        estimator = fetch_predict_estimator(default_config, train_node.data[0], train_node.data[1])
        pred = estimator.predict(test_node.data[0])
        test_perf = balanced_accuracy(test_node.data[1], pred)
        test_acc_list.append(test_perf)

    print(val_acc_list)
    print(test_acc_list)

    save_path = save_dir + '%s-%d.pkl' % (task_id, run_id)
    with open(save_path, 'wb') as f:
        pickle.dump([val_acc_list, test_acc_list], f)
def conduct_hpo(dataset='pc4', classifier_id='random_forest', iter_num=100, run_id=0, seed=1):
    from autosklearn.pipeline.components.classification import _classifiers
    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)

    raw_data, test_raw_data = load_train_test_data(dataset, random_state=seed)
    evaluator = ClassificationEvaluator(cs.get_default_configuration(), name='hpo',
                                        data_node=raw_data, resampling_strategy='holdout',
                                        seed=seed)

    default_config = cs.get_default_configuration()
    val_acc = 1. - evaluator(default_config)
    estimator = fetch_predict_estimator(default_config, raw_data.data[0], raw_data.data[1])
    pred = estimator.predict(test_raw_data.data[0])
    test_acc = balanced_accuracy(test_raw_data.data[1], pred)

    optimizer = SMACOptimizer(evaluator, cs, trials_per_iter=2,
                              output_dir='logs', per_run_time_limit=180)

    task_id = 'hpo-%s-%s-%d' % (dataset, classifier_id, iter_num)
    val_acc_list, test_acc_list = [], []
    val_acc_list.append(val_acc)
    test_acc_list.append(test_acc)

    for _iter in range(iter_num):
        perf, _, config = optimizer.iterate()
        val_acc_list.append(perf)
        estimator = fetch_predict_estimator(config, raw_data.data[0], raw_data.data[1])
        pred = estimator.predict(test_raw_data.data[0])
        test_perf = balanced_accuracy(test_raw_data.data[1], pred)
        test_acc_list.append(test_perf)

    print(val_acc_list)
    print(test_acc_list)

    save_path = save_dir + '%s-%d.pkl' % (task_id, run_id)
    with open(save_path, 'wb') as f:
        pickle.dump([val_acc_list, test_acc_list], f)
def evaluate_2rd_layered_bandit(run_id, mth='rb', dataset='pc4', algo='libsvm_svc',
                                cv='holdout', time_limit=120000, seed=1):
    train_data, test_data = load_train_test_data(dataset)
    bandit = SecondLayerBandit(algo, train_data, dataset_id=dataset, mth=mth,
                               seed=seed, eval_type=cv)

    _start_time = time.time()
    _iter_id = 0
    stats = list()

    while True:
        if time.time() > time_limit + _start_time or bandit.early_stopped_flag:
            break
        res = bandit.play_once()
        print('Iteration %d - %.4f' % (_iter_id, res))
        stats.append([_iter_id, time.time() - _start_time, res])
        _iter_id += 1

    print(bandit.final_rewards)
    print(bandit.action_sequence)
    print(np.mean(bandit.evaluation_cost['fe']))
    print(np.mean(bandit.evaluation_cost['hpo']))

    fe_optimizer = bandit.optimizer['fe']
    final_train_data = fe_optimizer.apply(train_data, bandit.inc['fe'])
    assert final_train_data == bandit.inc['fe']
    final_test_data = fe_optimizer.apply(test_data, bandit.inc['fe'])

    config = bandit.inc['hpo']
    evaluator = ClassificationEvaluator(config, name='fe', seed=seed, resampling_strategy='holdout')
    val_score = evaluator(None, data_node=final_train_data)
    print('==> Best validation score', val_score, res)

    X_train, y_train = final_train_data.data
    clf = fetch_predict_estimator(config, X_train, y_train)
    X_test, y_test = final_test_data.data
    y_pred = clf.predict(X_test)
    test_score = balanced_accuracy(y_test, y_pred)
    print('==> Test score', test_score)

    # Alleviate overfitting.
    y_pred1 = bandit.predict(test_data.data[0])
    test_score1 = balanced_accuracy(y_test, y_pred1)
    print('==> Test score with average ensemble', test_score1)

    y_pred2 = bandit.predict(test_data.data[0], is_weighted=True)
    test_score2 = balanced_accuracy(y_test, y_pred2)
    print('==> Test score with weighted ensemble', test_score2)

    save_path = save_folder + '%s_%s_%d_%d_%s.pkl' % (mth, dataset, time_limit, run_id, algo)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, val_score, test_score, test_score1, test_score2], f)
def conduct_hpo(optimizer='smac', dataset='pc4', classifier_id='random_forest', runcount_limit=100):
    from autosklearn.pipeline.components.classification import _classifiers
    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)

    raw_data = load_data(dataset, datanode_returned=True)
    print(set(raw_data.data[1]))
    evaluator = ClassificationEvaluator(cs.get_default_configuration(), name='hpo', data_node=raw_data)

    if optimizer == 'smac':
        optimizer = SMACOptimizer(evaluator, cs, evaluation_limit=runcount_limit, output_dir='logs')
    elif optimizer == 'psmac':
        optimizer = PSMACOptimizer(evaluator, cs, args.n, evaluation_limit=runcount_limit,
                                   output_dir='logs', trials_per_iter=args.trial)

    perf, cost, config = optimizer.iterate()
    print(perf, cost, config)
    perf, cost, config = optimizer.iterate()
    print(perf, cost, config)
def __init__(self, scorer=None, data_node=None, task_type=0, resampling_strategy='cv',
             resampling_params=None, seed=1):
    self.resampling_strategy = resampling_strategy
    self.resampling_params = resampling_params
    self.scorer = scorer if scorer is not None else balanced_accuracy_scorer
    self.data_node = data_node
    self.seed = seed
    self.eval_id = 0
    self.onehot_encoder = None
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
    self.continue_training = False

    tmp_evaluator = ClassificationEvaluator(None)
    self.tmp_bo = AnotherBayesianOptimizationOptimizer(task_type, data_node, tmp_evaluator,
                                                       'adaboost', 1, 1, 1)
def evaluate_fe_bugs(dataset, run_id, time_limit, seed):
    algorithms = ['lda', 'k_nearest_neighbors', 'libsvm_svc', 'sgd',
                  'adaboost', 'random_forest', 'extra_trees', 'decision_tree']
    algo_id = np.random.choice(algorithms, 1)[0]
    task_id = '%s-fe-%s-%d' % (dataset, algo_id, run_id)
    print(task_id)

    # Prepare the configuration for the sampled classifier.
    clf_class = _classifiers[algo_id]
    cs = clf_class.get_hyperparameter_search_space()
    clf_hp = UnParametrizedHyperparameter("estimator", algo_id)
    cs.add_hyperparameter(clf_hp)
    evaluator = ClassificationEvaluator(cs.get_default_configuration(), name='fe',
                                        seed=seed, resampling_strategy='holdout')

    pipeline = FEPipeline(fe_enabled=True, optimizer_type='eval_base',
                          time_budget=time_limit, evaluator=evaluator,
                          seed=seed, model_id=algo_id,
                          time_limit_per_trans=per_run_time_limit,
                          task_id=task_id)

    raw_data, test_raw_data = load_train_test_data(dataset)
    train_data = pipeline.fit_transform(raw_data.copy_())
    test_data = pipeline.transform(test_raw_data.copy_())
    train_data_new = pipeline.transform(raw_data.copy_())

    assert (train_data.data[0] == train_data_new.data[0]).all()
    assert (train_data.data[1] == train_data_new.data[1]).all()
    assert train_data_new == train_data

    score = evaluator(None, data_node=test_data)
    print('==> Test score', score)
def evaluate(dataset):
    train_data, test_data = load_train_test_data(dataset, test_size=0.3, task_type=MULTICLASS_CLS)
    cs = _classifiers[algo_name].get_hyperparameter_search_space()
    default_hpo_config = cs.get_default_configuration()
    metric = get_metric('bal_acc')

    fe_cs = get_task_hyperparameter_space(0, algo_name)
    default_fe_config = fe_cs.get_default_configuration()

    evaluator = ClassificationEvaluator(default_hpo_config, default_fe_config, algo_name,
                                        data_node=train_data, scorer=metric, name='hpo',
                                        resampling_strategy='holdout',
                                        output_dir='./data/exp_sys', seed=1)

    from solnml.components.optimizers.tlbo_optimizer import TlboOptimizer
    optimizer = TlboOptimizer(evaluator, cs, time_limit=300, name='hpo')
    optimizer.run()
def evaluate(mode, dataset, run_id, metric):
    print(mode, dataset, run_id, metric)
    metric = get_metric(metric)
    train_data, test_data = load_train_test_data(dataset, task_type=MULTICLASS_CLS)

    cs = _classifiers[algo_name].get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", algo_name)
    cs.add_hyperparameter(model)
    default_hpo_config = cs.get_default_configuration()

    fe_evaluator = ClassificationEvaluator(default_hpo_config, scorer=metric, name='fe',
                                           resampling_strategy='holdout', seed=1)
    hpo_evaluator = ClassificationEvaluator(default_hpo_config, scorer=metric,
                                            data_node=train_data, name='hpo',
                                            resampling_strategy='holdout', seed=1)
    fe_optimizer = BayesianOptimizationOptimizer(task_type=CLASSIFICATION, input_data=train_data,
                                                 evaluator=fe_evaluator, model_id=algo_name,
                                                 time_limit_per_trans=600, mem_limit_per_trans=5120,
                                                 number_of_unit_resource=10, seed=1)

    def objective_function(config):
        if benchmark == 'fe':
            return fe_optimizer.evaluate_function(config)
        else:
            return hpo_evaluator(config)

    meta_feature_vec = metafeature_dict[dataset]
    past_datasets = test_datasets.copy()
    if dataset in past_datasets:
        past_datasets.remove(dataset)
    past_history = load_runhistory(past_datasets)

    tlbo = TLBO_AF(objective_function, config_space, past_history,
                   dataset_metafeature=meta_feature_vec, max_runs=max_runs, acq_method='taff2')
    tlbo.run()
    print('TLBO result')
    print(tlbo.get_incumbent())
    runs = [tlbo.configurations, tlbo.perfs]
    perf = tlbo.history_container.incumbent_value

    file_saved = '%s_%s_result_%d_%d_%s.pkl' % (mode, dataset, max_runs, run_id, benchmark)
    with open(data_dir + file_saved, 'wb') as f:
        pk.dump([perf, runs], f)
def evaluate_evaluation_based_fe(dataset, time_limit, run_id, seed):
    from solnml.components.fe_optimizers.evaluation_based_optimizer import EvaluationBasedOptimizer

    # Prepare the configuration for random forest.
    from ConfigSpace.hyperparameters import UnParametrizedHyperparameter
    from autosklearn.pipeline.components.classification.random_forest import RandomForest
    cs = RandomForest.get_hyperparameter_search_space()
    clf_hp = UnParametrizedHyperparameter("estimator", 'random_forest')
    cs.add_hyperparameter(clf_hp)
    print(cs.get_default_configuration())
    """
    Configuration:
      bootstrap, Value: 'True'
      criterion, Value: 'gini'
      estimator, Constant: 'random_forest'
      max_depth, Constant: 'None'
      max_features, Value: 0.5
      max_leaf_nodes, Constant: 'None'
      min_impurity_decrease, Constant: 0.0
      min_samples_leaf, Value: 1
      min_samples_split, Value: 2
      min_weight_fraction_leaf, Constant: 0.0
      n_estimators, Constant: 100
    """
    evaluator = ClassificationEvaluator(cs.get_default_configuration(), name='fe',
                                        seed=seed, resampling_strategy='holdout')

    train_data, test_data = load_train_test_data(dataset)
    optimizer = EvaluationBasedOptimizer(MULTICLASS_CLS, train_data, evaluator,
                                         'random_forest', 300, 10000, seed, trans_set=None)

    _start_time = time.time()
    _iter_id = 0
    while True:
        if time.time() > _start_time + time_limit or optimizer.early_stopped_flag:
            break
        score, iteration_cost, inc = optimizer.iterate()
        print('%d - %.4f' % (_iter_id, score))
        _iter_id += 1

    final_train_data = optimizer.apply(train_data, optimizer.incumbent)
    val_score = evaluator(None, data_node=final_train_data)
    print('==> Best validation score', val_score, score)

    final_test_data = optimizer.apply(test_data, optimizer.incumbent)
    X_train, y_train = final_train_data.data
    clf = fetch_predict_estimator(MULTICLASS_CLS, cs.get_default_configuration(), X_train, y_train)
    X_test, y_test = final_test_data.data
    y_pred = clf.predict(X_test)

    from solnml.components.metrics.cls_metrics import balanced_accuracy
    test_score = balanced_accuracy(y_test, y_pred)
    print('==> Test score', test_score)

    save_path = save_dir + 'hmab_fe_%s_%d_%d.pkl' % (dataset, time_limit, run_id)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, val_score, test_score], f)
def evaluate_joint_perf(self):
    # Update the joint incumbent from FE and HPO.
    _perf = None
    try:
        with time_limit(self.per_run_time_limit):
            if self.task_type in CLS_TASKS:
                from solnml.components.evaluators.cls_evaluator import ClassificationEvaluator
                evaluator = ClassificationEvaluator(
                    self.local_inc['fe'].copy(), scorer=self.metric,
                    data_node=self.original_data, if_imbal=self.if_imbal,
                    timestamp=self.timestamp, seed=self.seed,
                    output_dir=self.output_dir,
                    resampling_strategy=self.eval_type,
                    resampling_params=self.resampling_params)
            else:
                from solnml.components.evaluators.rgs_evaluator import RegressionEvaluator
                evaluator = RegressionEvaluator(
                    self.local_inc['fe'].copy(), scorer=self.metric,
                    data_node=self.original_data,
                    timestamp=self.timestamp, seed=self.seed,
                    output_dir=self.output_dir,
                    resampling_strategy=self.eval_type,
                    resampling_params=self.resampling_params)
            _perf = -evaluator(self.local_inc['hpo'].copy())
    except Exception as e:
        self.logger.error(str(e))

    if _perf is not None and np.isfinite(_perf):
        _config = self.local_inc['fe'].copy()
        _config.update(self.local_inc['hpo'].copy())
        classifier_id = _config['algorithm']
        # -_perf: the larger, the better.
        save_flag, model_path, delete_flag, model_path_deleted = self.topk_saver.add(
            _config, -_perf, classifier_id)
        # By default, the evaluator has already stored the models.
        if self.eval_type in ['holdout', 'partial']:
            if save_flag:
                pass
            else:
                os.remove(model_path)
                self.logger.info("Model deleted from %s" % model_path)
            try:
                if delete_flag:
                    os.remove(model_path_deleted)
                    self.logger.info("Model deleted from %s" % model_path_deleted)
                else:
                    pass
            except:
                pass
        self.eval_dict[(self.local_inc['fe'].copy(), self.local_inc['hpo'].copy())] = [
            _perf, time.time(), SUCCESS]
        self.topk_saver.save_topk_config()
    else:
        self.eval_dict[(self.local_inc['fe'].copy(), self.local_inc['hpo'].copy())] = [
            _perf, time.time(), FAILED]

    # Update INC.
    if _perf is not None and np.isfinite(_perf) and _perf > self.incumbent_perf:
        self.inc['hpo'] = self.local_inc['hpo']
        self.inc['fe'] = self.local_inc['fe']
        self.incumbent_perf = _perf
        _incumbent = dict()
        _incumbent.update(self.inc['fe'])
        _incumbent.update(self.inc['hpo'])
        self.incumbent = _incumbent.copy()
def evaluate(mth, dataset, run_id):
    print(mth, dataset, run_id)
    train_data, test_data = load_train_test_data(dataset, test_size=0.3, task_type=MULTICLASS_CLS)

    cs = _classifiers[algo_name].get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", algo_name)
    cs.add_hyperparameter(model)
    default_hpo_config = cs.get_default_configuration()
    metric = get_metric('bal_acc')

    fe_evaluator = ClassificationEvaluator(default_hpo_config, scorer=metric, name='fe',
                                           resampling_strategy='holdout', seed=1)
    fe_optimizer = BayesianOptimizationOptimizer(task_type=MULTICLASS_CLS, input_data=train_data,
                                                 evaluator=fe_evaluator, model_id=algo_name,
                                                 time_limit_per_trans=600, mem_limit_per_trans=5120,
                                                 number_of_unit_resource=10, seed=1)
    config_space = fe_optimizer.hyperparameter_space

    def objective_function(config):
        return fe_optimizer.evaluate_function(config)

    if mth == 'gp_bo':
        bo = BO(objective_function, config_space, max_runs=max_runs)
        bo.run()
        print('new BO result')
        print(bo.get_incumbent())
        perf_bo = bo.history_container.incumbent_value
    elif mth == 'lite_bo':
        from litebo.facade.bo_facade import BayesianOptimization
        bo = BayesianOptimization(objective_function, config_space, max_runs=max_runs)
        bo.run()
        print('lite BO result')
        print(bo.get_incumbent())
        perf_bo = bo.history_container.incumbent_value
    elif mth == 'smac':
        from smac.scenario.scenario import Scenario
        from smac.facade.smac_facade import SMAC
        # Scenario object.
        scenario = Scenario({"run_obj": "quality",
                             "runcount-limit": max_runs,
                             "cs": config_space,
                             "deterministic": "true"})
        smac = SMAC(scenario=scenario, rng=np.random.RandomState(42),
                    tae_runner=objective_function)
        incumbent = smac.optimize()
        perf_bo = objective_function(incumbent)
        print('SMAC BO result')
        print(perf_bo)
    else:
        raise ValueError('Invalid method.')
    return perf_bo
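# --- Usage sketch (not part of the original benchmark) ---
# A hedged driver that compares the three optimizer back-ends handled by
# `evaluate` above. It assumes the module-level globals used inside the
# function (`algo_name`, `max_runs`, the loaders, etc.) have been set by the
# surrounding script; 'pc4' is a dataset name reused from other snippets here.
if __name__ == '__main__':
    for mth in ['gp_bo', 'lite_bo', 'smac']:
        perf_bo = evaluate(mth, 'pc4', run_id=0)
        print('%s incumbent value: %s' % (mth, str(perf_bo)))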
eval_type = 'holdout'
output_dir = args.output_dir
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

for dataset in dataset_list:
    train_data, test_data = load_train_test_data(dataset)
    for algo in algorithms:
        cs = _estimators[algo].get_hyperparameter_search_space()
        model = UnParametrizedHyperparameter("estimator", algo)
        cs.add_hyperparameter(model)
        default_hpo_config = cs.get_default_configuration()
        if task == 'cls':
            fe_evaluator = ClassificationEvaluator(default_hpo_config, scorer=metric, name='fe',
                                                   resampling_strategy=eval_type, seed=1)
            hpo_evaluator = ClassificationEvaluator(default_hpo_config, scorer=metric,
                                                    data_node=train_data, name='hpo',
                                                    resampling_strategy=eval_type, seed=1)
        else:
            fe_evaluator = RegressionEvaluator(default_hpo_config, scorer=metric, name='fe',
                                               resampling_strategy=eval_type, seed=1)
            hpo_evaluator = RegressionEvaluator(default_hpo_config, scorer=metric,
                                                data_node=train_data, name='hpo',
                                                resampling_strategy=eval_type, seed=1)
        fe_optimizer = BayesianOptimizationOptimizer(task_type=CLASSIFICATION if task == 'cls' else REGRESSION,
def __init__(self, node_list, node_index, task_type, timestamp,
             fe_config_space: ConfigurationSpace,
             cash_config_space: ConfigurationSpace,
             data: DataNode,
             fixed_config=None,
             time_limit=None, trial_num=0,
             metric='acc',
             ensemble_method='ensemble_selection', ensemble_size=50,
             per_run_time_limit=300,
             output_dir="logs", dataset_name='default_dataset',
             eval_type='holdout', resampling_params=None,
             n_jobs=1, seed=1):
    super(JointBlock, self).__init__(node_list, node_index, task_type, timestamp,
                                     fe_config_space, cash_config_space, data,
                                     fixed_config=fixed_config,
                                     time_limit=time_limit, trial_num=trial_num,
                                     metric=metric,
                                     ensemble_method=ensemble_method, ensemble_size=ensemble_size,
                                     per_run_time_limit=per_run_time_limit,
                                     output_dir=output_dir, dataset_name=dataset_name,
                                     eval_type=eval_type, resampling_params=resampling_params,
                                     n_jobs=n_jobs, seed=seed)
    self.fixed_config = fixed_config

    # Combine the configuration spaces.
    cs = ConfigurationSpace()
    if fe_config_space is not None:
        cs.add_hyperparameters(fe_config_space.get_hyperparameters())
        cs.add_conditions(fe_config_space.get_conditions())
        cs.add_forbidden_clauses(fe_config_space.get_forbiddens())
    if cash_config_space is not None:
        cs.add_hyperparameters(cash_config_space.get_hyperparameters())
        cs.add_conditions(cash_config_space.get_conditions())
        cs.add_forbidden_clauses(cash_config_space.get_forbiddens())
    self.joint_cs = cs

    # Define the evaluator and optimizer.
    if self.task_type in CLS_TASKS:
        from solnml.components.evaluators.cls_evaluator import ClassificationEvaluator
        self.evaluator = ClassificationEvaluator(
            fixed_config=fixed_config,
            scorer=self.metric,
            data_node=self.original_data,
            if_imbal=self.if_imbal,
            timestamp=self.timestamp,
            output_dir=self.output_dir,
            seed=self.seed,
            resampling_strategy=self.eval_type,
            resampling_params=self.resampling_params)
    else:
        from solnml.components.evaluators.rgs_evaluator import RegressionEvaluator
        self.evaluator = RegressionEvaluator(
            fixed_config=fixed_config,
            scorer=self.metric,
            data_node=self.original_data,
            timestamp=self.timestamp,
            output_dir=self.output_dir,
            seed=self.seed,
            resampling_strategy=self.eval_type,
            resampling_params=self.resampling_params)

    self.optimizer = build_hpo_optimizer(self.eval_type, self.evaluator, self.joint_cs,
                                         output_dir=self.output_dir,
                                         per_run_time_limit=self.per_run_time_limit,
                                         inner_iter_num_per_iter=1,
                                         timestamp=self.timestamp,
                                         seed=self.seed, n_jobs=self.n_jobs)
def __init__(self, task_type, estimator_id: str, data: DataNode, metric,
             share_fe=False, output_dir='logs',
             per_run_time_limit=120, per_run_mem_limit=5120,
             dataset_id='default', eval_type='holdout',
             mth='rb', sw_size=3, n_jobs=1, seed=1,
             fe_algo='tree_based', enable_intersection=True,
             number_of_unit_resource=2, total_resource=30):
    self.task_type = task_type
    self.metric = metric
    self.number_of_unit_resource = number_of_unit_resource
    # One unit of resource, that is, the number of trials per iteration.
    self.one_unit_of_resource = 5
    self.total_resource = total_resource
    self.per_run_time_limit = per_run_time_limit
    self.per_run_mem_limit = per_run_mem_limit
    self.estimator_id = estimator_id
    self.evaluation_type = eval_type
    self.original_data = data.copy_()
    self.share_fe = share_fe
    self.output_dir = output_dir
    self.n_jobs = n_jobs
    self.mth = mth
    self.seed = seed
    self.sliding_window_size = sw_size
    task_id = '%s-%d-%s' % (dataset_id, seed, estimator_id)
    self.logger = get_logger(self.__class__.__name__ + '-' + task_id)
    np.random.seed(self.seed)

    # Bandit settings.
    # self.arms = ['fe', 'hpo']
    self.arms = ['hpo', 'fe']
    self.rewards = dict()
    self.optimizer = dict()
    self.evaluation_cost = dict()
    self.update_flag = dict()
    # Global incumbent.
    self.inc = dict()
    self.local_inc = dict()
    self.local_hist = {'fe': [], 'hpo': []}
    for arm in self.arms:
        self.rewards[arm] = list()
        self.update_flag[arm] = False
        self.evaluation_cost[arm] = list()
    self.pull_cnt = 0
    self.action_sequence = list()
    self.final_rewards = list()
    self.incumbent_perf = float("-INF")
    self.early_stopped_flag = False
    self.enable_intersection = enable_intersection

    # Fetch the hyperparameter space.
    if self.task_type in CLS_TASKS:
        from solnml.components.models.classification import _classifiers, _addons
        if estimator_id in _classifiers:
            clf_class = _classifiers[estimator_id]
        elif estimator_id in _addons.components:
            clf_class = _addons.components[estimator_id]
        else:
            raise ValueError("Algorithm %s not supported!" % estimator_id)
        cs = clf_class.get_hyperparameter_search_space()
        model = UnParametrizedHyperparameter("estimator", estimator_id)
        cs.add_hyperparameter(model)
    elif self.task_type in REG_TASKS:
        from solnml.components.models.regression import _regressors, _addons
        if estimator_id in _regressors:
            reg_class = _regressors[estimator_id]
        elif estimator_id in _addons.components:
            reg_class = _addons.components[estimator_id]
        else:
            raise ValueError("Algorithm %s not supported!" % estimator_id)
        cs = reg_class.get_hyperparameter_search_space()
        model = UnParametrizedHyperparameter("estimator", estimator_id)
        cs.add_hyperparameter(model)
    else:
        raise ValueError("Unknown task type %s!" % self.task_type)

    self.config_space = cs
    self.default_config = cs.get_default_configuration()
    self.config_space.seed(self.seed)

    # Build the Feature Engineering component.
    if self.task_type in CLS_TASKS:
        fe_evaluator = ClassificationEvaluator(self.default_config, scorer=self.metric, name='fe',
                                               resampling_strategy=self.evaluation_type,
                                               seed=self.seed)
        hpo_evaluator = ClassificationEvaluator(self.default_config, scorer=self.metric,
                                                data_node=self.original_data, name='hpo',
                                                resampling_strategy=self.evaluation_type,
                                                seed=self.seed)
    elif self.task_type in REG_TASKS:
        fe_evaluator = RegressionEvaluator(self.default_config, scorer=self.metric, name='fe',
                                           resampling_strategy=self.evaluation_type,
                                           seed=self.seed)
        hpo_evaluator = RegressionEvaluator(self.default_config, scorer=self.metric,
                                            data_node=self.original_data, name='hpo',
                                            resampling_strategy=self.evaluation_type,
                                            seed=self.seed)
    else:
        raise ValueError('Invalid task type!')

    self.fe_algo = fe_algo
    self.optimizer['fe'] = build_fe_optimizer(self.fe_algo, self.evaluation_type,
                                              self.task_type, self.original_data,
                                              fe_evaluator, estimator_id,
                                              per_run_time_limit, per_run_mem_limit,
                                              self.seed, shared_mode=self.share_fe,
                                              n_jobs=n_jobs)
    self.inc['fe'], self.local_inc['fe'] = self.original_data, self.original_data

    # Build the HPO component.
    # trials_per_iter = max(len(self.optimizer['fe'].trans_types), 20)
    trials_per_iter = self.one_unit_of_resource * self.number_of_unit_resource
    self.optimizer['hpo'] = build_hpo_optimizer(self.evaluation_type, hpo_evaluator, cs,
                                                output_dir=output_dir,
                                                per_run_time_limit=per_run_time_limit,
                                                trials_per_iter=trials_per_iter,
                                                seed=self.seed, n_jobs=n_jobs)
    self.inc['hpo'], self.local_inc['hpo'] = self.default_config, self.default_config
    self.init_config = cs.get_default_configuration()
    self.local_hist['fe'].append(self.original_data)
    self.local_hist['hpo'].append(self.default_config)
def __init__(self, task_type, estimator_id: str, data: DataNode, metric,
             include_preprocessors=None,
             share_fe=False, output_dir='logs',
             per_run_time_limit=120, per_run_mem_limit=5120,
             dataset_id='default', eval_type='holdout',
             mth='rb', sw_size=3, n_jobs=1, seed=1,
             enable_fe=True, fe_algo='bo',
             number_of_unit_resource=2, total_resource=30,
             timestamp=None):
    self.task_type = task_type
    self.metric = metric
    self.number_of_unit_resource = number_of_unit_resource
    # One unit of resource, that is, the number of trials per iteration.
    self.one_unit_of_resource = 5
    self.total_resource = total_resource
    self.per_run_time_limit = per_run_time_limit
    self.per_run_mem_limit = per_run_mem_limit
    self.estimator_id = estimator_id
    self.include_preprocessors = include_preprocessors
    self.evaluation_type = eval_type
    self.original_data = data.copy_()
    self.share_fe = share_fe
    self.output_dir = output_dir
    self.n_jobs = n_jobs
    self.mth = mth
    self.seed = seed
    self.sliding_window_size = sw_size
    task_id = '%s-%d-%s' % (dataset_id, seed, estimator_id)
    self.logger = get_logger(self.__class__.__name__ + '-' + task_id)

    # Bandit settings.
    # self.arms = ['fe', 'hpo']
    self.arms = ['hpo', 'fe']
    self.rewards = dict()
    self.optimizer = dict()
    self.evaluation_cost = dict()
    self.update_flag = dict()
    # Global incumbent.
    self.inc = dict()
    self.local_inc = dict()
    self.local_hist = {'fe': [], 'hpo': []}
    self.inc_record = {'fe': list(), 'hpo': list()}
    self.exp_output = dict()
    self.eval_dict = {'fe': dict(), 'hpo': dict()}
    for arm in self.arms:
        self.rewards[arm] = list()
        self.update_flag[arm] = False
        self.evaluation_cost[arm] = list()
        self.exp_output[arm] = dict()
    self.pull_cnt = 0
    self.action_sequence = list()
    self.final_rewards = list()
    self.incumbent_config = None
    self.incumbent_perf = float("-INF")
    self.early_stopped_flag = False
    self.first_start = True

    self.include_text = True if TEXT in self.original_data.feature_types else False
    self.include_image = True if IMAGE in self.original_data.feature_types else False

    # Fetch the hyperparameter space.
    if self.task_type in CLS_TASKS:
        from solnml.components.models.classification import _classifiers, _addons
        _candidates = get_combined_candidtates(_classifiers, _addons)
        if estimator_id in _candidates:
            clf_class = _candidates[estimator_id]
        else:
            raise ValueError("Algorithm %s not supported!" % estimator_id)
        cs = clf_class.get_hyperparameter_search_space()
    elif self.task_type in RGS_TASKS:
        from solnml.components.models.regression import _regressors, _addons
        _candidates = get_combined_candidtates(_regressors, _addons)
        if estimator_id in _candidates:
            reg_class = _candidates[estimator_id]
        else:
            raise ValueError("Algorithm %s not supported!" % estimator_id)
        cs = reg_class.get_hyperparameter_search_space()
    else:
        raise ValueError("Unknown task type %s!" % self.task_type)

    self.config_space = cs
    self.default_config = cs.get_default_configuration()
    self.config_space.seed(self.seed)

    self.if_imbal = is_imbalanced_dataset(self.original_data)
    self.fe_config_space = get_task_hyperparameter_space(
        self.task_type, self.estimator_id,
        include_preprocessors=self.include_preprocessors,
        include_text=self.include_text,
        include_image=self.include_image,
        if_imbal=self.if_imbal)
    self.fe_default_config = self.fe_config_space.get_default_configuration()

    self.timestamp = timestamp

    # Build the Feature Engineering component.
    if self.task_type in CLS_TASKS:
        fe_evaluator = ClassificationEvaluator(
            self.default_config, self.fe_default_config, estimator_id,
            scorer=self.metric, data_node=self.original_data, name='fe',
            resampling_strategy=self.evaluation_type, if_imbal=self.if_imbal,
            seed=self.seed, output_dir=self.output_dir, timestamp=self.timestamp)
        hpo_evaluator = ClassificationEvaluator(
            self.default_config, self.fe_default_config, estimator_id,
            scorer=self.metric, data_node=self.original_data, name='hpo',
            resampling_strategy=self.evaluation_type, if_imbal=self.if_imbal,
            seed=self.seed, output_dir=self.output_dir, timestamp=self.timestamp)
    elif self.task_type in RGS_TASKS:
        fe_evaluator = RegressionEvaluator(
            self.default_config, self.fe_default_config, estimator_id,
            scorer=self.metric, data_node=self.original_data, name='fe',
            resampling_strategy=self.evaluation_type,
            seed=self.seed, output_dir=self.output_dir, timestamp=self.timestamp)
        hpo_evaluator = RegressionEvaluator(
            self.default_config, self.fe_default_config, estimator_id,
            scorer=self.metric, data_node=self.original_data, name='hpo',
            resampling_strategy=self.evaluation_type,
            seed=self.seed, output_dir=self.output_dir, timestamp=self.timestamp)
    else:
        raise ValueError('Invalid task type!')

    if self.mth != 'combined':
        self.enable_fe = enable_fe
        trials_per_iter = self.one_unit_of_resource * self.number_of_unit_resource
        self.optimizer['fe'] = build_fe_optimizer(self.evaluation_type, fe_evaluator,
                                                  self.fe_config_space,
                                                  per_run_time_limit=per_run_time_limit,
                                                  per_run_mem_limit=per_run_mem_limit,
                                                  inner_iter_num_per_iter=trials_per_iter,
                                                  output_dir=output_dir,
                                                  seed=self.seed, n_jobs=n_jobs)
        self.inc['fe'], self.local_inc['fe'] = self.fe_default_config, self.fe_default_config

        # Build the HPO component.
        # trials_per_iter = max(len(self.optimizer['fe'].trans_types), 20)
        trials_per_iter = self.one_unit_of_resource * self.number_of_unit_resource
        self.optimizer['hpo'] = build_hpo_optimizer(self.evaluation_type, hpo_evaluator, cs,
                                                    output_dir=output_dir,
                                                    per_run_time_limit=per_run_time_limit,
                                                    inner_iter_num_per_iter=trials_per_iter,
                                                    seed=self.seed, n_jobs=n_jobs)
        self.inc['hpo'], self.local_inc['hpo'] = self.default_config, self.default_config
        self.init_config = cs.get_default_configuration()
        self.local_hist['fe'].append(self.fe_default_config)
        self.local_hist['hpo'].append(self.default_config)
    else:
        self.rewards = list()
        self.evaluation_cost = list()
        self.eval_dict = {}
        trials_per_iter = self.one_unit_of_resource * self.number_of_unit_resource
        if self.task_type in CLS_TASKS:
            from solnml.utils.combined_cls_evaluator import get_combined_cs
            from solnml.utils.combined_cls_evaluator import CombinedClassificationEvaluator as CombinedEvaluator
        else:
            from solnml.utils.combined_rgs_evaluator import get_combined_cs
            from solnml.utils.combined_rgs_evaluator import CombinedRegressionEvaluator as CombinedEvaluator
        self.evaluator = CombinedEvaluator(estimator_id,
                                           scorer=self.metric,
                                           data_node=self.original_data,
                                           if_imbal=self.if_imbal,
                                           timestamp=self.timestamp,
                                           output_dir=self.output_dir,
                                           resampling_strategy=self.evaluation_type)
        cs = get_combined_cs(self.estimator_id, self.task_type,
                             include_image=self.include_image,
                             include_text=self.include_text,
                             include_preprocessors=self.include_preprocessors,
                             if_imbal=self.if_imbal)
        self.optimizer = build_hpo_optimizer(self.evaluation_type, self.evaluator, cs,
                                             output_dir=self.output_dir,
                                             per_run_time_limit=self.per_run_time_limit,
                                             inner_iter_num_per_iter=trials_per_iter,
                                             seed=self.seed, n_jobs=self.n_jobs)
def optimize(self):
    if self.inner_opt_algorithm in ['rb_hpo', 'fixed', 'alter_hpo', 'alter', 'combined']:
        self.optimize_explore_first()
    elif self.inner_opt_algorithm == 'equal':
        self.optimize_equal_resource()
    else:
        raise ValueError('Unsupported optimization method: %s!' % self.inner_opt_algorithm)

    scores = list()
    for _arm in self.arms:
        scores.append(self.sub_bandits[_arm].incumbent_perf)
    scores = np.array(scores)
    algo_idx = np.argmax(scores)
    self.optimal_algo_id = self.arms[algo_idx]
    self.incumbent_perf = scores[algo_idx]
    _threshold, _ensemble_size = self.incumbent_perf * 0.90, 5
    if self.incumbent_perf < 0.:
        _threshold = self.incumbent_perf / 0.9

    idxs = np.argsort(-scores)[:_ensemble_size]
    _algo_ids = [self.arms[idx] for idx in idxs]
    self.nbest_algo_ids = list()
    for _idx, _arm in zip(idxs, _algo_ids):
        if scores[_idx] >= _threshold:
            self.nbest_algo_ids.append(_arm)
    assert len(self.nbest_algo_ids) > 0

    self.logger.info('=' * 50)
    self.logger.info('Best_algo_perf: %s' % str(self.incumbent_perf))
    self.logger.info('Best_algo_id: %s' % str(self.optimal_algo_id))
    self.logger.info('Nbest_algo_ids: %s' % str(self.nbest_algo_ids))
    self.logger.info('Arm candidates: %s' % str(self.arms))
    self.logger.info('Best val scores: %s' % str(list(scores)))
    self.logger.info('=' * 50)

    if self.inner_opt_algorithm == 'combined':
        # A temporary optimizer for recording FE transformations.
        tmp_evaluator = ClassificationEvaluator(None)
        self.tmp_bo = AnotherBayesianOptimizationOptimizer(0, self.original_data, tmp_evaluator,
                                                           'adaboost', 1, 1, 1)
        # Fit the best model.
        best_config = self.sub_bandits[self.optimal_algo_id].incumbent_config
        self.best_node = self.tmp_bo.fetch_nodes_by_config([best_config])[0]
        best_estimator = fetch_predict_estimator(self.task_type, best_config,
                                                 self.best_node.data[0], self.best_node.data[1],
                                                 weight_balance=self.best_node.enable_balance,
                                                 data_balance=self.best_node.data_balance,
                                                 combined=True)
    else:
        # Fit the best model.
        self.fe_optimizer = self.sub_bandits[self.optimal_algo_id].optimizer['fe']
        if self.fe_algo == 'bo':
            self.fe_optimizer.fetch_nodes(1)
        best_config = self.sub_bandits[self.optimal_algo_id].inc['hpo']
        best_estimator = fetch_predict_estimator(self.task_type, best_config,
                                                 self.best_data_node.data[0],
                                                 self.best_data_node.data[1],
                                                 weight_balance=self.best_data_node.enable_balance,
                                                 data_balance=self.best_data_node.data_balance)

    with open(os.path.join(self.output_dir, '%s-best_model' % self.timestamp), 'wb') as f:
        pkl.dump(best_estimator, f)

    if self.ensemble_method is not None:
        if self.inner_opt_algorithm == 'combined':
            eval_dict = {key: self.sub_bandits[key].eval_dict for key in self.include_algorithms}
            stats = fetch_ensemble_members(self.nbest_algo_ids, self.seed, eval_dict, self.tmp_bo)
            from solnml.components.ensemble.combined_ensemble.ensemble_bulider import EnsembleBuilder
        else:
            # stats = self.fetch_ensemble_members_ano()
            stats = self.fetch_ensemble_members()
            from solnml.components.ensemble import EnsembleBuilder

        # Ensemble all the intermediate and final models found in the optimization process above.
        self.es = EnsembleBuilder(stats=stats,
                                  ensemble_method=self.ensemble_method,
                                  ensemble_size=self.ensemble_size,
                                  task_type=self.task_type,
                                  metric=self.metric,
                                  output_dir=self.output_dir)
        self.es.fit(data=self.original_data)
cs = get_combined_cs(estimator_id)
op[estimator_id] = SMACOptimizer(evaluator, cs, inner_iter_num_per_iter=10)

# Iterate (modify the search strategy here).
for estimator_id in estimator_ids:
    op[estimator_id].iterate()

# Fetch the ensemble members.
eval_dict = dict()
for estimator_id in estimator_ids:
    eval_dict[estimator_id] = op[estimator_id].eval_dict

# Important: specify the n-best ids according to the search strategy.
nbest_ids = estimator_ids

# A temporary optimizer for recording FE transformations.
tmp_evaluator = ClassificationEvaluator(None)
tmp_bo = BayesianOptimizationOptimizer(0, train_node, tmp_evaluator, 'adaboost', 1, 1, 1)

stats = fetch_ensemble_members(nbest_ids, seed, eval_dict, record_op=tmp_bo)
es = EnsembleBuilder(stats, 'ensemble_selection', 50,
                     task_type=0, metric=balanced_accuracy_scorer,
                     output_dir='logs/')
es.fit(train_node)
def evaluate_2rd_bandit(dataset, algo, time_limit, run_id, seed):
    print('HMAB-%s-%s: run_id=%d' % (dataset, algo, run_id))
    print('==> Start to Evaluate', dataset, 'Budget', time_limit)
    train_data, test_data = load_train_test_data(dataset)
    enable_intersect = True
    bandit = SecondLayerBandit(algo, train_data, per_run_time_limit=300,
                               seed=seed, eval_type='holdout', mth='alter_hpo',
                               enable_intersection=enable_intersect)
    mth_id = 'hmab' if enable_intersect else 'hmab0'
    _start_time = time.time()
    _iter_id = 0
    stats = list()

    while True:
        if time.time() > time_limit + _start_time or bandit.early_stopped_flag:
            break
        res = bandit.play_once()
        print('Iteration %d - %.4f' % (_iter_id, res))
        stats.append([_iter_id, time.time() - _start_time, res])
        _iter_id += 1

    print(bandit.final_rewards)
    print(bandit.action_sequence)
    print(np.mean(bandit.evaluation_cost['fe']))
    print(np.mean(bandit.evaluation_cost['hpo']))

    fe_optimizer = bandit.optimizer['fe']
    final_train_data = fe_optimizer.apply(train_data, bandit.inc['fe'])
    assert final_train_data == bandit.inc['fe']
    final_test_data = fe_optimizer.apply(test_data, bandit.inc['fe'])

    config = bandit.inc['hpo']
    evaluator = ClassificationEvaluator(config, name='fe', seed=seed, resampling_strategy='holdout')
    val_score = evaluator(None, data_node=final_train_data)
    print('==> Best validation score', val_score, res)

    X_train, y_train = final_train_data.data
    clf = fetch_predict_estimator(config, X_train, y_train)
    X_test, y_test = final_test_data.data
    y_pred = clf.predict(X_test)
    test_score = balanced_accuracy(y_test, y_pred)
    print('==> Test score', test_score)

    # Alleviate overfitting.
    y_pred1 = bandit.predict(test_data.data[0])
    test_score1 = balanced_accuracy(y_test, y_pred1)
    print('==> Test score with average ensemble', test_score1)

    y_pred2 = bandit.predict(test_data.data[0], is_weighted=True)
    test_score2 = balanced_accuracy(y_test, y_pred2)
    print('==> Test score with weighted ensemble', test_score2)

    save_path = save_dir + '%s_2rd_bandit_%s_%d_%d_%s.pkl' % (mth_id, dataset, time_limit, run_id, algo)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, val_score, test_score, test_score1, test_score2], f)
def evaluate(mode, dataset, run_id, metric):
    print(mode, dataset, run_id, metric)
    metric = get_metric(metric)
    train_data, test_data = load_train_test_data(dataset, task_type=MULTICLASS_CLS)

    cs = _classifiers[algo_name].get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", algo_name)
    cs.add_hyperparameter(model)
    default_hpo_config = cs.get_default_configuration()

    fe_evaluator = ClassificationEvaluator(default_hpo_config, scorer=metric, name='fe',
                                           resampling_strategy='holdout', seed=1)
    hpo_evaluator = ClassificationEvaluator(default_hpo_config, scorer=metric,
                                            data_node=train_data, name='hpo',
                                            resampling_strategy='holdout', seed=1)
    fe_optimizer = BayesianOptimizationOptimizer(task_type=CLASSIFICATION, input_data=train_data,
                                                 evaluator=fe_evaluator, model_id=algo_name,
                                                 time_limit_per_trans=600, mem_limit_per_trans=5120,
                                                 number_of_unit_resource=10, seed=1)

    def objective_function(config):
        if benchmark == 'fe':
            return fe_optimizer.evaluate_function(config)
        else:
            return hpo_evaluator(config)

    if mode == 'bo':
        bo = BO(objective_function, config_space, max_runs=max_runs, surrogate_model='prob_rf')
        bo.run()
        print('BO result')
        print(bo.get_incumbent())
        perf = bo.history_container.incumbent_value
        runs = [bo.configurations, bo.perfs]
    elif mode == 'lite_bo':
        from litebo.facade.bo_facade import BayesianOptimization
        bo = BayesianOptimization(objective_function, config_space, max_runs=max_runs)
        bo.run()
        print('BO result')
        print(bo.get_incumbent())
        perf = bo.history_container.incumbent_value
        runs = [bo.configurations, bo.perfs]
    elif mode.startswith('tlbo'):
        _, gp_fusion = mode.split('_')
        meta_feature_vec = metafeature_dict[dataset]
        past_datasets = test_datasets.copy()
        if dataset in past_datasets:
            past_datasets.remove(dataset)
        past_history = load_runhistory(past_datasets)
        gp_models = [gp_models_dict[dataset_name] for dataset_name in past_datasets]
        tlbo = TLBO(objective_function, config_space, past_history, gp_models=gp_models,
                    dataset_metafeature=meta_feature_vec, max_runs=max_runs, gp_fusion=gp_fusion)
        tlbo.run()
        print('TLBO result')
        print(tlbo.get_incumbent())
        runs = [tlbo.configurations, tlbo.perfs]
        perf = tlbo.history_container.incumbent_value
    else:
        raise ValueError('Invalid mode.')

    file_saved = '%s_%s_%s_result_%d_%d_%s.pkl' % (mode, algo_name, dataset, max_runs, run_id, benchmark)
    with open(data_dir + file_saved, 'wb') as f:
        pk.dump([perf, runs], f)
def prepare_optimizer(self, _arm):
    trials_per_iter = self.one_unit_of_resource * self.number_of_unit_resource
    if _arm == 'fe':
        # Build the Feature Engineering component.
        self.original_data._node_id = -1
        inc_hpo = copy.deepcopy(self.inc['hpo'])
        if self.task_type in CLS_TASKS:
            fe_evaluator = ClassificationEvaluator(inc_hpo, self.fe_default_config, self.estimator_id,
                                                   data_node=self.original_data, scorer=self.metric,
                                                   name='fe', resampling_strategy=self.evaluation_type,
                                                   if_imbal=self.if_imbal, seed=self.seed,
                                                   output_dir=self.output_dir, timestamp=self.timestamp)
        elif self.task_type in RGS_TASKS:
            fe_evaluator = RegressionEvaluator(inc_hpo, self.fe_default_config, self.estimator_id,
                                               data_node=self.original_data, scorer=self.metric,
                                               name='fe', resampling_strategy=self.evaluation_type,
                                               seed=self.seed, output_dir=self.output_dir,
                                               timestamp=self.timestamp)
        else:
            raise ValueError('Invalid task type!')
        self.optimizer[_arm] = build_fe_optimizer(self.evaluation_type, fe_evaluator,
                                                  self.fe_config_space,
                                                  per_run_time_limit=self.per_run_time_limit,
                                                  per_run_mem_limit=self.per_run_mem_limit,
                                                  inner_iter_num_per_iter=trials_per_iter,
                                                  output_dir=self.output_dir,
                                                  seed=self.seed, n_jobs=self.n_jobs)
    else:
        # trials_per_iter = self.optimizer['fe'].evaluation_num_last_iteration // 2
        # trials_per_iter = max(20, trials_per_iter)
        inc_fe = copy.deepcopy(self.inc['fe'])
        if self.task_type in CLS_TASKS:
            hpo_evaluator = ClassificationEvaluator(self.default_config, inc_fe, self.estimator_id,
                                                    scorer=self.metric, data_node=self.original_data,
                                                    name='hpo', resampling_strategy=self.evaluation_type,
                                                    if_imbal=self.if_imbal, seed=self.seed,
                                                    output_dir=self.output_dir, timestamp=self.timestamp)
        elif self.task_type in RGS_TASKS:
            hpo_evaluator = RegressionEvaluator(self.default_config, inc_fe, self.estimator_id,
                                                scorer=self.metric, data_node=self.original_data,
                                                name='hpo', resampling_strategy=self.evaluation_type,
                                                seed=self.seed, output_dir=self.output_dir,
                                                timestamp=self.timestamp)
        else:
            raise ValueError('Invalid task type!')
        self.optimizer[_arm] = build_hpo_optimizer(self.evaluation_type, hpo_evaluator,
                                                   self.config_space,
                                                   output_dir=self.output_dir,
                                                   per_run_time_limit=self.per_run_time_limit,
                                                   inner_iter_num_per_iter=trials_per_iter,
                                                   seed=self.seed)

    self.logger.debug('=' * 30)
    self.logger.debug('UPDATE OPTIMIZER: %s' % _arm)
    self.logger.debug('=' * 30)