def evaluate_hmab(algorithms, dataset, run_id, trial_num, seed, time_limit=1200):
    # Note: ``hmab_flag``, ``opt_algo``, ``per_run_time_limit`` and
    # ``project_dir`` are module-level globals set by the surrounding script.
    print('%s-%s-%d: %d' % (hmab_flag, dataset, run_id, time_limit))

    # Hold the test datasets out of the meta-learner's training corpus.
    exclude_datasets = ['gina_prior2', 'pc2', 'abalone', 'wind',
                        'waveform-5000(2)', 'page-blocks(1)',
                        'winequality_white', 'pollen']
    alad = AlgorithmAdvisor(task_type=MULTICLASS_CLS, n_algorithm=9,
                            metric='bal_acc', exclude_datasets=exclude_datasets)
    n_algo = 5
    assert dataset in exclude_datasets
    meta_infos = alad.fit_meta_learner()
    assert dataset not in meta_infos

    # Keep at most ``n_algo`` recommended algorithms that are also available.
    model_candidates = alad.fetch_algorithm_set(dataset)
    include_models = list()
    print(model_candidates)
    for algo in model_candidates:
        if algo in algorithms and len(include_models) < n_algo:
            include_models.append(algo)
    print('After algorithm recommendation', include_models)

    _start_time = time.time()
    train_data, test_data = load_train_test_data(dataset, task_type=MULTICLASS_CLS)
    cls_task_type = BINARY_CLS if len(set(train_data.data[1])) == 2 else MULTICLASS_CLS
    balanced_acc_metric = make_scorer(balanced_accuracy)

    # Oversample minority classes before optimization if the data is skewed.
    if is_unbalanced_dataset(train_data):
        from solnml.components.feature_engineering.transformations.balancer.smote_balancer import DataBalancer
        train_data = DataBalancer().operate(train_data)

    bandit = FirstLayerBandit(cls_task_type, trial_num, include_models, train_data,
                              output_dir='logs',
                              per_run_time_limit=per_run_time_limit,
                              dataset_name=dataset,
                              ensemble_size=50,
                              inner_opt_algorithm=opt_algo,
                              metric=balanced_acc_metric,
                              fe_algo='bo',
                              seed=seed,
                              time_limit=time_limit,
                              eval_type='holdout')
    bandit.optimize()
    time_taken = time.time() - _start_time
    model_desc = [bandit.nbest_algo_ids, bandit.optimal_algo_id,
                  bandit.final_rewards, bandit.action_sequence]

    validation_accuracy = np.max(bandit.final_rewards)
    best_pred = bandit._best_predict(test_data)
    test_accuracy = balanced_accuracy(test_data.data[1], best_pred)

    # Refit on the full training data, then evaluate the ensemble prediction.
    bandit.refit()
    es_pred = bandit._es_predict(test_data)
    test_accuracy_with_ens = balanced_accuracy(test_data.data[1], es_pred)

    data = [dataset, validation_accuracy, test_accuracy, test_accuracy_with_ens,
            time_taken, model_desc]
    print(model_desc)
    print(data)

    save_path = project_dir + '%s_%s_%s_%d_%d_%d_%d_%d.pkl' % (
        hmab_flag, opt_algo, dataset, trial_num, len(algorithms),
        seed, run_id, time_limit)
    with open(save_path, 'wb') as f:
        pickle.dump(data, f)
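
# A minimal driver sketch for the entry point above, assuming the module-level
# globals it references (``hmab_flag``, ``opt_algo``, ``per_run_time_limit``,
# ``project_dir``) are normally set by the surrounding benchmark script. The
# values and algorithm names below are illustrative placeholders, not the
# original experiment settings.
if __name__ == '__main__':
    hmab_flag = 'hmab'
    opt_algo = 'fixed'
    per_run_time_limit = 300
    project_dir = './results/'
    # 'pc2' satisfies the function's assertion that the dataset is held out.
    evaluate_hmab(algorithms=['random_forest', 'libsvm_svc', 'k_nearest_neighbors'],
                  dataset='pc2', run_id=0, trial_num=100, seed=1)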
def fit(self, train_data: DataNode, dataset_id=None):
    """
    This function includes the following two procedures:
        1. tune each algorithm's hyperparameters.
        2. engineer each algorithm's features automatically.
    :param train_data:
    :return:
    """
    if self.enable_meta_algorithm_selection:
        try:
            # Recommend candidate algorithms with the meta-learner, then keep
            # at most ``n_algo`` of them that the user also allows.
            alad = AlgorithmAdvisor(task_type=self.task_type, n_algorithm=9,
                                    metric=self.metric_id)
            n_algo = 5
            model_candidates = alad.fetch_algorithm_set(train_data,
                                                        dataset_id=dataset_id)
            include_models = list()
            for algo in model_candidates:
                if algo in self.include_algorithms and len(include_models) < n_algo:
                    include_models.append(algo)
            self.include_algorithms = include_models
            self.logger.info('Executing meta-learning based algorithm recommendation!')
            self.logger.info('Algorithms recommended: %s' % ','.join(self.include_algorithms))
        except Exception as e:
            self.logger.error('Meta-learning failed: %s' % str(e))

    # Check whether this dataset is balanced or not.
    if self.task_type in CLS_TASKS and is_unbalanced_dataset(train_data):
        # self.include_algorithms = imb_classication_algorithms
        self.logger.info('Input dataset is imbalanced!')
        train_data = DataBalancer().operate(train_data)

    # Default budget: 30 trials per candidate algorithm.
    if self.amount_of_resource is None:
        trial_num = len(self.include_algorithms) * 30
    else:
        trial_num = self.amount_of_resource

    self.solver = FirstLayerBandit(self.task_type, trial_num,
                                   self.include_algorithms, train_data,
                                   per_run_time_limit=self.per_run_time_limit,
                                   dataset_name=self.dataset_name,
                                   ensemble_method=self.ensemble_method,
                                   ensemble_size=self.ensemble_size,
                                   inner_opt_algorithm='fixed',
                                   metric=self.metric,
                                   fe_algo='bo',
                                   seed=self.seed,
                                   time_limit=self.time_limit,
                                   eval_type=self.evaluation_type,
                                   output_dir=self.output_dir)
    self.solver.optimize()
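
# A hedged usage sketch of the ``fit`` method above, assuming it is exposed
# through the package's top-level ``Classifier`` estimator (as in the soln-ml
# README) and reusing the ``load_train_test_data`` helper seen in the
# benchmark scripts; the constructor arguments are illustrative and not
# verified against this exact revision.
from solnml.estimators import Classifier

train_data, test_data = load_train_test_data('pc2', task_type=MULTICLASS_CLS)
clf = Classifier(time_limit=1200)
clf.fit(train_data)  # meta-learning recommendation, balancing, then bandit search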
def evaluate_autosklearn(algorithms, dataset, run_id, trial_num, seed, time_limit=1200):
    # Note: ``per_run_time_limit`` and ``project_dir`` are module-level
    # globals set by the surrounding script.
    print('AUSK-%s-%d: %d' % (dataset, run_id, time_limit))

    # Select the auto-sklearn variant: meta-learning based algorithm
    # recommendation, no meta-learning, the full system, or a vanilla run.
    ausk_flag = 'ausk_full'
    if ausk_flag == 'ausk_alad':
        alad = AlgorithmAdvisor(task_type=MULTICLASS_CLS, n_algorithm=9, metric='acc')
        meta_infos = alad.fit_meta_learner()
        assert dataset not in meta_infos
        model_candidates = alad.fetch_algorithm_set(dataset)
        include_models = list()
        print(model_candidates)
        for algo in model_candidates:
            if algo in algorithms and len(include_models) < 3:
                include_models.append(algo)
        print('After algorithm recommendation', include_models)
        n_config_meta_learning = 0
        ensemble_size = 1
    elif ausk_flag == 'ausk_no_meta':
        include_models = algorithms
        n_config_meta_learning = 25
        ensemble_size = 1
    elif ausk_flag == 'ausk_full':
        include_models = algorithms
        n_config_meta_learning = 25
        ensemble_size = 50
    else:
        include_models = algorithms
        n_config_meta_learning = 0
        ensemble_size = 1

    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=time_limit,
        per_run_time_limit=per_run_time_limit,
        include_preprocessors=None,
        exclude_preprocessors=None,
        n_jobs=1,
        include_estimators=include_models,
        ensemble_memory_limit=8192,
        ml_memory_limit=8192,
        ensemble_size=ensemble_size,
        ensemble_nbest=ensemble_size,
        initial_configurations_via_metalearning=n_config_meta_learning,
        seed=int(seed),
        resampling_strategy='holdout',
        resampling_strategy_arguments={'train_size': 0.67}
    )
    print(automl)

    train_data, test_data = load_train_test_data(dataset, task_type=MULTICLASS_CLS)
    X, y = train_data.data
    feat_type = ['Categorical' if _type == CATEGORICAL else 'Numerical'
                 for _type in train_data.feature_types]
    from autosklearn.metrics import balanced_accuracy
    automl.fit(X.copy(), y.copy(), metric=balanced_accuracy, feat_type=feat_type)
    model_desc = automl.show_models()
    print(model_desc)

    val_result = np.max(automl.cv_results_['mean_test_score'])
    print('Trial number', len(automl.cv_results_['mean_test_score']))
    print('Best validation accuracy', val_result)

    # Refit on the full training set, then score on the held-out test set.
    X_test, y_test = test_data.data
    automl.refit(X.copy(), y.copy())
    y_pred = automl.predict(X_test)
    metric = balanced_accuracy
    test_result = metric(y_test, y_pred)
    print('Test accuracy', test_result)

    save_path = project_dir + '%s_%s_%d_%d_%d_%d_%d.pkl' % (
        ausk_flag, dataset, trial_num, len(algorithms), seed, run_id, time_limit)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, val_result, test_result, model_desc], f)
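
# Similarly, a hypothetical driver for the auto-sklearn baseline above; again,
# ``per_run_time_limit`` and ``project_dir`` are placeholders for globals the
# real benchmark script derives from its command-line arguments.
if __name__ == '__main__':
    per_run_time_limit = 300
    project_dir = './results/'
    evaluate_autosklearn(algorithms=['random_forest', 'libsvm_svc'],
                         dataset='pc2', run_id=0, trial_num=100, seed=1)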
import os
import sys

import numpy as np

sys.path.append(os.getcwd())
from solnml.components.meta_learning.algorithm_recomendation.algorithm_advisor import AlgorithmAdvisor
from solnml.components.meta_learning.algorithm_recomendation.meta_generator import get_feature_vector
from solnml.components.utils.constants import MULTICLASS_CLS

test_datasets = ['gina_prior2', 'pc2', 'abalone', 'wind', 'waveform-5000(2)',
                 'page-blocks(1)', 'winequality_white', 'pollen']

alad = AlgorithmAdvisor(task_type=MULTICLASS_CLS, exclude_datasets=test_datasets,
                        n_algorithm=5, metric='bal_acc')
meta_infos = alad.fit_meta_learner()
datasets = [item[0] for item in meta_infos]
print(datasets)  # 1.0, 2.0


def topk(l1, l2):
    score = 0
    for item in l1[:5]:
        if item in l2[:5]:
            score += 1
    return score


scores = list()
for test_dataset in test_datasets:
    print(test_dataset in datasets)
    meta_feature = get_feature_vector(test_dataset, dataset_id=test_dataset,
                                      task_type=MULTICLASS_CLS)
    algorithms, preds = alad.predict_meta_learner(meta_feature)
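
# Worked example of the ``topk`` overlap score on two hypothetical rankings:
# 'rf', 'xgb' and 'mlp' from the first list's top 5 also appear in the second
# list's top 5, so the score is 3.
assert topk(['rf', 'xgb', 'svm', 'knn', 'mlp'],
            ['xgb', 'rf', 'gbdt', 'mlp', 'nb']) == 3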