def evaluate_autosklearn(algorithms, dataset='credit', time_limit=1200, seed=1):
    """Benchmark vanilla auto-sklearn on `dataset` restricted to `algorithms`.

    Fits an AutoSklearnClassifier (5-fold CV, single best model, meta-learning
    disabled) within `time_limit` seconds and pickles the per-run CV scores and
    fit times.

    NOTE(review): relies on module-level `per_run_time_limit` and `project_dir`
    being defined elsewhere in this file — confirm before running standalone.
    """
    print('==> Start to evaluate', dataset, 'budget', time_limit)
    include_models = algorithms
    # ensemble_size=1 / ensemble_nbest=1: report the single best pipeline,
    # not an ensemble, so results are comparable with the bandit runs below.
    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=time_limit,
        per_run_time_limit=per_run_time_limit,
        include_preprocessors=None,
        exclude_preprocessors=None,
        n_jobs=1,
        include_estimators=include_models,
        ensemble_memory_limit=8192,
        ml_memory_limit=8192,
        ensemble_size=1,
        ensemble_nbest=1,
        initial_configurations_via_metalearning=0,
        seed=seed,
        resampling_strategy='cv',
        resampling_strategy_arguments={'folds': 5}
    )
    print(automl)
    raw_data = load_data(dataset, datanode_returned=True)
    X, y = raw_data.data
    # Copies guard the cached data node against in-place mutation by auto-sklearn.
    automl.fit(X.copy(), y.copy())
    model_desc = automl.show_models()
    print(model_desc)
    test_results = automl.cv_results_['mean_test_score']
    time_records = automl.cv_results_['mean_fit_time']
    best_result = np.max(test_results)
    print('Validation Accuracy', best_result)
    save_path = project_dir + 'data/ausk_%s_%d.pkl' % (dataset, len(algorithms))
    with open(save_path, 'wb') as f:
        pickle.dump([test_results, time_records, time_limit, model_desc], f)
def evaluate_1stlayer_bandit(run_id, B, algorithms, dataset='credit', trial_num=200, seed=1):
    """Run the first-layer bandit with the discounted-UCB strategy and pickle
    the run statistics.

    :param run_id: repetition index, only used in the output file name.
    :param B: exploration/discount parameter assigned onto the bandit before
        optimization (consumed by the 'discounted_ucb' strategy).
    :return: wall-clock time of the whole run in seconds.

    NOTE(review): relies on module-level `per_run_time_limit` and `project_dir`.
    """
    _start_time = time.time()
    raw_data = load_data(dataset, datanode_returned=True)
    bandit = FirstLayerBandit(trial_num, algorithms, raw_data,
                              output_dir='logs',
                              per_run_time_limit=per_run_time_limit,
                              dataset_name=dataset,
                              eval_type='holdout',
                              seed=seed)
    # The bandit exposes B as a plain attribute rather than a constructor arg.
    bandit.B = B
    bandit.optimize(strategy='discounted_ucb')
    print(bandit.final_rewards)
    print(bandit.action_sequence)
    time_cost = time.time() - _start_time

    save_folder = project_dir + 'data/1stlayer-mab/'
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    save_path = save_folder + 'eval_ducb_%.4f_%s_%d_%d_%d.pkl' % (
        B, dataset, run_id, trial_num, len(algorithms))
    with open(save_path, 'wb') as f:
        data = [
            bandit.final_rewards,
            bandit.time_records,
            bandit.action_sequence,
            time_cost
        ]
        pickle.dump(data, f)
    return time_cost
def conduct_hpo(dataset='pc4', classifier_id='random_forest', iter_num=100, iter_mode=True):
    """Hyper-parameter optimization of a single classifier via SMAC.

    Builds the auto-sklearn config space for `classifier_id`, pins the
    "estimator" hyperparameter, then either runs one budgeted SMAC
    optimization (iter_mode=False) or `iter_num` single-trial iterations,
    printing the per-iteration validation performance.
    """
    from autosklearn.pipeline.components.classification import _classifiers
    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    # Fix the estimator choice so SMAC only tunes this classifier's params.
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)
    raw_data = load_data(dataset, datanode_returned=True)
    print(set(raw_data.data[1]))  # sanity check: class labels present
    evaluator = ClassificationEvaluator(cs.get_default_configuration(), name='hpo', data_node=raw_data)
    if not iter_mode:
        optimizer = SMACOptimizer(evaluator, cs, evaluation_limit=600, output_dir='logs')
        inc, val = optimizer.optimize()
        print(inc, val)
    else:
        import time
        _start_time = time.time()
        # trials_per_iter=1 lets us record performance after every single trial.
        optimizer = SMACOptimizer(
            evaluator, cs, trials_per_iter=1, output_dir='logs', per_run_time_limit=180
        )
        results = list()
        for _iter in range(iter_num):
            perf, _, _ = optimizer.iterate()
            print(_iter, perf)
            results.append(perf)
        print(results)
        print(time.time() - _start_time)
def conduct_hpo(optimizer='smac', dataset='pc4', classifier_id='random_forest', runcount_limit=100):
    """Hyper-parameter optimization of a single classifier with SMAC or pSMAC.

    :param optimizer: 'smac' for sequential SMAC, 'psmac' for the parallel
        variant (pSMAC reads worker count / trials from the module-level
        `args` namespace).
    :param runcount_limit: evaluation budget passed to the optimizer.
    :raises ValueError: if `optimizer` is not 'smac' or 'psmac'.
    """
    from autosklearn.pipeline.components.classification import _classifiers
    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    # Fix the estimator choice so only this classifier's params are tuned.
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)
    raw_data = load_data(dataset, datanode_returned=True)
    print(set(raw_data.data[1]))  # sanity check: class labels present
    evaluator = ClassificationEvaluator(cs.get_default_configuration(), name='hpo', data_node=raw_data)

    # BUG FIX: previously the string parameter `optimizer` was re-bound to the
    # optimizer object, and an unknown value fell through to `.iterate()` on a
    # str. Use a distinct local and fail fast on bad input instead.
    if optimizer == 'smac':
        hpo_optimizer = SMACOptimizer(evaluator, cs, evaluation_limit=runcount_limit, output_dir='logs')
    elif optimizer == 'psmac':
        hpo_optimizer = PSMACOptimizer(evaluator, cs, args.n, evaluation_limit=runcount_limit,
                                       output_dir='logs', trials_per_iter=args.trial)
    else:
        raise ValueError("Unsupported optimizer: %r (expected 'smac' or 'psmac')" % optimizer)

    perf, cost, config = hpo_optimizer.iterate()
    print(perf, cost, config)
    perf, cost, config = hpo_optimizer.iterate()
    print(perf, cost, config)
def evaluate_evaluation_based_fe(dataset, time_limit, seed=1):
    """Evaluate evaluation-based feature engineering with a default random
    forest, pickle the resulting validation score, and return it.

    NOTE(review): relies on a module-level `proj_dir` for the output path.
    """
    # Prepare the configuration for random forest.
    from ConfigSpace.hyperparameters import UnParametrizedHyperparameter
    from autosklearn.pipeline.components.classification.random_forest import RandomForest
    cs = RandomForest.get_hyperparameter_search_space()
    clf_hp = UnParametrizedHyperparameter("estimator", 'random_forest')
    cs.add_hyperparameter(clf_hp)
    evaluator = Evaluator(cs.get_default_configuration(), name='fe', seed=seed)

    raw_data = load_data(dataset, datanode_returned=True)
    # Evaluation-based FE: transformations are scored by the evaluator itself,
    # each transformation capped at 300 seconds.
    pipeline = FEPipeline(fe_enabled=True, optimizer_type='eval_base',
                          time_budget=time_limit, evaluator=evaluator,
                          seed=seed, model_id='random_forest',
                          time_limit_per_trans=300)
    train_data = pipeline.fit_transform(raw_data)
    # config=None: score the transformed data with the default configuration.
    score = evaluator(None, data_node=train_data)
    print('==> Base validation score', score)
    save_path = proj_dir + 'data/fe_%s_%d.pkl' % (dataset, time_limit)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, score], f)
    return score
def evaluate_2rd_layered_bandit(run_id, mth='rb', dataset='pc4', algo='libsvm_svc',
                                cv='holdout', iter_num=100, time_limit=120000, seed=1):
    """Run the second-layer bandit for one algorithm and pickle the statistics.

    Plays up to `iter_num` rounds (stopping early once `time_limit` seconds
    have elapsed), logging reward and timing after each round.

    :param mth: bandit method; 'alter-rb' maps to mth='alter' with the 'rb'
        strategy, anything else uses the 'avg' strategy.
    """
    raw_data = load_data(dataset, datanode_returned=True)
    strategy = 'avg' if mth != 'alter-rb' else 'rb'
    mth_id = mth if mth != 'alter-rb' else 'alter'
    bandit = SecondLayerBandit(algo, raw_data, dataset_id=dataset, mth=mth_id,
                               strategy=strategy, seed=seed, eval_type=cv)

    _start_time = time.time()
    stats = list()
    for _iter in range(iter_num):
        _iter_start_time = time.time()
        bandit.play_once()
        # BUG FIX: record the loop counter; the original appended the builtin
        # `iter` function instead of `_iter`, corrupting the saved stats.
        stats.append([_iter, time.time() - _start_time])
        if time.time() > time_limit + _start_time:
            break
        print('%s%s' % ('\n', '=' * 65))
        end_time = time.time()
        print('| %s-%s-%d | Iteration-%d: %.4f | Time_cost: %.2f-%.2f |' % (
            dataset, mth, run_id, _iter, bandit.final_rewards[-1],
            end_time - _iter_start_time, end_time - _start_time))
        print('=' * 65, '\n')

    # Save the intermediate result.
    # BUG FIX: `time_cost` was never defined, so building `file_path` raised
    # a NameError and the results were lost. Use the total elapsed time,
    # mirroring the other evaluate_* helpers in this file.
    time_cost = time.time() - _start_time
    save_folder = project_dir + 'data/2rdlayer-mab/'
    if not os.path.exists(save_folder):
        os.makedirs(save_folder)
    file_path = save_folder + '%s-%d_2rdlayer-mab_%s_%s_%d_%d_%s.pkl' % (
        mth, seed, dataset, algo, iter_num, time_cost, cv)
    data = [
        bandit.final_rewards,
        bandit.action_sequence,
        bandit.evaluation_cost,
        stats
    ]
    with open(file_path, 'wb') as f:
        pickle.dump(data, f)
def get_meta_learning_configs(X, y, task_type, dataset_name='default', metric='accuracy', num_cfgs=5):
    """Fetch `num_cfgs` warm-start configurations for (X, y) from auto-sklearn's
    meta-learning machinery without running any optimization.

    Wires up a minimal AutoMLSMBO (backend, data manager, config space,
    stopwatch) pointed at the bundled meta-resource directory and returns the
    raw suggestions from `get_metalearning_suggestions()`.
    """
    if X is None or y is None:
        X, y, _ = load_data(dataset_name)
    backend = create(temporary_directory=None,
                     output_directory=None,
                     delete_tmp_folder_after_terminate=False,
                     delete_output_folder_after_terminate=False,
                     shared_mode=True)
    dm = XYDataManager(X, y, None, None, task_type, None, dataset_name)

    configuration_space = pipeline.get_configuration_space(
        dm.info,
        include_estimators=None,
        exclude_estimators=None,
        include_preprocessors=None,
        exclude_preprocessors=None)

    watcher = StopWatch()
    name = os.path.basename(dm.name)
    watcher.start_task(name)

    def reset_data_manager(max_mem=None):
        # AutoMLSMBO normally reloads the data manager from the backend;
        # here the manager is injected directly, so this is a no-op stub.
        pass

    # Very large walltime/eval/memory limits: we never optimize, we only pull
    # meta-learning suggestions, so the limits must not interfere.
    automlsmbo = AutoMLSMBO(
        config_space=configuration_space,
        dataset_name=dataset_name,
        backend=backend,
        total_walltime_limit=1e5,
        func_eval_time_limit=1e5,
        memory_limit=1e5,
        metric=metric,
        watcher=watcher,
        metadata_directory='components/meta_learning/meta_resource',
        num_metalearning_cfgs=num_cfgs)
    automlsmbo.reset_data_manager = reset_data_manager
    automlsmbo.task = task_type
    automlsmbo.datamanager = dm
    configs = automlsmbo.get_metalearning_suggestions()
    return configs
def evaluate_ausk_fe(dataset, time_limit, seed=1):
    """Benchmark auto-sklearn's feature engineering in isolation.

    Registers a random forest with fixed default hyperparameters as the only
    estimator, so any score variation over the budget comes from auto-sklearn's
    preprocessor/feature search rather than model tuning. Pickles and returns
    the best CV score.

    NOTE(review): relies on a module-level `proj_dir` for the output path.
    """
    print('==> Start to Evaluate', dataset, 'Budget', time_limit)
    from automlToolkit.utils.models.default_random_forest import DefaultRandomForest
    # Add random forest classifier (with default hyperparameter) component to auto-sklearn.
    autosklearn.pipeline.components.classification.add_classifier(DefaultRandomForest)
    include_models = ['DefaultRandomForest']

    # Construct the ML model.
    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=time_limit,
        include_preprocessors=None,
        n_jobs=1,
        include_estimators=include_models,
        ensemble_memory_limit=8192,
        ml_memory_limit=8192,
        ensemble_size=1,
        ensemble_nbest=1,
        initial_configurations_via_metalearning=0,
        per_run_time_limit=600,
        seed=seed,
        resampling_strategy='cv',
        resampling_strategy_arguments={'folds': 5})
    print(automl)

    data_node = load_data(dataset, datanode_returned=True)
    X, y = data_node.data
    # Copies guard the cached data node against in-place mutation.
    automl.fit(X.copy(), y.copy())
    model_desc = automl.show_models()
    print(model_desc)

    all_test_results = automl.cv_results_['mean_test_score']
    print('Mean test score', all_test_results)
    best_result = np.max(automl.cv_results_['mean_test_score'])
    print('Validation Accuracy', best_result)
    save_path = proj_dir + 'data/ausk_fe_%s_%d.pkl' % (dataset, time_limit)
    with open(save_path, 'wb') as f:
        pickle.dump([dataset, best_result], f)
    return best_result
def evaluate_1stlayer_bandit(algorithms, mode, dataset='credit', trial_num=200, seed=1):
    """Run the first-layer bandit with feature sharing toggled by `mode`,
    pickle the run statistics, and return the wall-clock cost.

    :param mode: passed through as `share_feature`; also embedded (via %d)
        in the output file name so shared/non-shared runs don't collide.

    NOTE(review): relies on module-level `per_run_time_limit` and `project_dir`.
    """
    _start_time = time.time()
    raw_data = load_data(dataset, datanode_returned=True)
    bandit = FirstLayerBandit(trial_num, algorithms, raw_data,
                              output_dir='logs',
                              per_run_time_limit=per_run_time_limit,
                              dataset_name=dataset,
                              share_feature=mode,
                              seed=seed)
    bandit.optimize()
    print(bandit.final_rewards)
    print(bandit.action_sequence)
    time_cost = time.time() - _start_time
    save_path = project_dir + 'data/shared_hmab_%d_%s_%d_%d_%d.pkl' % (
        mode, dataset, trial_num, len(algorithms), seed)
    with open(save_path, 'wb') as f:
        data = [bandit.final_rewards, bandit.time_records, bandit.action_sequence, time_cost]
        pickle.dump(data, f)
    return time_cost
def evaluate_2armed_bandit(dataset='pc4', algo='libsvm_svc', time_limit=120000):
    """Smoke-test the second-layer (FE vs HPO) bandit on one algorithm.

    Plays up to 20 rounds (stopping early once `time_limit` seconds have
    elapsed) and prints the reward trace, arm sequence, and per-arm
    evaluation costs.
    """
    raw_data = load_data(dataset, datanode_returned=True)
    bandit = SecondLayerBandit(algo, raw_data)

    _start_time = time.time()
    stats = list()
    # IDIOM FIX: loop variable renamed from `iter`, which shadowed the builtin.
    for _iter in range(20):
        res = bandit.play_once()
        stats.append([_iter, time.time() - _start_time, res])
        if time.time() > time_limit + _start_time:
            break

    print(bandit.final_rewards)
    print(bandit.action_sequence)
    print(bandit.evaluation_cost['fe'])
    print(bandit.evaluation_cost['hpo'])
    print(np.mean(bandit.evaluation_cost['fe']))
    print(np.mean(bandit.evaluation_cost['hpo']))
import os import sys sys.path.append(os.getcwd()) from autosklearn.smbo import AutoMLSMBO from autosklearn.constants import * from autosklearn.data.xy_data_manager import XYDataManager from autosklearn.util.backend import create from autosklearn.util import pipeline, StopWatch from automlToolkit.datasets.utils import load_data dataset_name = 'diabetes' X, y, _ = load_data(dataset_name) def get_meta_learning_configs(X, y, task_type, dataset_name, metric='accuracy', num_cfgs=5): backend = create(temporary_directory=None, output_directory=None, delete_tmp_folder_after_terminate=False, delete_output_folder_after_terminate=False, shared_mode=True) dm = XYDataManager(X, y, None, None, task_type, None, dataset_name) configuration_space = pipeline.get_configuration_space( dm.info,
# Smoke-test script: load each requested dataset and print its data node,
# verifying that every dataset in the benchmark suite is readable.
import os
import sys
import argparse

# Make the project package importable when run from the repository root.
sys.path.append(os.getcwd())
from automlToolkit.datasets.utils import load_data

parser = argparse.ArgumentParser()
# Default benchmark suite: comma-separated dataset identifiers.
dataset_set = 'diabetes,spectf,credit,ionosphere,lymphography,pc4,' \
              'messidor_features,winequality_red,winequality_white,splice,spambase,amazon_employee'
parser.add_argument('--datasets', type=str, default=dataset_set)
args = parser.parse_args()

for dataset in args.datasets.split(','):
    raw_data = load_data(dataset, datanode_returned=True)
    print(raw_data)