def check_datasets(datasets):
    """Verify that every dataset in *datasets* can be loaded.

    Args:
        datasets: iterable of dataset names to probe via ``load_data``.

    Raises:
        ValueError: if any dataset fails to load. The original loading
            exception is attached as ``__cause__`` so the root cause
            (missing file, bad format, ...) is not lost.
    """
    for _dataset in datasets:
        try:
            # NOTE(review): `task_type` is read from an enclosing/global
            # scope here — confirm it is defined before this is called.
            _ = load_data(_dataset, '../soln-ml/', False, task_type=task_type)
        except Exception as e:
            print("Exception:", e)
            # Chain the original exception instead of discarding it.
            raise ValueError('Dataset - %s does not exist!' % _dataset) from e
def evaluate_1stlayer_bandit(algorithms, mode, dataset='credit', trial_num=200, seed=1):
    """Run the first-layer bandit over *algorithms* on *dataset* and persist results.

    Optimizes with shared-feature mode *mode*, prints the reward/action
    traces, pickles them (plus time records and wall-clock cost) under
    ``project_dir + 'data/'``, and returns the wall-clock cost in seconds.
    """
    start = time.time()
    data_node = load_data(dataset, datanode_returned=True)
    bandit = FirstLayerBandit(trial_num, algorithms, data_node,
                              output_dir='logs',
                              per_run_time_limit=per_run_time_limit,
                              dataset_name=dataset,
                              share_feature=mode,
                              seed=seed)
    bandit.optimize()
    print(bandit.final_rewards)
    print(bandit.action_sequence)

    time_cost = time.time() - start
    save_path = project_dir + 'data/shared_hmab_%d_%s_%d_%d_%d.pkl' % (
        mode, dataset, trial_num, len(algorithms), seed)
    # Bundle the run statistics before opening the output file.
    stats = [
        bandit.final_rewards,
        bandit.time_records,
        bandit.action_sequence,
        time_cost,
    ]
    with open(save_path, 'wb') as f:
        pickle.dump(stats, f)
    return time_cost
def evaluate_ml_algorithm(dataset, algo, obj_metric, seed=1, task_type=None):
    """Tune classifier *algo* on *dataset* via SMAC and dump per-config results.

    Builds the algorithm's hyperparameter space, runs one SMAC iteration
    batch (``args.iter`` trials), and pickles the evaluation results keyed
    by configuration to ``save_dir``.
    """
    print('EVALUATE-%s-%s-%s' % (dataset, algo, obj_metric))
    train_data = load_data(dataset, task_type=task_type, datanode_returned=True)
    print(set(train_data.data[1]))
    metric = get_metric(obj_metric)

    # Search space: the algorithm's own hyperparameters plus a fixed
    # "estimator" marker naming the algorithm.
    config_space = _classifiers[algo].get_hyperparameter_search_space()
    estimator_hp = UnParametrizedHyperparameter("estimator", algo)
    config_space.add_hyperparameter(estimator_hp)
    default_config = config_space.get_default_configuration()

    evaluator = ClassificationEvaluator(default_config,
                                        scorer=metric,
                                        data_node=train_data,
                                        name='hpo',
                                        resampling_strategy='holdout',
                                        seed=seed)
    optimizer = SMACOptimizer(evaluator=evaluator,
                              config_space=config_space,
                              per_run_time_limit=600,
                              per_run_mem_limit=5120,
                              output_dir='./logs',
                              trials_per_iter=args.iter)
    optimizer.iterate()

    # Re-key the evaluation record by the configuration (second key element).
    results = {key[1]: value for key, value in optimizer.eval_dict.items()}

    save_path = save_dir + '%s-%s-%s-hpo.pkl' % (dataset, algo, obj_metric)
    with open(save_path, 'wb') as f:
        pickle.dump(results, f)
def conduct_hpo(dataset='pc4', classifier_id='random_forest', iter_num=100, iter_mode=True):
    """Run hyperparameter optimization for *classifier_id* on *dataset*.

    In iterative mode, performs *iter_num* single-trial SMAC iterations and
    prints per-iteration performance plus total wall time; otherwise runs a
    single budget-limited ``optimize`` call and prints the incumbent.
    """
    from autosklearn.pipeline.components.classification import _classifiers

    # Search space: the classifier's hyperparameters plus a fixed
    # "estimator" marker naming the classifier.
    config_space = _classifiers[classifier_id].get_hyperparameter_search_space()
    estimator_hp = UnParametrizedHyperparameter("estimator", classifier_id)
    config_space.add_hyperparameter(estimator_hp)

    data_node = load_data(dataset, datanode_returned=True)
    print(set(data_node.data[1]))
    evaluator = ClassificationEvaluator(config_space.get_default_configuration(),
                                        name='hpo',
                                        data_node=data_node)

    if iter_mode:
        import time
        start = time.time()
        optimizer = SMACOptimizer(
            evaluator, config_space,
            trials_per_iter=1,
            output_dir='logs',
            per_run_time_limit=180,
        )
        perf_records = []
        for iter_idx in range(iter_num):
            perf, _, _ = optimizer.iterate()
            print(iter_idx, perf)
            perf_records.append(perf)
        print(perf_records)
        print(time.time() - start)
    else:
        optimizer = SMACOptimizer(evaluator, config_space,
                                  evaluation_limit=600,
                                  output_dir='logs')
        inc, val = optimizer.optimize()
        print(inc, val)
def evaluate_1stlayer_bandit(run_id, B, algorithms, dataset='credit', trial_num=200, seed=1):
    """Run the first-layer bandit with the discounted-UCB strategy and persist results.

    Args:
        run_id: identifier used in the output file name.
        B: bandit hyperparameter, assigned to ``bandit.B`` before optimizing.
        algorithms: algorithm candidates for the bandit arms.
        dataset: dataset name to load.
        trial_num: number of bandit trials.
        seed: random seed.

    Returns:
        Wall-clock cost of the run, in seconds.
    """
    _start_time = time.time()
    raw_data = load_data(dataset, datanode_returned=True)
    bandit = FirstLayerBandit(trial_num, algorithms, raw_data,
                              output_dir='logs',
                              per_run_time_limit=per_run_time_limit,
                              dataset_name=dataset,
                              eval_type='holdout',
                              seed=seed)
    bandit.B = B
    bandit.optimize(strategy='discounted_ucb')
    print(bandit.final_rewards)
    print(bandit.action_sequence)
    time_cost = time.time() - _start_time

    save_folder = project_dir + 'data/1stlayer-mab/'
    # exist_ok avoids the check-then-create race of the previous
    # `if not os.path.exists(...): os.makedirs(...)` when runs execute in parallel.
    os.makedirs(save_folder, exist_ok=True)
    save_path = save_folder + 'eval_ducb_%.4f_%s_%d_%d_%d.pkl' % (
        B, dataset, run_id, trial_num, len(algorithms))
    with open(save_path, 'wb') as f:
        data = [bandit.final_rewards, bandit.time_records, bandit.action_sequence, time_cost]
        pickle.dump(data, f)
    return time_cost
def get_meta_learning_configs(X, y, task_type, dataset_name='default', metric='accuracy', num_cfgs=5):
    """Fetch *num_cfgs* meta-learned configuration suggestions for (X, y).

    Falls back to loading *dataset_name* when X or y is missing, builds the
    auto-sklearn pipeline configuration space for the data, and queries
    AutoMLSMBO's meta-learning suggestions against the bundled metadata.
    """
    if X is None or y is None:
        X, y, _ = load_data(dataset_name)

    backend = create(temporary_directory=None,
                     output_directory=None,
                     delete_tmp_folder_after_terminate=False,
                     delete_output_folder_after_terminate=False,
                     shared_mode=True)

    data_manager = XYDataManager(X, y, None, None, task_type, None, dataset_name)
    config_space = pipeline.get_configuration_space(
        data_manager.info,
        include_estimators=None,
        exclude_estimators=None,
        include_preprocessors=None,
        exclude_preprocessors=None)

    watcher = StopWatch()
    watcher.start_task(os.path.basename(data_manager.name))

    def reset_data_manager(max_mem=None):
        # Intentional no-op: AutoMLSMBO expects this hook to exist.
        pass

    smbo = AutoMLSMBO(
        config_space=config_space,
        dataset_name=dataset_name,
        backend=backend,
        total_walltime_limit=1e5,
        func_eval_time_limit=1e5,
        memory_limit=1e5,
        metric=metric,
        watcher=watcher,
        metadata_directory='components/meta_learning/meta_resource',
        num_metalearning_cfgs=num_cfgs)
    smbo.reset_data_manager = reset_data_manager
    smbo.task = task_type
    smbo.datamanager = data_manager

    return smbo.get_metalearning_suggestions()
def conduct_hpo(optimizer='smac', dataset='pc4', classifier_id='random_forest', runcount_limit=100):
    """Run two HPO iterations for *classifier_id* on *dataset*.

    Args:
        optimizer: 'smac' or 'psmac', selecting the optimizer backend.
        dataset: dataset name to load.
        classifier_id: key into autosklearn's classifier registry.
        runcount_limit: evaluation budget passed to the optimizer.

    Raises:
        ValueError: if *optimizer* is not 'smac' or 'psmac'. Previously an
            unknown value fell through and crashed later with an opaque
            ``AttributeError`` on the string.
    """
    from autosklearn.pipeline.components.classification import _classifiers
    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)

    raw_data = load_data(dataset, datanode_returned=True)
    print(set(raw_data.data[1]))
    evaluator = ClassificationEvaluator(cs.get_default_configuration(),
                                        name='hpo',
                                        data_node=raw_data)

    if optimizer == 'smac':
        optimizer = SMACOptimizer(evaluator, cs,
                                  evaluation_limit=runcount_limit,
                                  output_dir='logs')
    elif optimizer == 'psmac':
        optimizer = PSMACOptimizer(evaluator, cs, args.n,
                                   evaluation_limit=runcount_limit,
                                   output_dir='logs',
                                   trials_per_iter=args.trial)
    else:
        raise ValueError('Invalid optimizer: %s!' % optimizer)

    # Two iterations, matching the original duplicated iterate/print pair.
    for _ in range(2):
        perf, cost, config = optimizer.iterate()
        print(perf, cost, config)
min_impurity_decrease=self.min_impurity_decrease, random_state=self.random_state, n_jobs=self.n_jobs, class_weight=self.class_weight, warm_start=True) self.estimator.fit(X, y, sample_weight=sample_weight) return self def predict(self, X): if self.estimator is None: raise NotImplementedError() return self.estimator.predict(X) dataset_list = dataset_str.split(',') check_datasets(dataset_list) cs = get_cs() _run_count = min(int(len(set(cs.sample_configuration(30000))) * 0.75), run_count) print(_run_count) for dataset in dataset_list: node = load_data(dataset, '../soln-ml/', True, task_type=0) _x, _y = node.data[0], node.data[1] eval = partial(eval_func, x=_x, y=_y) bo = BO(eval, cs, max_runs=_run_count, time_limit_per_trial=600, sample_strategy=mode, rng=np.random.RandomState(1)) bo.run() with open('logs/%s-random_forest-%s-%d.pkl' % (dataset, mode, run_count), 'wb')as f: pickle.dump(bo.get_history().data, f)
def get_benchmark_configspace(benchmark_id):
    """Build the hyperparameter search space for a named benchmark.

    Args:
        benchmark_id: one of 'fcnet', 'covtype', 'higgs', 'covtype_svm',
            'mnist_svm', 'cifar', 'convnet', or any id containing 'sys'.

    Returns:
        A ConfigurationSpace holding the benchmark's tunable hyperparameters.

    Raises:
        ValueError: if *benchmark_id* matches none of the known benchmarks.
    """
    if benchmark_id == 'fcnet':
        # Fully-connected network: optimizer, layer-width, batch, dropout
        # and regularization hyperparameters.
        cs = ConfigurationSpace()
        learning_rate = UniformFloatHyperparameter("learning_rate", 1e-4, 1e-2, default_value=1e-3, q=2e-4)
        momentum = UniformFloatHyperparameter("momentum", 0., .5, default_value=0., q=.1)
        lr_decay = UniformFloatHyperparameter("lr_decay", .7, .99, default_value=9e-1, q=3e-2)
        n_layer1 = UniformIntegerHyperparameter("n_layer1", 32, 256, default_value=96, q=8)
        n_layer2 = UniformIntegerHyperparameter("n_layer2", 64, 256, default_value=128, q=8)
        batch_size = UniformIntegerHyperparameter("batch_size", 32, 128, default_value=64, q=8)
        dropout1 = UniformFloatHyperparameter("kb_1", .3, .9, default_value=.5, q=.1)
        dropout2 = UniformFloatHyperparameter("kb_2", .3, .9, default_value=.5, q=.1)
        kernel_regularizer = UniformFloatHyperparameter("k_reg", 1e-9, 1e-4, default_value=1e-6, q=5e-7, log=True)
        cs.add_hyperparameters([
            learning_rate, momentum, lr_decay, n_layer1, n_layer2, batch_size,
            dropout1, dropout2, kernel_regularizer
        ])
    elif benchmark_id in ['covtype', 'higgs']:
        # Gradient-boosting style hyperparameters (eta/subsample/lambda
        # naming suggests XGBoost — confirm against the evaluator).
        cs = ConfigurationSpace()
        # n_estimators = UniformFloatHyperparameter("n_estimators", 100, 600, default_value=200, q=10)
        eta = UniformFloatHyperparameter("eta", 0.01, 0.9, default_value=0.3, q=0.01)
        min_child_weight = UniformFloatHyperparameter("min_child_weight", 0, 10, default_value=1, q=0.1)
        max_depth = UniformIntegerHyperparameter("max_depth", 1, 12, default_value=6)
        subsample = UniformFloatHyperparameter("subsample", 0.1, 1, default_value=1, q=0.1)
        gamma = UniformFloatHyperparameter("gamma", 0, 10, default_value=0, q=0.1)
        colsample_bytree = UniformFloatHyperparameter("colsample_bytree", 0.1, 1, default_value=1., q=0.1)
        alpha = UniformFloatHyperparameter("alpha", 0, 10, default_value=0., q=0.1)
        _lambda = UniformFloatHyperparameter("lambda", 1, 10, default_value=1, q=0.1)
        cs.add_hyperparameters([
            eta, min_child_weight, max_depth, subsample, gamma,
            colsample_bytree, alpha, _lambda
        ])
    elif benchmark_id in ['covtype_svm', 'mnist_svm']:
        # SVM search space; degree/coef0 are only active for the kernels
        # that use them (conditions added below).
        C = UniformFloatHyperparameter("C", 1e-3, 1e5, log=True, default_value=1.0)
        kernel = CategoricalHyperparameter("kernel", choices=["rbf", "poly", "sigmoid"], default_value="rbf")
        degree = UniformIntegerHyperparameter("degree", 2, 5, default_value=3)
        gamma = UniformFloatHyperparameter("gamma", 1e-5, 10, log=True, default_value=0.1)
        coef0 = UniformFloatHyperparameter("coef0", -1, 1, default_value=0)
        tol = UniformFloatHyperparameter("tol", 1e-5, 1e-1, default_value=1e-3, log=True)
        # cache size is not a hyperparameter, but an argument to the program!
        max_iter = UnParametrizedHyperparameter("max_iter", 10000)
        cs = ConfigurationSpace()
        cs.add_hyperparameters(
            [C, kernel, degree, gamma, coef0, tol, max_iter])
        degree_depends_on_poly = EqualsCondition(degree, kernel, "poly")
        coef0_condition = InCondition(coef0, kernel, ["poly", "sigmoid"])
        cs.add_condition(degree_depends_on_poly)
        cs.add_condition(coef0_condition)
    elif benchmark_id == 'cifar':
        # CIFAR training hyperparameters (SGD with momentum/nesterov).
        cs = ConfigurationSpace()
        # padding_size = CategoricalHyperparameter('padding_size', [1, 2, 3], default_value=2)
        # batch_size = CategoricalHyperparameter('train_batch_size', [256])
        batch_size = UniformIntegerHyperparameter("train_batch_size", 32, 256, default_value=64, q=8)
        init_lr = UniformFloatHyperparameter('init_lr', lower=1e-3, upper=0.3, default_value=0.1, log=True)
        # lr_decay_factor = UniformFloatHyperparameter('lr_decay_factor', lower=0.01, upper=0.2, default_value=0.1,
        #                                              log=True)
        lr_decay_factor = UnParametrizedHyperparameter('lr_decay_factor', 0.1)
        weight_decay = UniformFloatHyperparameter('weight_decay', lower=1e-5, upper=1e-2, default_value=0.0002,
                                                  log=True)
        momentum = UniformFloatHyperparameter("momentum", 0.5, .99, default_value=0.9)
        nesterov = CategoricalHyperparameter('nesterov', ['True', 'False'], default_value='True')
        cs.add_hyperparameters([
            nesterov, batch_size, init_lr, lr_decay_factor, weight_decay, momentum
        ])
    elif benchmark_id == 'convnet':
        # Convolutional network: training hyperparameters plus per-layer
        # width and (conditional) init/regularization hyperparameters.
        cs = ConfigurationSpace()
        learning_rate = UniformFloatHyperparameter("learning_rate", 1e-5, 5e-2, default_value=1e-4, q=3e-5, log=True)
        batch_size = UniformIntegerHyperparameter("batch_size", 16, 128, q=16, default_value=32)
        momentum = UniformFloatHyperparameter("momentum", 0., .5, default_value=0., q=.1)
        lr_decay = UniformFloatHyperparameter("lr_decay", .7, .99, default_value=9e-1, q=3e-2)
        dropout_value = UniformFloatHyperparameter("dropout", .1, .7, default_value=.5, q=.1)
        cs.add_hyperparameters(
            [learning_rate, batch_size, momentum, lr_decay, dropout_value])
        num_pooling_layer = UniformIntegerHyperparameter("n_pooling_layer", 2, 3, default_value=2)
        num_conv_layer1 = UniformIntegerHyperparameter("n_conv_layer1", 16, 64, default_value=32, q=2)
        num_conv_layer2 = UniformIntegerHyperparameter("n_conv_layer2", 32, 96, default_value=64, q=2)
        num_conv_layer3 = UniformIntegerHyperparameter("n_conv_layer3", 32, 96, default_value=64, q=2)
        num_fully_layer = UniformIntegerHyperparameter("n_fully_unit", 128, 512, default_value=256, q=64)
        cs.add_hyperparameters([
            num_pooling_layer, num_conv_layer1, num_conv_layer2, num_conv_layer3, num_fully_layer
        ])
        for i in [1, 2, 3]:
            kernel_init_stddev = UniformFloatHyperparameter(
                "kernel_init_stddev%d" % i, 1e-3, 5e-2, default_value=1e-2, q=2e-3)
            kernel_regularizer = UniformFloatHyperparameter(
                "kernel_regularizer%d" % i, 1e-9, 1e-4, default_value=1e-6, q=5e-7, log=True)
            cs.add_hyperparameters([kernel_init_stddev, kernel_regularizer])
            if i == 3:
                # Layer-3 hyperparameters are only active when a third
                # pooling layer is selected.
                k_init_cond = InCondition(child=kernel_init_stddev, parent=num_pooling_layer, values=[3])
                k_reg_cond = InCondition(child=kernel_regularizer, parent=num_pooling_layer, values=[3])
                cs.add_conditions([k_init_cond, k_reg_cond])
        return cs
    elif 'sys' in benchmark_id:
        # Combined system search space, derived from a sample data node.
        from mfes.evaluate_function.sys.combined_evaluator import get_combined_cs
        from solnml.datasets.utils import load_data
        tmp_node = load_data('balloon', data_dir='../soln-ml/', task_type=0, datanode_returned=True)
        cs = get_combined_cs(tmp_node)
        return cs
    else:
        raise ValueError('Invalid benchmark id: %s!' % benchmark_id)
    return cs
def check_datasets(datasets, task_type=None):
    """Verify that every dataset in *datasets* can be loaded.

    Args:
        datasets: iterable of dataset names to probe via ``load_data``.
        task_type: forwarded to ``load_data``.

    Raises:
        ValueError: if any dataset fails to load. The original loading
            exception (previously bound but discarded) is now chained as
            ``__cause__`` so the root cause remains visible.
    """
    for _dataset in datasets:
        try:
            _ = load_data(_dataset, task_type=task_type)
        except Exception as e:
            raise ValueError('Dataset - %s does not exist!' % _dataset) from e
import os import sys sys.path.append(os.getcwd()) from autosklearn.smbo import AutoMLSMBO from autosklearn.constants import * from autosklearn.data.xy_data_manager import XYDataManager from autosklearn.util.backend import create from autosklearn.util import pipeline, StopWatch from solnml.datasets.utils import load_data dataset_name = 'diabetes' X, y, _ = load_data(dataset_name) def get_meta_learning_configs(X, y, task_type, dataset_name, metric='accuracy', num_cfgs=5): backend = create(temporary_directory=None, output_directory=None, delete_tmp_folder_after_terminate=False, delete_output_folder_after_terminate=False, shared_mode=True) dm = XYDataManager(X, y, None, None, task_type, None, dataset_name) configuration_space = pipeline.get_configuration_space( dm.info,
import os
import sys
import argparse

# Make the working directory importable before pulling in solnml.
sys.path.append(os.getcwd())
from solnml.datasets.utils import load_data

# Default list of datasets to sanity-check by loading each one.
default_datasets = 'diabetes,spectf,credit,ionosphere,lymphography,pc4,' \
                   'messidor_features,winequality_red,winequality_white,splice,spambase,amazon_employee'

parser = argparse.ArgumentParser()
parser.add_argument('--datasets', type=str, default=default_datasets)
args = parser.parse_args()

# Load every requested dataset and print its data node.
for name in args.datasets.split(','):
    node = load_data(name, datanode_returned=True)
    print(node)
cs.add_hyperparameter(hp) for cond in fe_cs.get_conditions(): cs.add_condition(cond) for bid in fe_cs.get_forbiddens(): cs.add_forbidden_clause(bid) return cs def get_fit_params(y, estimator): from solnml.components.utils.balancing import get_weights _init_params, _fit_params = get_weights( y, estimator, None, {}, {}) return _init_params, _fit_params tmp_node = load_data('letter(1)', data_dir='./', task_type=0, datanode_returned=True) tmp_evaluator = ClassificationEvaluator(None) tmp_bo = AnotherBayesianOptimizationOptimizer(0, tmp_node, tmp_evaluator, 'adaboost', 1, 1, 1) @ease_target(model_dir="./data/models", name='sys') def train(resource_num, params, data_node): print(resource_num, params) start_time = time.time() resource_num = resource_num * 1.0 / 27 # Prepare data node. data_node = data_node['data_node'] _data_node = tmp_bo._parse(data_node, params) X_train, y_train = _data_node.data