def evaluate_hmab(algorithms, dataset, run_id, trial_num, seed, time_limit=1200):
    """Run one FirstLayerBandit experiment on ``dataset`` and pickle the results.

    ``hmab_flag``, ``opt_algo``, ``per_run_time_limit`` and ``project_dir`` are
    not parameters; they are resolved from outside this function.

    :param algorithms: candidate algorithms handed to the bandit; the length of
        this collection is also embedded in the output file name.
    :param dataset: dataset name passed to ``load_train_test_data``.
    :param run_id: run index, used in the progress line and the file name.
    :param trial_num: trial count forwarded to ``FirstLayerBandit``.
    :param seed: seed forwarded to ``FirstLayerBandit``.
    :param time_limit: time budget forwarded to the bandit as ``time_limit``.
    :return: None. The list ``[dataset, validation accuracy, test accuracy,
        ensemble test accuracy, elapsed time, model description]`` is pickled.
    """
    print('%s-%s-%d: %d' % (hmab_flag, dataset, run_id, time_limit))

    # The clock starts before data loading, so loading counts as elapsed time.
    began_at = time.time()
    train_data, test_data = load_train_test_data(dataset,
                                                 task_type=MULTICLASS_CLS)

    # Exactly two distinct training labels means a binary task.
    distinct_labels = set(train_data.data[1])
    if len(distinct_labels) == 2:
        cls_task_type = BINARY_CLS
    else:
        cls_task_type = MULTICLASS_CLS
    balanced_acc_metric = make_scorer(balanced_accuracy)

    if is_unbalanced_dataset(train_data):
        from solnml.components.feature_engineering.transformations.preprocessor.smote_balancer import DataBalancer
        balancer = DataBalancer()
        train_data = balancer.operate(train_data)

    bandit_options = dict(
        output_dir='logs',
        per_run_time_limit=per_run_time_limit,
        dataset_name=dataset,
        ensemble_size=50,
        inner_opt_algorithm=opt_algo,
        metric=balanced_acc_metric,
        fe_algo='bo',
        seed=seed,
        time_limit=time_limit,
        eval_type='holdout',
    )
    bandit = FirstLayerBandit(cls_task_type, trial_num, algorithms, train_data,
                              **bandit_options)
    bandit.optimize()
    time_taken = time.time() - began_at

    # Snapshot the bandit's state before refit() is called below.
    model_desc = [
        bandit.nbest_algo_ids,
        bandit.optimal_algo_id,
        bandit.final_rewards,
        bandit.action_sequence,
    ]
    validation_accuracy = np.max(bandit.final_rewards)

    # Single best model first, then the ensemble after refitting.
    y_test = test_data.data[1]
    best_pred = bandit._best_predict(test_data)
    test_accuracy = balanced_accuracy(y_test, best_pred)

    bandit.refit()
    es_pred = bandit._es_predict(test_data)
    test_accuracy_with_ens = balanced_accuracy(test_data.data[1], es_pred)

    data = [dataset, validation_accuracy, test_accuracy,
            test_accuracy_with_ens, time_taken, model_desc]
    print(model_desc)
    print(data)

    file_name = '%s_%s_%s_%d_%d_%d_%d_%d.pkl' % (
        hmab_flag, opt_algo, dataset, trial_num, len(algorithms), seed,
        run_id, time_limit)
    save_path = project_dir + file_name
    with open(save_path, 'wb') as out_file:
        pickle.dump(data, out_file)
def operate(self, input_datanode, target_fields=None):
    """Return a copy of ``input_datanode`` tagged with this transformation.

    The copy's ``trans_hist`` gets ``self.type`` appended. If
    ``is_unbalanced_dataset`` reports the input as unbalanced, the copy's
    ``data_balance`` attribute is set to 1.

    :param input_datanode: node to copy; it is not modified here.
    :param target_fields: accepted but not used by this method.
    :return: the copied node.
    """
    result_node = input_datanode.copy_()
    result_node.trans_hist.append(self.type)

    needs_balancing = is_unbalanced_dataset(input_datanode)
    if needs_balancing:
        result_node.data_balance = 1

    return result_node
def evaluate_hmab(algorithms, dataset, run_id, trial_num, seed, time_limit=1200):
    """Run a bandit experiment on algorithms pre-selected by ``AlgorithmAdvisor``.

    An ``AlgorithmAdvisor`` is asked for candidates for ``dataset``. The first
    five candidates that also appear in ``algorithms`` are given to
    ``FirstLayerBandit``. Results are printed and pickled.

    ``hmab_flag``, ``opt_algo``, ``per_run_time_limit`` and ``project_dir`` are
    not parameters; they are resolved from outside this function.

    :param algorithms: allowed algorithms; the recommendation is filtered
        against it, and its length (not the filtered list's) goes into the
        output file name.
    :param dataset: dataset name; must be one of the names in the hard-coded
        ``exclude_datasets`` list, otherwise the first assertion fails.
    :param run_id: run index, used in the progress line and the file name.
    :param trial_num: trial count forwarded to ``FirstLayerBandit``.
    :param seed: seed forwarded to ``FirstLayerBandit``.
    :param time_limit: time budget forwarded to the bandit as ``time_limit``.
    :return: None. The list ``[dataset, validation accuracy, test accuracy,
        ensemble test accuracy, elapsed time, model description]`` is pickled.
    :raises AssertionError: if ``dataset`` is not in ``exclude_datasets`` or is
        contained in the result of ``fit_meta_learner()``.
    """
    print('%s-%s-%d: %d' % (hmab_flag, dataset, run_id, time_limit))

    exclude_datasets = [
        'gina_prior2', 'pc2', 'abalone', 'wind', 'waveform-5000(2)',
        'page-blocks(1)', 'winequality_white', 'pollen',
    ]
    alad = AlgorithmAdvisor(task_type=MULTICLASS_CLS, n_algorithm=9,
                            metric='bal_acc',
                            exclude_datasets=exclude_datasets)
    n_algo = 5

    # The dataset must be excluded from, and absent in, the meta-learner data.
    assert dataset in exclude_datasets
    meta_infos = alad.fit_meta_learner()
    assert dataset not in meta_infos

    model_candidates = alad.fetch_algorithm_set(dataset)
    print(model_candidates)
    # Keep recommendation order; retain at most n_algo allowed algorithms.
    allowed_candidates = [cand for cand in model_candidates
                          if cand in algorithms]
    include_models = allowed_candidates[:n_algo]
    print('After algorithm recommendation', include_models)

    # Timing starts only after the recommendation step.
    began_at = time.time()
    train_data, test_data = load_train_test_data(dataset,
                                                 task_type=MULTICLASS_CLS)

    # Exactly two distinct training labels means a binary task.
    if len(set(train_data.data[1])) == 2:
        cls_task_type = BINARY_CLS
    else:
        cls_task_type = MULTICLASS_CLS
    balanced_acc_metric = make_scorer(balanced_accuracy)

    if is_unbalanced_dataset(train_data):
        from solnml.components.feature_engineering.transformations.balancer.smote_balancer import DataBalancer
        balancer = DataBalancer()
        train_data = balancer.operate(train_data)

    bandit_options = dict(
        output_dir='logs',
        per_run_time_limit=per_run_time_limit,
        dataset_name=dataset,
        ensemble_size=50,
        inner_opt_algorithm=opt_algo,
        metric=balanced_acc_metric,
        fe_algo='bo',
        seed=seed,
        time_limit=time_limit,
        eval_type='holdout',
    )
    bandit = FirstLayerBandit(cls_task_type, trial_num, include_models,
                              train_data, **bandit_options)
    bandit.optimize()
    time_taken = time.time() - began_at

    # Snapshot the bandit's state before refit() is called below.
    model_desc = [
        bandit.nbest_algo_ids,
        bandit.optimal_algo_id,
        bandit.final_rewards,
        bandit.action_sequence,
    ]
    validation_accuracy = np.max(bandit.final_rewards)

    # Single best model first, then the ensemble after refitting.
    best_pred = bandit._best_predict(test_data)
    test_accuracy = balanced_accuracy(test_data.data[1], best_pred)

    bandit.refit()
    es_pred = bandit._es_predict(test_data)
    test_accuracy_with_ens = balanced_accuracy(test_data.data[1], es_pred)

    data = [dataset, validation_accuracy, test_accuracy,
            test_accuracy_with_ens, time_taken, model_desc]
    print(model_desc)
    print(data)

    file_name = '%s_%s_%s_%d_%d_%d_%d_%d.pkl' % (
        hmab_flag, opt_algo, dataset, trial_num, len(algorithms), seed,
        run_id, time_limit)
    save_path = project_dir + file_name
    with open(save_path, 'wb') as out_file:
        pickle.dump(data, out_file)
def fit(self, train_data: DataNode, dataset_id=None):
    """
    this function includes this following two procedures.
        1. tune each algorithm's hyperparameters.
        2. engineer each algorithm's features automatically.

    Before the search starts, two optional preparation steps run:
    meta-learning based algorithm recommendation (when
    ``self.enable_meta_algorithm_selection`` is set) and balancing of the
    training data (for classification tasks reported as unbalanced).

    :param train_data: training data node; handed to the advisor, the
        balancer and ``FirstLayerBandit``.
    :param dataset_id: optional identifier forwarded to
        ``AlgorithmAdvisor.fetch_algorithm_set``.
    :return: None. The fitted bandit is stored in ``self.solver``.
    """
    if self.enable_meta_algorithm_selection:
        try:
            alad = AlgorithmAdvisor(task_type=self.task_type, n_algorithm=9,
                                    metric=self.metric_id)
            n_algo = 5
            model_candidates = alad.fetch_algorithm_set(
                train_data, dataset_id=dataset_id)
            # Keep recommendation order; retain at most n_algo algorithms
            # that were already allowed.
            include_models = list()
            for algo in model_candidates:
                if algo in self.include_algorithms and \
                        len(include_models) < n_algo:
                    include_models.append(algo)
            self.include_algorithms = include_models
            self.logger.info(
                'Executing meta-learning based algorithm recommendation!')
            self.logger.info('Algorithms recommended: %s'
                             % ','.join(self.include_algorithms))
        except Exception as e:
            # Recommendation is best-effort: keep going with whatever
            # self.include_algorithms currently holds, but record the cause
            # so the failure can be diagnosed from the log.
            self.logger.error("Meta-learning failed!")
            self.logger.error('Meta-learning error: %r' % (e,))

    # Check whether this dataset is balanced or not.
    if self.task_type in CLS_TASKS and is_unbalanced_dataset(train_data):
        self.logger.info('Input dataset is imbalanced!')
        train_data = DataBalancer().operate(train_data)

    # Default budget: 30 trials per candidate algorithm.
    if self.amount_of_resource is None:
        trial_num = len(self.include_algorithms) * 30
    else:
        trial_num = self.amount_of_resource

    self.solver = FirstLayerBandit(
        self.task_type, trial_num,
        self.include_algorithms, train_data,
        per_run_time_limit=self.per_run_time_limit,
        dataset_name=self.dataset_name,
        ensemble_method=self.ensemble_method,
        ensemble_size=self.ensemble_size,
        inner_opt_algorithm='fixed',
        metric=self.metric,
        fe_algo='bo',
        seed=self.seed,
        time_limit=self.time_limit,
        eval_type=self.evaluation_type,
        output_dir=self.output_dir)
    self.solver.optimize()
def evaluate_autosklearn(algorithms, dataset, run_id, trial_num, seed, time_limit=1200):
    """Tune only the random-forest HPO optimizer until ``time_limit`` elapses.

    A ``FirstLayerBandit`` is built, but ``bandit.optimize()`` is not called.
    Its ``'random_forest'`` sub-bandit's ``'hpo'`` optimizer is stepped in a
    loop instead. The incumbent configuration is then used to fit a fresh
    ``RandomForest`` on the training data and score it on the test data.
    No ensemble accuracy is computed. The results are pickled and every
    regular file in ``./logs/`` is removed.

    ``hmab_flag``, ``ausk_flag``, ``opt_algo``, ``per_run_time_limit`` and
    ``project_dir`` are not parameters; they are resolved from outside this
    function.

    :param algorithms: algorithms handed to the bandit; the length of this
        collection is also embedded in the output file name.
    :param dataset: dataset name passed to ``load_train_test_data``.
    :param run_id: run index, used in the progress line and the file name.
    :param trial_num: trial count forwarded to ``FirstLayerBandit``.
    :param seed: seed forwarded to ``FirstLayerBandit``.
    :param time_limit: wall-clock budget in seconds for the tuning loop, also
        forwarded to the bandit.
    :return: None. The list ``[dataset, validation accuracy, test accuracy,
        fe output (empty dict), hpo output, start timestamp]`` is pickled.
    """
    print('%s-%s-%d: %d' % (hmab_flag, dataset, run_id, time_limit))

    began_at = time.time()
    train_data, test_data = load_train_test_data(dataset,
                                                 task_type=MULTICLASS_CLS)

    # Exactly two distinct training labels means a binary task.
    if len(set(train_data.data[1])) == 2:
        cls_task_type = BINARY_CLS
    else:
        cls_task_type = MULTICLASS_CLS
    balanced_acc_metric = make_scorer(balanced_accuracy)

    if is_unbalanced_dataset(train_data):
        from solnml.components.feature_engineering.transformations.balancer.smote_balancer import DataBalancer
        balancer = DataBalancer()
        train_data = balancer.operate(train_data)

    bandit_options = dict(
        output_dir='logs',
        per_run_time_limit=per_run_time_limit,
        dataset_name=dataset,
        ensemble_size=50,
        inner_opt_algorithm=opt_algo,
        metric=balanced_acc_metric,
        fe_algo='bo',
        seed=seed,
        time_limit=time_limit,
        eval_type='holdout',
    )
    bandit = FirstLayerBandit(cls_task_type, trial_num, algorithms, train_data,
                              **bandit_options)

    # The budget is checked before each step, so the last step may overrun it.
    while time.time() - began_at < time_limit:
        bandit.sub_bandits['random_forest'].optimizer['hpo'].iterate()

    # No feature-engineering search is run here, hence the empty placeholder.
    fe_exp_output = dict()
    hpo_optimizer = bandit.sub_bandits['random_forest'].optimizer['hpo']
    hpo_exp_output = hpo_optimizer.exp_output

    # Drop the 'estimator' entry before passing the rest to RandomForest.
    inc_config = hpo_optimizer.incumbent_config.get_dictionary()
    inc_config.pop('estimator')
    from solnml.components.models.classification.random_forest import RandomForest
    rf = RandomForest(**inc_config)
    rf.fit(train_data.data[0], train_data.data[1])

    validation_accuracy = hpo_optimizer.incumbent_perf
    best_pred = rf.predict(test_data.data[0])
    test_accuracy = balanced_accuracy(test_data.data[1], best_pred)

    # The last element is the start timestamp, not a duration.
    data = [dataset, validation_accuracy, test_accuracy,
            fe_exp_output, hpo_exp_output, began_at]

    file_name = '%s_%s_%s_%d_%d_%d_%d_%d.pkl' % (
        ausk_flag, opt_algo, dataset, trial_num, len(algorithms), seed,
        run_id, time_limit)
    save_path = project_dir + file_name
    with open(save_path, 'wb') as out_file:
        pickle.dump(data, out_file)

    # Remove regular files left in the log directory; sub-directories stay.
    del_path = './logs/'
    for entry in os.listdir(del_path):
        entry_path = del_path + "/" + entry
        if not os.path.isfile(entry_path):
            continue
        os.remove(entry_path)
def __init__(self, name, task_type, datanode, seed=1):
    """Set up the optimizer state around a copy of ``datanode``.

    :param name: stored as ``self.name``.
    :param task_type: stored as ``self.task_type``.
    :param datanode: source data node; a copy of it becomes both the root node
        and the initial incumbent, and is added to a new
        ``TransformationGraph``.
    :param seed: stored as ``self._seed``.
    """
    self.name = name
    self._seed = seed

    # The root and the initial incumbent are the same copied node.
    root = datanode.copy_()
    self.root_node = root
    self.incumbent = root
    self.task_type = task_type

    self.graph = TransformationGraph()
    self.graph.add_node(self.root_node)

    # No budget limits are set at construction time.
    self.time_budget = None
    self.maximum_evaluation_num = None

    logger_name = '.'.join([self.__module__, self.__class__.__name__])
    self.logger = get_logger(logger_name)

    # if_bal is True exactly when the dataset is NOT reported as unbalanced.
    self.if_bal = not is_unbalanced_dataset(data_node=datanode)
def evaluate_hmab(algorithms, dataset, run_id, trial_num, seed, time_limit=1200):
    """Run a FirstLayerBandit experiment with ``eval_type='partial'``.

    After ``bandit.optimize()``, the ``exp_output`` of the ``'fe'`` and
    ``'hpo'`` optimizers of the ``'random_forest'`` sub-bandit are collected
    with the accuracy figures. The results are pickled and every regular file
    in ``./logs/`` is removed.

    ``hmab_flag``, ``opt_algo``, ``per_run_time_limit`` and ``project_dir`` are
    not parameters; they are resolved from outside this function.

    :param algorithms: candidate algorithms handed to the bandit; the length of
        this collection is also embedded in the output file name.
    :param dataset: dataset name passed to ``load_train_test_data``.
    :param run_id: run index, used in the progress line and the file name.
    :param trial_num: trial count forwarded to ``FirstLayerBandit``.
    :param seed: seed forwarded to ``FirstLayerBandit``.
    :param time_limit: time budget forwarded to the bandit as ``time_limit``.
    :return: None. The list ``[dataset, validation accuracy, test accuracy,
        ensemble test accuracy, fe output, hpo output, start timestamp]`` is
        pickled.
    """
    print('%s-%s-%d: %d' % (hmab_flag, dataset, run_id, time_limit))

    began_at = time.time()
    train_data, test_data = load_train_test_data(dataset,
                                                 task_type=MULTICLASS_CLS)

    # Exactly two distinct training labels means a binary task.
    if len(set(train_data.data[1])) == 2:
        cls_task_type = BINARY_CLS
    else:
        cls_task_type = MULTICLASS_CLS
    balanced_acc_metric = make_scorer(balanced_accuracy)

    if is_unbalanced_dataset(train_data):
        from solnml.components.feature_engineering.transformations.preprocessor.smote_balancer import DataBalancer
        balancer = DataBalancer()
        train_data = balancer.operate(train_data)

    bandit_options = dict(
        output_dir='logs',
        per_run_time_limit=per_run_time_limit,
        dataset_name=dataset,
        ensemble_size=50,
        inner_opt_algorithm=opt_algo,
        metric=balanced_acc_metric,
        fe_algo='bo',
        seed=seed,
        time_limit=time_limit,
        eval_type='partial',
    )
    bandit = FirstLayerBandit(cls_task_type, trial_num, algorithms, train_data,
                              **bandit_options)
    bandit.optimize()

    # Both experiment logs are read from the random-forest sub-bandit.
    rf_optimizers = bandit.sub_bandits['random_forest'].optimizer
    fe_exp_output = rf_optimizers['fe'].exp_output
    hpo_exp_output = rf_optimizers['hpo'].exp_output

    validation_accuracy = np.max(bandit.final_rewards)

    # Single best model first, then the ensemble after refitting.
    best_pred = bandit._best_predict(test_data)
    test_accuracy = balanced_accuracy(test_data.data[1], best_pred)

    bandit.refit()
    es_pred = bandit._es_predict(test_data)
    test_accuracy_with_ens = balanced_accuracy(test_data.data[1], es_pred)

    # The last element is the start timestamp, not a duration.
    data = [dataset, validation_accuracy, test_accuracy,
            test_accuracy_with_ens, fe_exp_output, hpo_exp_output, began_at]

    file_name = '%s_%s_%s_%d_%d_%d_%d_%d.pkl' % (
        hmab_flag, opt_algo, dataset, trial_num, len(algorithms), seed,
        run_id, time_limit)
    save_path = project_dir + file_name
    with open(save_path, 'wb') as out_file:
        pickle.dump(data, out_file)

    # Remove regular files left in the log directory; sub-directories stay.
    del_path = './logs/'
    for entry in os.listdir(del_path):
        entry_path = del_path + "/" + entry
        if not os.path.isfile(entry_path):
            continue
        os.remove(entry_path)