def _parse(self, data_node: DataNode, config, record=False, skip_balance=False):
    """
    Transform the data node based on the pipeline specified by configuration.

    The stages run in a fixed order: preprocessor1, preprocessor2, balancer,
    rescaler, generator, selector. A stage whose id is ``"empty"`` is skipped
    and contributes ``None`` to the recorded transformer list.

    :param data_node: input data node; ``copy_()`` is called on it and the
        stages operate on that copy.
    :param config: configuration object exposing ``get_dictionary()``. The
        dictionary must contain the stage selectors ``preprocessor1``,
        ``preprocessor2``, ``generator``, ``rescaler`` and ``selector``
        (``balancer`` is optional); stage hyperparameters are expected under
        keys of the form ``"<stage_id>:<name>"``.
    :param record: if True, also return the transformer objects that were applied.
    :param skip_balance: if True, the balancer stage is skipped regardless of
        what the configuration selects.
    :return: the resulting data node, or ``(node, transformers)`` when
        ``record`` is True, where ``transformers`` lists the six stage
        transformers in execution order.
    :raises KeyError: if a mandatory stage selector is missing.
    """
    # Work on a copy and strip the stage selectors so that only
    # "<stage_id>:<name>" hyperparameter entries remain.
    config_dict = config.get_dictionary().copy()
    pre1_id = config_dict.pop('preprocessor1')
    pre2_id = config_dict.pop('preprocessor2')
    # The balancer selector is optional; always remove it so it can never be
    # mistaken for a hyperparameter entry, then honour skip_balance.
    bal_id = config_dict.pop('balancer', 'empty')
    if skip_balance:
        bal_id = 'empty'
    gen_id = config_dict.pop('generator')
    res_id = config_dict.pop('rescaler')
    sel_id = config_dict.pop('selector')

    def _apply_stage(stage_id, registry, params, node):
        """Instantiate the transformer registered under stage_id and apply it."""
        if stage_id == "empty":
            return node, None
        kwargs = {}
        for key, value in params.items():
            parts = key.split(':')
            # Match the key prefix exactly. A substring test would also pick
            # up the parameters of any other component whose name merely
            # contains stage_id.
            if len(parts) > 1 and parts[0] == stage_id:
                kwargs[parts[1]] = value
        transformer = registry[stage_id](**kwargs)
        return transformer.operate(node), transformer

    _node = data_node.copy_()
    stages = (
        (pre1_id, _preprocessor1),
        (pre2_id, _preprocessor2),
        (bal_id, _balancer),
        (res_id, _rescaler),
        (gen_id, _generator),
        (sel_id, _selector),
    )
    transformers = []
    for stage_id, registry in stages:
        _node, stage_tran = _apply_stage(stage_id, registry, config_dict, _node)
        transformers.append(stage_tran)

    if record:
        return _node, transformers
    return _node
def __init__(self, task_type, trial_num,
             classifier_ids: List[str],
             data: DataNode,
             metric='acc',
             ensemble_method='ensemble_selection',
             ensemble_size=10,
             per_run_time_limit=300,
             output_dir="logs",
             dataset_name='default_dataset',
             eval_type='holdout',
             share_feature=False,
             inner_opt_algorithm='rb',
             fe_algo='bo',
             time_limit=None,
             n_jobs=1,
             seed=1):
    """
    Set up the bandit state and create one ``SecondLayerBandit`` per classifier id.

    :param task_type: task type, stored and forwarded to every sub-bandit.
    :param trial_num: trial budget, stored as ``self.trial_num``.
    :param classifier_ids: the arms of the bandit; subset of {'adaboost',
        'bernoulli_nb', 'decision_tree', 'extra_trees', 'gaussian_nb',
        'gradient_boosting', 'k_nearest_neighbors', 'lda', 'liblinear_svc',
        'libsvm_svc', 'multinomial_nb', 'passive_aggressive', 'qda',
        'random_forest', 'sgd'}.
    :param data: training data node; ``copy_()`` of it is kept as ``original_data``.
    :param metric: metric specification, resolved through ``get_metric``.
    :param ensemble_method: stored as ``self.ensemble_method``.
    :param ensemble_size: stored as ``self.ensemble_size``.
    :param per_run_time_limit: forwarded to every sub-bandit.
    :param output_dir: stored and forwarded to every sub-bandit.
    :param dataset_name: used in the logger name and as the sub-bandits' ``dataset_id``.
    :param eval_type: stored and forwarded to every sub-bandit.
    :param share_feature: stored as ``shared_mode`` and forwarded as ``share_fe``.
    :param inner_opt_algorithm: stored and forwarded to every sub-bandit as ``mth``.
    :param fe_algo: stored and forwarded to every sub-bandit.
    :param time_limit: stored as ``self.time_limit``.
    :param n_jobs: stored and forwarded to every sub-bandit.
    :param seed: stored, used to seed ``np.random`` and forwarded to every sub-bandit.
    """
    # Creation time stamp, taken before any other set-up work.
    self.timestamp = time.time()

    # Task, metric and a private copy of the data.
    self.task_type = task_type
    self.metric = get_metric(metric)
    self.original_data = data.copy_()

    # Plain settings copied from the arguments.
    self.ensemble_method = ensemble_method
    self.ensemble_size = ensemble_size
    self.trial_num = trial_num
    self.n_jobs = n_jobs
    self.seed = seed
    self.shared_mode = share_feature
    self.output_dir = output_dir
    self.dataset_name = dataset_name
    self.time_limit = time_limit
    self.eval_type = eval_type
    self.fe_algo = fe_algo
    self.inner_opt_algorithm = inner_opt_algorithm

    # Hard-coded constants.
    self.alpha = 4
    self.B = 0.01

    np.random.seed(self.seed)

    # Best configuration found so far (nothing yet).
    self.optimal_algo_id = None
    self.nbest_algo_ids = None
    self.best_lower_bounds = None
    self.es = None

    # Backend: clock and logger.
    self.start_time = time.time()
    self.logger = get_logger('Soln-ml: %s' % dataset_name)

    # Bandit settings: both attributes refer to the caller's list object.
    self.incumbent_perf = -float("INF")
    self.arms = classifier_ids
    self.include_algorithms = classifier_ids

    # Per-arm bookkeeping.
    self.rewards = {arm: list() for arm in self.arms}
    self.evaluation_cost = {arm: list() for arm in self.arms}
    self.fe_datanodes = {arm: list() for arm in self.arms}

    # One second-layer bandit per arm, built in arm order.
    self.sub_bandits = {
        arm: SecondLayerBandit(
            self.task_type, arm, self.original_data,
            metric=self.metric,
            output_dir=output_dir,
            per_run_time_limit=per_run_time_limit,
            share_fe=self.shared_mode,
            seed=self.seed,
            eval_type=eval_type,
            dataset_id=dataset_name,
            n_jobs=self.n_jobs,
            fe_algo=fe_algo,
            mth=inner_opt_algorithm,
        )
        for arm in self.arms
    }

    self.action_sequence = list()
    self.final_rewards = list()
    # Restart the clock now that the sub-bandits have been constructed.
    self.start_time = time.time()
    self.time_records = list()
def __init__(self, node_list, node_index, task_type, timestamp,
             fe_config_space: ConfigurationSpace,
             cash_config_space: ConfigurationSpace,
             data: DataNode,
             fixed_config=None,
             time_limit=None,
             trial_num=0,
             metric='acc',
             ensemble_method='ensemble_selection',
             ensemble_size=50,
             per_run_time_limit=300,
             output_dir="logs",
             dataset_name='default_dataset',
             eval_type='holdout',
             resampling_params=None,
             n_jobs=1,
             seed=1):
    """
    Create one child block per choice of the ``algorithm`` hyperparameter.

    For every choice (arm) a reduced configuration space is built that holds
    a constant ``algorithm`` plus the hyperparameters whose name prefix
    (the part before the first ``:``) equals the arm. The child block type is
    obtained from ``get_node_type(node_list, node_index + 1)``.

    :param node_list: forwarded to the parent constructor and to the children.
    :param node_index: index of this block; children are built for ``node_index + 1``.
    :param task_type: forwarded to the parent constructor and to the children.
    :param timestamp: forwarded to the parent constructor and to the children.
    :param fe_config_space: feature-engineering space; each child gets a deep copy.
    :param cash_config_space: space that must contain an ``algorithm``
        hyperparameter exposing ``choices``; its choices are the arms.
    :param data: data node; each child receives ``data.copy_()``.
    :param fixed_config: forwarded unchanged.
    :param time_limit: forwarded unchanged.
    :param trial_num: forwarded to the parent constructor only (it is not
        passed to the children).
    :param metric, ensemble_method, ensemble_size, per_run_time_limit,
        output_dir, dataset_name, eval_type, resampling_params, n_jobs, seed:
        forwarded unchanged to the parent constructor and to the children.
    :raises ValueError: if no time limit is set and ``trial_num`` is smaller
        than ``len(arms) * alpha``.
    """
    super(ConditioningBlock, self).__init__(node_list, node_index, task_type, timestamp,
                                            fe_config_space, cash_config_space, data,
                                            fixed_config=fixed_config,
                                            time_limit=time_limit,
                                            trial_num=trial_num,
                                            metric=metric,
                                            ensemble_method=ensemble_method,
                                            ensemble_size=ensemble_size,
                                            per_run_time_limit=per_run_time_limit,
                                            output_dir=output_dir,
                                            dataset_name=dataset_name,
                                            eval_type=eval_type,
                                            resampling_params=resampling_params,
                                            n_jobs=n_jobs,
                                            seed=seed)

    # Best configuration.
    self.optimal_arm = None

    # Bandit settings.
    self.alpha = 4
    self.arms = list(cash_config_space.get_hyperparameter('algorithm').choices)
    self.rewards = dict()
    self.sub_bandits = dict()
    self.evaluation_cost = dict()
    self.arm_cost_stats = dict()

    # Imported here as in the original code (presumably to avoid a circular
    # import at module load time -- TODO confirm).
    from solnml.blocks.block_utils import get_node_type

    # None of these depend on the arm, so look them up once.
    all_hyperparameters = cash_config_space.get_hyperparameters()
    all_conditions = cash_config_space.get_conditions()
    all_forbiddens = cash_config_space.get_forbiddens()
    child_index = node_index + 1
    child_type = get_node_type(node_list, child_index)

    for arm in self.arms:
        self.arm_cost_stats[arm] = list()
        self.rewards[arm] = list()
        self.evaluation_cost[arm] = list()

        # Reduced space: the algorithm pinned to this arm plus its own
        # hyperparameters.
        cs = ConfigurationSpace()
        cs.add_hyperparameter(Constant('algorithm', arm))
        for hp in all_hyperparameters:
            if hp.name.split(':')[0] == arm:
                cs.add_hyperparameter(hp)

        # Add active conditions. Conditions that cannot be added to the
        # reduced space are skipped on purpose (best effort); only ordinary
        # exceptions are suppressed so that KeyboardInterrupt/SystemExit
        # still propagate.
        for cond in all_conditions:
            try:
                cs.add_condition(cond)
            except Exception:
                pass

        # Add active forbidden clauses, with the same best-effort policy.
        for forbid in all_forbiddens:
            try:
                cs.add_forbidden_clause(forbid)
            except Exception:
                pass

        self.sub_bandits[arm] = child_type(
            node_list, child_index, task_type, timestamp,
            deepcopy(fe_config_space), deepcopy(cs), data.copy_(),
            fixed_config=fixed_config,
            time_limit=time_limit,
            metric=metric,
            ensemble_method=ensemble_method,
            ensemble_size=ensemble_size,
            per_run_time_limit=per_run_time_limit,
            output_dir=output_dir,
            dataset_name=dataset_name,
            eval_type=eval_type,
            resampling_params=resampling_params,
            n_jobs=n_jobs,
            seed=seed)

    self.action_sequence = list()
    self.final_rewards = list()
    self.start_time = time.time()
    self.time_records = list()

    # Initialize the parameters.
    self.pull_cnt = 0
    self.pick_id = 0
    self.update_cnt = 0
    arm_num = len(self.arms)
    self.optimal_algo_id = None
    self.arm_candidate = self.arms.copy()
    self.best_lower_bounds = np.zeros(arm_num)

    # self.time_limit / self.trial_num are not assigned in this block; they
    # are presumably set by the parent constructor -- TODO confirm.
    if self.time_limit is None:
        # Trial-budget mode: the budget must cover arm_num * alpha trials.
        if arm_num * self.alpha > self.trial_num:
            raise ValueError('Trial number should be larger than %d.' % (arm_num * self.alpha))
    else:
        # Time-budget mode: the trial count is effectively unbounded.
        self.trial_num = MAX_INT
def __init__(self, task_type, estimator_id: str, data: DataNode, metric,
             share_fe=False, output_dir='logs',
             per_run_time_limit=120,
             per_run_mem_limit=5120,
             dataset_id='default',
             eval_type='holdout',
             mth='rb', sw_size=3,
             n_jobs=1, seed=1,
             fe_algo='tree_based',
             enable_intersection=True,
             number_of_unit_resource=2,
             total_resource=30):
    """
    Set up a two-armed bandit ('hpo' and 'fe') for a single estimator.

    Looks up the estimator class for the task type, takes its hyperparameter
    search space, and builds one optimizer per arm through
    ``build_fe_optimizer`` and ``build_hpo_optimizer``.

    :param task_type: must be a member of ``CLS_TASKS`` or ``REG_TASKS``.
    :param estimator_id: key of the estimator in the built-in registry or in
        ``_addons.components``.
    :param data: data node; ``copy_()`` of it is kept as ``original_data``.
    :param metric: stored and passed to the evaluators as ``scorer``.
    :param share_fe: passed to the FE optimizer as ``shared_mode``.
    :param output_dir: stored and passed to the HPO optimizer.
    :param per_run_time_limit: stored and passed to both optimizers.
    :param per_run_mem_limit: stored and passed to the FE optimizer.
    :param dataset_id: used only to compose the logger name.
    :param eval_type: stored as ``evaluation_type``; used as the evaluators'
        ``resampling_strategy`` and passed to both optimizers.
    :param mth: stored as ``self.mth``.
    :param sw_size: stored as ``sliding_window_size``.
    :param n_jobs: stored and passed to both optimizers.
    :param seed: stored; seeds ``np.random``, the configuration space, the
        evaluators and the optimizers.
    :param fe_algo: stored and passed to ``build_fe_optimizer``.
    :param enable_intersection: stored as ``self.enable_intersection``.
    :param number_of_unit_resource: multiplied by the fixed unit size (5) to
        obtain the HPO optimizer's ``trials_per_iter``.
    :param total_resource: stored as ``self.total_resource``.
    :raises ValueError: on an unknown task type or an unsupported estimator id.
    """
    self.task_type = task_type
    self.metric = metric
    self.estimator_id = estimator_id
    self.evaluation_type = eval_type
    self.original_data = data.copy_()

    # Resource accounting. One unit of resource is the number of trials per
    # iteration.
    self.number_of_unit_resource = number_of_unit_resource
    self.one_unit_of_resource = 5
    self.total_resource = total_resource
    self.per_run_time_limit = per_run_time_limit
    self.per_run_mem_limit = per_run_mem_limit

    self.share_fe = share_fe
    self.output_dir = output_dir
    self.n_jobs = n_jobs
    self.mth = mth
    self.seed = seed
    self.sliding_window_size = sw_size

    task_id = '%s-%d-%s' % (dataset_id, seed, estimator_id)
    self.logger = get_logger(self.__class__.__name__ + '-' + task_id)
    np.random.seed(self.seed)

    # Bandit settings: 'hpo' is listed before 'fe'.
    self.arms = ['hpo', 'fe']
    self.optimizer = dict()
    # Global incumbent and per-arm local incumbents / history.
    self.inc = dict()
    self.local_inc = dict()
    self.local_hist = {'fe': [], 'hpo': []}
    self.rewards = {arm: list() for arm in self.arms}
    self.update_flag = {arm: False for arm in self.arms}
    self.evaluation_cost = {arm: list() for arm in self.arms}

    self.pull_cnt = 0
    self.action_sequence = list()
    self.final_rewards = list()
    self.incumbent_perf = float("-INF")
    self.early_stopped_flag = False
    self.enable_intersection = enable_intersection

    # Fetch hyperparameter space: pick the registry for the task type first,
    # then resolve the estimator class (built-ins take precedence over add-ons).
    if self.task_type in CLS_TASKS:
        from solnml.components.models.classification import _classifiers, _addons
        builtin_models = _classifiers
    elif self.task_type in REG_TASKS:
        from solnml.components.models.regression import _regressors, _addons
        builtin_models = _regressors
    else:
        raise ValueError("Unknown task type %s!" % self.task_type)

    if estimator_id in builtin_models:
        model_class = builtin_models[estimator_id]
    elif estimator_id in _addons.components:
        model_class = _addons.components[estimator_id]
    else:
        raise ValueError("Algorithm %s not supported!" % estimator_id)

    cs = model_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", estimator_id)
    cs.add_hyperparameter(model)

    self.config_space = cs
    self.default_config = cs.get_default_configuration()
    self.config_space.seed(self.seed)

    # Build the Feature Engineering component: choose the evaluator class for
    # the task type, then create one evaluator per arm.
    if self.task_type in CLS_TASKS:
        evaluator_class = ClassificationEvaluator
    elif self.task_type in REG_TASKS:
        evaluator_class = RegressionEvaluator
    else:
        raise ValueError('Invalid task type!')

    # The 'fe' evaluator is created without a data node; only the 'hpo'
    # evaluator receives the original data.
    fe_evaluator = evaluator_class(self.default_config,
                                   scorer=self.metric,
                                   name='fe',
                                   resampling_strategy=self.evaluation_type,
                                   seed=self.seed)
    hpo_evaluator = evaluator_class(self.default_config,
                                    scorer=self.metric,
                                    data_node=self.original_data,
                                    name='hpo',
                                    resampling_strategy=self.evaluation_type,
                                    seed=self.seed)

    self.fe_algo = fe_algo
    self.optimizer['fe'] = build_fe_optimizer(self.fe_algo, self.evaluation_type,
                                              self.task_type, self.original_data,
                                              fe_evaluator, estimator_id,
                                              per_run_time_limit, per_run_mem_limit,
                                              self.seed,
                                              shared_mode=self.share_fe,
                                              n_jobs=n_jobs)
    self.inc['fe'] = self.original_data
    self.local_inc['fe'] = self.original_data

    # Build the HPO component.
    trials_per_iter = self.one_unit_of_resource * self.number_of_unit_resource
    self.optimizer['hpo'] = build_hpo_optimizer(self.evaluation_type, hpo_evaluator, cs,
                                                output_dir=output_dir,
                                                per_run_time_limit=per_run_time_limit,
                                                trials_per_iter=trials_per_iter,
                                                seed=self.seed,
                                                n_jobs=n_jobs)
    self.inc['hpo'] = self.default_config
    self.local_inc['hpo'] = self.default_config

    self.init_config = cs.get_default_configuration()
    # Both histories start with the untouched data / default configuration.
    self.local_hist['fe'].append(self.original_data)
    self.local_hist['hpo'].append(self.default_config)
def get_data_node(self, X, y):
    """
    Wrap ``X`` and ``y`` in a ``DataNode`` using this object's feature metadata.

    :param X: feature data, placed first in the node's data list.
    :param y: target data, placed second in the node's data list.
    :return: ``DataNode([X, y], self.feature_types, feature_names=self.feature_names)``.
    :raises ValueError: if ``self.feature_types`` is None.
    """
    feature_types = self.feature_types
    if feature_types is not None:
        return DataNode([X, y], feature_types, feature_names=self.feature_names)
    raise ValueError("Feature type missing")
def __init__(self, node_list, node_index, task_type, timestamp,
             fe_config_space: ConfigurationSpace,
             cash_config_space: ConfigurationSpace,
             data: DataNode,
             fixed_config=None,
             time_limit=None,
             trial_num=0,
             metric='acc',
             ensemble_method='ensemble_selection',
             ensemble_size=50,
             per_run_time_limit=300,
             output_dir="logs",
             dataset_name='default_dataset',
             eval_type='holdout',
             resampling_params=None,
             n_jobs=1,
             seed=1):
    """
    Set up a block with two arms, 'hpo' and 'fe', each backed by a child block.

    The 'fe' child is built for ``node_index + 1`` with only the
    feature-engineering space and is fixed to the default 'hpo' configuration;
    the 'hpo' child is built for ``node_index + 2`` with only the CASH space
    and is fixed to the default 'fe' configuration.

    :param node_list: forwarded to the parent constructor and to the children.
    :param node_index: index of this block in ``node_list``.
    :param task_type: forwarded to the parent constructor and to the children.
    :param timestamp: forwarded to the parent constructor and to the children.
    :param fe_config_space: feature-engineering configuration space.
    :param cash_config_space: CASH configuration space.
    :param data: data node; each child receives ``data.copy_()``.
    :param fixed_config: forwarded to the parent constructor only; the
        children get the default configuration of the opposite arm instead.
    :param trial_num: forwarded to the parent constructor only.
    :param time_limit, metric, ensemble_method, ensemble_size,
        per_run_time_limit, output_dir, dataset_name, eval_type,
        resampling_params, n_jobs, seed: forwarded unchanged to the parent
        constructor and to the children.
    """
    super(AlternatingBlock, self).__init__(node_list, node_index, task_type, timestamp,
                                           fe_config_space, cash_config_space, data,
                                           fixed_config=fixed_config,
                                           time_limit=time_limit,
                                           trial_num=trial_num,
                                           metric=metric,
                                           ensemble_method=ensemble_method,
                                           ensemble_size=ensemble_size,
                                           per_run_time_limit=per_run_time_limit,
                                           output_dir=output_dir,
                                           dataset_name=dataset_name,
                                           eval_type=eval_type,
                                           resampling_params=resampling_params,
                                           n_jobs=n_jobs,
                                           seed=seed)

    def _default_dict(space):
        # Fresh dict copy of the space's default configuration.
        return space.get_default_configuration().get_dictionary().copy()

    self.arms = ['hpo', 'fe']
    self.optimal_algo_id = None
    self.first_start = True
    self.sub_bandits = dict()

    # Global incumbent: three independent copies of the default configurations.
    self.init_config = {
        'fe': _default_dict(fe_config_space),
        'hpo': _default_dict(cash_config_space),
    }
    self.inc = {
        'fe': _default_dict(fe_config_space),
        'hpo': _default_dict(cash_config_space),
    }
    self.local_inc = {
        'fe': _default_dict(fe_config_space),
        'hpo': _default_dict(cash_config_space),
    }

    self.local_hist = {'fe': [], 'hpo': []}
    self.inc_record = {'fe': list(), 'hpo': list()}
    self.eval_dict = dict()
    self.arm_eval_dict = {'fe': dict(), 'hpo': dict()}

    # Per-arm bookkeeping.
    self.rewards = {arm: list() for arm in self.arms}
    self.update_flag = {arm: False for arm in self.arms}
    self.evaluation_cost = {arm: list() for arm in self.arms}
    self.exp_output = {arm: dict() for arm in self.arms}

    self.pull_cnt = 0
    self.action_sequence = list()
    self.final_rewards = list()

    from solnml.blocks.block_utils import get_node_type

    # Keyword arguments that are identical for both children.
    child_kwargs = dict(time_limit=time_limit,
                        metric=metric,
                        ensemble_method=ensemble_method,
                        ensemble_size=ensemble_size,
                        per_run_time_limit=per_run_time_limit,
                        output_dir=output_dir,
                        dataset_name=dataset_name,
                        eval_type=eval_type,
                        resampling_params=resampling_params,
                        n_jobs=n_jobs,
                        seed=seed)

    for arm in self.arms:
        if arm == 'fe':
            # FE child: the next node, FE space only, HPO side held fixed.
            child_index = node_index + 1
            child_fe_space, child_cash_space = fe_config_space, None
            child_fixed = self.init_config['hpo']
        else:
            # HPO child: two nodes ahead, CASH space only, FE side held fixed.
            child_index = node_index + 2
            child_fe_space, child_cash_space = None, cash_config_space
            child_fixed = self.init_config['fe']

        child_type = get_node_type(node_list, child_index)
        self.sub_bandits[arm] = child_type(node_list, child_index, task_type, timestamp,
                                           child_fe_space, child_cash_space, data.copy_(),
                                           fixed_config=child_fixed,
                                           **child_kwargs)

    # self.output_dir / self.timestamp are not assigned in this block; they
    # are presumably set by the parent constructor -- TODO confirm.
    self.topk_saver = CombinedTopKModelSaver(k=50, model_dir=self.output_dir, identifier=self.timestamp)
import numpy as np
import os
import sys

# Put the current working directory on the import path before importing
# solnml (presumably so the script can be run from a source checkout --
# TODO confirm).
sys.path.append(os.getcwd())
from solnml.components.feature_engineering.transformations.preprocessor.text2vector import \
    Text2VectorTransformation
from solnml.components.feature_engineering.transformation_graph import DataNode
from solnml.components.utils.constants import *
from solnml.estimators import Classifier

# Four samples with four columns each: a number, two text fields, a number.
rows = [
    [1, 'I am good', 'I am right', 3],
    [2, 'He is good', 'He is ok', 4],
    [2.5, 'Everyone is good', 'Everyone is ok', 7],
    [1.3333, 'well', 'what', 5],
]
x = np.array(rows)
y = np.array([0, 1, 0, 1])

# Instantiated but not used by the rest of this script.
t2v = Text2VectorTransformation()

# Declare the type of each column and wrap features and labels in a DataNode.
data = (x, y)
feature_type = [NUMERICAL, TEXT, TEXT, DISCRETE]
datanode = DataNode(data, feature_type)

# Fit a classifier restricted to random forest, then print its predictions
# on the same data node.
clf = Classifier(time_limit=20,
                 enable_meta_algorithm_selection=False,
                 include_algorithms=['random_forest'])
clf.fit(datanode, opt_strategy='combined')
predictions = clf.predict(datanode)
print(predictions)