Example #1
    def _parse(self,
               data_node: DataNode,
               config,
               record=False,
               skip_balance=False):
        """
            Transform the data node based on the pipeline specified by configuration.
        :param data_node:
        :param config:
        :param record:
        :return: the resulting data node.
        """
        # Pop the pipeline-stage indicators out of config_dict.
        config_dict = config.get_dictionary().copy()
        pre1_id = config_dict.pop('preprocessor1')
        pre2_id = config_dict.pop('preprocessor2')
        if skip_balance:
            bal_id = 'empty'
        else:
            bal_id = config_dict.pop('balancer', 'empty')
        gen_id = config_dict.pop('generator')
        res_id = config_dict.pop('rescaler')
        sel_id = config_dict.pop('selector')

        def tran_operate(trans_id, tran_set, conf_dict, node):
            # Instantiate the transformer identified by trans_id with the
            # hyperparameters that belong to it, then apply it to the node.
            if trans_id != 'empty':
                _config = {}
                for key in conf_dict:
                    if trans_id in key:
                        config_name = key.split(':')[1]
                        _config[config_name] = conf_dict[key]
                tran = tran_set[trans_id](**_config)
                output_node = tran.operate(node)
                return output_node, tran
            return node, None

        _node = data_node.copy_()
        # Preprocessor1
        _node, pre1_tran = tran_operate(pre1_id, _preprocessor1, config_dict,
                                        _node)

        # Preprocessor2
        _node, pre2_tran = tran_operate(pre2_id, _preprocessor2, config_dict,
                                        _node)

        # Balancer
        _node, bal_tran = tran_operate(bal_id, _balancer, config_dict, _node)

        # Rescaler
        _node, res_tran = tran_operate(res_id, _rescaler, config_dict, _node)

        # Generator
        _node, gen_tran = tran_operate(gen_id, _generator, config_dict, _node)

        # Selector
        _node, sel_tran = tran_operate(sel_id, _selector, config_dict, _node)

        if record:
            return _node, [
                pre1_tran, pre2_tran, bal_tran, res_tran, gen_tran, sel_tran
            ]
        return _node
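
A hedged usage sketch for _parse (not part of the original example): pipeline, fe_config_space, and train_node below are assumed names standing in for the enclosing object, its feature-engineering ConfigurationSpace, and an input DataNode.

config = fe_config_space.sample_configuration()
# Transform only.
new_node = pipeline._parse(train_node, config)
# Transform and also return the fitted transformers.
new_node, transformers = pipeline._parse(train_node, config, record=True)
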
Example #2
    def __init__(self, task_type, trial_num,
                 classifier_ids: List[str], data: DataNode,
                 metric='acc',
                 ensemble_method='ensemble_selection',
                 ensemble_size=10,
                 per_run_time_limit=300,
                 output_dir="logs",
                 dataset_name='default_dataset',
                 eval_type='holdout',
                 share_feature=False,
                 inner_opt_algorithm='rb',
                 fe_algo='bo',
                 time_limit=None,
                 n_jobs=1,
                 seed=1):
        """
        :param classifier_ids: subset of {'adaboost','bernoulli_nb','decision_tree','extra_trees','gaussian_nb','gradient_boosting',
        'gradient_boosting','k_nearest_neighbors','lda','liblinear_svc','libsvm_svc','multinomial_nb','passive_aggressive','qda',
        'random_forest','sgd'}
        """
        self.timestamp = time.time()
        self.task_type = task_type
        self.metric = get_metric(metric)
        self.original_data = data.copy_()
        self.ensemble_method = ensemble_method
        self.ensemble_size = ensemble_size
        self.trial_num = trial_num
        self.n_jobs = n_jobs
        self.alpha = 4
        self.B = 0.01
        self.seed = seed
        self.shared_mode = share_feature
        self.output_dir = output_dir
        np.random.seed(self.seed)

        # Best configuration.
        self.optimal_algo_id = None
        self.nbest_algo_ids = None
        self.best_lower_bounds = None
        self.es = None

        # Set up backend.
        self.dataset_name = dataset_name
        self.time_limit = time_limit
        self.start_time = time.time()
        self.logger = get_logger('Soln-ml: %s' % dataset_name)

        # Bandit settings.
        self.incumbent_perf = -float("INF")
        self.arms = classifier_ids
        self.include_algorithms = classifier_ids
        self.rewards = dict()
        self.sub_bandits = dict()
        self.evaluation_cost = dict()
        self.fe_datanodes = dict()
        self.eval_type = eval_type
        self.fe_algo = fe_algo
        self.inner_opt_algorithm = inner_opt_algorithm
        for arm in self.arms:
            self.rewards[arm] = list()
            self.evaluation_cost[arm] = list()
            self.fe_datanodes[arm] = list()
            self.sub_bandits[arm] = SecondLayerBandit(
                self.task_type, arm, self.original_data,
                metric=self.metric,
                output_dir=output_dir,
                per_run_time_limit=per_run_time_limit,
                share_fe=self.shared_mode,
                seed=self.seed,
                eval_type=eval_type,
                dataset_id=dataset_name,
                n_jobs=self.n_jobs,
                fe_algo=fe_algo,
                mth=inner_opt_algorithm,
            )

        self.action_sequence = list()
        self.final_rewards = list()
        self.start_time = time.time()
        self.time_records = list()
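
A hedged instantiation sketch for the constructor above (not part of the original example): FirstLayerBandit is an assumed name for the owning class, cls_task stands for a classification task constant accepted as task_type, and train_node is an assumed, already-built DataNode.

bandit = FirstLayerBandit(cls_task, 50,
                          ['random_forest', 'libsvm_svc'],
                          train_node,
                          metric='acc',
                          ensemble_method='ensemble_selection',
                          ensemble_size=10,
                          per_run_time_limit=150,
                          inner_opt_algorithm='rb',
                          fe_algo='bo',
                          n_jobs=2,
                          seed=1)
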
Example #3
    def __init__(self,
                 node_list,
                 node_index,
                 task_type,
                 timestamp,
                 fe_config_space: ConfigurationSpace,
                 cash_config_space: ConfigurationSpace,
                 data: DataNode,
                 fixed_config=None,
                 time_limit=None,
                 trial_num=0,
                 metric='acc',
                 ensemble_method='ensemble_selection',
                 ensemble_size=50,
                 per_run_time_limit=300,
                 output_dir="logs",
                 dataset_name='default_dataset',
                 eval_type='holdout',
                 resampling_params=None,
                 n_jobs=1,
                 seed=1):
        """
        :param classifier_ids: subset of {'adaboost','bernoulli_nb','decision_tree','extra_trees','gaussian_nb','gradient_boosting',
        'gradient_boosting','k_nearest_neighbors','lda','liblinear_svc','libsvm_svc','multinomial_nb','passive_aggressive','qda',
        'random_forest','sgd'}
        """
        super(ConditioningBlock,
              self).__init__(node_list,
                             node_index,
                             task_type,
                             timestamp,
                             fe_config_space,
                             cash_config_space,
                             data,
                             fixed_config=fixed_config,
                             time_limit=time_limit,
                             trial_num=trial_num,
                             metric=metric,
                             ensemble_method=ensemble_method,
                             ensemble_size=ensemble_size,
                             per_run_time_limit=per_run_time_limit,
                             output_dir=output_dir,
                             dataset_name=dataset_name,
                             eval_type=eval_type,
                             resampling_params=resampling_params,
                             n_jobs=n_jobs,
                             seed=seed)

        # Best configuration.
        self.optimal_arm = None
        self.best_lower_bounds = None

        # Bandit settings.
        self.alpha = 4
        self.arms = list(
            cash_config_space.get_hyperparameter('algorithm').choices)
        self.rewards = dict()
        self.sub_bandits = dict()
        self.evaluation_cost = dict()

        self.arm_cost_stats = dict()
        for _arm in self.arms:
            self.arm_cost_stats[_arm] = list()

        for arm in self.arms:
            self.rewards[arm] = list()
            self.evaluation_cost[arm] = list()

            hps = cash_config_space.get_hyperparameters()
            cs = ConfigurationSpace()
            cs.add_hyperparameter(Constant('algorithm', arm))
            for hp in hps:
                if hp.name.split(':')[0] == arm:
                    cs.add_hyperparameter(hp)

            # Add the conditions that are still active in this arm's sub-space.
            conds = cash_config_space.get_conditions()
            for cond in conds:
                try:
                    cs.add_condition(cond)
                except Exception:
                    # Condition refers to hyperparameters outside this sub-space.
                    pass

            # Add the forbidden clauses that are still active in this arm's sub-space.
            forbids = cash_config_space.get_forbiddens()
            for forbid in forbids:
                try:
                    cs.add_forbidden_clause(forbid)
                except Exception:
                    # Clause refers to hyperparameters outside this sub-space.
                    pass

            from solnml.blocks.block_utils import get_node_type
            child_type = get_node_type(node_list, node_index + 1)
            self.sub_bandits[arm] = child_type(
                node_list,
                node_index + 1,
                task_type,
                timestamp,
                deepcopy(fe_config_space),
                deepcopy(cs),
                data.copy_(),
                fixed_config=fixed_config,
                time_limit=time_limit,
                metric=metric,
                ensemble_method=ensemble_method,
                ensemble_size=ensemble_size,
                per_run_time_limit=per_run_time_limit,
                output_dir=output_dir,
                dataset_name=dataset_name,
                eval_type=eval_type,
                resampling_params=resampling_params,
                n_jobs=n_jobs,
                seed=seed)

        self.action_sequence = list()
        self.final_rewards = list()
        self.start_time = time.time()
        self.time_records = list()

        # Initialize the parameters.
        self.pull_cnt = 0
        self.pick_id = 0
        self.update_cnt = 0
        arm_num = len(self.arms)
        self.optimal_algo_id = None
        self.arm_candidate = self.arms.copy()
        self.best_lower_bounds = np.zeros(arm_num)
        _iter_id = 0
        if self.time_limit is None:
            if arm_num * self.alpha > self.trial_num:
                raise ValueError('Trial number should be at least %d.' %
                                 (arm_num * self.alpha))
        else:
            self.trial_num = MAX_INT
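
The per-arm sub-space construction above can be reproduced with a small, self-contained ConfigSpace sketch. The space below is made up; it only mirrors the '<algorithm>:<hyperparameter>' naming convention and the classic ConfigSpace calls already used in the snippet.

from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import (CategoricalHyperparameter, Constant,
                                         UniformFloatHyperparameter,
                                         UniformIntegerHyperparameter)

# A made-up CASH space: two candidate algorithms, one hyperparameter each.
cash_cs = ConfigurationSpace()
cash_cs.add_hyperparameter(CategoricalHyperparameter('algorithm', ['libsvm_svc', 'random_forest']))
cash_cs.add_hyperparameter(UniformFloatHyperparameter('libsvm_svc:C', 0.03125, 32768.0, log=True))
cash_cs.add_hyperparameter(UniformIntegerHyperparameter('random_forest:n_estimators', 50, 500))

# Same filtering rule as the loop above: pin 'algorithm' to a Constant and keep
# only the hyperparameters whose '<algorithm>:' prefix matches the chosen arm.
arm = 'libsvm_svc'
sub_cs = ConfigurationSpace()
sub_cs.add_hyperparameter(Constant('algorithm', arm))
for hp in cash_cs.get_hyperparameters():
    if hp.name.split(':')[0] == arm:
        sub_cs.add_hyperparameter(hp)
print(sub_cs)
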
Example #4
    def __init__(self,
                 task_type,
                 estimator_id: str,
                 data: DataNode,
                 metric,
                 share_fe=False,
                 output_dir='logs',
                 per_run_time_limit=120,
                 per_run_mem_limit=5120,
                 dataset_id='default',
                 eval_type='holdout',
                 mth='rb',
                 sw_size=3,
                 n_jobs=1,
                 seed=1,
                 fe_algo='tree_based',
                 enable_intersection=True,
                 number_of_unit_resource=2,
                 total_resource=30):
        self.task_type = task_type
        self.metric = metric
        self.number_of_unit_resource = number_of_unit_resource
        # One unit of resource, i.e., the number of trials per iteration.
        self.one_unit_of_resource = 5
        self.total_resource = total_resource
        self.per_run_time_limit = per_run_time_limit
        self.per_run_mem_limit = per_run_mem_limit
        self.estimator_id = estimator_id
        self.evaluation_type = eval_type
        self.original_data = data.copy_()
        self.share_fe = share_fe
        self.output_dir = output_dir
        self.n_jobs = n_jobs
        self.mth = mth
        self.seed = seed
        self.sliding_window_size = sw_size
        task_id = '%s-%d-%s' % (dataset_id, seed, estimator_id)
        self.logger = get_logger(self.__class__.__name__ + '-' + task_id)
        np.random.seed(self.seed)

        # Bandit settings.
        # self.arms = ['fe', 'hpo']
        self.arms = ['hpo', 'fe']
        self.rewards = dict()
        self.optimizer = dict()
        self.evaluation_cost = dict()
        self.update_flag = dict()
        # Global incumbent.
        self.inc = dict()
        self.local_inc = dict()
        self.local_hist = {'fe': [], 'hpo': []}
        for arm in self.arms:
            self.rewards[arm] = list()
            self.update_flag[arm] = False
            self.evaluation_cost[arm] = list()
        self.pull_cnt = 0
        self.action_sequence = list()
        self.final_rewards = list()
        self.incumbent_perf = float("-INF")
        self.early_stopped_flag = False
        self.enable_intersection = enable_intersection

        # Fetch hyperparameter space.
        if self.task_type in CLS_TASKS:
            from solnml.components.models.classification import _classifiers, _addons
            if estimator_id in _classifiers:
                clf_class = _classifiers[estimator_id]
            elif estimator_id in _addons.components:
                clf_class = _addons.components[estimator_id]
            else:
                raise ValueError("Algorithm %s not supported!" % estimator_id)
            cs = clf_class.get_hyperparameter_search_space()
            model = UnParametrizedHyperparameter("estimator", estimator_id)
            cs.add_hyperparameter(model)
        elif self.task_type in REG_TASKS:
            from solnml.components.models.regression import _regressors, _addons
            if estimator_id in _regressors:
                reg_class = _regressors[estimator_id]
            elif estimator_id in _addons.components:
                reg_class = _addons.components[estimator_id]
            else:
                raise ValueError("Algorithm %s not supported!" % estimator_id)
            cs = reg_class.get_hyperparameter_search_space()
            model = UnParametrizedHyperparameter("estimator", estimator_id)
            cs.add_hyperparameter(model)
        else:
            raise ValueError("Unknown task type %s!" % self.task_type)

        self.config_space = cs
        self.default_config = cs.get_default_configuration()
        self.config_space.seed(self.seed)

        # Build the Feature Engineering component.
        if self.task_type in CLS_TASKS:
            fe_evaluator = ClassificationEvaluator(
                self.default_config,
                scorer=self.metric,
                name='fe',
                resampling_strategy=self.evaluation_type,
                seed=self.seed)
            hpo_evaluator = ClassificationEvaluator(
                self.default_config,
                scorer=self.metric,
                data_node=self.original_data,
                name='hpo',
                resampling_strategy=self.evaluation_type,
                seed=self.seed)
        elif self.task_type in REG_TASKS:
            fe_evaluator = RegressionEvaluator(
                self.default_config,
                scorer=self.metric,
                name='fe',
                resampling_strategy=self.evaluation_type,
                seed=self.seed)
            hpo_evaluator = RegressionEvaluator(
                self.default_config,
                scorer=self.metric,
                data_node=self.original_data,
                name='hpo',
                resampling_strategy=self.evaluation_type,
                seed=self.seed)
        else:
            raise ValueError('Invalid task type!')

        self.fe_algo = fe_algo
        self.optimizer['fe'] = build_fe_optimizer(self.fe_algo,
                                                  self.evaluation_type,
                                                  self.task_type,
                                                  self.original_data,
                                                  fe_evaluator,
                                                  estimator_id,
                                                  per_run_time_limit,
                                                  per_run_mem_limit,
                                                  self.seed,
                                                  shared_mode=self.share_fe,
                                                  n_jobs=n_jobs)

        self.inc['fe'], self.local_inc['fe'] = self.original_data, self.original_data

        # Build the HPO component.
        # trials_per_iter = max(len(self.optimizer['fe'].trans_types), 20)
        trials_per_iter = self.one_unit_of_resource * self.number_of_unit_resource

        self.optimizer['hpo'] = build_hpo_optimizer(
            self.evaluation_type,
            hpo_evaluator,
            cs,
            output_dir=output_dir,
            per_run_time_limit=per_run_time_limit,
            trials_per_iter=trials_per_iter,
            seed=self.seed,
            n_jobs=n_jobs)

        self.inc['hpo'], self.local_inc['hpo'] = self.default_config, self.default_config
        self.init_config = cs.get_default_configuration()
        self.local_hist['fe'].append(self.original_data)
        self.local_hist['hpo'].append(self.default_config)
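
The search-space fetch at the top of this constructor can also be exercised on its own. A minimal sketch for the classification branch, reusing only imports and calls that already appear in the snippet (plus the ConfigSpace import it implies):

from solnml.components.models.classification import _classifiers
from ConfigSpace.hyperparameters import UnParametrizedHyperparameter

estimator_id = 'random_forest'
cs = _classifiers[estimator_id].get_hyperparameter_search_space()
cs.add_hyperparameter(UnParametrizedHyperparameter('estimator', estimator_id))
print(cs.get_default_configuration())
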
Example #5
    def get_data_node(self, X, y):
        if self.feature_types is None:
            raise ValueError("Feature type missing")
        return DataNode([X, y],
                        self.feature_types,
                        feature_names=self.feature_names)
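
For comparison, the same node can be built directly with the DataNode constructor, as a minimal self-contained sketch (the arrays and feature metadata below are made up; the imports mirror Example #7):

import numpy as np
from solnml.components.feature_engineering.transformation_graph import DataNode
from solnml.components.utils.constants import NUMERICAL, DISCRETE

X = np.array([[0.5, 1], [1.5, 2], [2.5, 3]])
y = np.array([0, 1, 0])
node = DataNode([X, y], [NUMERICAL, DISCRETE], feature_names=['f0', 'f1'])
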
Example #6
    def __init__(self,
                 node_list,
                 node_index,
                 task_type,
                 timestamp,
                 fe_config_space: ConfigurationSpace,
                 cash_config_space: ConfigurationSpace,
                 data: DataNode,
                 fixed_config=None,
                 time_limit=None,
                 trial_num=0,
                 metric='acc',
                 ensemble_method='ensemble_selection',
                 ensemble_size=50,
                 per_run_time_limit=300,
                 output_dir="logs",
                 dataset_name='default_dataset',
                 eval_type='holdout',
                 resampling_params=None,
                 n_jobs=1,
                 seed=1):
        super(AlternatingBlock,
              self).__init__(node_list,
                             node_index,
                             task_type,
                             timestamp,
                             fe_config_space,
                             cash_config_space,
                             data,
                             fixed_config=fixed_config,
                             time_limit=time_limit,
                             trial_num=trial_num,
                             metric=metric,
                             ensemble_method=ensemble_method,
                             ensemble_size=ensemble_size,
                             per_run_time_limit=per_run_time_limit,
                             output_dir=output_dir,
                             dataset_name=dataset_name,
                             eval_type=eval_type,
                             resampling_params=resampling_params,
                             n_jobs=n_jobs,
                             seed=seed)

        self.arms = ['hpo', 'fe']
        self.optimal_algo_id = None
        self.first_start = True
        self.sub_bandits = dict()
        self.rewards = dict()
        self.evaluation_cost = dict()
        self.update_flag = dict()

        # Global incumbent.
        self.init_config = {
            'fe': fe_config_space.get_default_configuration().get_dictionary().copy(),
            'hpo': cash_config_space.get_default_configuration().get_dictionary().copy()
        }
        self.inc = {
            'fe': fe_config_space.get_default_configuration().get_dictionary().copy(),
            'hpo': cash_config_space.get_default_configuration().get_dictionary().copy()
        }
        self.local_inc = {
            'fe': fe_config_space.get_default_configuration().get_dictionary().copy(),
            'hpo': cash_config_space.get_default_configuration().get_dictionary().copy()
        }
        self.local_hist = {'fe': [], 'hpo': []}
        self.inc_record = {'fe': list(), 'hpo': list()}
        self.exp_output = dict()
        self.eval_dict = dict()
        self.arm_eval_dict = {'fe': dict(), 'hpo': dict()}
        for arm in self.arms:
            self.rewards[arm] = list()
            self.update_flag[arm] = False
            self.evaluation_cost[arm] = list()
            self.exp_output[arm] = dict()
        self.pull_cnt = 0
        self.action_sequence = list()
        self.final_rewards = list()

        for arm in self.arms:
            if arm == 'fe':
                from solnml.blocks.block_utils import get_node_type
                child_type = get_node_type(node_list, node_index + 1)
                self.sub_bandits[arm] = child_type(
                    node_list,
                    node_index + 1,
                    task_type,
                    timestamp,
                    fe_config_space,
                    None,
                    data.copy_(),
                    fixed_config=self.init_config['hpo'],
                    time_limit=time_limit,
                    metric=metric,
                    ensemble_method=ensemble_method,
                    ensemble_size=ensemble_size,
                    per_run_time_limit=per_run_time_limit,
                    output_dir=output_dir,
                    dataset_name=dataset_name,
                    eval_type=eval_type,
                    resampling_params=resampling_params,
                    n_jobs=n_jobs,
                    seed=seed)
            else:
                from solnml.blocks.block_utils import get_node_type
                child_type = get_node_type(node_list, node_index + 2)
                self.sub_bandits[arm] = child_type(
                    node_list,
                    node_index + 2,
                    task_type,
                    timestamp,
                    None,
                    cash_config_space,
                    data.copy_(),
                    fixed_config=self.init_config['fe'],
                    time_limit=time_limit,
                    metric=metric,
                    ensemble_method=ensemble_method,
                    ensemble_size=ensemble_size,
                    per_run_time_limit=per_run_time_limit,
                    output_dir=output_dir,
                    dataset_name=dataset_name,
                    eval_type=eval_type,
                    resampling_params=resampling_params,
                    n_jobs=n_jobs,
                    seed=seed)

        self.topk_saver = CombinedTopKModelSaver(k=50,
                                                 model_dir=self.output_dir,
                                                 identifier=self.timestamp)
Example #7
import numpy as np
import os
import sys

sys.path.append(os.getcwd())

from solnml.components.feature_engineering.transformations.preprocessor.text2vector import \
    Text2VectorTransformation
from solnml.components.feature_engineering.transformation_graph import DataNode
from solnml.components.utils.constants import *
from solnml.estimators import Classifier

# Toy dataset with numerical, text, text, and discrete columns.
x = np.array([[1, 'I am good', 'I am right', 3],
              [2, 'He is good', 'He is ok', 4],
              [2.5, 'Everyone is good', 'Everyone is ok', 7],
              [1.3333, 'well', 'what', 5]])
y = np.array([0, 1, 0, 1])

t2v = Text2VectorTransformation()
data = (x, y)
# Per-column feature types tell the pipeline which columns need text vectorization.
feature_type = [NUMERICAL, TEXT, TEXT, DISCRETE]
datanode = DataNode(data, feature_type)

clf = Classifier(time_limit=20,
                 enable_meta_algorithm_selection=False,
                 include_algorithms=['random_forest'])

clf.fit(datanode, opt_strategy='combined')
print(clf.predict(datanode))
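
A hedged follow-up sketch (not in the original example): build a second DataNode with the same feature types for held-out rows and reuse the fitted classifier on it.

x_test = np.array([[1.1, 'She is good', 'She is ok', 6],
                   [2.2, 'nothing', 'bad', 3]])
y_test = np.array([1, 0])
test_node = DataNode((x_test, y_test), feature_type)
print(clf.predict(test_node))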