Example no. 1
 def __init__(self, evaluator: Evaluator, config_space, seed=None):
     self.evaluator = evaluator
     self.config_space = config_space
     self.seed = np.random.randint(MAX_INT) if seed is None else seed
     self.start_time = time.time()
     self.timing_list = list()
     self.incumbent = None
     self.logger = get_logger(__class__.__name__)
Example no. 2
 def _get_logger(self, name):
     import os
     logger_name = 'AutomlToolkit-%s-%d:%s' % (self.task_id, self._seed,
                                               name)
     setup_logger(
         os.path.join(self.tmp_directory, '%s.log' % str(logger_name)),
         self.logging_config,
     )
     return get_logger(logger_name)
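
The `setup_logger` and `get_logger` helpers used throughout these examples come from the toolkit's logging utilities and are not shown here. Below is a minimal sketch of what such helpers might look like, assuming a file handler on top of the standard library's named loggers; the signatures and behavior are assumptions for illustration, not the toolkit's actual implementation.

import logging
import logging.config


def setup_logger(output_path, logging_config=None):
    # Assumption: apply an optional dictConfig, then add a file handler
    # writing to the given path.
    if logging_config is not None:
        logging.config.dictConfig(logging_config)
    handler = logging.FileHandler(output_path)
    handler.setFormatter(logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s'))
    root = logging.getLogger()
    root.addHandler(handler)
    root.setLevel(logging.INFO)


def get_logger(name):
    # Assumption: thin wrapper around the standard library's named loggers.
    return logging.getLogger(name)
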
Example no. 3
 def __init__(self, name, datanode, seed=1):
     self.name = name
     self._seed = seed
     self.incumbent = datanode
     self.root_node = datanode
     self.graph = TransformationGraph()
     self.graph.add_node(self.root_node)
     self.time_budget = None
     self.maximum_evaluation_num = None
     logger_name = '%s(%d)' % (self.name, self._seed)
     self.logger = get_logger(logger_name)
Example no. 4
 def __init__(self,
              clf_config,
              data_node=None,
              name=None,
              resampling_strategy='cv',
              cv=5,
              seed=1):
     self.clf_config = clf_config
     self.data_node = data_node
     self.name = name
     self.resampling_strategy = resampling_strategy
     self.cv = cv
     self.seed = seed
     self.eval_id = 0
     self.logger = get_logger('Evaluator-%s' % self.name)
Example no. 5
 def __init__(self,
              reg_config,
              scorer=None,
              data_node=None,
              name=None,
              resampling_strategy='holdout',
              resampling_params=None,
              seed=1,
              estimator=None):
     self.reg_config = reg_config
     self.scorer = scorer
     self.data_node = data_node
     self.name = name
     self.estimator = estimator
     self.resampling_strategy = resampling_strategy
     self.resampling_params = resampling_params
     self.seed = seed
     self.eval_id = 0
     self.logger = get_logger('RegressionEvaluator-%s' % self.name)
Example no. 6
 def __init__(self,
              clf_config,
              scorer=None,
              data_node=None,
              name=None,
              resampling_strategy='cv',
              resampling_params=None,
              seed=1):
     self.resampling_strategy = resampling_strategy
     self.resampling_params = resampling_params
     self.clf_config = clf_config
     self.scorer = scorer if scorer is not None else balanced_accuracy_scorer
     self.data_node = data_node
     self.name = name
     self.seed = seed
     self.eval_id = 0
     self.logger = get_logger('Evaluator-%s' % self.name)
     self.init_params = None
     self.fit_params = None
Example no. 7
 def _get_logger(self, name):
     logger_name = 'AutomlToolkit_%s' % name
     setup_logger(os.path.join(self.tmp_directory, '%s.log' % str(logger_name)),
                  self.logging_config,
                  )
     return get_logger(logger_name)
Example no. 8
def calculate_all_metafeatures(X,
                               y,
                               categorical,
                               dataset_name,
                               task_type,
                               calculate=None,
                               dont_calculate=None,
                               densify_threshold=1000):
    """Calculate all metafeatures."""
    logger = get_logger(__name__)
    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    visited = set()
    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None

    func_cls = [
        'NumberOfClasses', 'LogNumberOfFeatures', 'ClassProbabilityMin',
        'ClassProbabilityMax', 'ClassProbabilityMean', "ClassProbabilitySTD",
        'ClassEntropy', 'LandmarkLDA', 'LandmarkNaiveBayes',
        'LandmarkDecisionTree', 'LandmarkDecisionNodeLearner',
        'LandmarkRandomNodeLearner', 'LandmarkWorstNodeLearner', 'Landmark1NN'
    ]

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while len(to_visit) > 0:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue
        if name in func_cls and task_type not in CLS_TASKS:
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # TODO make sure this is done as efficiently as possible (no copy
                # for sparse matrices because of the wrong sparse format)
                sparse = scipy.sparse.issparse(X)
                if any(categorical):
                    # NOTE: `categorical_features` is the pre-0.22 scikit-learn
                    # OneHotEncoder API; newer releases use a ColumnTransformer.
                    ohe = OneHotEncoder(categorical_features=categorical,
                                        sparse=True)
                    X_transformed = ohe.fit_transform(X)
                else:
                    X_transformed = X
                imputer = SimpleImputer(strategy='mean', copy=False)
                X_transformed = imputer.fit_transform(X_transformed)
                center = not scipy.sparse.isspmatrix(X_transformed)
                standard_scaler = StandardScaler(copy=False, with_mean=center)
                X_transformed = standard_scaler.fit_transform(X_transformed)
                categorical_transformed = [False] * X_transformed.shape[1]

                # Densify the transformed matrix
                if not sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = X_transformed.shape[0] * X_transformed.shape[1]
                    megabytes_required = num_elements * bytes_per_float / 1000 / 1000
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # Shuffling is not only important for datasets that happen to be
                # sorted in an unusual way; it also prevents LDA from failing in
                # some cases. Because this is advanced indexing, a copy of the
                # data is returned.
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(dependency):
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.info("%s: Going to calculate: %s", dataset_name,
                            dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.info("%s: Going to calculate: %s", dataset_name, name)

        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value
        visited.add(name)

    mf_ = DatasetMetafeatures(dataset_name, mf_, task_type=task_type)
    return mf_
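
Example no. 8 resolves metafeature dependencies lazily: when a name is popped from the right end of the deque but its dependency has not been computed yet, the name is pushed back onto the left end and revisited later. The self-contained sketch below illustrates that pattern with a hypothetical registry standing in for the `metafeatures` object and its `get_dependency` lookup; the two metafeature names mirror real ones, but the registry itself is only illustrative.

import math
from collections import deque

# Hypothetical registry: name -> (dependency, function).
registry = {
    'NumberOfInstances': (None, lambda X, cache: len(X)),
    'LogNumberOfInstances': ('NumberOfInstances',
                             lambda X, cache: math.log(cache['NumberOfInstances'])),
}


def calculate_all(X):
    cache = {}
    to_visit = deque(registry)
    while to_visit:
        name = to_visit.pop()
        dependency, func = registry[name]
        if dependency is not None and dependency not in cache:
            # Dependency not computed yet: requeue the name and retry later.
            to_visit.appendleft(name)
            continue
        cache[name] = func(X, cache)
    return cache


print(calculate_all([[0.1], [0.2], [0.3]]))
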
Example no. 9
 def __init__(self):
     self.logger = get_logger(__name__)
Example no. 10
    def __init__(self, classifier_id: str, data: DataNode,
                 share_fe=False, output_dir='logs',
                 per_run_time_limit=120,
                 per_run_mem_limit=5120,
                 eval_type='cv', dataset_id='default',
                 mth='rb', sw_size=3, strategy='avg',
                 n_jobs=1, seed=1):
        self.per_run_time_limit = per_run_time_limit
        self.per_run_mem_limit = per_run_mem_limit
        self.classifier_id = classifier_id
        self.evaluation_type = eval_type
        self.original_data = data.copy_()
        self.share_fe = share_fe
        self.output_dir = output_dir
        self.mth = mth
        self.strategy = strategy
        self.seed = seed
        self.sliding_window_size = sw_size
        self.logger = get_logger('%s:%s-%d=>%s' % (__class__.__name__, dataset_id, seed, classifier_id))
        np.random.seed(self.seed)

        # Bandit settings.
        self.arms = ['fe', 'hpo']
        self.rewards = dict()
        self.optimizer = dict()
        self.evaluation_cost = dict()
        self.inc = dict()
        self.local_inc = dict()
        for arm in self.arms:
            self.rewards[arm] = list()
            self.evaluation_cost[arm] = list()
        self.pull_cnt = 0
        self.action_sequence = list()
        self.final_rewards = list()
        self.incumbent_perf = -1.
        self.incumbent_source = None
        self.update_flag = dict()
        self.imp_rewards = dict()
        for arm in self.arms:
            self.update_flag[arm] = True
            self.imp_rewards[arm] = list()

        from autosklearn.pipeline.components.classification import _classifiers
        clf_class = _classifiers[classifier_id]
        cs = clf_class.get_hyperparameter_search_space()
        model = UnParametrizedHyperparameter("estimator", classifier_id)
        cs.add_hyperparameter(model)
        self.config_space = cs
        self.default_config = cs.get_default_configuration()
        self.config_space.seed(self.seed)

        # Build the Feature Engineering component.
        fe_evaluator = Evaluator(self.default_config,
                                 name='fe', resampling_strategy=self.evaluation_type,
                                 seed=self.seed)
        self.optimizer['fe'] = EvaluationBasedOptimizer(
                self.original_data, fe_evaluator,
                classifier_id, per_run_time_limit, per_run_mem_limit, self.seed,
                shared_mode=self.share_fe, n_jobs=n_jobs)
        self.inc['fe'], self.local_inc['fe'] = self.original_data, self.original_data

        # Build the HPO component.
        trials_per_iter = len(self.optimizer['fe'].trans_types)
        hpo_evaluator = Evaluator(self.default_config,
                                  data_node=self.original_data, name='hpo',
                                  resampling_strategy=self.evaluation_type,
                                  seed=self.seed)
        if n_jobs == 1:
            self.optimizer['hpo'] = SMACOptimizer(
                hpo_evaluator, cs, output_dir=output_dir, per_run_time_limit=per_run_time_limit,
                trials_per_iter=trials_per_iter // 2, seed=self.seed)
        else:
            self.optimizer['hpo'] = PSMACOptimizer(
                hpo_evaluator, cs, output_dir=output_dir, per_run_time_limit=per_run_time_limit,
                trials_per_iter=trials_per_iter // 2, seed=self.seed,
                n_jobs=n_jobs
            )
        self.inc['hpo'], self.local_inc['hpo'] = self.default_config, self.default_config
Example no. 11
    def __init__(self, task_type, estimator_id: str, data: DataNode, metric,
                 share_fe=False, output_dir='logs',
                 per_run_time_limit=120,
                 per_run_mem_limit=5120,
                 dataset_id='default',
                 eval_type='holdout',
                 mth='rb', sw_size=3,
                 n_jobs=1, seed=1,
                 enable_intersection=True,
                 number_of_unit_resource=2):
        self.task_type = task_type
        self.metric = metric
        self.number_of_unit_resource = number_of_unit_resource
        # One unit of resource, i.e., the number of trials per iteration.
        self.one_unit_of_resource = 5
        self.per_run_time_limit = per_run_time_limit
        self.per_run_mem_limit = per_run_mem_limit
        self.estimator_id = estimator_id
        self.evaluation_type = eval_type
        self.original_data = data.copy_()
        self.share_fe = share_fe
        self.output_dir = output_dir
        self.mth = mth
        self.seed = seed
        self.n_jobs = n_jobs
        self.sliding_window_size = sw_size
        self.logger = get_logger('%s:%s-%d=>%s' % (
            __class__.__name__, dataset_id, seed, estimator_id))
        np.random.seed(self.seed)

        # Bandit settings.
        self.arms = ['fe', 'hpo']
        self.rewards = dict()
        self.optimizer = dict()
        self.evaluation_cost = dict()
        self.update_flag = dict()
        # Global incumbent.
        self.inc = dict()
        self.local_inc = dict()
        self.local_hist = {'fe': [], 'hpo': []}
        for arm in self.arms:
            self.rewards[arm] = list()
            self.update_flag[arm] = False
            self.evaluation_cost[arm] = list()
        self.pull_cnt = 0
        self.action_sequence = list()
        self.final_rewards = list()
        self.incumbent_perf = float("-INF")
        self.early_stopped_flag = False
        self.enable_intersection = enable_intersection

        # Fetch hyperparameter space.
        if self.task_type in CLS_TASKS:
            from automlToolkit.components.models.classification import _classifiers, _addons
            if estimator_id in _classifiers:
                clf_class = _classifiers[estimator_id]
            elif estimator_id in _addons.components:
                clf_class = _addons.components[estimator_id]
            else:
                raise ValueError("Algorithm %s not supported!" % estimator_id)
            cs = clf_class.get_hyperparameter_search_space()
            model = UnParametrizedHyperparameter("estimator", estimator_id)
            cs.add_hyperparameter(model)
        elif self.task_type in REG_TASKS:
            from automlToolkit.components.models.regression import _regressors, _addons
            if estimator_id in _regressors:
                reg_class = _regressors[estimator_id]
            elif estimator_id in _addons.components:
                reg_class = _addons.components[estimator_id]
            else:
                raise ValueError("Algorithm %s not supported!" % estimator_id)
            cs = reg_class.get_hyperparameter_search_space()
            model = UnParametrizedHyperparameter("estimator", estimator_id)
            cs.add_hyperparameter(model)
        else:
            raise ValueError("Unknown task type %s!" % self.task_type)

        self.config_space = cs
        self.default_config = cs.get_default_configuration()
        self.config_space.seed(self.seed)

        # Build the Feature Engineering component.
        if self.task_type in CLS_TASKS:
            fe_evaluator = ClassificationEvaluator(self.default_config, scorer=self.metric,
                                                   name='fe', resampling_strategy=self.evaluation_type,
                                                   seed=self.seed)
            hpo_evaluator = ClassificationEvaluator(self.default_config, scorer=self.metric,
                                                    data_node=self.original_data, name='hpo',
                                                    resampling_strategy=self.evaluation_type,
                                                    seed=self.seed)
        elif self.task_type in REG_TASKS:
            fe_evaluator = RegressionEvaluator(self.default_config, scorer=self.metric,
                                               name='fe', resampling_strategy=self.evaluation_type,
                                               seed=self.seed)
            hpo_evaluator = RegressionEvaluator(self.default_config, scorer=self.metric,
                                                data_node=self.original_data, name='hpo',
                                                resampling_strategy=self.evaluation_type,
                                                seed=self.seed)
        else:
            raise ValueError('Invalid task type!')

        self.optimizer['fe'] = build_fe_optimizer(self.evaluation_type, self.task_type, self.original_data,
                                                  fe_evaluator, estimator_id, per_run_time_limit,
                                                  per_run_mem_limit, self.seed,
                                                  shared_mode=self.share_fe, n_jobs=n_jobs)

        self.inc['fe'], self.local_inc['fe'] = self.original_data, self.original_data

        # Build the HPO component.
        # trials_per_iter = max(len(self.optimizer['fe'].trans_types), 20)
        trials_per_iter = self.one_unit_of_resource * self.number_of_unit_resource

        self.optimizer['hpo'] = build_hpo_optimizer(self.evaluation_type, hpo_evaluator, cs, output_dir=output_dir,
                                                    per_run_time_limit=per_run_time_limit,
                                                    trials_per_iter=trials_per_iter,
                                                    seed=self.seed, n_jobs=n_jobs)

        self.inc['hpo'], self.local_inc['hpo'] = self.default_config, self.default_config
        self.local_hist['fe'].append(self.original_data)
        self.local_hist['hpo'].append(self.default_config)
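
Examples no. 10 and 11 only show the constructors; the per-arm bookkeeping they prepare (rewards, evaluation_cost, pull_cnt, action_sequence) is consumed by the bandit's pull loop. The rough sketch below shows how such a loop might alternate between the 'fe' and 'hpo' arms. It assumes each sub-optimizer exposes an iterate() step that returns the current performance; this is a hypothetical simplification for illustration, not the toolkit's actual scheduling policy.

def alternate_arms(bandit, pulls=10):
    # Simple round-robin pulls; the real strategy may weigh recent rewards
    # or stop an arm early once its improvement stalls.
    for _ in range(pulls):
        arm = bandit.arms[bandit.pull_cnt % len(bandit.arms)]
        perf = bandit.optimizer[arm].iterate()  # assumed per-arm step
        bandit.rewards[arm].append(perf)
        bandit.action_sequence.append(arm)
        bandit.final_rewards.append(perf)
        if perf > bandit.incumbent_perf:
            bandit.incumbent_perf = perf
        bandit.pull_cnt += 1
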