def __init__(self, evaluator: Evaluator, config_space, seed=None):
    self.evaluator = evaluator
    self.config_space = config_space
    # np.random.random_integers is deprecated (removed in NumPy 1.17);
    # np.random.randint draws from [0, MAX_INT) instead of [1, MAX_INT].
    self.seed = np.random.randint(MAX_INT) if seed is None else seed
    self.start_time = time.time()
    self.timing_list = list()
    self.incumbent = None
    self.logger = get_logger(__class__.__name__)
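# A minimal sketch of how a concrete optimizer might build on this base
# __init__. The subclass name, the BaseOptimizer parent, and the assumption
# that the evaluator is callable on a configuration are illustrative, not
# taken from this file.
class RandomSearchOptimizer(BaseOptimizer):
    def iterate(self):
        # Sample one configuration, score it, and keep the best-so-far.
        config = self.config_space.sample_configuration()
        _start = time.time()
        perf = self.evaluator(config)
        self.timing_list.append(time.time() - _start)
        if self.incumbent is None or perf > self.incumbent[1]:
            self.incumbent = (config, perf)
        return self.incumbent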
def _get_logger(self, name):
    import os
    logger_name = 'AutomlToolkit-%s-%d:%s' % (self.task_id, self._seed, name)
    setup_logger(os.path.join(self.tmp_directory, '%s.log' % str(logger_name)),
                 self.logging_config)
    return get_logger(logger_name)
def __init__(self, name, datanode, seed=1):
    self.name = name
    self._seed = seed
    self.incumbent = datanode
    self.root_node = datanode
    self.graph = TransformationGraph()
    self.graph.add_node(self.root_node)
    self.time_budget = None
    self.maximum_evaluation_num = None
    logger_name = '%s(%d)' % (self.name, self._seed)
    self.logger = get_logger(logger_name)
def __init__(self, clf_config, data_node=None, name=None,
             resampling_strategy='cv', cv=5, seed=1):
    self.clf_config = clf_config
    self.data_node = data_node
    self.name = name
    self.resampling_strategy = resampling_strategy
    self.cv = cv
    self.seed = seed
    self.eval_id = 0
    self.logger = get_logger('Evaluator-%s' % self.name)
def __init__(self, reg_config, scorer=None, data_node=None, name=None,
             resampling_strategy='holdout', resampling_params=None,
             seed=1, estimator=None):
    self.reg_config = reg_config
    self.scorer = scorer
    self.data_node = data_node
    self.name = name
    self.estimator = estimator
    self.resampling_strategy = resampling_strategy
    self.resampling_params = resampling_params
    self.seed = seed
    self.eval_id = 0
    self.logger = get_logger('RegressionEvaluator-%s' % self.name)
def __init__(self, clf_config, scorer=None, data_node=None, name=None,
             resampling_strategy='cv', resampling_params=None, seed=1):
    self.resampling_strategy = resampling_strategy
    self.resampling_params = resampling_params
    self.clf_config = clf_config
    self.scorer = scorer if scorer is not None else balanced_accuracy_scorer
    self.data_node = data_node
    self.name = name
    self.seed = seed
    self.eval_id = 0
    self.logger = get_logger('Evaluator-%s' % self.name)
    self.init_params = None
    self.fit_params = None
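# Usage sketch (hedged): the class this __init__ belongs to is used as
# ClassificationEvaluator further below; clf_config and train_node stand in
# for a real ConfigSpace configuration and DataNode.
evaluator = ClassificationEvaluator(clf_config,
                                    scorer=None,  # falls back to balanced_accuracy_scorer
                                    data_node=train_node,
                                    name='hpo',
                                    resampling_strategy='cv',
                                    seed=1)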
def _get_logger(self, name):
    logger_name = 'AutomlToolkit_%s' % name
    setup_logger(os.path.join(self.tmp_directory, '%s.log' % str(logger_name)),
                 self.logging_config)
    return get_logger(logger_name)
# Module-level imports assumed by this function: numpy as np, scipy.sparse,
# collections.deque, sklearn.preprocessing.{OneHotEncoder, StandardScaler},
# sklearn.impute.SimpleImputer, sklearn.utils.check_array.
def calculate_all_metafeatures(X, y, categorical, dataset_name, task_type,
                               calculate=None, dont_calculate=None,
                               densify_threshold=1000):
    """Calculate all metafeatures."""
    logger = get_logger(__name__)
    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    visited = set()
    to_visit = deque()
    to_visit.extend(metafeatures)

    X_transformed = None
    y_transformed = None

    # Metafeatures that are only defined for classification tasks.
    func_cls = ['NumberOfClasses', 'LogNumberOfFeatures', 'ClassProbabilityMin',
                'ClassProbabilityMax', 'ClassProbabilityMean',
                'ClassProbabilitySTD', 'ClassEntropy', 'LandmarkLDA',
                'LandmarkNaiveBayes', 'LandmarkDecisionTree',
                'LandmarkDecisionNodeLearner', 'LandmarkRandomNodeLearner',
                'LandmarkWorstNodeLearner', 'Landmark1NN']

    # TODO: calculate the numpy metafeatures after all others to consume less
    # memory.
    while len(to_visit) > 0:
        name = to_visit.pop()
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue
        if name in func_cls and task_type not in CLS_TASKS:
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # TODO: make this as efficient as possible (no copy for sparse
                # matrices because of wrong sparse format).
                sparse = scipy.sparse.issparse(X)

                if any(categorical):
                    # NOTE: categorical_features was removed in scikit-learn
                    # 0.22; this call relies on the pre-0.22 API.
                    ohe = OneHotEncoder(categorical_features=categorical,
                                        sparse=True)
                    X_transformed = ohe.fit_transform(X)
                else:
                    X_transformed = X

                imputer = SimpleImputer(strategy='mean', copy=False)
                X_transformed = imputer.fit_transform(X_transformed)
                center = not scipy.sparse.isspmatrix(X_transformed)
                standard_scaler = StandardScaler(copy=False, with_mean=center)
                X_transformed = standard_scaler.fit_transform(X_transformed)
                categorical_transformed = [False] * X_transformed.shape[1]

                # Densify the transformed matrix if it stays under the
                # memory budget.
                if not sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = X_transformed.shape[0] * X_transformed.shape[1]
                    megabytes_required = num_elements * bytes_per_float / 1000 / 1000
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from failing
                # in some cases. Because this is advanced indexing, a copy of
                # the data is returned!
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO: shuffle in place.
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(dependency):
                # The dependency has not been computed yet; defer this
                # metafeature and revisit it later.
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(dependency):
                logger.info("%s: Going to calculate: %s", dataset_name, dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.info("%s: Going to calculate: %s", dataset_name, name)
        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value
        visited.add(name)

    mf_ = DatasetMetafeatures(dataset_name, mf_, task_type=task_type)
    return mf_
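# A minimal usage sketch for calculate_all_metafeatures, assuming a dense
# classification dataset with no categorical columns; load_iris and the
# CLS_TASKS[0] task-type constant are illustrative stand-ins for whatever
# DataNode and task id the toolkit actually passes in.
from sklearn.datasets import load_iris

X, y = load_iris(return_X_y=True)
categorical = [False] * X.shape[1]
mf = calculate_all_metafeatures(X, y, categorical,
                                dataset_name='iris',
                                task_type=CLS_TASKS[0])
print(mf)  # DatasetMetafeatures object holding the computed values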
def __init__(self):
    self.logger = get_logger(__name__)
def __init__(self, classifier_id: str, data: DataNode, share_fe=False,
             output_dir='logs', per_run_time_limit=120,
             per_run_mem_limit=5120, eval_type='cv', dataset_id='default',
             mth='rb', sw_size=3, strategy='avg', n_jobs=1, seed=1):
    self.per_run_time_limit = per_run_time_limit
    self.per_run_mem_limit = per_run_mem_limit
    self.classifier_id = classifier_id
    self.evaluation_type = eval_type
    self.original_data = data.copy_()
    self.share_fe = share_fe
    self.output_dir = output_dir
    self.mth = mth
    self.strategy = strategy
    self.seed = seed
    self.sliding_window_size = sw_size
    self.logger = get_logger('%s:%s-%d=>%s' % (
        __class__.__name__, dataset_id, seed, classifier_id))
    np.random.seed(self.seed)

    # Bandit settings.
    self.arms = ['fe', 'hpo']
    self.rewards = dict()
    self.optimizer = dict()
    self.evaluation_cost = dict()
    self.inc = dict()
    self.local_inc = dict()
    for arm in self.arms:
        self.rewards[arm] = list()
        self.evaluation_cost[arm] = list()
    self.pull_cnt = 0
    self.action_sequence = list()
    self.final_rewards = list()
    self.incumbent_perf = -1.
    self.incumbent_source = None

    self.update_flag = dict()
    self.imp_rewards = dict()
    for arm in self.arms:
        self.update_flag[arm] = True
        self.imp_rewards[arm] = list()

    # Fetch the hyperparameter space for the given classifier.
    from autosklearn.pipeline.components.classification import _classifiers
    clf_class = _classifiers[classifier_id]
    cs = clf_class.get_hyperparameter_search_space()
    model = UnParametrizedHyperparameter("estimator", classifier_id)
    cs.add_hyperparameter(model)
    self.config_space = cs
    self.default_config = cs.get_default_configuration()
    self.config_space.seed(self.seed)

    # Build the Feature Engineering component.
    fe_evaluator = Evaluator(self.default_config, name='fe',
                             resampling_strategy=self.evaluation_type,
                             seed=self.seed)
    self.optimizer['fe'] = EvaluationBasedOptimizer(
        self.original_data, fe_evaluator, classifier_id,
        per_run_time_limit, per_run_mem_limit, self.seed,
        shared_mode=self.share_fe, n_jobs=n_jobs)
    self.inc['fe'], self.local_inc['fe'] = self.original_data, self.original_data

    # Build the HPO component.
    trials_per_iter = len(self.optimizer['fe'].trans_types)
    hpo_evaluator = Evaluator(self.default_config,
                              data_node=self.original_data, name='hpo',
                              resampling_strategy=self.evaluation_type,
                              seed=self.seed)
    if n_jobs == 1:
        self.optimizer['hpo'] = SMACOptimizer(
            hpo_evaluator, cs, output_dir=output_dir,
            per_run_time_limit=per_run_time_limit,
            trials_per_iter=trials_per_iter // 2, seed=self.seed)
    else:
        self.optimizer['hpo'] = PSMACOptimizer(
            hpo_evaluator, cs, output_dir=output_dir,
            per_run_time_limit=per_run_time_limit,
            trials_per_iter=trials_per_iter // 2, seed=self.seed,
            n_jobs=n_jobs)
    self.inc['hpo'], self.local_inc['hpo'] = self.default_config, self.default_config
def __init__(self, task_type, estimator_id: str, data: DataNode, metric,
             share_fe=False, output_dir='logs', per_run_time_limit=120,
             per_run_mem_limit=5120, dataset_id='default',
             eval_type='holdout', mth='rb', sw_size=3, n_jobs=1, seed=1,
             enable_intersection=True, number_of_unit_resource=2):
    self.task_type = task_type
    self.metric = metric
    self.number_of_unit_resource = number_of_unit_resource
    # One unit of resource, that is, the number of trials per iteration.
    self.one_unit_of_resource = 5
    self.per_run_time_limit = per_run_time_limit
    self.per_run_mem_limit = per_run_mem_limit
    self.estimator_id = estimator_id
    self.evaluation_type = eval_type
    self.original_data = data.copy_()
    self.share_fe = share_fe
    self.output_dir = output_dir
    self.mth = mth
    self.seed = seed
    self.n_jobs = n_jobs
    self.sliding_window_size = sw_size
    self.logger = get_logger('%s:%s-%d=>%s' % (
        __class__.__name__, dataset_id, seed, estimator_id))
    np.random.seed(self.seed)

    # Bandit settings.
    self.arms = ['fe', 'hpo']
    self.rewards = dict()
    self.optimizer = dict()
    self.evaluation_cost = dict()
    self.update_flag = dict()
    # Global incumbent.
    self.inc = dict()
    self.local_inc = dict()
    self.local_hist = {'fe': [], 'hpo': []}
    for arm in self.arms:
        self.rewards[arm] = list()
        self.update_flag[arm] = False
        self.evaluation_cost[arm] = list()
    self.pull_cnt = 0
    self.action_sequence = list()
    self.final_rewards = list()
    self.incumbent_perf = float("-INF")
    self.early_stopped_flag = False
    self.enable_intersection = enable_intersection

    # Fetch hyperparameter space.
    if self.task_type in CLS_TASKS:
        from automlToolkit.components.models.classification import _classifiers, _addons
        if estimator_id in _classifiers:
            clf_class = _classifiers[estimator_id]
        elif estimator_id in _addons.components:
            clf_class = _addons.components[estimator_id]
        else:
            raise ValueError("Algorithm %s not supported!" % estimator_id)
        cs = clf_class.get_hyperparameter_search_space()
        model = UnParametrizedHyperparameter("estimator", estimator_id)
        cs.add_hyperparameter(model)
    elif self.task_type in REG_TASKS:
        from automlToolkit.components.models.regression import _regressors, _addons
        if estimator_id in _regressors:
            reg_class = _regressors[estimator_id]
        elif estimator_id in _addons.components:
            reg_class = _addons.components[estimator_id]
        else:
            raise ValueError("Algorithm %s not supported!" % estimator_id)
        cs = reg_class.get_hyperparameter_search_space()
        model = UnParametrizedHyperparameter("estimator", estimator_id)
        cs.add_hyperparameter(model)
    else:
        raise ValueError("Unknown task type %s!" % self.task_type)

    self.config_space = cs
    self.default_config = cs.get_default_configuration()
    self.config_space.seed(self.seed)

    # Build the Feature Engineering component.
    if self.task_type in CLS_TASKS:
        fe_evaluator = ClassificationEvaluator(
            self.default_config, scorer=self.metric, name='fe',
            resampling_strategy=self.evaluation_type, seed=self.seed)
        hpo_evaluator = ClassificationEvaluator(
            self.default_config, scorer=self.metric,
            data_node=self.original_data, name='hpo',
            resampling_strategy=self.evaluation_type, seed=self.seed)
    elif self.task_type in REG_TASKS:
        fe_evaluator = RegressionEvaluator(
            self.default_config, scorer=self.metric, name='fe',
            resampling_strategy=self.evaluation_type, seed=self.seed)
        hpo_evaluator = RegressionEvaluator(
            self.default_config, scorer=self.metric,
            data_node=self.original_data, name='hpo',
            resampling_strategy=self.evaluation_type, seed=self.seed)
    else:
        raise ValueError('Invalid task type!')

    self.optimizer['fe'] = build_fe_optimizer(
        self.evaluation_type, self.task_type, self.original_data,
        fe_evaluator, estimator_id, per_run_time_limit, per_run_mem_limit,
        self.seed, shared_mode=self.share_fe, n_jobs=n_jobs)
    self.inc['fe'], self.local_inc['fe'] = self.original_data, self.original_data

    # Build the HPO component.
    # trials_per_iter = max(len(self.optimizer['fe'].trans_types), 20)
    trials_per_iter = self.one_unit_of_resource * self.number_of_unit_resource
    self.optimizer['hpo'] = build_hpo_optimizer(
        self.evaluation_type, hpo_evaluator, cs, output_dir=output_dir,
        per_run_time_limit=per_run_time_limit,
        trials_per_iter=trials_per_iter, seed=self.seed, n_jobs=n_jobs)
    self.inc['hpo'], self.local_inc['hpo'] = self.default_config, self.default_config

    self.local_hist['fe'].append(self.original_data)
    self.local_hist['hpo'].append(self.default_config)
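# Sketch of the bandit loop this constructor prepares (hedged): the play()
# helper and the assumption that both optimizers expose iterate() returning a
# reward are illustrative; the real class ships its own pull logic. It mirrors
# the rewards/action_sequence bookkeeping initialized above.
def play(bandit, n_pulls=10):
    for i in range(n_pulls):
        arm = bandit.arms[i % len(bandit.arms)]  # naive alternation, for illustration only
        reward = bandit.optimizer[arm].iterate()
        bandit.rewards[arm].append(reward)
        bandit.action_sequence.append(arm)
        bandit.pull_cnt += 1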