def _get_logger(self, name):
    """Create a file-backed logger named 'SolnML-<name>(<seed>)' under output_dir."""
    full_name = 'SolnML-%s(%d)' % (name, self.seed)
    log_path = os.path.join(self.output_dir, '%s.log' % str(full_name))
    setup_logger(log_path, self.logging_config)
    return get_logger(full_name)
def _get_logger(self, name):
    """Create a file-backed logger named 'solnml-<task_id>-<seed>:<name>' under tmp_directory."""
    import os
    qualified_name = 'solnml-%s-%d:%s' % (self.task_id, self._seed, name)
    log_file = os.path.join(self.tmp_directory, '%s.log' % str(qualified_name))
    setup_logger(log_file, self.logging_config)
    return get_logger(qualified_name)
def __init__(self, estimator_id, scorer=None, data_node=None, task_type=REGRESSION,
             resampling_strategy='cv', resampling_params=None, timestamp=None,
             output_dir=None, seed=1, if_imbal=False):
    # Evaluator bound to a single estimator id; keeps private train/val copies
    # of the data node and a combined top-k model saver in output_dir.
    self.resampling_strategy = resampling_strategy
    self.resampling_params = resampling_params
    self.estimator_id = estimator_id
    # Balanced accuracy is the default scorer when none is supplied.
    self.scorer = scorer if scorer is not None else balanced_accuracy_scorer
    self.task_type = task_type
    self.data_node = data_node
    self.output_dir = output_dir
    self.seed = seed
    self.onehot_encoder = None
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
    self.continue_training = False
    # Work on copies so evaluation never mutates the caller's node.
    self.train_node = data_node.copy_()
    self.val_node = data_node.copy_()
    self.timestamp = timestamp
    # NOTE(review): `if_imbal` is accepted but never stored here — confirm
    # whether imbalance handling is intentionally ignored by this evaluator.
    # TODO: Top-k k?
    self.topk_model_saver = CombinedTopKModelSaver(
        k=60, model_dir=self.output_dir, identifier=timestamp)
def __init__(self, evaluator: _BaseEvaluator, config_space, name, timestamp,
             eval_type, output_dir=None, seed=None):
    """Base optimizer state for an 'hpo' or 'fe' search stage.

    :param evaluator: evaluator used to score candidate configurations.
    :param config_space: search space to optimize over.
    :param name: stage name; must be 'hpo' or 'fe'.
    :param timestamp: run identifier used by the top-k model saver.
    :param eval_type: resampling/evaluation strategy name.
    :param output_dir: directory for saved models.
    :param seed: RNG seed; drawn uniformly from [1, MAX_INT] when None.
    """
    self.evaluator = evaluator
    self.config_space = config_space
    assert name in ['hpo', 'fe']
    self.name = name
    # FIX: np.random.random_integers is deprecated (removed in modern NumPy).
    # randint's high bound is exclusive, so MAX_INT + 1 reproduces the
    # original inclusive range [1, MAX_INT].
    self.seed = np.random.randint(1, MAX_INT + 1) if seed is None else seed
    self.start_time = time.time()
    self.timing_list = list()
    self.incumbent = None
    self.eval_type = eval_type
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
    self.init_hpo_iter_num = None
    self.early_stopped_flag = False
    self.timestamp = timestamp
    self.output_dir = output_dir
    self.topk_saver = CombinedTopKModelSaver(k=50, model_dir=self.output_dir,
                                             identifier=self.timestamp)
def __init__(self, clf_config, task_type, model_dir='data/dl_models/',
             max_epoch=150, scorer=None, dataset=None, continue_training=True,
             device='cpu', seed=1, timestamp=None, **kwargs):
    # Evaluator for deep-learning models: holds the HPO config, a deep copy
    # of the dataset, and a top-k model saver rooted at model_dir.
    self.hpo_config = clf_config
    self.task_type = task_type
    self.max_epoch = max_epoch
    # Plain accuracy is the default scorer when none is supplied.
    self.scorer = scorer if scorer is not None else accuracy_scorer
    # Deep copy so training-time mutations don't leak back to the caller.
    self.dataset = copy.deepcopy(dataset)
    self.continue_training = continue_training
    self.seed = seed
    self.timestamp = timestamp
    self.eval_id = 0
    self.onehot_encoder = None
    self.topk_model_saver = TopKModelSaver(k=20, model_dir=model_dir, identifier=timestamp)
    self.model_dir = model_dir
    self.device = device
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
    # Image classification additionally requires 'image_size' in kwargs
    # (KeyError otherwise).
    if task_type == IMG_CLS:
        self.image_size = kwargs['image_size']
def __init__(self, fixed_config=None, scorer=None, data_node=None, task_type=0,
             resampling_strategy='cv', resampling_params=None, timestamp=None,
             output_dir=None, seed=1, if_imbal=False):
    # Evaluator parameterised by a fixed (partial) configuration rather than
    # a single estimator id.
    self.resampling_strategy = resampling_strategy
    self.resampling_params = resampling_params
    self.fixed_config = fixed_config
    # Balanced accuracy is the default scorer when none is supplied.
    self.scorer = scorer if scorer is not None else balanced_accuracy_scorer
    self.if_imbal = if_imbal
    self.task_type = task_type
    self.data_node = data_node
    self.output_dir = output_dir
    self.seed = seed
    self.onehot_encoder = None
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
    self.continue_training = False
    # Private working copies so evaluation never mutates the caller's node.
    self.train_node = data_node.copy_()
    self.val_node = data_node.copy_()
    self.timestamp = timestamp
def __init__(self, clf_config, fe_config, estimator_id, if_imbal=False, scorer=None,
             data_node=None, name=None, resampling_strategy='cv',
             resampling_params=None, seed=1, timestamp=None, output_dir=None):
    # Joint evaluator holding both an algorithm (HPO) configuration and a
    # feature-engineering configuration for one estimator.
    self.resampling_strategy = resampling_strategy
    self.resampling_params = resampling_params
    self.hpo_config = clf_config
    # TODO: Optimize: Fit the same transformers only once
    self.fe_config = fe_config
    self.estimator_id = estimator_id
    # Balanced accuracy is the default scorer when none is supplied.
    self.scorer = scorer if scorer is not None else balanced_accuracy_scorer
    self.if_imbal = if_imbal
    self.data_node = data_node
    self.name = name
    self.seed = seed
    self.onehot_encoder = None
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
    self.output_dir = output_dir
    self.timestamp = timestamp
    # Private working copies of the data node.
    self.train_node = data_node.copy_()
    self.val_node = data_node.copy_()
    self.continue_training = False
    self.topk_model_saver = BanditTopKModelSaver(k=60, model_dir=self.output_dir,
                                                 identifier=timestamp)
def __init__(self, eval_func, config_space, seed=1, R=81, eta=3, n_jobs=1):
    # Multi-fidelity optimizer (MFSE-HB): Hyperband bracket bookkeeping plus
    # a weighted random-forest surrogate shared across resource levels.
    self.eval_func = eval_func
    self.config_space = config_space
    self.n_workers = n_jobs
    self.trial_cnt = 0
    self.configs = list()
    self.perfs = list()
    self.incumbent_perf = float("-INF")
    self.incumbent_config = self.config_space.get_default_configuration()
    self.incumbent_configs = list()
    self.incumbent_perfs = list()
    self.evaluation_stats = dict()
    self.evaluation_stats['timestamps'] = list()
    self.evaluation_stats['val_scores'] = list()
    self.global_start_time = time.time()
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)

    # Parameters in Hyperband framework.
    self.restart_needed = True
    self.R = R      # maximum resource assignable to one configuration
    self.eta = eta  # elimination ratio between successive rungs
    self.seed = seed
    self.logeta = lambda x: log(x) / log(self.eta)
    self.s_max = int(self.logeta(self.R))
    self.B = (self.s_max + 1) * self.R
    self.s_values = list(reversed(range(self.s_max + 1)))
    self.inner_iter_id = 0

    # Parameters in MFSE-HB.
    self.weight_update_id = 0
    self.iterate_r = []
    self.target_x = dict()
    self.target_y = dict()
    self.exp_output = dict()
    # Resource levels form the geometric sequence eta^0 .. eta^s_max; one
    # observation buffer (target_x / target_y) is kept per level.
    for index, item in enumerate(np.logspace(0, self.s_max, self.s_max + 1,
                                             base=self.eta)):
        r = int(item)
        self.iterate_r.append(r)
        self.target_x[r] = list()
        self.target_y[r] = list()

    types, bounds = get_types(self.config_space)
    self.num_config = len(bounds)
    # Equal initial weight on the s_max lower fidelities, zero on the top one;
    # 'gpoe' presumably selects a generalized-product-of-experts fusion —
    # confirm against WeightedRandomForestCluster.
    init_weight = [1. / self.s_max] * self.s_max + [0.]
    self.weighted_surrogate = WeightedRandomForestCluster(
        types, bounds, self.s_max, self.eta, init_weight, 'gpoe')
    self.weight_changed_cnt = 0
    self.hist_weights = list()

    # EI acquisition over the weighted surrogate, optimized by random sampling.
    self.weighted_acquisition_func = EI(model=self.weighted_surrogate)
    self.weighted_acq_optimizer = RandomSampling(self.weighted_acquisition_func,
                                                 self.config_space,
                                                 n_samples=2000,
                                                 rng=np.random.RandomState(seed))
    self.eval_dict = dict()
def __init__(self, name, task_type, datanode, seed=1):
    """Initialise the transformation-graph search state from a root data node."""
    self.name = name
    self.task_type = task_type
    self._seed = seed
    # The search starts from (and is initially incumbent at) a copy of the input.
    self.root_node = datanode.copy_()
    self.incumbent = self.root_node
    graph = TransformationGraph()
    graph.add_node(self.root_node)
    self.graph = graph
    # Budgets stay unset until configured by the caller.
    self.time_budget = None
    self.maximum_evaluation_num = None
    self.logger = get_logger("%s.%s" % (self.__module__, self.__class__.__name__))
def __init__(self, evaluator: _BaseEvaluator, config_space, seed=None):
    """Minimal base-optimizer state.

    :param evaluator: evaluator used to score candidate configurations.
    :param config_space: search space to optimize over.
    :param seed: RNG seed; drawn uniformly from [1, MAX_INT] when None.
    """
    self.evaluator = evaluator
    self.config_space = config_space
    # FIX: np.random.random_integers is deprecated (removed in modern NumPy).
    # randint's high bound is exclusive, so MAX_INT + 1 reproduces the
    # original inclusive range [1, MAX_INT].
    self.seed = np.random.randint(1, MAX_INT + 1) if seed is None else seed
    self.start_time = time.time()
    self.timing_list = list()
    self.incumbent = None
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
    self.init_hpo_iter_num = None
def __init__(self, n_algorithm=3, task_type=None, metric='acc', rep=3,
             total_resource=20, meta_algorithm='lightgbm',
             exclude_datasets=None, meta_dir=None):
    """Meta-learning algorithm recommender over built-in meta-run records.

    :param n_algorithm: number of algorithms to recommend.
    :param task_type: task constant; only classification is supported.
    :param metric: 'acc' or 'bal_acc'; anything else falls back to 'acc'.
    :param exclude_datasets: dataset names to drop from the meta corpus.
    :param meta_dir: meta-resource root; defaults to the bundled resource dir.
    """
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
    self.n_algorithm = n_algorithm
    self.n_algo_candidates = len(_buildin_algorithms)
    self.task_type = task_type
    self.meta_algo = meta_algorithm
    if task_type in CLS_TASKS:
        if metric not in ['acc', 'bal_acc']:
            self.logger.info(
                'Meta information about metric-%s does not exist, use accuracy instead.' % str(metric))
            metric = 'acc'
    elif task_type in REG_TASKS:
        raise NotImplementedError()
    else:
        # FIX: this branch rejects the task type, not the metric.
        raise ValueError('Invalid task type: %s.' % str(task_type))
    self.metric = metric
    self.rep = rep
    self.total_resource = total_resource
    self.exclude_datasets = exclude_datasets
    self.meta_learner = None
    buildin_loc = os.path.dirname(__file__) + '/../meta_resource/'
    self.meta_dir = meta_dir if meta_dir is not None else buildin_loc
    # FIX: normalize the trailing slash BEFORE any path is derived from
    # meta_dir; previously this happened after meta_runs_dir was built, so a
    # user-supplied dir without a trailing '/' produced a malformed path.
    if not self.meta_dir.endswith('/'):
        self.meta_dir += '/'

    # Hash the sorted exclusion list so caches keyed on it are stable.
    if self.exclude_datasets is None:
        self.hash_id = 'none'
    else:
        self.exclude_datasets = list(set(exclude_datasets))
        exclude_str = ','.join(sorted(self.exclude_datasets))
        md5 = hashlib.md5()
        md5.update(exclude_str.encode('utf-8'))
        self.hash_id = md5.hexdigest()

    # Collect the dataset names present in the meta-run records, minus the
    # excluded ones. Record files look like '<name>-a-b-c-d.pkl'.
    meta_datasets = set()
    meta_runs_dir = self.meta_dir + 'meta_runs/%s/' % self.metric
    for _record in os.listdir(meta_runs_dir):
        if _record.endswith('.pkl') and _record.find('-') != -1:
            meta_name = '-'.join(_record.split('-')[:-4])
            if self.exclude_datasets is not None and meta_name in self.exclude_datasets:
                continue
            meta_datasets.add(meta_name)
    self._buildin_datasets = list(meta_datasets)
def __init__(self, clf_config, scorer=None, data_node=None, name=None,
             resampling_strategy='cv', resampling_params=None, seed=1):
    """Hold the evaluation context for one classification configuration."""
    self.hpo_config = clf_config
    self.data_node = data_node
    self.name = name
    self.seed = seed
    self.resampling_strategy = resampling_strategy
    self.resampling_params = resampling_params
    # Balanced accuracy is the default metric when none is supplied.
    self.scorer = balanced_accuracy_scorer if scorer is None else scorer
    self.eval_id = 0
    self.onehot_encoder = None
    self.logger = get_logger("%s.%s" % (self.__module__, self.__class__.__name__))
def __init__(self, name, task_type, datanode, seed=1):
    """Initialise graph-search state; the logger is named '<name>(<seed>)'."""
    self.name = name
    self.task_type = task_type
    self._seed = seed
    # The search starts from (and is initially incumbent at) a copy of the input.
    self.root_node = datanode.copy_()
    self.incumbent = self.root_node
    graph = TransformationGraph()
    graph.add_node(self.root_node)
    self.graph = graph
    # Budgets stay unset until configured by the caller.
    self.time_budget = None
    self.maximum_evaluation_num = None
    self.logger = get_logger('%s(%d)' % (self.name, self._seed))
def __init__(self, reg_config, scorer=None, data_node=None, name=None,
             resampling_strategy='holdout', resampling_params=None, seed=1,
             estimator=None):
    """Hold the evaluation context for one regression configuration.

    Note: unlike the classification counterpart, no default scorer is
    substituted when `scorer` is None.
    """
    self.hpo_config = reg_config
    self.scorer = scorer
    self.data_node = data_node
    self.name = name
    self.estimator = estimator
    self.seed = seed
    self.resampling_strategy = resampling_strategy
    self.resampling_params = resampling_params
    self.eval_id = 0
    self.logger = get_logger("%s.%s" % (self.__module__, self.__class__.__name__))
def __init__(self, n_algorithm=3, task_type=None, metric='acc',
             exclude_datasets=None):
    # Recommender variant fixed to the 'lightgbm' meta-algorithm; the trained
    # meta-model is attached later via self.model.
    # NOTE(review): this logger assignment appears redundant — the base-class
    # __init__ below also sets self.logger, overwriting it. Confirm and drop.
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
    super().__init__(n_algorithm, task_type,
                     metric=metric,
                     meta_algorithm='lightgbm',
                     exclude_datasets=exclude_datasets)
    self.model = None
def __init__(self, rep=3, metric='acc', n_algorithm=3, task_type=None,
             total_resource=1200, exclude_datasets=None, meta_dir=None):
    # Recommender variant fixed to the 'ranknet' meta-algorithm; the trained
    # meta-model is attached later via self.model.
    # NOTE(review): this logger assignment appears redundant — the base-class
    # __init__ below also sets self.logger, overwriting it. Confirm and drop.
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
    super().__init__(n_algorithm, task_type, metric, rep, total_resource,
                     'ranknet', exclude_datasets, meta_dir)
    self.model = None
def __init__(self, eval_func, config_space, seed=1, R=81, eta=3, n_jobs=1,
             output_dir='./'):
    # Multi-fidelity Hyperband optimizer whose candidate configurations are
    # proposed by an external MFBatchAdvisor instead of a local surrogate.
    self.eval_func = eval_func
    self.config_space = config_space
    self.n_workers = n_jobs
    self.trial_cnt = 0
    self.configs = list()
    self.perfs = list()
    self.incumbent_perf = float("-INF")
    self.incumbent_config = self.config_space.get_default_configuration()
    self.incumbent_configs = list()
    self.incumbent_perfs = list()
    self.evaluation_stats = dict()
    self.evaluation_stats['timestamps'] = list()
    self.evaluation_stats['val_scores'] = list()
    self.global_start_time = time.time()
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)

    # Parameters in Hyperband framework.
    self.restart_needed = True
    self.R = R      # maximum resource assignable to one configuration
    self.eta = eta  # elimination ratio between successive rungs
    self.seed = seed
    self.logeta = lambda x: log(x) / log(self.eta)
    self.s_max = int(self.logeta(self.R))
    self.B = (self.s_max + 1) * self.R
    self.s_values = list(reversed(range(self.s_max + 1)))
    self.inner_iter_id = 0

    # Parameters in MFSE-HB.
    self.iterate_r = []
    self.target_x = dict()
    self.target_y = dict()
    self.exp_output = dict()
    # Resource levels form the geometric sequence eta^0 .. eta^s_max; one
    # observation buffer (target_x / target_y) is kept per level.
    for index, item in enumerate(
            np.logspace(0, self.s_max, self.s_max + 1, base=self.eta)):
        r = int(item)
        self.iterate_r.append(r)
        self.target_x[r] = list()
        self.target_y[r] = list()

    self.mf_advisor = MFBatchAdvisor(config_space, output_dir=output_dir)
    self.eval_dict = dict()
def __init__(self, task_type, architectures, time_limit,
             sampling_strategy='uniform', R=27, eta=3, N=9, n_jobs=1):
    """Architecture selector over the registered model families of a task.

    :param task_type: one of IMG_CLS / TEXT_CLS / OBJECT_DET.
    :param architectures: candidate architecture names.
    :param time_limit: overall time budget in seconds.
    :param sampling_strategy: how candidate configs are drawn.
    :param R, eta: Hyperband resource cap and elimination ratio.
    :param N: presumably candidates per round — confirm at call sites.
    :raises ValueError: for an unknown task type.
    """
    self.architectures = architectures
    # FIX: time_limit was assigned twice (also after the imports below);
    # the duplicate assignment has been removed.
    self.time_limit = time_limit
    self.task_type = task_type
    self.n_jobs = n_jobs
    self.R = R
    self.eta = eta
    self.N = N
    self.logeta = lambda x: log(x) / log(self.eta)
    self.s_max = int(self.logeta(self.R))
    self.sampling_strategy = sampling_strategy
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)

    # Local imports avoid a circular dependency on the model registries.
    from solnml.components.models.img_classification import _classifiers as _img_estimators, _addons as _img_addons
    from solnml.components.models.text_classification import _classifiers as _text_estimators, \
        _addons as _text_addons
    from solnml.components.models.object_detection import _classifiers as _od_estimators, _addons as _od_addons

    self.elimination_strategy = 'bandit'
    # Runtime stats.
    self.evaluation_stats = dict()

    self.update_cs = dict()

    # Pick the estimator/addon registries matching the task type.
    if task_type == IMG_CLS:
        self._estimators = _img_estimators
        self._addons = _img_addons
    elif task_type == TEXT_CLS:
        self._estimators = _text_estimators
        self._addons = _text_addons
    elif task_type == OBJECT_DET:
        self._estimators = _od_estimators
        self._addons = _od_addons
    else:
        raise ValueError("Unknown task type %s" % task_type)

    self.eval_hist_configs = dict()
    self.eval_hist_perfs = dict()

    self.tpe_config_gen = dict()
    self.mfse_config_gen = dict()
def __init__(self, scorer=None, data_node=None, task_type=0,
             resampling_strategy='cv', resampling_params=None, seed=1):
    # Evaluator that additionally owns a throwaway BO optimizer instance.
    self.resampling_strategy = resampling_strategy
    self.resampling_params = resampling_params
    # Balanced accuracy is the default scorer when none is supplied.
    self.scorer = scorer if scorer is not None else balanced_accuracy_scorer
    self.data_node = data_node
    self.seed = seed
    self.eval_id = 0
    self.onehot_encoder = None
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
    self.continue_training = False

    # NOTE(review): a dummy evaluator/optimizer pair hard-coded to 'adaboost'
    # with budgets of 1 — presumably only used for its config-space utilities;
    # confirm intended usage of self.tmp_bo.
    tmp_evaluator = ClassificationEvaluator(None)
    self.tmp_bo = AnotherBayesianOptimizationOptimizer(
        task_type, data_node, tmp_evaluator, 'adaboost', 1, 1, 1)
def __init__(self, stats, ensemble_method: str, ensemble_size: int,
             task_type: int, max_epoch: int, metric: _BaseScorer,
             timestamp: float, output_dir=None, device='cpu'):
    """Collect ensemble-building settings; logs under 'EnsembleBuilder'."""
    self.stats = stats
    self.ensemble_method = ensemble_method
    self.ensemble_size = ensemble_size
    self.task_type = task_type
    self.max_epoch = max_epoch
    self.metric = metric
    self.output_dir = output_dir
    self.device = device
    # Fixed seed; the timestamp is kept as a string for use in file names.
    self.seed = 1
    self.timestamp = str(timestamp)
    self.logger = get_logger('EnsembleBuilder')
def calculate_all_metafeatures(X, y, categorical, dataset_name, task_type,
                               calculate=None, dont_calculate=None,
                               densify_threshold=1000):
    """Calculate all registered metafeatures of (X, y) for one dataset.

    Resolves dependencies between metafeatures and helper functions, builds a
    preprocessed (imputed, one-hot-encoded, scaled, shuffled) copy of X lazily
    for the numpy-based metafeatures, and returns a DatasetMetafeatures object.

    :param categorical: per-column booleans marking categorical features.
    :param calculate / dont_calculate: optional include / exclude name filters.
    :param densify_threshold: max size in MB for densifying a sparse result.
    """
    logger = get_logger(__name__)
    """Calculate all metafeatures."""
    # Reset the module-level registries' cached values before a fresh run.
    helper_functions.clear()
    metafeatures.clear()
    mf_ = dict()

    visited = set()
    to_visit = deque()
    to_visit.extend(metafeatures)

    # Lazily-built preprocessed copies of X / y for the numpy metafeatures.
    X_transformed = None
    y_transformed = None

    # Metafeatures that are only defined for classification tasks.
    func_cls = [
        'NumberOfClasses', 'LogNumberOfFeatures', 'ClassProbabilityMin',
        'ClassProbabilityMax', 'ClassProbabilityMean', "ClassProbabilitySTD",
        'ClassEntropy', 'LandmarkLDA', 'LandmarkNaiveBayes',
        'LandmarkDecisionTree', 'LandmarkDecisionNodeLearner',
        'LandmarkRandomNodeLearner', 'LandmarkWorstNodeLearner', 'Landmark1NN'
    ]

    # TODO calculate the numpy metafeatures after all others to consume less
    # memory
    while len(to_visit) > 0:
        name = to_visit.pop()
        # Honour include/exclude filters and skip cls-only features elsewhere.
        if calculate is not None and name not in calculate:
            continue
        if dont_calculate is not None and name in dont_calculate:
            continue
        if name in func_cls and task_type not in CLS_TASKS:
            continue

        if name in npy_metafeatures:
            if X_transformed is None:
                # TODO make sure this is done as efficient as possible (no copy for
                # sparse matrices because of wrong sparse format)
                sparse = scipy.sparse.issparse(X)
                # Impute missing values, one-hot encode categoricals, scale.
                imputer = SimpleImputer(strategy='most_frequent', copy=False)
                X_transformed = imputer.fit_transform(X.copy())
                if any(categorical):
                    categorical_idx = [
                        idx for idx, i in enumerate(categorical) if i
                    ]
                    ohe = ColumnTransformer(
                        [('one-hot', OneHotEncoder(), categorical_idx)],
                        remainder="passthrough")
                    X_transformed = ohe.fit_transform(X_transformed)
                # Mean-centering is only valid on dense matrices.
                center = not scipy.sparse.isspmatrix(X_transformed)
                standard_scaler = StandardScaler(copy=False, with_mean=center)
                X_transformed = standard_scaler.fit_transform(X_transformed)
                categorical_transformed = [False] * X_transformed.shape[1]

                # Densify the transformed matrix
                if not sparse and scipy.sparse.issparse(X_transformed):
                    bytes_per_float = X_transformed.dtype.itemsize
                    num_elements = X_transformed.shape[0] * X_transformed.shape[1]
                    megabytes_required = num_elements * bytes_per_float / 1000 / 1000
                    if megabytes_required < densify_threshold:
                        X_transformed = X_transformed.todense()

                # This is not only important for datasets which are somehow
                # sorted in a strange way, but also prevents lda from failing in
                # some cases.
                # Because this is advanced indexing, a copy of the data is returned!!!
                X_transformed = check_array(X_transformed,
                                            force_all_finite=True,
                                            accept_sparse='csr')
                # Deterministic shuffle (fixed seed) of rows of X and y.
                rs = np.random.RandomState(42)
                indices = np.arange(X_transformed.shape[0])
                rs.shuffle(indices)
                # TODO Shuffle inplace
                X_transformed = X_transformed[indices]
                y_transformed = y[indices]

            X_ = X_transformed
            y_ = y_transformed
            categorical_ = categorical_transformed
        else:
            X_ = X
            y_ = y
            categorical_ = categorical

        # Resolve the (single) dependency of this metafeature, if any:
        # uncalculated metafeature deps requeue this name; helper deps are
        # computed inline and recorded in mf_.
        dependency = metafeatures.get_dependency(name)
        if dependency is not None:
            is_metafeature = dependency in metafeatures
            is_helper_function = dependency in helper_functions

            if is_metafeature and is_helper_function:
                raise NotImplementedError()
            elif not is_metafeature and not is_helper_function:
                raise ValueError(dependency)
            elif is_metafeature and not metafeatures.is_calculated(dependency):
                to_visit.appendleft(name)
                continue
            elif is_helper_function and not helper_functions.is_calculated(
                    dependency):
                logger.debug("%s: Going to calculate: %s", dataset_name,
                             dependency)
                value = helper_functions[dependency](X_, y_, categorical_)
                helper_functions.set_value(dependency, value)
                mf_[dependency] = value

        logger.debug("%s: Going to calculate: %s", dataset_name, name)
        value = metafeatures[name](X_, y_, categorical_)
        metafeatures.set_value(name, value)
        mf_[name] = value
        visited.add(name)

    mf_ = DatasetMetafeatures(dataset_name, mf_, task_type=task_type)
    return mf_
def __init__(self, stats, ensemble_method: str, ensemble_size: int,
             task_type: int, metric: _BaseScorer, data_node, output_dir=None):
    # Ensemble builder that reloads each saved (op_list, model) pair from
    # disk and collects its validation predictions on a fixed holdout split.
    self.stats = stats
    self.ensemble_method = ensemble_method
    self.ensemble_size = ensemble_size
    self.task_type = task_type
    self.metric = metric
    self.output_dir = output_dir
    self.node = data_node
    self.predictions = []
    self.train_labels = None
    self.timestamp = str(time.time())
    logger_name = 'EnsembleBuilder'
    self.logger = get_logger(logger_name)

    for algo_id in self.stats.keys():
        model_to_eval = self.stats[algo_id]
        for idx, (_, _, path) in enumerate(model_to_eval):
            # Each record pickles (op_list, model, _); the op_list rebuilds
            # the feature pipeline on a fresh copy of the data node.
            with open(path, 'rb') as f:
                op_list, model, _ = pkl.load(f)
            _node = self.node.copy_()
            _node = construct_node(_node, op_list)

            # TODO: Test size
            test_size = 0.33
            X, y = _node.data
            # Fixed random_state=1 so every model sees the same holdout rows.
            if self.task_type in CLS_TASKS:
                ss = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                            random_state=1)
            else:
                ss = ShuffleSplit(n_splits=1, test_size=test_size,
                                  random_state=1)
            for train_index, val_index in ss.split(X, y):
                X_valid = X[val_index]
                y_valid = y[val_index]
                # All models must agree on the holdout labels.
                if self.train_labels is not None:
                    assert (self.train_labels == y_valid).all()
                else:
                    self.train_labels = y_valid
                if self.task_type in CLS_TASKS:
                    y_valid_pred = model.predict_proba(X_valid)
                else:
                    y_valid_pred = model.predict(X_valid)
                self.predictions.append(y_valid_pred)

    # Never request a bigger ensemble than there are member predictions.
    if len(self.predictions) < self.ensemble_size:
        self.ensemble_size = len(self.predictions)

    if ensemble_method == 'ensemble_selection':
        return

    # NOTE(review): y_valid here is whatever the LAST loop iteration left
    # behind (all iterations produce identical labels per the assert above) —
    # fragile if self.stats is empty; confirm callers guarantee non-empty stats.
    if task_type in CLS_TASKS:
        self.base_model_mask = choose_base_models_classification(
            np.array(self.predictions), self.ensemble_size)
    else:
        self.base_model_mask = choose_base_models_regression(
            np.array(self.predictions), np.array(y_valid), self.ensemble_size)
    self.ensemble_size = sum(self.base_model_mask)
def __init__(self, n_algorithm=3, task_type=None, metric='bal_acc', rep=3,
             total_resource=1200, meta_algorithm='lightgbm',
             exclude_datasets=None, meta_dir=None):
    """Meta-learning recommender over built-in cls/rgs algorithm sets.

    :param n_algorithm: number of algorithms to recommend.
    :param task_type: task constant in CLS_TASKS or RGS_TASKS.
    :param metric: 'acc'/'bal_acc' for classification, 'mse' for regression;
        unsupported values fall back with a log message.
    :param exclude_datasets: dataset names to drop from the meta corpus.
    :param meta_dir: meta-resource root; defaults to the bundled resource dir.
    :raises ValueError: for a task type outside CLS_TASKS / RGS_TASKS.
    """
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
    self.n_algorithm = n_algorithm
    self.n_algo_candidates = len(_cls_builtin_algorithms)
    self.task_type = task_type
    self.meta_algo = meta_algorithm
    self.rep = rep
    self.metric = metric

    # Pick the algorithm registry for the task and sanity-check the metric.
    if task_type in CLS_TASKS:
        self.algorithms = _cls_builtin_algorithms
        self.n_algo_candidates = len(_cls_builtin_algorithms)
        if metric not in ['acc', 'bal_acc']:
            self.logger.info(
                'Meta information about metric-%s does not exist, use accuracy instead.' % str(metric))
            metric = 'acc'
    elif task_type in RGS_TASKS:
        self.algorithms = _rgs_builtin_algorithms
        self.n_algo_candidates = len(_rgs_builtin_algorithms)
        if metric not in ['mse']:
            # FIX: the fallback here is 'mse', not accuracy — the old message
            # said "use accuracy instead".
            self.logger.info(
                'Meta information about metric-%s does not exist, use mse instead.' % str(metric))
            metric = 'mse'
    else:
        # FIX: this branch rejects the task type, not the metric.
        raise ValueError('Invalid task type: %s.' % str(task_type))

    self.total_resource = total_resource
    self.exclude_datasets = exclude_datasets

    builtin_loc = os.path.dirname(__file__)
    builtin_loc = os.path.join(builtin_loc, '..')
    builtin_loc = os.path.join(builtin_loc, 'meta_resource')
    self.meta_dir = meta_dir if meta_dir is not None else builtin_loc

    # Hash the sorted exclusion list so caches keyed on it are stable.
    if self.exclude_datasets is None:
        self.hash_id = 'none'
    else:
        self.exclude_datasets = list(set(exclude_datasets))
        exclude_str = ','.join(sorted(self.exclude_datasets))
        md5 = hashlib.md5()
        md5.update(exclude_str.encode('utf-8'))
        self.hash_id = md5.hexdigest()

    # Load the task ids available in the bundled dataset embeddings.
    meta_datasets = set()
    _folder = os.path.join(self.meta_dir, 'meta_dataset_vec')
    if task_type in CLS_TASKS:
        task_prefix = 'cls'
    else:
        task_prefix = 'rgs'
    embedding_path = os.path.join(
        _folder, '%s_meta_dataset_embedding.pkl' % task_prefix)
    with open(embedding_path, 'rb') as f:
        d = pkl.load(f)
        meta_datasets = d['task_ids']

    self._builtin_datasets = sorted(list(meta_datasets))

    self.metadata_manager = MetaDataManager(self.meta_dir,
                                            self.algorithms,
                                            self._builtin_datasets,
                                            metric,
                                            total_resource,
                                            task_type=task_type,
                                            rep=rep)
    self.meta_learner = None
def __init__(self, node_list, node_index, task_type, timestamp,
             fe_config_space: ConfigurationSpace,
             cash_config_space: ConfigurationSpace, data: DataNode,
             fixed_config=None, trial_num=0, time_limit=None, metric='acc',
             ensemble_method='ensemble_selection', ensemble_size=50,
             per_run_time_limit=300, output_dir="logs",
             dataset_name='default_dataset', eval_type='holdout',
             resampling_params=None, n_jobs=1, seed=1):
    # One node of the optimization tree: holds its position in node_list,
    # both FE and CASH config spaces, and the run's budget/ensemble settings.
    # Tree setting
    self.node_list = node_list
    self.node_index = node_index

    # Set up backend.
    self.dataset_name = dataset_name
    self.trial_num = trial_num
    self.time_limit = time_limit
    self.per_run_time_limit = per_run_time_limit
    self.start_time = time.time()
    self.logger = get_logger('Soln-ml: %s' % dataset_name)

    # Basic settings.
    self.eval_type = eval_type
    self.resampling_params = resampling_params
    self.task_type = task_type
    self.timestamp = timestamp
    self.fe_config_space = fe_config_space
    self.cash_config_space = cash_config_space
    self.fixed_config = fixed_config
    # Copy so this node never mutates the caller's data.
    self.original_data = data.copy_()
    self.metric = get_metric(metric)
    self.ensemble_method = ensemble_method
    self.ensemble_size = ensemble_size
    self.n_jobs = n_jobs
    self.seed = seed
    self.output_dir = output_dir
    self.early_stop_flag = False
    self.timeout_flag = False
    self.incumbent_perf = -float("INF")
    self.incumbent = None
    self.eval_dict = dict()

    # Imbalance detection only applies to classification tasks.
    if self.task_type in CLS_TASKS:
        self.if_imbal = is_imbalanced_dataset(self.original_data)
    else:
        self.if_imbal = False

    self.es = None
def __init__(self, eval_func, config_space, config_generator='tpe', seed=1,
             R=27, eta=3, n_jobs=1):
    # BOHB-style optimizer: Hyperband bracket bookkeeping plus a TPE config
    # generator and a random-forest surrogate with EI acquisition.
    self.eval_func = eval_func
    self.config_space = config_space
    self.config_generator = config_generator
    self.n_workers = n_jobs
    self.trial_cnt = 0
    self.configs = list()
    self.perfs = list()
    self.incumbent_perf = float("-INF")
    self.incumbent_config = self.config_space.get_default_configuration()
    self.incumbent_configs = list()
    self.incumbent_perfs = list()
    self.global_start_time = time.time()
    self.time_ticks = list()
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)

    # Parameters in Hyperband framework.
    self.restart_needed = True
    self.R = R      # maximum resource assignable to one configuration
    self.eta = eta  # elimination ratio between successive rungs
    self.seed = seed
    self.logeta = lambda x: log(x) / log(self.eta)
    self.s_max = int(self.logeta(self.R))
    self.B = (self.s_max + 1) * self.R
    self.s_values = list(reversed(range(self.s_max + 1)))
    self.inner_iter_id = 0

    # Parameters in BOHB.
    self.iterate_r = list()
    self.target_x = dict()
    self.target_y = dict()
    self.exp_output = dict()
    # Resource levels form the geometric sequence eta^0 .. eta^s_max; one
    # observation buffer (target_x / target_y) is kept per level.
    for index, item in enumerate(
            np.logspace(0, self.s_max, self.s_max + 1, base=self.eta)):
        r = int(item)
        self.iterate_r.append(r)
        self.target_x[r] = list()
        self.target_y[r] = list()

    types, bounds = get_types(self.config_space)
    self.num_config = len(bounds)
    self.surrogate = RandomForestWithInstances(types, bounds)
    # self.executor = ParallelEvaluator(self.eval_func, n_worker=n_jobs)
    # self.executor = ParallelProcessEvaluator(self.eval_func, n_worker=n_jobs)
    self.acquisition_func = EI(model=self.surrogate)
    self.acq_optimizer = RandomSampling(self.acquisition_func,
                                        self.config_space,
                                        n_samples=2000,
                                        rng=np.random.RandomState(seed))
    self.config_gen = TPE(config_space)
    self.eval_dict = dict()
def __init__(self):
    """Attach a logger named after this module to the instance."""
    module_logger = get_logger(__name__)
    self.logger = module_logger
def __init__(self, task_type, trial_num, classifier_ids: List[str],
             data: DataNode, metric='acc',
             ensemble_method='ensemble_selection', ensemble_size=10,
             per_run_time_limit=300, output_dir=None,
             dataset_name='default_dataset', eval_type='holdout',
             share_feature=False, inner_opt_algorithm='rb', fe_algo='bo',
             time_limit=None, n_jobs=1, seed=1):
    """
    First-layer bandit: one arm (a SecondLayerBandit) per candidate algorithm.

    :param classifier_ids: subset of {'adaboost','bernoulli_nb','decision_tree','extra_trees','gaussian_nb','gradient_boosting',
    'gradient_boosting','k_nearest_neighbors','lda','liblinear_svc','libsvm_svc','multinomial_nb','passive_aggressive','qda',
    'random_forest','sgd'}
    """
    self.timestamp = time.time()
    self.task_type = task_type
    self.metric = get_metric(metric)
    # Copy so the bandit never mutates the caller's data.
    self.original_data = data.copy_()
    self.ensemble_method = ensemble_method
    self.ensemble_size = ensemble_size
    self.trial_num = trial_num
    self.n_jobs = n_jobs
    # Bandit hyperparameters; semantics defined by the pull strategy —
    # confirm against the selection code.
    self.alpha = 4
    self.B = 0.01
    self.seed = seed
    self.shared_mode = share_feature
    self.output_dir = output_dir
    np.random.seed(self.seed)

    # Best configuration.
    self.optimal_algo_id = None
    self.nbest_algo_ids = None
    self.best_lower_bounds = None
    self.es = None

    # Set up backend.
    self.dataset_name = dataset_name
    self.time_limit = time_limit
    self.start_time = time.time()
    self.logger = get_logger('Soln-ml: %s' % dataset_name)

    # Bandit settings.
    self.incumbent_perf = -1.
    self.arms = classifier_ids
    self.include_algorithms = classifier_ids
    self.rewards = dict()
    self.sub_bandits = dict()
    self.evaluation_cost = dict()
    self.fe_datanodes = dict()
    self.eval_type = eval_type
    self.fe_algo = fe_algo
    self.inner_opt_algorithm = inner_opt_algorithm
    # One sub-bandit (per-algorithm optimizer) and reward/cost history per arm.
    for arm in self.arms:
        self.rewards[arm] = list()
        self.evaluation_cost[arm] = list()
        self.fe_datanodes[arm] = list()
        self.sub_bandits[arm] = SecondLayerBandit(
            self.task_type, arm, self.original_data,
            metric=self.metric,
            output_dir=output_dir,
            per_run_time_limit=per_run_time_limit,
            share_fe=self.shared_mode,
            seed=self.seed,
            eval_type=eval_type,
            dataset_id=dataset_name,
            n_jobs=self.n_jobs,
            fe_algo=fe_algo,
            mth=inner_opt_algorithm,
        )

    self.action_sequence = list()
    self.final_rewards = list()
    self.start_time = time.time()
    self.time_records = list()
def __init__(self, configspace, min_points_in_model=None, top_n_percent=15,
             num_samples=64, random_fraction=1 / 3, bandwidth_factor=3,
             min_bandwidth=1e-3, **kwargs):
    """
    Fits for each given budget a kernel density estimator on the best N
    percent of the evaluated configurations on this budget.

    Parameters:
    -----------
    configspace: ConfigSpace
        Configuration space object
    top_n_percent: int
        Determines the percentile of configurations that will be used as
        training data for the kernel density estimator, e.g if set to 10 the
        10% best configurations will be considered for training.
    min_points_in_model: int
        minimum number of datapoints needed to fit a model
    num_samples: int
        number of samples drawn to optimize EI via sampling
    random_fraction: float
        fraction of random configurations returned
    bandwidth_factor: float
        widens the bandwidth for continuous parameters for proposed points to
        optimize EI
    min_bandwidth: float
        to keep diversity, even when all (good) samples have the same value
        for one of the parameters, a minimum bandwidth (Default: 1e-3) is
        used instead of zero.
    """
    super().__init__(**kwargs)
    self.top_n_percent = top_n_percent
    self.configspace = configspace
    self.bw_factor = bandwidth_factor
    self.min_bandwidth = min_bandwidth

    # Enforce the floor of (number of hyperparameters + 1) training points.
    self.min_points_in_model = min_points_in_model
    if min_points_in_model is None:
        self.min_points_in_model = len(
            self.configspace.get_hyperparameters()) + 1
    if self.min_points_in_model < len(
            self.configspace.get_hyperparameters()) + 1:
        self.min_points_in_model = len(
            self.configspace.get_hyperparameters()) + 1

    self.num_samples = num_samples
    self.random_fraction = random_fraction

    hps = self.configspace.get_hyperparameters()

    # Build per-hyperparameter type codes: 'u' for parameters with choices
    # (categorical; vartype stores the cardinality), 'c' otherwise
    # (vartype 0). Ordinal parameters (those with a 'sequence') are rejected.
    self.kde_vartypes = ""
    self.vartypes = []
    for h in hps:
        if hasattr(h, 'sequence'):
            raise RuntimeError(
                'This version on BOHB does not support ordinal hyperparameters. Please encode %s as an integer parameter!'
                % (h.name))
        if hasattr(h, 'choices'):
            self.kde_vartypes += 'u'
            self.vartypes += [len(h.choices)]
        else:
            self.kde_vartypes += 'c'
            self.vartypes += [0]

    self.vartypes = np.array(self.vartypes, dtype=int)

    # store precomputed probs for the categorical parameters
    self.cat_probs = []

    # Per-budget bookkeeping of configs, losses, rankings and KDE models.
    self.configs = dict()
    self.losses = dict()
    self.good_config_rankings = dict()
    self.kde_models = dict()
    self.logger = get_logger(self.__module__ + "." + self.__class__.__name__)
def __init__(self, stats, ensemble_method: str, ensemble_size: int,
             task_type: int, metric: _BaseScorer, base_save=False,
             output_dir=None):
    # Ensemble builder that refits each (node, config) candidate on a holdout
    # split and collects its validation predictions; optionally pickles the
    # fitted base models for later ensemble selection.
    self.stats = stats
    self.ensemble_method = ensemble_method
    self.ensemble_size = ensemble_size
    self.task_type = task_type
    self.metric = metric
    self.output_dir = output_dir
    self.train_predictions = []
    self.config_list = []
    self.train_data_dict = {}
    self.train_labels = None
    # Reuse the split seed recorded in stats so splits match earlier runs.
    self.seed = self.stats['split_seed']
    self.timestamp = str(time.time())
    logger_name = 'EnsembleBuilder'
    self.logger = get_logger(logger_name)

    model_cnt = 0
    for algo_id in self.stats["include_algorithms"]:
        model_to_eval = self.stats[algo_id]['model_to_eval']
        for idx, (node, config) in enumerate(model_to_eval):
            X, y = node.data
            # TODO: Hyperparameter
            test_size = 0.33
            # Same random_state for every model so the holdout rows agree.
            if self.task_type in CLS_TASKS:
                ss = StratifiedShuffleSplit(n_splits=1, test_size=test_size,
                                            random_state=self.seed)
            else:
                ss = ShuffleSplit(n_splits=1, test_size=test_size,
                                  random_state=self.seed)
            for train_index, test_index in ss.split(X, y):
                X_train, X_valid = X[train_index], X[test_index]
                y_train, y_valid = y[train_index], y[test_index]
                # All models must agree on the holdout labels.
                if self.train_labels is not None:
                    assert (self.train_labels == y_valid).all()
                else:
                    self.train_labels = y_valid
                estimator = fetch_predict_estimator(
                    self.task_type, config, X_train, y_train,
                    weight_balance=node.enable_balance,
                    data_balance=node.data_balance)
                if base_save:  # For ensemble selection
                    with open(
                            os.path.join(
                                self.output_dir, '%s-model%d' %
                                (self.timestamp, model_cnt)), 'wb') as f:
                        pkl.dump(estimator, f)
                if self.task_type in CLS_TASKS:
                    y_valid_pred = estimator.predict_proba(X_valid)
                else:
                    y_valid_pred = estimator.predict(X_valid)
                self.train_predictions.append(y_valid_pred)
            model_cnt += 1

    # Never request a bigger ensemble than there are member predictions.
    if len(self.train_predictions) < self.ensemble_size:
        self.ensemble_size = len(self.train_predictions)

    if ensemble_method == 'ensemble_selection':
        return

    # NOTE(review): y_valid here is whatever the LAST loop iteration left
    # behind (identical labels per the assert above) — fragile if stats lists
    # no models; confirm callers guarantee at least one candidate.
    if task_type in CLS_TASKS:
        self.base_model_mask = choose_base_models_classification(
            np.array(self.train_predictions), self.ensemble_size)
    else:
        self.base_model_mask = choose_base_models_regression(
            np.array(self.train_predictions), np.array(y_valid),
            self.ensemble_size)
    self.ensemble_size = sum(self.base_model_mask)
def __init__(self, task_type, trial_num, classifier_ids: List[str],
             data: DataNode, include_preprocessors=None, time_limit=None,
             metric='acc', ensemble_method='ensemble_selection',
             ensemble_size=50, per_run_time_limit=300, output_dir="logs",
             dataset_name='default_dataset', eval_type='holdout',
             inner_opt_algorithm='fixed', enable_fe=True, fe_algo='bo',
             n_jobs=1, seed=1):
    """
    First-layer bandit (preprocessor-aware variant): one arm per algorithm.
    Exactly one of time_limit / trial_num must be set.

    :param classifier_ids: subset of {'adaboost','bernoulli_nb','decision_tree','extra_trees','gaussian_nb','gradient_boosting',
    'gradient_boosting','k_nearest_neighbors','lda','liblinear_svc','libsvm_svc','multinomial_nb','passive_aggressive','qda',
    'random_forest','sgd'}
    """
    self.timestamp = time.time()
    self.task_type = task_type
    self.include_preprocessors = include_preprocessors
    self.metric = get_metric(metric)
    # Copy so the bandit never mutates the caller's data.
    self.original_data = data.copy_()
    self.ensemble_method = ensemble_method
    self.ensemble_size = ensemble_size
    self.trial_num = trial_num
    self.n_jobs = n_jobs
    # Bandit hyperparameter; semantics defined by the pull strategy —
    # confirm against the selection code.
    self.alpha = 4
    self.seed = seed
    self.output_dir = output_dir
    self.early_stop_flag = False
    # np.random.seed(self.seed)

    # Best configuration.
    self.optimal_algo_id = None
    self.nbest_algo_ids = None
    self.best_lower_bounds = None
    self.es = None

    # Set up backend.
    self.dataset_name = dataset_name
    self.time_limit = time_limit
    self.start_time = time.time()
    self.logger = get_logger('Soln-ml: %s' % dataset_name)

    # Bandit settings.
    self.incumbent_perf = -float("INF")
    self.arms = classifier_ids
    self.include_algorithms = classifier_ids
    self.rewards = dict()
    self.sub_bandits = dict()
    self.evaluation_cost = dict()
    self.eval_type = eval_type
    self.enable_fe = enable_fe
    self.fe_algo = fe_algo
    self.inner_opt_algorithm = inner_opt_algorithm

    # Record the execution cost for each arm.
    # Exactly one stopping criterion must be supplied (XOR check).
    if not (self.time_limit is None) ^ (self.trial_num is None):
        raise ValueError('Please set one of time_limit or trial_num.')
    self.arm_cost_stats = dict()
    for _arm in self.arms:
        self.arm_cost_stats[_arm] = list()

    # One sub-bandit (per-algorithm optimizer) and reward/cost history per arm.
    for arm in self.arms:
        self.rewards[arm] = list()
        self.evaluation_cost[arm] = list()
        self.sub_bandits[arm] = SecondLayerBandit(
            self.task_type, arm, self.original_data,
            include_preprocessors=self.include_preprocessors,
            metric=self.metric,
            output_dir=output_dir,
            per_run_time_limit=per_run_time_limit,
            seed=self.seed,
            eval_type=eval_type,
            dataset_id=dataset_name,
            n_jobs=self.n_jobs,
            fe_algo=fe_algo,
            mth=self.inner_opt_algorithm,
            timestamp=self.timestamp)

    self.action_sequence = list()
    self.final_rewards = list()
    self.start_time = time.time()
    self.time_records = list()