def main_cli(self):
    '''main function of SMAC for CLI interface'''
    cmd_reader = CMDReader()
    args_, misc_args = cmd_reader.read_cmd()

    logging.basicConfig(level=args_.verbose_level)
    root_logger = logging.getLogger()
    root_logger.setLevel(args_.verbose_level)

    scen = Scenario(args_.scenario_file, misc_args)

    # construct the optimizer outside the try block so that `smbo` is
    # guaranteed to be bound when the finally clause runs
    smbo = SMBO(scenario=scen, rng=np.random.RandomState(args_.seed))
    try:
        smbo.run(max_iters=args_.max_iterations)
    finally:
        smbo.stats.print_stats()
        self.logger.info("Final Incumbent: %s" % (smbo.incumbent))
        smbo.runhistory.save_json(
            fn=os.path.join(scen.output_dir, "runhistory.json"))
def test_get_next_by_local_search(self, patch):
    # Without known incumbent
    class SideEffect(object):
        def __init__(self):
            self.call_number = 0

        def __call__(self, *args, **kwargs):
            rval = 9 - self.call_number
            self.call_number += 1
            return (ConfigurationMock(rval), [[rval]])

    patch.side_effect = SideEffect()
    smbo = SMBO(self.scenario, 1)
    rval = smbo._get_next_by_local_search(num_points=9)
    self.assertEqual(len(rval), 9)
    self.assertEqual(patch.call_count, 9)
    for i in range(9):
        self.assertIsInstance(rval[i][1], ConfigurationMock)
        self.assertEqual(rval[i][1].value, 9 - i)
        self.assertEqual(rval[i][0], 9 - i)
        self.assertEqual(rval[i][1].origin, 'Local Search')

    # With known incumbent
    patch.side_effect = SideEffect()
    smbo.incumbent = 'Incumbent'
    rval = smbo._get_next_by_local_search(num_points=10)
    self.assertEqual(len(rval), 10)
    self.assertEqual(patch.call_count, 19)
    # Only the first local search in each iteration starts from the
    # incumbent
    self.assertEqual(patch.call_args_list[9][0][0], 'Incumbent')
    for i in range(10):
        self.assertEqual(rval[i][1].origin, 'Local Search')
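# A minimal stand-in for the ConfigurationMock helper these tests rely on --
# a sketch inferred from how the tests use it (the real test fixture may
# carry more behaviour):
class ConfigurationMock(object):
    def __init__(self, value=None):
        self.value = value
        self.origin = None  # set by SMBO, e.g. 'Local Search', 'Random Search'

    def get_array(self):
        # the sorted random search feeds these arrays to the acquisition
        # function, one row per configuration
        return [self.value]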
def test_choose_next(self):
    seed = 42
    smbo = SMBO(self.scenario, seed)
    smbo.runhistory = RunHistory()
    X = self.scenario.cs.sample_configuration().get_array()[None, :]
    Y = self.branin(X)
    x = smbo.choose_next(X, Y)[0].get_array()
    assert x.shape == (2,)
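# A sketch of the Branin benchmark this test evaluates, using the standard
# constants; the helper on the test class may additionally rescale X from the
# unit cube into the usual domain x1 in [-5, 10], x2 in [0, 15]:
def branin(X):
    x1, x2 = X[:, 0], X[:, 1]
    a, b, c = 1.0, 5.1 / (4 * np.pi ** 2), 5.0 / np.pi
    r, s, t = 6.0, 10.0, 1.0 / (8 * np.pi)
    y = a * (x2 - b * x1 ** 2 + c * x1 - r) ** 2 \
        + s * (1 - t) * np.cos(x1) + s
    return y.reshape((-1, 1))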
def test_get_next_by_random_search(self, patch):
    def side_effect(size):
        return [ConfigurationMock()] * size

    patch.side_effect = side_effect
    smbo = SMBO(self.scenario, 1)
    rval = smbo._get_next_by_random_search(10, False)
    self.assertEqual(len(rval), 10)
    for i in range(10):
        self.assertIsInstance(rval[i][1], ConfigurationMock)
        self.assertEqual(rval[i][1].origin, 'Random Search')
        self.assertEqual(rval[i][0], 0)
def test_init_only_scenario_runtime(self):
    self.scenario.run_obj = 'runtime'
    self.scenario.cutoff = 300
    smbo = SMBO(self.scenario)
    self.assertIsInstance(smbo.model, RandomForestWithInstances)
    np.testing.assert_allclose(smbo.types, smbo.model.types)
    self.assertIsInstance(smbo.rh2EPM, RunHistory2EPM4LogCost)
    self.assertIsInstance(smbo.acquisition_func, EI)
def test_rng(self):
    smbo = SMBO(self.scenario, rng=None)
    self.assertIsInstance(smbo.rng, np.random.RandomState)
    self.assertIsInstance(smbo.num_run, int)

    smbo = SMBO(self.scenario, rng=1)
    rng = np.random.RandomState(1)
    self.assertEqual(smbo.num_run, 1)
    self.assertIsInstance(smbo.rng, np.random.RandomState)

    smbo = SMBO(self.scenario, rng=rng)
    self.assertIsInstance(smbo.num_run, int)
    self.assertIs(smbo.rng, rng)

    # ML: I don't understand the following line and it throws an error
    self.assertRaisesRegexp(
        TypeError,
        "Unknown type <(class|type) 'str'> for argument rng. "
        "Only accepts None, int or np.random.RandomState",
        SMBO, self.scenario, rng='BLA')
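# A sketch of the rng-dispatch logic these assertions exercise; the actual
# SMBO implementation may differ in details such as the randint bound:
def _get_rng(rng=None):
    if rng is None:
        num_run = np.random.randint(1234567980)
        rng = np.random.RandomState(seed=num_run)
    elif isinstance(rng, int):
        num_run = rng
        rng = np.random.RandomState(seed=rng)
    elif isinstance(rng, np.random.RandomState):
        num_run = rng.randint(1234567980)
    else:
        raise TypeError("Unknown type %s for argument rng. Only accepts "
                        "None, int or np.random.RandomState" %
                        str(type(rng)))
    return num_run, rng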
def test_eips(self):
    scenario = Scenario({'cs': test_helpers.get_branin_config_space(),
                         'run_obj': 'quality',
                         'deterministic': True})
    types = get_types(scenario.cs, None)
    umrfwi = UncorrelatedMultiObjectiveRandomForestWithInstances(
        ['cost', 'runtime'], types)
    eips = EIPS(umrfwi)
    rh2EPM = RunHistory2EPM4EIPS(scenario, 2)
    taf = ExecuteTAFunc(test_function)
    smbo = SMBO(scenario, model=umrfwi, acquisition_function=eips,
                runhistory2epm=rh2EPM, tae_runner=taf)
    smbo.run(5)
    print(smbo.incumbent)
    # force the test to fail so the printed incumbent shows up in the output
    raise ValueError()
def test_get_next_by_random_search_sorted(self, patch_sample, patch_ei,
                                          patch_impute):
    values = (10, 1, 9, 2, 8, 3, 7, 4, 6, 5)
    patch_sample.return_value = [ConfigurationMock(i) for i in values]
    patch_ei.return_value = np.array([[_] for _ in values], dtype=float)
    patch_impute.side_effect = lambda x: x
    smbo = SMBO(self.scenario, 1)
    rval = smbo._get_next_by_random_search(10, True)
    self.assertEqual(len(rval), 10)
    for i in range(10):
        self.assertIsInstance(rval[i][1], ConfigurationMock)
        self.assertEqual(rval[i][1].value, 10 - i)
        self.assertEqual(rval[i][0], 10 - i)
        self.assertEqual(rval[i][1].origin, 'Random Search (sorted)')

    # Check that config.get_array works as desired and imputation is used
    # in between
    np.testing.assert_allclose(
        patch_ei.call_args[0][0],
        np.array(values, dtype=float).reshape((-1, 1)))
def get_tuned_config(self, scenario: ASlibScenario):
    '''
        uses SMAC3 to determine a well-performing configuration in the
        configuration space self.cs on the given scenario

        Arguments
        ---------
        scenario: ASlibScenario
            ASlib scenario at hand

        Returns
        -------
        Configuration
            best incumbent configuration found by SMAC
    '''
    taf = ExecuteTAFunc(functools.partial(self.run_cv, scenario=scenario))

    ac_scenario = Scenario({"run_obj": "quality",  # we optimize quality
                            "runcount-limit": 10,  # at most 10 function evaluations
                            "cs": self.cs,  # configuration space
                            "deterministic": "true"
                            })

    # necessary to use stats options related to scenario information
    AC_Stats.scenario = ac_scenario

    # Optimize
    self.logger.info(
        ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    self.logger.info("Start Configuration")
    self.logger.info(
        ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>")
    smbo = SMBO(scenario=ac_scenario, tae_runner=taf,
                rng=np.random.RandomState(42))
    smbo.run(max_iters=999)

    AC_Stats.print_stats()
    self.logger.info("Final Incumbent: %s" % (smbo.incumbent))

    return smbo.incumbent
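# Hedged usage sketch for get_tuned_config -- assumes a surrounding selector
# object exposing the method above and the aslib_scenario package; `selector`
# and the data path are illustrative:
from aslib_scenario.aslib_scenario import ASlibScenario

aslib_scenario = ASlibScenario()
aslib_scenario.read_scenario("aslib_data/SAT11-INDU")  # hypothetical path
incumbent = selector.get_tuned_config(aslib_scenario)
print("Tuned configuration: %s" % incumbent)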
def test_choose_next_2(self):
    def side_effect(X, derivative):
        return np.mean(X, axis=1).reshape((-1, 1))

    smbo = SMBO(self.scenario, 1)
    smbo.runhistory = RunHistory()
    smbo.model = mock.MagicMock()
    smbo.acquisition_func._compute = mock.MagicMock()
    smbo.acquisition_func._compute.side_effect = side_effect
    # local search would call the underlying local search maximizer,
    # which would have to be mocked out. Replacing the method by random
    # search is way easier!
    smbo._get_next_by_local_search = smbo._get_next_by_random_search

    X = smbo.rng.rand(10, 2)
    Y = smbo.rng.rand(10, 1)
    x = smbo.choose_next(X, Y)

    self.assertEqual(smbo.model.train.call_count, 1)
    self.assertEqual(smbo.acquisition_func._compute.call_count, 1)
    self.assertEqual(len(x), 2020)
    num_random_search = 0
    for i in range(0, 2020, 2):
        self.assertIsInstance(x[i], Configuration)
        if x[i].origin == 'Random Search':
            num_random_search += 1
    # Since we replace local search with random search, we have to count
    # the occurrences of random search instead
    self.assertEqual(num_random_search, 10)
    for i in range(1, 2020, 2):
        self.assertIsInstance(x[i], Configuration)
        self.assertEqual(x[i].origin, 'Random Search')
def test_init_EIPS_as_arguments(self):
    for objective in ['runtime', 'quality']:
        self.scenario.run_obj = objective
        types = get_types(self.scenario.cs, None)
        umrfwi = UncorrelatedMultiObjectiveRandomForestWithInstances(
            ['cost', 'runtime'], types)
        eips = EIPS(umrfwi)
        rh2EPM = RunHistory2EPM4EIPS(self.scenario, 2)
        smbo = SMBO(self.scenario, model=umrfwi,
                    acquisition_function=eips, runhistory2epm=rh2EPM)
        self.assertIs(umrfwi, smbo.model)
        self.assertIs(eips, smbo.acquisition_func)
        self.assertIs(rh2EPM, smbo.rh2EPM)
def run_smbo(self, max_iters=1000):
    global evaluator

    # == first things first: load the datamanager
    self.reset_data_manager()

    # == Initialize SMBO stuff
    # first create a scenario
    seed = self.seed  # TODO
    num_params = len(self.config_space.get_hyperparameters())
    # allocate a run history
    run_history = RunHistory()
    meta_runhistory = RunHistory()
    meta_runs_dataset_indices = {}
    num_run = self.start_num_run
    instance_id = self.dataset_name + SENTINEL

    # == Train on subset
    #    before doing anything, let us run the default_cfg
    #    on a subset of the available data to ensure that
    #    we at least have some models
    #    we will try three different ratios of decreasing magnitude
    #    in the hope that at least on the last one we will be able
    #    to get a model
    n_data = self.datamanager.data['X_train'].shape[0]
    subset_ratio = 10000. / n_data
    if subset_ratio >= 0.5:
        subset_ratio = 0.33
        subset_ratios = [subset_ratio, subset_ratio * 0.10]
    else:
        subset_ratios = [subset_ratio, 500. / n_data]
    self.logger.info("Training default configurations on a subset of "
                     "%d/%d data points." %
                     (int(n_data * subset_ratio), n_data))

    # the time limit for these function evaluations is rigorously
    # set to only 1/2 of a full function evaluation
    subset_time_limit = max(5, int(self.func_eval_time_limit / 2))
    # the configs we want to run on the data subset are:
    # 1) the default configs
    # 2) a set of configs we selected for training on a subset
    subset_configs = [self.config_space.get_default_configuration()] \
        + self.collect_additional_subset_defaults()
    subset_config_succesful = [False] * len(subset_configs)
    for subset_config_id, next_config in enumerate(subset_configs):
        for i, ratio in enumerate(subset_ratios):
            self.reset_data_manager()
            n_data_subsample = int(n_data * ratio)

            # run the config, but throw away the result afterwards
            # since this cfg was evaluated only on a subset
            # and we don't want to confuse SMAC
            self.logger.info("Starting to evaluate %d on SUBSET "
                             "with size %d and time limit %ds.",
                             num_run, n_data_subsample, subset_time_limit)
            self.logger.info(next_config)
            _info = eval_with_limits(self.datamanager, self.tmp_dir,
                                     next_config, seed, num_run,
                                     self.resampling_strategy,
                                     self.resampling_strategy_args,
                                     self.memory_limit,
                                     subset_time_limit, n_data_subsample)
            (duration, result, _, additional_run_info, status) = _info
            self.logger.info("Finished evaluating %d. configuration on "
                             "SUBSET. Duration %f; loss %f; status %s; "
                             "additional run info: %s ", num_run, duration,
                             result, str(status), additional_run_info)
            num_run += 1
            if i < len(subset_ratios) - 1:
                if status != StatusType.SUCCESS:
                    # Do not increase num_run here, because we will try
                    # the same configuration with less data
                    self.logger.info("A CONFIG did not finish for subset "
                                     "ratio %f -> going smaller", ratio)
                    continue
                else:
                    self.logger.info("Finished SUBSET training successfully "
                                     "with ratio %f", ratio)
                    subset_config_succesful[subset_config_id] = True
                    break
            else:
                if status != StatusType.SUCCESS:
                    self.logger.info("A CONFIG did not finish for subset "
                                     "ratio %f.", ratio)
                    continue
                else:
                    self.logger.info("Finished SUBSET training successfully "
                                     "with ratio %f", ratio)
                    subset_config_succesful[subset_config_id] = True
                    break

    # Use the first non-failing configuration from the subsets as the new
    # default configuration -> this guards us against the random forest
    # failing on large, sparse datasets
    default_cfg = None
    for subset_config_id, next_config in enumerate(subset_configs):
        if subset_config_succesful[subset_config_id]:
            default_cfg = next_config
            break
    if default_cfg is None:
        default_cfg = self.config_space.get_default_configuration()

    # == METALEARNING suggestions
    # we start by evaluating the defaults on the full dataset again
    # and add the suggestions from metalearning behind it
    if self.metadata_directory is None:
        metalearning_directory = os.path.dirname(
            autosklearn.metalearning.__file__)
        # There is no multilabel data in OpenML
        if self.task == MULTILABEL_CLASSIFICATION:
            meta_task = BINARY_CLASSIFICATION
        else:
            meta_task = self.task
        metadata_directory = os.path.join(
            metalearning_directory, 'files',
            '%s_%s_%s' % (METRIC_TO_STRING[self.metric],
                          TASK_TYPES_TO_STRING[meta_task],
                          'sparse' if self.datamanager.info['is_sparse']
                          else 'dense'))
        self.metadata_directory = metadata_directory

    self.logger.info('Metadata directory: %s', self.metadata_directory)
    meta_base = MetaBase(self.config_space, self.metadata_directory)

    metafeature_calculation_time_limit = int(self.total_walltime_limit / 4)
    metafeature_calculation_start_time = time.time()
    meta_features = self._calculate_metafeatures_with_limits(
        metafeature_calculation_time_limit)
    metafeature_calculation_end_time = time.time()
    metafeature_calculation_time_limit = \
        metafeature_calculation_time_limit - (
            metafeature_calculation_end_time -
            metafeature_calculation_start_time)

    if metafeature_calculation_time_limit < 1:
        self.logger.warning('Time limit for metafeature calculation less '
                            'than 1 second (%f). Skipping calculation '
                            'of metafeatures for encoded dataset.',
                            metafeature_calculation_time_limit)
        meta_features_encoded = None
    else:
        self.datamanager.perform1HotEncoding()
        meta_features_encoded = \
            self._calculate_metafeatures_encoded_with_limits(
                metafeature_calculation_time_limit)

    # In case there is a problem calculating the encoded meta-features
    if meta_features is None:
        if meta_features_encoded is not None:
            meta_features = meta_features_encoded
    else:
        if meta_features_encoded is not None:
            meta_features.metafeature_values.update(
                meta_features_encoded.metafeature_values)

    if meta_features is not None:
        meta_base.add_dataset(instance_id, meta_features)
        # Do mean imputation of the meta-features - should be done specific
        # for each prediction model!
        all_metafeatures = meta_base.get_metafeatures(
            features=list(meta_features.keys()))
        all_metafeatures.fillna(all_metafeatures.mean(), inplace=True)

        metalearning_configurations = \
            self.collect_metalearning_suggestions(meta_base)
        if metalearning_configurations is None:
            metalearning_configurations = []
        self.reset_data_manager()

        self.logger.info('%s', meta_features)

        # Convert meta-features into a dictionary because the scenario
        # expects a dictionary
        meta_features_dict = {}
        for dataset, series in all_metafeatures.iterrows():
            meta_features_dict[dataset] = series.values
        meta_features_list = []
        for meta_feature_name in all_metafeatures.columns:
            meta_features_list.append(
                meta_features[meta_feature_name].value)
        meta_features_list = np.array(meta_features_list).reshape((1, -1))
        self.logger.info(list(meta_features_dict.keys()))

        meta_runs = meta_base.get_all_runs(METRIC_TO_STRING[self.metric])
        meta_runs_index = 0
        try:
            meta_durations = meta_base.get_all_runs('runtime')
            read_runtime_data = True
        except KeyError:
            read_runtime_data = False
            self.logger.critical('Cannot read runtime data.')
            if self.acquisition_function == 'EIPS':
                self.logger.critical(
                    'Reverting to acquisition function EI!')
                self.acquisition_function = 'EI'

        for meta_dataset in meta_runs.index:
            meta_dataset_start_index = meta_runs_index
            for meta_configuration in meta_runs.columns:
                if np.isfinite(meta_runs.loc[meta_dataset,
                                             meta_configuration]):
                    try:
                        config = \
                            meta_base.get_configuration_from_algorithm_index(
                                meta_configuration)
                        cost = meta_runs.loc[meta_dataset,
                                             meta_configuration]
                        if read_runtime_data:
                            runtime = meta_durations.loc[
                                meta_dataset, meta_configuration]
                        else:
                            runtime = 1
                        # TODO read out other status types!
                        meta_runhistory.add(config, cost, runtime,
                                            StatusType.SUCCESS,
                                            instance_id=meta_dataset)
                        meta_runs_index += 1
                    except Exception:
                        # TODO maybe add warning
                        pass

            meta_runs_dataset_indices[meta_dataset] = (
                meta_dataset_start_index, meta_runs_index)
    else:
        if self.acquisition_function == 'EIPS':
            self.logger.critical('Reverting to acquisition function EI!')
            self.acquisition_function = 'EI'
        meta_features_list = []
        meta_features_dict = {}
        metalearning_configurations = []

    self.scenario = AutoMLScenario(self.config_space,
                                   self.total_walltime_limit,
                                   self.func_eval_time_limit,
                                   meta_features_dict,
                                   self.tmp_dir,
                                   self.shared_mode)

    types = get_types(self.config_space, self.scenario.feature_array)
    if self.acquisition_function == 'EI':
        rh2EPM = RunHistory2EPM4Cost(num_params=num_params,
                                     scenario=self.scenario,
                                     success_states=None,
                                     impute_censored_data=False,
                                     impute_state=None)
        model = RandomForestWithInstances(
            types, instance_features=meta_features_list,
            seed=1, num_trees=10)
        smac = SMBO(self.scenario, model=model, rng=seed)
    elif self.acquisition_function == 'EIPS':
        rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
                                     scenario=self.scenario,
                                     success_states=None,
                                     impute_censored_data=False,
                                     impute_state=None)
        model = UncorrelatedMultiObjectiveRandomForestWithInstances(
            ['cost', 'runtime'], types, num_trees=10,
            instance_features=meta_features_list, seed=1)
        acquisition_function = EIPS(model)
        smac = SMBO(self.scenario,
                    acquisition_function=acquisition_function,
                    model=model, runhistory2epm=rh2EPM, rng=seed)
    else:
        raise ValueError('Unknown acquisition function value %s!' %
                         self.acquisition_function)

    # Build a runtime model
    # runtime_rf = RandomForestWithInstances(
    #     types, instance_features=meta_features_list,
    #     seed=1, num_trees=10)
    # runtime_rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
    #                                      scenario=self.scenario,
    #                                      success_states=None,
    #                                      impute_censored_data=False,
    #                                      impute_state=None)
    # X_runtime, y_runtime = runtime_rh2EPM.transform(meta_runhistory)
    # runtime_rf.train(X_runtime, y_runtime[:, 1].flatten())
    X_meta, Y_meta = rh2EPM.transform(meta_runhistory)
    # Transform Y_meta on a per-dataset base
    for meta_dataset in meta_runs_dataset_indices:
        start_index, end_index = meta_runs_dataset_indices[meta_dataset]
        end_index += 1  # Python indexing
        Y_meta[start_index:end_index, 0] \
            [Y_meta[start_index:end_index, 0] > 2.0] = 2.0
        dataset_minimum = np.min(Y_meta[start_index:end_index, 0])
        Y_meta[start_index:end_index, 0] = 1 - (
            (1. - Y_meta[start_index:end_index, 0]) /
            (1. - dataset_minimum))
        Y_meta[start_index:end_index, 0] \
            [Y_meta[start_index:end_index, 0] > 2] = 2

    # == first, evaluate all metalearning and default configurations
    for i, next_config in enumerate(([default_cfg] +
                                     metalearning_configurations)):
        # Do not evaluate default configurations more than once
        if i >= len([default_cfg]) and next_config in [default_cfg]:
            continue

        config_name = 'meta-learning' if i >= len([default_cfg]) \
            else 'default'

        self.logger.info("Starting to evaluate %d. configuration "
                         "(%s configuration) with time limit %ds.",
                         num_run, config_name, self.func_eval_time_limit)
        self.logger.info(next_config)
        self.reset_data_manager()
        info = eval_with_limits(self.datamanager, self.tmp_dir,
                                next_config, seed, num_run,
                                self.resampling_strategy,
                                self.resampling_strategy_args,
                                self.memory_limit,
                                self.func_eval_time_limit)
        (duration, result, _, additional_run_info, status) = info
        run_history.add(config=next_config, cost=result,
                        time=duration, status=status,
                        instance_id=instance_id, seed=seed)
        run_history.update_cost(next_config, result)
        self.logger.info("Finished evaluating %d. configuration. "
                         "Duration %f; loss %f; status %s; additional run "
                         "info: %s ", num_run, duration, result,
                         str(status), additional_run_info)
        num_run += 1
        if smac.incumbent is None:
            smac.incumbent = next_config
        elif result < run_history.get_cost(smac.incumbent):
            smac.incumbent = next_config

        if self.scenario.shared_model:
            pSMAC.write(run_history=run_history,
                        output_directory=self.scenario.output_dir,
                        num_run=self.seed)

    # == after metalearning run SMAC loop
    smac.runhistory = run_history
    smac_iter = 0
    finished = False
    while not finished:
        if self.scenario.shared_model:
            pSMAC.read(run_history=run_history,
                       output_directory=self.scenario.output_dir,
                       configuration_space=self.config_space,
                       logger=self.logger)

        next_configs = []
        time_for_choose_next = -1
        try:
            X_cfg, Y_cfg = rh2EPM.transform(run_history)

            if not run_history.empty():
                # Update costs by normalization
                dataset_minimum = np.min(Y_cfg[:, 0])
                Y_cfg[:, 0] = 1 - ((1. - Y_cfg[:, 0]) /
                                   (1. - dataset_minimum))
                Y_cfg[:, 0][Y_cfg[:, 0] > 2] = 2

            if len(X_meta) > 0 and len(X_cfg) > 0:
                pass
                # X_cfg = np.concatenate((X_meta, X_cfg))
                # Y_cfg = np.concatenate((Y_meta, Y_cfg))
            elif len(X_meta) > 0:
                X_cfg = X_meta.copy()
                Y_cfg = Y_meta.copy()
            elif len(X_cfg) > 0:
                X_cfg = X_cfg.copy()
                Y_cfg = Y_cfg.copy()
            else:
                raise ValueError(
                    'No training data for SMAC random forest!')

            self.logger.info('Using %d training points for SMAC.' %
                             X_cfg.shape[0])
            choose_next_start_time = time.time()
            next_configs_tmp = smac.choose_next(
                X_cfg, Y_cfg,
                num_interleaved_random=110,
                num_configurations_by_local_search=10,
                num_configurations_by_random_search_sorted=100)
            time_for_choose_next = time.time() - choose_next_start_time
            self.logger.info('Used %g seconds to find next '
                             'configurations' % (time_for_choose_next))
            next_configs.extend(next_configs_tmp)
        # TODO put Exception here!
        except Exception as e:
            self.logger.error(e)
            self.logger.error("Error in getting next configurations "
                              "with SMAC. Using random configuration!")
            next_config = self.config_space.sample_configuration()
            next_configs.append(next_config)

        models_fitted_this_iteration = 0
        start_time_this_iteration = time.time()
        for next_config in next_configs:
            x_runtime = impute_inactive_values(next_config)
            x_runtime = impute_inactive_values(x_runtime).get_array()
            # predicted_runtime = \
            #     runtime_rf.predict_marginalized_over_instances(
            #         x_runtime.reshape((1, -1)))
            # predicted_runtime = np.exp(predicted_runtime[0][0][0]) - 1

            self.logger.info("Starting to evaluate %d. configuration "
                             "(from SMAC) with time limit %ds.", num_run,
                             self.func_eval_time_limit)
            self.logger.info(next_config)
            self.reset_data_manager()
            info = eval_with_limits(self.datamanager, self.tmp_dir,
                                    next_config, seed, num_run,
                                    self.resampling_strategy,
                                    self.resampling_strategy_args,
                                    self.memory_limit,
                                    self.func_eval_time_limit)
            (duration, result, _, additional_run_info, status) = info
            run_history.add(config=next_config, cost=result,
                            time=duration, status=status,
                            instance_id=instance_id, seed=seed)
            run_history.update_cost(next_config, result)

            # self.logger.info('Predicted runtime %g, true runtime %g',
            #                  predicted_runtime, duration)

            # TODO add unittest to make sure everything works fine and
            # this does not get outdated!
            if smac.incumbent is None:
                smac.incumbent = next_config
            elif result < run_history.get_cost(smac.incumbent):
                smac.incumbent = next_config

            self.logger.info("Finished evaluating %d. configuration. "
                             "Duration: %f; loss: %f; status %s; "
                             "additional run info: %s ", num_run, duration,
                             result, str(status), additional_run_info)
            smac_iter += 1
            num_run += 1

            models_fitted_this_iteration += 1
            time_used_this_iteration = \
                time.time() - start_time_this_iteration
            if models_fitted_this_iteration >= 2 and \
                    time_for_choose_next > 0 and \
                    time_used_this_iteration > time_for_choose_next:
                break
            elif time_for_choose_next <= 0 and \
                    models_fitted_this_iteration >= 1:
                break
            elif models_fitted_this_iteration >= 50:
                break

        if max_iters is not None:
            finished = (smac_iter >= max_iters)

        if self.scenario.shared_model:
            pSMAC.write(run_history=run_history,
                        output_directory=self.scenario.output_dir,
                        num_run=self.seed)
"run_obj": "quality", # we optimize quality (alternative runtime) "runcount-limit": 200, # at most 200 function evaluations "cs": cs, # configuration space "deterministic": "true", #"tuner-timeout" : 1, #"wallclock_limit": 2 }) stats = Stats(scenario) # register function to be optimize taf = ExecuteTAFunc(rosenbrock_4d, stats) # example call of the function # it returns: Status, Cost, Runtime, Additional Infos def_value = taf.run(cs.get_default_configuration())[1] print("Default Value: %.2f" % (def_value)) # Optimize smbo = SMBO(scenario=scenario, tae_runner=taf, stats=stats, rng=np.random.RandomState(42)) try: smbo.run(max_iters=999) finally: smbo.stats.print_stats() print("Final Incumbent: %s" % (smbo.incumbent)) inc_value = taf.run(smbo.incumbent)[1] print("Optimized Value: %.2f" % (inc_value))
def __init__(self,
             scenario: Scenario,
             # TODO: once we drop python3.4 add type hint
             # typing.Union[ExecuteTARun, callable]
             tae_runner=None,
             runhistory: RunHistory = None,
             intensifier: Intensifier = None,
             acquisition_function: AbstractAcquisitionFunction = None,
             model: AbstractEPM = None,
             runhistory2epm: AbstractRunHistory2EPM = None,
             initial_design: InitialDesign = None,
             initial_configurations: typing.List[Configuration] = None,
             stats: Stats = None,
             rng: np.random.RandomState = None):
    '''
    Facade to use SMAC default mode

    Parameters
    ----------
    scenario: smac.scenario.scenario.Scenario
        Scenario object
    tae_runner: ExecuteTARun or callable
        Callable or implementation of :class:`ExecuteTARun`. In case a
        callable is passed, it will be wrapped by tae.ExecuteTAFunc().
        If not set, tae_runner will be initialized with
        tae.ExecuteTARunOld().
    runhistory: RunHistory
        Runhistory to store all algorithm runs
    intensifier: Intensifier
        Intensification object to issue a racing to decide the current
        incumbent
    acquisition_function : AbstractAcquisitionFunction
        Object that implements the AbstractAcquisitionFunction. Will use
        EI if not set.
    model : AbstractEPM
        Model that implements train() and predict(). Will use a
        RandomForest if not set.
    runhistory2epm : AbstractRunHistory2EPM
        Object that implements the AbstractRunHistory2EPM. If None, will
        use RunHistory2EPM4Cost if the objective is cost, or
        RunHistory2EPM4LogCost if the objective is runtime.
    initial_design: InitialDesign
        Initial sampling design
    initial_configurations: typing.List[Configuration]
        List of initial configurations for the initial design -- cannot
        be used together with initial_design
    stats: Stats
        Optional stats object
    rng: np.random.RandomState
        Random number generator
    '''
    self.logger = logging.getLogger("SMAC")

    aggregate_func = average_cost

    # initialize stats object
    if stats:
        self.stats = stats
    else:
        self.stats = Stats(scenario)

    # initialize empty runhistory
    if runhistory is None:
        runhistory = RunHistory(aggregate_func=aggregate_func)

    # initial random number generator
    num_run, rng = self._get_rng(rng=rng)

    # reset random number generator in config space to draw different
    # random configurations with each seed given to SMAC
    scenario.cs.seed(rng.randint(MAXINT))

    # initial Trajectory Logger
    traj_logger = TrajLogger(output_dir=scenario.output_dir,
                             stats=self.stats)

    # initial EPM
    types = get_types(scenario.cs, scenario.feature_array)
    if model is None:
        model = RandomForestWithInstances(
            types=types,
            instance_features=scenario.feature_array,
            seed=rng.randint(MAXINT))

    # initial acquisition function
    if acquisition_function is None:
        acquisition_function = EI(model=model)

    # initialize optimizer on acquisition function
    local_search = LocalSearch(acquisition_function, scenario.cs)

    # initialize tae_runner
    # First case: if tae_runner is None, the target algorithm is a call
    # string in the scenario file
    if tae_runner is None:
        tae_runner = ExecuteTARunOld(ta=scenario.ta,
                                     stats=self.stats,
                                     run_obj=scenario.run_obj,
                                     runhistory=runhistory,
                                     par_factor=scenario.par_factor)
    # Second case: the tae_runner is a function to be optimized
    elif callable(tae_runner):
        tae_runner = ExecuteTAFuncDict(ta=tae_runner,
                                       stats=self.stats,
                                       run_obj=scenario.run_obj,
                                       memory_limit=scenario.memory_limit,
                                       runhistory=runhistory,
                                       par_factor=scenario.par_factor)
    # Third case: if it is an ExecuteTARun, we can simply use the
    # instance. Otherwise, the next check raises an exception
    elif not isinstance(tae_runner, ExecuteTARun):
        raise TypeError("Argument 'tae_runner' is %s, but must be "
                        "either a callable or an instance of "
                        "ExecuteTARun. Passing 'None' will result in the "
                        "creation of a target algorithm runner based on "
                        "the call string in the scenario file." %
                        type(tae_runner))

    # Check that overall objective and tae objective are the same
    if tae_runner.run_obj != scenario.run_obj:
        raise ValueError("Objective for the target algorithm runner and "
                         "the scenario must be the same, but are '%s' and "
                         "'%s'" % (tae_runner.run_obj, scenario.run_obj))

    # inject stats if necessary
    if tae_runner.stats is None:
        tae_runner.stats = self.stats
    # inject runhistory if necessary
    if tae_runner.runhistory is None:
        tae_runner.runhistory = runhistory

    # initial intensification
    if intensifier is None:
        intensifier = Intensifier(
            tae_runner=tae_runner,
            stats=self.stats,
            traj_logger=traj_logger,
            rng=rng,
            instances=scenario.train_insts,
            cutoff=scenario.cutoff,
            deterministic=scenario.deterministic,
            run_obj_time=scenario.run_obj == "runtime",
            instance_specifics=scenario.instance_specific,
            minR=scenario.minR,
            maxR=scenario.maxR)

    # initial design
    if initial_design is not None and initial_configurations is not None:
        raise ValueError("Either use initial_design or "
                         "initial_configurations; but not both")

    if initial_configurations is not None:
        initial_design = MultiConfigInitialDesign(
            tae_runner=tae_runner,
            scenario=scenario,
            stats=self.stats,
            traj_logger=traj_logger,
            runhistory=runhistory,
            rng=rng,
            configs=initial_configurations,
            intensifier=intensifier,
            aggregate_func=aggregate_func)
    elif initial_design is None:
        if scenario.initial_incumbent == "DEFAULT":
            initial_design = DefaultConfiguration(
                tae_runner=tae_runner,
                scenario=scenario,
                stats=self.stats,
                traj_logger=traj_logger,
                rng=rng)
        elif scenario.initial_incumbent == "RANDOM":
            initial_design = RandomConfiguration(
                tae_runner=tae_runner,
                scenario=scenario,
                stats=self.stats,
                traj_logger=traj_logger,
                rng=rng)
        else:
            raise ValueError("Don't know what kind of initial_incumbent "
                             "'%s' is" % scenario.initial_incumbent)

    # initial conversion of runhistory into EPM data
    if runhistory2epm is None:
        num_params = len(scenario.cs.get_hyperparameters())
        if scenario.run_obj == "runtime":
            # if we log the performance data, the RFRImputator will
            # already get log-transformed data from the runhistory
            cutoff = np.log10(scenario.cutoff)
            threshold = np.log10(scenario.cutoff * scenario.par_factor)

            imputor = RFRImputator(rs=rng,
                                   cutoff=cutoff,
                                   threshold=threshold,
                                   model=model,
                                   change_threshold=0.01,
                                   max_iter=2)

            runhistory2epm = RunHistory2EPM4LogCost(
                scenario=scenario,
                num_params=num_params,
                success_states=[StatusType.SUCCESS, ],
                impute_censored_data=True,
                impute_state=[StatusType.TIMEOUT, ],
                imputor=imputor)
        elif scenario.run_obj == 'quality':
            runhistory2epm = RunHistory2EPM4Cost(
                scenario=scenario,
                num_params=num_params,
                success_states=[StatusType.SUCCESS, ],
                impute_censored_data=False,
                impute_state=None)
        else:
            raise ValueError('Unknown run objective: %s. Should be either '
                             'quality or runtime.' % scenario.run_obj)

    self.solver = SMBO(scenario=scenario,
                       stats=self.stats,
                       initial_design=initial_design,
                       runhistory=runhistory,
                       runhistory2epm=runhistory2epm,
                       intensifier=intensifier,
                       aggregate_func=aggregate_func,
                       num_run=num_run,
                       model=model,
                       acq_optimizer=local_search,
                       acquisition_func=acquisition_function,
                       rng=rng)
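# Hedged usage sketch for the facade above -- assumes this __init__ belongs
# to a facade class named SMAC that drives the optimization through the
# wrapped SMBO instance, as in the other examples; `cs` and the parameter
# names x0/x1 are illustrative:
def rosenbrock_2d(cfg):
    return 100. * (cfg['x1'] - cfg['x0'] ** 2) ** 2 + (1. - cfg['x0']) ** 2

scenario = Scenario({"run_obj": "quality",
                     "runcount-limit": 50,
                     "cs": cs,  # a ConfigurationSpace with floats x0, x1
                     "deterministic": "true"})
smac = SMAC(scenario=scenario, tae_runner=rosenbrock_2d,
            rng=np.random.RandomState(42))
smac.solver.run(max_iters=50)
print("Final Incumbent: %s" % smac.solver.incumbent)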
def test_init_only_scenario_quality(self):
    smbo = SMBO(self.scenario)
    self.assertIsInstance(smbo.model, RandomForestWithInstances)
    np.testing.assert_allclose(smbo.types, smbo.model.types)
    self.assertIsInstance(smbo.rh2EPM, RunHistory2EPM4Cost)
    self.assertIsInstance(smbo.acquisition_func, EI)