def test_choose_next_2(self):
    def side_effect(X, derivative):
        return np.mean(X, axis=1).reshape((-1, 1))

    smbo = SMAC(self.scenario, rng=1).solver
    smbo.incumbent = self.scenario.cs.sample_configuration()
    smbo.runhistory = RunHistory(aggregate_func=average_cost)
    smbo.model = mock.Mock(spec=RandomForestWithInstances)
    smbo.acquisition_func._compute = mock.Mock(
        spec=RandomForestWithInstances)
    smbo.acquisition_func._compute.side_effect = side_effect

    X = smbo.rng.rand(10, 2)
    Y = smbo.rng.rand(10, 1)

    x = smbo.choose_next(X, Y)

    self.assertEqual(smbo.model.train.call_count, 1)
    self.assertEqual(len(x), 2020)

    num_random_search = 0
    num_local_search = 0
    for i in range(0, 2020, 2):
        self.assertIsInstance(x[i], Configuration)
        if 'Random Search (sorted)' in x[i].origin:
            num_random_search += 1
        elif 'Local Search' in x[i].origin:
            num_local_search += 1
    # The number of local search configurations has to be at least 10,
    # since x can contain duplicates, which can be associated with the
    # local search.
    self.assertGreaterEqual(num_local_search, 10)

    for i in range(1, 2020, 2):
        self.assertIsInstance(x[i], Configuration)
        self.assertEqual(x[i].origin, 'Random Search')
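
# The assertions above rely on choose_next interleaving its challengers:
# even indices hold acquisition-sorted configurations ('Random Search
# (sorted)' or 'Local Search'), odd indices hold purely random ones. A
# minimal sketch of that interleaving, for illustration only (not SMAC's
# actual implementation; `sorted_challengers` and `random_challengers` are
# hypothetical inputs):
def _interleave_challengers(sorted_challengers, random_challengers):
    interleaved = []
    for by_acquisition, at_random in zip(sorted_challengers,
                                         random_challengers):
        interleaved.append(by_acquisition)  # even index: acquisition-sorted
        interleaved.append(at_random)       # odd index: 'Random Search'
    return interleaved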
def test_get_runhistory_and_trajectory(self):
    ta = ExecuteTAFuncDict(lambda x: x ** 2)
    smac = SMAC(tae_runner=ta, scenario=self.scenario)
    self.assertRaises(ValueError, smac.get_runhistory)
    self.assertRaises(ValueError, smac.get_trajectory)
    smac.trajectory = 'dummy'
    self.assertEqual(smac.get_trajectory(), 'dummy')
    smac.runhistory = 'dummy'
    self.assertEqual(smac.get_runhistory(), 'dummy')
def test_choose_next(self):
    seed = 42
    smbo = SMAC(self.scenario, rng=seed).solver
    smbo.runhistory = RunHistory(aggregate_func=average_cost)
    X = self.scenario.cs.sample_configuration().get_array()[None, :]
    smbo.incumbent = self.scenario.cs.sample_configuration()
    Y = self.branin(X)
    x = smbo.choose_next(X, Y)[0].get_array()
    assert x.shape == (2,)
def test_choose_next_w_empty_rh(self):
    seed = 42
    smbo = SMAC(self.scenario, rng=seed).solver
    smbo.runhistory = RunHistory(aggregate_func=average_cost)
    X = self.scenario.cs.sample_configuration().get_array()[None, :]
    Y = self.branin(X)
    self.assertRaises(ValueError, smbo.choose_next, **{"X": X, "Y": Y})
    x = next(smbo.choose_next(X, Y, incumbent_value=0.0)).get_array()
    assert x.shape == (2,)
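
# Sketch of the guard exercised by test_choose_next_w_empty_rh (assumed
# behaviour, not SMAC's verbatim code): with an empty runhistory and no
# explicit incumbent_value, there is no incumbent cost to hand to the
# acquisition function, hence the ValueError.
def _incumbent_value(runhistory, incumbent, incumbent_value=None):
    if incumbent_value is not None:
        return incumbent_value
    if len(runhistory.data) == 0:
        raise ValueError('Runhistory is empty; pass incumbent_value '
                         'explicitly.')
    return runhistory.get_cost(incumbent)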
def test_choose_next_3(self):
    # Test with ten configurations in the runhistory
    def side_effect(X):
        return np.mean(X, axis=1).reshape((-1, 1))

    def side_effect_predict(X):
        m, v = np.ones((X.shape[0], 1)), None
        return m, v

    smbo = SMAC(self.scenario, rng=1).solver
    smbo.incumbent = self.scenario.cs.sample_configuration()
    previous_configs = [smbo.incumbent] + [
        self.scenario.cs.sample_configuration() for i in range(0, 20)
    ]
    smbo.runhistory = RunHistory(aggregate_func=average_cost)
    for i in range(len(previous_configs)):
        smbo.runhistory.add(previous_configs[i], i, 10, 1)
    smbo.model = mock.Mock(spec=RandomForestWithInstances)
    smbo.model.predict_marginalized_over_instances.side_effect = \
        side_effect_predict
    smbo.acquisition_func._compute = mock.Mock(
        spec=RandomForestWithInstances)
    smbo.acquisition_func._compute.side_effect = side_effect

    X = smbo.rng.rand(10, 2)
    Y = smbo.rng.rand(10, 1)

    challengers = smbo.choose_next(X, Y)
    # Convert the challenger list (a generator) into a real list.
    challengers = [c for c in challengers]

    self.assertEqual(smbo.model.train.call_count, 1)

    # For each configuration it is randomly sampled whether to take it
    # from the list of challengers or to sample it completely at random.
    # Therefore, it is not guaranteed to obtain twice the number of
    # configurations selected by EI.
    self.assertEqual(len(challengers), 9913)
    num_random_search_sorted = 0
    num_random_search = 0
    num_local_search = 0
    for c in challengers:
        self.assertIsInstance(c, Configuration)
        if 'Random Search (sorted)' == c.origin:
            num_random_search_sorted += 1
        elif 'Random Search' == c.origin:
            num_random_search += 1
        elif 'Local Search' == c.origin:
            num_local_search += 1
        else:
            raise ValueError(c.origin)
    self.assertEqual(num_local_search, 10)
    self.assertEqual(num_random_search_sorted, 4990)
    self.assertEqual(num_random_search, 4913)
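
# The count assertions above assume a challenger generator that flips a
# coin per step between the next acquisition-sorted candidate and a fresh
# random configuration, which is why the totals are not exactly twice the
# number of EI-selected configurations. A minimal sketch under that
# assumption (`sample_random` is a hypothetical callable, not SMAC's API):
def _challenger_generator(sorted_candidates, sample_random, rng):
    sorted_candidates = iter(sorted_candidates)
    while True:
        if rng.rand() < 0.5:
            try:
                yield next(sorted_candidates)  # 'Random Search (sorted)'
            except StopIteration:              # or 'Local Search'
                return
        else:
            yield sample_random()              # 'Random Search'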
def run_smbo(self):

    self.watcher.start_task('SMBO')

    # == First things first: load the datamanager.
    self.reset_data_manager()

    # == Initialize non-SMBO stuff
    # First create a scenario.
    seed = self.seed
    self.config_space.seed(seed)
    num_params = len(self.config_space.get_hyperparameters())
    # Allocate a run history.
    num_run = self.start_num_run

    # Initialize some SMAC dependencies.
    runhistory = RunHistory(aggregate_func=average_cost)
    # meta_runhistory = RunHistory(aggregate_func=average_cost)
    # meta_runs_dataset_indices = {}

    # == METALEARNING suggestions
    # We start by evaluating the defaults on the full dataset again
    # and add the suggestions from metalearning behind it.
    if self.num_metalearning_cfgs > 0:
        if self.metadata_directory is None:
            metalearning_directory = os.path.dirname(
                autosklearn.metalearning.__file__)
            # There is no multilabel data in OpenML.
            if self.task == MULTILABEL_CLASSIFICATION:
                meta_task = BINARY_CLASSIFICATION
            else:
                meta_task = self.task
            metadata_directory = os.path.join(
                metalearning_directory, 'files',
                '%s_%s_%s' % (self.metric,
                              TASK_TYPES_TO_STRING[meta_task],
                              'sparse' if self.datamanager.info['is_sparse']
                              else 'dense'))
            self.metadata_directory = metadata_directory

        if os.path.exists(self.metadata_directory):
            self.logger.info('Metadata directory: %s',
                             self.metadata_directory)
            meta_base = MetaBase(self.config_space, self.metadata_directory)

            try:
                meta_base.remove_dataset(self.dataset_name)
            except Exception:
                pass

            metafeature_calculation_time_limit = int(
                self.total_walltime_limit / 4)
            metafeature_calculation_start_time = time.time()
            meta_features = self._calculate_metafeatures_with_limits(
                metafeature_calculation_time_limit)
            metafeature_calculation_end_time = time.time()
            metafeature_calculation_time_limit = \
                metafeature_calculation_time_limit - (
                    metafeature_calculation_end_time -
                    metafeature_calculation_start_time)

            if metafeature_calculation_time_limit < 1:
                self.logger.warning(
                    'Time limit for metafeature calculation less '
                    'than 1 second (%f). Skipping calculation '
                    'of metafeatures for encoded dataset.',
                    metafeature_calculation_time_limit)
                meta_features_encoded = None
            else:
                with warnings.catch_warnings():
                    warnings.showwarning = self._send_warnings_to_log
                    self.datamanager.perform1HotEncoding()
                meta_features_encoded = \
                    self._calculate_metafeatures_encoded_with_limits(
                        metafeature_calculation_time_limit)

            # In case there is a problem calculating the encoded
            # meta-features.
            if meta_features is None:
                if meta_features_encoded is not None:
                    meta_features = meta_features_encoded
            else:
                if meta_features_encoded is not None:
                    meta_features.metafeature_values.update(
                        meta_features_encoded.metafeature_values)

            if meta_features is not None:
                meta_base.add_dataset(self.dataset_name, meta_features)

                # Do mean imputation of the meta-features - should be done
                # specific for each prediction model!
                all_metafeatures = meta_base.get_metafeatures(
                    features=list(meta_features.keys()))
                all_metafeatures.fillna(all_metafeatures.mean(),
                                        inplace=True)

                with warnings.catch_warnings():
                    warnings.showwarning = self._send_warnings_to_log
                    metalearning_configurations = \
                        self.collect_metalearning_suggestions(meta_base)
                if metalearning_configurations is None:
                    metalearning_configurations = []
                self.reset_data_manager()

                self.logger.info('%s', meta_features)

                # Convert meta-features into a dictionary because the
                # scenario expects a dictionary.
                meta_features_dict = {}
                for dataset, series in all_metafeatures.iterrows():
                    meta_features_dict[dataset] = series.values
                meta_features_list = []
                for meta_feature_name in all_metafeatures.columns:
                    meta_features_list.append(
                        meta_features[meta_feature_name].value)
                meta_features_list = np.array(meta_features_list).reshape(
                    (1, -1))
                self.logger.info(list(meta_features_dict.keys()))

                # meta_runs = meta_base.get_all_runs(METRIC_TO_STRING[self.metric])
                # meta_runs_index = 0
                # try:
                #     meta_durations = meta_base.get_all_runs('runtime')
                #     read_runtime_data = True
                # except KeyError:
                #     read_runtime_data = False
                #     self.logger.critical('Cannot read runtime data.')
                #     if self.acquisition_function == 'EIPS':
                #         self.logger.critical('Reverting to acquisition function EI!')
                #         self.acquisition_function = 'EI'

                # for meta_dataset in meta_runs.index:
                #     meta_dataset_start_index = meta_runs_index
                #     for meta_configuration in meta_runs.columns:
                #         if np.isfinite(meta_runs.loc[meta_dataset, meta_configuration]):
                #             try:
                #                 config = meta_base.get_configuration_from_algorithm_index(
                #                     meta_configuration)
                #                 cost = meta_runs.loc[meta_dataset, meta_configuration]
                #                 if read_runtime_data:
                #                     runtime = meta_durations.loc[meta_dataset,
                #                                                  meta_configuration]
                #                 else:
                #                     runtime = 1
                #                 # TODO read out other status types!
                #                 meta_runhistory.add(config, cost, runtime,
                #                                     StatusType.SUCCESS,
                #                                     instance_id=meta_dataset)
                #                 meta_runs_index += 1
                #             except:
                #                 # TODO maybe add warning
                #                 pass
                #
                #     meta_runs_dataset_indices[meta_dataset] = (
                #         meta_dataset_start_index, meta_runs_index)
        else:
            meta_features = None
            self.logger.warning('Could not find meta-data directory %s',
                                metadata_directory)
    else:
        meta_features = None

    if meta_features is None:
        if self.acquisition_function == 'EIPS':
            self.logger.critical('Reverting to acquisition function EI!')
            self.acquisition_function = 'EI'
        meta_features_list = []
        meta_features_dict = {}
        metalearning_configurations = []

    if self.resampling_strategy in ['partial-cv',
                                    'partial-cv-iterative-fit']:
        num_folds = self.resampling_strategy_args['folds']
        instances = [[json.dumps({'task_id': self.dataset_name,
                                  'fold': fold_number})]
                     for fold_number in range(num_folds)]
    else:
        instances = [[json.dumps({'task_id': self.dataset_name})]]

    startup_time = self.watcher.wall_elapsed(self.dataset_name)
    total_walltime_limit = self.total_walltime_limit - startup_time - 5
    scenario_dict = {
        'cs': self.config_space,
        'cutoff-time': self.func_eval_time_limit,
        'memory-limit': self.memory_limit,
        'wallclock-limit': total_walltime_limit,
        'output-dir': self.backend.get_smac_output_directory(self.seed),
        'shared-model': self.shared_mode,
        'run-obj': 'quality',
        'deterministic': 'true',
        'instances': instances,
    }

    if self.configuration_mode == 'RANDOM':
        scenario_dict['minR'] = len(instances) \
            if instances is not None else 1
        scenario_dict['initial_incumbent'] = 'RANDOM'

    self.scenario = Scenario(scenario_dict)

    # TODO rebuild the target algorithm to be its own target algorithm
    # evaluator, which takes into account that a run can be killed prior
    # to the model being fully fitted; thus putting intermediate results
    # into a queue and querying them once the time is over.
    exclude = dict()
    include = dict()
    if self.include_preprocessors is not None and \
            self.exclude_preprocessors is not None:
        raise ValueError('Cannot specify include_preprocessors and '
                         'exclude_preprocessors.')
    elif self.include_preprocessors is not None:
        include['preprocessor'] = self.include_preprocessors
    elif self.exclude_preprocessors is not None:
        exclude['preprocessor'] = self.exclude_preprocessors

    if self.include_estimators is not None and \
            self.exclude_estimators is not None:
        raise ValueError('Cannot specify include_estimators and '
                         'exclude_estimators.')
    elif self.include_estimators is not None:
        if self.task in CLASSIFICATION_TASKS:
            include['classifier'] = self.include_estimators
        elif self.task in REGRESSION_TASKS:
            include['regressor'] = self.include_estimators
        else:
            raise ValueError(self.task)
    elif self.exclude_estimators is not None:
        if self.task in CLASSIFICATION_TASKS:
            exclude['classifier'] = self.exclude_estimators
        elif self.task in REGRESSION_TASKS:
            exclude['regressor'] = self.exclude_estimators
        else:
            raise ValueError(self.task)

    ta = ExecuteTaFuncWithQueue(
        backend=self.backend,
        autosklearn_seed=seed,
        resampling_strategy=self.resampling_strategy,
        initial_num_run=num_run,
        logger=self.logger,
        include=include,
        exclude=exclude,
        metric=self.metric,
        memory_limit=self.memory_limit,
        disable_file_output=self.disable_file_output,
        **self.resampling_strategy_args)

    types, bounds = get_types(self.config_space,
                              self.scenario.feature_array)

    # TODO extract the generation of the SMAC object into its own
    # function for testing.
    if self.acquisition_function == 'EI':
        model = RandomForestWithInstances(
            types=types,
            bounds=bounds,
            # instance_features=meta_features_list,
            seed=1,
            num_trees=10)
        rh2EPM = RunHistory2EPM4Cost(num_params=num_params,
                                     scenario=self.scenario,
                                     success_states=[StatusType.SUCCESS,
                                                     StatusType.MEMOUT,
                                                     StatusType.TIMEOUT],
                                     impute_censored_data=False,
                                     impute_state=None)
        _smac_arguments = dict(scenario=self.scenario,
                               model=model,
                               rng=seed,
                               runhistory2epm=rh2EPM,
                               tae_runner=ta,
                               runhistory=runhistory)
    elif self.acquisition_function == 'EIPS':
        rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
                                     scenario=self.scenario,
                                     success_states=[StatusType.SUCCESS,
                                                     StatusType.MEMOUT,
                                                     StatusType.TIMEOUT],
                                     impute_censored_data=False,
                                     impute_state=None)
        model = UncorrelatedMultiObjectiveRandomForestWithInstances(
            ['cost', 'runtime'],
            types=types,
            bounds=bounds,
            num_trees=10,
            instance_features=meta_features_list,
            seed=1)
        acquisition_function = EIPS(model)
        _smac_arguments = dict(scenario=self.scenario,
                               model=model,
                               rng=seed,
                               tae_runner=ta,
                               runhistory2epm=rh2EPM,
                               runhistory=runhistory,
                               acquisition_function=acquisition_function)
    else:
        raise ValueError('Unknown acquisition function value %s!' %
                         self.acquisition_function)

    if self.configuration_mode == 'SMAC':
        smac = SMAC(**_smac_arguments)
    elif self.configuration_mode in ['ROAR', 'RANDOM']:
        for not_in_roar in ['runhistory2epm', 'model']:
            if not_in_roar in _smac_arguments:
                del _smac_arguments[not_in_roar]
        smac = ROAR(**_smac_arguments)
    else:
        raise ValueError(self.configuration_mode)

    # Build a runtime model
    # runtime_rf = RandomForestWithInstances(types,
    #                                        instance_features=meta_features_list,
    #                                        seed=1, num_trees=10)
    # runtime_rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
    #                                      scenario=self.scenario,
    #                                      success_states=None,
    #                                      impute_censored_data=False,
    #                                      impute_state=None)
    # X_runtime, y_runtime = runtime_rh2EPM.transform(meta_runhistory)
    # runtime_rf.train(X_runtime, y_runtime[:, 1].flatten())
    # X_meta, Y_meta = rh2EPM.transform(meta_runhistory)
    # # Transform Y_meta on a per-dataset base
    # for meta_dataset in meta_runs_dataset_indices:
    #     start_index, end_index = meta_runs_dataset_indices[meta_dataset]
    #     end_index += 1  # Python indexing
    #     Y_meta[start_index:end_index, 0]\
    #         [Y_meta[start_index:end_index, 0] > 2.0] = 2.0
    #     dataset_minimum = np.min(Y_meta[start_index:end_index, 0])
    #     Y_meta[start_index:end_index, 0] = 1 - (
    #         (1. - Y_meta[start_index:end_index, 0]) /
    #         (1. - dataset_minimum))
    #     Y_meta[start_index:end_index, 0]\
    #         [Y_meta[start_index:end_index, 0] > 2] = 2

    smac.solver.stats.start_timing()
    # == First, evaluate all metalearning and default configurations.
    smac.solver.incumbent = smac.solver.initial_design.run()

    for challenger in metalearning_configurations:
        smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify(
            challengers=[challenger],
            incumbent=smac.solver.incumbent,
            run_history=smac.solver.runhistory,
            aggregate_func=smac.solver.aggregate_func,
            time_bound=self.total_walltime_limit)

        if smac.solver.scenario.shared_model:
            pSMAC.write(run_history=smac.solver.runhistory,
                        output_directory=smac.solver.scenario.output_dir,
                        num_run=self.seed)

        if smac.solver.stats.is_budget_exhausted():
            break

    # == After metalearning, run the SMAC loop.
    while True:
        if smac.solver.scenario.shared_model:
            pSMAC.read(run_history=smac.solver.runhistory,
                       output_dirs=glob.glob(
                           self.backend.get_smac_output_glob()),
                       configuration_space=self.config_space,
                       logger=self.logger)

        choose_next_start_time = time.time()
        try:
            challengers = self.choose_next(smac)
        except Exception as e:
            self.logger.error(e)
            self.logger.error('Error in getting next configurations '
                              'with SMAC. Using random configuration!')
            next_config = self.config_space.sample_configuration()
            challengers = [next_config]
        time_for_choose_next = time.time() - choose_next_start_time
        self.logger.info('Used %g seconds to find next configurations',
                         time_for_choose_next)
        time_for_choose_next = max(time_for_choose_next, 1.0)

        smac.solver.incumbent, inc_perf = smac.solver.intensifier.intensify(
            challengers=challengers,
            incumbent=smac.solver.incumbent,
            run_history=smac.solver.runhistory,
            aggregate_func=smac.solver.aggregate_func,
            time_bound=time_for_choose_next)

        if smac.solver.scenario.shared_model:
            pSMAC.write(run_history=smac.solver.runhistory,
                        output_directory=smac.solver.scenario.output_dir,
                        num_run=self.seed)

        if smac.solver.stats.is_budget_exhausted():
            break

    self.runhistory = smac.solver.runhistory
    self.trajectory = smac.solver.intensifier.traj_logger.trajectory
    smac.runhistory = self.runhistory
    self.fANOVA_input = smac.get_X_y()

    return self.runhistory, self.trajectory, self.fANOVA_input
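
# A minimal usage sketch for the values run_smbo returns (the
# `automl_smbo` instance name is hypothetical): the runhistory holds all
# evaluated configurations, the trajectory records each incumbent update,
# and the fANOVA input is the (X, y) pair for hyperparameter-importance
# analysis.
#
#     runhistory, trajectory, fanova_input = automl_smbo.run_smbo()
#     incumbent = trajectory[-1].incumbent  # final incumbent configuration
#     X, y = fanova_input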