# Replay the SMAC trajectory to record which incumbent was current at each of
# the n_iters iterations. Scanning starts at trajectory index 1 (entry 0 is
# never read), so `inc` stays None until the first matching entry is reached.
inc = None
idx = 1
t = smac.get_trajectory()
for i in range(n_iters):
    # Entry `idx` becomes the incumbent at the 0-based iteration that equals
    # its `ta_runs` counter minus one.
    if idx < len(t) and i == t[idx].ta_runs - 1:
        inc = t[idx].incumbent
        idx += 1
    incs.append(inc)

# Offline Evaluation
regret = []
runtime = []  # NOTE(review): never filled in the visible code
cum_cost = 0  # NOTE(review): never updated in the visible code

X, y, _ = smac.get_X_y()
for i, x in enumerate(X):
    # Only the number of rows in X drives this loop; each step re-evaluates
    # the incumbent recorded for that iteration with the test objective.
    # NOTE: `y` from get_X_y() is overwritten here.
    y = b.objective_function_test(incs[i])["function_value"]
    regret.append(y - info["f_opt"])

results = dict()
results["method"] = "smac"
results["benchmark"] = benchmark
results["regret"] = regret
results["run_id"] = run_id

p = os.path.join(output_path, "smac")
os.makedirs(p, exist_ok=True)

# A context manager guarantees the JSON is flushed and the handle is closed
# even if serialisation fails; the original left the file object open.
with open(os.path.join(p, '%s_run_%d.json' % (benchmark, run_id)), 'w') as fh:
    json.dump(results, fh)
# Replay the SMAC trajectory to record which incumbent was current at each of
# the n_iters iterations. Scanning starts at trajectory index 1 (entry 0 is
# never read), so `inc` stays None until the first matching entry is reached.
inc = None
idx = 1
t = smac.get_trajectory()
for i in range(n_iters):
    # Entry `idx` becomes the incumbent at the 0-based iteration that equals
    # its `ta_runs` counter minus one.
    if idx < len(t) and i == t[idx].ta_runs - 1:
        inc = t[idx].incumbent
        idx += 1
    incs.append(inc)

# Offline Evaluation
regret = []
runtime = []  # NOTE(review): never filled in the visible code
cum_cost = 0  # NOTE(review): never updated in the visible code

X, y, _ = smac.get_X_y()
for i, x in enumerate(X):
    # Only the number of rows in X drives this loop; `y` from get_X_y() is
    # overwritten with the test value of the incumbent for iteration i.
    y = b.objective_function_test(incs[i])["function_value"]
    regret.append(y - info["f_opt"])

# Collect everything that identifies this run together with its regret curve.
results = dict()
results["method"] = "smac"
results["benchmark"] = benchmark
results["regret"] = regret
results["run_id"] = run_id

# Output goes to <output_path>/smac/<benchmark>_run_<run_id>.json
p = os.path.join(output_path, "smac")
os.makedirs(p, exist_ok=True)

# NOTE(review): the handle is opened for writing but the visible code neither
# writes `results` to it nor closes it -- presumably a json.dump(results, fh)
# follows; confirm, and prefer a `with open(...) as fh:` block.
fh = open(os.path.join(p, '%s_run_%d.json' % (benchmark, run_id)), 'w')
def run_smbo(self):
    """Run the sequential model-based optimisation loop for this dataset.

    The method proceeds in these phases:

    1. Optionally compute meta-features and collect meta-learning
       configuration suggestions (only if ``self.num_metalearning_cfgs > 0``
       and the metadata directory exists).
    2. Build the SMAC ``Scenario``, the target-algorithm evaluator and the
       model / run-history transformer for the chosen acquisition function.
    3. Run the initial design, intensify every meta-learning suggestion, and
       then loop (choose next challengers -> intensify) until the SMAC
       statistics report that the budget is exhausted.

    Side effects: sets ``self.metadata_directory`` (when it was ``None``),
    ``self.scenario``, ``self.runhistory``, ``self.trajectory`` and
    ``self.fANOVA_input``; may reset ``self.acquisition_function`` from
    ``'EIPS'`` to ``'EI'`` when no meta-features are available.

    Returns
    -------
    tuple
        ``(self.runhistory, self.trajectory, self.fANOVA_input)``.

    Raises
    ------
    ValueError
        If both include and exclude lists are given for preprocessors or
        for estimators, if ``self.task`` is neither a classification nor a
        regression task while estimators are included/excluded, or if the
        acquisition function or configuration mode is unknown.
    """
    self.watcher.start_task('SMBO')

    # == first things first: load the datamanager
    self.reset_data_manager()

    # == Initialize non-SMBO stuff
    # first create a scenario
    seed = self.seed
    self.config_space.seed(seed)
    num_params = len(self.config_space.get_hyperparameters())
    # allocate a run history
    num_run = self.start_num_run

    # Initialize some SMAC dependencies
    runhistory = RunHistory(aggregate_func=average_cost)
    # NOTE: a separate meta run history (runs of the meta-learning
    # configurations on other datasets) used to be prepared here; that code
    # was commented out and is not part of the active logic.

    # == METALEARNING suggestions
    # we start by evaluating the defaults on the full dataset again
    # and add the suggestions from metalearning behind it
    if self.num_metalearning_cfgs > 0:
        if self.metadata_directory is None:
            metalearning_directory = os.path.dirname(
                autosklearn.metalearning.__file__)
            # There is no multilabel data in OpenML
            if self.task == MULTILABEL_CLASSIFICATION:
                meta_task = BINARY_CLASSIFICATION
            else:
                meta_task = self.task
            self.metadata_directory = os.path.join(
                metalearning_directory, 'files',
                '%s_%s_%s' % (
                    self.metric,
                    TASK_TYPES_TO_STRING[meta_task],
                    'sparse' if self.datamanager.info['is_sparse']
                    else 'dense'))

        if os.path.exists(self.metadata_directory):
            self.logger.info('Metadata directory: %s',
                             self.metadata_directory)
            meta_base = MetaBase(self.config_space, self.metadata_directory)

            # Best effort: the current dataset must not be part of the
            # meta base. Failure to remove it is tolerated, but unlike a
            # bare ``except`` this does not swallow KeyboardInterrupt or
            # SystemExit.
            try:
                meta_base.remove_dataset(self.dataset_name)
            except Exception:
                self.logger.debug(
                    'Could not remove dataset %s from the meta base.',
                    self.dataset_name)

            # A quarter of the total wall time is the budget for both
            # meta-feature calculations together.
            metafeature_calculation_time_limit = int(
                self.total_walltime_limit / 4)
            metafeature_calculation_start_time = time.time()
            meta_features = self._calculate_metafeatures_with_limits(
                metafeature_calculation_time_limit)
            metafeature_calculation_end_time = time.time()
            # Whatever the first calculation did not use is left for the
            # calculation on the one-hot-encoded data.
            metafeature_calculation_time_limit = \
                metafeature_calculation_time_limit - (
                    metafeature_calculation_end_time -
                    metafeature_calculation_start_time)

            if metafeature_calculation_time_limit < 1:
                self.logger.warning(
                    'Time limit for metafeature calculation less '
                    'than 1 seconds (%f). Skipping calculation '
                    'of metafeatures for encoded dataset.',
                    metafeature_calculation_time_limit)
                meta_features_encoded = None
            else:
                with warnings.catch_warnings():
                    warnings.showwarning = self._send_warnings_to_log
                    self.datamanager.perform1HotEncoding()
                meta_features_encoded = \
                    self._calculate_metafeatures_encoded_with_limits(
                        metafeature_calculation_time_limit)

            # In case there is a problem calculating the encoded meta-features
            if meta_features is None:
                if meta_features_encoded is not None:
                    meta_features = meta_features_encoded
            else:
                if meta_features_encoded is not None:
                    meta_features.metafeature_values.update(
                        meta_features_encoded.metafeature_values)

            if meta_features is not None:
                meta_base.add_dataset(self.dataset_name, meta_features)
                # Do mean imputation of the meta-features - should be done
                # specific for each prediction model!
                all_metafeatures = meta_base.get_metafeatures(
                    features=list(meta_features.keys()))
                all_metafeatures.fillna(all_metafeatures.mean(),
                                        inplace=True)

                with warnings.catch_warnings():
                    warnings.showwarning = self._send_warnings_to_log
                    metalearning_configurations = \
                        self.collect_metalearning_suggestions(meta_base)
                if metalearning_configurations is None:
                    metalearning_configurations = []
                self.reset_data_manager()

                self.logger.info('%s', meta_features)

                # Convert meta-features into a dictionary because the
                # scenario expects a dictionary
                meta_features_dict = {
                    dataset: series.values
                    for dataset, series in all_metafeatures.iterrows()
                }
                # Meta-features of the current dataset as a single row,
                # ordered like the columns of the imputed meta-feature table.
                meta_features_list = np.array([
                    meta_features[meta_feature_name].value
                    for meta_feature_name in all_metafeatures.columns
                ]).reshape((1, -1))
                self.logger.info(list(meta_features_dict.keys()))
        else:
            meta_features = None
            # Use the attribute, not the local variable: the local is only
            # bound when the directory had to be derived above.
            self.logger.warning('Could not find meta-data directory %s',
                                self.metadata_directory)
    else:
        meta_features = None

    if meta_features is None:
        # EIPS needs the meta-features as instance features for its model.
        if self.acquisition_function == 'EIPS':
            self.logger.critical('Reverting to acquisition function EI!')
            self.acquisition_function = 'EI'
        meta_features_list = []
        meta_features_dict = {}
        metalearning_configurations = []

    # One SMAC instance per fold for partial cross-validation, otherwise a
    # single instance describing the whole task.
    if self.resampling_strategy in ['partial-cv',
                                    'partial-cv-iterative-fit']:
        num_folds = self.resampling_strategy_args['folds']
        instances = [[json.dumps({'task_id': self.dataset_name,
                                  'fold': fold_number})]
                     for fold_number in range(num_folds)]
    else:
        instances = [[json.dumps({'task_id': self.dataset_name})]]

    # Time already spent plus a fixed 5 second margin is taken off the
    # budget handed to SMAC.
    startup_time = self.watcher.wall_elapsed(self.dataset_name)
    total_walltime_limit = self.total_walltime_limit - startup_time - 5
    scenario_dict = {
        'cs': self.config_space,
        'cutoff-time': self.func_eval_time_limit,
        'memory-limit': self.memory_limit,
        'wallclock-limit': total_walltime_limit,
        'output-dir': self.backend.get_smac_output_directory(self.seed),
        'shared-model': self.shared_mode,
        'run-obj': 'quality',
        'deterministic': 'true',
        'instances': instances,
    }

    if self.configuration_mode == 'RANDOM':
        scenario_dict['minR'] = len(
            instances) if instances is not None else 1
        scenario_dict['initial_incumbent'] = 'RANDOM'

    self.scenario = Scenario(scenario_dict)

    # TODO rebuild target algorithm to be it's own target algorithm
    # evaluator, which takes into account that a run can be killed prior
    # to the model being fully fitted; thus putting intermediate results
    # into a queue and querying them once the time is over
    exclude = dict()
    include = dict()
    if self.include_preprocessors is not None and \
            self.exclude_preprocessors is not None:
        raise ValueError('Cannot specify include_preprocessors and '
                         'exclude_preprocessors.')
    elif self.include_preprocessors is not None:
        include['preprocessor'] = self.include_preprocessors
    elif self.exclude_preprocessors is not None:
        exclude['preprocessor'] = self.exclude_preprocessors

    # The conflict check must look at exclude_estimators (the original
    # tested exclude_preprocessors here, a copy-paste slip).
    if self.include_estimators is not None and \
            self.exclude_estimators is not None:
        raise ValueError('Cannot specify include_estimators and '
                         'exclude_estimators.')
    elif self.include_estimators is not None:
        if self.task in CLASSIFICATION_TASKS:
            include['classifier'] = self.include_estimators
        elif self.task in REGRESSION_TASKS:
            include['regressor'] = self.include_estimators
        else:
            raise ValueError(self.task)
    elif self.exclude_estimators is not None:
        if self.task in CLASSIFICATION_TASKS:
            exclude['classifier'] = self.exclude_estimators
        elif self.task in REGRESSION_TASKS:
            exclude['regressor'] = self.exclude_estimators
        else:
            raise ValueError(self.task)

    ta = ExecuteTaFuncWithQueue(
        backend=self.backend,
        autosklearn_seed=seed,
        resampling_strategy=self.resampling_strategy,
        initial_num_run=num_run,
        logger=self.logger,
        include=include,
        exclude=exclude,
        metric=self.metric,
        memory_limit=self.memory_limit,
        disable_file_output=self.disable_file_output,
        **self.resampling_strategy_args)

    types, bounds = get_types(self.config_space,
                              self.scenario.feature_array)

    # TODO extract generation of SMAC object into it's own function for
    # testing
    if self.acquisition_function == 'EI':
        model = RandomForestWithInstances(
            types=types,
            bounds=bounds,
            # instance_features=meta_features_list,
            seed=1,
            num_trees=10)
        rh2EPM = RunHistory2EPM4Cost(
            num_params=num_params,
            scenario=self.scenario,
            success_states=[StatusType.SUCCESS,
                            StatusType.MEMOUT,
                            StatusType.TIMEOUT],
            impute_censored_data=False,
            impute_state=None)
        _smac_arguments = dict(scenario=self.scenario,
                               model=model,
                               rng=seed,
                               runhistory2epm=rh2EPM,
                               tae_runner=ta,
                               runhistory=runhistory)
    elif self.acquisition_function == 'EIPS':
        rh2EPM = RunHistory2EPM4EIPS(
            num_params=num_params,
            scenario=self.scenario,
            success_states=[StatusType.SUCCESS,
                            StatusType.MEMOUT,
                            StatusType.TIMEOUT],
            impute_censored_data=False,
            impute_state=None)
        model = UncorrelatedMultiObjectiveRandomForestWithInstances(
            ['cost', 'runtime'],
            types=types,
            bounds=bounds,
            num_trees=10,
            instance_features=meta_features_list,
            seed=1)
        acquisition_function = EIPS(model)
        _smac_arguments = dict(scenario=self.scenario,
                               model=model,
                               rng=seed,
                               tae_runner=ta,
                               runhistory2epm=rh2EPM,
                               runhistory=runhistory,
                               acquisition_function=acquisition_function)
    else:
        raise ValueError('Unknown acquisition function value %s!' %
                         self.acquisition_function)

    if self.configuration_mode == 'SMAC':
        smac = SMAC(**_smac_arguments)
    elif self.configuration_mode in ['ROAR', 'RANDOM']:
        # ROAR is constructed without a model / run-history transformer.
        for not_in_roar in ['runhistory2epm', 'model']:
            _smac_arguments.pop(not_in_roar, None)
        smac = ROAR(**_smac_arguments)
    else:
        raise ValueError(self.configuration_mode)

    # NOTE: building a runtime model from the meta run history and
    # rescaling its targets per dataset used to happen here; that code was
    # commented out and is not part of the active logic.

    smac.solver.stats.start_timing()
    # == first, evaluate all metelearning and default configurations
    smac.solver.incumbent = smac.solver.initial_design.run()

    for challenger in metalearning_configurations:
        smac.solver.incumbent, _ = smac.solver.intensifier.intensify(
            challengers=[challenger],
            incumbent=smac.solver.incumbent,
            run_history=smac.solver.runhistory,
            aggregate_func=smac.solver.aggregate_func,
            time_bound=self.total_walltime_limit)

        if smac.solver.scenario.shared_model:
            pSMAC.write(run_history=smac.solver.runhistory,
                        output_directory=smac.solver.scenario.output_dir,
                        num_run=self.seed)

        if smac.solver.stats.is_budget_exhausted():
            break

    # == after metalearning run SMAC loop
    while True:
        if smac.solver.scenario.shared_model:
            pSMAC.read(run_history=smac.solver.runhistory,
                       output_dirs=glob.glob(
                           self.backend.get_smac_output_glob()),
                       configuration_space=self.config_space,
                       logger=self.logger)

        choose_next_start_time = time.time()
        try:
            challengers = self.choose_next(smac)
        except Exception as e:
            # Fall back to a single random configuration so that the
            # optimisation keeps going.
            self.logger.error(e)
            self.logger.error("Error in getting next configurations "
                              "with SMAC. Using random configuration!")
            challengers = [self.config_space.sample_configuration()]
        time_for_choose_next = time.time() - choose_next_start_time
        self.logger.info('Used %g seconds to find next '
                         'configurations', time_for_choose_next)

        # Intensification gets as much time as choosing the challengers
        # took, but never less than one second.
        time_for_choose_next = max(time_for_choose_next, 1.0)

        smac.solver.incumbent, _ = smac.solver.intensifier.intensify(
            challengers=challengers,
            incumbent=smac.solver.incumbent,
            run_history=smac.solver.runhistory,
            aggregate_func=smac.solver.aggregate_func,
            time_bound=time_for_choose_next)

        if smac.solver.scenario.shared_model:
            pSMAC.write(run_history=smac.solver.runhistory,
                        output_directory=smac.solver.scenario.output_dir,
                        num_run=self.seed)

        if smac.solver.stats.is_budget_exhausted():
            break

    self.runhistory = smac.solver.runhistory
    self.trajectory = smac.solver.intensifier.traj_logger.trajectory
    # get_X_y() is called on the facade, so hand it the final run history.
    smac.runhistory = self.runhistory
    self.fANOVA_input = smac.get_X_y()

    return self.runhistory, self.trajectory, self.fANOVA_input