def __init__(self, scenario, cs, model, to_evaluate: int, runhist: RunHistory, rng,
             n_pairs=5, minimize=True, pairwise=True,
             preprocessed_X=None, preprocessed_y=None, **kwargs):
    super().__init__(scenario, cs, model, to_evaluate, rng, **kwargs)
    self.name = 'fANOVA'
    # the base class's logger setter is assumed to wrap this string in a real logger
    self.logger = self.name

    # This way the instance features in X are ignored and a new forest is constructed
    if self.model.instance_features is None:
        self.logger.info('No preprocessing necessary')
        if preprocessed_X is not None and preprocessed_y is not None:
            self.X = preprocessed_X
            self.y = preprocessed_y
        else:
            self._preprocess(runhist)
    else:
        self._preprocess(runhist)

    # restrict fANOVA to the objective range at least as good as the
    # predicted performance of the default configuration
    cutoffs = (-np.inf, np.inf)
    if minimize:
        cutoffs = (-np.inf,
                   self.model.predict_marginalized_over_instances(
                       np.array([impute_inactive_values(
                           self.cs.get_default_configuration()).get_array()]))[0].flatten()[0])
    elif minimize is False:
        cutoffs = (self.model.predict_marginalized_over_instances(
            np.array([impute_inactive_values(
                self.cs.get_default_configuration()).get_array()]))[0].flatten()[0],
                   np.inf)

    self.evaluator = fanova_pyrfr(X=self.X, Y=self.y.flatten(),
                                  config_space=cs, cutoffs=cutoffs)
    self.n_most_imp_pairs = n_pairs
    self.num_single = None
    self.pairwise = pairwise
def _preprocess(self, runhistory):
    """
    Method to marginalize over instances such that fANOVA can determine the
    parameter importance without having to deal with instance features.
    :param runhistory: RunHistory that knows all configurations that were run.
                       For all these configurations we have to marginalize away
                       the instance features with which fANOVA makes its
                       predictions
    """
    self.logger.info('Preprocessing: marginalizing away all instances!')
    configs = runhistory.get_all_configs()
    if self.cs_contained_constant:
        configs = [Configuration(self.cs, vector=c.get_array()) for c in configs]
    X_non_hyper, X_prime = [], []
    for config in configs:
        config = impute_inactive_values(config).get_array()
        X_prime.append(config)
        # copy, so that the back-transformation below does not mutate the
        # hypercube representation kept in X_prime for the model prediction
        X_non_hyper.append(np.copy(config))
        for idx, param in enumerate(self.cs.get_hyperparameters()):
            if not (isinstance(param, CategoricalHyperparameter) or
                    isinstance(param, Constant)):
                # map the normalized hypercube value back to the
                # parameter's native scale
                X_non_hyper[-1][idx] = param._transform(X_non_hyper[-1][idx])
    X_non_hyper = np.array(X_non_hyper)
    X_prime = np.array(X_prime)
    y_prime = np.array(self.model.predict_marginalized_over_instances(X_prime)[0])
    self._X = X_non_hyper
    self._y = y_prime
    self.logger.info('Size of training X after preprocessing: %s' % str(self.X.shape))
    self.logger.info('Size of training y after preprocessing: %s' % str(self.y.shape))
    self.logger.info('Finished preprocessing')
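A minimal sketch of the hypercube back-transformation that `_preprocess` applies above, assuming only the ConfigSpace package; the hyperparameter name is made up for illustration. SMAC's runhistory stores values normalized to the unit hypercube, and `_transform` maps them back to the parameter's native scale:

import numpy as np
from ConfigSpace.hyperparameters import UniformFloatHyperparameter

lr = UniformFloatHyperparameter('learning_rate', 1e-4, 1e-1, log=True)
# 0.0 and 1.0 in the unit hypercube correspond to the lower and upper bound
assert np.isclose(lr._transform(0.0), 1e-4)
assert np.isclose(lr._transform(1.0), 1e-1)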
def test_impute_inactive_values(self):
    mini_autosklearn_config_space_path = os.path.join(
        os.path.dirname(__file__), 'test_searchspaces',
        'mini_autosklearn_original.pcs')
    with open(mini_autosklearn_config_space_path) as fh:
        cs = read(fh)

    cs.seed(1)
    configuration = cs.sample_configuration()
    new_configuration = impute_inactive_values(configuration)

    self.assertNotEqual(id(configuration), id(new_configuration))
    self.assertEqual(len(new_configuration._values), 11)
    for key in new_configuration:
        self.assertIsNotNone(new_configuration[key])
    self.assertEqual(new_configuration['random_forest:max_features'], 10)
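For reference, a self-contained sketch of the `impute_inactive_values` semantics exercised by this test, assuming the ConfigSpace package; the two-parameter space below is made up for illustration. An inactive child hyperparameter receives a concrete value so that array-based models can consume the configuration:

from ConfigSpace import ConfigurationSpace
from ConfigSpace.hyperparameters import (CategoricalHyperparameter,
                                         UniformIntegerHyperparameter)
from ConfigSpace.conditions import EqualsCondition
from ConfigSpace.util import impute_inactive_values

cs = ConfigurationSpace(seed=1)
algo = CategoricalHyperparameter('algo', ['a', 'b'])
depth = UniformIntegerHyperparameter('depth', 1, 10)
cs.add_hyperparameters([algo, depth])
cs.add_condition(EqualsCondition(depth, algo, 'a'))  # 'depth' only active for algo == 'a'

config = cs.sample_configuration()
imputed = impute_inactive_values(config)
# a new Configuration object in which every hyperparameter has a value
assert id(imputed) != id(config)
assert all(imputed[name] is not None for name in cs.get_hyperparameter_names())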
def run_smbo(self, max_iters=1000):
    global evaluator

    # == first things first: load the datamanager
    self.reset_data_manager()

    # == Initialize SMBO stuff
    # first create a scenario
    seed = self.seed  # TODO
    num_params = len(self.config_space.get_hyperparameters())
    # allocate a run history
    run_history = RunHistory()
    meta_runhistory = RunHistory()
    meta_runs_dataset_indices = {}
    num_run = self.start_num_run
    instance_id = self.dataset_name + SENTINEL

    # == Train on subset
    #    before doing anything, let us run the default_cfg
    #    on a subset of the available data to ensure that
    #    we at least have some models
    #    we will try three different ratios of decreasing magnitude
    #    in the hope that at least on the last one we will be able
    #    to get a model
    n_data = self.datamanager.data['X_train'].shape[0]
    subset_ratio = 10000. / n_data
    if subset_ratio >= 0.5:
        subset_ratio = 0.33
        subset_ratios = [subset_ratio, subset_ratio * 0.10]
    else:
        subset_ratios = [subset_ratio, 500. / n_data]
    self.logger.info("Training default configurations on a subset of "
                     "%d/%d data points." %
                     (int(n_data * subset_ratio), n_data))

    # the time limit for these function evaluations is rigorously
    # set to only 1/2 of a full function evaluation
    subset_time_limit = max(5, int(self.func_eval_time_limit / 2))
    # the configs we want to run on the data subset are:
    # 1) the default configs
    # 2) a set of configs we selected for training on a subset
    subset_configs = [self.config_space.get_default_configuration()] \
                     + self.collect_additional_subset_defaults()
    subset_config_succesful = [False] * len(subset_configs)
    for subset_config_id, next_config in enumerate(subset_configs):
        for i, ratio in enumerate(subset_ratios):
            self.reset_data_manager()
            n_data_subsample = int(n_data * ratio)

            # run the config, but throw away the result afterwards
            # since this cfg was evaluated only on a subset
            # and we don't want to confuse SMAC
            self.logger.info("Starting to evaluate %d on SUBSET "
                             "with size %d and time limit %ds.",
                             num_run, n_data_subsample, subset_time_limit)
            self.logger.info(next_config)
            _info = eval_with_limits(
                self.datamanager, self.tmp_dir, next_config, seed, num_run,
                self.resampling_strategy, self.resampling_strategy_args,
                self.memory_limit, subset_time_limit, n_data_subsample)
            (duration, result, _, additional_run_info, status) = _info
            self.logger.info("Finished evaluating %d. configuration on "
                             "SUBSET. Duration %f; loss %f; status %s; "
                             "additional run info: %s ",
                             num_run, duration, result, str(status),
                             additional_run_info)
            num_run += 1
            if i < len(subset_ratios) - 1:
                if status != StatusType.SUCCESS:
                    # Do not increase num_run here, because we will try
                    # the same configuration with less data
                    self.logger.info("A CONFIG did not finish "
                                     "for subset ratio %f -> going smaller",
                                     ratio)
                    continue
                else:
                    self.logger.info("Finished SUBSET training successfully "
                                     "with ratio %f", ratio)
                    subset_config_succesful[subset_config_id] = True
                    break
            else:
                if status != StatusType.SUCCESS:
                    self.logger.info("A CONFIG did not finish "
                                     "for subset ratio %f.", ratio)
                    continue
                else:
                    self.logger.info("Finished SUBSET training successfully "
                                     "with ratio %f", ratio)
                    subset_config_succesful[subset_config_id] = True
                    break

    # Use the first non-failing configuration from the subsets as the new
    # default configuration -> this guards us against the random forest
    # failing on large, sparse datasets
    default_cfg = None
    for subset_config_id, next_config in enumerate(subset_configs):
        if subset_config_succesful[subset_config_id]:
            default_cfg = next_config
            break
    if default_cfg is None:
        default_cfg = self.config_space.get_default_configuration()

    # == METALEARNING suggestions
    # we start by evaluating the defaults on the full dataset again
    # and add the suggestions from metalearning behind it
    if self.metadata_directory is None:
        metalearning_directory = os.path.dirname(
            autosklearn.metalearning.__file__)
        # There is no multilabel data in OpenML
        if self.task == MULTILABEL_CLASSIFICATION:
            meta_task = BINARY_CLASSIFICATION
        else:
            meta_task = self.task
        metadata_directory = os.path.join(
            metalearning_directory, 'files',
            '%s_%s_%s' % (METRIC_TO_STRING[self.metric],
                          TASK_TYPES_TO_STRING[meta_task],
                          'sparse' if self.datamanager.info['is_sparse']
                          else 'dense'))
        self.metadata_directory = metadata_directory

    self.logger.info('Metadata directory: %s', self.metadata_directory)
    meta_base = MetaBase(self.config_space, self.metadata_directory)

    metafeature_calculation_time_limit = int(self.total_walltime_limit / 4)
    metafeature_calculation_start_time = time.time()
    meta_features = self._calculate_metafeatures_with_limits(
        metafeature_calculation_time_limit)
    metafeature_calculation_end_time = time.time()
    metafeature_calculation_time_limit = \
        metafeature_calculation_time_limit - (
            metafeature_calculation_end_time -
            metafeature_calculation_start_time)

    if metafeature_calculation_time_limit < 1:
        self.logger.warning('Time limit for metafeature calculation less '
                            'than 1 second (%f). Skipping calculation '
                            'of metafeatures for encoded dataset.',
                            metafeature_calculation_time_limit)
        meta_features_encoded = None
    else:
        self.datamanager.perform1HotEncoding()
        meta_features_encoded = \
            self._calculate_metafeatures_encoded_with_limits(
                metafeature_calculation_time_limit)

    # In case there is a problem calculating the encoded meta-features
    if meta_features is None:
        if meta_features_encoded is not None:
            meta_features = meta_features_encoded
    else:
        if meta_features_encoded is not None:
            meta_features.metafeature_values.update(
                meta_features_encoded.metafeature_values)

    if meta_features is not None:
        meta_base.add_dataset(instance_id, meta_features)
        # Do mean imputation of the meta-features - should be done specific
        # for each prediction model!
        all_metafeatures = meta_base.get_metafeatures(
            features=list(meta_features.keys()))
        all_metafeatures.fillna(all_metafeatures.mean(), inplace=True)

        metalearning_configurations = self.collect_metalearning_suggestions(
            meta_base)
        if metalearning_configurations is None:
            metalearning_configurations = []
        self.reset_data_manager()

        self.logger.info('%s', meta_features)

        # Convert meta-features into a dictionary because the scenario
        # expects a dictionary
        meta_features_dict = {}
        for dataset, series in all_metafeatures.iterrows():
            meta_features_dict[dataset] = series.values
        meta_features_list = []
        for meta_feature_name in all_metafeatures.columns:
            meta_features_list.append(meta_features[meta_feature_name].value)
        meta_features_list = np.array(meta_features_list).reshape((1, -1))
        self.logger.info(list(meta_features_dict.keys()))

        meta_runs = meta_base.get_all_runs(METRIC_TO_STRING[self.metric])
        meta_runs_index = 0
        try:
            meta_durations = meta_base.get_all_runs('runtime')
            read_runtime_data = True
        except KeyError:
            read_runtime_data = False
            self.logger.critical('Cannot read runtime data.')
            if self.acquisition_function == 'EIPS':
                self.logger.critical('Reverting to acquisition function EI!')
                self.acquisition_function = 'EI'

        for meta_dataset in meta_runs.index:
            meta_dataset_start_index = meta_runs_index
            for meta_configuration in meta_runs.columns:
                if np.isfinite(meta_runs.loc[meta_dataset,
                                             meta_configuration]):
                    try:
                        config = meta_base.get_configuration_from_algorithm_index(
                            meta_configuration)
                        cost = meta_runs.loc[meta_dataset, meta_configuration]
                        if read_runtime_data:
                            runtime = meta_durations.loc[meta_dataset,
                                                         meta_configuration]
                        else:
                            runtime = 1
                        # TODO read out other status types!
                        meta_runhistory.add(config, cost, runtime,
                                            StatusType.SUCCESS,
                                            instance_id=meta_dataset)
                        meta_runs_index += 1
                    except Exception:
                        # TODO maybe add warning
                        pass
            meta_runs_dataset_indices[meta_dataset] = (
                meta_dataset_start_index, meta_runs_index)
    else:
        if self.acquisition_function == 'EIPS':
            self.logger.critical('Reverting to acquisition function EI!')
            self.acquisition_function = 'EI'
        meta_features_list = []
        meta_features_dict = {}
        metalearning_configurations = []

    self.scenario = AutoMLScenario(self.config_space,
                                   self.total_walltime_limit,
                                   self.func_eval_time_limit,
                                   meta_features_dict,
                                   self.tmp_dir,
                                   self.shared_mode)

    types = get_types(self.config_space, self.scenario.feature_array)
    if self.acquisition_function == 'EI':
        rh2EPM = RunHistory2EPM4Cost(num_params=num_params,
                                     scenario=self.scenario,
                                     success_states=None,
                                     impute_censored_data=False,
                                     impute_state=None)
        model = RandomForestWithInstances(types,
                                          instance_features=meta_features_list,
                                          seed=1, num_trees=10)
        smac = SMBO(self.scenario, model=model, rng=seed)
    elif self.acquisition_function == 'EIPS':
        rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
                                     scenario=self.scenario,
                                     success_states=None,
                                     impute_censored_data=False,
                                     impute_state=None)
        model = UncorrelatedMultiObjectiveRandomForestWithInstances(
            ['cost', 'runtime'], types, num_trees=10,
            instance_features=meta_features_list, seed=1)
        acquisition_function = EIPS(model)
        smac = SMBO(self.scenario,
                    acquisition_function=acquisition_function,
                    model=model, runhistory2epm=rh2EPM, rng=seed)
    else:
        raise ValueError('Unknown acquisition function value %s!' %
                         self.acquisition_function)

    # Build a runtime model
    # runtime_rf = RandomForestWithInstances(types,
    #                                        instance_features=meta_features_list,
    #                                        seed=1, num_trees=10)
    # runtime_rh2EPM = RunHistory2EPM4EIPS(num_params=num_params,
    #                                      scenario=self.scenario,
    #                                      success_states=None,
    #                                      impute_censored_data=False,
    #                                      impute_state=None)
    # X_runtime, y_runtime = runtime_rh2EPM.transform(meta_runhistory)
    # runtime_rf.train(X_runtime, y_runtime[:, 1].flatten())
    X_meta, Y_meta = rh2EPM.transform(meta_runhistory)
    # Transform Y_meta on a per-dataset base
    for meta_dataset in meta_runs_dataset_indices:
        start_index, end_index = meta_runs_dataset_indices[meta_dataset]
        end_index += 1  # Python indexing
        Y_meta[start_index:end_index, 0] \
            [Y_meta[start_index:end_index, 0] > 2.0] = 2.0
        dataset_minimum = np.min(Y_meta[start_index:end_index, 0])
        Y_meta[start_index:end_index, 0] = 1 - (
            (1. - Y_meta[start_index:end_index, 0]) /
            (1. - dataset_minimum))
        Y_meta[start_index:end_index, 0] \
            [Y_meta[start_index:end_index, 0] > 2] = 2

    # == first, evaluate all metalearning and default configurations
    for i, next_config in enumerate(([default_cfg] +
                                     metalearning_configurations)):
        # Do not evaluate default configurations more than once
        if i >= len([default_cfg]) and next_config in [default_cfg]:
            continue

        config_name = 'meta-learning' if i >= len([default_cfg]) \
            else 'default'

        self.logger.info("Starting to evaluate %d. configuration "
                         "(%s configuration) with time limit %ds.",
                         num_run, config_name, self.func_eval_time_limit)
        self.logger.info(next_config)
        self.reset_data_manager()
        info = eval_with_limits(self.datamanager, self.tmp_dir, next_config,
                                seed, num_run,
                                self.resampling_strategy,
                                self.resampling_strategy_args,
                                self.memory_limit,
                                self.func_eval_time_limit)
        (duration, result, _, additional_run_info, status) = info
        run_history.add(config=next_config, cost=result,
                        time=duration, status=status,
                        instance_id=instance_id, seed=seed)
        run_history.update_cost(next_config, result)
        self.logger.info("Finished evaluating %d. configuration. "
                         "Duration %f; loss %f; status %s; additional run "
                         "info: %s ", num_run, duration, result,
                         str(status), additional_run_info)
        num_run += 1
        if smac.incumbent is None:
            smac.incumbent = next_config
        elif result < run_history.get_cost(smac.incumbent):
            smac.incumbent = next_config

        if self.scenario.shared_model:
            pSMAC.write(run_history=run_history,
                        output_directory=self.scenario.output_dir,
                        num_run=self.seed)

    # == after metalearning run SMAC loop
    smac.runhistory = run_history
    smac_iter = 0
    finished = False
    while not finished:
        if self.scenario.shared_model:
            pSMAC.read(run_history=run_history,
                       output_directory=self.scenario.output_dir,
                       configuration_space=self.config_space,
                       logger=self.logger)

        next_configs = []
        time_for_choose_next = -1
        try:
            X_cfg, Y_cfg = rh2EPM.transform(run_history)

            if not run_history.empty():
                # Update costs by normalization
                dataset_minimum = np.min(Y_cfg[:, 0])
                Y_cfg[:, 0] = 1 - ((1. - Y_cfg[:, 0]) /
                                   (1. - dataset_minimum))
                Y_cfg[:, 0][Y_cfg[:, 0] > 2] = 2

            if len(X_meta) > 0 and len(X_cfg) > 0:
                pass
                # X_cfg = np.concatenate((X_meta, X_cfg))
                # Y_cfg = np.concatenate((Y_meta, Y_cfg))
            elif len(X_meta) > 0:
                X_cfg = X_meta.copy()
                Y_cfg = Y_meta.copy()
            elif len(X_cfg) > 0:
                X_cfg = X_cfg.copy()
                Y_cfg = Y_cfg.copy()
            else:
                raise ValueError('No training data for SMAC random forest!')

            self.logger.info('Using %d training points for SMAC.' %
                             X_cfg.shape[0])
            choose_next_start_time = time.time()
            next_configs_tmp = smac.choose_next(
                X_cfg, Y_cfg,
                num_interleaved_random=110,
                num_configurations_by_local_search=10,
                num_configurations_by_random_search_sorted=100)
            time_for_choose_next = time.time() - choose_next_start_time
            self.logger.info('Used %g seconds to find next '
                             'configurations' % (time_for_choose_next))
            next_configs.extend(next_configs_tmp)
        # TODO put Exception here!
        except Exception as e:
            self.logger.error(e)
            self.logger.error("Error in getting next configurations "
                              "with SMAC. Using random configuration!")
            next_config = self.config_space.sample_configuration()
            next_configs.append(next_config)

        models_fitted_this_iteration = 0
        start_time_this_iteration = time.time()
        for next_config in next_configs:
            x_runtime = impute_inactive_values(next_config)
            x_runtime = impute_inactive_values(x_runtime).get_array()
            # predicted_runtime = runtime_rf.predict_marginalized_over_instances(
            #     x_runtime.reshape((1, -1)))
            # predicted_runtime = np.exp(predicted_runtime[0][0][0]) - 1

            self.logger.info("Starting to evaluate %d. configuration (from "
                             "SMAC) with time limit %ds.", num_run,
                             self.func_eval_time_limit)
            self.logger.info(next_config)
            self.reset_data_manager()
            info = eval_with_limits(self.datamanager, self.tmp_dir,
                                    next_config, seed, num_run,
                                    self.resampling_strategy,
                                    self.resampling_strategy_args,
                                    self.memory_limit,
                                    self.func_eval_time_limit)
            (duration, result, _, additional_run_info, status) = info
            run_history.add(config=next_config, cost=result,
                            time=duration, status=status,
                            instance_id=instance_id, seed=seed)
            run_history.update_cost(next_config, result)

            # self.logger.info('Predicted runtime %g, true runtime %g',
            #                  predicted_runtime, duration)

            # TODO add unittest to make sure everything works fine and
            # this does not get outdated!
            if smac.incumbent is None:
                smac.incumbent = next_config
            elif result < run_history.get_cost(smac.incumbent):
                smac.incumbent = next_config

            self.logger.info("Finished evaluating %d. configuration. "
                             "Duration: %f; loss: %f; status %s; additional "
                             "run info: %s ", num_run, duration, result,
                             str(status), additional_run_info)
            smac_iter += 1
            num_run += 1

            models_fitted_this_iteration += 1
            time_used_this_iteration = time.time() - start_time_this_iteration
            if models_fitted_this_iteration >= 2 and \
                    time_for_choose_next > 0 and \
                    time_used_this_iteration > time_for_choose_next:
                break
            elif time_for_choose_next <= 0 and \
                    models_fitted_this_iteration >= 1:
                break
            elif models_fitted_this_iteration >= 50:
                break

        if max_iters is not None:
            # stop once the iteration budget is exhausted (the original
            # comparison was inverted and would have exited immediately)
            finished = (smac_iter >= max_iters)

        if self.scenario.shared_model:
            pSMAC.write(run_history=run_history,
                        output_directory=self.scenario.output_dir,
                        num_run=self.seed)
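A small numeric sketch of the per-dataset cost normalization applied to Y_meta and Y_cfg above: y' = 1 - (1 - y) / (1 - y_min) rescales costs so that the best observed cost maps to 0 while y = 1 stays fixed, and anything above 2 is clipped (the values here are made up for illustration):

import numpy as np

y = np.array([0.2, 0.5, 1.0, 3.0])
dataset_minimum = np.min(y)                       # 0.2
y_norm = 1 - ((1. - y) / (1. - dataset_minimum))  # best -> 0, y = 1 stays 1
y_norm[y_norm > 2] = 2                            # clip badly failing runs
print(y_norm)  # [0.    0.375 1.    2.   ]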
def get_pred_surface(self, rh, X_scaled, conf_list: list, contour_step_size):
    """fit epm on the scaled input dimension and return data to plot a contour
    plot of the empirical performance

    Parameters
    ----------
    rh: RunHistory
        runhistory
    X_scaled: np.array
        configurations in scaled 2dim
    conf_list: list
        list of Configuration objects
    contour_step_size: float
        step-size for contour

    Returns
    -------
    contour_data: (np.array, np.array, np.array)
        x, y, Z for contour plots
    """
    # use PCA to reduce features to also at most 2 dims
    scen = copy.deepcopy(self.scenario)  # pca changes feats
    if scen.feature_array.shape[1] > 2:
        self.logger.debug("Use PCA to reduce features from %d dim to 2 dim",
                          scen.feature_array.shape[1])
        # perform PCA
        insts = scen.feature_dict.keys()
        feature_array = np.array([scen.feature_dict[i] for i in insts])
        feature_array = StandardScaler().fit_transform(feature_array)
        feature_array = PCA(n_components=2).fit_transform(feature_array)
        # inject in scenario-object
        scen.feature_array = feature_array
        scen.feature_dict = dict([(inst, feature_array[idx, :])
                                  for idx, inst in enumerate(insts)])
        scen.n_features = 2

    # convert the data to train EPM on 2-dim featurespace (for contour-data)
    self.logger.debug("Convert data for epm.")
    X, y, types = convert_data_for_epm(scenario=scen, runhistory=rh,
                                       logger=self.logger)
    # overwrite types: 2 scaled config dims plus the (reduced) features
    types = np.array(np.zeros((2 + scen.feature_array.shape[1])),
                     dtype=np.uint)
    num_params = len(scen.cs.get_hyperparameters())

    # impute missing values in configs and insert MDS'ed (2dim) configs to the right positions
    conf_dict = {}
    for idx, c in enumerate(conf_list):
        conf_list[idx] = impute_inactive_values(c)
        conf_dict[str(conf_list[idx].get_array())] = X_scaled[idx, :]

    X_trans = []
    for x in X:
        x_scaled_conf = conf_dict[str(x[:num_params])]
        # append scaled config + pca'ed features (total of 4 values) per config/feature-sample
        X_trans.append(np.concatenate((x_scaled_conf, x[num_params:]), axis=0))
    X_trans = np.array(X_trans)

    self.logger.debug("Train random forest for contour-plot.")
    bounds = np.array([(0, np.nan), (0, np.nan)], dtype=object)
    model = RandomForestWithInstances(types=types, bounds=bounds,
                                      instance_features=np.array(scen.feature_array),
                                      ratio_features=1.0)

    start = time.time()
    model.train(X_trans, y)
    self.logger.debug("Fitting random forest took %f time", time.time() - start)

    x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
    y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, contour_step_size),
                         np.arange(y_min, y_max, contour_step_size))

    self.logger.debug("x_min: %f, x_max: %f, y_min: %f, y_max: %f",
                      x_min, x_max, y_min, y_max)
    self.logger.debug("Predict on %d samples in grid to get surface (step-size: %f)",
                      np.c_[xx.ravel(), yy.ravel()].shape[0], contour_step_size)

    start = time.time()
    Z, _ = model.predict_marginalized_over_instances(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    self.logger.debug("Predicting random forest took %f time", time.time() - start)

    return xx, yy, Z
def sanitize_config(config):
    # Remove forbidden clauses from the configuration space so that
    # impute_inactive_values cannot raise on otherwise-forbidden combinations
    cs_no_forbidden = config.configuration_space
    cs_no_forbidden.forbidden_clauses = []
    config.configuration_space = cs_no_forbidden
    return impute_inactive_values(config)
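A hedged usage sketch for sanitize_config: without stripping forbidden clauses, impute_inactive_values can produce value combinations that violate them and raise. Here `config` is assumed to be an existing ConfigSpace Configuration from the calling context:

sanitized = sanitize_config(config)
# the returned configuration lives in a space without forbidden clauses,
# so the imputed (possibly otherwise-forbidden) combination is accepted
assert len(sanitized.configuration_space.forbidden_clauses) == 0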
def __init__(self, scenario, cs, model, to_evaluate: int, runhist: RunHistory, rng,
             n_pairs=5, minimize=True, pairwise=True,
             preprocessed_X=None, preprocessed_y=None, incumbents=None, **kwargs):
    """
    Handler to fANOVA module.

    Parameters
    ----------
    scenario: Scenario
        scenario with information about run_objective
    cs: ConfigSpace
        configuration space of scenario to be analyzed
    model: empirical performance model
        TODO
    to_evaluate: int
        number of parameters to be plotted
    runhist: RunHistory
        TODO
    rng: RandomNumberGenerator
        rng
    n_pairs: int
        how many (most important) parameters should be plotted for pairwise marginals
    minimize: boolean
        whether optimum is min or max
    pairwise: boolean
        plot pairwise marginals
    preprocessed_X/y: data
        preprocessed data to be reused if model is already trained on data without instance_features
    incumbents: List[Configuration] or Configuration
        one or multiple incumbents to be marked in plots
    """
    super().__init__(scenario, cs, model, to_evaluate, rng, **kwargs)
    self.name = 'fANOVA'
    # the base class's logger setter is assumed to wrap this string in a real logger
    self.logger = 'pimp.' + self.name

    # Turn all Constants into Categoricals (fANOVA cannot handle Constants)
    self.cs_contained_constant = False
    # if any([isinstance(hp, Constant) for hp in self.cs.get_hyperparameters()]):
    #     self.logger.debug("Replacing configspace's hyperparameter Constants by one-value Categoricals.")
    #     new_hyperparameters = [CategoricalHyperparameter(hp.name, [hp.value]) if isinstance(hp, Constant)
    #                            else hp for hp in self.cs.get_hyperparameters()]
    #     self.cs = ConfigurationSpace()
    #     self.cs.add_hyperparameters(new_hyperparameters)
    #     self.cs_contained_constant = True

    # This way the instance features in X are ignored and a new forest is constructed
    if self.model.instance_features is None:
        self.logger.info('No marginalization necessary')
        if preprocessed_X is not None and preprocessed_y is not None:
            self._X = preprocessed_X
            self._y = preprocessed_y
        else:
            self.logger.info('Preprocessing X')
            self._X = deepcopy(self.X)
            self._y = deepcopy(self.y)
            for c_idx, config in enumerate(self.X):
                # print("{}/{}".format(c_idx, len(self.X)))
                for p_idx, param in enumerate(self.cs.get_hyperparameters()):
                    if not (isinstance(param, CategoricalHyperparameter) or
                            isinstance(param, Constant)):
                        # getting the parameters out of the hypercube setting as used in smac runhistory
                        self._X[c_idx][p_idx] = param._transform(self.X[c_idx][p_idx])
    else:
        self._preprocess(runhist)

    # restrict fANOVA to the objective range at least as good as the
    # predicted performance of the default configuration
    cutoffs = (-np.inf, np.inf)
    if minimize:
        cutoffs = (-np.inf,
                   self.model.predict_marginalized_over_instances(
                       np.array([impute_inactive_values(
                           self.cs.get_default_configuration()).get_array()]))[0].flatten()[0])
    elif minimize is False:
        cutoffs = (self.model.predict_marginalized_over_instances(
            np.array([impute_inactive_values(
                self.cs.get_default_configuration()).get_array()]))[0].flatten()[0],
                   np.inf)

    self.evaluator = fanova_pyrfr(X=self._X, Y=self._y.flatten(),
                                  config_space=self.cs,
                                  seed=self.rng.randint(2**31 - 1),
                                  cutoffs=cutoffs)
    self.n_most_imp_pairs = n_pairs
    self.num_single = None
    self.pairwise = pairwise
    self.evaluated_parameter_importance_uncertainty = OrderedDict()
    self.incumbents = incumbents
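The cutoff construction above reduces to the following rule; a minimal sketch in which `default_cost` stands in for the model's prediction of the default configuration, and `make_cutoffs` is a hypothetical helper, not part of the original code:

import numpy as np

def make_cutoffs(default_cost, minimize=True):
    # fANOVA only considers objective values inside `cutoffs`, i.e. the
    # region at least as good as the default configuration
    return (-np.inf, default_cost) if minimize else (default_cost, np.inf)

print(make_cutoffs(0.3))                  # (-inf, 0.3)
print(make_cutoffs(0.3, minimize=False))  # (0.3, inf)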
def _plot(self, configs, params, fn=None, log_c=False, logy=False):
    """
    Parameters
    ----------
    configs: List[Configuration]
        configs to be plotted
    params: List[str]
        parameters to be plotted
    fn: str
        filename to save the plot to (optional)
    log_c: bool
        whether to use a logarithmic color-scale
    logy: bool
        whether to plot the cost on a log-scale

    Returns
    -------
    output: str
    """
    if fn is None:
        filename = os.path.join(self.output_dir,
                                "parallel_coordinates_" + str(len(configs)) + '.png')
    else:
        filename = fn

    if len(params) < 3:
        self.logger.info("Fewer than three parameters, skipping parallel coordinates.")
        return

    # Get ALL parameter names and metrics
    parameter_names = impute_inactive_values(
        self.validated_rh.get_all_configs()[0]).keys()
    # configs = self.validated_rh.get_all_configs()
    configspace = configs[0].configuration_space

    # Create dataframe with configs
    if self.runtime:
        cost_str = 'log-runtime' if logy else 'runtime'
    else:
        cost_str = 'log-quality' if logy else 'quality'
    data = []
    for conf in configs:
        conf_dict = conf.get_dictionary()
        new_entry = {}
        # Add cost-column
        new_entry[cost_str] = self._fun(self.validated_rh.get_cost(conf), logy)
        # Add parameters
        for p in params:
            # Catch key-errors (indicates an unused hyperparameter)
            value = conf_dict.get(p)
            if value is None:
                # Value is None, parameter unused # TODO
                new_entry[p] = 0
                continue
            param = self.cs.get_hyperparameter(p)
            if isinstance(param, IntegerHyperparameter):
                new_entry[p] = int(value)
            elif isinstance(param, FloatHyperparameter):
                new_entry[p] = float(value)
            elif isinstance(param, CategoricalHyperparameter):
                new_entry[p] = param.choices.index(value)
            else:
                raise RuntimeError('No rule for parametertype %s' % str(type(param)))
        data.append(pd.Series(new_entry))
    data = pd.DataFrame(data)

    # Add 'cost' to params; params serves as index for dataframe
    params = [cost_str] + params

    # Select only parameters we want to plot (specified in index)
    data = data[params]

    # Create subplots
    fig, axes = plt.subplots(1, len(params) - 1, sharey=False, figsize=(15, 5))

    # Normalize the data for each parameter, so the displayed ranges are
    # meaningful. Note that the ticklabels are set to original data.
    min_max_diff = {}
    for p in params:
        # TODO enable full parameter scale
        # hyper = configspace.get_hyperparameter(p)
        # if isinstance(hyper, CategoricalHyperparameter):
        #     lower = 0
        #     upper = len(hyper.choices)-1
        # else:
        #     lower, upper = configspace.get_hyperparameter(p).lower, configspace.get_hyperparameter(p).upper
        # min_max_diff[p] = [lower, upper, upper - lower]
        # data[p] = np.true_divide(data[p] - lower, upper - lower)
        min_max_diff[p] = [data[p].min(), data[p].max(), np.ptp(data[p])]
        data[p] = np.true_divide(data[p] - data[p].min(), np.ptp(data[p]))

    # setup colormap
    cm = plt.get_cmap('winter')
    scaler = colors.LogNorm if log_c else colors.Normalize
    if self.worst_config_performance < self.best_config_performance:
        normedC = scaler(vmin=self.worst_config_performance,
                         vmax=self.best_config_performance)
    else:
        normedC = scaler(vmax=self.worst_config_performance,
                         vmin=self.best_config_performance)
    scale = cmx.ScalarMappable(norm=normedC, cmap=cm)

    # Plot data
    for i, ax in enumerate(axes):  # Iterate over params
        for idx in data.index[::-1]:  # Iterate over configs
            cval = scale.to_rgba(
                self._fun(self.validated_rh.get_cost(configs[idx]), logy))
            cval = (cval[2], cval[0], cval[1])
            zorder = idx - 5 if idx > len(data) // 2 else len(data) - idx  # -5 to have the best on top of the worst
            alpha = (zorder / len(data)) - 0.25
            path_effects = [path_efx.Normal()]
            if idx in [0, 1, 2, 3, 4,
                       len(data) - 1, len(data) - 2, len(data) - 3,
                       len(data) - 4, len(data) - 5]:
                alpha = 1
                path_effects = [path_efx.withStroke(linewidth=5, foreground='k')]
            ax.plot(range(len(params)), data.loc[idx, params], color=cval,
                    alpha=alpha, linewidth=3, zorder=zorder,
                    path_effects=path_effects)
        ax.set_xlim([i, i + 1])

    def set_ticks_for_axis(p, ax, num_ticks=10):
        minimum, maximum, param_range = min_max_diff[params[p]]
        hyper = p
        if p > 0:  # First column not a parameter, but cost...
            hyper = configspace.get_hyperparameter(params[p])
        if isinstance(hyper, CategoricalHyperparameter):
            num_ticks = len(hyper.choices)
            step = 1
            tick_labels = hyper.choices
            norm_min = data[params[p]].min()
            norm_range = np.ptp(data[params[p]])
            norm_step = norm_range / float(num_ticks - 1)
            ticks = [round(norm_min + norm_step * i, 2)
                     for i in range(num_ticks)]
        else:
            step = param_range / float(num_ticks)
            if isinstance(hyper, IntegerHyperparameter):
                tick_labels = [int(minimum + step * i)
                               for i in range(num_ticks + 1)]
            else:
                tick_labels = [round(minimum + step * i, 2)
                               for i in range(num_ticks + 1)]
            norm_min = data[params[p]].min()
            norm_range = np.ptp(data[params[p]])
            norm_step = norm_range / float(num_ticks)
            ticks = [round(norm_min + norm_step * i, 2)
                     for i in range(num_ticks + 1)]
        ax.yaxis.set_ticks(ticks)
        ax.set_yticklabels(tick_labels)

    # TODO adjust tick-labels to unused and maybe even log?
    for p, ax in enumerate(axes):
        ax.xaxis.set_major_locator(ticker.FixedLocator([p]))
        set_ticks_for_axis(p, ax, num_ticks=6)
        ax.set_xticklabels([params[p]], rotation=5)

    # Move the final axis' ticks to the right-hand side
    ax = plt.twinx(axes[-1])
    dim = len(axes)
    ax.xaxis.set_major_locator(
        ticker.FixedLocator([len(params) - 2, len(params) - 1]))
    set_ticks_for_axis(dim, ax, num_ticks=6)
    ax.set_xticklabels([params[-2], params[-1]], rotation=5)
    ax.set_ylim(axes[-1].get_ylim())

    # Remove spaces between subplots
    plt.subplots_adjust(wspace=0)
    plt.tight_layout()
    plt.subplots_adjust(wspace=0)
    fig.savefig(filename)
    plt.close(fig)

    return filename
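A small sketch of the column-wise min-max scaling used above, with pandas and values made up for illustration; each column is mapped to [0, 1] for plotting while the original values are kept in min_max_diff for the tick labels:

import numpy as np
import pandas as pd

data = pd.DataFrame({'p': [2., 4., 10.]})
span = np.ptp(data['p'].to_numpy())  # max - min = 8
data['p'] = np.true_divide(data['p'] - data['p'].min(), span)
print(data['p'].tolist())  # [0.0, 0.25, 1.0]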
def get_pred_surface(self, rh, X_scaled, conf_list: list, contour_step_size):
    """fit epm on the scaled input dimension and return data to plot a contour
    plot of the empirical performance

    Parameters
    ----------
    rh: RunHistory
        runhistory
    X_scaled: np.array
        configurations in scaled 2dim
    conf_list: list
        list of Configuration objects
    contour_step_size: float
        step-size for contour

    Returns
    -------
    contour_data: (np.array, np.array, np.array)
        x, y, Z for contour plots
    """
    # use PCA to reduce features to also at most 2 dims
    scen = copy.deepcopy(self.scenario)  # pca changes feats
    if scen.feature_array.shape[1] > 2:
        self.logger.debug("Use PCA to reduce features from %d dim to 2 dim",
                          scen.feature_array.shape[1])
        # perform PCA
        insts = scen.feature_dict.keys()
        feature_array = np.array([scen.feature_dict[i] for i in insts])
        feature_array = StandardScaler().fit_transform(feature_array)
        feature_array = PCA(n_components=2).fit_transform(feature_array)
        # inject in scenario-object
        scen.feature_array = feature_array
        scen.feature_dict = dict([(inst, feature_array[idx, :])
                                  for idx, inst in enumerate(insts)])
        scen.n_features = 2

    # convert the data to train EPM on 2-dim featurespace (for contour-data)
    self.logger.debug("Convert data for epm.")
    X, y, types = convert_data_for_epm(scenario=scen, runhistory=rh,
                                       impute_inactive_parameters=True,
                                       logger=self.logger)
    # overwrite types: 2 scaled config dims plus the (reduced) features
    types = np.array(np.zeros((2 + scen.feature_array.shape[1])),
                     dtype=np.uint)
    num_params = len(scen.cs.get_hyperparameters())

    # impute missing values in configs and insert MDS'ed (2dim) configs to the right positions
    conf_dict = {}
    # Remove forbidden clauses (this is necessary to enable the impute_inactive_values-method, see #226)
    cs_no_forbidden = copy.deepcopy(conf_list[0].configuration_space)
    cs_no_forbidden.forbidden_clauses = []
    for idx, c in enumerate(conf_list):
        c.configuration_space = cs_no_forbidden
        conf_list[idx] = impute_inactive_values(c)
        conf_dict[str(conf_list[idx].get_array())] = X_scaled[idx, :]

    # Debug compare elements:
    c1, c2 = {str(z) for z in X}, {str(z) for z in conf_dict.keys()}
    self.logger.debug("{} elements not in both sets, {} elements in both sets, "
                      "X (len {}) and conf_dict (len {}) "
                      "(might be a problem related to forbidden clauses?)".format(
                          len(c1 ^ c2), len(c1 & c2), len(c1), len(c2)))
    # self.logger.debug("Elements: {}".format(str(c1 ^ c2)))

    # X_trans is the same as X but with reduced 2-dim features
    # (so shape is (N, 2) instead of (N, M))
    X_trans = []
    for x in X:
        x_scaled_conf = conf_dict[str(x[:num_params])]
        # append scaled config + pca'ed features (total of 4 values) per config/feature-sample
        X_trans.append(np.concatenate((x_scaled_conf, x[num_params:]), axis=0))
    X_trans = np.array(X_trans)

    self.logger.debug("Train random forest for contour-plot. Shape of X: {}, "
                      "shape of X_trans: {}".format(X.shape, X_trans.shape))
    self.logger.debug("Faking configspace to be able to train rf...")
    # We need to fake a config-space to bypass the imputation of inactive
    # values in the random forest implementation
    fake_cs = ConfigurationSpace(name="fake-cs-for-configurator-footprint")

    bounds = np.array([(0, np.nan), (0, np.nan)], dtype=object)
    model = RandomForestWithInstances(fake_cs, types, bounds,
                                      seed=self.rng.randint(MAXINT),
                                      instance_features=np.array(scen.feature_array),
                                      ratio_features=1.0)

    start = time.time()
    model.train(X_trans, y)
    self.logger.debug("Fitting random forest took %f time", time.time() - start)

    x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
    y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, contour_step_size),
                         np.arange(y_min, y_max, contour_step_size))

    self.logger.debug("x_min: %f, x_max: %f, y_min: %f, y_max: %f",
                      x_min, x_max, y_min, y_max)
    self.logger.debug("Predict on %d samples in grid to get surface (step-size: %f)",
                      np.c_[xx.ravel(), yy.ravel()].shape[0], contour_step_size)

    start = time.time()
    Z, _ = model.predict_marginalized_over_instances(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    self.logger.debug("Predicting random forest took %f time", time.time() - start)

    return xx, yy, Z
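A hedged usage sketch for the returned contour data, assuming matplotlib; `plotter`, `rh`, `X_scaled`, and `conf_list` are assumed to exist in the calling context:

import matplotlib.pyplot as plt

xx, yy, Z = plotter.get_pred_surface(rh, X_scaled, conf_list,
                                     contour_step_size=0.2)
plt.contourf(xx, yy, Z, alpha=0.5)  # predicted cost surface
plt.colorbar()
plt.savefig('configurator_footprint_surface.png')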
def get_pred_surface(self, X_scaled, conf_list: list):
    '''
    fit epm on the scaled input dimension and return data to plot a contour plot

    Parameters
    ----------
    X_scaled: np.array
        configurations in scaled 2dim
    conf_list: list
        list of Configuration objects

    Returns
    -------
    np.array, np.array, np.array
        x, y, Z for contour plots
    '''
    # use PCA to reduce features to at most 2 dims
    n_feats = self.scenario.feature_array.shape[1]
    if n_feats > 2:
        self.logger.debug("Use PCA to reduce features to 2dim")
        insts = self.scenario.feature_dict.keys()
        feature_array = np.array([self.scenario.feature_dict[inst]
                                  for inst in insts])
        ss = StandardScaler()
        self.scenario.feature_array = ss.fit_transform(feature_array)
        pca = PCA(n_components=2)
        feature_array = pca.fit_transform(feature_array)
        n_feats = feature_array.shape[1]
        self.scenario.feature_array = feature_array
        self.scenario.feature_dict = dict([(inst, feature_array[idx, :])
                                           for idx, inst in enumerate(insts)])
        self.scenario.n_features = 2

    # Create new rh with only wanted configs
    new_rh = RunHistory(average_cost)
    for rh in self.runhistories:
        for key, value in rh.data.items():
            config = rh.ids_config[key.config_id]
            if config in self.configs_to_plot:
                config_id, instance, seed = key
                cost, time, status, additional_info = value
                new_rh.add(config, cost, time, status, instance_id=instance,
                           seed=seed, additional_info=additional_info)
    self.relevant_rh = new_rh

    X, y, types = convert_data(scenario=self.scenario, runhistory=new_rh)

    # overwrite types: 2 scaled config dims plus the (reduced) features
    types = np.array(np.zeros((2 + n_feats)), dtype=np.uint)

    num_params = len(self.scenario.cs.get_hyperparameters())

    # impute missing values in configs
    conf_dict = {}
    for idx, c in enumerate(conf_list):
        conf_list[idx] = impute_inactive_values(c)
        conf_dict[str(conf_list[idx].get_array())] = X_scaled[idx, :]

    X_trans = []
    for x in X:
        x_scaled_conf = conf_dict[str(x[:num_params])]
        x_new = np.concatenate((x_scaled_conf, x[num_params:]), axis=0)
        X_trans.append(x_new)
    X_trans = np.array(X_trans)

    bounds = np.array([(0, np.nan), (0, np.nan)], dtype=object)
    model = RandomForestWithInstances(types=types, bounds=bounds,
                                      instance_features=np.array(self.scenario.feature_array),
                                      ratio_features=1.0)

    model.train(X_trans, y)
    self.logger.debug("RF fitted")

    plot_step = self.contour_step_size

    x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
    y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    self.logger.debug("x_min: %f, x_max: %f, y_min: %f, y_max: %f" %
                      (x_min, x_max, y_min, y_max))
    self.logger.debug("Predict on %d samples in grid to get surface" %
                      (np.c_[xx.ravel(), yy.ravel()].shape[0]))

    Z, _ = model.predict_marginalized_over_instances(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    return xx, yy, Z
def __init__(self, scenario, cs, model, to_evaluate: int, runhist: RunHistory, rng,
             n_pairs=5, minimize=True, pairwise=True,
             preprocessed_X=None, preprocessed_y=None, **kwargs):
    super().__init__(scenario, cs, model, to_evaluate, rng, **kwargs)
    self.name = 'fANOVA'
    # the base class's logger setter is assumed to wrap this string in a real logger
    self.logger = 'pimp.' + self.name

    # Turn all Constants into Categoricals (fANOVA cannot handle Constants)
    self.cs_contained_constant = False
    if any([isinstance(hp, Constant) for hp in self.cs.get_hyperparameters()]):
        self.logger.debug("Replacing configspace's hyperparameter Constants "
                          "by one-value Categoricals.")
        new_hyperparameters = [CategoricalHyperparameter(hp.name, [hp.value])
                               if isinstance(hp, Constant) else hp
                               for hp in self.cs.get_hyperparameters()]
        self.cs = ConfigurationSpace()
        self.cs.add_hyperparameters(new_hyperparameters)
        self.cs_contained_constant = True

    # This way the instance features in X are ignored and a new forest is constructed
    if self.model.instance_features is None:
        self.logger.info('No preprocessing necessary')
        if preprocessed_X is not None and preprocessed_y is not None:
            self.X = preprocessed_X
            self.y = preprocessed_y
        else:
            self._preprocess(runhist)
    else:
        self._preprocess(runhist)

    # restrict fANOVA to the objective range at least as good as the
    # predicted performance of the default configuration
    cutoffs = (-np.inf, np.inf)
    if minimize:
        cutoffs = (-np.inf,
                   self.model.predict_marginalized_over_instances(
                       np.array([impute_inactive_values(
                           self.cs.get_default_configuration()).get_array()]))[0].flatten()[0])
    elif minimize is False:
        cutoffs = (self.model.predict_marginalized_over_instances(
            np.array([impute_inactive_values(
                self.cs.get_default_configuration()).get_array()]))[0].flatten()[0],
                   np.inf)

    self.evaluator = fanova_pyrfr(X=self.X, Y=self.y.flatten(),
                                  config_space=self.cs,
                                  seed=self.rng.randint(2**31 - 1),
                                  cutoffs=cutoffs)
    self.n_most_imp_pairs = n_pairs
    self.num_single = None
    self.pairwise = pairwise
    self.evaluated_parameter_importance_uncertainty = OrderedDict()