def test_predict_mocked(self, rf_mock):
    """Use mock to count the number of calls to _predict"""
    class SideEffect(object):
        def __init__(self):
            self.counter = 0

        def __call__(self, X):
            self.counter += 1
            # Return mean and variance
            return self.counter, self.counter

    rf_mock.side_effect = SideEffect()

    rs = np.random.RandomState(1)
    X = rs.rand(20, 10)
    Y = rs.rand(10, 1)
    model = RandomForestWithInstances(np.zeros((10, ), dtype=np.uint))
    model.train(X[:10], Y[:10])
    m_hat, v_hat = model.predict(X[10:])
    self.assertEqual(m_hat.shape, (10, 1))
    self.assertEqual(v_hat.shape, (10, 1))
    self.assertEqual(rf_mock.call_count, 10)
    for i in range(10):
        self.assertEqual(m_hat[i], i + 1)
        self.assertEqual(v_hat[i], i + 1)
def test_predict_with_actual_values(self):
    X = np.array([[0., 0., 0.],
                  [0., 0., 1.],
                  [0., 1., 0.],
                  [0., 1., 1.],
                  [1., 0., 0.],
                  [1., 0., 1.],
                  [1., 1., 0.],
                  [1., 1., 1.]], dtype=np.float64)
    y = np.array([[.1], [.2], [9], [9.2],
                  [100.], [100.2], [109.], [109.2]], dtype=np.float64)
    model = RandomForestWithInstances(
        types=np.array([0, 0, 0], dtype=np.uint),
        bounds=np.array([(0, np.nan), (0, np.nan), (0, np.nan)], dtype=object),
        instance_features=None,
        seed=12345)
    model.train(np.vstack((X, X, X, X, X, X, X, X)),
                np.vstack((y, y, y, y, y, y, y, y)))

    y_hat, _ = model.predict(X)
    for y_i, y_hat_i in zip(y.reshape((1, -1)).flatten(),
                            y_hat.reshape((1, -1)).flatten()):
        self.assertAlmostEqual(y_i, y_hat_i, delta=0.1)
def test_predict_with_actual_values(self):
    X = np.array([
        [0., 0., 0.],
        [0., 0., 1.],
        [0., 1., 0.],
        [0., 1., 1.],
        [1., 0., 0.],
        [1., 0., 1.],
        [1., 1., 0.],
        [1., 1., 1.]], dtype=np.float64)
    y = np.array([
        [.1], [.2], [9], [9.2],
        [100.], [100.2], [109.], [109.2]], dtype=np.float64)
    model = RandomForestWithInstances(
        configspace=self._get_cs(3),
        types=np.array([0, 0, 0], dtype=np.uint),
        bounds=[(0, np.nan), (0, np.nan), (0, np.nan)],
        instance_features=None,
        seed=12345,
        ratio_features=1.0,
    )
    model.train(np.vstack((X, X, X, X, X, X, X, X)),
                np.vstack((y, y, y, y, y, y, y, y)))

    y_hat, _ = model.predict(X)
    for y_i, y_hat_i in zip(y.reshape((1, -1)).flatten(),
                            y_hat.reshape((1, -1)).flatten()):
        self.assertAlmostEqual(y_i, y_hat_i, delta=0.1)
def test_predict_marginalized_over_instances_mocked(self, rf_mock):
    """Use mock to count the number of calls to predict()"""
    class SideEffect(object):
        def __call__(self, X):
            # Numpy array of the numbers 0 to X.shape[0] - 1
            rval = np.array(list(range(X.shape[0]))).reshape((-1, 1))
            # Return mean and variance
            return rval, rval

    rf_mock.side_effect = SideEffect()

    rs = np.random.RandomState(1)
    F = rs.rand(10, 5)

    model = RandomForestWithInstances(
        configspace=self._get_cs(10),
        types=np.zeros((15, ), dtype=np.uint),
        instance_features=F,
        bounds=list(map(lambda x: (0, 10), range(10))),
        seed=1,
    )
    X = rs.rand(20, 10)
    F = rs.rand(10, 5)
    Y = rs.randint(1, size=(len(X) * len(F), 1)) * 1.
    X_ = rs.rand(200, 15)
    model.train(X_, Y)
    means, vars = model.predict_marginalized_over_instances(rs.rand(11, 10))
    # The call count is expected to be 0 because predict() is replaced by
    # manual unlogging of the trees
    self.assertEqual(rf_mock.call_count, 0)
    self.assertEqual(means.shape, (11, 1))
    self.assertEqual(vars.shape, (11, 1))
    for i in range(11):
        self.assertEqual(means[i], 0.)
        self.assertEqual(vars[i], 1.e-10)
def test_predict(self):
    rs = np.random.RandomState(1)
    X = rs.rand(20, 10)
    Y = rs.rand(10, 1)
    model = RandomForestWithInstances(np.zeros((10, ), dtype=np.uint))
    model.train(X[:10], Y[:10])
    m_hat, v_hat = model.predict(X[10:])
    self.assertEqual(m_hat.shape, (10, 1))
    self.assertEqual(v_hat.shape, (10, 1))
def test_predict_marginalized_over_instances_no_features(self, rf_mock):
    """The RF should fall back to the regular predict() method."""
    rs = np.random.RandomState(1)
    X = rs.rand(20, 10)
    Y = rs.rand(10, 1)
    model = RandomForestWithInstances(np.zeros((10, ), dtype=np.uint))
    model.train(X[:10], Y[:10])
    model.predict(X[10:])
    self.assertEqual(rf_mock.call_count, 1)
def test_rf_on_sklearn_data(self):
    import sklearn.datasets
    X, y = sklearn.datasets.load_boston(return_X_y=True)
    rs = np.random.RandomState(1)

    types = np.zeros(X.shape[1])
    bounds = [(np.min(X[:, i]), np.max(X[:, i])) for i in range(X.shape[1])]

    cv = sklearn.model_selection.KFold(shuffle=True, random_state=rs, n_splits=2)

    for do_log in [False, True]:
        if do_log:
            targets = np.log(y)
            model = RandomForestWithInstances(
                configspace=self._get_cs(X.shape[1]),
                types=types,
                bounds=bounds,
                seed=1,
                ratio_features=1.0,
                pca_components=100,
                log_y=True,
            )
            maes = [0.43169704431695493156, 0.4267519520332511912]
        else:
            targets = y
            model = RandomForestWithInstances(
                configspace=self._get_cs(X.shape[1]),
                types=types,
                bounds=bounds,
                seed=1,
                ratio_features=1.0,
                pca_components=100,
            )
            maes = [9.3298376833224042496, 9.348010654109179346]

        for i, (train_split, test_split) in enumerate(cv.split(X, targets)):
            X_train = X[train_split]
            y_train = targets[train_split]
            X_test = X[test_split]
            y_test = targets[test_split]
            model.train(X_train, y_train)
            y_hat, mu_hat = model.predict(X_test)
            mae = np.mean(np.abs(y_hat - y_test), dtype=np.float128)
            self.assertAlmostEqual(
                mae,
                maes[i],
                msg=('Do log: %s, iteration %i' % (str(do_log), i)),
                # We observe a difference of around 0.00017
                # in github actions if doing log
                places=3 if do_log else 7,
            )
def test__predict(self):
    rs = np.random.RandomState(1)
    X = rs.rand(20, 10)
    Y = rs.rand(10, 1)
    model = RandomForestWithInstances(np.zeros((10, ), dtype=np.uint))
    model.train(X[:10], Y[:10])
    m_hat, v_hat = model._predict(X[10])
    self.assertIsInstance(m_hat, float)
    self.assertIsInstance(v_hat, float)
    self.assertRaisesRegexp(
        ValueError,
        r'Buffer has wrong number of dimensions \(expected 1, got 2\)',
        model._predict,
        X[10:])
def test_predict(self):
    rs = np.random.RandomState(1)
    X = rs.rand(20, 10)
    Y = rs.rand(10, 1)
    model = RandomForestWithInstances(
        types=np.zeros((10, ), dtype=np.uint),
        bounds=list(map(lambda x: (0, 10), range(10))),
    )
    model.train(X[:10], Y[:10])
    m_hat, v_hat = model.predict(X[10:])
    self.assertEqual(m_hat.shape, (10, 1))
    self.assertEqual(v_hat.shape, (10, 1))
def test_predict_marginalized_over_instances(self):
    rs = np.random.RandomState(1)
    X = rs.rand(20, 10)
    F = rs.rand(10, 5)
    Y = rs.rand(len(X) * len(F), 1)
    X_ = rs.rand(200, 15)

    model = RandomForestWithInstances(np.zeros((15, ), dtype=np.uint),
                                      instance_features=F)
    model.train(X_, Y)
    means, vars = model.predict_marginalized_over_instances(X)
    self.assertEqual(means.shape, (20, 1))
    self.assertEqual(vars.shape, (20, 1))
def test_with_ordinal(self):
    cs = smac.configspace.ConfigurationSpace()
    _ = cs.add_hyperparameter(
        CategoricalHyperparameter('a', [0, 1], default_value=0))
    _ = cs.add_hyperparameter(
        OrdinalHyperparameter('b', [0, 1], default_value=1))
    _ = cs.add_hyperparameter(
        UniformFloatHyperparameter('c', lower=0., upper=1., default_value=1))
    _ = cs.add_hyperparameter(
        UniformIntegerHyperparameter('d', lower=0, upper=10, default_value=1))
    cs.seed(1)

    feat_array = np.array([0, 0, 0]).reshape(1, -1)
    types, bounds = get_types(cs, feat_array)
    model = RandomForestWithInstances(
        configspace=cs,
        types=types,
        bounds=bounds,
        instance_features=feat_array,
        seed=1,
        ratio_features=1.0,
        pca_components=9,
    )
    self.assertEqual(bounds[0][0], 2)
    self.assertTrue(bounds[0][1] is np.nan)
    self.assertEqual(bounds[1][0], 0)
    self.assertEqual(bounds[1][1], 1)
    self.assertEqual(bounds[2][0], 0.)
    self.assertEqual(bounds[2][1], 1.)
    self.assertEqual(bounds[3][0], 0.)
    self.assertEqual(bounds[3][1], 1.)
    X = np.array(
        [[0., 0., 0., 0., 0., 0., 0.],
         [0., 0., 1., 0., 0., 0., 0.],
         [0., 1., 0., 9., 0., 0., 0.],
         [0., 1., 1., 4., 0., 0., 0.]],
        dtype=np.float64)
    y = np.array([0, 1, 2, 3], dtype=np.float64)

    X_train = np.vstack((X, X, X, X, X, X, X, X, X, X))
    y_train = np.vstack((y, y, y, y, y, y, y, y, y, y))

    model.train(X_train, y_train.reshape((-1, 1)))
    mean, _ = model.predict(X)
    for idx, m in enumerate(mean):
        self.assertAlmostEqual(y[idx], m, delta=0.05)
def test_predict_marginalized_over_instances_no_features(self, rf_mock):
    """The RF should fall back to the regular predict() method."""
    rs = np.random.RandomState(1)
    X = rs.rand(20, 10)
    Y = rs.rand(10, 1)
    model = RandomForestWithInstances(
        configspace=self._get_cs(10),
        types=np.zeros((10, ), dtype=np.uint),
        bounds=list(map(lambda x: (0, 10), range(10))),
        seed=1,
    )
    model.train(X[:10], Y[:10])
    model.predict(X[10:])
    self.assertEqual(rf_mock.call_count, 1)
def test_train_with_pca(self):
    rs = np.random.RandomState(1)
    X = rs.rand(20, 20)
    F = rs.rand(10, 10)
    Y = rs.rand(20, 1)
    model = RandomForestWithInstances(
        types=np.zeros((20, ), dtype=np.uint),
        bounds=list(map(lambda x: (0, 10), range(10))),
        pca_components=2,
        instance_features=F,
    )
    model.train(X, Y)

    self.assertEqual(model.n_params, 10)
    self.assertEqual(model.n_feats, 10)
    self.assertIsNotNone(model.pca)
    self.assertIsNotNone(model.scaler)
def test_predict_marginalized_over_instances(self):
    rs = np.random.RandomState(1)
    X = rs.rand(20, 10)
    F = rs.rand(10, 5)
    Y = rs.rand(len(X) * len(F), 1)
    X_ = rs.rand(200, 15)

    model = RandomForestWithInstances(
        configspace=self._get_cs(10),
        types=np.zeros((15, ), dtype=np.uint),
        instance_features=F,
        bounds=list(map(lambda x: (0, 10), range(10))),
        seed=1,
    )
    model.train(X_, Y)
    means, vars = model.predict_marginalized_over_instances(X)
    self.assertEqual(means.shape, (20, 1))
    self.assertEqual(vars.shape, (20, 1))
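# Several of the tests above call a `self._get_cs(n_dimensions)` helper that
# is not part of this excerpt. A minimal sketch of what such a fixture might
# look like (hyperparameter names and bounds are assumptions for
# illustration only):
def _get_cs(self, n_dimensions):
    configspace = smac.configspace.ConfigurationSpace(seed=0)
    for i in range(n_dimensions):
        configspace.add_hyperparameter(
            UniformFloatHyperparameter('x%d' % i, lower=0., upper=1.))
    return configspace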
def get_pred_surface(self, rh, X_scaled, conf_list: list, contour_step_size):
    """fit epm on the scaled input dimension and return data to plot a
    contour plot of the empirical performance

    Parameters
    ----------
    rh: RunHistory
        runhistory
    X_scaled: np.array
        configurations in scaled 2dim
    conf_list: list
        list of Configuration objects

    Returns
    -------
    contour_data: (np.array, np.array, np.array)
        x, y, Z for contour plots
    """
    # use PCA to reduce features to also at most 2 dims
    scen = copy.deepcopy(self.scenario)  # pca changes feats
    if scen.feature_array.shape[1] > 2:
        self.logger.debug("Use PCA to reduce features from %d dim to 2 dim",
                          scen.feature_array.shape[1])
        # perform PCA
        insts = scen.feature_dict.keys()
        feature_array = np.array([scen.feature_dict[i] for i in insts])
        feature_array = StandardScaler().fit_transform(feature_array)
        feature_array = PCA(n_components=2).fit_transform(feature_array)
        # inject in scenario-object
        scen.feature_array = feature_array
        scen.feature_dict = dict([(inst, feature_array[idx, :])
                                  for idx, inst in enumerate(insts)])
        scen.n_features = 2

    # convert the data to train EPM on 2-dim featurespace (for contour-data)
    self.logger.debug("Convert data for epm.")
    X, y, types = convert_data_for_epm(scenario=scen, runhistory=rh,
                                       logger=self.logger)
    types = np.array(np.zeros((2 + scen.feature_array.shape[1])),
                     dtype=np.uint)
    num_params = len(scen.cs.get_hyperparameters())

    # impute missing values in configs and insert MDS'ed (2dim) configs to
    # the right positions
    conf_dict = {}
    for idx, c in enumerate(conf_list):
        conf_list[idx] = impute_inactive_values(c)
        conf_dict[str(conf_list[idx].get_array())] = X_scaled[idx, :]

    X_trans = []
    for x in X:
        x_scaled_conf = conf_dict[str(x[:num_params])]
        # append scaled config + pca'ed features (total of 4 values) per
        # config/feature-sample
        X_trans.append(np.concatenate((x_scaled_conf, x[num_params:]), axis=0))
    X_trans = np.array(X_trans)

    self.logger.debug("Train random forest for contour-plot.")
    bounds = np.array([(0, np.nan), (0, np.nan)], dtype=object)
    model = RandomForestWithInstances(
        types=types, bounds=bounds,
        instance_features=np.array(scen.feature_array),
        ratio_features=1.0)

    start = time.time()
    model.train(X_trans, y)
    self.logger.debug("Fitting random forest took %f seconds",
                      time.time() - start)

    x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
    y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, contour_step_size),
                         np.arange(y_min, y_max, contour_step_size))

    self.logger.debug("x_min: %f, x_max: %f, y_min: %f, y_max: %f",
                      x_min, x_max, y_min, y_max)
    self.logger.debug(
        "Predict on %d samples in grid to get surface (step-size: %f)",
        np.c_[xx.ravel(), yy.ravel()].shape[0], contour_step_size)

    start = time.time()
    Z, _ = model.predict_marginalized_over_instances(
        np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    self.logger.debug("Predicting with the random forest took %f seconds",
                      time.time() - start)

    return xx, yy, Z
def run(self, save_fn: str = None):
    '''
    forward selection on SMAC's EPM (RF) wrt configuration space
    to minimize the out-of-bag error returned by the RF

    Parameters
    ----------
    save_fn: str
        file name to save plot

    Returns
    -------
    list
        tuples of parameter name and oob score
    '''
    importance_tuples = []
    X = self.X
    y = self.Y
    param_ids = list(range(len(self.params)))
    used = []
    # always use all features
    used.extend(range(len(self.params), len(self.types)))

    pca = PCA(n_components=min(7, len(self.types) - len(self.params)))
    self.scen.feature_array = pca.fit_transform(self.scen.feature_array)

    for _ in range(self._MAX_P):
        scores = []
        for p in param_ids:
            self.logger.debug(self.params[p])
            used.append(p)
            X_l = X[:, used]

            model = RandomForestWithInstances(self.types[used],
                                              self.scen.feature_array)
            model.rf.compute_oob_error = True

            start = time.time()
            model.train(X_l, y)
            self.logger.debug(
                "End Fit RF (sec %.2f; oob: %.4f)" %
                (time.time() - start, model.rf.out_of_bag_error()))

            # ==================================================================
            # start = time.time()
            # rf = RandomForestRegressor(n_estimators=30,
            #                            min_samples_split=3,
            #                            min_samples_leaf=3,
            #                            max_features=math.ceil(
            #                                (5. / 6.) * X_l.shape[1]),
            #                            max_leaf_nodes=1000,
            #                            max_depth=20, oob_score=True)
            # rf.fit(X_l, y.ravel())
            # self.logger.debug("End Fit Sklearn RF (sec %.2f, oob: %.4f))" % (
            #     time.time() - start, rf.oob_score_))
            # ==================================================================

            score = model.rf.out_of_bag_error()
            scores.append(score)
            used.pop()

        best_indx = np.argmin(scores)
        best_score = scores[best_indx]
        p = param_ids.pop(best_indx)
        used.append(p)

        self.logger.info("%s : %.4f (OOB)" % (self.params[p].name, best_score))
        importance_tuples.append((self.params[p].name, best_score))

    self.plot_importance(importance_tuples=importance_tuples, save_fn=save_fn)
    return importance_tuples
class AbstractEvaluator(object):
    """
    Abstract implementation of Importance evaluator
    """

    def __init__(self, scenario: Scenario, cs: ConfigurationSpace,
                 model: RandomForestWithInstances, to_evaluate: int, rng,
                 verbose: bool = True, **kwargs):
        self._logger = None
        self.scenario = scenario
        self.cs = cs
        self.model = model  # SMAC model
        self.rng = rng
        self.verbose = verbose

        if self.model is not None:
            if 'X' in kwargs and 'y' in kwargs:
                self._train_model(kwargs['X'], kwargs['y'], **kwargs)
            if 'features' in kwargs:
                self.features = kwargs['features']
            else:
                self.features = self.model.instance_features

            self.X = self.model.X
            self.y = self.model.y
            self.types = self.model.types
            self.bounds = self.model.bounds

        self._to_eval = to_evaluate
        if to_evaluate <= 0:
            self.to_evaluate = len(self.cs.get_hyperparameters())
        elif to_evaluate >= len(self.cs.get_hyperparameters()):
            self.to_evaluate = len(self.cs.get_hyperparameters())
        else:
            self.to_evaluate = to_evaluate  # num of parameters to evaluate

        self.evaluated_parameter_importance = OrderedDict()
        self.name = 'Base'

        self.IMPORTANCE_THRESHOLD = 0.05
        self.AXIS_FONT = {'family': 'monospace'}
        self.LABEL_FONT = {'family': 'sans-serif'}
        self.LINE_FONT = {'lw': 4, 'color': (0.125, 0.125, 0.125)}
        self.area_color = (0.25, 0.25, 0.45)
        self.unimportant_area_color = (0.125, 0.125, 0.225)
        self.MAX_PARAMS_TO_PLOT = 15

    @abc.abstractclassmethod
    def run(self) -> OrderedDict:
        raise NotImplementedError

    @abc.abstractclassmethod
    def plot_result(self, name=None):
        raise NotImplementedError

    def _train_model(self, X, y, **kwargs):
        self.model.train(X, y, **kwargs)

    def __str__(self):
        tmp = 'Parameter Importance Evaluation Method %s\n' % self.name
        tmp += '{:^15s}: {:<8s}\n'.format('Parameter', 'Value')
        for key in self.evaluated_parameter_importance:
            value = self.evaluated_parameter_importance[key]
            tmp += '{:>15s}: {:<3.4f}\n'.format(key, value)
        return tmp

    @property
    def logger(self):
        return self._logger

    @logger.setter
    def logger(self, value):
        self._logger = logging.getLogger(value)

    def _refit_model(self, types, bounds, X, y):
        """
        Easily allows for refitting of the model.

        Parameters
        ----------
        types: list
            SMAC EPM types
        X: ndarray
            X matrix
        y: ndarray
            corresponding y vector
        """
        # We need a fake config-space to bypass the imputation of inactive
        # values in the random forest implementation
        fake_cs = ConfigurationSpace(name="fake-cs-for-configurator-footprint")

        # We need to add fake hyperparameters
        fake_cs.add_hyperparameters([
            UniformFloatHyperparameter('fake-%s' % i,
                                       lower=0., upper=100000.,
                                       default_value=0., log=False)
            for i in range(len(types))
        ])

        self.model = RandomForestWithInstances(fake_cs, types, bounds,
                                               seed=12345,
                                               do_bootstrapping=True)
        self.model.rf_opts.compute_oob_error = True
        self.model.train(X, y)
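# Hypothetical illustration (not part of the original source): a concrete
# evaluator only has to implement run() and plot_result(). A minimal sketch
# of such a subclass, using placeholder importance values, might look like:
class DummyEvaluator(AbstractEvaluator):

    def __init__(self, scenario, cs, model, to_evaluate, rng, **kwargs):
        super().__init__(scenario, cs, model, to_evaluate, rng, **kwargs)
        self.name = 'Dummy'
        self.logger = self.name  # the setter resolves this to a logging.Logger

    def run(self) -> OrderedDict:
        # Assign a placeholder importance of 0.0 to the first `to_evaluate`
        # hyperparameters instead of computing a real score
        for param in self.cs.get_hyperparameters()[:self.to_evaluate]:
            self.evaluated_parameter_importance[param.name] = 0.0
        return self.evaluated_parameter_importance

    def plot_result(self, name=None):
        pass  # nothing to plot for the placeholder scores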
def validate_epm(
    self,
    config_mode: Union[str, typing.List[Configuration]] = 'def',
    instance_mode: Union[str, typing.List[str]] = 'test',
    repetitions: int = 1,
    runhistory: typing.Optional[RunHistory] = None,
    output_fn: typing.Optional[str] = None,
    reuse_epm: bool = True,
) -> RunHistory:
    """
    Use EPM to predict costs/runtimes for unknown config/inst-pairs.

    side effect: if output is specified, saves runhistory to specified
    output directory.

    Parameters
    ----------
    output_fn: str
        path to runhistory to be saved. if the suffix is not '.json', will
        be interpreted as directory and filename will be
        'validated_runhistory_EPM.json'
    config_mode: str or list<Configuration>
        string or directly a list of Configuration, string from
        [def, inc, def+inc, wallclock_time, cpu_time, all].
        time evaluates at cpu- or wallclock-timesteps of:
        [max_time/2^0, max_time/2^1, max_time/2^3, ..., default]
        with max_time being the highest recorded time
    instance_mode: str or list<str>
        what instances to use for validation, either from
        [train, test, train+test] or directly a list of instances
    repetitions: int
        number of repetitions in nondeterministic algorithms
    runhistory: RunHistory
        optional, RunHistory-object to reuse runs
    reuse_epm: bool
        if true (and if `self.epm`), reuse epm to validate runs

    Returns
    -------
    runhistory: RunHistory
        runhistory with predicted runs
    """
    if not isinstance(runhistory, RunHistory) and (self.epm is None or not reuse_epm):
        raise ValueError("No runhistory specified for validating with EPM!")
    elif not reuse_epm or self.epm is None:
        # Create RandomForest
        types, bounds = get_types(
            self.scen.cs, self.scen.feature_array)  # type: ignore[attr-defined] # noqa F821
        epm = RandomForestWithInstances(
            configspace=self.scen.cs,  # type: ignore[attr-defined] # noqa F821
            types=types,
            bounds=bounds,
            instance_features=self.scen.feature_array,
            seed=self.rng.randint(MAXINT),
            ratio_features=1.0,
        )
        # Use imputor if objective is runtime
        imputor = None
        impute_state = None
        impute_censored_data = False
        if self.scen.run_obj == 'runtime':
            threshold = self.scen.cutoff * self.scen.par_factor  # type: ignore[attr-defined] # noqa F821
            imputor = RFRImputator(
                rng=self.rng,
                cutoff=self.scen.cutoff,  # type: ignore[attr-defined] # noqa F821
                threshold=threshold,
                model=epm,
            )
            impute_censored_data = True
            impute_state = [StatusType.CAPPED]
            success_states = [StatusType.SUCCESS, ]
        else:
            success_states = [StatusType.SUCCESS, StatusType.CRASHED,
                              StatusType.MEMOUT]

        # Transform training data (from given rh)
        rh2epm = RunHistory2EPM4Cost(
            num_params=len(self.scen.cs.get_hyperparameters()),  # type: ignore[attr-defined] # noqa F821
            scenario=self.scen,
            rng=self.rng,
            impute_censored_data=impute_censored_data,
            imputor=imputor,
            impute_state=impute_state,
            success_states=success_states)
        assert runhistory is not None  # please mypy
        X, y = rh2epm.transform(runhistory)
        self.logger.debug("Training model with data of shape X: %s, y:%s",
                          str(X.shape), str(y.shape))
        # Train random forest
        epm.train(X, y)
    else:
        epm = typing.cast(RandomForestWithInstances, self.epm)

    # Predict desired runs
    runs, rh_epm = self._get_runs(config_mode, instance_mode, repetitions,
                                  runhistory)

    feature_array_size = len(self.scen.cs.get_hyperparameters())  # type: ignore[attr-defined] # noqa F821
    if self.scen.feature_array is not None:
        feature_array_size += self.scen.feature_array.shape[1]

    X_pred = np.empty((len(runs), feature_array_size))
    for idx, run in enumerate(runs):
        if self.scen.feature_array is not None and run.inst is not None:
            X_pred[idx] = np.hstack([
                convert_configurations_to_array([run.config])[0],
                self.scen.feature_dict[run.inst]
            ])
        else:
            X_pred[idx] = convert_configurations_to_array([run.config])[0]
    self.logger.debug("Predicting desired %d runs, data has shape %s",
                      len(runs), str(X_pred.shape))

    y_pred = epm.predict(X_pred)
    self.epm = epm

    # Add runs to runhistory
    for run, pred in zip(runs, y_pred[0]):
        rh_epm.add(
            config=run.config,
            cost=float(pred),
            time=float(pred),
            status=StatusType.SUCCESS,
            instance_id=run.inst,
            seed=-1,
            additional_info={"additional_info": "ESTIMATED USING EPM!"})

    if output_fn:
        self._save_results(rh_epm, output_fn,
                           backup_fn="validated_runhistory_EPM.json")
    return rh_epm
class SMBO(BaseSolver): def __init__(self, scenario, tae_runner=None, acquisition_function=None, model=None, runhistory2epm=None, stats=None, rng=None): ''' Interface that contains the main Bayesian optimization loop Parameters ---------- scenario: smac.scenario.scenario.Scenario Scenario object tae_runner: object object that implements the following method to call the target algorithm (or any other arbitrary function): run(self, config) If not set, it will be initialized with the tae.ExecuteTARunOld() acquisition_function : AcquisitionFunction Object that implements the AbstractAcquisitionFunction. Will use EI if not set. model : object Model that implements train() and predict(). Will use a RandomForest if not set. runhistory2epm : RunHistory2EMP Object that implements the AbstractRunHistory2EPM. If None, will use RunHistory2EPM4Cost if objective is cost or RunHistory2EPM4LogCost if objective is runtime. stats: Stats optional stats object rng: numpy.random.RandomState Random number generator ''' if stats: self.stats = stats else: self.stats = Stats(scenario) self.runhistory = RunHistory() self.logger = logging.getLogger("smbo") if rng is None: self.num_run = np.random.randint(1234567980) self.rng = np.random.RandomState(seed=self.num_run) elif isinstance(rng, int): self.num_run = rng self.rng = np.random.RandomState(seed=rng) elif isinstance(rng, np.random.RandomState): self.num_run = rng.randint(1234567980) self.rng = rng else: raise TypeError('Unknown type %s for argument rng. Only accepts ' 'None, int or np.random.RandomState' % str(type(rng))) self.scenario = scenario self.config_space = scenario.cs self.traj_logger = TrajLogger(output_dir=self.scenario.output_dir, stats=self.stats) self.types = get_types(self.config_space, scenario.feature_array) if model is None: self.model = RandomForestWithInstances( self.types, scenario.feature_array, seed=self.rng.randint(1234567980)) else: self.model = model if acquisition_function is None: self.acquisition_func = EI(self.model) else: self.acquisition_func = acquisition_function self.local_search = LocalSearch(self.acquisition_func, self.config_space) self.incumbent = None if tae_runner is None: self.executor = ExecuteTARunOld(ta=scenario.ta, stats=self.stats, run_obj=scenario.run_obj, par_factor=scenario.par_factor) else: self.executor = tae_runner self.inten = Intensifier( executor=self.executor, stats=self.stats, traj_logger=self.traj_logger, instances=self.scenario.train_insts, cutoff=self.scenario.cutoff, deterministic=self.scenario.deterministic, run_obj_time=self.scenario.run_obj == "runtime", instance_specifics=self.scenario.instance_specific) num_params = len(self.config_space.get_hyperparameters()) self.objective = average_cost if self.scenario.run_obj == "runtime": if runhistory2epm is None: # if we log the performance data, # the RFRImputator will already get # log transform data from the runhistory cutoff = np.log10(self.scenario.cutoff) threshold = np.log10(self.scenario.cutoff * self.scenario.par_factor) imputor = RFRImputator(cs=self.config_space, rs=self.rng, cutoff=cutoff, threshold=threshold, model=self.model, change_threshold=0.01, max_iter=10) self.rh2EPM = RunHistory2EPM4LogCost(scenario=self.scenario, num_params=num_params, success_states=[ StatusType.SUCCESS, ], impute_censored_data=True, impute_state=[ StatusType.TIMEOUT, ], imputor=imputor) else: self.rh2EPM = runhistory2epm elif self.scenario.run_obj == 'quality': if runhistory2epm is None: self.rh2EPM = RunHistory2EPM4Cost\ (scenario=self.scenario, 
num_params=num_params, success_states=[StatusType.SUCCESS, ], impute_censored_data=False, impute_state=None) else: self.rh2EPM = runhistory2epm else: raise ValueError('Unknown run objective: %s. Should be either ' 'quality or runtime.' % self.scenario.run_obj) def run_initial_design(self): ''' runs algorithm runs for a initial design; default implementation: running the default configuration on a random instance-seed pair Side effect: adds runs to self.runhistory Returns ------- incumbent: Configuration() initial incumbent configuration ''' default_conf = self.config_space.get_default_configuration() self.incumbent = default_conf # add this incumbent right away to have an entry to time point 0 self.traj_logger.add_entry(train_perf=2**31, incumbent_id=1, incumbent=self.incumbent) rand_inst_id = self.rng.randint(0, len(self.scenario.train_insts)) # ignore instance specific values rand_inst = self.scenario.train_insts[rand_inst_id] if self.scenario.deterministic: initial_seed = 0 else: initial_seed = random.randint(0, MAXINT) status, cost, runtime, additional_info = self.executor.start( default_conf, instance=rand_inst, cutoff=self.scenario.cutoff, seed=initial_seed, instance_specific=self.scenario.instance_specific.get( rand_inst, "0")) if status in [StatusType.CRASHED or StatusType.ABORT]: self.logger.critical("First run crashed -- Abort") sys.exit(1) self.runhistory.add(config=default_conf, cost=cost, time=runtime, status=status, instance_id=rand_inst, seed=initial_seed, additional_info=additional_info) defaul_inst_seeds = set( self.runhistory.get_runs_for_config(default_conf)) default_perf = self.objective(default_conf, self.runhistory, defaul_inst_seeds) self.runhistory.update_cost(default_conf, default_perf) self.stats.inc_changed += 1 # first incumbent self.traj_logger.add_entry(train_perf=default_perf, incumbent_id=self.stats.inc_changed, incumbent=self.incumbent) return default_conf def run(self, max_iters=10): ''' Runs the Bayesian optimization loop for max_iters iterations Parameters ---------- max_iters: int The maximum number of iterations Returns ---------- incumbent: np.array(1, H) The best found configuration ''' self.stats.start_timing() #self.runhistory = RunHisory() self.incumbent = self.run_initial_design() # Main BO loop iteration = 1 while True: if self.scenario.shared_model: pSMAC.read(run_history=self.runhistory, output_directory=self.scenario.output_dir, configuration_space=self.config_space, logger=self.logger) start_time = time.time() X, Y = self.rh2EPM.transform(self.runhistory) self.logger.debug("Search for next configuration") # get all found configurations sorted according to acq challengers = self.choose_next(X, Y) time_spend = time.time() - start_time logging.debug( "Time spend to choose next configurations: %.2f sec" % (time_spend)) self.logger.debug("Intensify") self.incumbent, inc_perf = self.inten.intensify( challengers=challengers, incumbent=self.incumbent, run_history=self.runhistory, objective=self.objective, time_bound=max(0.01, time_spend)) # TODO: Write run history into database if self.scenario.shared_model: pSMAC.write(run_history=self.runhistory, output_directory=self.scenario.output_dir, num_run=self.num_run) if iteration == max_iters: break iteration += 1 logging.debug( "Remaining budget: %f (wallclock), %f (ta costs), %f (target runs)" % (self.stats.get_remaing_time_budget(), self.stats.get_remaining_ta_budget(), self.stats.get_remaining_ta_runs())) if self.stats.is_budget_exhausted(): break self.stats.print_stats(debug_out=True) return 
self.incumbent def choose_next(self, X, Y, num_interleaved_random=1010, num_configurations_by_random_search_sorted=1000, num_configurations_by_local_search=10): """Choose next candidate solution with Bayesian optimization. Parameters ---------- X : (N, D) numpy array Each row contains a configuration and one set of instance features. Y : (N, O) numpy array The function values for each configuration instance pair. Returns ------- list List of 2020 suggested configurations to evaluate. """ self.model.train(X, Y) if self.runhistory.empty(): incumbent_value = 0.0 elif self.incumbent is None: # TODO try to calculate an incumbent from the runhistory! incumbent_value = 0.0 else: incumbent_value = self.runhistory.get_cost(self.incumbent) self.acquisition_func.update(model=self.model, eta=incumbent_value) # Remove dummy acquisition function value next_configs_by_random_search = [ x[1] for x in self._get_next_by_random_search( num_points=num_interleaved_random) ] # Get configurations sorted by EI next_configs_by_random_search_sorted = \ self._get_next_by_random_search( num_configurations_by_random_search_sorted, _sorted=True) next_configs_by_local_search = \ self._get_next_by_local_search(num_configurations_by_local_search) next_configs_by_acq_value = next_configs_by_random_search_sorted + \ next_configs_by_local_search next_configs_by_acq_value.sort(reverse=True, key=lambda x: x[0]) self.logger.debug( "First 10 acq func values of selected configurations: %s" % (str([_[0] for _ in next_configs_by_acq_value[:10]]))) next_configs_by_acq_value = [_[1] for _ in next_configs_by_acq_value] challengers = list( itertools.chain(*zip(next_configs_by_acq_value, next_configs_by_random_search))) return challengers def _get_next_by_random_search(self, num_points=1000, _sorted=False): """Get candidate solutions via local search. Parameters ---------- num_points : int, optional (default=10) Number of local searches and returned values. _sorted : bool, optional (default=True) Whether to sort the candidate solutions by acquisition function value. Returns ------- list : (acquisition value, Candidate solutions) """ rand_configs = self.config_space.sample_configuration(size=num_points) if _sorted: imputed_rand_configs = map(ConfigSpace.util.impute_inactive_values, rand_configs) imputed_rand_configs = [ x.get_array() for x in imputed_rand_configs ] imputed_rand_configs = np.array(imputed_rand_configs, dtype=np.float64) acq_values = self.acquisition_func(imputed_rand_configs) # From here # http://stackoverflow.com/questions/20197990/how-to-make-argsort-result-to-be-random-between-equal-values random = self.rng.rand(len(acq_values)) # Last column is primary sort key! indices = np.lexsort((random.flatten(), acq_values.flatten())) for i in range(len(rand_configs)): rand_configs[i].origin = 'Random Search (sorted)' # Cannot use zip here because the indices array cannot index the # rand_configs list, because the second is a pure python list return [(acq_values[ind][0], rand_configs[ind]) for ind in indices[::-1]] else: for i in range(len(rand_configs)): rand_configs[i].origin = 'Random Search' return [(0, rand_configs[i]) for i in range(len(rand_configs))] def _get_next_by_local_search(self, num_points=10): """Get candidate solutions via local search. In case acquisition function values tie, these will be broken randomly. Parameters ---------- num_points : int, optional (default=10) Number of local searches and returned values. 
Returns ------- list : (acquisition value, Candidate solutions), ordered by their acquisition function value """ configs_acq = [] # Start N local search from different random start points for i in range(num_points): if i == 0 and self.incumbent is not None: start_point = self.incumbent else: start_point = self.config_space.sample_configuration() configuration, acq_val = self.local_search.maximize(start_point) configuration.origin = 'Local Search' configs_acq.append((acq_val[0][0], configuration)) # shuffle for random tie-break random.shuffle(configs_acq, self.rng.rand) # sort according to acq value # and return n best configurations configs_acq.sort(reverse=True, key=lambda x: x[0]) return configs_acq
def get_pred_surface(self, X_scaled, conf_list: list):
    '''
    fit epm on the scaled input dimension and return data to plot a
    contour plot

    Parameters
    ----------
    X_scaled: np.array
        configurations in scaled 2dim
    conf_list: list
        list of Configuration objects

    Returns
    -------
    np.array, np.array, np.array
        x, y, Z for contour plots
    '''
    # use PCA to reduce features to also at most 2 dims
    n_feats = self.scenario.feature_array.shape[1]
    if n_feats > 2:
        self.logger.debug("Use PCA to reduce features to 2dim")
        insts = self.scenario.feature_dict.keys()
        feature_array = np.array([self.scenario.feature_dict[inst]
                                  for inst in insts])
        ss = StandardScaler()
        self.scenario.feature_array = ss.fit_transform(feature_array)
        pca = PCA(n_components=2)
        feature_array = pca.fit_transform(feature_array)
        n_feats = feature_array.shape[1]
        self.scenario.feature_array = feature_array
        self.scenario.feature_dict = dict([(inst, feature_array[idx, :])
                                           for idx, inst in enumerate(insts)])
        self.scenario.n_features = 2

    # Create new rh with only wanted configs
    new_rh = RunHistory(average_cost)
    for rh in self.runhistories:
        for key, value in rh.data.items():
            config = rh.ids_config[key.config_id]
            if config in self.configs_to_plot:
                config_id, instance, seed = key
                cost, time, status, additional_info = value
                new_rh.add(config, cost, time, status, instance_id=instance,
                           seed=seed, additional_info=additional_info)
    self.relevant_rh = new_rh

    X, y, types = convert_data(scenario=self.scenario, runhistory=new_rh)

    types = np.array(np.zeros((2 + n_feats)), dtype=np.uint)

    num_params = len(self.scenario.cs.get_hyperparameters())

    # impute missing values in configs
    conf_dict = {}
    for idx, c in enumerate(conf_list):
        conf_list[idx] = impute_inactive_values(c)
        conf_dict[str(conf_list[idx].get_array())] = X_scaled[idx, :]

    X_trans = []
    for x in X:
        x_scaled_conf = conf_dict[str(x[:num_params])]
        x_new = np.concatenate((x_scaled_conf, x[num_params:]), axis=0)
        X_trans.append(x_new)
    X_trans = np.array(X_trans)

    bounds = np.array([(0, np.nan), (0, np.nan)], dtype=object)
    model = RandomForestWithInstances(
        types=types, bounds=bounds,
        instance_features=np.array(self.scenario.feature_array),
        ratio_features=1.0)

    model.train(X_trans, y)
    self.logger.debug("RF fitted")

    plot_step = self.contour_step_size

    x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
    y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
                         np.arange(y_min, y_max, plot_step))

    self.logger.debug("x_min: %f, x_max: %f, y_min: %f, y_max: %f"
                      % (x_min, x_max, y_min, y_max))
    self.logger.debug("Predict on %d samples in grid to get surface"
                      % (np.c_[xx.ravel(), yy.ravel()].shape[0]))

    Z, _ = model.predict_marginalized_over_instances(
        np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    return xx, yy, Z
def plot_cost_over_time(self, rh, traj, output="performance_over_time.png",
                        validator=None):
    """
    Plot performance over time, using all trajectory entries
    with max_time = wallclock_limit or (if inf) the highest recorded time

    Parameters
    ----------
    rh: RunHistory
        runhistory to use
    traj: List
        trajectory to take times/incumbents from
    output: str
        path to output-png
    validator: Validator
        validator object; if it carries a trained empirical performance
        model (validator.epm), that model is reused, otherwise a new one is
        trained from the runhistory
    """
    self.logger.debug("Estimating costs over time for best run.")
    validator.traj = traj  # set trajectory
    time, configs = [], []
    for entry in traj:
        time.append(entry["wallclock_time"])
        configs.append(entry["incumbent"])
    self.logger.debug("Using %d samples (%d distinct) from trajectory.",
                      len(time), len(set(configs)))

    if validator.epm:  # not log as validator epm is trained on cost, not log cost
        epm = validator.epm
    else:
        self.logger.debug("No EPM passed! Training new one from runhistory.")
        # Train random forest and transform training data (from given rh)
        # Not using validator because we want to plot uncertainties
        rh2epm = RunHistory2EPM4Cost(
            num_params=len(self.scenario.cs.get_hyperparameters()),
            scenario=self.scenario)
        X, y = rh2epm.transform(rh)
        self.logger.debug("Training model with data of shape X: %s, y:%s",
                          str(X.shape), str(y.shape))

        types, bounds = get_types(self.scenario.cs,
                                  self.scenario.feature_array)
        epm = RandomForestWithInstances(
            types=types,
            bounds=bounds,
            instance_features=self.scenario.feature_array,
            # seed=self.rng.randint(MAXINT),
            ratio_features=1.0)
        epm.train(X, y)

    # not necessary right now since the EPM only knows the features
    # of the training instances
    # use only training instances
    # =======================================================================
    # if self.scenario.feature_dict:
    #     feat_array = []
    #     for inst in self.scenario.train_insts:
    #         feat_array.append(self.scenario.feature_dict[inst])
    #     backup_features_epm = epm.instance_features
    #     epm.instance_features = np.array(feat_array)
    # =======================================================================

    # predict performance for all configurations in trajectory
    config_array = convert_configurations_to_array(configs)
    mean, var = epm.predict_marginalized_over_instances(config_array)

    # =======================================================================
    # # restore feature array in epm
    # if self.scenario.feature_dict:
    #     epm.instance_features = backup_features_epm
    # =======================================================================

    mean = mean[:, 0]
    var = var[:, 0]
    uncertainty_upper = mean + np.sqrt(var)
    uncertainty_lower = mean - np.sqrt(var)
    if self.scenario.run_obj == 'runtime':
        # We have to clip at 0 as we want to put y on the logscale
        uncertainty_lower[uncertainty_lower < 0] = 0
        uncertainty_upper[uncertainty_upper < 0] = 0

    # plot
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.set_ylabel('performance')
    ax.set_xlabel('time [sec]')
    ax.plot(time, mean, 'r-', label="estimated performance")
    ax.fill_between(time, uncertainty_upper, uncertainty_lower, alpha=0.8,
                    label="standard deviation")
    ax.set_xscale("log", nonposx='clip')
    if self.scenario.run_obj == 'runtime':
        ax.set_yscale('log')

    # ax.set_ylim(min(mean)*0.8, max(mean)*1.2)
    # start after 1% of the configuration budget
    ax.set_xlim(min(time) + (max(time) - min(time)) * 0.01, max(time))

    ax.legend()
    plt.tight_layout()
    fig.savefig(output)
    plt.close(fig)
class FeatureForwardSelector():
    """
    Inspired by forward selection of ParameterImportance-package.
    """

    def __init__(self, scenario, runhistory, to_evaluate: int = 3, rng=None):
        """
        Constructor

        Parameters
        ----------
        scenario: Scenario
            SMAC scenario object
        runhistory: RunHistory
            runhistory to take the training data from
        to_evaluate: int
            indicates for how many parameters the importance values have to
            be computed
        """
        self.logger = logging.getLogger(self.__module__ + '.' +
                                        self.__class__.__name__)

        self.rng = rng
        if rng is None:
            self.rng = np.random.RandomState(42)
        self.scenario = copy.deepcopy(scenario)
        self.cs = scenario.cs
        self.rh = runhistory
        self.to_evaluate = to_evaluate

        self.MAX_SAMPLES = 100000

        self.model = None

    def run(self):
        """
        Implementation of the forward selection loop.
        Uses SMACs EPM (RF) wrt the feature space to minimize the OOB error.

        Returns
        -------
        feature_importance: OrderedDict
            dict_keys (first key -> most important) -> OOB error
        """
        parameters = [p.name for p in self.scenario.cs.get_hyperparameters()]
        self.logger.debug("Parameters: %s", parameters)

        rh2epm = RunHistory2EPM4Cost(scenario=self.scenario,
                                     num_params=len(parameters),
                                     success_states=[
                                         StatusType.SUCCESS,
                                         StatusType.CAPPED,
                                         StatusType.CRASHED
                                     ],
                                     impute_censored_data=False,
                                     impute_state=None)
        X, y = rh2epm.transform(self.rh)

        # reduce sample size to speedup computation
        if X.shape[0] > self.MAX_SAMPLES:
            idx = np.random.choice(X.shape[0], size=self.MAX_SAMPLES,
                                   replace=False)
            X = X[idx, :]
            y = y[idx]

        self.logger.debug(
            "Shape of X: %s, of y: %s, #parameters: %s, #feats: %s",
            X.shape, y.shape, len(parameters),
            len(self.scenario.feature_names))

        names = copy.deepcopy(self.scenario.feature_names)
        self.logger.debug("Features: %s", names)

        used = list(range(0, len(parameters)))
        feat_ids = {f: i for i, f in enumerate(names, len(used))}
        ids_feat = {i: f for f, i in feat_ids.items()}
        self.logger.debug("Used: %s", used)

        evaluated_feature_importance = OrderedDict()

        types, bounds = get_types(self.scenario.cs,
                                  self.scenario.feature_array)

        last_error = np.inf

        for _round in range(self.to_evaluate):  # Main Loop
            errors = []
            for f in names:
                i = feat_ids[f]
                self.logger.debug('Evaluating %s', f)
                used.append(i)
                self.logger.debug(
                    'Used features: %s',
                    str([ids_feat[j] for j in used[len(parameters):]]))

                start = time.time()
                # refit the model every round
                self._refit_model(types[sorted(used)], bounds,
                                  X[:, sorted(used)], y)
                errors.append(self.model.rf.out_of_bag_error())
                used.pop()
                self.logger.debug('Refitted RF (sec %.2f; error: %.4f)' %
                                  (time.time() - start, errors[-1]))
            else:
                self.logger.debug('Evaluating None')
                start = time.time()
                # refit the model every round
                self._refit_model(types[sorted(used)], bounds,
                                  X[:, sorted(used)], y)
                errors.append(self.model.rf.out_of_bag_error())
                self.logger.debug('Refitted RF (sec %.2f; error: %.4f)' %
                                  (time.time() - start, errors[-1]))

            if _round == 0:
                evaluated_feature_importance['None'] = errors[-1]

            best_idx = np.argmin(errors)
            lowest_error = errors[best_idx]

            if best_idx == len(errors) - 1:
                self.logger.info('Best thing to do is add nothing')
                best_feature = 'None'
                # evaluated_feature_importance[best_feature] = lowest_error
                break
            elif lowest_error >= last_error:
                break
            else:
                last_error = lowest_error
                best_feature = names.pop(best_idx)
                used.append(feat_ids[best_feature])

                self.logger.debug('%s: %.4f' % (best_feature, lowest_error))
                evaluated_feature_importance[best_feature] = lowest_error

        self.logger.debug(evaluated_feature_importance)
        self.evaluated_feature_importance = evaluated_feature_importance
        return evaluated_feature_importance

    def _refit_model(self, types, bounds, X, y):
        """
        Easily allows for refitting of the model.

        Parameters
        ----------
        types: list
            SMAC EPM types
        X: ndarray
            X matrix
        y: ndarray
            corresponding y vector
        """
        # take at most 80% of the data per split to ensure enough data for
        # oob error
        self.model = RandomForestWithInstances(
            self.cs, types=types, bounds=bounds,
            seed=self.rng.randint(MAXINT),
            do_bootstrapping=True,
            n_points_per_tree=int(X.shape[1] * 0.8))
        self.model.rf_opts.compute_oob_error = True
        self.model.train(X, y)

    def _plot_result(self, output_fn, bar=True):
        """
        plot oob score as bar charts

        Parameters
        ----------
        output_fn: str
            file name to save plot
        """
        fig, ax = plt.subplots()
        features = list(self.evaluated_feature_importance.keys())
        errors = list(self.evaluated_feature_importance.values())
        max_to_plot = min(len(errors), 5)

        ind = np.arange(len(errors))
        if bar:
            ax.bar(ind, errors, color=(0.25, 0.25, 0.45))
        else:
            ax.plot(ind, errors, lw=4, color=(0.125, 0.125, 0.125))

        ax.set_ylabel('error', size='24', family='sans-serif')

        if bar:
            ax.set_xticks(ind)
            ax.set_xlim(-.5, max_to_plot - 0.5)
        else:
            ax.set_xticks(ind)
            ax.set_xlim(0, max_to_plot - 1)
        ax.set_xticklabels(features, rotation=30, ha='right', size='10',
                           family='monospace')

        ax.xaxis.grid(True)
        ax.yaxis.grid(True)
        plt.tight_layout()

        out_dir = os.path.dirname(output_fn)
        if not os.path.exists(out_dir):
            os.makedirs(out_dir)

        fig.savefig(output_fn)
        return output_fn

    def plot_result(self, output_fn=None):
        plot_paths = []
        plot_paths.append(self._plot_result(output_fn + '-barplot.png', True))
        plot_paths.append(self._plot_result(output_fn + '-chng.png', False))
        plt.close('all')
        self.logger.debug('Saved plot as %s-[barplot|chng].png' % output_fn)
        return plot_paths
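# Hypothetical usage sketch (not part of the original module): given an
# existing Scenario and RunHistory, run forward selection over the instance
# features and write the bar/line plots. The argument names `scenario` and
# `runhistory` are assumptions for illustration.
def _example_feature_forward_selection(scenario, runhistory):
    selector = FeatureForwardSelector(scenario, runhistory, to_evaluate=3)
    feature_importance = selector.run()  # OrderedDict: feature -> OOB error
    selector.plot_result(output_fn='./feature_importance')
    return feature_importance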
class Validator(object): """ Validator for the output of SMAC-scenarios. Evaluates specified configurations on specified instances. """ def __init__(self, scenario: Scenario, trajectory: list, rng: Union[np.random.RandomState, int] = None): """ Construct Validator for given scenario and trajectory. Parameters ---------- scenario: Scenario scenario object for cutoff, instances, features and specifics trajectory: trajectory-list trajectory to take incumbent(s) from rng: np.random.RandomState or int Random number generator or seed """ self.logger = logging.getLogger(self.__module__ + "." + self.__class__.__name__) self.traj = trajectory self.scen = scenario self.epm = None if isinstance(rng, np.random.RandomState): self.rng = rng elif isinstance(rng, int): self.rng = np.random.RandomState(seed=rng) else: self.logger.debug('no seed given, using default seed of 1') num_run = 1 self.rng = np.random.RandomState(seed=num_run) def _save_results(self, rh: RunHistory, output_fn, backup_fn=None): """ Helper to save results to file Parameters ---------- rh: RunHistory runhistory to save output_fn: str if ends on '.json': filename to save history to else: directory to save runhistory to (filename is backup_fn) backup_fn: str if output_fn does not end on '.json', treat output_fn as dir and append backup_fn as filename (if output_fn ends on '.json', this argument is ignored) """ if output_fn == "": self.logger.info( "No output specified, validated runhistory not saved.") return # Check if a folder or a file is specified as output if not output_fn.endswith('.json'): output_dir = output_fn output_fn = os.path.join(output_dir, backup_fn) self.logger.debug("Output is \"%s\", changing to \"%s\"!", output_dir, output_fn) base = os.path.split(output_fn)[0] if not base == "" and not os.path.exists(base): self.logger.debug("Folder (\"%s\") doesn't exist, creating.", base) os.makedirs(base) rh.save_json(output_fn) self.logger.info("Saving validation-results in %s", output_fn) def validate( self, config_mode: Union[str, typing.List[Configuration]] = 'def', instance_mode: Union[str, typing.List[str]] = 'test', repetitions: int = 1, n_jobs: int = 1, backend: str = 'threading', runhistory: RunHistory = None, tae: ExecuteTARun = None, output_fn: str = "", ) -> RunHistory: """ Validate configs on instances and save result in runhistory. If a runhistory is provided as input it is important that you run it on the same/comparable hardware. side effect: if output is specified, saves runhistory to specified output directory. Parameters ---------- config_mode: str or list<Configuration> string or directly a list of Configuration. string from [def, inc, def+inc, wallclock_time, cpu_time, all]. time evaluates at cpu- or wallclock-timesteps of: [max_time/2^0, max_time/2^1, max_time/2^3, ..., default] with max_time being the highest recorded time instance_mode: str or list<str> what instances to use for validation, either from [train, test, train+test] or directly a list of instances repetitions: int number of repetitions in nondeterministic algorithms n_jobs: int number of parallel processes used by joblib backend: str what backend joblib should use for parallel runs runhistory: RunHistory optional, RunHistory-object to reuse runs tae: ExecuteTARun tae to be used. if None, will initialize ExecuteTARunOld output_fn: str path to runhistory to be saved. 
if the suffix is not '.json', will be interpreted as directory and filename will be 'validated_runhistory.json' Returns ------- runhistory: RunHistory runhistory with validated runs """ self.logger.debug( "Validating configs '%s' on instances '%s', repeating %d times" " with %d parallel runs on backend '%s'.", config_mode, instance_mode, repetitions, n_jobs, backend) # Get all runs to be evaluated as list runs, validated_rh = self._get_runs(config_mode, instance_mode, repetitions, runhistory) # Create new Stats without limits inf_scen = Scenario({ 'run_obj': self.scen.run_obj, 'cutoff_time': self.scen.cutoff, 'output_dir': "" }) inf_stats = Stats(inf_scen) inf_stats.start_timing() # Create TAE if not tae: tae = ExecuteTARunOld(ta=self.scen.ta, stats=inf_stats, run_obj=self.scen.run_obj, par_factor=self.scen.par_factor, cost_for_crash=self.scen.cost_for_crash) else: # Inject endless-stats tae.stats = inf_stats # Validate! run_results = self._validate_parallel(tae, runs, n_jobs, backend) # tae returns (status, cost, runtime, additional_info) # Add runs to RunHistory idx = 0 for result in run_results: validated_rh.add(config=runs[idx].config, cost=result[1], time=result[2], status=result[0], instance_id=runs[idx].inst, seed=runs[idx].seed, additional_info=result[3]) idx += 1 if output_fn: self._save_results(validated_rh, output_fn, backup_fn="validated_runhistory.json") return validated_rh def _validate_parallel(self, tae: ExecuteTARun, runs: typing.List[_Run], n_jobs: int, backend: str): """ Validate runs with joblibs Parallel-interface Parameters ---------- tae: ExecuteTARun tae to be used for validation runs: list<_Run> list with _Run-objects [_Run(config=CONFIG1,inst=INSTANCE1,seed=SEED1,inst_specs=INST_SPECIFICS1), ...] n_jobs: int number of cpus to use for validation (-1 to use all) backend: str what backend to use for parallelization Returns ------- run_results: list<tuple(tae-returns)> results as returned by tae """ # Runs with parallel run_results = Parallel(n_jobs=n_jobs, backend=backend)( delayed(_unbound_tae_starter)(tae, run.config, run.inst, self.scen.cutoff, run.seed, run.inst_specs, capped=False) for run in runs) return run_results def validate_epm( self, config_mode: Union[str, typing.List[Configuration]] = 'def', instance_mode: Union[str, typing.List[str]] = 'test', repetitions: int = 1, runhistory: RunHistory = None, output_fn="", reuse_epm=True, ) -> RunHistory: """ Use EPM to predict costs/runtimes for unknown config/inst-pairs. side effect: if output is specified, saves runhistory to specified output directory. Parameters ---------- output_fn: str path to runhistory to be saved. if the suffix is not '.json', will be interpreted as directory and filename will be 'validated_runhistory_EPM.json' config_mode: str or list<Configuration> string or directly a list of Configuration, string from [def, inc, def+inc, wallclock_time, cpu_time, all]. 
time evaluates at cpu- or wallclock-timesteps of: [max_time/2^0, max_time/2^1, max_time/2^3, ..., default] with max_time being the highest recorded time instance_mode: str or list<str> what instances to use for validation, either from [train, test, train+test] or directly a list of instances repetitions: int number of repetitions in nondeterministic algorithms runhistory: RunHistory optional, RunHistory-object to reuse runs reuse_epm: bool if true (and if `self.epm`), reuse epm to validate runs Returns ------- runhistory: RunHistory runhistory with predicted runs """ if not isinstance(runhistory, RunHistory) and (self.epm is None or reuse_epm is False): raise ValueError( "No runhistory specified for validating with EPM!") elif reuse_epm is False or self.epm is None: # Create RandomForest types, bounds = get_types(self.scen.cs, self.scen.feature_array) self.epm = RandomForestWithInstances( types=types, bounds=bounds, instance_features=self.scen.feature_array, seed=self.rng.randint(MAXINT), ratio_features=1.0) # Use imputor if objective is runtime imputor = None impute_state = None impute_censored_data = False if self.scen.run_obj == 'runtime': threshold = self.scen.cutoff * self.scen.par_factor imputor = RFRImputator(rng=self.rng, cutoff=self.scen.cutoff, threshold=threshold, model=self.epm) impute_censored_data = True impute_state = [StatusType.CAPPED] # Transform training data (from given rh) rh2epm = RunHistory2EPM4Cost( num_params=len(self.scen.cs.get_hyperparameters()), scenario=self.scen, rng=self.rng, impute_censored_data=impute_censored_data, imputor=imputor, impute_state=impute_state) X, y = rh2epm.transform(runhistory) self.logger.debug("Training model with data of shape X: %s, y:%s", str(X.shape), str(y.shape)) # Train random forest self.epm.train(X, y) # Predict desired runs runs, rh_epm = self._get_runs(config_mode, instance_mode, repetitions, runhistory) feature_array_size = len(self.scen.cs.get_hyperparameters()) if self.scen.feature_array is not None: feature_array_size += self.scen.feature_array.shape[1] X_pred = np.empty((len(runs), feature_array_size)) for idx, run in enumerate(runs): if self.scen.feature_array is not None and run.inst is not None: X_pred[idx] = np.hstack([ convert_configurations_to_array([run.config])[0], self.scen.feature_dict[run.inst] ]) else: X_pred[idx] = convert_configurations_to_array([run.config])[0] self.logger.debug("Predicting desired %d runs, data has shape %s", len(runs), str(X_pred.shape)) y_pred = self.epm.predict(X_pred) # Add runs to runhistory for run, pred in zip(runs, y_pred[0]): rh_epm.add( config=run.config, cost=float(pred), time=float(pred), status=StatusType.SUCCESS, instance_id=run.inst, seed=-1, additional_info={"additional_info": "ESTIMATED USING EPM!"}) if output_fn: self._save_results(rh_epm, output_fn, backup_fn="validated_runhistory_EPM.json") return rh_epm def _get_runs( self, configs: Union[str, typing.List[Configuration]], insts: Union[str, typing.List[str]], repetitions: int = 1, runhistory: RunHistory = None, ) -> typing.Tuple[typing.List[_Run], RunHistory]: """ Generate list of SMAC-TAE runs to be executed. This means combinations of configs with all instances on a certain number of seeds. side effect: Adds runs that don't need to be reevaluated to self.rh! 
Parameters ---------- configs: str or list<Configuration> string or directly a list of Configuration str from [def, inc, def+inc, wallclock_time, cpu_time, all] time evaluates at cpu- or wallclock-timesteps of: [max_time/2^0, max_time/2^1, max_time/2^3, ..., default] with max_time being the highest recorded time insts: str or list<str> what instances to use for validation, either from [train, test, train+test] or directly a list of instances repetitions: int number of seeds per instance/config-pair to be evaluated runhistory: RunHistory optional, try to reuse this runhistory and save some runs Returns ------- runs: list<_Run> list with _Runs [_Run(config=CONFIG1,inst=INSTANCE1,seed=SEED1,inst_specs=INST_SPECIFICS1), _Run(config=CONFIG2,inst=INSTANCE2,seed=SEED2,inst_specs=INST_SPECIFICS2), ...] """ # Get relevant configurations and instances if isinstance(configs, str): configs = self._get_configs(configs) if isinstance(insts, str): insts = self._get_instances(insts) # If no instances are given, fix the instances to one "None" instance if not insts: insts = [None] # If algorithm is deterministic, fix repetitions to 1 if self.scen.deterministic and repetitions != 1: self.logger.warning( "Specified %d repetitions, but fixing to 1, " "because algorithm is deterministic.", repetitions) repetitions = 1 # Extract relevant information from given runhistory inst_seed_config = self._process_runhistory(configs, insts, runhistory) # Now create the actual run-list runs = [] # Counter for runs without the need of recalculation runs_from_rh = 0 # If we reuse runs, we want to return them as well new_rh = RunHistory(average_cost) for i in sorted(insts): for rep in range(repetitions): # First, find a seed and add all the data we can take from the # given runhistory to "our" validation runhistory. configs_evaluated = [] if runhistory and i in inst_seed_config: # Choose seed based on most often evaluated inst-seed-pair seed, configs_evaluated = inst_seed_config[i].pop(0) # Delete inst if all seeds are used if not inst_seed_config[i]: inst_seed_config.pop(i) # Add runs to runhistory for c in configs_evaluated[:]: runkey = RunKey(runhistory.config_ids[c], i, seed) cost, time, status, additional_info = runhistory.data[ runkey] if status in [ StatusType.CRASHED, StatusType.ABORT, StatusType.CAPPED ]: # Not properly executed target algorithm runs should be repeated configs_evaluated.remove(c) continue new_rh.add(c, cost, time, status, instance_id=i, seed=seed, additional_info=additional_info) runs_from_rh += 1 else: # If no runhistory or no entries for instance, get new seed seed = self.rng.randint(MAXINT) # We now have a seed and add all configs that are not already # evaluated on that seed to the runs-list. This way, we # guarantee the same inst-seed-pairs for all configs. for config in [ c for c in configs if not c in configs_evaluated ]: # Only use specifics if specific exists, else use string "0" specs = self.scen.instance_specific[ i] if i and i in self.scen.instance_specific else "0" runs.append( _Run(config=config, inst=i, seed=seed, inst_specs=specs)) self.logger.info( "Collected %d runs from %d configurations on %d " "instances with %d repetitions. Reusing %d runs from " "given runhistory.", len(runs), len(configs), len(insts), repetitions, runs_from_rh) return runs, new_rh def _process_runhistory(self, configs: typing.List[Configuration], insts: typing.List[str], runhistory: RunHistory): """ Processes runhistory from self._get_runs by extracting already evaluated (relevant) config-inst-seed tuples. 
Parameters ---------- configs: list(Configuration) list of configs of interest insts: list(str) list of instances of interest runhistory: RunHistory runhistory to extract runs from Returns ------- inst_seed_config: dict<str : list(tuple(int, tuple(configs)))> dictionary mapping instances to a list of tuples of already used seeds and the configs that this inst-seed-pair has been evaluated on, sorted by the number of configs """ # We want to reuse seeds that have been used on most configurations # To this end, we create a dictionary as {instances:{seed:[configs]}} # Like this we can easily retrieve the most used instance-seed pairs to # minimize the number of runs to be evaluated inst_seed_config = {} if runhistory: relevant = dict() for key in runhistory.data: if (runhistory.ids_config[key.config_id] in configs and key.instance_id in insts): relevant[key] = runhistory.data[key] # Change data-structure to {instances:[(seed1, (configs)), (seed2, (configs), ... ]} # to make most used seed easily accessible, we sort after length of configs for key in relevant: inst, seed = key.instance_id, key.seed config = runhistory.ids_config[key.config_id] if inst in inst_seed_config: if seed in inst_seed_config[inst]: inst_seed_config[inst][seed].append(config) else: inst_seed_config[inst][seed] = [config] else: inst_seed_config[inst] = {seed: [config]} inst_seed_config = { i: sorted([(seed, list(inst_seed_config[i][seed])) for seed in inst_seed_config[i]], key=lambda x: len(x[1])) for i in inst_seed_config } return inst_seed_config def _get_configs(self, mode: str) -> typing.List[str]: """ Return desired configs Parameters ---------- mode: str str from [def, inc, def+inc, wallclock_time, cpu_time, all] time evaluates at cpu- or wallclock-timesteps of: [max_time/2^0, max_time/2^1, max_time/2^3, ..., default] with max_time being the highest recorded time Returns ------- configs: list<Configuration> list with desired configurations """ # Add desired configs configs = [] mode = mode.lower() if mode not in [ 'def', 'inc', 'def+inc', 'wallclock_time', 'cpu_time', 'all' ]: raise ValueError( "%s not a valid option for config_mode in validation." 
                % mode)
        if mode == "def" or mode == "def+inc":
            configs.append(self.scen.cs.get_default_configuration())
        if mode == "inc" or mode == "def+inc":
            configs.append(self.traj[-1]["incumbent"])
        if mode in ["wallclock_time", "cpu_time"]:
            # get highest time-entry and add entries from there
            # not using wallclock_limit in case it's inf
            if (mode == "wallclock_time"
                    and np.isfinite(self.scen.wallclock_limit)):
                max_time = self.scen.wallclock_limit
            elif (mode == "cpu_time"
                    and np.isfinite(self.scen.algo_runs_timelimit)):
                max_time = self.scen.algo_runs_timelimit
            else:
                max_time = self.traj[-1][mode]
            counter = 2**0
            for entry in self.traj[::-1]:
                if (entry[mode] <= max_time / counter
                        and entry["incumbent"] not in configs):
                    configs.append(entry["incumbent"])
                    counter *= 2
            if self.traj[0]["incumbent"] not in configs:
                configs.append(self.traj[0]["incumbent"])  # add first
        if mode == "all":
            for entry in self.traj:
                if entry["incumbent"] not in configs:
                    configs.append(entry["incumbent"])
        self.logger.debug("Gathered %d configurations for mode %s.",
                          len(configs), mode)
        return configs

    def _get_instances(self, mode: str) -> typing.List[str]:
        """ Get desired instances

        Parameters
        ----------
        mode: str
            what instances to use for validation, from [train, test, train+test]

        Returns
        -------
        instances: list<str>
            instances to be used
        """
        instance_mode = mode.lower()
        if instance_mode not in ['train', 'test', 'train+test']:
            raise ValueError(
                "%s not a valid option for instance_mode in validation."
                % mode)

        # Make sure that instances are specified whenever the instance mode requires them
        if ((instance_mode == 'train' and self.scen.train_insts == [None]) or
                (instance_mode == 'test' and self.scen.test_insts == [None])):
            self.logger.warning(
                "Instance mode is set to %s, but there are no "
                "%s-instances specified in the scenario. Setting instance "
                "mode to \"train+test\"!", instance_mode, instance_mode)
            instance_mode = 'train+test'

        instances = []
        if ((instance_mode == 'train' or instance_mode == 'train+test')
                and not self.scen.train_insts == [None]):
            instances.extend(self.scen.train_insts)
        if ((instance_mode == 'test' or instance_mode == 'train+test')
                and not self.scen.test_insts == [None]):
            instances.extend(self.scen.test_insts)
        return instances
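The EPM-based validation above reduces to three steps: transform the given runhistory into training data with RunHistory2EPM4Cost, train a RandomForestWithInstances on it, and predict the cost of each desired configuration/instance pair. The following is a minimal sketch of that flow, not the Validator API itself; it assumes `scen`, `rh` and `rng` are an existing Scenario, RunHistory and RandomState, reuses the names that appear in the code above (get_types, convert_configurations_to_array, MAXINT), and the exact RandomForestWithInstances constructor signature may differ between SMAC versions.

# Hedged sketch: train an EPM from a runhistory and predict the cost of the
# default configuration on one training instance.
# Assumed to exist: scen (Scenario), rh (RunHistory), rng (np.random.RandomState).
import numpy as np

types, bounds = get_types(scen.cs, scen.feature_array)
epm = RandomForestWithInstances(types=types, bounds=bounds,
                                instance_features=scen.feature_array,
                                seed=rng.randint(MAXINT),
                                ratio_features=1.0)

rh2epm = RunHistory2EPM4Cost(num_params=len(scen.cs.get_hyperparameters()),
                             scenario=scen, rng=rng)
X, y = rh2epm.transform(rh)  # shapes: (n_runs, n_params + n_feats), (n_runs, 1)
epm.train(X, y)

# Build one prediction row: configuration vector followed by instance features
config = scen.cs.get_default_configuration()
inst = scen.train_insts[0]
x_pred = np.hstack([convert_configurations_to_array([config])[0],
                    scen.feature_dict[inst]]).reshape(1, -1)
mean, var = epm.predict(x_pred)  # predicted cost and predictive variance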
def _get_mean_var_time(self, validator, traj, use_epm, rh): """ Parameters ---------- validator: Validator validator (smac-based) traj: List[Configuraton] trajectory to set in validator use_epm: bool validated or not (no need to use epm if validated) rh: RunHistory ?? Returns ------- mean, var times: List[float] times to plot (x-values) configs """ # TODO kinda important: docstrings, what is this function doing? if validator: validator.traj = traj # set trajectory time, configs = [], [] if use_epm and not self.block_epm: for entry in traj: time.append(entry["wallclock_time"]) configs.append(entry["incumbent"]) # self.logger.debug('Time: %d Runs: %d', time[-1], len(rh.get_runs_for_config(configs[-1]))) self.logger.debug( "Using %d samples (%d distinct) from trajectory.", len(time), len(set(configs))) # Initialize EPM if validator.epm: # not log as validator epm is trained on cost, not log cost epm = validator.epm else: self.logger.debug( "No EPM passed! Training new one from runhistory.") # Train random forest and transform training data (from given rh) # Not using validator because we want to plot uncertainties rh2epm = RunHistory2EPM4Cost(num_params=len( self.scenario.cs.get_hyperparameters()), scenario=self.scenario) X, y = rh2epm.transform(rh) self.logger.debug( "Training model with data of shape X: %s, y: %s", str(X.shape), str(y.shape)) types, bounds = get_types(self.scenario.cs, self.scenario.feature_array) epm = RandomForestWithInstances( self.scenario.cs, types=types, bounds=bounds, seed=self.rng.randint(MAXINT), instance_features=self.scenario.feature_array, ratio_features=1.0) epm.train(X, y) config_array = convert_configurations_to_array(configs) mean, var = epm.predict_marginalized_over_instances(config_array) var = np.zeros(mean.shape) # We don't want to show the uncertainty of the model but uncertainty over multiple optimizer runs # This variance is computed in an outer loop. else: mean, var = [], [] for entry in traj: #self.logger.debug(entry) time.append(entry["wallclock_time"]) configs.append(entry["incumbent"]) costs = _cost(configs[-1], rh, rh.get_runs_for_config(configs[-1])) # self.logger.debug(len(costs), time[-1] if not costs: time.pop() else: mean.append(np.mean(costs)) var.append(0) # No variance over instances mean, var = np.array(mean).reshape(-1, 1), np.array(var).reshape( -1, 1) return mean, var, time, configs
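In the EPM branch above, the per-incumbent mean is obtained by marginalising the model's prediction over all instances. Conceptually this is just the average of the per-instance predicted means, as the sketch below illustrates; it is not the library implementation (which operates on the trees directly), and `epm`, `config_array` and `feature_array` are assumed names for a trained RandomForestWithInstances, a matrix produced by convert_configurations_to_array, and the scenario's instance-feature matrix.

# Conceptual sketch of marginalising predictions over instances.
import numpy as np

def marginalize_over_instances(epm, config_array, feature_array):
    means = []
    for x in config_array:
        # Pair this configuration with every instance's feature vector
        X = np.hstack([np.tile(x, (len(feature_array), 1)), feature_array])
        m, _ = epm.predict(X)
        means.append(m.mean())  # average the per-instance means
    return np.array(means).reshape(-1, 1)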
best_indices = np.argsort(meta_predictions)[-250:][::-1] observed_y = y[test_index][best_indices] else: # Do Blended BO for 250 iterations observed_X = [] observed_y = [] observed_i = [] surpassed = None for iteration in range(0, 250): # We need to have observed at least 3 items for the model to be able to predict surr_predictions = np.zeros_like(test_index) if iteration > 2 and alpha < 1: surr_estimator.train( np.array(observed_X).astype(float), np.array(observed_y)) mu, var = surr_estimator.predict( np.array(surr_X.iloc[test_index]).astype(float)) mu = mu.reshape(-1) var = var.reshape(-1) sigma = np.sqrt(var) diff = mu - np.max(observed_y) Z = diff / sigma ei = diff * norm.cdf(Z) + sigma * norm.pdf(Z) surr_predictions = ei # surr_predictions = surr_estimator.predict(np.array(surr_X.iloc[test_index]).astype(float)) # print(iteration, "\t", np.std(surr_predictions), "\t", np.std(meta_predictions)) m_corr, m_pvalue = kendalltau(meta_predictions, y[test_index])
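The inner part of the loop above is a standard expected-improvement acquisition in its maximisation form: with predictive mean mu and variance var at a candidate point and best observation y_best, EI = (mu - y_best) * Phi(Z) + sigma * phi(Z) with Z = (mu - y_best) / sigma. A small self-contained version follows, with a guard for zero predictive variance that the inline code omits; the variable names are illustrative only.

# Hedged sketch of the expected-improvement computation used above.
# mu, var: arrays of predicted means/variances for the candidates; y_best: best observation so far.
import numpy as np
from scipy.stats import norm

def expected_improvement(mu, var, y_best):
    sigma = np.sqrt(var)
    diff = mu - y_best
    with np.errstate(divide='ignore', invalid='ignore'):
        Z = diff / sigma
        ei = diff * norm.cdf(Z) + sigma * norm.pdf(Z)
    # Candidates with zero predictive variance cannot improve on the best observation
    ei[sigma == 0.0] = 0.0
    return ei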
    def get_pred_surface(self, rh, X_scaled, conf_list: list, contour_step_size):
        """fit epm on the scaled input dimension and return data to plot a
        contour plot of the empirical performance

        Parameters
        ----------
        rh: RunHistory
            runhistory
        X_scaled: np.array
            configurations in scaled 2dim
        conf_list: list
            list of Configuration objects
        contour_step_size: float
            step-size for contour

        Returns
        -------
        contour_data: (np.array, np.array, np.array)
            x, y, Z for contour plots
        """
        # use PCA to reduce instance features to at most 2 dims
        scen = copy.deepcopy(self.scenario)  # pca changes feats
        if scen.feature_array.shape[1] > 2:
            self.logger.debug("Use PCA to reduce features from %d dim to 2 dim",
                              scen.feature_array.shape[1])
            # perform PCA
            insts = scen.feature_dict.keys()
            feature_array = np.array([scen.feature_dict[i] for i in insts])
            feature_array = StandardScaler().fit_transform(feature_array)
            feature_array = PCA(n_components=2).fit_transform(feature_array)
            # inject in scenario-object
            scen.feature_array = feature_array
            scen.feature_dict = dict([(inst, feature_array[idx, :])
                                      for idx, inst in enumerate(insts)])
            scen.n_features = 2

        # convert the data to train EPM on 2-dim featurespace (for contour-data)
        self.logger.debug("Convert data for epm.")
        X, y, types = convert_data_for_epm(scenario=scen, runhistory=rh,
                                           impute_inactive_parameters=True,
                                           logger=self.logger)
        types = np.array(np.zeros((2 + scen.feature_array.shape[1])),
                         dtype=np.uint)
        num_params = len(scen.cs.get_hyperparameters())

        # impute missing values in configs and insert MDS'ed (2dim) configs to the right positions
        conf_dict = {}
        # Remove forbidden clauses (this is necessary to enable the impute_inactive_values-method, see #226)
        cs_no_forbidden = copy.deepcopy(conf_list[0].configuration_space)
        cs_no_forbidden.forbidden_clauses = []
        for idx, c in enumerate(conf_list):
            c.configuration_space = cs_no_forbidden
            conf_list[idx] = impute_inactive_values(c)
            conf_dict[str(conf_list[idx].get_array())] = X_scaled[idx, :]

        # Debug compare elements:
        c1, c2 = {str(z) for z in X}, {str(z) for z in conf_dict.keys()}
        self.logger.debug(
            "{} elements not in both sets, {} elements in both sets, "
            "X (len {}) and conf_dict (len {}) "
            "(might be a problem related to forbidden clauses?)".format(
                len(c1 ^ c2), len(c1 & c2), len(c1), len(c2)))
        # self.logger.debug("Elements: {}".format(str(c1 ^ c2)))

        # X_trans is the same as X but with reduced 2-dim features
        # (so shape is (N, 2) instead of (N, M))
        X_trans = []
        for x in X:
            x_scaled_conf = conf_dict[str(x[:num_params])]
            # append scaled config + pca'ed features (total of 4 values) per config/feature-sample
            X_trans.append(np.concatenate((x_scaled_conf, x[num_params:]), axis=0))
        X_trans = np.array(X_trans)

        self.logger.debug("Train random forest for contour-plot. Shape of X: {}, "
                          "shape of X_trans: {}".format(X.shape, X_trans.shape))
        self.logger.debug("Faking configspace to be able to train rf...")
        # We need to fake a config-space to bypass the imputation of inactive
        # values in the random forest implementation
        fake_cs = ConfigurationSpace(name="fake-cs-for-configurator-footprint")

        bounds = np.array([(0, np.nan), (0, np.nan)], dtype=object)
        model = RandomForestWithInstances(fake_cs, types, bounds,
                                          seed=self.rng.randint(MAXINT),
                                          instance_features=np.array(scen.feature_array),
                                          ratio_features=1.0)

        start = time.time()
        model.train(X_trans, y)
        self.logger.debug("Fitting random forest took %f seconds", time.time() - start)

        x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
        y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, contour_step_size),
                             np.arange(y_min, y_max, contour_step_size))
        self.logger.debug("x_min: %f, x_max: %f, y_min: %f, y_max: %f",
                          x_min, x_max, y_min, y_max)
        self.logger.debug("Predict on %d samples in grid to get surface (step-size: %f)",
                          np.c_[xx.ravel(), yy.ravel()].shape[0], contour_step_size)

        start = time.time()
        Z, _ = model.predict_marginalized_over_instances(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        self.logger.debug("Predicting random forest took %f seconds", time.time() - start)

        return xx, yy, Z
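get_pred_surface only prepares the grid data; one possible way to consume its return value is a plain matplotlib contour plot over the predicted cost surface. The snippet below is illustrative only: `xx`, `yy`, `Z` are the arrays returned above and `X_scaled` is the 2-dim configuration embedding passed in.

# Hedged usage sketch: draw the predicted cost surface and the embedded configurations.
import matplotlib.pyplot as plt

fig, ax = plt.subplots()
contour = ax.contourf(xx, yy, Z, levels=20, cmap='viridis')
fig.colorbar(contour, ax=ax, label='predicted cost')
ax.scatter(X_scaled[:, 0], X_scaled[:, 1], s=8, c='black', label='configurations')
ax.set_xlabel('dim 1 (scaled)')
ax.set_ylabel('dim 2 (scaled)')
ax.legend()
fig.savefig('configurator_footprint_surface.png')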